zulip/zerver/data_import/slack_message_conversion.py

import re
from typing import Any, Dict, List, Optional, Tuple

# stubs
# Type aliases used by the annotations in this module.
#
# Loosely-typed record with string keys; the Slack user records passed to
# get_user_full_name / get_user_mentions are annotated with it.
ZerverFieldsT = Dict[str, Any]
# Keyed by Slack user id; get_user_mentions returns the looked-up integer as
# the id of the mentioned user.
SlackToZulipUserIDT = Dict[str, int]
# Keyed by channel name.  The first tuple element is used as the Slack channel
# id when channel mentions are rewritten; the integer second element is not
# read anywhere in the code visible here.
AddedChannelsT = Dict[str, Tuple[str, int]]

# Slack link can be in the format <http://www.foo.com|www.foo.com> and <http://foo.com/>
#
# The pattern is written in verbose form, so it must be used with re.VERBOSE
# (convert_link_format does this).  The whole match, including the enclosing
# angle brackets, is what convert_link_format rewrites.
# NOTE: the inline remark on the first group says '>' but that group matches
# the opening '<'; the closing '>' is the last group.
LINK_REGEX = r"""
              (<)                                                              # match '>'
              (http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/|ftp:\/\/)?  # protocol and www
                  ([a-z0-9]+([\-\.]{1}[a-z0-9]+)*)(\.)                         # domain name
                      ([a-z]{2,63}(:[0-9]{1,5})?)                              # domain
                  (\/[^>]*)?                                                   # path
              (\|)?(?:\|([^>]+))?                                # char after pipe (for slack links)
              (>)
              """

# Matches a Slack mail link such as <mailto:foo@foo.com> or
# <mailto:foo@foo.com|foo@foo.com>.  Verbose-form pattern: use with re.VERBOSE.
# Group 1 is the part before the optional pipe (the optional 'mailto:' prefix
# plus the address); convert_mailto_format substitutes it for the whole match.
SLACK_MAILTO_REGEX = r"""
                      <((mailto:)?                     # match  `<mailto:`
                      ([\w\.-]+@[\w\.-]+(\.[\w]+)+))   # match email
                          (\|)?                        # match pipe
                      ([\w\.-]+@[\w\.-]+(\.[\w]+)+)?>  # match email
                      """

# Matches a Slack user mention: '<@slack_id>' or '<@slack_id|short_name>'.
# Verbose-form pattern: use with re.VERBOSE.
# get_user_mentions reads group 2 (the Slack id) and group 4 (the short name,
# None when it is absent).  Both are restricted to ASCII letters and digits.
SLACK_USERMENTION_REGEX = r"""
                           (<@)                  # Start with '<@'
                               ([a-zA-Z0-9]+)    # Here we have the Slack id
                           (\|)?                 # We not always have a Vertical line in mention
                               ([a-zA-Z0-9]+)?   # If Vertical line is present, this is short name
                           (>)                   # ends with '>'
                           """
# Slack doesn't have mid-word message-formatting like Zulip.
# Hence, ~stri~ke doesn't format the word in slack, but ~~stri~~ke
# formats the word in Zulip
#
# Verbose-form pattern: use with re.VERBOSE.  Six capturing groups, in the
# layout convert_markdown_syntax relies on: (1) leading boundary, (2) opening
# '~', (3)+(4) the struck-through text, (5) closing '~', (6) trailing boundary.
# NOTE: the inline remarks say "asterisk", but the marker matched here is the
# tilde '~'.
SLACK_STRIKETHROUGH_REGEX = r"""
                             (^|[ -(]|[+-/]|\*|\_|[:-?]|\{|\[|\||\^)     # Start after specified characters
                             (\~)                                  # followed by an asterisk
                                 ([ -)+-}—]*)([ -}]+)              # any character except asterisk
                             (\~)                                  # followed by an asterisk
                             ($|[ -']|[+-/]|[:-?]|\*|\_|\}|\)|\]|\||\^)  # ends with specified characters
                             """
# Matches Slack italics ('_text_') that start and end at a boundary character.
# Verbose-form pattern: use with re.VERBOSE.  Same six-group layout that
# convert_markdown_syntax relies on: (1) leading boundary, (2) opening '_',
# (3)+(4) the italic text, (5) closing '_', (6) trailing boundary.
# '*' and '~' are accepted as boundaries on both sides, so italics can sit
# directly inside bold or strikethrough markers.
SLACK_ITALIC_REGEX = r"""
                      (^|[ -*]|[+-/]|[:-?]|\{|\[|\||\^|~)
                      (\_)
                          ([ -^`~—]*)([ -^`-~]+)                  # any character
                      (\_)
                      ($|[ -']|[+-/]|[:-?]|\}|\)|\]|\*|\||\^|~)
                      """
# Matches Slack bold ('*text*') that starts and ends at a boundary character.
# Verbose-form pattern: use with re.VERBOSE.  Same six-group layout that
# convert_markdown_syntax relies on: (1) leading boundary, (2) opening '*',
# (3)+(4) the bold text (which cannot itself contain '*'), (5) closing '*',
# (6) trailing boundary.
SLACK_BOLD_REGEX = r"""
                    (^|[ -(]|[+-/]|[:-?]|\{|\[|\_|\||\^|~)
                    (\*)
                        ([ -)+-~—]*)([ -)+-~]+)                   # any character
                    (\*)
                    ($|[ -']|[+-/]|[:-?]|\}|\)|\]|\_|\||\^|~)
                    """

def get_user_full_name(user: ZerverFieldsT) -> str:
    """Return the name to display for a Slack user record.

    * A record that carries ``deleted`` set to exactly ``False`` yields its
      ``real_name``, or ``name`` when ``real_name`` is empty.
    * Otherwise a record flagged ``is_mirror_dummy`` yields the ``real_name``
      stored in its ``profile``, defaulting to ``name``.
    * Any other record yields ``name``.

    Raises KeyError when a key required by the selected branch is missing.
    """
    # Only an explicit False counts; a missing key or any other value does not.
    explicitly_not_deleted = user.get("deleted", True) is False
    if explicitly_not_deleted:
        real_name = user['real_name']
        return real_name if real_name else user['name']

    if not user["is_mirror_dummy"]:
        return user['name']

    return user["profile"].get("real_name", user["name"])

# Markdown mapping
def convert_to_zulip_markdown(text: str, users: List[ZerverFieldsT],
                              added_channels: AddedChannelsT,
                              slack_user_id_to_zulip_user_id: SlackToZulipUserIDT) -> \
        Tuple[str, List[int], bool]:
    """Translate the text of one Slack message into Zulip Markdown.

    Returns a tuple of
    1. the converted text,
    2. the ids returned by get_user_mentions for every resolved user mention,
    3. whether the text contained a web link or a mail link.
    """
    # Inline emphasis: bold first, then strikethrough, then italics.
    emphasis_rules = (
        (SLACK_BOLD_REGEX, "**"),
        (SLACK_STRIKETHROUGH_REGEX, "~~"),
        (SLACK_ITALIC_REGEX, "*"),
    )
    for slack_regex, zulip_marker in emphasis_rules:
        text = convert_markdown_syntax(text, slack_regex, zulip_marker)

    # Slack's three broadcast mentions all become Zulip's '@**all**'.  These
    # are plain substring replacements because they may appear anywhere.
    for broadcast in ('<!everyone>', '<!channel>', '<!here>'):
        text = text.replace(broadcast, '@**all**')

    # Channel mention, e.g. '<#C5Z73A7RA|general>' becomes '#**general**'.
    for channel_name, channel_ids in added_channels.items():
        slack_reference = f'<#{channel_ids[0]}|{channel_name}>'
        text = text.replace(slack_reference, '#**' + channel_name + '**')

    # User mentions are handled one space-separated word at a time:
    # '<@slack_id|short_name>' becomes '@**full_name**'.
    mentioned_users_id = []
    words = text.split(' ')
    for position, word in enumerate(words):
        if re.search(SLACK_USERMENTION_REGEX, word, re.VERBOSE) is None:
            continue
        words[position], zulip_user_id = get_user_mentions(word, users,
                                                           slack_user_id_to_zulip_user_id)
        if zulip_user_id is not None:
            mentioned_users_id.append(zulip_user_id)
    text = ' '.join(words)

    # Web links first, then `<mailto:foo@foo.com>` style links.
    text, has_link = convert_link_format(text)
    text, has_mailto_link = convert_mailto_format(text)
    message_has_link = has_link is True or has_mailto_link is True

    return text, mentioned_users_id, message_has_link

def get_user_mentions(token: str, users: List[ZerverFieldsT],
                      slack_user_id_to_zulip_user_id: SlackToZulipUserIDT) -> Tuple[str, Optional[int]]:
    """Rewrite the Slack user mention found in ``token`` as a Zulip mention.

    ``token`` must contain at least one match of SLACK_USERMENTION_REGEX; the
    first match decides which user is looked up.  A user matches when its
    ``id`` equals the mentioned Slack id and, if the mention carries a short
    name, its ``name`` equals that short name.

    Returns ``(new_token, user_id)`` where ``user_id`` is the value stored in
    ``slack_user_id_to_zulip_user_id`` for the Slack id, or ``(token, None)``
    with the token unchanged when no user matches.

    Raises KeyError if the matched user's Slack id is missing from
    ``slack_user_id_to_zulip_user_id``.
    """
    slack_usermention_match = re.search(SLACK_USERMENTION_REGEX, token, re.VERBOSE)
    assert slack_usermention_match is not None
    short_name = slack_usermention_match.group(4)
    slack_id = slack_usermention_match.group(2)
    for user in users:
        if user['id'] != slack_id:
            continue
        if short_name is not None and user['name'] != short_name:
            continue
        full_name = get_user_full_name(user)
        user_id = slack_user_id_to_zulip_user_id[slack_id]
        mention = "@**" + full_name + "**"
        # A callable replacement is used for two reasons:
        # * a plain string would be treated as a template, so a backslash in
        #   the full name would be read as an escape or group reference;
        # * only mentions of this Slack id are rewritten, so a different user
        #   mentioned in the same token is not given this user's name.
        token = re.sub(
            SLACK_USERMENTION_REGEX,
            lambda found: mention if found.group(2) == slack_id else found.group(0),
            token,
            flags=re.VERBOSE,
        )
        return token, user_id
    return token, None

# Map italic, bold and strikethrough Markdown
def convert_markdown_syntax(text: str, regex: str, zulip_keyword: str) -> str:
    """
    Returns:
    1. For strikethrough formatting: This maps Slack's '~strike~' to Zulip's '~~strike~~'
    2. For bold formatting: This maps Slack's '*bold*' to Zulip's '**bold**'
    3. For italic formatting: This maps Slack's '_italic_' to Zulip's '*italic*'

    ``regex`` is a verbose-form pattern with six capturing groups: (1) the
    leading boundary, (2) the opening Slack marker, (3) and (4) the formatted
    text, (5) the closing Slack marker, (6) the trailing boundary.  The two
    markers are replaced by ``zulip_keyword``; everything else is kept.

    Each match is rewritten in place by position, so identical text elsewhere
    in the message that the pattern did not match is left alone.  The trailing
    boundary of one match may also serve as the leading boundary of the next,
    so '*a* *b*' converts both words.
    """
    pattern = re.compile(regex, re.VERBOSE)
    converted_parts = []
    copied_upto = 0
    match = pattern.search(text)
    while match is not None:
        # Unchanged text up to and including the leading boundary.
        converted_parts.append(text[copied_upto:match.end(1)])
        converted_parts.append(zulip_keyword + match.group(3)
                               + match.group(4) + zulip_keyword)
        # Resume at the trailing boundary rather than after it, so that it can
        # open the next match.  Passing the position to search() (instead of
        # slicing the text) keeps '^' anchored to the real start of the text.
        copied_upto = match.start(6)
        match = pattern.search(text, copied_upto)
    converted_parts.append(text[copied_upto:])
    return ''.join(converted_parts)

def convert_link_format(text: str) -> Tuple[str, bool]:
    """
    Strip the angle brackets Slack puts around links.

    1. '<https://foo.com>' becomes 'https://foo.com'
    2. '<https://foo.com|foo>' becomes 'https://foo.com|foo'

    Returns the converted text and whether any link was found.
    """
    # Collect the matches from the untouched text before rewriting it.
    slack_links = [found.group(0) for found in re.finditer(LINK_REGEX, text, re.VERBOSE)]
    for slack_link in slack_links:
        bare_link = ''.join(char for char in slack_link if char not in '<>')
        text = text.replace(slack_link, bare_link)
    return text, len(slack_links) > 0

def convert_mailto_format(text: str) -> Tuple[str, bool]:
    """
    Unwrap Slack mail links.

    1. '<mailto:foo@foo.com>' becomes 'mailto:foo@foo.com'
    2. '<mailto:foo@foo.com|foo@foo.com>' becomes 'mailto:foo@foo.com'

    Returns the converted text and whether any mail link was found.
    """
    # Collect the matches from the untouched text before rewriting it.
    mail_links = list(re.finditer(SLACK_MAILTO_REGEX, text, re.VERBOSE))
    for mail_link in mail_links:
        wrapped, address_part = mail_link.group(0), mail_link.group(1)
        text = text.replace(wrapped, address_part)
    return text, len(mail_links) > 0
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00			`import re`
python: Sort imports with isort. Fixes #2665. Regenerated by tabbott with `lint --fix` after a rebase and change in parameters. Note from tabbott: In a few cases, this converts technical debt in the form of unsorted imports into different technical debt in the form of our largest files having very long, ugly import sequences at the start. I expect this change will increase pressure for us to split those files, which isn't a bad thing. Signed-off-by: Anders Kaseorg <anders@zulip.com> 2020-06-11 00:54:34 +02:00			`from typing import Any, Dict, List, Optional, Tuple`
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00
			`# stubs`
			`ZerverFieldsT = Dict[str, Any]`
slack: Rename added_users to slack_user_id_to_zulip_user_id. 2019-08-12 13:44:07 +02:00			`SlackToZulipUserIDT = Dict[str, int]`
Slack importer: Map Slack channel mentions to Zulip stream mentions. 2018-04-07 00:00:05 +02:00			`AddedChannelsT = Dict[str, Tuple[str, int]]`
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00
			`# Slack link can be in the format <http://www.foo.com\|www.foo.com> and <http://foo.com/>`
			`LINK_REGEX = r"""`
slack importer: Fix link regex and add test case. 2018-01-21 09:44:46 +01:00			`(<) # match '>'`
			`(http:\/\/www\.\|https:\/\/www\.\|http:\/\/\|https:\/\/\|ftp:\/\/)? # protocol and www`
			`([a-z0-9]+([\-\.]{1}[a-z0-9]+)*)(\.) # domain name`
			`([a-z]{2,63}(:[0-9]{1,5})?) # domain`
			`(\/[^>]*)? # path`
			`(\\|)?(?:\\|([^>]+))? # char after pipe (for slack links)`
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00			`(>)`
			`"""`
slack importer: Add regex for mail links. `<mailto:foo@foo.com>` is changed to `mailto:foo@foo.com`. 2018-01-21 12:30:54 +01:00
			`SLACK_MAILTO_REGEX = r"""`
			<((mailto:)? # match `<mailto:`
			`([\w\.-]+@[\w\.-]+(\.[\w]+)+)) # match email`
			`(\\|)? # match pipe`
			`([\w\.-]+@[\w\.-]+(\.[\w]+)+)?> # match email`
			`"""`

slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00			`SLACK_USERMENTION_REGEX = r"""`
			`(<@) # Start with '<@'`
			`([a-zA-Z0-9]+) # Here we have the Slack id`
			`(\\|)? # We not always have a Vertical line in mention`
			`([a-zA-Z0-9]+)? # If Vertical line is present, this is short name`
			`(>) # ends with '>'`
			`"""`
			`# Slack doesn't have mid-word message-formatting like Zulip.`
			`# Hence, ~stri~ke doesn't format the word in slack, but ~~stri~~ke`
			`# formats the word in Zulip`
			`SLACK_STRIKETHROUGH_REGEX = r"""`
			`(^\|[ -(]\|[+-/]\|\*\|\_\|[:-?]\|\{\|\[\|\\|\|\^) # Start after specified characters`
			`(\~) # followed by an asterisk`
			`([ -)+-}—]*)([ -}]+) # any character except asterisk`
			`(\~) # followed by an asterisk`
			`($\|[ -']\|[+-/]\|[:-?]\|\*\|\_\|\}\|\)\|\]\|\\|\|\^) # ends with specified characters`
			`"""`
			`SLACK_ITALIC_REGEX = r"""`
slack import: Add support for bold-italics formatting. Fixes #8927 2018-06-02 00:27:02 +02:00			`(^\|[ -*]\|[+-/]\|[:-?]\|\{\|\[\|\\|\|\^\|~)`
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00			`(\_)`
			([ -^`~—]*)([ -^`-~]+) # any character
			`(\_)`
slack import: Add support for bold-italics formatting. Fixes #8927 2018-06-02 00:27:02 +02:00			`($\|[ -']\|[+-/]\|[:-?]\|\}\|\)\|\]\|\*\|\\|\|\^\|~)`
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00			`"""`
			`SLACK_BOLD_REGEX = r"""`
slack import: Add support for bold-italics formatting. Fixes #8927 2018-06-02 00:27:02 +02:00			`(^\|[ -(]\|[+-/]\|[:-?]\|\{\|\[\|\_\|\\|\|\^\|~)`
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00			`(\*)`
			`([ -)+-~—]*)([ -)+-~]+) # any character`
			`(\*)`
slack import: Add support for bold-italics formatting. Fixes #8927 2018-06-02 00:27:02 +02:00			`($\|[ -']\|[+-/]\|[:-?]\|\}\|\)\|\]\|\_\|\\|\|\^\|~)`
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00			`"""`

			`def get_user_full_name(user: ZerverFieldsT) -> str:`
slack: Support importing shared channels. 2019-08-08 19:39:26 +02:00			`if "deleted" in user and user['deleted'] is False:`
			`return user['real_name'] or user['name']`
			`elif user["is_mirror_dummy"]:`
			`return user["profile"].get("real_name", user["name"])`
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00			`else:`
			`return user['name']`

			`# Markdown mapping`
			`def convert_to_zulip_markdown(text: str, users: List[ZerverFieldsT],`
Slack importer: Map Slack channel mentions to Zulip stream mentions. 2018-04-07 00:00:05 +02:00			`added_channels: AddedChannelsT,`
slack: Rename added_users to slack_user_id_to_zulip_user_id. 2019-08-12 13:44:07 +02:00			`slack_user_id_to_zulip_user_id: SlackToZulipUserIDT) -> \`
			`Tuple[str, List[int], bool]:`
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00			`mentioned_users_id = []`
			`text = convert_markdown_syntax(text, SLACK_BOLD_REGEX, "**")`
			`text = convert_markdown_syntax(text, SLACK_STRIKETHROUGH_REGEX, "~~")`
			`text = convert_markdown_syntax(text, SLACK_ITALIC_REGEX, "*")`

			`# Map Slack's mention all: '<!everyone>' to '@all '`
Slack importer: Map Slack command for mentions to Zulip's all. Fixes #9003. 2018-04-07 00:55:36 +02:00			`# Map Slack's mention all: '<!channel>' to '@all '`
			`# Map Slack's mention all: '<!here>' to '@all '`
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00			`# No regex for this as it can be present anywhere in the sentence`
			`text = text.replace('<!everyone>', '@all')`
Slack importer: Map Slack command for mentions to Zulip's all. Fixes #9003. 2018-04-07 00:55:36 +02:00			`text = text.replace('<!channel>', '@all')`
			`text = text.replace('<!here>', '@all')`
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00
Slack importer: Map Slack channel mentions to Zulip stream mentions. 2018-04-07 00:00:05 +02:00			`# Map Slack channel mention: '<#C5Z73A7RA\|general>' to '#general'`
			`for cname, ids in added_channels.items():`
			`cid = ids[0]`
python: Convert percent formatting to Python 3.6 f-strings. Generated by pyupgrade --py36-plus. Signed-off-by: Anders Kaseorg <anders@zulip.com> 2020-06-10 06:41:04 +02:00			`text = text.replace(f'<#{cid}\|{cname}>', '#' + cname + '')`
Slack importer: Map Slack channel mentions to Zulip stream mentions. 2018-04-07 00:00:05 +02:00
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00			`tokens = text.split(' ')`
			`for iterator in range(len(tokens)):`

			`# Check user mentions and change mention format from`
			`# '<@slack_id\|short_name>' to '@full_name'`
			`if (re.findall(SLACK_USERMENTION_REGEX, tokens[iterator], re.VERBOSE)):`
slack: Rename added_users to slack_user_id_to_zulip_user_id. 2019-08-12 13:44:07 +02:00			`tokens[iterator], user_id = get_user_mentions(tokens[iterator], users,`
			`slack_user_id_to_zulip_user_id)`
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00			`if user_id is not None:`
			`mentioned_users_id.append(user_id)`

			`text = ' '.join(tokens)`

			`# Check and convert link format`
			`text, has_link = convert_link_format(text)`
slack importer: Add regex for mail links. `<mailto:foo@foo.com>` is changed to `mailto:foo@foo.com`. 2018-01-21 12:30:54 +01:00			# convert `<mailto:foo@foo.com>` to `mailto:foo@foo.com`
slack importer: Add test for checking link in message conversion. 2018-01-28 14:37:35 +01:00			`text, has_mailto_link = convert_mailto_format(text)`
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00
slack importer: Add test for checking link in message conversion. 2018-01-28 14:37:35 +01:00			`if has_link is True or has_mailto_link is True:`
			`message_has_link = True`
			`else:`
			`message_has_link = False`

			`return text, mentioned_users_id, message_has_link`
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00
			`def get_user_mentions(token: str, users: List[ZerverFieldsT],`
slack: Rename added_users to slack_user_id_to_zulip_user_id. 2019-08-12 13:44:07 +02:00			`slack_user_id_to_zulip_user_id: SlackToZulipUserIDT) -> Tuple[str, Optional[int]]:`
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00			`slack_usermention_match = re.search(SLACK_USERMENTION_REGEX, token, re.VERBOSE)`
slack_message_conversion: Clean up type ignores. Signed-off-by: Anders Kaseorg <anders@zulipchat.com> 2019-08-10 00:30:33 +02:00			`assert slack_usermention_match is not None`
			`short_name = slack_usermention_match.group(4)`
			`slack_id = slack_usermention_match.group(2)`
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00			`for user in users:`
			`if (user['id'] == slack_id and user['name'] == short_name and short_name) or \`
			`(user['id'] == slack_id and short_name is None):`
tools: Upgrade Pycodestyle and fix new linter errors. Here, we are upgrading pycodestyle version from 2.4.0 to 2.5.0. Fixes: #11396. 2019-01-31 14:32:37 +01:00			`full_name = get_user_full_name(user)`
slack: Rename added_users to slack_user_id_to_zulip_user_id. 2019-08-12 13:44:07 +02:00			`user_id = slack_user_id_to_zulip_user_id[slack_id]`
tools: Upgrade Pycodestyle and fix new linter errors. Here, we are upgrading pycodestyle version from 2.4.0 to 2.5.0. Fixes: #11396. 2019-01-31 14:32:37 +01:00			`mention = "@" + full_name + ""`
			`token = re.sub(SLACK_USERMENTION_REGEX, mention, token, flags=re.VERBOSE)`
			`return token, user_id`
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00			`return token, None`

docs: Capitalize Markdown consistently. Signed-off-by: Anders Kaseorg <anders@zulip.com> 2020-08-11 01:47:49 +02:00			`# Map italic, bold and strikethrough Markdown`
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00			`def convert_markdown_syntax(text: str, regex: str, zulip_keyword: str) -> str:`
			`"""`
			`Returns:`
			`1. For strikethrough formatting: This maps Slack's '~strike~' to Zulip's '~~strike~~'`
			`2. For bold formatting: This maps Slack's 'bold' to Zulip's 'bold'`
			`3. For italic formatting: This maps Slack's '_italic_' to Zulip's 'italic'`
			`"""`
			`for match in re.finditer(regex, text, re.VERBOSE):`
			`converted_token = (match.group(1) + zulip_keyword + match.group(3)`
			`+ match.group(4) + zulip_keyword + match.group(6))`
			`text = text.replace(match.group(0), converted_token)`
			`return text`

			`def convert_link_format(text: str) -> Tuple[str, bool]:`
			`"""`
			`1. Converts '<https://foo.com>' to 'https://foo.com'`
			`2. Converts '<https://foo.com\|foo>' to 'https://foo.com\|foo'`
			`"""`
			`has_link = False`
			`for match in re.finditer(LINK_REGEX, text, re.VERBOSE):`
			`converted_text = match.group(0).replace('>', '').replace('<', '')`
			`has_link = True`
			`text = text.replace(match.group(0), converted_text)`
			`return text, has_link`
slack importer: Add regex for mail links. `<mailto:foo@foo.com>` is changed to `mailto:foo@foo.com`. 2018-01-21 12:30:54 +01:00
			`def convert_mailto_format(text: str) -> Tuple[str, bool]:`
			`"""`
			`1. Converts '<mailto:foo@foo.com>' to 'mailto:foo@foo.com'`
			`2. Converts '<mailto:foo@foo.com\|foo@foo.com>' to 'mailto:foo@foo.com'`
			`"""`
			`has_link = False`
			`for match in re.finditer(SLACK_MAILTO_REGEX, text, re.VERBOSE):`
			`has_link = True`
			`text = text.replace(match.group(0), match.group(1))`
			`return text, has_link`