zulip/zerver/data_import/slack_message_conversion.py

import re
from typing import Any, Dict, Tuple, List, Optional

# stubs
# Type aliases used by the function signatures in this module.

# A generic record keyed by field name (for example one Slack user dict).
ZerverFieldsT = Dict[str, Any]
# Maps a Slack user id to an integer user id; get_user_mentions() indexes it
# with the Slack id captured from a mention.
SlackToZulipUserIDT = Dict[str, int]
# Maps a channel name to a (str, int) tuple.  convert_to_zulip_markdown()
# uses element 0 as the Slack channel id; element 1 is presumably the Zulip
# stream id -- TODO confirm against the caller that builds this mapping.
AddedChannelsT = Dict[str, Tuple[str, int]]

# Slack link can be in the format <http://www.foo.com|www.foo.com> and <http://foo.com/>
#
# Verbose-mode pattern: it only works when compiled/used with re.VERBOSE, as
# convert_link_format() does.  No case-insensitive flag is passed there, so
# the host part only matches lowercase letters and digits.
# NOTE: the inline comment on the first pattern line says "match '>'", but
# that group matches the opening '<'.
# NOTE: the path group is `[^>]*`, which also accepts '|', so when a path is
# present a trailing "|label" is captured as part of the path group.
LINK_REGEX = r"""
              (<)                                                              # match '>'
              (http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/|ftp:\/\/)?  # protocol and www
                  ([a-z0-9]+([\-\.]{1}[a-z0-9]+)*)(\.)                         # domain name
                      ([a-z]{2,63}(:[0-9]{1,5})?)                              # domain
                  (\/[^>]*)?                                                   # path
              (\|)?(?:\|([^>]+))?                                # char after pipe (for slack links)
              (>)
              """

# Verbose-mode pattern (use with re.VERBOSE) for Slack e-mail links such as
# '<mailto:foo@foo.com>' or '<mailto:foo@foo.com|foo@foo.com>'.
# Group 1 is the optional 'mailto:' prefix plus the first address; it is the
# text convert_mailto_format() keeps in place of the whole match.
SLACK_MAILTO_REGEX = r"""
                      <((mailto:)?                     # match  `<mailto:`
                      ([\w\.-]+@[\w\.-]+(\.[\w]+)+))   # match email
                          (\|)?                        # match pipe
                      ([\w\.-]+@[\w\.-]+(\.[\w]+)+)?>  # match email
                      """

# Verbose-mode pattern (use with re.VERBOSE) for Slack user mentions of the
# form '<@SLACKID>' or '<@SLACKID|shortname>'.
# Group 2 is the Slack id and group 4 the optional short name (None when
# absent); get_user_mentions() reads exactly those two groups.  Both groups
# only accept ASCII letters and digits.
SLACK_USERMENTION_REGEX = r"""
                           (<@)                  # Start with '<@'
                               ([a-zA-Z0-9]+)    # Here we have the Slack id
                           (\|)?                 # We not always have a Vertical line in mention
                               ([a-zA-Z0-9]+)?   # If Vertical line is present, this is short name
                           (>)                   # ends with '>'
                           """
# Slack doesn't have mid-word message-formatting like Zulip.
# Hence, ~stri~ke doesn't format the word in slack, but ~~stri~~ke
# formats the word in Zulip
#
# Verbose-mode pattern (use with re.VERBOSE) with six groups, the layout
# convert_markdown_syntax() relies on:
#   1: leading boundary (start of string or one allowed character)
#   2: opening '~'
#   3 + 4: the struck-through content (neither character class includes '~')
#   5: closing '~'
#   6: trailing boundary (end of string or one allowed character)
# NOTE: the inline pattern comments say "asterisk", but the delimiter
# matched by groups 2 and 5 is '~'.
SLACK_STRIKETHROUGH_REGEX = r"""
                             (^|[ -(]|[+-/]|\*|\_|[:-?]|\{|\[|\||\^)     # Start after specified characters
                             (\~)                                  # followed by an asterisk
                                 ([ -)+-}—]*)([ -}]+)              # any character except asterisk
                             (\~)                                  # followed by an asterisk
                             ($|[ -']|[+-/]|[:-?]|\*|\_|\}|\)|\]|\||\^)  # ends with specified characters
                             """
# Verbose-mode pattern (use with re.VERBOSE) for Slack italics, '_text_'.
# Same six-group layout that convert_markdown_syntax() expects:
#   1: leading boundary, 2: opening '_', 3 + 4: content (neither character
#   class includes '_'), 5: closing '_', 6: trailing boundary.
# '~' and '*' are accepted as boundary characters, so italics can sit
# directly inside strikethrough or bold delimiters.
SLACK_ITALIC_REGEX = r"""
                      (^|[ -*]|[+-/]|[:-?]|\{|\[|\||\^|~)
                      (\_)
                          ([ -^`~—]*)([ -^`-~]+)                  # any character
                      (\_)
                      ($|[ -']|[+-/]|[:-?]|\}|\)|\]|\*|\||\^|~)
                      """
# Verbose-mode pattern (use with re.VERBOSE) for Slack bold, '*text*'.
# Same six-group layout that convert_markdown_syntax() expects:
#   1: leading boundary, 2: opening '*', 3 + 4: content (neither character
#   class includes '*'), 5: closing '*', 6: trailing boundary.
# '_' and '~' are accepted as boundary characters, so bold can sit directly
# inside italic or strikethrough delimiters.
SLACK_BOLD_REGEX = r"""
                    (^|[ -(]|[+-/]|[:-?]|\{|\[|\_|\||\^|~)
                    (\*)
                        ([ -)+-~—]*)([ -)+-~]+)                   # any character
                    (\*)
                    ($|[ -']|[+-/]|[:-?]|\}|\)|\]|\_|\||\^|~)
                    """

def get_user_full_name(user: ZerverFieldsT) -> str:
    """
    Returns the name to display for a Slack user record:
    1. If 'deleted' is present and is False: 'real_name', falling back to
       'name' when 'real_name' is missing or empty
    2. Otherwise, if 'is_mirror_dummy' is truthy: the 'real_name' stored in
       the user's 'profile', falling back to 'name' when the profile has none
    3. Otherwise: 'name'

    A record without an 'is_mirror_dummy' or 'real_name' key is treated as if
    the value were unset instead of raising KeyError.

    Raises KeyError if 'name' is missing, or if 'is_mirror_dummy' is truthy
    and the record has no 'profile'.
    """
    # `.get('deleted') is False` is true only for an explicit False, exactly
    # like the "key present and value is False" check it replaces.
    if user.get('deleted') is False:
        return user.get('real_name') or user['name']
    if user.get('is_mirror_dummy'):
        return user["profile"].get("real_name", user["name"])
    return user['name']

# Markdown mapping
def convert_to_zulip_markdown(text: str, users: List[ZerverFieldsT],
                              added_channels: AddedChannelsT,
                              slack_user_id_to_zulip_user_id: SlackToZulipUserIDT) -> \
        Tuple[str, List[int], bool]:
    """
    Converts the text of one Slack message to Zulip markdown.

    Steps, in order:
    1. Bold, strikethrough and italic markers (convert_markdown_syntax)
    2. '<!everyone>', '<!channel>' and '<!here>' become '@**all**'
    3. '<#channel_id|name>' becomes '#**name**' for every entry of added_channels
    4. User mentions are rewritten by get_user_mentions, one space-separated
       token at a time
    5. Angle brackets are removed from links and mailto links

    Returns a tuple of:
    1. the converted text
    2. the user ids returned by get_user_mentions for the rewritten mentions
    3. whether a link or a mailto link was found
    """
    # Inline formatting is converted before any '**' mention markup is added.
    formatting_rules = (
        (SLACK_BOLD_REGEX, "**"),
        (SLACK_STRIKETHROUGH_REGEX, "~~"),
        (SLACK_ITALIC_REGEX, "*"),
    )
    for pattern, zulip_keyword in formatting_rules:
        text = convert_markdown_syntax(text, pattern, zulip_keyword)

    # No regex for the wildcard mentions as they can be present anywhere in
    # the sentence; all three Slack spellings map to the same Zulip mention.
    for wildcard in ('<!everyone>', '<!channel>', '<!here>'):
        text = text.replace(wildcard, '@**all**')

    # Map Slack channel mention: '<#C5Z73A7RA|general>' to '#**general**'
    for cname, ids in added_channels.items():
        slack_channel_mention = '<#%s|%s>' % (ids[0], cname)
        text = text.replace(slack_channel_mention, '#**' + cname + '**')

    # Check user mentions and change mention format from
    # '<@slack_id|short_name>' to '@**full_name**'
    mentioned_users_id = []
    converted_tokens = []
    for token in text.split(' '):
        if re.search(SLACK_USERMENTION_REGEX, token, re.VERBOSE) is not None:
            token, user_id = get_user_mentions(token, users,
                                               slack_user_id_to_zulip_user_id)
            if user_id is not None:
                mentioned_users_id.append(user_id)
        converted_tokens.append(token)
    text = ' '.join(converted_tokens)

    # Check and convert link format
    text, has_link = convert_link_format(text)
    # convert `<mailto:foo@foo.com>` to `mailto:foo@foo.com`
    text, has_mailto_link = convert_mailto_format(text)

    message_has_link = (has_link is True) or (has_mailto_link is True)
    return text, mentioned_users_id, message_has_link

def get_user_mentions(token: str, users: List[ZerverFieldsT],
                      slack_user_id_to_zulip_user_id: SlackToZulipUserIDT) -> Tuple[str, Optional[int]]:
    """
    Rewrites Slack user mentions in token from '<@slack_id|short_name>' (or
    '<@slack_id>') to '@**full_name**'.

    A mention is resolved against the first entry of users whose 'id' equals
    the Slack id and, when the mention carries a short name, whose 'name'
    equals that short name.  Mentions that resolve to no user are left as-is.
    When a token holds several mentions, each one is resolved on its own.

    Returns a tuple of:
    1. the token with every resolvable mention rewritten
    2. the id (looked up in slack_user_id_to_zulip_user_id) of the first
       mention that was rewritten, or None if none was

    Raises KeyError if a resolved Slack id is missing from
    slack_user_id_to_zulip_user_id.
    """
    resolved_user_ids = []  # type: List[int]

    def _replace_mention(match: Any) -> str:
        slack_id = match.group(2)
        short_name = match.group(4)  # None when the mention has no '|name' part
        for user in users:
            if user['id'] != slack_id:
                continue
            if short_name is not None and user['name'] != short_name:
                continue
            full_name = get_user_full_name(user)
            resolved_user_ids.append(slack_user_id_to_zulip_user_id[slack_id])
            return "@**" + full_name + "**"
        # Unknown user: keep the original Slack markup untouched.
        return match.group(0)

    # A callable replacement is used on purpose: a replacement *string* is
    # scanned for backslash escapes and group references, which would mangle
    # (or reject) full names containing a backslash, and it would also stamp
    # the same name over every mention in the token.
    token = re.sub(SLACK_USERMENTION_REGEX, _replace_mention, token, flags=re.VERBOSE)
    if resolved_user_ids:
        return token, resolved_user_ids[0]
    return token, None

# Map italic, bold and strikethrough markdown
def convert_markdown_syntax(text: str, regex: str, zulip_keyword: str) -> str:
    """
    Returns:
    1. For strikethrough formatting: This maps Slack's '~strike~' to Zulip's '~~strike~~'
    2. For bold formatting: This maps Slack's '*bold*' to Zulip's '**bold**'
    3. For italic formatting: This maps Slack's '_italic_' to Zulip's '*italic*'

    regex is a verbose-mode pattern with six groups: leading boundary (1),
    opening delimiter (2), content (3 and 4), closing delimiter (5) and
    trailing boundary (6).  Each match has its two delimiters replaced by
    zulip_keyword; the boundaries and the content are kept unchanged.

    Every match is rewritten at its own position only, and the trailing
    boundary of one match may serve as the leading boundary of the next, so
    adjacent spans such as '*a* *b*' are both converted.
    """
    pattern = re.compile(regex, re.VERBOSE)
    pieces = []
    copied_upto = 0  # everything in text before this index is already in pieces
    while True:
        # Searching from just after the previous closing delimiter leaves the
        # trailing boundary character available to the next match.  '^' still
        # only matches at the real start of text, not at this offset.
        match = pattern.search(text, copied_upto)
        if match is None:
            break
        # Unchanged text up to and including the leading boundary (group 1).
        pieces.append(text[copied_upto:match.start(2)])
        pieces.append(zulip_keyword + match.group(3) + match.group(4) + zulip_keyword)
        # Always advances (a match holds two delimiters and at least one
        # content character), so the loop terminates.
        copied_upto = match.end(5)
    pieces.append(text[copied_upto:])
    return ''.join(pieces)

def convert_link_format(text: str) -> Tuple[str, bool]:
    """
    Removes the angle brackets from every Slack link found by LINK_REGEX:
    1. Converts '<https://foo.com>' to 'https://foo.com'
    2. Converts '<https://foo.com|foo>' to 'https://foo.com|foo'

    Returns the converted text and whether at least one link was found.
    """
    # Matches are collected against the unmodified text before any
    # replacement is applied.
    wrapped_links = [found.group(0)
                     for found in re.finditer(LINK_REGEX, text, re.VERBOSE)]
    for wrapped in wrapped_links:
        unwrapped = ''.join(char for char in wrapped if char not in '<>')
        text = text.replace(wrapped, unwrapped)
    return text, bool(wrapped_links)

def convert_mailto_format(text: str) -> Tuple[str, bool]:
    """
    Replaces every Slack e-mail link found by SLACK_MAILTO_REGEX with its
    first group (the optional 'mailto:' prefix plus the address):
    1. Converts '<mailto:foo@foo.com>' to 'mailto:foo@foo.com'
    2. Converts '<mailto:foo@foo.com|foo@foo.com>' to 'mailto:foo@foo.com'

    Returns the converted text and whether at least one such link was found.
    """
    # Matches are collected against the unmodified text before any
    # replacement is applied.
    replacements = [(found.group(0), found.group(1))
                    for found in re.finditer(SLACK_MAILTO_REGEX, text, re.VERBOSE)]
    for wrapped, address in replacements:
        text = text.replace(wrapped, address)
    return text, len(replacements) > 0
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00			`import re`
import: Add slack import files in zerver/data_import directory. 2018-08-01 00:18:04 +02:00			`from typing import Any, Dict, Tuple, List, Optional`
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00
			`# stubs`
			`ZerverFieldsT = Dict[str, Any]`
slack: Rename added_users to slack_user_id_to_zulip_user_id. 2019-08-12 13:44:07 +02:00			`SlackToZulipUserIDT = Dict[str, int]`
Slack importer: Map Slack channel mentions to Zulip stream mentions. 2018-04-07 00:00:05 +02:00			`AddedChannelsT = Dict[str, Tuple[str, int]]`
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00
			`# Slack link can be in the format <http://www.foo.com\|www.foo.com> and <http://foo.com/>`
			`LINK_REGEX = r"""`
slack importer: Fix link regex and add test case. 2018-01-21 09:44:46 +01:00			`(<) # match '>'`
			`(http:\/\/www\.\|https:\/\/www\.\|http:\/\/\|https:\/\/\|ftp:\/\/)? # protocol and www`
			`([a-z0-9]+([\-\.]{1}[a-z0-9]+)*)(\.) # domain name`
			`([a-z]{2,63}(:[0-9]{1,5})?) # domain`
			`(\/[^>]*)? # path`
			`(\\|)?(?:\\|([^>]+))? # char after pipe (for slack links)`
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00			`(>)`
			`"""`
slack importer: Add regex for mail links. `<mailto:foo@foo.com>` is changed to `mailto:foo@foo.com`. 2018-01-21 12:30:54 +01:00
			`SLACK_MAILTO_REGEX = r"""`
			<((mailto:)? # match `<mailto:`
			`([\w\.-]+@[\w\.-]+(\.[\w]+)+)) # match email`
			`(\\|)? # match pipe`
			`([\w\.-]+@[\w\.-]+(\.[\w]+)+)?> # match email`
			`"""`

slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00			`SLACK_USERMENTION_REGEX = r"""`
			`(<@) # Start with '<@'`
			`([a-zA-Z0-9]+) # Here we have the Slack id`
			`(\\|)? # We not always have a Vertical line in mention`
			`([a-zA-Z0-9]+)? # If Vertical line is present, this is short name`
			`(>) # ends with '>'`
			`"""`
			`# Slack doesn't have mid-word message-formatting like Zulip.`
			`# Hence, ~stri~ke doesn't format the word in slack, but ~~stri~~ke`
			`# formats the word in Zulip`
			`SLACK_STRIKETHROUGH_REGEX = r"""`
			`(^\|[ -(]\|[+-/]\|\*\|\_\|[:-?]\|\{\|\[\|\\|\|\^) # Start after specified characters`
			`(\~) # followed by an asterisk`
			`([ -)+-}—]*)([ -}]+) # any character except asterisk`
			`(\~) # followed by an asterisk`
			`($\|[ -']\|[+-/]\|[:-?]\|\*\|\_\|\}\|\)\|\]\|\\|\|\^) # ends with specified characters`
			`"""`
			`SLACK_ITALIC_REGEX = r"""`
slack import: Add support for bold-italics formatting. Fixes #8927 2018-06-02 00:27:02 +02:00			`(^\|[ -*]\|[+-/]\|[:-?]\|\{\|\[\|\\|\|\^\|~)`
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00			`(\_)`
			([ -^`~—]*)([ -^`-~]+) # any character
			`(\_)`
slack import: Add support for bold-italics formatting. Fixes #8927 2018-06-02 00:27:02 +02:00			`($\|[ -']\|[+-/]\|[:-?]\|\}\|\)\|\]\|\*\|\\|\|\^\|~)`
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00			`"""`
			`SLACK_BOLD_REGEX = r"""`
slack import: Add support for bold-italics formatting. Fixes #8927 2018-06-02 00:27:02 +02:00			`(^\|[ -(]\|[+-/]\|[:-?]\|\{\|\[\|\_\|\\|\|\^\|~)`
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00			`(\*)`
			`([ -)+-~—]*)([ -)+-~]+) # any character`
			`(\*)`
slack import: Add support for bold-italics formatting. Fixes #8927 2018-06-02 00:27:02 +02:00			`($\|[ -']\|[+-/]\|[:-?]\|\}\|\)\|\]\|\_\|\\|\|\^\|~)`
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00			`"""`

			`def get_user_full_name(user: ZerverFieldsT) -> str:`
slack: Support importing shared channels. 2019-08-08 19:39:26 +02:00			`if "deleted" in user and user['deleted'] is False:`
			`return user['real_name'] or user['name']`
			`elif user["is_mirror_dummy"]:`
			`return user["profile"].get("real_name", user["name"])`
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00			`else:`
			`return user['name']`

			`# Markdown mapping`
			`def convert_to_zulip_markdown(text: str, users: List[ZerverFieldsT],`
Slack importer: Map Slack channel mentions to Zulip stream mentions. 2018-04-07 00:00:05 +02:00			`added_channels: AddedChannelsT,`
slack: Rename added_users to slack_user_id_to_zulip_user_id. 2019-08-12 13:44:07 +02:00			`slack_user_id_to_zulip_user_id: SlackToZulipUserIDT) -> \`
			`Tuple[str, List[int], bool]:`
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00			`mentioned_users_id = []`
			`text = convert_markdown_syntax(text, SLACK_BOLD_REGEX, "**")`
			`text = convert_markdown_syntax(text, SLACK_STRIKETHROUGH_REGEX, "~~")`
			`text = convert_markdown_syntax(text, SLACK_ITALIC_REGEX, "*")`

			`# Map Slack's mention all: '<!everyone>' to '@all '`
Slack importer: Map Slack command for mentions to Zulip's all. Fixes #9003. 2018-04-07 00:55:36 +02:00			`# Map Slack's mention all: '<!channel>' to '@all '`
			`# Map Slack's mention all: '<!here>' to '@all '`
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00			`# No regex for this as it can be present anywhere in the sentence`
			`text = text.replace('<!everyone>', '@all')`
Slack importer: Map Slack command for mentions to Zulip's all. Fixes #9003. 2018-04-07 00:55:36 +02:00			`text = text.replace('<!channel>', '@all')`
			`text = text.replace('<!here>', '@all')`
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00
Slack importer: Map Slack channel mentions to Zulip stream mentions. 2018-04-07 00:00:05 +02:00			`# Map Slack channel mention: '<#C5Z73A7RA\|general>' to '#general'`
			`for cname, ids in added_channels.items():`
			`cid = ids[0]`
			`text = text.replace('<#%s\|%s>' % (cid, cname), '#' + cname + '')`

slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00			`tokens = text.split(' ')`
			`for iterator in range(len(tokens)):`

			`# Check user mentions and change mention format from`
			`# '<@slack_id\|short_name>' to '@full_name'`
			`if (re.findall(SLACK_USERMENTION_REGEX, tokens[iterator], re.VERBOSE)):`
slack: Rename added_users to slack_user_id_to_zulip_user_id. 2019-08-12 13:44:07 +02:00			`tokens[iterator], user_id = get_user_mentions(tokens[iterator], users,`
			`slack_user_id_to_zulip_user_id)`
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00			`if user_id is not None:`
			`mentioned_users_id.append(user_id)`

			`text = ' '.join(tokens)`

			`# Check and convert link format`
			`text, has_link = convert_link_format(text)`
slack importer: Add regex for mail links. `<mailto:foo@foo.com>` is changed to `mailto:foo@foo.com`. 2018-01-21 12:30:54 +01:00			# convert `<mailto:foo@foo.com>` to `mailto:foo@foo.com`
slack importer: Add test for checking link in message conversion. 2018-01-28 14:37:35 +01:00			`text, has_mailto_link = convert_mailto_format(text)`
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00
slack importer: Add test for checking link in message conversion. 2018-01-28 14:37:35 +01:00			`if has_link is True or has_mailto_link is True:`
			`message_has_link = True`
			`else:`
			`message_has_link = False`

			`return text, mentioned_users_id, message_has_link`
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00
			`def get_user_mentions(token: str, users: List[ZerverFieldsT],`
slack: Rename added_users to slack_user_id_to_zulip_user_id. 2019-08-12 13:44:07 +02:00			`slack_user_id_to_zulip_user_id: SlackToZulipUserIDT) -> Tuple[str, Optional[int]]:`
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00			`slack_usermention_match = re.search(SLACK_USERMENTION_REGEX, token, re.VERBOSE)`
slack_message_conversion: Clean up type ignores. Signed-off-by: Anders Kaseorg <anders@zulipchat.com> 2019-08-10 00:30:33 +02:00			`assert slack_usermention_match is not None`
			`short_name = slack_usermention_match.group(4)`
			`slack_id = slack_usermention_match.group(2)`
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00			`for user in users:`
			`if (user['id'] == slack_id and user['name'] == short_name and short_name) or \`
			`(user['id'] == slack_id and short_name is None):`
tools: Upgrade Pycodestyle and fix new linter errors. Here, we are upgrading pycodestyle version from 2.4.0 to 2.5.0. Fixes: #11396. 2019-01-31 14:32:37 +01:00			`full_name = get_user_full_name(user)`
slack: Rename added_users to slack_user_id_to_zulip_user_id. 2019-08-12 13:44:07 +02:00			`user_id = slack_user_id_to_zulip_user_id[slack_id]`
tools: Upgrade Pycodestyle and fix new linter errors. Here, we are upgrading pycodestyle version from 2.4.0 to 2.5.0. Fixes: #11396. 2019-01-31 14:32:37 +01:00			`mention = "@" + full_name + ""`
			`token = re.sub(SLACK_USERMENTION_REGEX, mention, token, flags=re.VERBOSE)`
			`return token, user_id`
slack importer: Move message conversions to a new module. 2018-01-11 15:52:31 +01:00			`return token, None`

			`# Map italic, bold and strikethrough markdown`
			`def convert_markdown_syntax(text: str, regex: str, zulip_keyword: str) -> str:`
			`"""`
			`Returns:`
			`1. For strikethrough formatting: This maps Slack's '~strike~' to Zulip's '~~strike~~'`
			`2. For bold formatting: This maps Slack's 'bold' to Zulip's 'bold'`
			`3. For italic formatting: This maps Slack's '_italic_' to Zulip's 'italic'`
			`"""`
			`for match in re.finditer(regex, text, re.VERBOSE):`
			`converted_token = (match.group(1) + zulip_keyword + match.group(3)`
			`+ match.group(4) + zulip_keyword + match.group(6))`
			`text = text.replace(match.group(0), converted_token)`
			`return text`

			`def convert_link_format(text: str) -> Tuple[str, bool]:`
			`"""`
			`1. Converts '<https://foo.com>' to 'https://foo.com'`
			`2. Converts '<https://foo.com\|foo>' to 'https://foo.com\|foo'`
			`"""`
			`has_link = False`
			`for match in re.finditer(LINK_REGEX, text, re.VERBOSE):`
			`converted_text = match.group(0).replace('>', '').replace('<', '')`
			`has_link = True`
			`text = text.replace(match.group(0), converted_text)`
			`return text, has_link`
slack importer: Add regex for mail links. `<mailto:foo@foo.com>` is changed to `mailto:foo@foo.com`. 2018-01-21 12:30:54 +01:00
			`def convert_mailto_format(text: str) -> Tuple[str, bool]:`
			`"""`
			`1. Converts '<mailto:foo@foo.com>' to 'mailto:foo@foo.com'`
			`2. Converts '<mailto:foo@foo.com\|foo@foo.com>' to 'mailto:foo@foo.com'`
			`"""`
			`has_link = False`
			`for match in re.finditer(SLACK_MAILTO_REGEX, text, re.VERBOSE):`
			`has_link = True`
			`text = text.replace(match.group(0), match.group(1))`
			`return text, has_link`