slack_regex: Fix overlapping capture group in Slack regex.

The Slack text regexes match specific characters before and after the formatted string to prevent cases like reformatting already-formatted text and tests like `invlaid_bold_test` in `slack_message_conversion.json`. However, because the capture groups consume each matching character, two formatted strings separated by a single matching character result in one string not being matched, as the character is used to match the other string. For example, in (`*abc*b*def*` → **abc**b*def*), "b" is consumed to match *abc*. This prep commit modifies the Slack text regexes in `slack_message_conversion.py` to replace the last capture group with a lookahead assertion, so the trailing character is checked without being consumed.
2024-11-18 14:18:47 +07:00 · 2024-11-18 14:18:47 +07:00 · 8764e15599
parent 5c8f10179a
commit 8764e15599
2 changed files with 9 additions and 6 deletions
--- a/zerver/data_import/slack_message_conversion.py
+++ b/zerver/data_import/slack_message_conversion.py
@@ -52,21 +52,21 @@ SLACK_STRIKETHROUGH_REGEX = r"""
                             (\~)                                  # followed by an ~
                                ([^~]*)            # any character except ~
                             (\~)                                  # followed by an ~
-                             (\n|$|[ -']|[+-/]|[:-?]|\*|\_|\}|\)|\]|\||\^)  # ends with specified characters
+                             (?=\n|$|[ -']|[+-/]|[:-?]|\*|\_|\}|\)|\]|\||\^)  # ends with specified characters
                             """
 SLACK_ITALIC_REGEX = r"""
                      (\n|^|[ -*]|[+-/]|[:-?]|\{|\[|\||\^|~)
                      (\_)
                          ([^_]*)                # any character except _
                      (\_)
-                      (\n|$|[ -']|[+-/]|[:-?]|\}|\)|\]|\*|\||\^|~)
+                      (?=\n|$|[ -']|[+-/]|[:-?]|\}|\)|\]|\*|\||\^|~)
                      """
 SLACK_BOLD_REGEX = r"""
                    (\n|^|[ -(]|[+-/]|[:-?]|\{|\[|\_|\||\^|~)
                    (\*)
                        ([^*]*)                 # any character except *
                    (\*)
-                    (\n|$|[ -']|[+-/]|[:-?]|\}|\)|\]|\_|\||\^|~)
+                    (?=\n|$|[ -']|[+-/]|[:-?]|\}|\)|\]|\_|\||\^|~)
                    """
@@ -139,9 +139,7 @@ def convert_markdown_syntax(text: str, regex: str, zulip_keyword: str) -> str:
    3. For italic formatting: This maps Slack's '_italic_' to Zulip's '*italic*'
    """
    for match in re.finditer(regex, text, re.VERBOSE):
-        converted_token = (
+        converted_token = match.group(1) + zulip_keyword + match.group(3) + zulip_keyword
-            match.group(1) + zulip_keyword + match.group(3) + zulip_keyword + match.group(5)
-        )
        text = text.replace(match.group(0), converted_token)
    return text
--- a/zerver/tests/fixtures/slack_message_conversion.json
+++ b/zerver/tests/fixtures/slack_message_conversion.json
@@ -94,6 +94,11 @@
      "name": "format_emoji_test",
      "input": "*1️⃣ bold* some _1️⃣ italic_ word ~1️⃣ strike~",
      "conversion_output": "**1️⃣ bold** some *1️⃣ italic* word ~~1️⃣ strike~~"
    },
    {
      "name": "overlapping_capture_group_test",
      "input": "*abc*\n*def*\n*ghi*\n*jkl*\n\n*ABC ABC*",
      "conversion_output": "**abc**\n**def**\n**ghi**\n**jkl**\n\n**ABC ABC**"
    }
  ]
 }