slack_regex: Fix overlapping capture group in Slack regex.

The Slack text regexes match specific characters before and after the
formatted string to avoid reformatting already formatted text and to
handle cases such as the `invlaid_bold_test` test in
`slack_message_conversion.json`.

However, because the capture groups consume each matching character, two
formatted strings separated by a single matching character result in one
string not being matched, as the character is used to match the other
string.

e.g., (`*abc*b*def*` → **abc**b*def*) in this case, "b" is consumed to
match *abc*.

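This prep commit modifies the Slack text regexes in
`slack_message_conversion.py` to use a lookahead assertion in place of
the last capture group, so the trailing character is checked without
being consumed and remains available to the next match.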
This commit is contained in:
PieterCK 2024-11-18 14:18:47 +07:00
parent 5c8f10179a
commit 8764e15599
2 changed files with 9 additions and 6 deletions

View File

@ -52,21 +52,21 @@ SLACK_STRIKETHROUGH_REGEX = r"""
(\~) # followed by an ~
([^~]*) # any character except ~
(\~) # followed by an ~
(\n|$|[ -']|[+-/]|[:-?]|\*|\_|\}|\)|\]|\||\^) # ends with specified characters
(?=\n|$|[ -']|[+-/]|[:-?]|\*|\_|\}|\)|\]|\||\^) # ends with specified characters
"""
SLACK_ITALIC_REGEX = r"""
(\n|^|[ -*]|[+-/]|[:-?]|\{|\[|\||\^|~)
(\_)
([^_]*) # any character except _
(\_)
(\n|$|[ -']|[+-/]|[:-?]|\}|\)|\]|\*|\||\^|~)
(?=\n|$|[ -']|[+-/]|[:-?]|\}|\)|\]|\*|\||\^|~)
"""
SLACK_BOLD_REGEX = r"""
(\n|^|[ -(]|[+-/]|[:-?]|\{|\[|\_|\||\^|~)
(\*)
([^*]*) # any character except *
(\*)
(\n|$|[ -']|[+-/]|[:-?]|\}|\)|\]|\_|\||\^|~)
(?=\n|$|[ -']|[+-/]|[:-?]|\}|\)|\]|\_|\||\^|~)
"""
@ -139,9 +139,7 @@ def convert_markdown_syntax(text: str, regex: str, zulip_keyword: str) -> str:
3. For italic formatting: This maps Slack's '_italic_' to Zulip's '*italic*'
"""
for match in re.finditer(regex, text, re.VERBOSE):
converted_token = (
match.group(1) + zulip_keyword + match.group(3) + zulip_keyword + match.group(5)
)
converted_token = match.group(1) + zulip_keyword + match.group(3) + zulip_keyword
text = text.replace(match.group(0), converted_token)
return text

View File

@ -94,6 +94,11 @@
"name": "format_emoji_test",
"input": "*1⃣ bold* some _1⃣ italic_ word ~1⃣ strike~",
"conversion_output": "**1⃣ bold** some *1⃣ italic* word ~~1⃣ strike~~"
},
{
"name": "overlapping_capture_group_test",
"input": "*abc*\n*def*\n*ghi*\n*jkl*\n\n*ABC ABC*",
"conversion_output": "**abc**\n**def**\n**ghi**\n**jkl**\n\n**ABC ABC**"
}
]
}