slack_data_import: Improve topic names for imported Slack threads.

In this commit, the topic names for imported Slack threads now include
extra details like a snippet of the original message and the thread's
date.

`thread_key` is used to map the converted thread conversations because
mapping with `thread_ts_str` had a small chance of combining threads
with the same timestamp under one topic.

Fixes #27661.
This commit is contained in:
PieterCK 2024-05-24 17:05:03 +07:00
parent fd647ae16f
commit 435ad8a111
3 changed files with 84 additions and 21 deletions

View File

@ -154,8 +154,8 @@ in mind about the import process:
| Multi Channel Guest | Guest | | Multi Channel Guest | Guest |
| Channel creator | none | | Channel creator | none |
- Slack threads are imported as topics with names like "2023-05-30 - Slack threads are imported as topics with names that include snippets of the
Slack thread 1". original message, such as "2023-05-30 Hi, can anyone reply if you're o…".
- Message edit history and `@user joined #channel_name` messages are not imported. - Message edit history and `@user joined #channel_name` messages are not imported.

View File

@ -874,6 +874,37 @@ def get_messages_iterator(
yield from sorted(messages_for_one_day, key=get_timestamp_from_message) yield from sorted(messages_for_one_day, key=get_timestamp_from_message)
def get_parent_user_id(message: ZerverFieldsT, subtype: str) -> str | None:
"""Retrieves the parent user ID based on message subtype."""
if subtype == "thread_broadcast":
return message.get("root", {}).get("user")
return message.get("parent_user_id")
def get_zulip_thread_topic_name(
message: ZerverFieldsT, thread_ts: datetime, thread_counter: dict[str, int]
) -> str:
"""
The topic name format is date + message snippet + counter.
e.g "2024-05-22 Hello this is a long message that will be c... (1)"
"""
THREAD_TOPIC_SNIPPET_LENGTH = 45
thread_snippet = (
message["text"]
if len(message["text"]) <= THREAD_TOPIC_SNIPPET_LENGTH
else message["text"][: THREAD_TOPIC_SNIPPET_LENGTH - 3] + "..."
)
thread_date = thread_ts.strftime(r"%Y-%m-%d")
base_zulip_topic_name = thread_date + thread_snippet
collision = thread_counter[base_zulip_topic_name]
thread_counter[base_zulip_topic_name] += 1
count = (f" ({collision+1})") if collision > 0 else ""
return f"{thread_date} {thread_snippet}{count}"
def channel_message_to_zerver_message( def channel_message_to_zerver_message(
realm_id: int, realm_id: int,
realm: ZerverFieldsT, realm: ZerverFieldsT,
@ -1012,25 +1043,23 @@ def channel_message_to_zerver_message(
has_image = file_info["has_image"] has_image = file_info["has_image"]
# Slack's unthreaded messages go into a single topic, while # Slack's unthreaded messages go into a single topic, while
# threads each generate a unique topic labeled by the date and # threads each generate a unique topic labeled by the date,
# a counter among topics on that day. # a snippet of the original message and a counter if there
# are any thread with the same topic name
topic_name = MAIN_IMPORT_TOPIC topic_name = MAIN_IMPORT_TOPIC
if convert_slack_threads and "thread_ts" in message: if convert_slack_threads and "thread_ts" in message:
thread_ts = datetime.fromtimestamp(float(message["thread_ts"]), tz=timezone.utc) thread_ts = datetime.fromtimestamp(float(message["thread_ts"]), tz=timezone.utc)
message_ts = datetime.fromtimestamp(float(message["ts"]), tz=timezone.utc) message_ts = datetime.fromtimestamp(float(message["ts"]), tz=timezone.utc)
thread_ts_str = thread_ts.strftime(r"%Y/%m/%d %H:%M:%S") thread_ts_str = thread_ts.strftime(r"%Y/%m/%d %H:%M:%S")
parent_user_id = get_parent_user_id(message, subtype)
thread_key = f"{thread_ts_str}-{parent_user_id}"
if thread_ts == message_ts: if thread_ts == message_ts:
# If the message is at the start of a thread, send it to the # If the message is at the start of a thread, send it to the
# main import channel and append a cross-linking notification # main import channel and append a cross-linking notification
# message to it. # message to it.
thread_topic_name = get_zulip_thread_topic_name(message, thread_ts, thread_counter)
# The topic name is "2015-08-18 Slack thread 2", where the counter at the end is to disambiguate
# threads with the same date.
thread_date = thread_ts.strftime(r"%Y-%m-%d")
thread_counter[thread_date] += 1
count = thread_counter[thread_date]
thread_topic_name = f"{thread_date} Slack thread {count}"
thread_topic_link_str = f"#**{import_channel_name}>{thread_topic_name}**" thread_topic_link_str = f"#**{import_channel_name}>{thread_topic_name}**"
message_dict = { message_dict = {
@ -1040,8 +1069,9 @@ def channel_message_to_zerver_message(
"topic": topic_name, "topic": topic_name,
"display_recipient": import_channel_name, "display_recipient": import_channel_name,
} }
thread_key = f"{thread_ts_str}-{slack_user_id}"
thread_map[thread_ts_str] = { thread_map[thread_key] = {
"thread_topic_name": thread_topic_name, "thread_topic_name": thread_topic_name,
"thread_head_message_index": len(zerver_message), "thread_head_message_index": len(zerver_message),
"thread_topic_link_str": thread_topic_link_str, "thread_topic_link_str": thread_topic_link_str,
@ -1052,11 +1082,10 @@ def channel_message_to_zerver_message(
), ),
"thread_head_message": content, "thread_head_message": content,
} }
elif thread_key in thread_map:
elif thread_ts_str in thread_map:
# The first thread reply will quote and reply to the original # The first thread reply will quote and reply to the original
# thread message/thread head in the main import topic. # thread message/thread head in the main import topic.
thread_metadata = thread_map[thread_ts_str] thread_metadata = thread_map[thread_key]
topic_name = thread_metadata.get("thread_topic_name", topic_name) topic_name = thread_metadata.get("thread_topic_name", topic_name)
if thread_metadata["thread_length"] == 0: if thread_metadata["thread_length"] == 0:
thread_metadata.update({"first_thread_reply": content}) thread_metadata.update({"first_thread_reply": content})

View File

@ -1140,6 +1140,7 @@ class SlackImporter(ZulipTestCase):
"user": "U061A5N1G", "user": "U061A5N1G",
"ts": "1434139102.000002", "ts": "1434139102.000002",
# Start of thread 1! # Start of thread 1!
"parent_user_id": "U061A5N1G",
"thread_ts": "1434139102.000002", "thread_ts": "1434139102.000002",
"channel_name": "random", "channel_name": "random",
}, },
@ -1148,6 +1149,7 @@ class SlackImporter(ZulipTestCase):
"user": "U061A5N1G", "user": "U061A5N1G",
"ts": "1439868294.000007", "ts": "1439868294.000007",
# A reply to thread 1 # A reply to thread 1
"parent_user_id": "U061A5N1G",
"thread_ts": "1434139102.000002", "thread_ts": "1434139102.000002",
"channel_name": "random", "channel_name": "random",
}, },
@ -1156,6 +1158,7 @@ class SlackImporter(ZulipTestCase):
"user": "U061A5N1G", "user": "U061A5N1G",
"ts": "1439868294.000008", "ts": "1439868294.000008",
# Start of thread 2! # Start of thread 2!
"parent_user_id": "U061A5N1G",
"thread_ts": "1439868294.000008", "thread_ts": "1439868294.000008",
"channel_name": "random", "channel_name": "random",
}, },
@ -1164,6 +1167,7 @@ class SlackImporter(ZulipTestCase):
"user": "U061A1R2R", "user": "U061A1R2R",
"ts": "1439869294.000008", "ts": "1439869294.000008",
# A reply to thread 2 # A reply to thread 2
"parent_user_id": "U061A5N1G",
"thread_ts": "1439868294.000008", "thread_ts": "1439868294.000008",
"channel_name": "random", "channel_name": "random",
}, },
@ -1172,6 +1176,7 @@ class SlackImporter(ZulipTestCase):
"user": "U061A1R2R", "user": "U061A1R2R",
"ts": "1439869494.000008", "ts": "1439869494.000008",
# A reply to thread 2 # A reply to thread 2
"parent_user_id": "U061A5N1G",
"thread_ts": "1439868294.000008", "thread_ts": "1439868294.000008",
"channel_name": "random", "channel_name": "random",
}, },
@ -1180,6 +1185,7 @@ class SlackImporter(ZulipTestCase):
"user": "U061A5N1G", "user": "U061A5N1G",
"ts": "1439868295.000008", "ts": "1439868295.000008",
# Start of thread 3! # Start of thread 3!
"parent_user_id": "U061A5N1G",
"thread_ts": "1439868295.000008", "thread_ts": "1439868295.000008",
"channel_name": "random", "channel_name": "random",
}, },
@ -1189,6 +1195,7 @@ class SlackImporter(ZulipTestCase):
"ts": "1439869295.000008", "ts": "1439869295.000008",
"subtype": "thread_broadcast", "subtype": "thread_broadcast",
# A broadcasted thread reply in thread 3! # A broadcasted thread reply in thread 3!
"root": {"user": "U061A5N1G"},
"thread_ts": "1439868295.000008", "thread_ts": "1439868295.000008",
"channel_name": "random", "channel_name": "random",
}, },
@ -1198,9 +1205,28 @@ class SlackImporter(ZulipTestCase):
"ts": "1439869395.000008", "ts": "1439869395.000008",
"subtype": "thread_broadcast", "subtype": "thread_broadcast",
# Another broadcasted thread reply in thread 3! # Another broadcasted thread reply in thread 3!
"root": {"user": "U061A5N1G"},
"thread_ts": "1439868295.000008", "thread_ts": "1439868295.000008",
"channel_name": "random", "channel_name": "random",
}, },
{
"text": "random",
"user": "U061A1R2R",
"ts": "1439868294.000008",
# Start of thread 4!
"parent_user_id": "U061A1R2R",
"thread_ts": "1439868294.000008",
"channel_name": "random",
},
{
"text": "replying to the fourth thread :)",
"user": "U061A5N1G",
"ts": "1439869294.000008",
# A reply to thread 4
"parent_user_id": "U061A1R2R",
"thread_ts": "1439868294.000008",
"channel_name": "random",
},
] ]
slack_recipient_name_to_zulip_recipient_id = { slack_recipient_name_to_zulip_recipient_id = {
@ -1246,7 +1272,7 @@ class SlackImporter(ZulipTestCase):
# functioning already tested in helper function # functioning already tested in helper function
self.assertEqual(zerver_usermessage, []) self.assertEqual(zerver_usermessage, [])
# subtype: channel_join is filtered # subtype: channel_join is filtered
self.assert_length(zerver_message, 11) self.assert_length(zerver_message, 13)
self.assertEqual(uploads, []) self.assertEqual(uploads, [])
self.assertEqual(attachment, []) self.assertEqual(attachment, [])
@ -1257,7 +1283,7 @@ class SlackImporter(ZulipTestCase):
# Original thread message in the main topic will have additional cross-linking # Original thread message in the main topic will have additional cross-linking
# message appended to it # message appended to it
thread1_topic_name = "2015-06-12 Slack thread 1" thread1_topic_name = "2015-06-12 message body text"
original_thread1_message = ( original_thread1_message = (
f"message body text\n\n*1 reply in #**random>{thread1_topic_name}***" f"message body text\n\n*1 reply in #**random>{thread1_topic_name}***"
) )
@ -1279,7 +1305,7 @@ random
self.assertEqual(zerver_message[2][EXPORT_TOPIC_NAME], thread1_topic_name) self.assertEqual(zerver_message[2][EXPORT_TOPIC_NAME], thread1_topic_name)
# A new thread with a different date from 2015-06-12, starts the counter from 1. # A new thread with a different date from 2015-06-12, starts the counter from 1.
thread2_topic_name = "2015-08-18 Slack thread 1" thread2_topic_name = "2015-08-18 random"
original_thread2_message = f"random\n\n*2 replies in #**random>{thread2_topic_name}***" original_thread2_message = f"random\n\n*2 replies in #**random>{thread2_topic_name}***"
original_thread2_message_id = zerver_message[3]["id"] original_thread2_message_id = zerver_message[3]["id"]
self.assertEqual(zerver_message[3]["content"], original_thread2_message) self.assertEqual(zerver_message[3]["content"], original_thread2_message)
@ -1300,7 +1326,7 @@ replying to the second thread :)
self.assertEqual(zerver_message[5][EXPORT_TOPIC_NAME], thread2_topic_name) self.assertEqual(zerver_message[5][EXPORT_TOPIC_NAME], thread2_topic_name)
# The third thread is to test how broadcasted thread replies are converted. # The third thread is to test how broadcasted thread replies are converted.
thread3_topic_name = "2015-08-18 Slack thread 2" thread3_topic_name = "2015-08-18 original message for the third thread"
original_thread3_message = f"original message for the third thread\n\n*2 replies in #**random>{thread3_topic_name}***" original_thread3_message = f"original message for the third thread\n\n*2 replies in #**random>{thread3_topic_name}***"
original_thread3_message_id = zerver_message[6]["id"] original_thread3_message_id = zerver_message[6]["id"]
self.assertEqual(zerver_message[6]["content"], original_thread3_message) self.assertEqual(zerver_message[6]["content"], original_thread3_message)
@ -1318,7 +1344,7 @@ original message for the third thread
self.assertEqual(zerver_message[7]["content"], thread3_reply_1) self.assertEqual(zerver_message[7]["content"], thread3_reply_1)
self.assertEqual(zerver_message[7][EXPORT_TOPIC_NAME], thread3_topic_name) self.assertEqual(zerver_message[7][EXPORT_TOPIC_NAME], thread3_topic_name)
thread3_broadcasted_reply_1 = f""" thread3_broadcasted_reply_1 = f"""
*replied to a Slack thread: [{thread3_topic_name}](http://test-realm.testserver/#narrow/channel/2-random/topic/2015-08-18.20Slack.20thread.202/near/{thread3_reply_1_id})* *replied to a Slack thread: [{thread3_topic_name}](http://test-realm.testserver/#narrow/channel/2-random/topic/2015-08-18.20original.20message.20for.20the.20third.20thread/near/{thread3_reply_1_id})*
{thread3_reply_1_content} {thread3_reply_1_content}
""" """
@ -1331,7 +1357,7 @@ original message for the third thread
self.assertEqual(zerver_message[9]["content"], thread3_reply_2) self.assertEqual(zerver_message[9]["content"], thread3_reply_2)
self.assertEqual(zerver_message[9][EXPORT_TOPIC_NAME], thread3_topic_name) self.assertEqual(zerver_message[9][EXPORT_TOPIC_NAME], thread3_topic_name)
thread3_broadcasted_reply_2 = f""" thread3_broadcasted_reply_2 = f"""
*replied to a Slack thread: [{thread3_topic_name}](http://test-realm.testserver/#narrow/channel/2-random/topic/2015-08-18.20Slack.20thread.202/near/{thread3_reply_2_id})* *replied to a Slack thread: [{thread3_topic_name}](http://test-realm.testserver/#narrow/channel/2-random/topic/2015-08-18.20original.20message.20for.20the.20third.20thread/near/{thread3_reply_2_id})*
{thread3_reply_2} {thread3_reply_2}
""" """
@ -1339,6 +1365,14 @@ original message for the third thread
self.assertEqual(zerver_message[10][EXPORT_TOPIC_NAME], main_import_topic) self.assertEqual(zerver_message[10][EXPORT_TOPIC_NAME], main_import_topic)
self.assertIn(thread3_reply_2, zerver_message[10]["content"]) self.assertIn(thread3_reply_2, zerver_message[10]["content"])
# The fourth thread is to test how colliding thread topics are handled.
# Its topic will collide with thread two topic.
thread4_topic_name = "2015-08-18 random (2)"
original_thread4_message = f"random\n\n*1 reply in #**random>{thread4_topic_name}***"
self.assertEqual(zerver_message[11]["content"], original_thread4_message)
self.assertEqual(zerver_message[11][EXPORT_TOPIC_NAME], main_import_topic)
self.assertEqual(zerver_message[12][EXPORT_TOPIC_NAME], thread4_topic_name)
self.assertEqual( self.assertEqual(
zerver_message[1]["recipient"], slack_recipient_name_to_zulip_recipient_id["random"] zerver_message[1]["recipient"], slack_recipient_name_to_zulip_recipient_id["random"]
) )