rocketchat: Remove unnecessary SHA-1 hashing of direct message groups.

Signed-off-by: Anders Kaseorg <anders@zulip.com>
This commit is contained in:
Anders Kaseorg 2024-07-17 14:25:45 -07:00 committed by Tim Abbott
parent 541699a1c6
commit 722842a0aa
2 changed files with 10 additions and 32 deletions

View File

@ -1,4 +1,3 @@
import hashlib
import logging import logging
import os import os
import random import random
@ -886,23 +885,6 @@ def map_receiver_id_to_recipient_id(
user_id_to_recipient_id[recipient["type_id"]] = recipient["id"] user_id_to_recipient_id[recipient["type_id"]] = recipient["id"]
def get_string_direct_message_group_hash(id_list: list[str]) -> str:
    """Return a stable SHA-1 hex digest identifying a set of Rocket.Chat users.

    Modeled on get_direct_message_group_hash from
    zerver/models/recipients.py, but operating on Rocket.Chat string
    user IDs (e.g. `LdBZ7kPxtKESyHPEe`) rather than integer IDs.

    The digest is used as a key for deduplicating/merging Rocket.Chat
    threads that involve the same set of people. The only property that
    matters is that two ID lists describing the same set of users yield
    the same digest, regardless of ordering or repeated entries.
    """
    # Drop duplicates first, then order the IDs so that the joined key
    # depends only on the membership of the group.
    unique_ids = list(set(id_list))
    unique_ids.sort()

    digest = hashlib.sha1()
    digest.update(",".join(map(str, unique_ids)).encode())
    return digest.hexdigest()
def categorize_channels_and_map_with_id( def categorize_channels_and_map_with_id(
channel_data: list[dict[str, Any]], channel_data: list[dict[str, Any]],
room_id_to_room_map: dict[str, dict[str, Any]], room_id_to_room_map: dict[str, dict[str, Any]],
@ -912,18 +894,14 @@ def categorize_channels_and_map_with_id(
huddle_id_to_huddle_map: dict[str, dict[str, Any]], huddle_id_to_huddle_map: dict[str, dict[str, Any]],
livechat_id_to_livechat_map: dict[str, dict[str, Any]], livechat_id_to_livechat_map: dict[str, dict[str, Any]],
) -> None: ) -> None:
direct_message_group_hashed_channels: dict[str, Any] = {} direct_message_group_hashed_channels: dict[frozenset[str], Any] = {}
for channel in channel_data: for channel in channel_data:
if channel.get("prid"): if channel.get("prid"):
dsc_id_to_dsc_map[channel["_id"]] = channel dsc_id_to_dsc_map[channel["_id"]] = channel
elif channel["t"] == "d": elif channel["t"] == "d":
if len(channel["uids"]) > 2: if len(channel["uids"]) > 2:
direct_message_group_hash = get_string_direct_message_group_hash(channel["uids"]) direct_message_group_members = frozenset(channel["uids"])
logging.info( logging.info("Huddle channel found. UIDs: %r", channel["uids"])
"Huddle channel found. UIDs: %s -> hash %s",
channel["uids"],
direct_message_group_hash,
)
if channel["msgs"] == 0: # nocoverage if channel["msgs"] == 0: # nocoverage
# Rocket.Chat exports in the wild sometimes # Rocket.Chat exports in the wild sometimes
@ -935,15 +913,15 @@ def categorize_channels_and_map_with_id(
# value in Zulip's data model. # value in Zulip's data model.
logging.debug("Skipping direct message group with 0 messages: %s", channel) logging.debug("Skipping direct message group with 0 messages: %s", channel)
elif ( elif (
direct_message_group_hash in direct_message_group_hashed_channels direct_message_group_members in direct_message_group_hashed_channels
): # nocoverage ): # nocoverage
logging.info( logging.info(
"Mapping direct message group hash %s to existing channel: %s", "Mapping direct message group %r to existing channel: %s",
direct_message_group_hash, direct_message_group_members,
direct_message_group_hashed_channels[direct_message_group_hash], direct_message_group_hashed_channels[direct_message_group_members],
) )
huddle_id_to_huddle_map[channel["_id"]] = direct_message_group_hashed_channels[ huddle_id_to_huddle_map[channel["_id"]] = direct_message_group_hashed_channels[
direct_message_group_hash direct_message_group_members
] ]
# Ideally, we'd merge the duplicate direct message # Ideally, we'd merge the duplicate direct message
@ -961,7 +939,7 @@ def categorize_channels_and_map_with_id(
) )
else: else:
huddle_id_to_huddle_map[channel["_id"]] = channel huddle_id_to_huddle_map[channel["_id"]] = channel
direct_message_group_hashed_channels[direct_message_group_hash] = channel direct_message_group_hashed_channels[direct_message_group_members] = channel
else: else:
direct_id_to_direct_map[channel["_id"]] = channel direct_id_to_direct_map[channel["_id"]] = channel
elif channel["t"] == "l": elif channel["t"] == "l":

View File

@ -890,7 +890,7 @@ class RocketChatImporter(ZulipTestCase):
self.assertEqual( self.assertEqual(
info_log.output, info_log.output,
[ [
"INFO:root:Huddle channel found. UIDs: ['LdBZ7kPxtKESyHPEe', 'M2sXGqoQRJQwQoXY2', 'os6N2Xg2JkNMCSW9Z'] -> hash 752a5854d2b6eec337fe81f0066a5dd72c3f0639", "INFO:root:Huddle channel found. UIDs: ['LdBZ7kPxtKESyHPEe', 'M2sXGqoQRJQwQoXY2', 'os6N2Xg2JkNMCSW9Z']",
"INFO:root:Starting to process custom emoji", "INFO:root:Starting to process custom emoji",
"INFO:root:Done processing emoji", "INFO:root:Done processing emoji",
"INFO:root:skipping direct messages discussion mention: Discussion with Hermione", "INFO:root:skipping direct messages discussion mention: Discussion with Hermione",