diff --git a/zerver/data_import/import_util.py b/zerver/data_import/import_util.py index 2c6ad4a954..9061b1381b 100644 --- a/zerver/data_import/import_util.py +++ b/zerver/data_import/import_util.py @@ -264,9 +264,15 @@ def build_usermessages(zerver_usermessage: List[ZerverFieldsT], subscriber_map: Dict[int, Set[int]], recipient_id: int, mentioned_user_ids: List[int], - message_id: int) -> None: + message_id: int, + long_term_idle: Optional[Set[int]]=None) -> Tuple[int, int]: user_ids = subscriber_map.get(recipient_id, set()) + if long_term_idle is None: + long_term_idle = set() + + user_messages_created = 0 + user_messages_skipped = 0 if user_ids: for user_id in sorted(user_ids): is_mentioned = user_id in mentioned_user_ids @@ -275,6 +281,12 @@ def build_usermessages(zerver_usermessage: List[ZerverFieldsT], # It's possible we don't even get PMs from them. is_private = False + if not is_mentioned and not is_private and user_id in long_term_idle: + # these users are long-term idle + user_messages_skipped += 1 + continue + user_messages_created += 1 + usermessage = build_user_message( user_id=user_id, message_id=message_id, @@ -283,6 +295,7 @@ def build_usermessages(zerver_usermessage: List[ZerverFieldsT], ) zerver_usermessage.append(usermessage) + return (user_messages_created, user_messages_skipped) def build_user_message(user_id: int, message_id: int, diff --git a/zerver/data_import/slack.py b/zerver/data_import/slack.py index d5c3892288..a68242bab3 100755 --- a/zerver/data_import/slack.py +++ b/zerver/data_import/slack.py @@ -439,9 +439,61 @@ def get_subscription(channel_members: List[str], zerver_subscription: List[Zerve subscription_id += 1 return subscription_id +def process_long_term_idle_users(slack_data_dir: str, users: List[ZerverFieldsT], + added_users: AddedUsersT, added_channels: AddedChannelsT, + zerver_userprofile: List[ZerverFieldsT]) -> Set[int]: + """Algorithmically, we treat users who have sent at least 10 messages + or have sent a message within the last 60 days as active. + Everyone else is treated as long-term idle, which means they will + have a slighly slower first page load when coming back to + Zulip. + """ + all_messages = get_messages_iterator(slack_data_dir, added_channels) + + sender_counts = defaultdict(int) # type: Dict[str, int] + recent_senders = set() # type: Set[str] + NOW = float(timezone_now().timestamp()) + for message in all_messages: + timestamp = float(message['ts']) + slack_user_id = get_message_sending_user(message) + if not slack_user_id: + # Ignore messages without user names + continue + + if slack_user_id in recent_senders: + continue + + if NOW - timestamp < 60: + recent_senders.add(slack_user_id) + + sender_counts[slack_user_id] += 1 + for (slack_sender_id, count) in sender_counts.items(): + if count > 10: + recent_senders.add(slack_sender_id) + + long_term_idle = set() + + for slack_user in users: + if slack_user["id"] in recent_senders: + continue + zulip_user_id = added_users[slack_user['id']] + long_term_idle.add(zulip_user_id) + + # Record long-term idle status in zerver_userprofile + for user_profile_row in zerver_userprofile: + if user_profile_row['id'] in long_term_idle: + user_profile_row['long_term_idle'] = True + # Setting last_active_message_id to 1 means the user, if + # imported, will get the full message history for the + # streams they were on. + user_profile_row['last_active_message_id'] = 1 + + return long_term_idle + def convert_slack_workspace_messages(slack_data_dir: str, users: List[ZerverFieldsT], realm_id: int, added_users: AddedUsersT, added_recipient: AddedRecipientsT, added_channels: AddedChannelsT, realm: ZerverFieldsT, + zerver_userprofile: List[ZerverFieldsT], zerver_realmemoji: List[ZerverFieldsT], domain_name: str, output_dir: str, chunk_size: int=MESSAGE_BATCH_CHUNK_SIZE) -> Tuple[List[ZerverFieldsT], @@ -453,8 +505,12 @@ def convert_slack_workspace_messages(slack_data_dir: str, users: List[ZerverFiel 2. uploads, which is a list of uploads to be mapped in uploads records.json 3. attachment, which is a list of the attachments """ - all_messages = get_messages_iterator(slack_data_dir, added_channels) + long_term_idle = process_long_term_idle_users(slack_data_dir, users, added_users, + added_channels, zerver_userprofile) + + # Now, we actually import the messages. + all_messages = get_messages_iterator(slack_data_dir, added_channels) logging.info('######### IMPORTING MESSAGES STARTED #########\n') total_reactions = [] # type: List[ZerverFieldsT] @@ -483,7 +539,7 @@ def convert_slack_workspace_messages(slack_data_dir: str, users: List[ZerverFiel channel_message_to_zerver_message( realm_id, users, added_users, added_recipient, message_data, zerver_realmemoji, subscriber_map, added_channels, - domain_name) + domain_name, long_term_idle) message_json = dict( zerver_message=zerver_message, @@ -539,11 +595,12 @@ def channel_message_to_zerver_message(realm_id: int, zerver_realmemoji: List[ZerverFieldsT], subscriber_map: Dict[int, Set[int]], added_channels: AddedChannelsT, - domain_name: str) -> Tuple[List[ZerverFieldsT], - List[ZerverFieldsT], - List[ZerverFieldsT], - List[ZerverFieldsT], - List[ZerverFieldsT]]: + domain_name: str, + long_term_idle: Set[int]) -> Tuple[List[ZerverFieldsT], + List[ZerverFieldsT], + List[ZerverFieldsT], + List[ZerverFieldsT], + List[ZerverFieldsT]]: """ Returns: 1. zerver_message, which is a list of the messages @@ -562,6 +619,8 @@ def channel_message_to_zerver_message(realm_id: int, with open(NAME_TO_CODEPOINT_PATH) as fp: name_to_codepoint = ujson.load(fp) + total_user_messages = 0 + total_skipped_user_messages = 0 for message in all_messages: user = get_message_sending_user(message) if not user: @@ -639,14 +698,19 @@ def channel_message_to_zerver_message(realm_id: int, zerver_message.append(zulip_message) # construct usermessages - build_usermessages( + (num_created, num_skipped) = build_usermessages( zerver_usermessage=zerver_usermessage, subscriber_map=subscriber_map, recipient_id=recipient_id, mentioned_user_ids=mentioned_user_ids, message_id=message_id, + long_term_idle=long_term_idle, ) + total_user_messages += num_created + total_skipped_user_messages += num_skipped + logging.debug("Created %s UserMessages; deferred %s due to long-term idle" % ( + total_user_messages, total_skipped_user_messages)) return zerver_message, zerver_usermessage, zerver_attachment, uploads_list, \ reaction_list @@ -821,7 +885,7 @@ def do_convert_data(slack_zip_file: str, output_dir: str, token: str, threads: i reactions, uploads_list, zerver_attachment = convert_slack_workspace_messages( slack_data_dir, user_list, realm_id, added_users, added_recipient, added_channels, - realm, realm['zerver_realmemoji'], domain_name, output_dir) + realm, realm['zerver_userprofile'], realm['zerver_realmemoji'], domain_name, output_dir) # Move zerver_reactions to realm.json file realm['zerver_reaction'] = reactions diff --git a/zerver/tests/test_slack_importer.py b/zerver/tests/test_slack_importer.py index 484b5c07d0..e0a8948241 100644 --- a/zerver/tests/test_slack_importer.py +++ b/zerver/tests/test_slack_importer.py @@ -20,6 +20,8 @@ from zerver.data_import.slack import ( do_convert_data, process_avatars, process_message_files, + AddedChannelsT, + ZerverFieldsT, ) from zerver.data_import.import_util import ( build_zerver_realm, @@ -58,7 +60,7 @@ import shutil import requests import os import mock -from typing import Any, AnyStr, Dict, List, Optional, Set, Tuple +from typing import Any, AnyStr, Dict, List, Optional, Set, Tuple, Iterator def remove_folder(path: str) -> None: if os.path.exists(path): @@ -408,7 +410,7 @@ class SlackImporter(ZulipTestCase): self.assertEqual(zerver_usermessage[3]['id'], um_id + 4) self.assertEqual(zerver_usermessage[3]['message'], message_id) - @mock.patch("zerver.data_import.slack.build_usermessages", return_value = 2) + @mock.patch("zerver.data_import.slack.build_usermessages", return_value = (2, 4)) def test_channel_message_to_zerver_message(self, mock_build_usermessage: mock.Mock) -> None: user_data = [{"id": "U066MTL5U", "name": "john doe", "deleted": False, "real_name": "John"}, @@ -449,7 +451,7 @@ class SlackImporter(ZulipTestCase): channel_message_to_zerver_message( 1, user_data, added_users, added_recipient, all_messages, [], subscriber_map, - added_channels, 'domain') + added_channels, 'domain', set()) # functioning already tested in helper function self.assertEqual(zerver_usermessage, []) # subtype: channel_join is filtered @@ -491,9 +493,14 @@ class SlackImporter(ZulipTestCase): mock_message: mock.Mock) -> None: os.makedirs('var/test-slack-import', exist_ok=True) added_channels = {'random': ('c5', 1), 'general': ('c6', 2)} # type: Dict[str, Tuple[str, int]] + time = float(timezone_now().timestamp()) zerver_message = [{'id': 1, 'ts': time}, {'id': 5, 'ts': time}] + def fake_get_messages_iter(slack_data_dir: str, added_channels: AddedChannelsT) -> Iterator[ZerverFieldsT]: + import copy + return iter(copy.deepcopy(zerver_message)) + realm = {'zerver_subscription': []} # type: Dict[str, Any] user_list = [] # type: List[Dict[str, Any]] reactions = [{"name": "grinning", "users": ["U061A5N1G"], "count": 1}] @@ -501,14 +508,15 @@ class SlackImporter(ZulipTestCase): zerver_usermessage = [{'id': 3}, {'id': 5}, {'id': 6}, {'id': 9}] - mock_get_messages_iterator.side_effect = [iter(zerver_message)] + mock_get_messages_iterator.side_effect = fake_get_messages_iter mock_message.side_effect = [[zerver_message[:1], zerver_usermessage[:2], attachments, uploads, reactions[:1]], [zerver_message[1:2], zerver_usermessage[2:5], attachments, uploads, reactions[1:1]]] + # Hacky: We should include a zerver_userprofile, not the empty [] test_reactions, uploads, zerver_attachment = convert_slack_workspace_messages( './random_path', user_list, 2, {}, {}, added_channels, - realm, [], 'domain', 'var/test-slack-import', chunk_size=1) + realm, [], [], 'domain', 'var/test-slack-import', chunk_size=1) messages_file_1 = os.path.join('var', 'test-slack-import', 'messages-000001.json') self.assertTrue(os.path.exists(messages_file_1)) messages_file_2 = os.path.join('var', 'test-slack-import', 'messages-000002.json')