slack import: Import long-inactive users as long-term idle.

This avoids creating UserMessage rows for long-inactive users in
organizations with many thousands of users.
This commit is contained in:
Tim Abbott 2018-12-16 17:26:51 -08:00
parent 37189e1f9d
commit 8a90441d2f
3 changed files with 100 additions and 15 deletions

View File

@ -264,9 +264,15 @@ def build_usermessages(zerver_usermessage: List[ZerverFieldsT],
subscriber_map: Dict[int, Set[int]],
recipient_id: int,
mentioned_user_ids: List[int],
message_id: int) -> None:
message_id: int,
long_term_idle: Optional[Set[int]]=None) -> Tuple[int, int]:
user_ids = subscriber_map.get(recipient_id, set())
if long_term_idle is None:
long_term_idle = set()
user_messages_created = 0
user_messages_skipped = 0
if user_ids:
for user_id in sorted(user_ids):
is_mentioned = user_id in mentioned_user_ids
@ -275,6 +281,12 @@ def build_usermessages(zerver_usermessage: List[ZerverFieldsT],
# It's possible we don't even get PMs from them.
is_private = False
if not is_mentioned and not is_private and user_id in long_term_idle:
# these users are long-term idle
user_messages_skipped += 1
continue
user_messages_created += 1
usermessage = build_user_message(
user_id=user_id,
message_id=message_id,
@ -283,6 +295,7 @@ def build_usermessages(zerver_usermessage: List[ZerverFieldsT],
)
zerver_usermessage.append(usermessage)
return (user_messages_created, user_messages_skipped)
def build_user_message(user_id: int,
message_id: int,

View File

@ -439,9 +439,61 @@ def get_subscription(channel_members: List[str], zerver_subscription: List[Zerve
subscription_id += 1
return subscription_id
def process_long_term_idle_users(slack_data_dir: str, users: List[ZerverFieldsT],
added_users: AddedUsersT, added_channels: AddedChannelsT,
zerver_userprofile: List[ZerverFieldsT]) -> Set[int]:
"""Algorithmically, we treat users who have sent at least 10 messages
or have sent a message within the last 60 days as active.
Everyone else is treated as long-term idle, which means they will
have a slighly slower first page load when coming back to
Zulip.
"""
all_messages = get_messages_iterator(slack_data_dir, added_channels)
sender_counts = defaultdict(int) # type: Dict[str, int]
recent_senders = set() # type: Set[str]
NOW = float(timezone_now().timestamp())
for message in all_messages:
timestamp = float(message['ts'])
slack_user_id = get_message_sending_user(message)
if not slack_user_id:
# Ignore messages without user names
continue
if slack_user_id in recent_senders:
continue
if NOW - timestamp < 60:
recent_senders.add(slack_user_id)
sender_counts[slack_user_id] += 1
for (slack_sender_id, count) in sender_counts.items():
if count > 10:
recent_senders.add(slack_sender_id)
long_term_idle = set()
for slack_user in users:
if slack_user["id"] in recent_senders:
continue
zulip_user_id = added_users[slack_user['id']]
long_term_idle.add(zulip_user_id)
# Record long-term idle status in zerver_userprofile
for user_profile_row in zerver_userprofile:
if user_profile_row['id'] in long_term_idle:
user_profile_row['long_term_idle'] = True
# Setting last_active_message_id to 1 means the user, if
# imported, will get the full message history for the
# streams they were on.
user_profile_row['last_active_message_id'] = 1
return long_term_idle
def convert_slack_workspace_messages(slack_data_dir: str, users: List[ZerverFieldsT], realm_id: int,
added_users: AddedUsersT, added_recipient: AddedRecipientsT,
added_channels: AddedChannelsT, realm: ZerverFieldsT,
zerver_userprofile: List[ZerverFieldsT],
zerver_realmemoji: List[ZerverFieldsT], domain_name: str,
output_dir: str,
chunk_size: int=MESSAGE_BATCH_CHUNK_SIZE) -> Tuple[List[ZerverFieldsT],
@ -453,8 +505,12 @@ def convert_slack_workspace_messages(slack_data_dir: str, users: List[ZerverFiel
2. uploads, which is a list of uploads to be mapped in uploads records.json
3. attachment, which is a list of the attachments
"""
all_messages = get_messages_iterator(slack_data_dir, added_channels)
long_term_idle = process_long_term_idle_users(slack_data_dir, users, added_users,
added_channels, zerver_userprofile)
# Now, we actually import the messages.
all_messages = get_messages_iterator(slack_data_dir, added_channels)
logging.info('######### IMPORTING MESSAGES STARTED #########\n')
total_reactions = [] # type: List[ZerverFieldsT]
@ -483,7 +539,7 @@ def convert_slack_workspace_messages(slack_data_dir: str, users: List[ZerverFiel
channel_message_to_zerver_message(
realm_id, users, added_users, added_recipient, message_data,
zerver_realmemoji, subscriber_map, added_channels,
domain_name)
domain_name, long_term_idle)
message_json = dict(
zerver_message=zerver_message,
@ -539,11 +595,12 @@ def channel_message_to_zerver_message(realm_id: int,
zerver_realmemoji: List[ZerverFieldsT],
subscriber_map: Dict[int, Set[int]],
added_channels: AddedChannelsT,
domain_name: str) -> Tuple[List[ZerverFieldsT],
List[ZerverFieldsT],
List[ZerverFieldsT],
List[ZerverFieldsT],
List[ZerverFieldsT]]:
domain_name: str,
long_term_idle: Set[int]) -> Tuple[List[ZerverFieldsT],
List[ZerverFieldsT],
List[ZerverFieldsT],
List[ZerverFieldsT],
List[ZerverFieldsT]]:
"""
Returns:
1. zerver_message, which is a list of the messages
@ -562,6 +619,8 @@ def channel_message_to_zerver_message(realm_id: int,
with open(NAME_TO_CODEPOINT_PATH) as fp:
name_to_codepoint = ujson.load(fp)
total_user_messages = 0
total_skipped_user_messages = 0
for message in all_messages:
user = get_message_sending_user(message)
if not user:
@ -639,14 +698,19 @@ def channel_message_to_zerver_message(realm_id: int,
zerver_message.append(zulip_message)
# construct usermessages
build_usermessages(
(num_created, num_skipped) = build_usermessages(
zerver_usermessage=zerver_usermessage,
subscriber_map=subscriber_map,
recipient_id=recipient_id,
mentioned_user_ids=mentioned_user_ids,
message_id=message_id,
long_term_idle=long_term_idle,
)
total_user_messages += num_created
total_skipped_user_messages += num_skipped
logging.debug("Created %s UserMessages; deferred %s due to long-term idle" % (
total_user_messages, total_skipped_user_messages))
return zerver_message, zerver_usermessage, zerver_attachment, uploads_list, \
reaction_list
@ -821,7 +885,7 @@ def do_convert_data(slack_zip_file: str, output_dir: str, token: str, threads: i
reactions, uploads_list, zerver_attachment = convert_slack_workspace_messages(
slack_data_dir, user_list, realm_id, added_users, added_recipient, added_channels,
realm, realm['zerver_realmemoji'], domain_name, output_dir)
realm, realm['zerver_userprofile'], realm['zerver_realmemoji'], domain_name, output_dir)
# Move zerver_reactions to realm.json file
realm['zerver_reaction'] = reactions

View File

@ -20,6 +20,8 @@ from zerver.data_import.slack import (
do_convert_data,
process_avatars,
process_message_files,
AddedChannelsT,
ZerverFieldsT,
)
from zerver.data_import.import_util import (
build_zerver_realm,
@ -58,7 +60,7 @@ import shutil
import requests
import os
import mock
from typing import Any, AnyStr, Dict, List, Optional, Set, Tuple
from typing import Any, AnyStr, Dict, List, Optional, Set, Tuple, Iterator
def remove_folder(path: str) -> None:
if os.path.exists(path):
@ -408,7 +410,7 @@ class SlackImporter(ZulipTestCase):
self.assertEqual(zerver_usermessage[3]['id'], um_id + 4)
self.assertEqual(zerver_usermessage[3]['message'], message_id)
@mock.patch("zerver.data_import.slack.build_usermessages", return_value = 2)
@mock.patch("zerver.data_import.slack.build_usermessages", return_value = (2, 4))
def test_channel_message_to_zerver_message(self, mock_build_usermessage: mock.Mock) -> None:
user_data = [{"id": "U066MTL5U", "name": "john doe", "deleted": False, "real_name": "John"},
@ -449,7 +451,7 @@ class SlackImporter(ZulipTestCase):
channel_message_to_zerver_message(
1, user_data, added_users, added_recipient,
all_messages, [], subscriber_map,
added_channels, 'domain')
added_channels, 'domain', set())
# functioning already tested in helper function
self.assertEqual(zerver_usermessage, [])
# subtype: channel_join is filtered
@ -491,9 +493,14 @@ class SlackImporter(ZulipTestCase):
mock_message: mock.Mock) -> None:
os.makedirs('var/test-slack-import', exist_ok=True)
added_channels = {'random': ('c5', 1), 'general': ('c6', 2)} # type: Dict[str, Tuple[str, int]]
time = float(timezone_now().timestamp())
zerver_message = [{'id': 1, 'ts': time}, {'id': 5, 'ts': time}]
def fake_get_messages_iter(slack_data_dir: str, added_channels: AddedChannelsT) -> Iterator[ZerverFieldsT]:
import copy
return iter(copy.deepcopy(zerver_message))
realm = {'zerver_subscription': []} # type: Dict[str, Any]
user_list = [] # type: List[Dict[str, Any]]
reactions = [{"name": "grinning", "users": ["U061A5N1G"], "count": 1}]
@ -501,14 +508,15 @@ class SlackImporter(ZulipTestCase):
zerver_usermessage = [{'id': 3}, {'id': 5}, {'id': 6}, {'id': 9}]
mock_get_messages_iterator.side_effect = [iter(zerver_message)]
mock_get_messages_iterator.side_effect = fake_get_messages_iter
mock_message.side_effect = [[zerver_message[:1], zerver_usermessage[:2],
attachments, uploads, reactions[:1]],
[zerver_message[1:2], zerver_usermessage[2:5],
attachments, uploads, reactions[1:1]]]
# Hacky: We should include a zerver_userprofile, not the empty []
test_reactions, uploads, zerver_attachment = convert_slack_workspace_messages(
'./random_path', user_list, 2, {}, {}, added_channels,
realm, [], 'domain', 'var/test-slack-import', chunk_size=1)
realm, [], [], 'domain', 'var/test-slack-import', chunk_size=1)
messages_file_1 = os.path.join('var', 'test-slack-import', 'messages-000001.json')
self.assertTrue(os.path.exists(messages_file_1))
messages_file_2 = os.path.join('var', 'test-slack-import', 'messages-000002.json')