mirror of https://github.com/zulip/zulip.git
slack import: Import long-inactive users as long-term idle.
This avoids creating UserMessage rows for long-inactive users in organizations with many thousands of users.
This commit is contained in:
parent
37189e1f9d
commit
8a90441d2f
|
@ -264,9 +264,15 @@ def build_usermessages(zerver_usermessage: List[ZerverFieldsT],
|
|||
subscriber_map: Dict[int, Set[int]],
|
||||
recipient_id: int,
|
||||
mentioned_user_ids: List[int],
|
||||
message_id: int) -> None:
|
||||
message_id: int,
|
||||
long_term_idle: Optional[Set[int]]=None) -> Tuple[int, int]:
|
||||
user_ids = subscriber_map.get(recipient_id, set())
|
||||
|
||||
if long_term_idle is None:
|
||||
long_term_idle = set()
|
||||
|
||||
user_messages_created = 0
|
||||
user_messages_skipped = 0
|
||||
if user_ids:
|
||||
for user_id in sorted(user_ids):
|
||||
is_mentioned = user_id in mentioned_user_ids
|
||||
|
@ -275,6 +281,12 @@ def build_usermessages(zerver_usermessage: List[ZerverFieldsT],
|
|||
# It's possible we don't even get PMs from them.
|
||||
is_private = False
|
||||
|
||||
if not is_mentioned and not is_private and user_id in long_term_idle:
|
||||
# these users are long-term idle
|
||||
user_messages_skipped += 1
|
||||
continue
|
||||
user_messages_created += 1
|
||||
|
||||
usermessage = build_user_message(
|
||||
user_id=user_id,
|
||||
message_id=message_id,
|
||||
|
@ -283,6 +295,7 @@ def build_usermessages(zerver_usermessage: List[ZerverFieldsT],
|
|||
)
|
||||
|
||||
zerver_usermessage.append(usermessage)
|
||||
return (user_messages_created, user_messages_skipped)
|
||||
|
||||
def build_user_message(user_id: int,
|
||||
message_id: int,
|
||||
|
|
|
@ -439,9 +439,61 @@ def get_subscription(channel_members: List[str], zerver_subscription: List[Zerve
|
|||
subscription_id += 1
|
||||
return subscription_id
|
||||
|
||||
def process_long_term_idle_users(slack_data_dir: str, users: List[ZerverFieldsT],
|
||||
added_users: AddedUsersT, added_channels: AddedChannelsT,
|
||||
zerver_userprofile: List[ZerverFieldsT]) -> Set[int]:
|
||||
"""Algorithmically, we treat users who have sent at least 10 messages
|
||||
or have sent a message within the last 60 days as active.
|
||||
Everyone else is treated as long-term idle, which means they will
|
||||
have a slighly slower first page load when coming back to
|
||||
Zulip.
|
||||
"""
|
||||
all_messages = get_messages_iterator(slack_data_dir, added_channels)
|
||||
|
||||
sender_counts = defaultdict(int) # type: Dict[str, int]
|
||||
recent_senders = set() # type: Set[str]
|
||||
NOW = float(timezone_now().timestamp())
|
||||
for message in all_messages:
|
||||
timestamp = float(message['ts'])
|
||||
slack_user_id = get_message_sending_user(message)
|
||||
if not slack_user_id:
|
||||
# Ignore messages without user names
|
||||
continue
|
||||
|
||||
if slack_user_id in recent_senders:
|
||||
continue
|
||||
|
||||
if NOW - timestamp < 60:
|
||||
recent_senders.add(slack_user_id)
|
||||
|
||||
sender_counts[slack_user_id] += 1
|
||||
for (slack_sender_id, count) in sender_counts.items():
|
||||
if count > 10:
|
||||
recent_senders.add(slack_sender_id)
|
||||
|
||||
long_term_idle = set()
|
||||
|
||||
for slack_user in users:
|
||||
if slack_user["id"] in recent_senders:
|
||||
continue
|
||||
zulip_user_id = added_users[slack_user['id']]
|
||||
long_term_idle.add(zulip_user_id)
|
||||
|
||||
# Record long-term idle status in zerver_userprofile
|
||||
for user_profile_row in zerver_userprofile:
|
||||
if user_profile_row['id'] in long_term_idle:
|
||||
user_profile_row['long_term_idle'] = True
|
||||
# Setting last_active_message_id to 1 means the user, if
|
||||
# imported, will get the full message history for the
|
||||
# streams they were on.
|
||||
user_profile_row['last_active_message_id'] = 1
|
||||
|
||||
return long_term_idle
|
||||
|
||||
def convert_slack_workspace_messages(slack_data_dir: str, users: List[ZerverFieldsT], realm_id: int,
|
||||
added_users: AddedUsersT, added_recipient: AddedRecipientsT,
|
||||
added_channels: AddedChannelsT, realm: ZerverFieldsT,
|
||||
zerver_userprofile: List[ZerverFieldsT],
|
||||
zerver_realmemoji: List[ZerverFieldsT], domain_name: str,
|
||||
output_dir: str,
|
||||
chunk_size: int=MESSAGE_BATCH_CHUNK_SIZE) -> Tuple[List[ZerverFieldsT],
|
||||
|
@ -453,8 +505,12 @@ def convert_slack_workspace_messages(slack_data_dir: str, users: List[ZerverFiel
|
|||
2. uploads, which is a list of uploads to be mapped in uploads records.json
|
||||
3. attachment, which is a list of the attachments
|
||||
"""
|
||||
all_messages = get_messages_iterator(slack_data_dir, added_channels)
|
||||
|
||||
long_term_idle = process_long_term_idle_users(slack_data_dir, users, added_users,
|
||||
added_channels, zerver_userprofile)
|
||||
|
||||
# Now, we actually import the messages.
|
||||
all_messages = get_messages_iterator(slack_data_dir, added_channels)
|
||||
logging.info('######### IMPORTING MESSAGES STARTED #########\n')
|
||||
|
||||
total_reactions = [] # type: List[ZerverFieldsT]
|
||||
|
@ -483,7 +539,7 @@ def convert_slack_workspace_messages(slack_data_dir: str, users: List[ZerverFiel
|
|||
channel_message_to_zerver_message(
|
||||
realm_id, users, added_users, added_recipient, message_data,
|
||||
zerver_realmemoji, subscriber_map, added_channels,
|
||||
domain_name)
|
||||
domain_name, long_term_idle)
|
||||
|
||||
message_json = dict(
|
||||
zerver_message=zerver_message,
|
||||
|
@ -539,7 +595,8 @@ def channel_message_to_zerver_message(realm_id: int,
|
|||
zerver_realmemoji: List[ZerverFieldsT],
|
||||
subscriber_map: Dict[int, Set[int]],
|
||||
added_channels: AddedChannelsT,
|
||||
domain_name: str) -> Tuple[List[ZerverFieldsT],
|
||||
domain_name: str,
|
||||
long_term_idle: Set[int]) -> Tuple[List[ZerverFieldsT],
|
||||
List[ZerverFieldsT],
|
||||
List[ZerverFieldsT],
|
||||
List[ZerverFieldsT],
|
||||
|
@ -562,6 +619,8 @@ def channel_message_to_zerver_message(realm_id: int,
|
|||
with open(NAME_TO_CODEPOINT_PATH) as fp:
|
||||
name_to_codepoint = ujson.load(fp)
|
||||
|
||||
total_user_messages = 0
|
||||
total_skipped_user_messages = 0
|
||||
for message in all_messages:
|
||||
user = get_message_sending_user(message)
|
||||
if not user:
|
||||
|
@ -639,14 +698,19 @@ def channel_message_to_zerver_message(realm_id: int,
|
|||
zerver_message.append(zulip_message)
|
||||
|
||||
# construct usermessages
|
||||
build_usermessages(
|
||||
(num_created, num_skipped) = build_usermessages(
|
||||
zerver_usermessage=zerver_usermessage,
|
||||
subscriber_map=subscriber_map,
|
||||
recipient_id=recipient_id,
|
||||
mentioned_user_ids=mentioned_user_ids,
|
||||
message_id=message_id,
|
||||
long_term_idle=long_term_idle,
|
||||
)
|
||||
total_user_messages += num_created
|
||||
total_skipped_user_messages += num_skipped
|
||||
|
||||
logging.debug("Created %s UserMessages; deferred %s due to long-term idle" % (
|
||||
total_user_messages, total_skipped_user_messages))
|
||||
return zerver_message, zerver_usermessage, zerver_attachment, uploads_list, \
|
||||
reaction_list
|
||||
|
||||
|
@ -821,7 +885,7 @@ def do_convert_data(slack_zip_file: str, output_dir: str, token: str, threads: i
|
|||
|
||||
reactions, uploads_list, zerver_attachment = convert_slack_workspace_messages(
|
||||
slack_data_dir, user_list, realm_id, added_users, added_recipient, added_channels,
|
||||
realm, realm['zerver_realmemoji'], domain_name, output_dir)
|
||||
realm, realm['zerver_userprofile'], realm['zerver_realmemoji'], domain_name, output_dir)
|
||||
|
||||
# Move zerver_reactions to realm.json file
|
||||
realm['zerver_reaction'] = reactions
|
||||
|
|
|
@ -20,6 +20,8 @@ from zerver.data_import.slack import (
|
|||
do_convert_data,
|
||||
process_avatars,
|
||||
process_message_files,
|
||||
AddedChannelsT,
|
||||
ZerverFieldsT,
|
||||
)
|
||||
from zerver.data_import.import_util import (
|
||||
build_zerver_realm,
|
||||
|
@ -58,7 +60,7 @@ import shutil
|
|||
import requests
|
||||
import os
|
||||
import mock
|
||||
from typing import Any, AnyStr, Dict, List, Optional, Set, Tuple
|
||||
from typing import Any, AnyStr, Dict, List, Optional, Set, Tuple, Iterator
|
||||
|
||||
def remove_folder(path: str) -> None:
|
||||
if os.path.exists(path):
|
||||
|
@ -408,7 +410,7 @@ class SlackImporter(ZulipTestCase):
|
|||
self.assertEqual(zerver_usermessage[3]['id'], um_id + 4)
|
||||
self.assertEqual(zerver_usermessage[3]['message'], message_id)
|
||||
|
||||
@mock.patch("zerver.data_import.slack.build_usermessages", return_value = 2)
|
||||
@mock.patch("zerver.data_import.slack.build_usermessages", return_value = (2, 4))
|
||||
def test_channel_message_to_zerver_message(self, mock_build_usermessage: mock.Mock) -> None:
|
||||
|
||||
user_data = [{"id": "U066MTL5U", "name": "john doe", "deleted": False, "real_name": "John"},
|
||||
|
@ -449,7 +451,7 @@ class SlackImporter(ZulipTestCase):
|
|||
channel_message_to_zerver_message(
|
||||
1, user_data, added_users, added_recipient,
|
||||
all_messages, [], subscriber_map,
|
||||
added_channels, 'domain')
|
||||
added_channels, 'domain', set())
|
||||
# functioning already tested in helper function
|
||||
self.assertEqual(zerver_usermessage, [])
|
||||
# subtype: channel_join is filtered
|
||||
|
@ -491,9 +493,14 @@ class SlackImporter(ZulipTestCase):
|
|||
mock_message: mock.Mock) -> None:
|
||||
os.makedirs('var/test-slack-import', exist_ok=True)
|
||||
added_channels = {'random': ('c5', 1), 'general': ('c6', 2)} # type: Dict[str, Tuple[str, int]]
|
||||
|
||||
time = float(timezone_now().timestamp())
|
||||
zerver_message = [{'id': 1, 'ts': time}, {'id': 5, 'ts': time}]
|
||||
|
||||
def fake_get_messages_iter(slack_data_dir: str, added_channels: AddedChannelsT) -> Iterator[ZerverFieldsT]:
|
||||
import copy
|
||||
return iter(copy.deepcopy(zerver_message))
|
||||
|
||||
realm = {'zerver_subscription': []} # type: Dict[str, Any]
|
||||
user_list = [] # type: List[Dict[str, Any]]
|
||||
reactions = [{"name": "grinning", "users": ["U061A5N1G"], "count": 1}]
|
||||
|
@ -501,14 +508,15 @@ class SlackImporter(ZulipTestCase):
|
|||
|
||||
zerver_usermessage = [{'id': 3}, {'id': 5}, {'id': 6}, {'id': 9}]
|
||||
|
||||
mock_get_messages_iterator.side_effect = [iter(zerver_message)]
|
||||
mock_get_messages_iterator.side_effect = fake_get_messages_iter
|
||||
mock_message.side_effect = [[zerver_message[:1], zerver_usermessage[:2],
|
||||
attachments, uploads, reactions[:1]],
|
||||
[zerver_message[1:2], zerver_usermessage[2:5],
|
||||
attachments, uploads, reactions[1:1]]]
|
||||
# Hacky: We should include a zerver_userprofile, not the empty []
|
||||
test_reactions, uploads, zerver_attachment = convert_slack_workspace_messages(
|
||||
'./random_path', user_list, 2, {}, {}, added_channels,
|
||||
realm, [], 'domain', 'var/test-slack-import', chunk_size=1)
|
||||
realm, [], [], 'domain', 'var/test-slack-import', chunk_size=1)
|
||||
messages_file_1 = os.path.join('var', 'test-slack-import', 'messages-000001.json')
|
||||
self.assertTrue(os.path.exists(messages_file_1))
|
||||
messages_file_2 = os.path.join('var', 'test-slack-import', 'messages-000002.json')
|
||||
|
|
Loading…
Reference in New Issue