import base64
import glob
import logging
import os
import re
import shutil
import subprocess
from typing import Any, Callable, Dict, List, Optional, Set

import dateutil
import hypchat
import ujson
from django.conf import settings
from django.utils.timezone import now as timezone_now

from zerver.data_import.hipchat_attachment import AttachmentHandler
from zerver.data_import.hipchat_user import UserHandler
from zerver.data_import.import_util import (
    SubscriberHandler,
    build_message,
    build_personal_subscriptions,
    build_public_stream_subscriptions,
    build_realm,
    build_realm_emoji,
    build_recipients,
    build_stream,
    build_stream_subscriptions,
    build_user_profile,
    build_zerver_realm,
    create_converted_data_files,
    make_subscriber_map,
    make_user_messages,
    write_avatar_png,
)
from zerver.data_import.sequencer import NEXT_ID, IdMapper
from zerver.lib.utils import process_list_in_batches
from zerver.models import RealmEmoji, Recipient, UserProfile

# stubs
ZerverFieldsT = Dict[str, Any]

def str_date_to_float(date_str: str) -> float:
    '''
    Dates in the export look like this:

        "2018-08-08T14:23:54Z 626267"

    The first part is an ISO 8601 timestamp (whole seconds); the
    optional second part is the microseconds within that second.
    '''
    parts = date_str.split(' ')
    time_str = parts[0].replace('T', ' ')
    date_time = dateutil.parser.parse(time_str)
    timestamp = date_time.timestamp()
    if len(parts) == 2:
        microseconds = int(parts[1])
        timestamp += microseconds / 1000000.0
    return timestamp

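# A quick illustration of the conversion above (hypothetical value,
# verified by hand): the docstring's sample timestamp parses as
# 2018-08-08 14:23:54 UTC plus 626267 microseconds, i.e.
#
#     str_date_to_float("2018-08-08T14:23:54Z 626267")
#     # -> 1533738234.626267
#
# The trailing 'Z' survives the 'T' replacement, so dateutil returns a
# timezone-aware datetime and .timestamp() is computed in UTC.
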
def untar_input_file(tar_file: str) -> str:
    data_dir = tar_file.replace('.tar', '')
    data_dir = os.path.abspath(data_dir)

    if os.path.exists(data_dir):
        logging.info('input data was already untarred to %s, we will use it', data_dir)
        return data_dir

    os.makedirs(data_dir)

    subprocess.check_call(['tar', '-xf', tar_file, '-C', data_dir])

    logging.info('input data was untarred to %s', data_dir)

    return data_dir

def read_user_data(data_dir: str) -> List[ZerverFieldsT]:
    fn = 'users.json'
    data_file = os.path.join(data_dir, fn)
    with open(data_file) as fp:
        return ujson.load(fp)

def convert_user_data(user_handler: UserHandler,
                      slim_mode: bool,
                      user_id_mapper: IdMapper,
                      raw_data: List[ZerverFieldsT],
                      realm_id: int) -> None:
    flat_data = [
        d['User']
        for d in raw_data
    ]

    def process(in_dict: ZerverFieldsT) -> ZerverFieldsT:
        delivery_email = in_dict['email']
        email = in_dict['email']
        full_name = in_dict['name']
        id = user_id_mapper.get(in_dict['id'])
        is_mirror_dummy = False
        short_name = in_dict['mention_name']
        timezone = in_dict['timezone']

        role = UserProfile.ROLE_MEMBER
        if in_dict['account_type'] == 'admin':
            role = UserProfile.ROLE_REALM_ADMINISTRATOR
        if in_dict['account_type'] == 'guest':
            role = UserProfile.ROLE_GUEST

        date_joined = int(timezone_now().timestamp())
        is_active = not in_dict['is_deleted']

        if not email:
            if role == UserProfile.ROLE_GUEST:
                # HipChat guest users don't have emails, so
                # we just fake them.
                email = f'guest-{id}@example.com'
                delivery_email = email
            else:
                # HipChat sometimes doesn't export an email for deactivated users.
                assert not is_active
                email = delivery_email = f"deactivated-{id}@example.com"

        # unmapped fields:
        #    title - Developer, Project Manager, etc.
        #    rooms - no good sample data
        #    created - we just use "now"
        #    roles - we just use account_type

        if in_dict.get('avatar'):
            avatar_source = 'U'
        else:
            avatar_source = 'G'

        return build_user_profile(
            avatar_source=avatar_source,
            date_joined=date_joined,
            delivery_email=delivery_email,
            email=email,
            full_name=full_name,
            id=id,
            is_active=is_active,
            role=role,
            is_mirror_dummy=is_mirror_dummy,
            realm_id=realm_id,
            short_name=short_name,
            timezone=timezone,
        )

    for raw_item in flat_data:
        user = process(raw_item)
        user_handler.add_user(user)

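# For reference, each users.json entry is assumed to look roughly like
# this (a hypothetical, abbreviated record showing only the fields
# that process() reads):
#
#     {
#         "User": {
#             "id": 10,
#             "email": "alice@example.com",
#             "name": "Alice Smith",
#             "mention_name": "alice",
#             "timezone": "UTC",
#             "account_type": "user",
#             "is_deleted": false,
#             "avatar": "<base64-encoded image, optional>"
#         }
#     }
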
def convert_avatar_data(avatar_folder: str,
                        raw_data: List[ZerverFieldsT],
                        user_id_mapper: IdMapper,
                        realm_id: int) -> List[ZerverFieldsT]:
    '''
    This code is pretty specific to how HipChat sends us data.
    They give us the avatar payloads in base64 in users.json.

    We process avatars in our own pass of that data, rather
    than doing it while we're getting other user data.  I
    chose to keep this separate, as otherwise you have a lot
    of extraneous data getting passed around.

    This code has MAJOR SIDE EFFECTS--namely writing a bunch
    of files to the avatars directory.
    '''
    avatar_records = []

    for d in raw_data:
        raw_user = d['User']
        avatar_payload = raw_user.get('avatar')
        if not avatar_payload:
            continue

        bits = base64.b64decode(avatar_payload)

        raw_user_id = raw_user['id']
        if not user_id_mapper.has(raw_user_id):
            continue

        user_id = user_id_mapper.get(raw_user_id)

        metadata = write_avatar_png(
            avatar_folder=avatar_folder,
            realm_id=realm_id,
            user_id=user_id,
            bits=bits,
        )
        avatar_records.append(metadata)

    return avatar_records

def read_room_data(data_dir: str) -> List[ZerverFieldsT]:
    fn = 'rooms.json'
    data_file = os.path.join(data_dir, fn)
    with open(data_file) as f:
        data = ujson.load(f)
    return data

def convert_room_data(raw_data: List[ZerverFieldsT],
                      subscriber_handler: SubscriberHandler,
                      stream_id_mapper: IdMapper,
                      user_id_mapper: IdMapper,
                      realm_id: int,
                      api_token: Optional[str]=None) -> List[ZerverFieldsT]:
    flat_data = [
        d['Room']
        for d in raw_data
    ]

    def get_invite_only(v: str) -> bool:
        if v == 'public':
            return False
        elif v == 'private':
            return True
        else:
            raise Exception('unexpected privacy value: ' + v)

    streams = []

    for in_dict in flat_data:
        now = int(timezone_now().timestamp())
        stream_id = stream_id_mapper.get(in_dict['id'])

        invite_only = get_invite_only(in_dict['privacy'])

        stream = build_stream(
            date_created=now,
            realm_id=realm_id,
            name=in_dict['name'],
            description=in_dict['topic'],
            stream_id=stream_id,
            deactivated=in_dict['is_archived'],
            invite_only=invite_only,
        )

        if invite_only:
            users: Set[int] = {
                user_id_mapper.get(key)
                for key in in_dict['members']
                if user_id_mapper.has(key)
            }

            if user_id_mapper.has(in_dict['owner']):
                owner = user_id_mapper.get(in_dict['owner'])
                users.add(owner)
        else:
            users = set()
            if api_token is not None:
                hc = hypchat.HypChat(api_token)
                room_data = hc.fromurl('{}/v2/room/{}/member'.format(hc.endpoint, in_dict['id']))

                for item in room_data['items']:
                    hipchat_user_id = item['id']
                    zulip_user_id = user_id_mapper.get(hipchat_user_id)
                    users.add(zulip_user_id)

        if users:
            subscriber_handler.set_info(
                stream_id=stream_id,
                users=users,
            )

        # unmapped fields:
        #    guest_access_url: no Zulip equivalent
        #    created: we just use "now"
        #    participants: no good sample data

        streams.append(stream)

    return streams

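# For reference, each rooms.json entry is assumed to look roughly like
# this (a hypothetical, abbreviated record showing only the fields
# read above; "members" is consulted only for private rooms):
#
#     {
#         "Room": {
#             "id": 1,
#             "name": "engineering",
#             "topic": "all things engineering",
#             "privacy": "public",
#             "is_archived": false,
#             "owner": 10,
#             "members": [10, 11, 12]
#         }
#     }
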
def make_realm(realm_id: int) -> ZerverFieldsT:
    NOW = float(timezone_now().timestamp())
    domain_name = settings.EXTERNAL_HOST
    realm_subdomain = ""
    zerver_realm = build_zerver_realm(realm_id, realm_subdomain, NOW, 'HipChat')
    realm = build_realm(zerver_realm, realm_id, domain_name)

    # We may override these later.
    realm['zerver_defaultstream'] = []

    return realm

def write_avatar_data(raw_user_data: List[ZerverFieldsT],
                      output_dir: str,
                      user_id_mapper: IdMapper,
                      realm_id: int) -> None:
    avatar_folder = os.path.join(output_dir, 'avatars')
    avatar_realm_folder = os.path.join(avatar_folder, str(realm_id))
    os.makedirs(avatar_realm_folder, exist_ok=True)

    avatar_records = convert_avatar_data(
        avatar_folder=avatar_folder,
        raw_data=raw_user_data,
        user_id_mapper=user_id_mapper,
        realm_id=realm_id,
    )

    create_converted_data_files(avatar_records, output_dir, '/avatars/records.json')

def write_emoticon_data(realm_id: int,
                        data_dir: str,
                        output_dir: str) -> List[ZerverFieldsT]:
    '''
    This function does most of the work for processing emoticons, the bulk
    of which is copying files.  We also write a json file with metadata.
    Finally, we return a list of RealmEmoji dicts to our caller.

    In our data_dir we have a pretty simple setup:

        emoticons.json - has very simple metadata on emojis:

          {
            "Emoticon": {
              "id": 9875487,
              "path": "emoticons/yasss.jpg",
              "shortcut": "yasss"
            }
          },
          {
            "Emoticon": {
              "id": 718017,
              "path": "emoticons/yayyyyy.gif",
              "shortcut": "yayyyyy"
            }
          }

        emoticons/ - contains a bunch of image files:

            slytherinsnake.gif
            spanishinquisition.jpg
            sparkle.png
            spiderman.gif
            stableparrot.gif
            stalkerparrot.gif
            supergirl.png
            superman.png

    We move all the relevant files to Zulip's more nested
    directory structure.
    '''
    logging.info('Starting to process emoticons')

    fn = 'emoticons.json'
    data_file = os.path.join(data_dir, fn)
    if not os.path.exists(data_file):
        logging.warning("HipChat export does not contain emoticons.json.")
        logging.warning("As a result, custom emoji cannot be imported.")
        return []

    with open(data_file) as f:
        data = ujson.load(f)

    if isinstance(data, dict) and 'Emoticons' in data:
        # Handle the hc-migrate export format for emoticons.json.
        flat_data = [
            dict(
                path=d['path'],
                name=d['shortcut'],
            )
            for d in data['Emoticons']
        ]
    else:
        flat_data = [
            dict(
                path=d['Emoticon']['path'],
                name=d['Emoticon']['shortcut'],
            )
            for d in data
        ]

    emoji_folder = os.path.join(output_dir, 'emoji')
    os.makedirs(emoji_folder, exist_ok=True)

    def process(data: ZerverFieldsT) -> ZerverFieldsT:
        source_sub_path = data['path']
        source_fn = os.path.basename(source_sub_path)
        source_path = os.path.join(data_dir, source_sub_path)

        # Use our template from RealmEmoji
        # PATH_ID_TEMPLATE = "{realm_id}/emoji/images/{emoji_file_name}"
        target_fn = source_fn
        target_sub_path = RealmEmoji.PATH_ID_TEMPLATE.format(
            realm_id=realm_id,
            emoji_file_name=target_fn,
        )
        target_path = os.path.join(emoji_folder, target_sub_path)

        os.makedirs(os.path.dirname(target_path), exist_ok=True)

        source_path = os.path.abspath(source_path)
        target_path = os.path.abspath(target_path)

        shutil.copyfile(source_path, target_path)

        return dict(
            path=target_path,
            s3_path=target_path,
            file_name=target_fn,
            realm_id=realm_id,
            name=data['name'],
        )

    emoji_records = list(map(process, flat_data))
    create_converted_data_files(emoji_records, output_dir, '/emoji/records.json')

    realmemoji = [
        build_realm_emoji(
            realm_id=realm_id,
            name=rec['name'],
            id=NEXT_ID('realmemoji'),
            file_name=rec['file_name'],
        )
        for rec in emoji_records
    ]
    logging.info('Done processing emoticons')

    return realmemoji

def write_message_data(realm_id: int,
                       slim_mode: bool,
                       message_key: str,
                       zerver_recipient: List[ZerverFieldsT],
                       subscriber_map: Dict[int, Set[int]],
                       data_dir: str,
                       output_dir: str,
                       masking_content: bool,
                       stream_id_mapper: IdMapper,
                       user_id_mapper: IdMapper,
                       user_handler: UserHandler,
                       attachment_handler: AttachmentHandler) -> None:

    stream_id_to_recipient_id = {
        d['type_id']: d['id']
        for d in zerver_recipient
        if d['type'] == Recipient.STREAM
    }

    user_id_to_recipient_id = {
        d['type_id']: d['id']
        for d in zerver_recipient
        if d['type'] == Recipient.PERSONAL
    }

    def get_stream_recipient_id(raw_message: ZerverFieldsT) -> int:
        fn_id = raw_message['fn_id']
        stream_id = stream_id_mapper.get(fn_id)
        recipient_id = stream_id_to_recipient_id[stream_id]
        return recipient_id

    def get_pm_recipient_id(raw_message: ZerverFieldsT) -> int:
        raw_user_id = raw_message['receiver_id']
        assert raw_user_id
        user_id = user_id_mapper.get(raw_user_id)
        recipient_id = user_id_to_recipient_id[user_id]
        return recipient_id

    if message_key in ['UserMessage', 'NotificationMessage']:
        is_pm_data = False
        dir_glob = os.path.join(data_dir, 'rooms', '*', 'history.json')
        get_recipient_id = get_stream_recipient_id
        get_files_dir = lambda fn_id: os.path.join(data_dir, 'rooms', str(fn_id), 'files')

    elif message_key == 'PrivateUserMessage':
        is_pm_data = True
        dir_glob = os.path.join(data_dir, 'users', '*', 'history.json')
        get_recipient_id = get_pm_recipient_id
        get_files_dir = lambda fn_id: os.path.join(data_dir, 'users', 'files')

    else:
        raise Exception('programming error: invalid message_key: ' + message_key)

    history_files = glob.glob(dir_glob)
    for fn in history_files:
        dir = os.path.dirname(fn)
        fn_id = os.path.basename(dir)
        files_dir = get_files_dir(fn_id)

        process_message_file(
            realm_id=realm_id,
            slim_mode=slim_mode,
            fn=fn,
            fn_id=fn_id,
            files_dir=files_dir,
            get_recipient_id=get_recipient_id,
            message_key=message_key,
            subscriber_map=subscriber_map,
            data_dir=data_dir,
            output_dir=output_dir,
            is_pm_data=is_pm_data,
            masking_content=masking_content,
            user_id_mapper=user_id_mapper,
            user_handler=user_handler,
            attachment_handler=attachment_handler,
        )

def get_hipchat_sender_id(realm_id: int,
                          slim_mode: bool,
                          message_dict: Dict[str, Any],
                          user_id_mapper: IdMapper,
                          user_handler: UserHandler) -> Optional[int]:
    '''
    The HipChat export is inconsistent in how it renders
    senders, and sometimes we don't even get an id.
    '''
    if isinstance(message_dict['sender'], str):
        if slim_mode:
            return None
        # Some HipChat instances just give us a person's
        # name in the sender field for NotificationMessage.
        # We turn them into a mirror user.
        mirror_user = user_handler.get_mirror_user(
            realm_id=realm_id,
            name=message_dict['sender'],
        )
        sender_id = mirror_user['id']
        return sender_id

    raw_sender_id = message_dict['sender']['id']

    if raw_sender_id == 0:
        if slim_mode:
            return None
        mirror_user = user_handler.get_mirror_user(
            realm_id=realm_id,
            name=message_dict['sender']['name'],
        )
        sender_id = mirror_user['id']
        return sender_id

    if not user_id_mapper.has(raw_sender_id):
        if slim_mode:
            return None
        mirror_user = user_handler.get_mirror_user(
            realm_id=realm_id,
            name=message_dict['sender']['id'],
        )
        sender_id = mirror_user['id']
        return sender_id

    # HAPPY PATH: HipChat just gave us an ordinary
    # sender_id.
    sender_id = user_id_mapper.get(raw_sender_id)
    return sender_id

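# For illustration, the sender shapes guarded against above look
# roughly like this (hypothetical samples, not verbatim export data):
#
#     {"sender": "GitHub"}                          # bare name string
#     {"sender": {"id": 0, "name": "Integration"}}  # zero id
#     {"sender": {"id": 999999, "name": "Ghost"}}   # id not in users.json
#
# All three fall back to mirror users (or None in slim_mode).
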
def process_message_file(realm_id: int,
                         slim_mode: bool,
                         fn: str,
                         fn_id: str,
                         files_dir: str,
                         get_recipient_id: Callable[[ZerverFieldsT], int],
                         message_key: str,
                         subscriber_map: Dict[int, Set[int]],
                         data_dir: str,
                         output_dir: str,
                         is_pm_data: bool,
                         masking_content: bool,
                         user_id_mapper: IdMapper,
                         user_handler: UserHandler,
                         attachment_handler: AttachmentHandler) -> None:

    def get_raw_messages(fn: str) -> List[ZerverFieldsT]:
        with open(fn) as f:
            data = ujson.load(f)

        flat_data = [
            d[message_key]
            for d in data
            if message_key in d
        ]

        def get_raw_message(d: Dict[str, Any]) -> Optional[ZerverFieldsT]:
            sender_id = get_hipchat_sender_id(
                realm_id=realm_id,
                slim_mode=slim_mode,
                message_dict=d,
                user_id_mapper=user_id_mapper,
                user_handler=user_handler,
            )

            if sender_id is None:
                return None

            if is_pm_data:
                # We need to compare with str() on both sides here.
                # In Stride, user IDs are strings, but in HipChat,
                # they are integers, and fn_id is always a string.
                if str(sender_id) != str(fn_id):
                    # PMs are in multiple places in the HipChat export,
                    # and we only use the copy from the sender.
                    return None

            content = d['message']

            if masking_content:
                content = re.sub('[a-z]', 'x', content)
                content = re.sub('[A-Z]', 'X', content)

            return dict(
                fn_id=fn_id,
                sender_id=sender_id,
                receiver_id=d.get('receiver', {}).get('id'),
                content=content,
                mention_user_ids=d.get('mentions', []),
                date_sent=str_date_to_float(d['timestamp']),
                attachment=d.get('attachment'),
                files_dir=files_dir,
            )

        raw_messages = []

        for d in flat_data:
            raw_message = get_raw_message(d)
            if raw_message is not None:
                raw_messages.append(raw_message)

        return raw_messages

    raw_messages = get_raw_messages(fn)

    def process_batch(lst: List[Any]) -> None:
        process_raw_message_batch(
            realm_id=realm_id,
            raw_messages=lst,
            subscriber_map=subscriber_map,
            user_id_mapper=user_id_mapper,
            user_handler=user_handler,
            attachment_handler=attachment_handler,
            get_recipient_id=get_recipient_id,
            is_pm_data=is_pm_data,
            output_dir=output_dir,
        )

    chunk_size = 1000

    process_list_in_batches(
        lst=raw_messages,
        chunk_size=chunk_size,
        process_batch=process_batch,
    )

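# A note on masking: with masking_content enabled, the two re.sub()
# calls above scrub letters while preserving length, punctuation, and
# digits, so (hypothetical sample text) "Hello, World 42!" becomes
# "Xxxxx, Xxxxx 42!".
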
def process_raw_message_batch(realm_id: int,
                              raw_messages: List[Dict[str, Any]],
                              subscriber_map: Dict[int, Set[int]],
                              user_id_mapper: IdMapper,
                              user_handler: UserHandler,
                              attachment_handler: AttachmentHandler,
                              get_recipient_id: Callable[[ZerverFieldsT], int],
                              is_pm_data: bool,
                              output_dir: str) -> None:

    def fix_mentions(content: str,
                     mention_user_ids: Set[int]) -> str:
        for user_id in mention_user_ids:
            user = user_handler.get_user(user_id=user_id)
            hipchat_mention = '@{short_name}'.format(**user)
            zulip_mention = '@**{full_name}**'.format(**user)
            content = content.replace(hipchat_mention, zulip_mention)

        content = content.replace('@here', '@**all**')
        return content

    mention_map: Dict[int, Set[int]] = dict()

    zerver_message = []

    import html2text
    h = html2text.HTML2Text()

    for raw_message in raw_messages:
        # One side effect here:
        message_id = NEXT_ID('message')
        mention_user_ids = {
            user_id_mapper.get(id)
            for id in set(raw_message['mention_user_ids'])
            if user_id_mapper.has(id)
        }
        mention_map[message_id] = mention_user_ids

        content = fix_mentions(
            content=raw_message['content'],
            mention_user_ids=mention_user_ids,
        )
        content = h.handle(content)

        if len(content) > 10000:
            logging.info('skipping too-long message of length %s', len(content))
            continue

        date_sent = raw_message['date_sent']

        try:
            recipient_id = get_recipient_id(raw_message)
        except KeyError:
            logging.debug("Could not find recipient_id for a message, skipping.")
            continue

        rendered_content = None

        if is_pm_data:
            topic_name = ''
        else:
            topic_name = 'imported from hipchat'
        user_id = raw_message['sender_id']

        # Another side effect:
        extra_content = attachment_handler.handle_message_data(
            realm_id=realm_id,
            message_id=message_id,
            sender_id=user_id,
            attachment=raw_message['attachment'],
            files_dir=raw_message['files_dir'],
        )

        if extra_content:
            has_attachment = True
            content += '\n' + extra_content
        else:
            has_attachment = False

        message = build_message(
            content=content,
            message_id=message_id,
            date_sent=date_sent,
            recipient_id=recipient_id,
            rendered_content=rendered_content,
            topic_name=topic_name,
            user_id=user_id,
            has_attachment=has_attachment,
        )
        zerver_message.append(message)

    zerver_usermessage = make_user_messages(
        zerver_message=zerver_message,
        subscriber_map=subscriber_map,
        is_pm_data=is_pm_data,
        mention_map=mention_map,
    )

    message_json = dict(
        zerver_message=zerver_message,
        zerver_usermessage=zerver_usermessage,
    )

    dump_file_id = NEXT_ID('dump_file_id')
    message_file = f"/messages-{dump_file_id:06}.json"
    create_converted_data_files(message_json, output_dir, message_file)

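# For illustration: fix_mentions() above rewrites HipChat mentions into
# Zulip syntax, so for a hypothetical user with short_name "alice" and
# full_name "Alice Smith":
#
#     "@alice can you take a look? cc @here"
#
# becomes
#
#     "@**Alice Smith** can you take a look? cc @**all**"
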
def do_convert_data(input_tar_file: str,
                    output_dir: str,
                    masking_content: bool,
                    api_token: Optional[str]=None,
                    slim_mode: bool=False) -> None:
    input_data_dir = untar_input_file(input_tar_file)

    attachment_handler = AttachmentHandler()
    user_handler = UserHandler()
    subscriber_handler = SubscriberHandler()
    user_id_mapper = IdMapper()
    stream_id_mapper = IdMapper()

    realm_id = 0
    realm = make_realm(realm_id=realm_id)

    # users.json -> UserProfile
    raw_user_data = read_user_data(data_dir=input_data_dir)
    convert_user_data(
        user_handler=user_handler,
        slim_mode=slim_mode,
        user_id_mapper=user_id_mapper,
        raw_data=raw_user_data,
        realm_id=realm_id,
    )
    normal_users = user_handler.get_normal_users()
    # Don't write zerver_userprofile here, because we
    # may add more users later.

    # streams.json -> Stream
    raw_stream_data = read_room_data(data_dir=input_data_dir)
    zerver_stream = convert_room_data(
        raw_data=raw_stream_data,
        subscriber_handler=subscriber_handler,
        stream_id_mapper=stream_id_mapper,
        user_id_mapper=user_id_mapper,
        realm_id=realm_id,
        api_token=api_token,
    )
    realm['zerver_stream'] = zerver_stream

    zerver_recipient = build_recipients(
        zerver_userprofile=normal_users,
        zerver_stream=zerver_stream,
    )
    realm['zerver_recipient'] = zerver_recipient

    if api_token is None:
        if slim_mode:
            public_stream_subscriptions: List[ZerverFieldsT] = []
        else:
            public_stream_subscriptions = build_public_stream_subscriptions(
                zerver_userprofile=normal_users,
                zerver_recipient=zerver_recipient,
                zerver_stream=zerver_stream,
            )

        private_stream_subscriptions = build_stream_subscriptions(
            get_users=subscriber_handler.get_users,
            zerver_recipient=zerver_recipient,
            zerver_stream=[stream_dict for stream_dict in zerver_stream
                           if stream_dict['invite_only']],
        )
        stream_subscriptions = public_stream_subscriptions + private_stream_subscriptions
    else:
        stream_subscriptions = build_stream_subscriptions(
            get_users=subscriber_handler.get_users,
            zerver_recipient=zerver_recipient,
            zerver_stream=zerver_stream,
        )

    personal_subscriptions = build_personal_subscriptions(
        zerver_recipient=zerver_recipient,
    )
    zerver_subscription = personal_subscriptions + stream_subscriptions

    realm['zerver_subscription'] = zerver_subscription

    zerver_realmemoji = write_emoticon_data(
        realm_id=realm_id,
        data_dir=input_data_dir,
        output_dir=output_dir,
    )
    realm['zerver_realmemoji'] = zerver_realmemoji

    subscriber_map = make_subscriber_map(
        zerver_subscription=zerver_subscription,
    )

    logging.info('Start importing message data')
    for message_key in ['UserMessage',
                        'NotificationMessage',
                        'PrivateUserMessage']:
        write_message_data(
            realm_id=realm_id,
            slim_mode=slim_mode,
            message_key=message_key,
            zerver_recipient=zerver_recipient,
            subscriber_map=subscriber_map,
            data_dir=input_data_dir,
            output_dir=output_dir,
            masking_content=masking_content,
            stream_id_mapper=stream_id_mapper,
            user_id_mapper=user_id_mapper,
            user_handler=user_handler,
            attachment_handler=attachment_handler,
        )

    # Order is important here...don't write users until
    # we process everything else, since we may introduce
    # mirror users when processing messages.
    realm['zerver_userprofile'] = user_handler.get_all_users()
    realm['sort_by_date'] = True

    create_converted_data_files(realm, output_dir, '/realm.json')

    logging.info('Start importing avatar data')
    write_avatar_data(
        raw_user_data=raw_user_data,
        output_dir=output_dir,
        user_id_mapper=user_id_mapper,
        realm_id=realm_id,
    )

    attachment_handler.write_info(
        output_dir=output_dir,
        realm_id=realm_id,
    )

    logging.info('Start making tarball')
    subprocess.check_call(["tar", "-czf", output_dir + '.tar.gz', output_dir, '-P'])
    logging.info('Done making tarball')