import os import ujson import dateutil.parser import random import requests import json import logging import shutil import subprocess from django.conf import settings from django.forms.models import model_to_dict from django.utils.timezone import now as timezone_now from typing import Any, Dict, List, Tuple from zerver.models import Realm, UserProfile from zerver.lib.actions import STREAM_ASSIGNMENT_COLORS as stream_colors from zerver.lib.export import MESSAGE_BATCH_CHUNK_SIZE from zerver.lib.avatar_hash import user_avatar_path_from_ids from zerver.lib.parallel import run_parallel # stubs GitterDataT = List[Dict[str, Any]] ZerverFieldsT = Dict[str, Any] realm_id = 0 def gitter_workspace_to_realm(domain_name: str, gitter_data: GitterDataT, realm_subdomain: str) -> Tuple[ZerverFieldsT, List[ZerverFieldsT], Dict[str, int]]: """ Returns: 1. realm, Converted Realm data 2. avatars, which is list to map avatars to zulip avatar records.json 3. user_map, which is a dictionary to map from gitter user id to zulip user id """ NOW = float(timezone_now().timestamp()) zerver_realm = build_zerver_realm(realm_subdomain, NOW) realm = dict(zerver_client=[{"name": "populate_db", "id": 1}, {"name": "website", "id": 2}, {"name": "API", "id": 3}], zerver_customprofilefield=[], zerver_customprofilefieldvalue=[], zerver_userpresence=[], # shows last logged in data, which is not available in gitter zerver_userprofile_mirrordummy=[], zerver_realmdomain=[{"realm": realm_id, "allow_subdomains": False, "domain": domain_name, "id": realm_id}], zerver_useractivity=[], zerver_realm=zerver_realm, zerver_huddle=[], zerver_userprofile_crossrealm=[], zerver_useractivityinterval=[], zerver_reaction=[], zerver_realmemoji=[], zerver_realmfilter=[]) zerver_userprofile, avatars, user_map = build_userprofile(int(NOW), domain_name, gitter_data) zerver_stream, zerver_defaultstream = build_stream(int(NOW)) zerver_recipient, zerver_subscription = build_recipient_and_subscription( zerver_userprofile, zerver_stream) realm['zerver_userprofile'] = zerver_userprofile realm['zerver_stream'] = zerver_stream realm['zerver_defaultstream'] = zerver_defaultstream realm['zerver_recipient'] = zerver_recipient realm['zerver_subscription'] = zerver_subscription return realm, avatars, user_map def build_zerver_realm(realm_subdomain: str, time: float) -> List[ZerverFieldsT]: realm = Realm(id=realm_id, date_created=time, name=realm_subdomain, string_id=realm_subdomain, description="Organization imported from Gitter!") auth_methods = [[flag[0], flag[1]] for flag in realm.authentication_methods] realm_dict = model_to_dict(realm, exclude='authentication_methods') realm_dict['authentication_methods'] = auth_methods return[realm_dict] def build_userprofile(timestamp: Any, domain_name: str, gitter_data: GitterDataT) -> Tuple[List[ZerverFieldsT], List[ZerverFieldsT], Dict[str, int]]: """ Returns: 1. zerver_userprofile, which is a list of user profile 2. avatar_list, which is list to map avatars to zulip avatard records.json 3. added_users, which is a dictionary to map from gitter user id to zulip id """ logging.info('######### IMPORTING USERS STARTED #########\n') zerver_userprofile = [] avatar_list = [] # type: List[ZerverFieldsT] user_map = {} # type: Dict[str, int] user_id = 0 for data in gitter_data: if data['fromUser']['id'] not in user_map: user_data = data['fromUser'] user_map[user_data['id']] = user_id email = get_user_email(user_data, domain_name) build_avatar(user_id, realm_id, email, user_data, timestamp, avatar_list) # Build userprofile object userprofile = UserProfile( full_name=user_data['displayName'], short_name=user_data['username'], id=user_id, email=email, avatar_source='U', pointer=-1, date_joined=timestamp, last_login=timestamp) userprofile_dict = model_to_dict(userprofile) # Set realm id separately as the corresponding realm is not yet a Realm model # instance userprofile_dict['realm'] = realm_id zerver_userprofile.append(userprofile_dict) user_id += 1 logging.info('######### IMPORTING USERS FINISHED #########\n') return zerver_userprofile, avatar_list, user_map def get_user_email(user_data: ZerverFieldsT, domain_name: str) -> str: # TODO Get user email from github email = ("%s@users.noreply.github.com" % user_data['username']) return email def build_avatar(user_id: int, realm_id: int, email: str, user_data: ZerverFieldsT, timestamp: Any, avatar_list: List[ZerverFieldsT]) -> None: avatar_url = user_data['avatarUrl'] avatar = dict( path=avatar_url, # Save the avatar url here, which is downloaded later realm_id=realm_id, content_type=None, user_profile_id=user_id, last_modified=timestamp, user_profile_email=email, s3_path="", size="") avatar_list.append(avatar) def build_stream(timestamp: Any) -> Tuple[List[ZerverFieldsT], List[ZerverFieldsT]]: logging.info('######### IMPORTING STREAM STARTED #########\n') # We have only one stream for gitter export stream = dict( realm=realm_id, name="from gitter", deactivated=False, description="Imported from gitter", invite_only=False, date_created=timestamp, id=0) defaultstream = dict( stream=0, realm=realm_id, id=0) logging.info('######### IMPORTING STREAMS FINISHED #########\n') return [stream], [defaultstream] def build_recipient_and_subscription( zerver_userprofile: List[ZerverFieldsT], zerver_stream: List[ZerverFieldsT]) -> Tuple[List[ZerverFieldsT], List[ZerverFieldsT]]: """ Returns: 1. zerver_recipient, which is a list of mapped recipient 2. zerver_subscription, which is a list of mapped subscription """ zerver_recipient = [] zerver_subscription = [] recipient_id = subscription_id = 0 # For stream # We have only one recipient, because we have only one stream # Hence 'recipient_id'=0 corresponds to 'stream_id'=0 recipient = build_recipient(0, recipient_id, 2) zerver_recipient.append(recipient) for user in zerver_userprofile: subscription = build_subscription(recipient_id, user['id'], subscription_id) zerver_subscription.append(subscription) subscription_id += 1 recipient_id += 1 # For users for user in zerver_userprofile: recipient = build_recipient(user['id'], recipient_id, 1) subscription = build_subscription(recipient_id, user['id'], subscription_id) zerver_recipient.append(recipient) zerver_subscription.append(subscription) recipient_id += 1 subscription_id += 1 return zerver_recipient, zerver_subscription def build_recipient(type_id: int, recipient_id: int, type: int) -> ZerverFieldsT: recipient = dict( type_id=type_id, # stream id id=recipient_id, type=type) return recipient def build_subscription(recipient_id: int, user_id: int, subscription_id: int) -> ZerverFieldsT: subscription = dict( recipient=recipient_id, notifications=False, color=random.choice(stream_colors), desktop_notifications=True, pin_to_top=False, in_home_view=True, active=True, user_profile=user_id, id=subscription_id) return subscription def convert_gitter_workspace_messages(gitter_data: GitterDataT, output_dir: str, zerver_subscription: List[ZerverFieldsT], user_map: Dict[str, int], user_short_name_to_full_name: Dict[str, str], chunk_size: int=MESSAGE_BATCH_CHUNK_SIZE) -> None: """ Messages are stored in batches """ logging.info('######### IMPORTING MESSAGES STARTED #########\n') message_id = usermessage_id = 0 recipient_id = 0 # Corresponding to stream "gitter" low_index = 0 upper_index = low_index + chunk_size dump_file_id = 1 while True: message_json = {} zerver_message = [] zerver_usermessage = [] message_data = gitter_data[low_index: upper_index] if len(message_data) == 0: break for message in message_data: message_time = dateutil.parser.parse(message['sent']).timestamp() mentioned_user_ids = get_usermentions(message, user_map, user_short_name_to_full_name) rendered_content = None zulip_message = dict( sending_client=1, rendered_content_version=1, # This is Zulip-specific has_image=False, subject='imported from gitter', pub_date=float(message_time), id=message_id, has_attachment=False, edit_history=None, sender=user_map[message['fromUser']['id']], content=message['text'], rendered_content=rendered_content, recipient=recipient_id, last_edit_time=None, has_link=False) zerver_message.append(zulip_message) for subscription in zerver_subscription: if subscription['recipient'] == recipient_id: flags_mask = 1 # For read if subscription['user_profile'] in mentioned_user_ids: flags_mask = 9 # For read and mentioned usermessage = dict( user_profile=subscription['user_profile'], id=usermessage_id, flags_mask=flags_mask, message=message_id) usermessage_id += 1 zerver_usermessage.append(usermessage) message_id += 1 message_json['zerver_message'] = zerver_message message_json['zerver_usermessage'] = zerver_usermessage message_filename = os.path.join(output_dir, "messages-%06d.json" % (dump_file_id,)) logging.info("Writing Messages to %s\n" % (message_filename,)) write_data_to_file(os.path.join(message_filename), message_json) low_index = upper_index upper_index = chunk_size + low_index dump_file_id += 1 logging.info('######### IMPORTING MESSAGES FINISHED #########\n') def get_usermentions(message: Dict[str, Any], user_map: Dict[str, int], user_short_name_to_full_name: Dict[str, str]) -> List[int]: mentioned_user_ids = [] if 'mentions' in message: for mention in message['mentions']: if mention.get('userId') in user_map: gitter_mention = '@%s' % (mention['screenName']) zulip_mention = ('@**%s**' % (user_short_name_to_full_name[mention['screenName']])) message['text'] = message['text'].replace(gitter_mention, zulip_mention) mentioned_user_ids.append(user_map[mention['userId']]) return mentioned_user_ids def do_convert_data(gitter_data_file: str, output_dir: str, threads: int=6) -> None: # Subdomain is set by the user while running the import commands realm_subdomain = "" domain_name = settings.EXTERNAL_HOST os.makedirs(output_dir, exist_ok=True) # output directory should be empty initially if os.listdir(output_dir): raise Exception("Output directory should be empty!") # Read data from the gitter file gitter_data = json.load(open(gitter_data_file)) realm, avatar_list, user_map = gitter_workspace_to_realm( domain_name, gitter_data, realm_subdomain) # For user mentions user_short_name_to_full_name = {} for userprofile in realm['zerver_userprofile']: user_short_name_to_full_name[userprofile['short_name']] = userprofile['full_name'] convert_gitter_workspace_messages( gitter_data, output_dir, realm['zerver_subscription'], user_map, user_short_name_to_full_name) avatar_folder = os.path.join(output_dir, 'avatars') avatar_realm_folder = os.path.join(avatar_folder, str(realm_id)) os.makedirs(avatar_realm_folder, exist_ok=True) avatar_records = process_avatars(avatar_list, avatar_folder, threads) attachment = {"zerver_attachment": []} # type: Dict[str, List[Any]] # IO realm.json create_converted_data_files(realm, output_dir, '/realm.json') # IO emoji records create_converted_data_files([], output_dir, '/emoji/records.json') # IO avatar records create_converted_data_files(avatar_records, output_dir, '/avatars/records.json') # IO uploads records create_converted_data_files([], output_dir, '/uploads/records.json') # IO attachments records create_converted_data_files(attachment, output_dir, '/attachment.json') subprocess.check_call(["tar", "-czf", output_dir + '.tar.gz', output_dir, '-P']) logging.info('######### DATA CONVERSION FINISHED #########\n') logging.info("Zulip data dump created at %s" % (output_dir)) def process_avatars(avatar_list: List[ZerverFieldsT], avatar_dir: str, threads: int) -> List[ZerverFieldsT]: """ This function gets the gitter avatar of the user and saves it in the user's avatar directory with both the extensions '.png' and '.original' """ def get_avatar(avatar_upload_list: List[str]) -> int: gitter_avatar_url = avatar_upload_list[0] image_path = avatar_upload_list[1] original_image_path = avatar_upload_list[2] response = requests.get(gitter_avatar_url, stream=True) with open(image_path, 'wb') as image_file: shutil.copyfileobj(response.raw, image_file) shutil.copy(image_path, original_image_path) return 0 logging.info('######### GETTING AVATARS #########\n') logging.info('DOWNLOADING AVATARS .......\n') avatar_original_list = [] avatar_upload_list = [] for avatar in avatar_list: avatar_hash = user_avatar_path_from_ids(avatar['user_profile_id'], realm_id) gitter_avatar_url = avatar['path'] avatar_original = dict(avatar) image_path = ('%s/%s.png' % (avatar_dir, avatar_hash)) original_image_path = ('%s/%s.original' % (avatar_dir, avatar_hash)) avatar_upload_list.append([gitter_avatar_url, image_path, original_image_path]) # We don't add the size field here in avatar's records.json, # since the metadata is not needed on the import end, and we # don't have it until we've downloaded the files anyway. avatar['path'] = image_path avatar['s3_path'] = image_path avatar_original['path'] = original_image_path avatar_original['s3_path'] = original_image_path avatar_original_list.append(avatar_original) # Run downloads parallely output = [] for (status, job) in run_parallel(get_avatar, avatar_upload_list, threads=threads): output.append(job) logging.info('######### GETTING AVATARS FINISHED #########\n') return avatar_list + avatar_original_list def create_converted_data_files(data: Any, output_dir: str, file_path: str) -> None: output_file = output_dir + file_path os.makedirs(os.path.dirname(output_file), exist_ok=True) json.dump(data, open(output_file, 'w'), indent=4) def write_data_to_file(output_file: str, data: Any) -> None: with open(output_file, "w") as f: f.write(ujson.dumps(data, indent=4))