From c4b886d8aee2503a03cfbffd30676ee0d4aba933 Mon Sep 17 00:00:00 2001 From: Tim Abbott Date: Mon, 23 Apr 2018 14:28:27 -0700 Subject: [PATCH] import: Split out import.py into its own module. This should make it a bit easier to find the code. --- zerver/lib/export.py | 687 +------------------------- zerver/lib/import_realm.py | 700 +++++++++++++++++++++++++++ zerver/management/commands/import.py | 2 +- zerver/tests/test_slack_importer.py | 2 +- 4 files changed, 705 insertions(+), 686 deletions(-) create mode 100644 zerver/lib/import_realm.py diff --git a/zerver/lib/export.py b/zerver/lib/export.py index 6a0f4ccd30..a93f889853 100644 --- a/zerver/lib/export.py +++ b/zerver/lib/export.py @@ -1,31 +1,22 @@ import datetime -from boto.s3.key import Key from boto.s3.connection import S3Connection from django.conf import settings -from django.db import connection from django.forms.models import model_to_dict from django.utils.timezone import make_aware as timezone_make_aware from django.utils.timezone import utc as timezone_utc from django.utils.timezone import is_naive as timezone_is_naive -from django.db.models.query import QuerySet import glob import logging import os import ujson -import shutil import subprocess import tempfile -from zerver.lib.upload import random_name, sanitize_name -from zerver.lib.avatar_hash import user_avatar_hash, user_avatar_path_from_ids -from zerver.lib.upload import S3UploadBackend, LocalUploadBackend -from zerver.lib.create_user import random_api_key -from zerver.lib.bulk_create import bulk_create_users +from zerver.lib.avatar_hash import user_avatar_path_from_ids from zerver.models import UserProfile, Realm, Client, Huddle, Stream, \ UserMessage, Subscription, Message, RealmEmoji, RealmFilter, \ RealmDomain, Recipient, DefaultStream, get_user_profile_by_id, \ - UserPresence, UserActivity, UserActivityInterval, Reaction, \ - CustomProfileField, CustomProfileFieldValue, \ - get_display_recipient, Attachment, get_system_bot, email_to_username + UserPresence, UserActivity, UserActivityInterval, \ + get_display_recipient, Attachment, get_system_bot from zerver.lib.parallel import run_parallel from typing import Any, Callable, Dict, List, Optional, Set, Tuple, \ Iterable, Text @@ -1205,675 +1196,3 @@ def export_messages_single_user(user_profile: UserProfile, output_dir: Path, chu write_message_export(message_filename, output) min_id = max(user_message_ids) dump_file_id += 1 - -# Code from here is the realm import code path - -# id_maps is a dictionary that maps table names to dictionaries -# that map old ids to new ids. We use this in -# re_map_foreign_keys and other places. -# -# We explicity initialize id_maps with the tables that support -# id re-mapping. -# -# Code reviewers: give these tables extra scrutiny, as we need to -# make sure to reload related tables AFTER we re-map the ids. -id_maps = { - 'client': {}, - 'user_profile': {}, - 'realm': {}, - 'stream': {}, - 'recipient': {}, - 'subscription': {}, - 'defaultstream': {}, - 'reaction': {}, - 'realmemoji': {}, - 'realmdomain': {}, - 'realmfilter': {}, - 'message': {}, - 'user_presence': {}, - 'useractivity': {}, - 'useractivityinterval': {}, - 'usermessage': {}, - 'customprofilefield': {}, - 'customprofilefield_value': {}, - 'attachment': {}, -} # type: Dict[str, Dict[int, int]] - -path_maps = { - 'attachment_path': {}, -} # type: Dict[str, Dict[str, str]] - -def update_id_map(table: TableName, old_id: int, new_id: int) -> None: - if table not in id_maps: - raise Exception(''' - Table %s is not initialized in id_maps, which could - mean that we have not thought through circular - dependencies. - ''' % (table,)) - id_maps[table][old_id] = new_id - -def fix_datetime_fields(data: TableData, table: TableName) -> None: - for item in data[table]: - for field_name in DATE_FIELDS[table]: - if item[field_name] is not None: - item[field_name] = datetime.datetime.fromtimestamp(item[field_name], tz=timezone_utc) - -def fix_upload_links(data: TableData, message_table: TableName) -> None: - """ - Because the URLs for uploaded files encode the realm ID of the - organization being imported (which is only determined at import - time), we need to rewrite the URLs of links to uploaded files - during the import process. - """ - for message in data[message_table]: - if message['has_attachment'] is True: - for key, value in path_maps['attachment_path'].items(): - if key in message['content']: - message['content'] = message['content'].replace(key, value) - if message['rendered_content']: - message['rendered_content'] = message['rendered_content'].replace(key, value) - -def current_table_ids(data: TableData, table: TableName) -> List[int]: - """ - Returns the ids present in the current table - """ - id_list = [] - for item in data[table]: - id_list.append(item["id"]) - return id_list - -def idseq(model_class: Any) -> str: - if model_class == RealmDomain: - return 'zerver_realmalias_id_seq' - return '{}_id_seq'.format(model_class._meta.db_table) - -def allocate_ids(model_class: Any, count: int) -> List[int]: - """ - Increases the sequence number for a given table by the amount of objects being - imported into that table. Hence, this gives a reserved range of ids to import the - converted slack objects into the tables. - """ - conn = connection.cursor() - sequence = idseq(model_class) - conn.execute("select nextval('%s') from generate_series(1,%s)" % - (sequence, str(count))) - query = conn.fetchall() # Each element in the result is a tuple like (5,) - conn.close() - # convert List[Tuple[int]] to List[int] - return [item[0] for item in query] - -def convert_to_id_fields(data: TableData, table: TableName, field_name: Field) -> None: - ''' - When Django gives us dict objects via model_to_dict, the foreign - key fields are `foo`, but we want `foo_id` for the bulk insert. - This function handles the simple case where we simply rename - the fields. For cases where we need to munge ids in the - database, see re_map_foreign_keys. - ''' - for item in data[table]: - item[field_name + "_id"] = item[field_name] - del item[field_name] - -def re_map_foreign_keys(data: TableData, - table: TableName, - field_name: Field, - related_table: TableName, - verbose: bool=False, - id_field: bool=False, - recipient_field: bool=False) -> None: - """ - This is a wrapper function for all the realm data tables - and only avatar and attachment records need to be passed through the internal function - because of the difference in data format (TableData corresponding to realm data tables - and List[Record] corresponding to the avatar and attachment records) - """ - re_map_foreign_keys_internal(data[table], table, field_name, related_table, verbose, id_field, - recipient_field) - -def re_map_foreign_keys_internal(data_table: List[Record], - table: TableName, - field_name: Field, - related_table: TableName, - verbose: bool=False, - id_field: bool=False, - recipient_field: bool=False) -> None: - ''' - We occasionally need to assign new ids to rows during the - import/export process, to accommodate things like existing rows - already being in tables. See bulk_import_client for more context. - - The tricky part is making sure that foreign key references - are in sync with the new ids, and this fixer function does - the re-mapping. (It also appends `_id` to the field.) - ''' - lookup_table = id_maps[related_table] - for item in data_table: - if recipient_field: - if related_table == "stream" and item['type'] == 2: - pass - elif related_table == "user_profile" and item['type'] == 1: - pass - else: - continue - old_id = item[field_name] - if old_id in lookup_table: - new_id = lookup_table[old_id] - if verbose: - logging.info('Remapping %s %s from %s to %s' % (table, - field_name + '_id', - old_id, - new_id)) - else: - new_id = old_id - if not id_field: - item[field_name + "_id"] = new_id - del item[field_name] - else: - item[field_name] = new_id - -def fix_bitfield_keys(data: TableData, table: TableName, field_name: Field) -> None: - for item in data[table]: - item[field_name] = item[field_name + '_mask'] - del item[field_name + '_mask'] - -def fix_realm_authentication_bitfield(data: TableData, table: TableName, field_name: Field) -> None: - """Used to fixup the authentication_methods bitfield to be a string""" - for item in data[table]: - values_as_bitstring = ''.join(['1' if field[1] else '0' for field in - item[field_name]]) - values_as_int = int(values_as_bitstring, 2) - item[field_name] = values_as_int - -def update_model_ids(model: Any, data: TableData, table: TableName, related_table: TableName) -> None: - old_id_list = current_table_ids(data, table) - allocated_id_list = allocate_ids(model, len(data[table])) - for item in range(len(data[table])): - update_id_map(related_table, old_id_list[item], allocated_id_list[item]) - re_map_foreign_keys(data, table, 'id', related_table=related_table, id_field=True) - -def bulk_import_model(data: TableData, model: Any, table: TableName, - dump_file_id: Optional[str]=None) -> None: - # TODO, deprecate dump_file_id - model.objects.bulk_create(model(**item) for item in data[table]) - if dump_file_id is None: - logging.info("Successfully imported %s from %s." % (model, table)) - else: - logging.info("Successfully imported %s from %s[%s]." % (model, table, dump_file_id)) - -# Client is a table shared by multiple realms, so in order to -# correctly import multiple realms into the same server, we need to -# check if a Client object already exists, and so we need to support -# remap all Client IDs to the values in the new DB. -def bulk_import_client(data: TableData, model: Any, table: TableName) -> None: - for item in data[table]: - try: - client = Client.objects.get(name=item['name']) - except Client.DoesNotExist: - client = Client.objects.create(name=item['name']) - update_id_map(table='client', old_id=item['id'], new_id=client.id) - -def import_uploads_local(import_dir: Path, processing_avatars: bool=False, - processing_emojis: bool=False) -> None: - records_filename = os.path.join(import_dir, "records.json") - with open(records_filename) as records_file: - records = ujson.loads(records_file.read()) - - re_map_foreign_keys_internal(records, 'records', 'realm_id', related_table="realm", - id_field=True) - if not processing_emojis: - re_map_foreign_keys_internal(records, 'records', 'user_profile_id', - related_table="user_profile", id_field=True) - for record in records: - if processing_avatars: - # For avatars, we need to rehash the user ID with the - # new server's avatar salt - avatar_path = user_avatar_path_from_ids(record['user_profile_id'], record['realm_id']) - file_path = os.path.join(settings.LOCAL_UPLOADS_DIR, "avatars", avatar_path) - if record['s3_path'].endswith('.original'): - file_path += '.original' - else: - file_path += '.png' - elif processing_emojis: - # For emojis we follow the function 'upload_emoji_image' - emoji_path = RealmEmoji.PATH_ID_TEMPLATE.format( - realm_id=record['realm_id'], - emoji_file_name=record['file_name']) - file_path = os.path.join(settings.LOCAL_UPLOADS_DIR, "avatars", emoji_path) - else: - # Should be kept in sync with its equivalent in zerver/lib/uploads in the - # function 'upload_message_image' - s3_file_name = "/".join([ - str(record['realm_id']), - random_name(18), - sanitize_name(os.path.basename(record['path'])) - ]) - file_path = os.path.join(settings.LOCAL_UPLOADS_DIR, "files", s3_file_name) - path_maps['attachment_path'][record['path']] = s3_file_name - - orig_file_path = os.path.join(import_dir, record['path']) - if not os.path.exists(os.path.dirname(file_path)): - subprocess.check_call(["mkdir", "-p", os.path.dirname(file_path)]) - shutil.copy(orig_file_path, file_path) - - if processing_avatars: - # Ensure that we have medium-size avatar images for every - # avatar. TODO: This implementation is hacky, both in that it - # does get_user_profile_by_id for each user, and in that it - # might be better to require the export to just have these. - upload_backend = LocalUploadBackend() - for record in records: - if record['s3_path'].endswith('.original'): - user_profile = get_user_profile_by_id(record['user_profile_id']) - avatar_path = user_avatar_path_from_ids(user_profile.id, record['realm_id']) - medium_file_path = os.path.join(settings.LOCAL_UPLOADS_DIR, "avatars", - avatar_path) + '-medium.png' - if os.path.exists(medium_file_path): - # We remove the image here primarily to deal with - # issues when running the import script multiple - # times in development (where one might reuse the - # same realm ID from a previous iteration). - os.remove(medium_file_path) - upload_backend.ensure_medium_avatar_image(user_profile=user_profile) - -def import_uploads_s3(bucket_name: str, import_dir: Path, processing_avatars: bool=False, - processing_emojis: bool=False) -> None: - upload_backend = S3UploadBackend() - conn = S3Connection(settings.S3_KEY, settings.S3_SECRET_KEY) - bucket = conn.get_bucket(bucket_name, validate=True) - - records_filename = os.path.join(import_dir, "records.json") - with open(records_filename) as records_file: - records = ujson.loads(records_file.read()) - - re_map_foreign_keys_internal(records, 'records', 'realm_id', related_table="realm", - id_field=True) - re_map_foreign_keys_internal(records, 'records', 'user_profile_id', - related_table="user_profile", id_field=True) - for record in records: - key = Key(bucket) - - if processing_avatars: - # For avatars, we need to rehash the user's email with the - # new server's avatar salt - avatar_path = user_avatar_path_from_ids(record['user_profile_id'], record['realm_id']) - key.key = avatar_path - if record['s3_path'].endswith('.original'): - key.key += '.original' - if processing_emojis: - # For emojis we follow the function 'upload_emoji_image' - emoji_path = RealmEmoji.PATH_ID_TEMPLATE.format( - realm_id=record['realm_id'], - emoji_file_name=record['file_name']) - key.key = emoji_path - else: - # Should be kept in sync with its equivalent in zerver/lib/uploads in the - # function 'upload_message_image' - s3_file_name = "/".join([ - str(record['realm_id']), - random_name(18), - sanitize_name(os.path.basename(record['path'])) - ]) - key.key = s3_file_name - path_maps['attachment_path'][record['path']] = s3_file_name - - user_profile_id = int(record['user_profile_id']) - # Support email gateway bot and other cross-realm messages - if user_profile_id in id_maps["user_profile"]: - logging.info("Uploaded by ID mapped user: %s!" % (user_profile_id,)) - user_profile_id = id_maps["user_profile"][user_profile_id] - user_profile = get_user_profile_by_id(user_profile_id) - key.set_metadata("user_profile_id", str(user_profile.id)) - key.set_metadata("realm_id", str(user_profile.realm_id)) - key.set_metadata("orig_last_modified", record['last_modified']) - - headers = {'Content-Type': record['content_type']} - - key.set_contents_from_filename(os.path.join(import_dir, record['path']), headers=headers) - - if processing_avatars: - # TODO: Ideally, we'd do this in a separate pass, after - # all the avatars have been uploaded, since we may end up - # unnecssarily resizing images just before the medium-size - # image in the export is uploaded. See the local uplods - # code path for more notes. - upload_backend.ensure_medium_avatar_image(user_profile=user_profile) - -def import_uploads(import_dir: Path, processing_avatars: bool=False, - processing_emojis: bool=False) -> None: - if processing_avatars: - logging.info("Importing avatars") - elif processing_emojis: - logging.info("Importing emojis") - else: - logging.info("Importing uploaded files") - if settings.LOCAL_UPLOADS_DIR: - import_uploads_local(import_dir, processing_avatars=processing_avatars, - processing_emojis=processing_emojis) - else: - if processing_avatars or processing_emojis: - bucket_name = settings.S3_AVATAR_BUCKET - else: - bucket_name = settings.S3_AUTH_UPLOADS_BUCKET - import_uploads_s3(bucket_name, import_dir, processing_avatars=processing_avatars) - -# Importing data suffers from a difficult ordering problem because of -# models that reference each other circularly. Here is a correct order. -# -# * Client [no deps] -# * Realm [-notifications_stream] -# * Stream [only depends on realm] -# * Realm's notifications_stream -# * Now can do all realm_tables -# * UserProfile, in order by ID to avoid bot loop issues -# * Huddle -# * Recipient -# * Subscription -# * Message -# * UserMessage -# -# Because the Python object => JSON conversion process is not fully -# faithful, we have to use a set of fixers (e.g. on DateTime objects -# and Foreign Keys) to do the import correctly. -def do_import_realm(import_dir: Path, subdomain: str) -> Realm: - logging.info("Importing realm dump %s" % (import_dir,)) - if not os.path.exists(import_dir): - raise Exception("Missing import directory!") - - realm_data_filename = os.path.join(import_dir, "realm.json") - if not os.path.exists(realm_data_filename): - raise Exception("Missing realm.json file!") - - logging.info("Importing realm data from %s" % (realm_data_filename,)) - with open(realm_data_filename) as f: - data = ujson.load(f) - - update_model_ids(Stream, data, 'zerver_stream', 'stream') - re_map_foreign_keys(data, 'zerver_realm', 'notifications_stream', related_table="stream") - - fix_datetime_fields(data, 'zerver_realm') - # Fix realm subdomain information - data['zerver_realm'][0]['string_id'] = subdomain - data['zerver_realm'][0]['name'] = subdomain - fix_realm_authentication_bitfield(data, 'zerver_realm', 'authentication_methods') - update_model_ids(Realm, data, 'zerver_realm', 'realm') - - realm = Realm(**data['zerver_realm'][0]) - if realm.notifications_stream_id is not None: - notifications_stream_id = int(realm.notifications_stream_id) # type: Optional[int] - else: - notifications_stream_id = None - realm.notifications_stream_id = None - realm.save() - bulk_import_client(data, Client, 'zerver_client') - - # Email tokens will automatically be randomly generated when the - # Stream objects are created by Django. - fix_datetime_fields(data, 'zerver_stream') - re_map_foreign_keys(data, 'zerver_stream', 'realm', related_table="realm") - bulk_import_model(data, Stream, 'zerver_stream') - - realm.notifications_stream_id = notifications_stream_id - realm.save() - - re_map_foreign_keys(data, 'zerver_defaultstream', 'stream', related_table="stream") - re_map_foreign_keys(data, 'zerver_realmemoji', 'author', related_table="user_profile") - for (table, model, related_table) in realm_tables: - re_map_foreign_keys(data, table, 'realm', related_table="realm") - update_model_ids(model, data, table, related_table) - bulk_import_model(data, model, table) - - # Remap the user IDs for notification_bot and friends to their - # appropriate IDs on this server - for item in data['zerver_userprofile_crossrealm']: - logging.info("Adding to ID map: %s %s" % (item['id'], get_system_bot(item['email']).id)) - new_user_id = get_system_bot(item['email']).id - update_id_map(table='user_profile', old_id=item['id'], new_id=new_user_id) - - # Merge in zerver_userprofile_mirrordummy - data['zerver_userprofile'] = data['zerver_userprofile'] + data['zerver_userprofile_mirrordummy'] - del data['zerver_userprofile_mirrordummy'] - data['zerver_userprofile'].sort(key=lambda r: r['id']) - - # To remap foreign key for UserProfile.last_active_message_id - update_message_foreign_keys(import_dir) - - fix_datetime_fields(data, 'zerver_userprofile') - update_model_ids(UserProfile, data, 'zerver_userprofile', 'user_profile') - re_map_foreign_keys(data, 'zerver_userprofile', 'realm', related_table="realm") - re_map_foreign_keys(data, 'zerver_userprofile', 'bot_owner', related_table="user_profile") - re_map_foreign_keys(data, 'zerver_userprofile', 'default_sending_stream', - related_table="stream") - re_map_foreign_keys(data, 'zerver_userprofile', 'default_events_register_stream', - related_table="stream") - re_map_foreign_keys(data, 'zerver_userprofile', 'last_active_message_id', - related_table="message", id_field=True) - for user_profile_dict in data['zerver_userprofile']: - user_profile_dict['password'] = None - user_profile_dict['api_key'] = random_api_key() - # Since Zulip doesn't use these permissions, drop them - del user_profile_dict['user_permissions'] - del user_profile_dict['groups'] - - user_profiles = [UserProfile(**item) for item in data['zerver_userprofile']] - for user_profile in user_profiles: - user_profile.set_unusable_password() - UserProfile.objects.bulk_create(user_profiles) - - if 'zerver_huddle' in data: - bulk_import_model(data, Huddle, 'zerver_huddle') - - re_map_foreign_keys(data, 'zerver_recipient', 'type_id', related_table="stream", - recipient_field=True, id_field=True) - re_map_foreign_keys(data, 'zerver_recipient', 'type_id', related_table="user_profile", - recipient_field=True, id_field=True) - update_model_ids(Recipient, data, 'zerver_recipient', 'recipient') - bulk_import_model(data, Recipient, 'zerver_recipient') - - re_map_foreign_keys(data, 'zerver_subscription', 'user_profile', related_table="user_profile") - re_map_foreign_keys(data, 'zerver_subscription', 'recipient', related_table="recipient") - update_model_ids(Subscription, data, 'zerver_subscription', 'subscription') - bulk_import_model(data, Subscription, 'zerver_subscription') - - fix_datetime_fields(data, 'zerver_userpresence') - re_map_foreign_keys(data, 'zerver_userpresence', 'user_profile', related_table="user_profile") - re_map_foreign_keys(data, 'zerver_userpresence', 'client', related_table='client') - update_model_ids(UserPresence, data, 'zerver_userpresence', 'user_presence') - bulk_import_model(data, UserPresence, 'zerver_userpresence') - - fix_datetime_fields(data, 'zerver_useractivity') - re_map_foreign_keys(data, 'zerver_useractivity', 'user_profile', related_table="user_profile") - re_map_foreign_keys(data, 'zerver_useractivity', 'client', related_table='client') - update_model_ids(UserActivity, data, 'zerver_useractivity', 'useractivity') - bulk_import_model(data, UserActivity, 'zerver_useractivity') - - fix_datetime_fields(data, 'zerver_useractivityinterval') - re_map_foreign_keys(data, 'zerver_useractivityinterval', 'user_profile', related_table="user_profile") - update_model_ids(UserActivityInterval, data, 'zerver_useractivityinterval', - 'useractivityinterval') - bulk_import_model(data, UserActivityInterval, 'zerver_useractivityinterval') - - if 'zerver_customprofilefield' in data: - # As the export of Custom Profile fields is not supported, Zulip exported - # data would not contain this field. - # However this is supported in slack importer script - re_map_foreign_keys(data, 'zerver_customprofilefield', 'realm', related_table="realm") - update_model_ids(CustomProfileField, data, 'zerver_customprofilefield', - related_table="customprofilefield") - bulk_import_model(data, CustomProfileField, 'zerver_customprofilefield') - - re_map_foreign_keys(data, 'zerver_customprofilefield_value', 'user_profile', - related_table="user_profile") - re_map_foreign_keys(data, 'zerver_customprofilefield_value', 'field', - related_table="customprofilefield") - update_model_ids(CustomProfileFieldValue, data, 'zerver_customprofilefield_value', - related_table="customprofilefield_value") - bulk_import_model(data, CustomProfileFieldValue, 'zerver_customprofilefield_value') - - # Import uploaded files and avatars - import_uploads(os.path.join(import_dir, "avatars"), processing_avatars=True) - import_uploads(os.path.join(import_dir, "uploads")) - - # We need to have this check as the emoji files are only present in the data - # importer from slack - # For Zulip export, this doesn't exist - if os.path.exists(os.path.join(import_dir, "emoji")): - import_uploads(os.path.join(import_dir, "emoji"), processing_emojis=True) - - # Import zerver_message and zerver_usermessage - import_message_data(import_dir) - - # Do attachments AFTER message data is loaded. - # TODO: de-dup how we read these json files. - fn = os.path.join(import_dir, "attachment.json") - if not os.path.exists(fn): - raise Exception("Missing attachment.json file!") - - logging.info("Importing attachment data from %s" % (fn,)) - with open(fn) as f: - data = ujson.load(f) - - import_attachments(data) - return realm - -# create_users and do_import_system_bots differ from their equivalent in -# zerver/management/commands/initialize_voyager_db.py because here we check if the bots -# don't already exist and only then create a user for these bots. -def do_import_system_bots(realm: Any) -> None: - internal_bots = [(bot['name'], bot['email_template'] % (settings.INTERNAL_BOT_DOMAIN,)) - for bot in settings.INTERNAL_BOTS] - create_users(realm, internal_bots, bot_type=UserProfile.DEFAULT_BOT) - names = [(settings.FEEDBACK_BOT_NAME, settings.FEEDBACK_BOT)] - create_users(realm, names, bot_type=UserProfile.DEFAULT_BOT) - print("Finished importing system bots.") - -def create_users(realm: Realm, name_list: Iterable[Tuple[Text, Text]], - bot_type: Optional[int]=None) -> None: - user_set = set() - for full_name, email in name_list: - short_name = email_to_username(email) - if not UserProfile.objects.filter(email=email): - user_set.add((email, full_name, short_name, True)) - bulk_create_users(realm, user_set, bot_type) - -def update_message_foreign_keys(import_dir: Path) -> None: - dump_file_id = 1 - while True: - message_filename = os.path.join(import_dir, "messages-%06d.json" % (dump_file_id,)) - if not os.path.exists(message_filename): - break - - with open(message_filename) as f: - data = ujson.load(f) - - update_model_ids(Message, data, 'zerver_message', 'message') - dump_file_id += 1 - -def import_message_data(import_dir: Path) -> None: - dump_file_id = 1 - while True: - message_filename = os.path.join(import_dir, "messages-%06d.json" % (dump_file_id,)) - if not os.path.exists(message_filename): - break - - with open(message_filename) as f: - data = ujson.load(f) - - logging.info("Importing message dump %s" % (message_filename,)) - re_map_foreign_keys(data, 'zerver_message', 'sender', related_table="user_profile") - re_map_foreign_keys(data, 'zerver_message', 'recipient', related_table="recipient") - re_map_foreign_keys(data, 'zerver_message', 'sending_client', related_table='client') - fix_datetime_fields(data, 'zerver_message') - # Parser to update message content with the updated attachment urls - fix_upload_links(data, 'zerver_message') - - re_map_foreign_keys(data, 'zerver_message', 'id', related_table='message', id_field=True) - bulk_import_model(data, Message, 'zerver_message') - - # Due to the structure of these message chunks, we're - # guaranteed to have already imported all the Message objects - # for this batch of UserMessage objects. - re_map_foreign_keys(data, 'zerver_usermessage', 'message', related_table="message") - re_map_foreign_keys(data, 'zerver_usermessage', 'user_profile', related_table="user_profile") - fix_bitfield_keys(data, 'zerver_usermessage', 'flags') - update_model_ids(UserMessage, data, 'zerver_usermessage', 'usermessage') - bulk_import_model(data, UserMessage, 'zerver_usermessage') - - # As the export of Reactions is not supported, Zulip exported - # data would not contain this field. - # However this is supported in slack importer script - if 'zerver_reaction' in data: - re_map_foreign_keys(data, 'zerver_reaction', 'message', related_table="message") - re_map_foreign_keys(data, 'zerver_reaction', 'user_profile', related_table="user_profile") - for reaction in data['zerver_reaction']: - if reaction['reaction_type'] == Reaction.REALM_EMOJI: - re_map_foreign_keys(data, 'zerver_reaction', 'emoji_code', - related_table="realmemoji", id_field=True) - update_model_ids(Reaction, data, 'zerver_reaction', 'reaction') - bulk_import_model(data, Reaction, 'zerver_reaction') - - dump_file_id += 1 - -def import_attachments(data: TableData) -> None: - - # Clean up the data in zerver_attachment that is not - # relevant to our many-to-many import. - fix_datetime_fields(data, 'zerver_attachment') - re_map_foreign_keys(data, 'zerver_attachment', 'owner', related_table="user_profile") - re_map_foreign_keys(data, 'zerver_attachment', 'realm', related_table="realm") - - # Configure ourselves. Django models many-to-many (m2m) - # relations asymmetrically. The parent here refers to the - # Model that has the ManyToManyField. It is assumed here - # the child models have been loaded, but we are in turn - # responsible for loading the parents and the m2m rows. - parent_model = Attachment - parent_db_table_name = 'zerver_attachment' - parent_singular = 'attachment' - child_singular = 'message' - child_plural = 'messages' - m2m_table_name = 'zerver_attachment_messages' - parent_id = 'attachment_id' - child_id = 'message_id' - - update_model_ids(parent_model, data, parent_db_table_name, 'attachment') - # First, build our list of many-to-many (m2m) rows. - # We do this in a slightly convoluted way to anticipate - # a future where we may need to call re_map_foreign_keys. - - m2m_rows = [] # type: List[Record] - for parent_row in data[parent_db_table_name]: - for fk_id in parent_row[child_plural]: - m2m_row = {} # type: Record - m2m_row[parent_singular] = parent_row['id'] - m2m_row[child_singular] = id_maps['message'][fk_id] - m2m_rows.append(m2m_row) - - # Create our table data for insert. - m2m_data = {m2m_table_name: m2m_rows} # type: TableData - convert_to_id_fields(m2m_data, m2m_table_name, parent_singular) - convert_to_id_fields(m2m_data, m2m_table_name, child_singular) - m2m_rows = m2m_data[m2m_table_name] - - # Next, delete out our child data from the parent rows. - for parent_row in data[parent_db_table_name]: - del parent_row[child_plural] - - # Update 'path_id' for the attachments - for attachment in data[parent_db_table_name]: - attachment['path_id'] = path_maps['attachment_path'][attachment['path_id']] - - # Next, load the parent rows. - bulk_import_model(data, parent_model, parent_db_table_name) - - # Now, go back to our m2m rows. - # TODO: Do this the kosher Django way. We may find a - # better way to do this in Django 1.9 particularly. - with connection.cursor() as cursor: - sql_template = ''' - insert into %s (%s, %s) values(%%s, %%s);''' % (m2m_table_name, - parent_id, - child_id) - tups = [(row[parent_id], row[child_id]) for row in m2m_rows] - cursor.executemany(sql_template, tups) - - logging.info('Successfully imported M2M table %s' % (m2m_table_name,)) diff --git a/zerver/lib/import_realm.py b/zerver/lib/import_realm.py new file mode 100644 index 0000000000..2c6896e91f --- /dev/null +++ b/zerver/lib/import_realm.py @@ -0,0 +1,700 @@ +import datetime +import logging +import os +import ujson +import shutil +import subprocess + +from boto.s3.connection import S3Connection +from boto.s3.key import Key +from django.conf import settings +from django.db import connection +from django.utils.timezone import utc as timezone_utc +from typing import Any, Dict, List, Optional, Set, Tuple, \ + Iterable, Text + +from zerver.lib.avatar_hash import user_avatar_path_from_ids +from zerver.lib.bulk_create import bulk_create_users +from zerver.lib.create_user import random_api_key +from zerver.lib.export import DATE_FIELDS, realm_tables, \ + Record, TableData, TableName, Field, Path +from zerver.lib.upload import random_name, sanitize_name, \ + S3UploadBackend, LocalUploadBackend +from zerver.models import UserProfile, Realm, Client, Huddle, Stream, \ + UserMessage, Subscription, Message, RealmEmoji, \ + RealmDomain, Recipient, get_user_profile_by_id, \ + UserPresence, UserActivity, UserActivityInterval, Reaction, \ + CustomProfileField, CustomProfileFieldValue, \ + Attachment, get_system_bot, email_to_username + +# Code from here is the realm import code path + +# id_maps is a dictionary that maps table names to dictionaries +# that map old ids to new ids. We use this in +# re_map_foreign_keys and other places. +# +# We explicity initialize id_maps with the tables that support +# id re-mapping. +# +# Code reviewers: give these tables extra scrutiny, as we need to +# make sure to reload related tables AFTER we re-map the ids. +id_maps = { + 'client': {}, + 'user_profile': {}, + 'realm': {}, + 'stream': {}, + 'recipient': {}, + 'subscription': {}, + 'defaultstream': {}, + 'reaction': {}, + 'realmemoji': {}, + 'realmdomain': {}, + 'realmfilter': {}, + 'message': {}, + 'user_presence': {}, + 'useractivity': {}, + 'useractivityinterval': {}, + 'usermessage': {}, + 'customprofilefield': {}, + 'customprofilefield_value': {}, + 'attachment': {}, +} # type: Dict[str, Dict[int, int]] + +path_maps = { + 'attachment_path': {}, +} # type: Dict[str, Dict[str, str]] + +def update_id_map(table: TableName, old_id: int, new_id: int) -> None: + if table not in id_maps: + raise Exception(''' + Table %s is not initialized in id_maps, which could + mean that we have not thought through circular + dependencies. + ''' % (table,)) + id_maps[table][old_id] = new_id + +def fix_datetime_fields(data: TableData, table: TableName) -> None: + for item in data[table]: + for field_name in DATE_FIELDS[table]: + if item[field_name] is not None: + item[field_name] = datetime.datetime.fromtimestamp(item[field_name], tz=timezone_utc) + +def fix_upload_links(data: TableData, message_table: TableName) -> None: + """ + Because the URLs for uploaded files encode the realm ID of the + organization being imported (which is only determined at import + time), we need to rewrite the URLs of links to uploaded files + during the import process. + """ + for message in data[message_table]: + if message['has_attachment'] is True: + for key, value in path_maps['attachment_path'].items(): + if key in message['content']: + message['content'] = message['content'].replace(key, value) + if message['rendered_content']: + message['rendered_content'] = message['rendered_content'].replace(key, value) + +def current_table_ids(data: TableData, table: TableName) -> List[int]: + """ + Returns the ids present in the current table + """ + id_list = [] + for item in data[table]: + id_list.append(item["id"]) + return id_list + +def idseq(model_class: Any) -> str: + if model_class == RealmDomain: + return 'zerver_realmalias_id_seq' + return '{}_id_seq'.format(model_class._meta.db_table) + +def allocate_ids(model_class: Any, count: int) -> List[int]: + """ + Increases the sequence number for a given table by the amount of objects being + imported into that table. Hence, this gives a reserved range of ids to import the + converted slack objects into the tables. + """ + conn = connection.cursor() + sequence = idseq(model_class) + conn.execute("select nextval('%s') from generate_series(1,%s)" % + (sequence, str(count))) + query = conn.fetchall() # Each element in the result is a tuple like (5,) + conn.close() + # convert List[Tuple[int]] to List[int] + return [item[0] for item in query] + +def convert_to_id_fields(data: TableData, table: TableName, field_name: Field) -> None: + ''' + When Django gives us dict objects via model_to_dict, the foreign + key fields are `foo`, but we want `foo_id` for the bulk insert. + This function handles the simple case where we simply rename + the fields. For cases where we need to munge ids in the + database, see re_map_foreign_keys. + ''' + for item in data[table]: + item[field_name + "_id"] = item[field_name] + del item[field_name] + +def re_map_foreign_keys(data: TableData, + table: TableName, + field_name: Field, + related_table: TableName, + verbose: bool=False, + id_field: bool=False, + recipient_field: bool=False) -> None: + """ + This is a wrapper function for all the realm data tables + and only avatar and attachment records need to be passed through the internal function + because of the difference in data format (TableData corresponding to realm data tables + and List[Record] corresponding to the avatar and attachment records) + """ + re_map_foreign_keys_internal(data[table], table, field_name, related_table, verbose, id_field, + recipient_field) + +def re_map_foreign_keys_internal(data_table: List[Record], + table: TableName, + field_name: Field, + related_table: TableName, + verbose: bool=False, + id_field: bool=False, + recipient_field: bool=False) -> None: + ''' + We occasionally need to assign new ids to rows during the + import/export process, to accommodate things like existing rows + already being in tables. See bulk_import_client for more context. + + The tricky part is making sure that foreign key references + are in sync with the new ids, and this fixer function does + the re-mapping. (It also appends `_id` to the field.) + ''' + lookup_table = id_maps[related_table] + for item in data_table: + if recipient_field: + if related_table == "stream" and item['type'] == 2: + pass + elif related_table == "user_profile" and item['type'] == 1: + pass + else: + continue + old_id = item[field_name] + if old_id in lookup_table: + new_id = lookup_table[old_id] + if verbose: + logging.info('Remapping %s %s from %s to %s' % (table, + field_name + '_id', + old_id, + new_id)) + else: + new_id = old_id + if not id_field: + item[field_name + "_id"] = new_id + del item[field_name] + else: + item[field_name] = new_id + +def fix_bitfield_keys(data: TableData, table: TableName, field_name: Field) -> None: + for item in data[table]: + item[field_name] = item[field_name + '_mask'] + del item[field_name + '_mask'] + +def fix_realm_authentication_bitfield(data: TableData, table: TableName, field_name: Field) -> None: + """Used to fixup the authentication_methods bitfield to be a string""" + for item in data[table]: + values_as_bitstring = ''.join(['1' if field[1] else '0' for field in + item[field_name]]) + values_as_int = int(values_as_bitstring, 2) + item[field_name] = values_as_int + +def update_model_ids(model: Any, data: TableData, table: TableName, related_table: TableName) -> None: + old_id_list = current_table_ids(data, table) + allocated_id_list = allocate_ids(model, len(data[table])) + for item in range(len(data[table])): + update_id_map(related_table, old_id_list[item], allocated_id_list[item]) + re_map_foreign_keys(data, table, 'id', related_table=related_table, id_field=True) + +def bulk_import_model(data: TableData, model: Any, table: TableName, + dump_file_id: Optional[str]=None) -> None: + # TODO, deprecate dump_file_id + model.objects.bulk_create(model(**item) for item in data[table]) + if dump_file_id is None: + logging.info("Successfully imported %s from %s." % (model, table)) + else: + logging.info("Successfully imported %s from %s[%s]." % (model, table, dump_file_id)) + +# Client is a table shared by multiple realms, so in order to +# correctly import multiple realms into the same server, we need to +# check if a Client object already exists, and so we need to support +# remap all Client IDs to the values in the new DB. +def bulk_import_client(data: TableData, model: Any, table: TableName) -> None: + for item in data[table]: + try: + client = Client.objects.get(name=item['name']) + except Client.DoesNotExist: + client = Client.objects.create(name=item['name']) + update_id_map(table='client', old_id=item['id'], new_id=client.id) + +def import_uploads_local(import_dir: Path, processing_avatars: bool=False, + processing_emojis: bool=False) -> None: + records_filename = os.path.join(import_dir, "records.json") + with open(records_filename) as records_file: + records = ujson.loads(records_file.read()) + + re_map_foreign_keys_internal(records, 'records', 'realm_id', related_table="realm", + id_field=True) + if not processing_emojis: + re_map_foreign_keys_internal(records, 'records', 'user_profile_id', + related_table="user_profile", id_field=True) + for record in records: + if processing_avatars: + # For avatars, we need to rehash the user ID with the + # new server's avatar salt + avatar_path = user_avatar_path_from_ids(record['user_profile_id'], record['realm_id']) + file_path = os.path.join(settings.LOCAL_UPLOADS_DIR, "avatars", avatar_path) + if record['s3_path'].endswith('.original'): + file_path += '.original' + else: + file_path += '.png' + elif processing_emojis: + # For emojis we follow the function 'upload_emoji_image' + emoji_path = RealmEmoji.PATH_ID_TEMPLATE.format( + realm_id=record['realm_id'], + emoji_file_name=record['file_name']) + file_path = os.path.join(settings.LOCAL_UPLOADS_DIR, "avatars", emoji_path) + else: + # Should be kept in sync with its equivalent in zerver/lib/uploads in the + # function 'upload_message_image' + s3_file_name = "/".join([ + str(record['realm_id']), + random_name(18), + sanitize_name(os.path.basename(record['path'])) + ]) + file_path = os.path.join(settings.LOCAL_UPLOADS_DIR, "files", s3_file_name) + path_maps['attachment_path'][record['path']] = s3_file_name + + orig_file_path = os.path.join(import_dir, record['path']) + if not os.path.exists(os.path.dirname(file_path)): + subprocess.check_call(["mkdir", "-p", os.path.dirname(file_path)]) + shutil.copy(orig_file_path, file_path) + + if processing_avatars: + # Ensure that we have medium-size avatar images for every + # avatar. TODO: This implementation is hacky, both in that it + # does get_user_profile_by_id for each user, and in that it + # might be better to require the export to just have these. + upload_backend = LocalUploadBackend() + for record in records: + if record['s3_path'].endswith('.original'): + user_profile = get_user_profile_by_id(record['user_profile_id']) + avatar_path = user_avatar_path_from_ids(user_profile.id, record['realm_id']) + medium_file_path = os.path.join(settings.LOCAL_UPLOADS_DIR, "avatars", + avatar_path) + '-medium.png' + if os.path.exists(medium_file_path): + # We remove the image here primarily to deal with + # issues when running the import script multiple + # times in development (where one might reuse the + # same realm ID from a previous iteration). + os.remove(medium_file_path) + upload_backend.ensure_medium_avatar_image(user_profile=user_profile) + +def import_uploads_s3(bucket_name: str, import_dir: Path, processing_avatars: bool=False, + processing_emojis: bool=False) -> None: + upload_backend = S3UploadBackend() + conn = S3Connection(settings.S3_KEY, settings.S3_SECRET_KEY) + bucket = conn.get_bucket(bucket_name, validate=True) + + records_filename = os.path.join(import_dir, "records.json") + with open(records_filename) as records_file: + records = ujson.loads(records_file.read()) + + re_map_foreign_keys_internal(records, 'records', 'realm_id', related_table="realm", + id_field=True) + re_map_foreign_keys_internal(records, 'records', 'user_profile_id', + related_table="user_profile", id_field=True) + for record in records: + key = Key(bucket) + + if processing_avatars: + # For avatars, we need to rehash the user's email with the + # new server's avatar salt + avatar_path = user_avatar_path_from_ids(record['user_profile_id'], record['realm_id']) + key.key = avatar_path + if record['s3_path'].endswith('.original'): + key.key += '.original' + if processing_emojis: + # For emojis we follow the function 'upload_emoji_image' + emoji_path = RealmEmoji.PATH_ID_TEMPLATE.format( + realm_id=record['realm_id'], + emoji_file_name=record['file_name']) + key.key = emoji_path + else: + # Should be kept in sync with its equivalent in zerver/lib/uploads in the + # function 'upload_message_image' + s3_file_name = "/".join([ + str(record['realm_id']), + random_name(18), + sanitize_name(os.path.basename(record['path'])) + ]) + key.key = s3_file_name + path_maps['attachment_path'][record['path']] = s3_file_name + + user_profile_id = int(record['user_profile_id']) + # Support email gateway bot and other cross-realm messages + if user_profile_id in id_maps["user_profile"]: + logging.info("Uploaded by ID mapped user: %s!" % (user_profile_id,)) + user_profile_id = id_maps["user_profile"][user_profile_id] + user_profile = get_user_profile_by_id(user_profile_id) + key.set_metadata("user_profile_id", str(user_profile.id)) + key.set_metadata("realm_id", str(user_profile.realm_id)) + key.set_metadata("orig_last_modified", record['last_modified']) + + headers = {'Content-Type': record['content_type']} + + key.set_contents_from_filename(os.path.join(import_dir, record['path']), headers=headers) + + if processing_avatars: + # TODO: Ideally, we'd do this in a separate pass, after + # all the avatars have been uploaded, since we may end up + # unnecssarily resizing images just before the medium-size + # image in the export is uploaded. See the local uplods + # code path for more notes. + upload_backend.ensure_medium_avatar_image(user_profile=user_profile) + +def import_uploads(import_dir: Path, processing_avatars: bool=False, + processing_emojis: bool=False) -> None: + if processing_avatars: + logging.info("Importing avatars") + elif processing_emojis: + logging.info("Importing emojis") + else: + logging.info("Importing uploaded files") + if settings.LOCAL_UPLOADS_DIR: + import_uploads_local(import_dir, processing_avatars=processing_avatars, + processing_emojis=processing_emojis) + else: + if processing_avatars or processing_emojis: + bucket_name = settings.S3_AVATAR_BUCKET + else: + bucket_name = settings.S3_AUTH_UPLOADS_BUCKET + import_uploads_s3(bucket_name, import_dir, processing_avatars=processing_avatars) + +# Importing data suffers from a difficult ordering problem because of +# models that reference each other circularly. Here is a correct order. +# +# * Client [no deps] +# * Realm [-notifications_stream] +# * Stream [only depends on realm] +# * Realm's notifications_stream +# * Now can do all realm_tables +# * UserProfile, in order by ID to avoid bot loop issues +# * Huddle +# * Recipient +# * Subscription +# * Message +# * UserMessage +# +# Because the Python object => JSON conversion process is not fully +# faithful, we have to use a set of fixers (e.g. on DateTime objects +# and Foreign Keys) to do the import correctly. +def do_import_realm(import_dir: Path, subdomain: str) -> Realm: + logging.info("Importing realm dump %s" % (import_dir,)) + if not os.path.exists(import_dir): + raise Exception("Missing import directory!") + + realm_data_filename = os.path.join(import_dir, "realm.json") + if not os.path.exists(realm_data_filename): + raise Exception("Missing realm.json file!") + + logging.info("Importing realm data from %s" % (realm_data_filename,)) + with open(realm_data_filename) as f: + data = ujson.load(f) + + update_model_ids(Stream, data, 'zerver_stream', 'stream') + re_map_foreign_keys(data, 'zerver_realm', 'notifications_stream', related_table="stream") + + fix_datetime_fields(data, 'zerver_realm') + # Fix realm subdomain information + data['zerver_realm'][0]['string_id'] = subdomain + data['zerver_realm'][0]['name'] = subdomain + fix_realm_authentication_bitfield(data, 'zerver_realm', 'authentication_methods') + update_model_ids(Realm, data, 'zerver_realm', 'realm') + + realm = Realm(**data['zerver_realm'][0]) + if realm.notifications_stream_id is not None: + notifications_stream_id = int(realm.notifications_stream_id) # type: Optional[int] + else: + notifications_stream_id = None + realm.notifications_stream_id = None + realm.save() + bulk_import_client(data, Client, 'zerver_client') + + # Email tokens will automatically be randomly generated when the + # Stream objects are created by Django. + fix_datetime_fields(data, 'zerver_stream') + re_map_foreign_keys(data, 'zerver_stream', 'realm', related_table="realm") + bulk_import_model(data, Stream, 'zerver_stream') + + realm.notifications_stream_id = notifications_stream_id + realm.save() + + re_map_foreign_keys(data, 'zerver_defaultstream', 'stream', related_table="stream") + re_map_foreign_keys(data, 'zerver_realmemoji', 'author', related_table="user_profile") + for (table, model, related_table) in realm_tables: + re_map_foreign_keys(data, table, 'realm', related_table="realm") + update_model_ids(model, data, table, related_table) + bulk_import_model(data, model, table) + + # Remap the user IDs for notification_bot and friends to their + # appropriate IDs on this server + for item in data['zerver_userprofile_crossrealm']: + logging.info("Adding to ID map: %s %s" % (item['id'], get_system_bot(item['email']).id)) + new_user_id = get_system_bot(item['email']).id + update_id_map(table='user_profile', old_id=item['id'], new_id=new_user_id) + + # Merge in zerver_userprofile_mirrordummy + data['zerver_userprofile'] = data['zerver_userprofile'] + data['zerver_userprofile_mirrordummy'] + del data['zerver_userprofile_mirrordummy'] + data['zerver_userprofile'].sort(key=lambda r: r['id']) + + # To remap foreign key for UserProfile.last_active_message_id + update_message_foreign_keys(import_dir) + + fix_datetime_fields(data, 'zerver_userprofile') + update_model_ids(UserProfile, data, 'zerver_userprofile', 'user_profile') + re_map_foreign_keys(data, 'zerver_userprofile', 'realm', related_table="realm") + re_map_foreign_keys(data, 'zerver_userprofile', 'bot_owner', related_table="user_profile") + re_map_foreign_keys(data, 'zerver_userprofile', 'default_sending_stream', + related_table="stream") + re_map_foreign_keys(data, 'zerver_userprofile', 'default_events_register_stream', + related_table="stream") + re_map_foreign_keys(data, 'zerver_userprofile', 'last_active_message_id', + related_table="message", id_field=True) + for user_profile_dict in data['zerver_userprofile']: + user_profile_dict['password'] = None + user_profile_dict['api_key'] = random_api_key() + # Since Zulip doesn't use these permissions, drop them + del user_profile_dict['user_permissions'] + del user_profile_dict['groups'] + + user_profiles = [UserProfile(**item) for item in data['zerver_userprofile']] + for user_profile in user_profiles: + user_profile.set_unusable_password() + UserProfile.objects.bulk_create(user_profiles) + + if 'zerver_huddle' in data: + bulk_import_model(data, Huddle, 'zerver_huddle') + + re_map_foreign_keys(data, 'zerver_recipient', 'type_id', related_table="stream", + recipient_field=True, id_field=True) + re_map_foreign_keys(data, 'zerver_recipient', 'type_id', related_table="user_profile", + recipient_field=True, id_field=True) + update_model_ids(Recipient, data, 'zerver_recipient', 'recipient') + bulk_import_model(data, Recipient, 'zerver_recipient') + + re_map_foreign_keys(data, 'zerver_subscription', 'user_profile', related_table="user_profile") + re_map_foreign_keys(data, 'zerver_subscription', 'recipient', related_table="recipient") + update_model_ids(Subscription, data, 'zerver_subscription', 'subscription') + bulk_import_model(data, Subscription, 'zerver_subscription') + + fix_datetime_fields(data, 'zerver_userpresence') + re_map_foreign_keys(data, 'zerver_userpresence', 'user_profile', related_table="user_profile") + re_map_foreign_keys(data, 'zerver_userpresence', 'client', related_table='client') + update_model_ids(UserPresence, data, 'zerver_userpresence', 'user_presence') + bulk_import_model(data, UserPresence, 'zerver_userpresence') + + fix_datetime_fields(data, 'zerver_useractivity') + re_map_foreign_keys(data, 'zerver_useractivity', 'user_profile', related_table="user_profile") + re_map_foreign_keys(data, 'zerver_useractivity', 'client', related_table='client') + update_model_ids(UserActivity, data, 'zerver_useractivity', 'useractivity') + bulk_import_model(data, UserActivity, 'zerver_useractivity') + + fix_datetime_fields(data, 'zerver_useractivityinterval') + re_map_foreign_keys(data, 'zerver_useractivityinterval', 'user_profile', related_table="user_profile") + update_model_ids(UserActivityInterval, data, 'zerver_useractivityinterval', + 'useractivityinterval') + bulk_import_model(data, UserActivityInterval, 'zerver_useractivityinterval') + + if 'zerver_customprofilefield' in data: + # As the export of Custom Profile fields is not supported, Zulip exported + # data would not contain this field. + # However this is supported in slack importer script + re_map_foreign_keys(data, 'zerver_customprofilefield', 'realm', related_table="realm") + update_model_ids(CustomProfileField, data, 'zerver_customprofilefield', + related_table="customprofilefield") + bulk_import_model(data, CustomProfileField, 'zerver_customprofilefield') + + re_map_foreign_keys(data, 'zerver_customprofilefield_value', 'user_profile', + related_table="user_profile") + re_map_foreign_keys(data, 'zerver_customprofilefield_value', 'field', + related_table="customprofilefield") + update_model_ids(CustomProfileFieldValue, data, 'zerver_customprofilefield_value', + related_table="customprofilefield_value") + bulk_import_model(data, CustomProfileFieldValue, 'zerver_customprofilefield_value') + + # Import uploaded files and avatars + import_uploads(os.path.join(import_dir, "avatars"), processing_avatars=True) + import_uploads(os.path.join(import_dir, "uploads")) + + # We need to have this check as the emoji files are only present in the data + # importer from slack + # For Zulip export, this doesn't exist + if os.path.exists(os.path.join(import_dir, "emoji")): + import_uploads(os.path.join(import_dir, "emoji"), processing_emojis=True) + + # Import zerver_message and zerver_usermessage + import_message_data(import_dir) + + # Do attachments AFTER message data is loaded. + # TODO: de-dup how we read these json files. + fn = os.path.join(import_dir, "attachment.json") + if not os.path.exists(fn): + raise Exception("Missing attachment.json file!") + + logging.info("Importing attachment data from %s" % (fn,)) + with open(fn) as f: + data = ujson.load(f) + + import_attachments(data) + return realm + +# create_users and do_import_system_bots differ from their equivalent in +# zerver/management/commands/initialize_voyager_db.py because here we check if the bots +# don't already exist and only then create a user for these bots. +def do_import_system_bots(realm: Any) -> None: + internal_bots = [(bot['name'], bot['email_template'] % (settings.INTERNAL_BOT_DOMAIN,)) + for bot in settings.INTERNAL_BOTS] + create_users(realm, internal_bots, bot_type=UserProfile.DEFAULT_BOT) + names = [(settings.FEEDBACK_BOT_NAME, settings.FEEDBACK_BOT)] + create_users(realm, names, bot_type=UserProfile.DEFAULT_BOT) + print("Finished importing system bots.") + +def create_users(realm: Realm, name_list: Iterable[Tuple[Text, Text]], + bot_type: Optional[int]=None) -> None: + user_set = set() + for full_name, email in name_list: + short_name = email_to_username(email) + if not UserProfile.objects.filter(email=email): + user_set.add((email, full_name, short_name, True)) + bulk_create_users(realm, user_set, bot_type) + +def update_message_foreign_keys(import_dir: Path) -> None: + dump_file_id = 1 + while True: + message_filename = os.path.join(import_dir, "messages-%06d.json" % (dump_file_id,)) + if not os.path.exists(message_filename): + break + + with open(message_filename) as f: + data = ujson.load(f) + + update_model_ids(Message, data, 'zerver_message', 'message') + dump_file_id += 1 + +def import_message_data(import_dir: Path) -> None: + dump_file_id = 1 + while True: + message_filename = os.path.join(import_dir, "messages-%06d.json" % (dump_file_id,)) + if not os.path.exists(message_filename): + break + + with open(message_filename) as f: + data = ujson.load(f) + + logging.info("Importing message dump %s" % (message_filename,)) + re_map_foreign_keys(data, 'zerver_message', 'sender', related_table="user_profile") + re_map_foreign_keys(data, 'zerver_message', 'recipient', related_table="recipient") + re_map_foreign_keys(data, 'zerver_message', 'sending_client', related_table='client') + fix_datetime_fields(data, 'zerver_message') + # Parser to update message content with the updated attachment urls + fix_upload_links(data, 'zerver_message') + + re_map_foreign_keys(data, 'zerver_message', 'id', related_table='message', id_field=True) + bulk_import_model(data, Message, 'zerver_message') + + # Due to the structure of these message chunks, we're + # guaranteed to have already imported all the Message objects + # for this batch of UserMessage objects. + re_map_foreign_keys(data, 'zerver_usermessage', 'message', related_table="message") + re_map_foreign_keys(data, 'zerver_usermessage', 'user_profile', related_table="user_profile") + fix_bitfield_keys(data, 'zerver_usermessage', 'flags') + update_model_ids(UserMessage, data, 'zerver_usermessage', 'usermessage') + bulk_import_model(data, UserMessage, 'zerver_usermessage') + + # As the export of Reactions is not supported, Zulip exported + # data would not contain this field. + # However this is supported in slack importer script + if 'zerver_reaction' in data: + re_map_foreign_keys(data, 'zerver_reaction', 'message', related_table="message") + re_map_foreign_keys(data, 'zerver_reaction', 'user_profile', related_table="user_profile") + for reaction in data['zerver_reaction']: + if reaction['reaction_type'] == Reaction.REALM_EMOJI: + re_map_foreign_keys(data, 'zerver_reaction', 'emoji_code', + related_table="realmemoji", id_field=True) + update_model_ids(Reaction, data, 'zerver_reaction', 'reaction') + bulk_import_model(data, Reaction, 'zerver_reaction') + + dump_file_id += 1 + +def import_attachments(data: TableData) -> None: + + # Clean up the data in zerver_attachment that is not + # relevant to our many-to-many import. + fix_datetime_fields(data, 'zerver_attachment') + re_map_foreign_keys(data, 'zerver_attachment', 'owner', related_table="user_profile") + re_map_foreign_keys(data, 'zerver_attachment', 'realm', related_table="realm") + + # Configure ourselves. Django models many-to-many (m2m) + # relations asymmetrically. The parent here refers to the + # Model that has the ManyToManyField. It is assumed here + # the child models have been loaded, but we are in turn + # responsible for loading the parents and the m2m rows. + parent_model = Attachment + parent_db_table_name = 'zerver_attachment' + parent_singular = 'attachment' + child_singular = 'message' + child_plural = 'messages' + m2m_table_name = 'zerver_attachment_messages' + parent_id = 'attachment_id' + child_id = 'message_id' + + update_model_ids(parent_model, data, parent_db_table_name, 'attachment') + # First, build our list of many-to-many (m2m) rows. + # We do this in a slightly convoluted way to anticipate + # a future where we may need to call re_map_foreign_keys. + + m2m_rows = [] # type: List[Record] + for parent_row in data[parent_db_table_name]: + for fk_id in parent_row[child_plural]: + m2m_row = {} # type: Record + m2m_row[parent_singular] = parent_row['id'] + m2m_row[child_singular] = id_maps['message'][fk_id] + m2m_rows.append(m2m_row) + + # Create our table data for insert. + m2m_data = {m2m_table_name: m2m_rows} # type: TableData + convert_to_id_fields(m2m_data, m2m_table_name, parent_singular) + convert_to_id_fields(m2m_data, m2m_table_name, child_singular) + m2m_rows = m2m_data[m2m_table_name] + + # Next, delete out our child data from the parent rows. + for parent_row in data[parent_db_table_name]: + del parent_row[child_plural] + + # Update 'path_id' for the attachments + for attachment in data[parent_db_table_name]: + attachment['path_id'] = path_maps['attachment_path'][attachment['path_id']] + + # Next, load the parent rows. + bulk_import_model(data, parent_model, parent_db_table_name) + + # Now, go back to our m2m rows. + # TODO: Do this the kosher Django way. We may find a + # better way to do this in Django 1.9 particularly. + with connection.cursor() as cursor: + sql_template = ''' + insert into %s (%s, %s) values(%%s, %%s);''' % (m2m_table_name, + parent_id, + child_id) + tups = [(row[parent_id], row[child_id]) for row in m2m_rows] + cursor.executemany(sql_template, tups) + + logging.info('Successfully imported M2M table %s' % (m2m_table_name,)) diff --git a/zerver/management/commands/import.py b/zerver/management/commands/import.py index 7d616eb2a8..9deef7f61f 100644 --- a/zerver/management/commands/import.py +++ b/zerver/management/commands/import.py @@ -8,7 +8,7 @@ from django.conf import settings from django.core.management import call_command from django.core.management.base import BaseCommand, CommandParser -from zerver.lib.export import do_import_realm, do_import_system_bots +from zerver.lib.import_realm import do_import_realm, do_import_system_bots from zerver.forms import check_subdomain_available from zerver.models import Client, DefaultStream, Huddle, \ Message, Realm, RealmDomain, RealmFilter, Recipient, \ diff --git a/zerver/tests/test_slack_importer.py b/zerver/tests/test_slack_importer.py index 55c1d9b5d3..9ce46623a6 100644 --- a/zerver/tests/test_slack_importer.py +++ b/zerver/tests/test_slack_importer.py @@ -24,7 +24,7 @@ from zerver.lib.slack_data_to_zulip_data import ( do_convert_data, process_avatars, ) -from zerver.lib.export import ( +from zerver.lib.import_realm import ( do_import_realm, ) from zerver.lib.avatar_hash import (