2018-05-29 13:55:52 +02:00
|
|
|
import os
|
2018-06-07 12:25:59 +02:00
|
|
|
import ujson
|
2018-05-29 13:55:52 +02:00
|
|
|
import dateutil.parser
|
|
|
|
import random
|
|
|
|
import requests
|
|
|
|
import json
|
|
|
|
import logging
|
|
|
|
import shutil
|
|
|
|
import subprocess
|
|
|
|
|
|
|
|
from django.conf import settings
|
|
|
|
from django.forms.models import model_to_dict
|
|
|
|
from django.utils.timezone import now as timezone_now
|
|
|
|
from typing import Any, Dict, List, Tuple
|
|
|
|
|
2018-08-02 00:35:02 +02:00
|
|
|
from zerver.models import Realm, UserProfile, Recipient
|
2018-06-07 12:25:59 +02:00
|
|
|
from zerver.lib.export import MESSAGE_BATCH_CHUNK_SIZE
|
2018-05-29 13:55:52 +02:00
|
|
|
from zerver.lib.avatar_hash import user_avatar_path_from_ids
|
|
|
|
from zerver.lib.parallel import run_parallel
|
2018-08-01 01:10:55 +02:00
|
|
|
from zerver.data_import.import_util import ZerverFieldsT, build_zerver_realm, \
|
2018-08-02 00:35:02 +02:00
|
|
|
build_avatar, build_subscription, build_recipient
|
2018-05-29 13:55:52 +02:00
|
|
|
|
|
|
|
# stubs
# A parsed gitter export: a list of per-message dicts (as loaded from the
# export JSON file).
GitterDataT = List[Dict[str, Any]]

# The converter always produces exactly one realm; every converted record
# references this fixed realm id.
realm_id = 0
|
|
|
|
|
|
|
|
def gitter_workspace_to_realm(domain_name: str, gitter_data: GitterDataT,
                              realm_subdomain: str) -> Tuple[ZerverFieldsT,
                                                             List[ZerverFieldsT],
                                                             Dict[str, int]]:
    """
    Build the top-level realm dict of the zulip import format from a
    gitter export.

    Returns:
    1. realm, Converted Realm data
    2. avatars, which is list to map avatars to zulip avatar records.json
    3. user_map, which is a dictionary to map from gitter user id to zulip user id
    """
    NOW = float(timezone_now().timestamp())
    zerver_realm = build_zerver_realm(realm_id, realm_subdomain, NOW, 'Gitter')  # type: List[ZerverFieldsT]

    # Tables that have no gitter counterpart are emitted as empty lists.
    realm = dict(zerver_client=[{"name": "populate_db", "id": 1},
                                {"name": "website", "id": 2},
                                {"name": "API", "id": 3}],
                 zerver_customprofilefield=[],
                 zerver_customprofilefieldvalue=[],
                 zerver_userpresence=[],  # shows last logged in data, which is not available in gitter
                 zerver_userprofile_mirrordummy=[],
                 zerver_realmdomain=[{"realm": realm_id,
                                      "allow_subdomains": False,
                                      "domain": domain_name,
                                      "id": realm_id}],
                 zerver_useractivity=[],
                 zerver_realm=zerver_realm,
                 zerver_huddle=[],
                 zerver_userprofile_crossrealm=[],
                 zerver_useractivityinterval=[],
                 zerver_reaction=[],
                 zerver_realmemoji=[],
                 zerver_realmfilter=[])

    # Users, the single import stream, and the recipient/subscription
    # records tying them together.
    zerver_userprofile, avatars, user_map = build_userprofile(int(NOW), domain_name, gitter_data)
    zerver_stream, zerver_defaultstream = build_stream(int(NOW))
    zerver_recipient, zerver_subscription = build_recipient_and_subscription(
        zerver_userprofile, zerver_stream)

    realm['zerver_userprofile'] = zerver_userprofile
    realm['zerver_stream'] = zerver_stream
    realm['zerver_defaultstream'] = zerver_defaultstream
    realm['zerver_recipient'] = zerver_recipient
    realm['zerver_subscription'] = zerver_subscription

    return realm, avatars, user_map
|
|
|
|
|
|
|
|
def build_userprofile(timestamp: Any, domain_name: str,
                      gitter_data: GitterDataT) -> Tuple[List[ZerverFieldsT],
                                                         List[ZerverFieldsT],
                                                         Dict[str, int]]:
    """
    Build zulip user profiles from the senders found in the gitter export.

    Returns:
    1. zerver_userprofile, which is a list of user profile
    2. avatar_list, which is list to map avatars to zulip avatar records.json
    3. user_map, which is a dictionary to map from gitter user id to zulip user id
    """
    logging.info('######### IMPORTING USERS STARTED #########\n')
    zerver_userprofile = []  # type: List[ZerverFieldsT]
    avatar_list = []  # type: List[ZerverFieldsT]
    user_map = {}  # type: Dict[str, int]
    user_id = 0

    # The export has no separate user table; users are discovered from each
    # message's 'fromUser' and deduplicated via user_map.
    for data in gitter_data:
        if data['fromUser']['id'] not in user_map:
            user_data = data['fromUser']
            user_map[user_data['id']] = user_id

            email = get_user_email(user_data, domain_name)
            build_avatar(user_id, realm_id, email, user_data['avatarUrl'],
                         timestamp, avatar_list)

            # Build userprofile object
            userprofile = UserProfile(
                full_name=user_data['displayName'],
                short_name=user_data['username'],
                id=user_id,
                email=email,
                avatar_source='U',
                pointer=-1,
                date_joined=timestamp,
                last_login=timestamp)
            userprofile_dict = model_to_dict(userprofile)
            # Set realm id separately as the corresponding realm is not yet a Realm model
            # instance
            userprofile_dict['realm'] = realm_id
            zerver_userprofile.append(userprofile_dict)
            user_id += 1
    logging.info('######### IMPORTING USERS FINISHED #########\n')
    return zerver_userprofile, avatar_list, user_map
|
|
|
|
|
|
|
|
def get_user_email(user_data: ZerverFieldsT, domain_name: str) -> str:
    """Return a placeholder noreply address derived from the gitter username."""
    # TODO Get user email from github
    return "%s@users.noreply.github.com" % (user_data['username'],)
|
|
|
|
|
|
|
|
def build_stream(timestamp: Any) -> Tuple[List[ZerverFieldsT],
                                          List[ZerverFieldsT]]:
    """Create the single stream (plus its default-stream record) that
    receives the entire gitter export.

    Returns ([stream], [defaultstream]); both use id 0.
    """
    logging.info('######### IMPORTING STREAM STARTED #########\n')
    # We have only one stream for gitter export
    stream = {
        'realm': realm_id,
        'name': "from gitter",
        'deactivated': False,
        'description': "Imported from gitter",
        'invite_only': False,
        'date_created': timestamp,
        'id': 0,
    }

    defaultstream = {
        'stream': 0,
        'realm': realm_id,
        'id': 0,
    }
    logging.info('######### IMPORTING STREAMS FINISHED #########\n')
    return [stream], [defaultstream]
|
|
|
|
|
|
|
|
def build_recipient_and_subscription(
    zerver_userprofile: List[ZerverFieldsT],
    zerver_stream: List[ZerverFieldsT]) -> Tuple[List[ZerverFieldsT],
                                                 List[ZerverFieldsT]]:
    """
    Create recipient and subscription records: one stream recipient (every
    user is subscribed to it) followed by one personal recipient plus
    self-subscription per user.

    Returns:
    1. zerver_recipient, which is a list of mapped recipient
    2. zerver_subscription, which is a list of mapped subscription
    """
    zerver_recipient = []  # type: List[ZerverFieldsT]
    zerver_subscription = []  # type: List[ZerverFieldsT]
    # Both id sequences start at 0 and are incremented as records are built;
    # the ordering below therefore fixes the id layout.
    recipient_id = subscription_id = 0

    # For stream

    # We have only one recipient, because we have only one stream
    # Hence 'recipient_id'=0 corresponds to 'stream_id'=0
    recipient = build_recipient(0, recipient_id, Recipient.STREAM)
    zerver_recipient.append(recipient)

    # Subscribe every user to the single stream (recipient_id 0).
    for user in zerver_userprofile:
        subscription = build_subscription(recipient_id, user['id'], subscription_id)
        zerver_subscription.append(subscription)
        subscription_id += 1
    recipient_id += 1

    # For users
    for user in zerver_userprofile:
        recipient = build_recipient(user['id'], recipient_id, Recipient.PERSONAL)
        subscription = build_subscription(recipient_id, user['id'], subscription_id)
        zerver_recipient.append(recipient)
        zerver_subscription.append(subscription)
        recipient_id += 1
        subscription_id += 1

    return zerver_recipient, zerver_subscription
|
|
|
|
|
2018-06-07 12:25:59 +02:00
|
|
|
def convert_gitter_workspace_messages(gitter_data: GitterDataT, output_dir: str,
                                      zerver_subscription: List[ZerverFieldsT],
                                      user_map: Dict[str, int],
                                      user_short_name_to_full_name: Dict[str, str],
                                      chunk_size: int=MESSAGE_BATCH_CHUNK_SIZE) -> None:
    """
    Convert gitter messages to zulip messages and usermessages.

    Messages are stored in batches of ``chunk_size``, each batch written to
    its own messages-NNNNNN.json file under ``output_dir``.
    """
    logging.info('######### IMPORTING MESSAGES STARTED #########\n')
    # message/usermessage ids are global across all batches.
    message_id = usermessage_id = 0
    recipient_id = 0  # Corresponding to stream "gitter"

    low_index = 0
    upper_index = low_index + chunk_size
    dump_file_id = 1

    while True:
        message_json = {}
        zerver_message = []
        zerver_usermessage = []
        message_data = gitter_data[low_index: upper_index]
        if len(message_data) == 0:
            break
        for message in message_data:
            message_time = dateutil.parser.parse(message['sent']).timestamp()
            # NOTE: get_usermentions also rewrites @mentions inside
            # message['text'] to zulip's @**full name** syntax.
            mentioned_user_ids = get_usermentions(message, user_map,
                                                  user_short_name_to_full_name)
            rendered_content = None

            zulip_message = dict(
                sending_client=1,
                rendered_content_version=1,  # This is Zulip-specific
                has_image=False,
                subject='imported from gitter',
                pub_date=float(message_time),
                id=message_id,
                has_attachment=False,
                edit_history=None,
                sender=user_map[message['fromUser']['id']],
                content=message['text'],
                rendered_content=rendered_content,
                recipient=recipient_id,
                last_edit_time=None,
                has_link=False)
            zerver_message.append(zulip_message)

            # Emit one usermessage per subscriber of the stream recipient.
            for subscription in zerver_subscription:
                if subscription['recipient'] == recipient_id:
                    flags_mask = 1  # For read
                    if subscription['user_profile'] in mentioned_user_ids:
                        flags_mask = 9  # For read and mentioned

                    usermessage = dict(
                        user_profile=subscription['user_profile'],
                        id=usermessage_id,
                        flags_mask=flags_mask,
                        message=message_id)
                    usermessage_id += 1
                    zerver_usermessage.append(usermessage)
            message_id += 1

        message_json['zerver_message'] = zerver_message
        message_json['zerver_usermessage'] = zerver_usermessage
        message_filename = os.path.join(output_dir, "messages-%06d.json" % (dump_file_id,))
        logging.info("Writing Messages to %s\n" % (message_filename,))
        write_data_to_file(os.path.join(message_filename), message_json)

        # Advance to the next batch window.
        low_index = upper_index
        upper_index = chunk_size + low_index
        dump_file_id += 1

    logging.info('######### IMPORTING MESSAGES FINISHED #########\n')
|
|
|
|
|
2018-07-04 22:59:40 +02:00
|
|
|
def get_usermentions(message: Dict[str, Any], user_map: Dict[str, int],
                     user_short_name_to_full_name: Dict[str, str]) -> List[int]:
    """Rewrite gitter @mentions in message['text'] to zulip's
    @**full name** syntax (mutating the message dict in place) and return
    the zulip user ids of the mentioned users known to user_map."""
    mentioned_user_ids = []  # type: List[int]
    for mention in message.get('mentions', []):
        gitter_user_id = mention.get('userId')
        if gitter_user_id not in user_map:
            # Unknown or missing user id: leave the text untouched.
            continue
        short_name = mention['screenName']
        full_name = user_short_name_to_full_name[short_name]
        message['text'] = message['text'].replace(
            '@%s' % (short_name,), '@**%s**' % (full_name,))
        mentioned_user_ids.append(user_map[gitter_user_id])
    return mentioned_user_ids
|
|
|
|
|
2018-05-29 13:55:52 +02:00
|
|
|
def do_convert_data(gitter_data_file: str, output_dir: str, threads: int=6) -> None:
    """
    Convert a gitter export (a JSON file of messages) into zulip's data
    import format under ``output_dir``, then pack the result into
    ``output_dir + '.tar.gz'``.

    :param gitter_data_file: path to the gitter export JSON file
    :param output_dir: directory for the converted data; must be empty
    :param threads: number of parallel avatar-download jobs
    :raises Exception: if ``output_dir`` is not empty
    """
    # Subdomain is set by the user while running the import commands
    realm_subdomain = ""
    domain_name = settings.EXTERNAL_HOST

    os.makedirs(output_dir, exist_ok=True)
    # output directory should be empty initially
    if os.listdir(output_dir):
        raise Exception("Output directory should be empty!")

    # Read data from the gitter file.  Use a context manager so the file
    # handle is closed deterministically (the previous bare open() leaked it).
    with open(gitter_data_file) as gitter_file:
        gitter_data = json.load(gitter_file)

    realm, avatar_list, user_map = gitter_workspace_to_realm(
        domain_name, gitter_data, realm_subdomain)

    # For user mentions: map short names to full names so
    # convert_gitter_workspace_messages() can rewrite @mentions.
    user_short_name_to_full_name = {}
    for userprofile in realm['zerver_userprofile']:
        user_short_name_to_full_name[userprofile['short_name']] = userprofile['full_name']

    convert_gitter_workspace_messages(
        gitter_data, output_dir, realm['zerver_subscription'], user_map,
        user_short_name_to_full_name)

    avatar_folder = os.path.join(output_dir, 'avatars')
    avatar_realm_folder = os.path.join(avatar_folder, str(realm_id))
    os.makedirs(avatar_realm_folder, exist_ok=True)
    avatar_records = process_avatars(avatar_list, avatar_folder, threads)

    attachment = {"zerver_attachment": []}  # type: Dict[str, List[Any]]

    # IO realm.json
    create_converted_data_files(realm, output_dir, '/realm.json')
    # IO emoji records
    create_converted_data_files([], output_dir, '/emoji/records.json')
    # IO avatar records
    create_converted_data_files(avatar_records, output_dir, '/avatars/records.json')
    # IO uploads records
    create_converted_data_files([], output_dir, '/uploads/records.json')
    # IO attachments records
    create_converted_data_files(attachment, output_dir, '/attachment.json')

    subprocess.check_call(["tar", "-czf", output_dir + '.tar.gz', output_dir, '-P'])

    logging.info('######### DATA CONVERSION FINISHED #########\n')
    logging.info("Zulip data dump created at %s" % (output_dir))
|
|
|
|
|
|
|
|
def process_avatars(avatar_list: List[ZerverFieldsT], avatar_dir: str,
                    threads: int) -> List[ZerverFieldsT]:
    """
    This function gets the gitter avatar of the user and saves it in the
    user's avatar directory with both the extensions '.png' and '.original'.

    Each entry in avatar_list is mutated in place: its 'path' (originally
    the gitter avatar URL) is replaced by the local .png path; a copy of
    the entry pointing at the .original file is added alongside it.
    Returns the combined list of .png and .original records.
    """
    def get_avatar(avatar_upload_list: List[str]) -> int:
        # Worker for run_parallel: [url, png_path, original_path].
        gitter_avatar_url = avatar_upload_list[0]
        image_path = avatar_upload_list[1]
        original_image_path = avatar_upload_list[2]
        # Stream the download straight to disk, then duplicate it as the
        # '.original' file.
        response = requests.get(gitter_avatar_url, stream=True)
        with open(image_path, 'wb') as image_file:
            shutil.copyfileobj(response.raw, image_file)
        shutil.copy(image_path, original_image_path)
        return 0

    logging.info('######### GETTING AVATARS #########\n')
    logging.info('DOWNLOADING AVATARS .......\n')
    avatar_original_list = []  # type: List[ZerverFieldsT]
    avatar_upload_list = []  # type: List[List[str]]
    for avatar in avatar_list:
        avatar_hash = user_avatar_path_from_ids(avatar['user_profile_id'], realm_id)
        gitter_avatar_url = avatar['path']
        avatar_original = dict(avatar)

        image_path = ('%s/%s.png' % (avatar_dir, avatar_hash))
        original_image_path = ('%s/%s.original' % (avatar_dir, avatar_hash))

        avatar_upload_list.append([gitter_avatar_url, image_path, original_image_path])
        # We don't add the size field here in avatar's records.json,
        # since the metadata is not needed on the import end, and we
        # don't have it until we've downloaded the files anyway.
        avatar['path'] = image_path
        avatar['s3_path'] = image_path

        avatar_original['path'] = original_image_path
        avatar_original['s3_path'] = original_image_path
        avatar_original_list.append(avatar_original)

    # Run downloads parallely
    output = []
    for (status, job) in run_parallel(get_avatar, avatar_upload_list, threads=threads):
        output.append(job)

    logging.info('######### GETTING AVATARS FINISHED #########\n')
    return avatar_list + avatar_original_list
|
|
|
|
|
|
|
|
def create_converted_data_files(data: Any, output_dir: str, file_path: str) -> None:
    """Serialize ``data`` as indented JSON to ``output_dir + file_path``,
    creating any intermediate directories.

    :param data: any json-serializable object
    :param output_dir: base output directory
    :param file_path: path under ``output_dir``, starting with '/'
    """
    output_file = output_dir + file_path
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    # Use a context manager so the handle is flushed and closed
    # deterministically; the previous json.dump(data, open(...)) relied on
    # garbage collection to close the file.
    with open(output_file, 'w') as output:
        json.dump(data, output, indent=4)
|
2018-06-07 12:25:59 +02:00
|
|
|
|
|
|
|
def write_data_to_file(output_file: str, data: Any) -> None:
    """Serialize ``data`` to ``output_file`` as indented JSON (via ujson)."""
    with open(output_file, "w") as output:
        output.write(ujson.dumps(data, indent=4))
|