zulip/zerver/management/commands/import_dump.py

from __future__ import absolute_import
from __future__ import print_function

from optparse import make_option

from django.core.management import call_command
from django.core.management.base import BaseCommand
from django.db import connection
from django.conf import settings

from zerver.lib.actions import do_create_stream
from zerver.models import Realm, Stream, UserProfile, Recipient, Subscription, \
    Message, UserMessage, Huddle, DefaultStream, RealmAlias, RealmFilter, Client

import sys
import json

class Command(BaseCommand):
    DEFAULT_CHUNK_SIZE = 5000

    help = """Import Zulip database dump files into a fresh Zulip instance.

This command should be used only on a newly created, empty Zulip instance to
import a database dump from one or more JSON files.

Usage: python2.7 manage.py import_dump [--destroy-rebuild-database] [--chunk-size=%s] <json file name> [<json file name>...]""" % DEFAULT_CHUNK_SIZE

    option_list = BaseCommand.option_list + (
        make_option('--destroy-rebuild-database',
                    dest='destroy_rebuild_database',
                    default=False,
                    action="store_true",
                    help='Destroys and rebuilds the databases prior to import.'),
        make_option('--chunk-size',
                    dest='chunk_size',
                    type='int',
                    default=DEFAULT_CHUNK_SIZE,
                    help='Number of objects that are added to the table in one roundtrip to the database.')
    )


    def new_instance_check(self, model):
        count = model.objects.count()
        if count:
            print("Zulip instance is not empty, found %d rows in %s table. " \
                % (count, model._meta.db_table))
            print("You may use --destroy-rebuild-database to destroy and rebuild the database prior to import.")
            exit(1)


    def do_destroy_and_rebuild_database(self, db_name):
        call_command('flush', verbosity=0, interactive=False)

    def increment_row_counter(self, row_counter, database_dump, model):
        table_name = model._meta.db_table
        row_counter[table_name] = (row_counter.get(table_name) or 0) + \
            len(database_dump.get(table_name) or [ ])


    def test_table_row_count(self, row_counter, model):
        table_name = model._meta.db_table
        sys.stdout.write("%s: " % table_name)
        expected_count = row_counter.get(table_name) or 0
        actual_count = model.objects.count()
        status = "PASSED" if expected_count == actual_count else "FAILED"
        params = (expected_count, actual_count, status)
        sys.stdout.write("expected %d rows, got %d. %s\n" % params)


    def import_table(self, database_dump, realm_notification_map, model):
        table_name = model._meta.db_table
        if table_name in database_dump:
            cursor = connection.cursor()
            sys.stdout.write("Importing %s: " % table_name)
            accumulator = [ ]
            for row in database_dump[table_name]:
                # hack to filter out notifications_stream_id circular reference
                # out of zerver_realm table prior to insert of corresponding
                # streams.
                # removes notifications_stream_id from row dict
                if table_name == "zerver_realm":
                    realm_notification_map[row["id"]] = row.get("notifications_stream_id")
                    row = { field_name: value \
                        for field_name, value in row.items() \
                            if field_name != "notifications_stream_id" }

                accumulator.append(model(**row))
                if len(accumulator) % self.chunk_size == 0:
                    model.objects.bulk_create(accumulator)
                    sys.stdout.write(".")
                    accumulator = [ ]

            # create any remaining objects that haven't been flushed yet
            if len(accumulator):
                model.objects.bulk_create(accumulator)

            # set the next id sequence value to avoid a collision with the
            # imported ids
            cursor.execute("SELECT setval(%s, MAX(id)+1) FROM " + table_name,
                [table_name + "_id_seq"])

            sys.stdout.write(" [Done]\n")


    def handle(self, *args, **options):
        models_to_import = [Realm, Stream, UserProfile, Recipient, Subscription,
            Client, Message, UserMessage, Huddle, DefaultStream, RealmAlias,
            RealmFilter]

        self.chunk_size = options["chunk_size"]
        encoding = sys.getfilesystemencoding()

        if len(args) == 0:
            print("Please provide at least one database dump file name.")
            exit(1)

        if not options["destroy_rebuild_database"]:
            for model in models_to_import:
                self.new_instance_check(model)
        else:
            db_name = settings.DATABASES['default']['NAME']
            self.do_destroy_and_rebuild_database(db_name)

        # maps relationship between realm id and notifications_stream_id
        # generally, there should be only one realm per dump, but the code
        # doesn't make that assumption
        realm_notification_map = dict()

        # maping between table name and a total expected number of rows across
        # all input json files
        row_counter = dict()

        for file_name in args:
            try:
                fp = open(file_name, 'r')
            except IOError:
                print("File not found: '%s'" % (file_name,))
                exit(1)

            print("Processing file: %s ..." % (file_name,))

            # parse the database dump and load in memory
            # TODO: change this to a streaming parser to support loads > RAM size
            database_dump = json.load(fp, encoding)

            for model in models_to_import:
                self.increment_row_counter(row_counter, database_dump, model)
                self.import_table(database_dump, realm_notification_map, model)

            print("")

        # set notifications_stream_id on realm objects to correct value now
        # that foreign keys are in streams table
        if len(realm_notification_map):
            print("Setting realm notification stream...")
            for id, notifications_stream_id in realm_notification_map.items():
                Realm.objects \
                    .filter(id=id) \
                    .update(notifications_stream = notifications_stream_id)

        print("")
        print("Testing data import: ")

        # test that everything from all json dumps made it into the database
        for model in models_to_import:
            self.test_table_row_count(row_counter, model)
Added import_dump command to manage.py to import JSON database dumps (imported from commit abd410e5299f15c1a5851e87d18664b59837fbb0) 2015-08-19 05:15:40 +02:00			`from __future__ import absolute_import`
Apply Python 3 futurize transform libfuturize.fixes.fix_print_with_import. 2015-11-01 17:11:06 +01:00			`from __future__ import print_function`
Added import_dump command to manage.py to import JSON database dumps (imported from commit abd410e5299f15c1a5851e87d18664b59837fbb0) 2015-08-19 05:15:40 +02:00
			`from optparse import make_option`

			`from django.core.management import call_command`
			`from django.core.management.base import BaseCommand`
			`from django.db import connection`
			`from django.conf import settings`

			`from zerver.lib.actions import do_create_stream`
			`from zerver.models import Realm, Stream, UserProfile, Recipient, Subscription, \`
			`Message, UserMessage, Huddle, DefaultStream, RealmAlias, RealmFilter, Client`

			`import sys`
			`import json`

			`class Command(BaseCommand):`
			`DEFAULT_CHUNK_SIZE = 5000`

			`help = """Import Zulip database dump files into a fresh Zulip instance.`

			`This command should be used only on a newly created, empty Zulip instance to`
			`import a database dump from one or more JSON files.`

Consistently use /usr/bin/env python2.7 in shebangs and commands. 2015-10-04 18:56:26 +02:00			`Usage: python2.7 manage.py import_dump [--destroy-rebuild-database] [--chunk-size=%s] <json file name> [<json file name>...]""" % DEFAULT_CHUNK_SIZE`
Added import_dump command to manage.py to import JSON database dumps (imported from commit abd410e5299f15c1a5851e87d18664b59837fbb0) 2015-08-19 05:15:40 +02:00
			`option_list = BaseCommand.option_list + (`
			`make_option('--destroy-rebuild-database',`
			`dest='destroy_rebuild_database',`
			`default=False,`
			`action="store_true",`
			`help='Destroys and rebuilds the databases prior to import.'),`
			`make_option('--chunk-size',`
			`dest='chunk_size',`
			`type='int',`
			`default=DEFAULT_CHUNK_SIZE,`
			`help='Number of objects that are added to the table in one roundtrip to the database.')`
			`)`


			`def new_instance_check(self, model):`
			`count = model.objects.count()`
			`if count:`
Apply Python 3 futurize transform libfuturize.fixes.fix_print_with_import. 2015-11-01 17:11:06 +01:00			`print("Zulip instance is not empty, found %d rows in %s table. " \`
			`% (count, model._meta.db_table))`
			`print("You may use --destroy-rebuild-database to destroy and rebuild the database prior to import.")`
Added import_dump command to manage.py to import JSON database dumps (imported from commit abd410e5299f15c1a5851e87d18664b59837fbb0) 2015-08-19 05:15:40 +02:00			`exit(1)`


			`def do_destroy_and_rebuild_database(self, db_name):`
			`call_command('flush', verbosity=0, interactive=False)`

			`def increment_row_counter(self, row_counter, database_dump, model):`
			`table_name = model._meta.db_table`
			`row_counter[table_name] = (row_counter.get(table_name) or 0) + \`
			`len(database_dump.get(table_name) or [ ])`


			`def test_table_row_count(self, row_counter, model):`
			`table_name = model._meta.db_table`
			`sys.stdout.write("%s: " % table_name)`
			`expected_count = row_counter.get(table_name) or 0`
			`actual_count = model.objects.count()`
			`status = "PASSED" if expected_count == actual_count else "FAILED"`
			`params = (expected_count, actual_count, status)`
			`sys.stdout.write("expected %d rows, got %d. %s\n" % params)`


			`def import_table(self, database_dump, realm_notification_map, model):`
			`table_name = model._meta.db_table`
			`if table_name in database_dump:`
			`cursor = connection.cursor()`
			`sys.stdout.write("Importing %s: " % table_name)`
			`accumulator = [ ]`
			`for row in database_dump[table_name]:`
			`# hack to filter out notifications_stream_id circular reference`
			`# out of zerver_realm table prior to insert of corresponding`
			`# streams.`
			`# removes notifications_stream_id from row dict`
			`if table_name == "zerver_realm":`
			`realm_notification_map[row["id"]] = row.get("notifications_stream_id")`
			`row = { field_name: value \`
			`for field_name, value in row.items() \`
			`if field_name != "notifications_stream_id" }`

			`accumulator.append(model(**row))`
			`if len(accumulator) % self.chunk_size == 0:`
			`model.objects.bulk_create(accumulator)`
			`sys.stdout.write(".")`
			`accumulator = [ ]`

			`# create any remaining objects that haven't been flushed yet`
			`if len(accumulator):`
			`model.objects.bulk_create(accumulator)`

			`# set the next id sequence value to avoid a collision with the`
			`# imported ids`
			`cursor.execute("SELECT setval(%s, MAX(id)+1) FROM " + table_name,`
			`[table_name + "_id_seq"])`

			`sys.stdout.write(" [Done]\n")`


			`def handle(self, args, *options):`
			`models_to_import = [Realm, Stream, UserProfile, Recipient, Subscription,`
			`Client, Message, UserMessage, Huddle, DefaultStream, RealmAlias,`
			`RealmFilter]`

			`self.chunk_size = options["chunk_size"]`
			`encoding = sys.getfilesystemencoding()`

			`if len(args) == 0:`
Apply Python 3 futurize transform libfuturize.fixes.fix_print_with_import. 2015-11-01 17:11:06 +01:00			`print("Please provide at least one database dump file name.")`
Added import_dump command to manage.py to import JSON database dumps (imported from commit abd410e5299f15c1a5851e87d18664b59837fbb0) 2015-08-19 05:15:40 +02:00			`exit(1)`

			`if not options["destroy_rebuild_database"]:`
			`for model in models_to_import:`
			`self.new_instance_check(model)`
			`else:`
			`db_name = settings.DATABASES['default']['NAME']`
			`self.do_destroy_and_rebuild_database(db_name)`

			`# maps relationship between realm id and notifications_stream_id`
			`# generally, there should be only one realm per dump, but the code`
			`# doesn't make that assumption`
			`realm_notification_map = dict()`

			`# maping between table name and a total expected number of rows across`
			`# all input json files`
			`row_counter = dict()`

			`for file_name in args:`
			`try:`
			`fp = open(file_name, 'r')`
			`except IOError:`
Apply Python 3 futurize transform libfuturize.fixes.fix_print_with_import. 2015-11-01 17:11:06 +01:00			`print("File not found: '%s'" % (file_name,))`
Added import_dump command to manage.py to import JSON database dumps (imported from commit abd410e5299f15c1a5851e87d18664b59837fbb0) 2015-08-19 05:15:40 +02:00			`exit(1)`

Apply Python 3 futurize transform libfuturize.fixes.fix_print_with_import. 2015-11-01 17:11:06 +01:00			`print("Processing file: %s ..." % (file_name,))`
Added import_dump command to manage.py to import JSON database dumps (imported from commit abd410e5299f15c1a5851e87d18664b59837fbb0) 2015-08-19 05:15:40 +02:00
			`# parse the database dump and load in memory`
			`# TODO: change this to a streaming parser to support loads > RAM size`
			`database_dump = json.load(fp, encoding)`

			`for model in models_to_import:`
			`self.increment_row_counter(row_counter, database_dump, model)`
			`self.import_table(database_dump, realm_notification_map, model)`

Apply Python 3 futurize transform libfuturize.fixes.fix_print_with_import. 2015-11-01 17:11:06 +01:00			`print("")`
Added import_dump command to manage.py to import JSON database dumps (imported from commit abd410e5299f15c1a5851e87d18664b59837fbb0) 2015-08-19 05:15:40 +02:00
			`# set notifications_stream_id on realm objects to correct value now`
			`# that foreign keys are in streams table`
			`if len(realm_notification_map):`
Apply Python 3 futurize transform libfuturize.fixes.fix_print_with_import. 2015-11-01 17:11:06 +01:00			`print("Setting realm notification stream...")`
Added import_dump command to manage.py to import JSON database dumps (imported from commit abd410e5299f15c1a5851e87d18664b59837fbb0) 2015-08-19 05:15:40 +02:00			`for id, notifications_stream_id in realm_notification_map.items():`
			`Realm.objects \`
			`.filter(id=id) \`
			`.update(notifications_stream = notifications_stream_id)`

Apply Python 3 futurize transform libfuturize.fixes.fix_print_with_import. 2015-11-01 17:11:06 +01:00			`print("")`
			`print("Testing data import: ")`
Added import_dump command to manage.py to import JSON database dumps (imported from commit abd410e5299f15c1a5851e87d18664b59837fbb0) 2015-08-19 05:15:40 +02:00
			`# test that everything from all json dumps made it into the database`
			`for model in models_to_import:`
			`self.test_table_row_count(row_counter, model)`