zulip/zerver/management/commands/export.py

import os
import shutil
import subprocess
import sys
import tempfile
from argparse import ArgumentParser
from typing import Any
from django.conf import settings
from django.core.management.base import CommandError
from zerver.lib.export import do_export_realm, \
    do_write_stats_file_for_realm_export
from zerver.lib.management import ZulipBaseCommand
from zerver.lib.utils import generate_random_token

class Command(ZulipBaseCommand):
    help = """Exports all data from a Zulip realm

    This command exports all significant data from a Zulip realm. The
    result can be imported using the `./manage.py import` command.

    Things that are exported:
    * All user-accessible data in the Zulip database (Messages,
      Streams, UserMessages, RealmEmoji, etc.)
    * Copies of all uploaded files and avatar images along with
      metadata needed to restore them even in the absence of the
      original database.

    Things that are not exported:
    * Confirmation and PreregistrationUser (transient tables)
    * Sessions (everyone will need to log in again post-export)
    * Users' passwords and API keys (users will need to use SSO or
      reset their passwords)
    * Mobile tokens for APNS/GCM (users will need to reconnect their
      mobile devices)
    * ScheduledEmail (not relevant on a new server)
    * RemoteZulipServer (unlikely to be migrated)
    * third_party_api_results cache (this means re-rendering all old
      messages could be expensive)

    Things that will break as a result of the export:
    * Passwords will not be transferred; users will all need to go
      through the password reset flow to obtain a new password (unless
      they intend to only use e.g. Google Auth).
    * Users will need to log out and log back in to the Zulip desktop and
      mobile apps. The apps now all have an option on the login page
      where you can specify which Zulip server to use; your users
      should enter <domain name>.
    * All bots will stop working, since they will be pointing to the
      wrong server URL, and all users' API keys have been rotated as
      part of the migration. To re-enable your integrations, you will
      need to direct them at the new server, which usually means
      updating the URL and the bots' API keys. You can see a list of
      all the bots configured for your realm on the `/#organization`
      page, and use that list to make sure you migrate them all.

    The proper procedure for using this command to export a realm is:
    * Use `./manage.py deactivate_realm` to deactivate the realm, so
      nothing happens in the realm being exported during the export
      process.
    * Use `./manage.py export` to export the realm, producing a data
      tarball.
    * Transfer the tarball to the new server and unpack it.
    * Use `./manage.py import` to import the realm.
    * Use `./manage.py reactivate_realm` to reactivate the realm, so
      users can log in again.
    * Inform the users about the things broken above.

    We recommend testing by exporting without having deactivated the
    realm first, to make sure you have the procedure right and to
    minimize downtime.

    Performance: In one test, the tool exported a realm with hundreds
    of users and ~1M messages of history with --threads=1 in about 3
    hours of serial runtime (goes down to ~50m with --threads=6 on a
    machine with 8 CPUs). Importing that same data set took about 30
    minutes. But this will vary a lot depending on the average number
    of recipients of messages in the realm, hardware, etc."""
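
    # A sketch of a typical end-to-end run, following the procedure in the
    # help text above. The realm string_id ("zulip") and output path are
    # illustrative examples, not defaults of this command:
    #
    #   ./manage.py deactivate_realm -r zulip
    #   ./manage.py export -r zulip --output /tmp/zulip-export
    #   # ... transfer /tmp/zulip-export.tar.gz to the new server ...
    #   ./manage.py import ...  # see that command's --help for its arguments
    #   ./manage.py reactivate_realm -r zulip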

    def add_arguments(self, parser: ArgumentParser) -> None:
        parser.add_argument('--output',
                            dest='output_dir',
                            action="store",
                            default=None,
                            help='Directory to write exported data to.')
        parser.add_argument('--threads',
                            dest='threads',
                            action="store",
                            default=6,
                            help='Threads to use in exporting UserMessage objects in parallel')
        parser.add_argument('--public-only',
                            action="store_true",
                            help='Export only public stream messages and associated attachments')
        parser.add_argument('--upload-to-s3',
                            action="store_true",
                            help="Whether to upload resulting tarball to S3")
        self.add_realm_args(parser, True)

    def handle(self, *args: Any, **options: Any) -> None:
        realm = self.get_realm(options)
        assert realm is not None  # Should be ensured by parser
        output_dir = options["output_dir"]
        if output_dir is None:
            output_dir = tempfile.mkdtemp(prefix="zulip-export-")
        else:
            output_dir = os.path.realpath(os.path.expanduser(output_dir))
        if os.path.exists(output_dir):
            shutil.rmtree(output_dir)
        os.makedirs(output_dir)
        print("Exporting realm %s" % (realm.string_id,))
        num_threads = int(options['threads'])
        if num_threads < 1:
            raise CommandError('You must have at least one thread.')
        do_export_realm(realm, output_dir, threads=num_threads,
                        public_only=options["public_only"])
        print("Finished exporting to %s; tarring" % (output_dir,))
        do_write_stats_file_for_realm_export(output_dir)
        tarball_path = output_dir.rstrip('/') + '.tar.gz'
        os.chdir(os.path.dirname(output_dir))
        subprocess.check_call(["tar", "-czf", tarball_path, os.path.basename(output_dir)])
        print("Tarball written to %s" % (tarball_path,))
        if not options["upload_to_s3"]:
            return

        def percent_callback(complete: Any, total: Any) -> None:
            sys.stdout.write('.')
            sys.stdout.flush()

        if settings.LOCAL_UPLOADS_DIR is not None:
            raise CommandError("S3 backend must be configured to upload to S3")
        print("Uploading export tarball to S3")
        from zerver.lib.upload import S3Connection, get_bucket, Key
        conn = S3Connection(settings.S3_KEY, settings.S3_SECRET_KEY)
        # We use the avatar bucket, because it's world-readable.
        bucket = get_bucket(conn, settings.S3_AVATAR_BUCKET)
        key = Key(bucket)
        # The random token in the key path keeps the world-readable URL
        # from being guessable.
        key.key = os.path.join("exports", generate_random_token(32),
                               os.path.basename(tarball_path))
        key.set_contents_from_filename(tarball_path, cb=percent_callback, num_cb=40)
        public_url = 'https://{bucket}.{host}/{key}'.format(
            host=conn.server_name(),
            bucket=bucket.name,
            key=key.key)
        print("Uploaded to %s" % (public_url,))
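
# When --upload-to-s3 is used, the printed public URL can be fetched on the
# destination server with any HTTP client and unpacked there. A sketch only;
# the actual URL components depend on your S3 configuration:
#
#   wget https://<bucket>.<host>/exports/<token>/<tarball>.tar.gz
#   tar -xzf <tarball>.tar.gz
#   # ...then proceed with `./manage.py import` as sketched above.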