Add tools for dumping and comparing markdown renderings.

This adds a couple of new tools that can be used to determine whether a particular change to Zulip's backend Markdown processor would affect the rendering of historical messages, without a human actually looking at the message content. This is a useful way to verify whether a change to our Markdown syntax is likely to create problems. [commit message and code tweaked by tabbott]
2016-10-24 19:32:09 +03:00 · 2016-10-24 19:32:09 +03:00 · 93965a8e89
parent 6107c877e8
commit 93965a8e89
4 changed files with 96 additions and 40 deletions
--- a/requirements/common.txt
+++ b/requirements/common.txt
@ -162,3 +162,6 @@ git+https://github.com/lorenzogil/glue@01c00cd33b9b78ea868300c266c16acd59a81bfc#
 # Needed for cloning virtual environments
 git+https://github.com/umairwaheed/virtualenv-clone.git@short-version#egg=virtualenv-clone==0.2.6
 # Needed for reading json as stream
 ijson==2.3
--- a/zilencer/management/commands/compare_messages.py
+++ b/zilencer/management/commands/compare_messages.py
@ -0,0 +1,37 @@
 from __future__ import absolute_import
 from __future__ import print_function
 from typing import Any
 from six.moves import zip
 import ijson
 from django.core.management.base import BaseCommand, CommandParser
 class Command(BaseCommand):
     help = """
     Compare two rendered-message dump files.
     Usage: python manage.py compare_messages <dump1> <dump2>
     """
     def add_arguments(self, parser):
         # type: (CommandParser) -> None
         """Register the two positional dump-file paths."""
         parser.add_argument('dump1', help='First file to compare')
         parser.add_argument('dump2', help='Second file to compare')
     def handle(self, *args, **options):
         # type: (*Any, **Any) -> None
         """Walk both dumps in lockstep and report messages whose content differs."""
         # Each dump is read as a top-level JSON array whose elements carry
         # 'id' and 'content' keys.  ijson parses the files incrementally, so
         # neither dump has to fit in memory.
         total_count = 0
         changed_count = 0
         # Unique marker for "this dump has no more items"; a plain zip()
         # would silently stop at the shorter dump and hide the mismatch.
         exhausted = object()
         with open(options['dump1'], 'r') as dump1, open(options['dump2'], 'r') as dump2:
             items1 = ijson.items(dump1, 'item')
             items2 = ijson.items(dump2, 'item')
             while True:
                 m1 = next(items1, exhausted)
                 m2 = next(items2, exhausted)
                 if m1 is exhausted or m2 is exhausted:
                     # Only complain when exactly one side ran out.
                     if m1 is not m2:
                         self.stderr.write('Dumps contain a different number of messages')
                     break
                 total_count += 1
                 # The dumps must list the same messages in the same order;
                 # otherwise a positional comparison is meaningless.
                 if m1['id'] != m2['id']:
                     self.stderr.write('Inconsistent messages dump')
                     break
                 if m1['content'] != m2['content']:
                     changed_count += 1
                     self.stdout.write('Changed message id: {id}'.format(id=m1['id']))
         # The summary is printed even when the comparison stopped early.
         self.stdout.write('Total messages: {count}'.format(count=total_count))
         self.stdout.write('Changed messages: {count}'.format(count=changed_count))
--- a/zilencer/management/commands/render_messages.py
+++ b/zilencer/management/commands/render_messages.py
@ -0,0 +1,56 @@
 from __future__ import absolute_import
 from __future__ import print_function
 import os
 import ujson
 from typing import Any, Generator
 from django.core.management.base import BaseCommand, CommandParser
 from django.db.models import QuerySet
 from zerver.lib.message import render_markdown
 from zerver.models import Message
 def queryset_iterator(queryset, chunksize=5000):
     # type: (QuerySet, int) -> Generator
     """Yield every row of `queryset` in ascending id order, `chunksize` rows per fetch."""
     # Paging is keyed on the id of the last row seen rather than on an
     # offset: after each chunk the queryset is narrowed to ids greater than
     # that row's id.
     queryset = queryset.order_by('id')
     while True:
         # Fetch the chunk once and decide from the result itself whether we
         # are done.  This avoids a separate exists() query per chunk and the
         # window in which rows could disappear between that check and the
         # fetch (which previously left the last-seen id undefined).
         chunk = list(queryset[:chunksize])
         if not chunk:
             break
         for row in chunk:
             yield row
         queryset = queryset.filter(id__gt=chunk[-1].id)
 class Command(BaseCommand):
     help = """
     Render messages to a file.
     Usage: python manage.py render_messages <destination> [--amount=100000] [--latest_id=<id>]
     """
     def add_arguments(self, parser):
         # type: (CommandParser) -> None
         """Register the destination path and the optional range arguments."""
         parser.add_argument('destination', help='Destination file path')
         parser.add_argument('--amount', default=100000, help='Number of messages to render')
         parser.add_argument('--latest_id', default=0, help="Last message id to render")
     def handle(self, *args, **options):
         # type: (*Any, **Any) -> None
         """Write a JSON array of {'id', 'content'} renderings to the destination file."""
         # The rendered range is the ids in (latest - amount, latest].  A
         # --latest_id of 0 (the default) means "use the newest message".
         dest_dir = os.path.realpath(os.path.dirname(options['destination']))
         amount = int(options['amount'])
         latest = int(options['latest_id']) or Message.objects.latest('id').id
         self.stdout.write('Latest message id: {latest}'.format(latest=latest))
         if not os.path.exists(dest_dir):
             os.makedirs(dest_dir)
         with open(options['destination'], 'w') as result:
             result.write('[')
             messages = Message.objects.filter(id__gt=latest - amount, id__lte=latest).order_by('id')
             # Put the separator *before* every element except the first.
             # Keying the comma on "id != latest" produced a trailing comma
             # (invalid JSON) whenever no message with exactly that id was
             # in the result set.
             is_first = True
             for message in queryset_iterator(messages):
                 if not is_first:
                     result.write(',')
                 is_first = False
                 result.write(ujson.dumps({
                     'id': message.id,
                     'content': render_markdown(message, message.content)
                 }))
             result.write(']')
--- a/zilencer/management/commands/render_old_messages.py
+++ b/zilencer/management/commands/render_old_messages.py
@ -1,40 +0,0 @@
 from __future__ import absolute_import
 from __future__ import print_function
 from typing import Any
 from django.core.management.base import BaseCommand
 import zerver.lib.bugdown as bugdown
 from zerver.lib.message import re_render_content_for_management_command
 from zerver.models import Message
 import datetime
 import sys
 import time
 class Command(BaseCommand):
     help = """Render all historical messages that haven't been rendered yet.
 Usage: python manage.py render_old_messages"""
     def handle(self, *args, **options):
         # type: (*Any, **Any) -> None
         # This command is deliberately disabled: it prints the warning below
         # and exits with status 1.  Everything after sys.exit() is therefore
         # unreachable.
         print('''
             This command is currently not supported, and it can be somewhat
             dangerous to run on large instances.  Before upgrading messages
             to a new version, you should make sure that the old renderings
             are actually invalid; it could be quite the opposite (we might
             not want to render V1 messages with V2).
             ''')
         sys.exit(1)
         rendered_so_far = 0
         while True:
             # Take up to 100 messages whose stored rendering version differs
             # from the current bugdown version.
             batch = Message.objects.exclude(rendered_content_version=bugdown.version)[0:100]
             if not batch:
                 break
             for stale_message in batch:
                 re_render_content_for_management_command(stale_message)
             rendered_so_far += len(batch)
             print(datetime.datetime.now(), rendered_so_far)
             # Put in some sleep so this can run safely on low resource machines
             time.sleep(0.25)