Add tools for dumping and comparing markdown renderings.

This adds a couple of new tools that can be used to determine whether a particular change in Zulip's backend markdown processor would affect the rendering of historical messages, without a human having to look at the message content. This is a useful way to verify whether a change to our markdown syntax is likely to create problems. [commit message and code tweaked by tabbott]
2016-10-24 19:32:09 +03:00 · 2016-10-24 19:32:09 +03:00 · 93965a8e89
parent 6107c877e8
commit 93965a8e89
4 changed files with 96 additions and 40 deletions
--- a/requirements/common.txt
+++ b/requirements/common.txt
@ -162,3 +162,6 @@ git+https://github.com/lorenzogil/glue@01c00cd33b9b78ea868300c266c16acd59a81bfc#

 # Needed for cloning virtual environments
 git+https://github.com/umairwaheed/virtualenv-clone.git@short-version#egg=virtualenv-clone==0.2.6
+
+# Needed for reading JSON as a stream
+ijson==2.3
--- a/zilencer/management/commands/compare_messages.py
+++ b/zilencer/management/commands/compare_messages.py
@ -0,0 +1,37 @@
+from __future__ import absolute_import
+from __future__ import print_function
+
+from typing import Any
+from six.moves import zip
+
+import ijson
+
+from django.core.management.base import BaseCommand, CommandParser
+
+
+class Command(BaseCommand):
+    """Management command that compares two rendered-message dumps.
+
+    Each dump is read as a JSON array of objects carrying ``id`` and
+    ``content`` keys.  The two arrays are walked in lockstep via
+    ``ijson.items`` and every message whose ``content`` differs between
+    the dumps is reported, followed by the total and changed counts.
+    """
+    help = """
+    Compare rendered messages from two dump files.
+    Usage: python manage.py compare_messages <dump1> <dump2>
+    """
+
+    def add_arguments(self, parser):
+        # type: (CommandParser) -> None
+        """Register the two positional dump-file arguments."""
+        parser.add_argument('dump1', help='First file to compare')
+        parser.add_argument('dump2', help='Second file to compare')
+
+    def handle(self, *args, **options):
+        # type: (*Any, **Any) -> None
+        """Compare the dumps named by ``options['dump1']`` and ``options['dump2']``.
+
+        Writes one line per changed message to stdout, then the totals.
+        Writes 'Inconsistent messages dump' to stderr and stops comparing
+        as soon as the ids at the same position differ or one dump ends
+        before the other.
+        """
+        from itertools import chain, repeat
+
+        total_count = 0
+        changed_count = 0
+        # Unique padding value: a plain zip() stops at the shorter input,
+        # which would silently hide a truncated dump.  Padding both
+        # streams lets us notice when only one of them has ended.  This
+        # relies on the lazy zip imported at the top of the file.
+        end = object()
+        with open(options['dump1'], 'r') as dump1, open(options['dump2'], 'r') as dump2:
+            items1 = chain(ijson.items(dump1, 'item'), repeat(end))
+            items2 = chain(ijson.items(dump2, 'item'), repeat(end))
+            for m1, m2 in zip(items1, items2):
+                if m1 is end and m2 is end:
+                    # Both dumps are exhausted at the same position.
+                    break
+                total_count += 1
+                if m1 is end or m2 is end or m1['id'] != m2['id']:
+                    self.stderr.write('Inconsistent messages dump')
+                    break
+                if m1['content'] != m2['content']:
+                    changed_count += 1
+                    self.stdout.write('Changed message id: {id}'.format(id=m1['id']))
+        self.stdout.write('Total messages: {count}'.format(count=total_count))
+        self.stdout.write('Changed messages: {count}'.format(count=changed_count))
--- a/zilencer/management/commands/render_messages.py
+++ b/zilencer/management/commands/render_messages.py
@ -0,0 +1,56 @@
+from __future__ import absolute_import
+from __future__ import print_function
+
+import os
+import ujson
+from typing import Any, Generator
+
+from django.core.management.base import BaseCommand, CommandParser
+from django.db.models import QuerySet
+
+from zerver.lib.message import render_markdown
+from zerver.models import Message
+
+
+def queryset_iterator(queryset, chunksize=5000):
+    # type: (QuerySet, int) -> Generator
+    """Yield every row of ``queryset`` in ascending ``id`` order, in chunks.
+
+    The queryset is re-ordered by ``id`` and fetched ``chunksize`` rows at
+    a time; after each chunk the queryset is narrowed to rows whose id is
+    greater than the last row yielded.
+    """
+    queryset = queryset.order_by('id')
+    while True:
+        # Fetch the chunk once and stop on an empty result.  This avoids a
+        # separate exists() query per chunk and never reads the last id
+        # from a chunk that turned out to be empty.
+        chunk = list(queryset[:chunksize])
+        if not chunk:
+            break
+        for row in chunk:
+            yield row
+        queryset = queryset.filter(id__gt=chunk[-1].id)
+
+
+class Command(BaseCommand):
+    """Management command that renders a range of messages to a JSON file.
+
+    The output is a JSON array of ``{"id": ..., "content": ...}`` objects,
+    where ``content`` is the result of ``render_markdown`` for the message.
+    """
+    help = """
+    Render messages to a file.
+    Usage: python manage.py render_messages <destination> [--amount=100000] [--latest_id=<id>]
+    """
+
+    def add_arguments(self, parser):
+        # type: (CommandParser) -> None
+        """Register the destination path and the optional range arguments."""
+        parser.add_argument('destination', help='Destination file path')
+        parser.add_argument('--amount', default=100000, help='Number of messages to render')
+        parser.add_argument('--latest_id', default=0, help="Last message id to render")
+
+    def handle(self, *args, **options):
+        # type: (*Any, **Any) -> None
+        """Render messages with ids in (latest - amount, latest] to the destination.
+
+        The range is expressed in message ids, so fewer than ``amount``
+        messages are written when some ids in the range do not exist.
+        The destination's directory is created if it is missing.
+        """
+        dest_dir = os.path.realpath(os.path.dirname(options['destination']))
+        amount = int(options['amount'])
+        # A latest_id of 0 (the default) falls back to the newest message.
+        latest = int(options['latest_id']) or Message.objects.latest('id').id
+        self.stdout.write('Latest message id: {latest}'.format(latest=latest))
+        if not os.path.exists(dest_dir):
+            os.makedirs(dest_dir)
+
+        with open(options['destination'], 'w') as result:
+            result.write('[')
+            messages = Message.objects.filter(id__gt=latest - amount, id__lte=latest).order_by('id')
+            first = True
+            for message in queryset_iterator(messages):
+                # Write the separator before every element except the
+                # first.  Keying the comma off "message.id != latest"
+                # leaves a trailing comma (invalid JSON) whenever no
+                # message with id == latest is rendered.
+                if not first:
+                    result.write(',')
+                first = False
+                result.write(ujson.dumps({
+                    'id': message.id,
+                    'content': render_markdown(message, message.content)
+                }))
+            result.write(']')
--- a/zilencer/management/commands/render_old_messages.py
+++ b/zilencer/management/commands/render_old_messages.py
@ -1,40 +0,0 @@
-from __future__ import absolute_import
-from __future__ import print_function
-
-from typing import Any
-
-from django.core.management.base import BaseCommand
-
-import zerver.lib.bugdown as bugdown
-from zerver.lib.message import re_render_content_for_management_command
-from zerver.models import Message
-import datetime
-import sys
-import time
-
-class Command(BaseCommand):
-    help = """Render all historical messages that haven't been rendered yet.
-
-Usage: python manage.py render_old_messages"""
-
-    def handle(self, *args, **options):
-        # type: (*Any, **Any) -> None
-        print('''
-            This command is currently not supported, and it can be somewhat
-            dangerous to run on large instances.  Before upgrading messages
-            to a new version, you should make sure that the old renderings
-            are actually invalid; it could be quite the opposite (we might
-            not want to render V1 messages with V2).
-            ''')
-        sys.exit(1)
-        total_rendered = 0
-        while True:
-            messages = Message.objects.exclude(rendered_content_version=bugdown.version)[0:100]
-            if len(messages) == 0:
-                break
-            for message in messages:
-                re_render_content_for_management_command(message)
-            total_rendered += len(messages)
-            print(datetime.datetime.now(), total_rendered)
-            # Put in some sleep so this can run safely on low resource machines
-            time.sleep(0.25)