db tools: Use common scheme for digests.

We have two different digest schemes to make
sure we keep the database up to date.  There
is the migration digest, which is NOT in the
scope of this commit, and which already
used the mechanism we use for other tools.

Here we are talking about the digest for
important files like `populate_db.py`.

Now our scheme is more consistent with how we
check file changes for other tools (as
well as the aforementioned migration files).

And we only write one hash file, instead of
seven.

And we only write the file when things have
actually changed.

And we are explicit about side effects.

Finally, we include a couple new bot settings
in the digest:

    INTERNAL_BOTS
    DISABLED_REALM_INTERNAL_BOTS

NOTE: This will require a one-time transition,
where we rebuild both databases (dev/test).
It takes a little over two minutes for me,
so it's not super painful.

I bump the provision version here, even
though you don't technically need it (since
the relevant tools are actually using the
digest files to determine if they need to
rebuild the database).  I figure it's just
good to explicitly make this commit trigger
a provision, and the user will then see
the one-time migration of the hash files
with a little bit less of a surprise.

And I do a major bump, not a minor bump,
because when we go in the reverse direction,
the old code will have to rebuild the
database due to the legacy hash files not
being around, so, again, I just prefer it
to be explicit.
This commit is contained in:
Steve Howell 2020-04-22 09:47:03 +00:00 committed by Tim Abbott
parent f4942e9927
commit 02252c255a
2 changed files with 78 additions and 70 deletions

View File

@ -44,4 +44,4 @@ API_FEATURE_LEVEL = 1
# historical commits sharing the same major version, in which case a
# minor version bump suffices.
PROVISION_VERSION = '79.3'
PROVISION_VERSION = '80.0'

View File

@ -1,7 +1,6 @@
import json
import os
import re
import hashlib
import subprocess
import sys
from typing import Any, List, Set
@ -26,7 +25,35 @@ from scripts.lib.zulip_tools import (
)
UUID_VAR_DIR = get_dev_uuid_var_path()
FILENAME_SPLITTER = re.compile(r'[\W\-_]')
# Files whose contents feed into the database digest; a change to any
# of these should trigger a rebuild of the dev/test databases.
IMPORTANT_FILES = [
'zilencer/management/commands/populate_db.py',
'zerver/lib/bulk_create.py',
'zerver/lib/generate_test_data.py',
'zerver/lib/server_initialization.py',
'tools/setup/postgres-init-test-db',
'tools/setup/postgres-init-dev-db',
'zerver/migrations/0258_enable_online_push_notifications_default.py',
]
# Printed when we detect legacy per-file hash directories, to explain
# the one-time forced rebuild to developers.
VERBOSE_MESSAGE_ABOUT_HASH_TRANSITION = '''
NOTE!!!!
We are rebuilding your database for a one-time transition.
We have a hashing scheme that we use to detect whether any
important files used in the construction of the database
have changed.
We are changing that scheme so it only uses one file
instead of a directory of files.
In order to prevent errors due to this transition, we are
doing a one-time rebuild of your database. This should
be the last time this happens (for this particular reason,
at least), unless you go back to older branches.
'''
def migration_paths() -> List[str]:
return [
@ -38,6 +65,7 @@ class Database:
def __init__(self, platform: str, database_name: str, settings: str):
self.database_name = database_name
self.settings = settings
self.digest_name = 'db_files_hash_for_' + platform
self.migration_status_file = 'migration_status_' + platform
self.migration_status_path = os.path.join(
UUID_VAR_DIR,
@ -45,6 +73,17 @@ class Database:
)
self.migration_digest_file = "migrations_hash_" + database_name
def important_settings(self) -> List[str]:
    """Return canonical JSON dumps of the bot-related settings that
    are baked into the database, for inclusion in the digest."""
    setting_names = [
        'INTERNAL_BOTS',
        'REALM_INTERNAL_BOTS',
        'DISABLED_REALM_INTERNAL_BOTS',
    ]
    # Missing settings serialize as an empty dict; sort_keys keeps
    # the serialization deterministic across runs.
    return [
        json.dumps(getattr(settings, name, {}), sort_keys=True)
        for name in setting_names
    ]
def run_db_migrations(self) -> None:
# We shell out to `manage.py` and pass `DJANGO_SETTINGS_MODULE` on
# the command line rather than just calling the migration
@ -106,29 +145,35 @@ class Database:
except OperationalError:
return False
def files_or_settings_have_changed(self) -> bool:
    """Return True if the database must be rebuilt, either because the
    digest of important files/settings is stale or because we found
    legacy hash files from the pre-April-2020 scheme.

    Side effect: any legacy per-file hash directory is deleted.
    """
    # Legacy hash files force a one-time rebuild.  This branch can be
    # deleted when enough time has passed since April 2020 that we're
    # not worried about anomalies doing `git bisect`--probably a few
    # months is sufficient.
    legacy_status_dir = os.path.join(
        UUID_VAR_DIR, self.database_name + '_db_status',
    )
    if os.path.exists(legacy_status_dir):
        print(VERBOSE_MESSAGE_ABOUT_HASH_TRANSITION)
        # Removing the old digest keeps things tidy, prevents false
        # positives when bisecting, and makes this a one-time
        # headache (generally).
        shutil.rmtree(legacy_status_dir)
        return True

    return is_digest_obsolete(
        self.digest_name,
        IMPORTANT_FILES,
        self.important_settings(),
    )
def template_status(self) -> str:
# This function returns a status string specifying the type of
# state the template db is in and thus the kind of action required.
database_name = self.database_name
check_files = [
'zilencer/management/commands/populate_db.py',
'zerver/lib/bulk_create.py',
'zerver/lib/generate_test_data.py',
'zerver/lib/server_initialization.py',
'tools/setup/postgres-init-test-db',
'tools/setup/postgres-init-dev-db',
'zerver/migrations/0258_enable_online_push_notifications_default.py',
]
check_settings = [
'REALM_INTERNAL_BOTS',
]
# Construct a directory to store hashes named after the target database.
status_dir = os.path.join(UUID_VAR_DIR, database_name + '_db_status')
if not os.path.exists(status_dir):
os.mkdir(status_dir)
if not self.database_exists():
# TODO: It's possible that `database_exists` will
# return `False` even though the database
@ -140,14 +185,16 @@ class Database:
# it's better to err on that side, generally.
return 'needs_rebuild'
# To ensure Python evaluates all the hash tests (and thus creates the
# hash files about the current state), we evaluate them in a
# list and then process the result
files_hash_status = all([check_file_hash(fn, status_dir) for fn in check_files])
settings_hash_status = all([check_setting_hash(setting_name, status_dir)
for setting_name in check_settings])
hash_status = files_hash_status and settings_hash_status
if not hash_status:
if self.files_or_settings_have_changed():
# Write the new hash, relying on our callers to
# actually rebuild the db successfully.
# TODO: Move this code to the callers, and write
# the digest only AFTER the rebuild succeeds.
write_new_digest(
self.digest_name,
IMPORTANT_FILES,
self.important_settings(),
)
return 'needs_rebuild'
# Here we hash and compare our migration files before doing
@ -266,45 +313,6 @@ def extract_migrations_as_list(migration_status: str) -> List[str]:
MIGRATIONS_RE = re.compile(r'\[[X| ]\] (\d+_.+)\n')
return MIGRATIONS_RE.findall(migration_status)
def _get_hash_file_path(source_file_path: str, status_dir: str) -> str:
    """Map a tracked source file to the path of its hash file in
    status_dir."""
    # Normalize the basename (e.g. `postgres-init-dev-db`) into a
    # lowercase, underscore-joined filename.
    base = os.path.basename(source_file_path)
    normalized = '_'.join(FILENAME_SPLITTER.split(base)).lower()
    return os.path.join(status_dir, normalized)
def _check_hash(source_hash_file: str, target_content: str) -> bool:
"""
This function has a side effect of creating a new hash file or
updating the old hash file.
"""
target_hash_content = hashlib.sha1(target_content.encode('utf8')).hexdigest()
if not os.path.exists(source_hash_file):
source_hash_content = None
else:
with open(source_hash_file) as f:
source_hash_content = f.read().strip()
with open(source_hash_file, 'w') as f:
f.write(target_hash_content)
return source_hash_content == target_hash_content
def check_file_hash(target_file_path: str, status_dir: str) -> bool:
    """Return True if target_file_path's content hash matches the
    stored hash in status_dir (refreshing the stored hash either way,
    via _check_hash)."""
    with open(target_file_path) as f:
        content = f.read()
    hash_path = _get_hash_file_path(target_file_path, status_dir)
    return _check_hash(hash_path, content)
def check_setting_hash(setting_name: str, status_dir: str) -> bool:
    """Return True if the named Django setting's canonical JSON dump
    matches the stored hash in status_dir (refreshing the stored hash
    either way, via _check_hash)."""
    hash_path = os.path.join(status_dir, '_'.join(['settings', setting_name]))
    # sort_keys keeps the serialization deterministic across runs.
    serialized = json.dumps(getattr(settings, setting_name), sort_keys=True)
    return _check_hash(hash_path, serialized)
def destroy_leaked_test_databases(expiry_time: int = 60 * 60) -> int:
"""The logic in zerver/lib/test_runner.py tries to delete all the
temporary test databases generated by test-backend threads, but it