db tools: Use common scheme for digests.

We have two different digest schemes to make
sure we keep the database up to date.  There
is the migration digest, which is NOT in the
scope of this commit, and which already
used the mechanism we use for other tools.

Here we are talking about the digest for
important files like `populate_db.py`.

Now our scheme is more consistent with how we
check file changes for other tools (as
well as the aforementioned migration files).

And we only write one hash file, instead of
seven.

And we only write the file when things have
actually changed.

And we are explicit about side effects.

Finally, we include a couple new bot settings
in the digest:

    INTERNAL_BOTS
    DISABLED_REALM_INTERNAL_BOTS

NOTE: This will require a one-time transition,
where we rebuild both databases (dev/test).
It takes a little over two minutes for me,
so it's not super painful.

I bump the provision version here, even
though you don't technically need it (since
the relevant tools are actually using the
digest files to determine if they need to
rebuild the database).  I figure it's just
good to explicitly make this commit trigger
a provision, and the user will then see
the one-time migration of the hash files
with a little bit less of a surprise.

And I do a major bump, not a minor bump,
because when we go in the reverse direction,
the old code will have to rebuild the
database due to the legacy hash files not
being around, so, again, I just prefer it
to be explicit.
This commit is contained in:
Steve Howell 2020-04-22 09:47:03 +00:00 committed by Tim Abbott
parent f4942e9927
commit 02252c255a
2 changed files with 78 additions and 70 deletions

View File

@ -44,4 +44,4 @@ API_FEATURE_LEVEL = 1
# historical commits sharing the same major version, in which case a
# minor version bump suffices.
PROVISION_VERSION = '79.3'
PROVISION_VERSION = '80.0'

View File

@ -1,7 +1,6 @@
import json
import os
import re
import hashlib
import subprocess
import sys
from typing import Any, List, Set
@ -26,7 +25,35 @@ from scripts.lib.zulip_tools import (
)
UUID_VAR_DIR = get_dev_uuid_var_path()
FILENAME_SPLITTER = re.compile(r'[\W\-_]')
# Files whose contents feed into the database digest; a change to any
# of these should trigger a rebuild of the dev/test databases.
IMPORTANT_FILES = [
'zilencer/management/commands/populate_db.py',
'zerver/lib/bulk_create.py',
'zerver/lib/generate_test_data.py',
'zerver/lib/server_initialization.py',
'tools/setup/postgres-init-test-db',
'tools/setup/postgres-init-dev-db',
'zerver/migrations/0258_enable_online_push_notifications_default.py',
]
# Printed when we detect legacy per-file hash directories, to explain
# the one-time forced rebuild to developers.
VERBOSE_MESSAGE_ABOUT_HASH_TRANSITION = '''
NOTE!!!!
We are rebuilding your database for a one-time transition.
We have a hashing scheme that we use to detect whether any
important files used in the construction of the database
have changed.
We are changing that scheme so it only uses one file
instead of a directory of files.
In order to prevent errors due to this transition, we are
doing a one-time rebuild of your database. This should
be the last time this happens (for this particular reason,
at least), unless you go back to older branches.
'''
def migration_paths() -> List[str]:
return [
@ -38,6 +65,7 @@ class Database:
def __init__(self, platform: str, database_name: str, settings: str):
self.database_name = database_name
self.settings = settings
self.digest_name = 'db_files_hash_for_' + platform
self.migration_status_file = 'migration_status_' + platform
self.migration_status_path = os.path.join(
UUID_VAR_DIR,
@ -45,6 +73,17 @@ class Database:
)
self.migration_digest_file = "migrations_hash_" + database_name
def important_settings(self) -> List[str]:
    """Return canonical JSON dumps of the bot-related settings that
    are baked into the database, for inclusion in the digest."""
    setting_names = [
        'INTERNAL_BOTS',
        'REALM_INTERNAL_BOTS',
        'DISABLED_REALM_INTERNAL_BOTS',
    ]
    # Missing settings serialize as an empty dict; sort_keys keeps
    # the serialization deterministic across runs.
    return [
        json.dumps(getattr(settings, name, {}), sort_keys=True)
        for name in setting_names
    ]
def run_db_migrations(self) -> None:
# We shell out to `manage.py` and pass `DJANGO_SETTINGS_MODULE` on
# the command line rather than just calling the migration
@ -106,29 +145,35 @@ class Database:
except OperationalError:
return False
def files_or_settings_have_changed(self) -> bool:
    """Return True if the database must be rebuilt, either because the
    digest of important files/settings is stale or because we found
    legacy hash files from the pre-April-2020 scheme.

    Side effect: any legacy per-file hash directory is deleted.
    """
    # Legacy hash files force a one-time rebuild.  This branch can be
    # deleted when enough time has passed since April 2020 that we're
    # not worried about anomalies doing `git bisect`--probably a few
    # months is sufficient.
    legacy_status_dir = os.path.join(
        UUID_VAR_DIR, self.database_name + '_db_status',
    )
    if os.path.exists(legacy_status_dir):
        print(VERBOSE_MESSAGE_ABOUT_HASH_TRANSITION)
        # Removing the old digest keeps things tidy, prevents false
        # positives when bisecting, and makes this a one-time
        # headache (generally).
        shutil.rmtree(legacy_status_dir)
        return True

    return is_digest_obsolete(
        self.digest_name,
        IMPORTANT_FILES,
        self.important_settings(),
    )
def template_status(self) -> str:
# This function returns a status string specifying the type of
# state the template db is in and thus the kind of action required.
database_name = self.database_name
check_files = [
'zilencer/management/commands/populate_db.py',
'zerver/lib/bulk_create.py',
'zerver/lib/generate_test_data.py',
'zerver/lib/server_initialization.py',
'tools/setup/postgres-init-test-db',
'tools/setup/postgres-init-dev-db',
'zerver/migrations/0258_enable_online_push_notifications_default.py',
]
check_settings = [
'REALM_INTERNAL_BOTS',
]
# Construct a directory to store hashes named after the target database.
status_dir = os.path.join(UUID_VAR_DIR, database_name + '_db_status')
if not os.path.exists(status_dir):
os.mkdir(status_dir)
if not self.database_exists():
# TODO: It's possible that `database_exists` will
# return `False` even though the database
@ -140,14 +185,16 @@ class Database:
# it's better to err on that side, generally.
return 'needs_rebuild'
# To ensure Python evaluates all the hash tests (and thus creates the
# hash files about the current state), we evaluate them in a
# list and then process the result
files_hash_status = all([check_file_hash(fn, status_dir) for fn in check_files])
settings_hash_status = all([check_setting_hash(setting_name, status_dir)
for setting_name in check_settings])
hash_status = files_hash_status and settings_hash_status
if not hash_status:
if self.files_or_settings_have_changed():
# Write the new hash, relying on our callers to
# actually rebuild the db successfully.
# TODO: Move this code to the callers, and write
# the digest only AFTER the rebuild succeeds.
write_new_digest(
self.digest_name,
IMPORTANT_FILES,
self.important_settings(),
)
return 'needs_rebuild'
# Here we hash and compare our migration files before doing
@ -266,45 +313,6 @@ def extract_migrations_as_list(migration_status: str) -> List[str]:
MIGRATIONS_RE = re.compile(r'\[[X| ]\] (\d+_.+)\n')
return MIGRATIONS_RE.findall(migration_status)
def _get_hash_file_path(source_file_path: str, status_dir: str) -> str:
    """Map a tracked source file to the path of its hash file in
    status_dir."""
    # Normalize the basename (e.g. `postgres-init-dev-db`) into a
    # lowercase, underscore-joined filename.
    base = os.path.basename(source_file_path)
    normalized = '_'.join(FILENAME_SPLITTER.split(base)).lower()
    return os.path.join(status_dir, normalized)
def _check_hash(source_hash_file: str, target_content: str) -> bool:
"""
This function has a side effect of creating a new hash file or
updating the old hash file.
"""
target_hash_content = hashlib.sha1(target_content.encode('utf8')).hexdigest()
if not os.path.exists(source_hash_file):
source_hash_content = None
else:
with open(source_hash_file) as f:
source_hash_content = f.read().strip()
with open(source_hash_file, 'w') as f:
f.write(target_hash_content)
return source_hash_content == target_hash_content
def check_file_hash(target_file_path: str, status_dir: str) -> bool:
    """Return True if target_file_path's content hash matches the
    stored hash in status_dir (refreshing the stored hash either way,
    via _check_hash)."""
    with open(target_file_path) as f:
        content = f.read()
    hash_path = _get_hash_file_path(target_file_path, status_dir)
    return _check_hash(hash_path, content)
def check_setting_hash(setting_name: str, status_dir: str) -> bool:
    """Return True if the named Django setting's canonical JSON dump
    matches the stored hash in status_dir (refreshing the stored hash
    either way, via _check_hash)."""
    hash_path = os.path.join(status_dir, '_'.join(['settings', setting_name]))
    # sort_keys keeps the serialization deterministic across runs.
    serialized = json.dumps(getattr(settings, setting_name), sort_keys=True)
    return _check_hash(hash_path, serialized)
def destroy_leaked_test_databases(expiry_time: int = 60 * 60) -> int:
"""The logic in zerver/lib/test_runner.py tries to delete all the
temporary test databases generated by test-backend threads, but it