Optimize incremental virtualenv creation.

This adds a new system for copying packages from old virtualenvs that
are sufficiently similar to the new virtualenv required.

In practice, this results in a huge performance improvement for
re-provisioning Zulip development environments when the requirements
files have changed (which is the dominant performance problem with
provision today).

Fixes: #1507.
This commit is contained in:
Umair Khan 2016-08-11 16:32:17 +05:00 committed by Tim Abbott
parent 27e4ed126f
commit e3078b226a
2 changed files with 146 additions and 5 deletions

View File

@ -156,3 +156,6 @@ polib==1.0.7
# We're using this version from git to ensure compatibility with
# Jinja2==2.8 (see https://github.com/jorgebastida/glue/pull/211).
git+https://github.com/lorenzogil/glue@01c00cd33b9b78ea868300c266c16acd59a81bfc#egg=glue==0.11.1
# Needed for cloning virtual environments
git+https://github.com/umairwaheed/virtualenv-clone.git@short-version#egg=virtualenv-clone==0.2.6

View File

@ -5,6 +5,7 @@ import sys
from os.path import dirname, abspath
import subprocess
from scripts.lib.zulip_tools import run
from scripts.lib.hash_reqs import expand_reqs
ZULIP_PATH = dirname(dirname(dirname(abspath(__file__))))
VENV_CACHE_PATH = "/srv/zulip-venv-cache"
@ -15,7 +16,7 @@ if 'TRAVIS' in os.environ:
if False:
# Don't add a runtime dependency on typing
from typing import List, Optional
from typing import List, Optional, Tuple, Set
VENV_DEPENDENCIES = [
"build-essential",
@ -34,6 +35,136 @@ VENV_DEPENDENCIES = [
"libpq-dev", # Needed by psycopg2
]
def get_index_filename(venv_path):
# type: (str) -> str
return os.path.join(venv_path, 'package_index')
def get_package_names(requirements_file):
# type: (str) -> List[str]
packages = expand_reqs(requirements_file)
cleaned = []
operators = ['~=', '==', '!=', '<', '>']
for package in packages:
for operator in operators:
if operator in package:
package = package.split(operator)[0]
package = package.strip()
if package:
cleaned.append(package.lower())
return sorted(cleaned)
def create_requirements_index_file(venv_path, requirements_file):
# type: (str, str) -> str
"""
Creates a file, called package_index, in the virtual environment
directory that contains all the PIP packages installed in the
virtual environment. This file is used to determine the packages
that can be copied to a new virtual environment.
"""
index_filename = get_index_filename(venv_path)
packages = get_package_names(requirements_file)
with open(index_filename, 'w') as writer:
writer.write('\n'.join(packages))
writer.write('\n')
return index_filename
def get_venv_packages(venv_path):
# type: (str) -> Set[str]
"""
Returns the packages installed in the virtual environment using the
package index file.
"""
with open(get_index_filename(venv_path)) as reader:
return set(p.strip() for p in reader.read().split('\n') if p.strip())
def try_to_copy_venv(venv_path, new_packages):
# type: (str, Set[str]) -> bool
"""
Tries to copy packages from an old virtual environment in the cache
to the new virtual environment. The algorithm works as follows:
1. Find a virtual environment, v, from the cache that has the
highest overlap with the new requirements such that:
a. The new requirements only add to the packages of v.
b. The new requirements only upgrade packages of v.
2. Copy the contents of v to the new virtual environment using
virtualenv-clone.
3. Delete all .pyc files in the new virtual environment.
"""
venv_name = os.path.basename(venv_path)
overlaps = [] # type: List[Tuple[int, str, Set[str]]]
old_packages = set() # type: Set[str]
for sha1sum in os.listdir(VENV_CACHE_PATH):
curr_venv_path = os.path.join(VENV_CACHE_PATH, sha1sum, venv_name)
if (curr_venv_path == venv_path or
not os.path.exists(get_index_filename(curr_venv_path))):
continue
old_packages = get_venv_packages(curr_venv_path)
# We only consider using using old virtualenvs that only
# contain packages that we want in our new virtualenv.
if not (old_packages - new_packages):
overlap = new_packages & old_packages
overlaps.append((len(overlap), curr_venv_path, overlap))
target_log = get_logfile_name(venv_path)
source_venv_path = None
if overlaps:
# Here, we select the old virtualenv with the largest overlap
overlaps = sorted(overlaps)
_, source_venv_path, copied_packages = overlaps[-1]
print('Copying packages from {}'.format(source_venv_path))
clone_ve = "{}/bin/virtualenv-clone".format(source_venv_path)
cmd = "sudo {exe} {source} {target}".format(exe=clone_ve,
source=source_venv_path,
target=venv_path).split()
try:
run(cmd)
except Exception:
# Virtualenv-clone is not installed. Install it and try running
# the command again.
run("{}/bin/pip install --no-deps virtualenv-clone".format(
source_venv_path).split())
run(cmd)
run(["sudo", "chown", "-R",
"{}:{}".format(os.getuid(), os.getgid()), venv_path])
source_log = get_logfile_name(source_venv_path)
copy_parent_log(source_log, target_log)
create_log_entry(target_log, source_venv_path, copied_packages,
new_packages - copied_packages)
return True
return False
def get_logfile_name(venv_path):
# type: (str) -> str
return "{}/setup-venv.log".format(venv_path)
def create_log_entry(target_log, parent, copied_packages, new_packages):
# type: (str, str, Set[str], Set[str]) -> None
venv_path = dirname(target_log)
with open(target_log, 'a') as writer:
writer.write("{}\n".format(venv_path))
if copied_packages:
writer.write(
"Copied from {}:\n".format(parent))
writer.write("\n".join('- {}'.format(p) for p in sorted(copied_packages)))
writer.write("\n")
writer.write("New packages:\n")
writer.write("\n".join('- {}'.format(p) for p in sorted(new_packages)))
writer.write("\n\n")
def copy_parent_log(source_log, target_log):
# type: (str, str) -> None
if os.path.exists(source_log):
run('cp {} {}'.format(source_log, target_log).split())
def do_patch_activate_script(venv_path):
# type: (str) -> None
"""
@ -83,11 +214,18 @@ def do_setup_virtualenv(venv_path, requirements_file, virtualenv_args):
# type: (str, str, List[str]) -> None
# Setup Python virtualenv
run(["sudo", "rm", "-rf", venv_path])
run(["sudo", "mkdir", "-p", venv_path])
run(["sudo", "chown", "{}:{}".format(os.getuid(), os.getgid()), venv_path])
run(["virtualenv"] + virtualenv_args + [venv_path])
new_packages = set(get_package_names(requirements_file))
run(["sudo", "rm", "-rf", venv_path])
if not try_to_copy_venv(venv_path, new_packages):
# Create new virtualenv.
run(["sudo", "mkdir", "-p", venv_path])
run(["sudo", "virtualenv"] + virtualenv_args + [venv_path])
run(["sudo", "chown", "-R",
"{}:{}".format(os.getuid(), os.getgid()), venv_path])
create_log_entry(get_logfile_name(venv_path), "", set(), new_packages)
create_requirements_index_file(venv_path, requirements_file)
# Switch current Python context to the virtualenv.
activate_this = os.path.join(venv_path, "bin", "activate_this.py")
exec(open(activate_this).read(), {}, dict(__file__=activate_this)) # type: ignore # https://github.com/python/mypy/issues/1577