From 2215af4b57618e6ebd0e30ffea00d28e9d82c611 Mon Sep 17 00:00:00 2001
From: Tim Abbott <tabbott@zulipchat.com>
Date: Mon, 5 Jun 2017 18:15:17 -0700
Subject: [PATCH] docs: Add a bunch of documentation on Travis CI.

---
 .travis.yml                        |  27 +++++--
 docs/index.rst                     |   3 +-
 docs/testing.md                    |   3 +-
 docs/travis.md                     | 119 +++++++++++++++++++++++++++++
 tools/travis/backend               |   4 +
 tools/travis/production-helper     |   3 +
 tools/travis/setup-backend         |   7 ++
 tools/travis/setup-production      |  12 ++-
 tools/travis/setup-static-analysis |   3 +
 9 files changed, 173 insertions(+), 8 deletions(-)
 create mode 100644 docs/travis.md

diff --git a/.travis.yml b/.travis.yml
index f45a7840fe..2af2213857 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,12 +1,28 @@
+# See https://zulip.readthedocs.io/en/latest/travis.html for
+# high-level documentation on our Travis CI setup.
 dist: trusty
 before_install:
-   - nvm install 0.10
+  # TODO: Check if this can be removed; we probably don't use it.
+  - nvm install 0.10
 install:
   # Disable Travis CI's built-in NVM installation
   - mv ~/.nvm ~/.travis-nvm-disabled
+
+  # Install coveralls, the library for the code coverage reporting tool we use
   - pip install coveralls
+
+  # This is the main setup job for the test suite
   - tools/travis/setup-$TEST_SUITE
+
+  # Clean any virtualenvs that are not in use to avoid our cache
+  # becoming huge.  TODO: Add similar cleanup code for the other caches.
   - tools/clean-venv-cache --travis
+script:
+  # We unset GEM_PATH here as a hack to work around Travis CI having
+  # broken the running of their system puppet with Ruby.  See
+  # https://travis-ci.org/zulip/zulip/jobs/240120991 for an example traceback.
+  - unset GEM_PATH
+  - ./tools/travis/$TEST_SUITE
 cache:
   - apt: false
   - directories:
@@ -22,6 +38,9 @@ env:
     - COVERALLS_REPO_TOKEN=hnXUEBKsORKHc8xIENGs9JjktlTb2HKlG
     - BOTO_CONFIG=/tmp/nowhere
 language: python
+# We run all of our test suites for both Python 2.7 and 3.4, with the
+# exception of static analysis, which is just run once (and checks
+# against both Python versions).
 matrix:
   include:
     - python: "3.4"
@@ -38,16 +57,14 @@ matrix:
       env: TEST_SUITE=backend
     - python: "3.4"
       env: TEST_SUITE=backend
-# command to run tests
-script:
-  - unset GEM_PATH
-  - ./tools/travis/$TEST_SUITE
 sudo: required
 services:
 - docker
 addons:
   artifacts:
     paths:
+      # Casper debugging data (screenshots, etc.) is super useful for
+      # debugging test flakes.
       - $(ls var/casper/* | tr "\n" ":")
       - $(ls /tmp/zulip-test-event-log/* | tr "\n" ":")
   postgresql: "9.3"
diff --git a/docs/index.rst b/docs/index.rst
index 39ed528fff..6c8650f7f2 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -107,9 +107,10 @@ Contents:
 
    testing
    linters
-   testing-with-node
    testing-with-django
+   testing-with-node
    testing-with-casper
+   travis
    manual-testing
 
 .. _subsystem-documentation:
diff --git a/docs/testing.md b/docs/testing.md
index c8e58cf0dd..b3716818a2 100644
--- a/docs/testing.md
+++ b/docs/testing.md
@@ -8,7 +8,8 @@ important components are documented in depth in their own sections:
 - [Django](testing-with-django.html): backend Python tests
 - [Casper](testing-with-casper.html): end-to-end UI tests
 - [Node](testing-with-node.html): unit tests for JS front end code
-- [Linters](linters.html)
+- [Linters](linters.html): Our parallel linter suite
+- [Travis CI details](travis.html): How all of these run in Travis CI
 
 This document covers more general testing issues, such as how to run the
 entire test suite, how to troubleshoot database issues, how to manually
diff --git a/docs/travis.md b/docs/travis.md
new file mode 100644
index 0000000000..a20b4b744b
--- /dev/null
+++ b/docs/travis.md
@@ -0,0 +1,119 @@
+# Travis CI
+
+The Zulip server uses [Travis CI](https://travis-ci.org/) for its
+continuous integration.  This page documents useful tools and tips to
+know about when using Travis CI and debugging issues with it.
+
+## Goals
+
+The overall goal of our Travis CI setup is to avoid regressions and
+minimize the total time spent debugging Zulip.  We do that by trying
+to catch as many future bugs as possible, while minimizing
+both latency and false positives, both of which can waste a lot of
+developer time.  There are a few implications of this overall goal:
+
+* If a test is failing nondeterministically in Travis CI, we consider
+  that to be an urgent problem.
+* If the tests become a lot slower, that is also an urgent problem.
+* Everything we do in CI should also have a way to run it quickly
+  (under 1 minute, preferably under 3 seconds), in order to iterate fast
+  in development. Except when working on the Travis CI configuration
+  itself, a developer should never have to repeatedly wait 10 minutes
+  for a full Travis run to iteratively debug something.
+
+## Configuration
+
+The main Travis configuration file is
+[.travis.yml](https://github.com/zulip/zulip/blob/master/.travis.yml).
+The specific test suites we have are listed in the `matrix` section,
+which has a matrix of Python versions and test suites (`$TEST_SUITE`).
+We've configured it to use a few helper scripts for each job:
+
+* `tools/travis/setup-$TEST_SUITE`: The script that sets up the test
+  environment for that suite (e.g., installing dependencies).
+  * For the backend and frontend suites, this is a thin wrapper around
+    `tools/provision`, aka the development environment provision script.
+  * For the production suite, this is a more complicated process
+    because of all the packages Travis installs.  See the comments in
+    `tools/travis/setup-production` for details.
+* `tools/travis/$TEST_SUITE`: The script that runs the actual test
+  suite.
+
+The main purpose of the distinction between the two is that if the
+`setup-backend` job fails, Travis CI will report it as the suite
+having "Errored" (grey in their emails), whereas if the `backend` job
+fails, it'll be reported as "Failed" (red in their emails).
+Note that Travis CI's web UI seems to make no visual distinction
+between these.
+
+An important detail is that Travis CI will by default hide most phases
+other than the actual test; you can see this easily by looking at the
+line numbers in the Travis CI output.  There are actually a bunch of
+phases (e.g. the project's setup job, downloading caches near the
+beginning, uploading caches at the end, etc.), and if you're debugging
+our configuration, you'll want to look at these closely.
+
+## Useful debugging tips and tools
+
+* For performance issues,
+  [this statistics tool](https://scribu.github.io/travis-stats/#zulip/zulip/master)
+  can give you test runtime history data that can help with
+  determining when a performance issue was introduced and whether it
+  was fixed.  Note you need to click the "Run" button for it to do
+  anything.
+
+* You can [sign up your personal repo for Travis CI][travis-fork] so
+  that every remote branch you push will be tested, which can be
+  helpful when debugging something complicated.
+
+* You can
+  [use the ts tool](https://github.com/zulip/zulip/commit/da731c) to
+  get timing for every line of your Travis scripts (which can help
+  with performance debugging).  Note, however, that `ts` always returns
+  exit code 0, so with that patch, Travis CI will always report success.
+
+[travis-fork]: git-guide.html#step-3-configure-travis-ci-continuous-integration
+
+## Performance optimizations
+
+### Caching
+
+An important element of making Travis CI perform effectively is
+caching the provisioning of a Zulip development environment.  In
+particular, we cache the following across jobs:
+
+* Python virtualenvs
+* node_modules directories
+* Built/downloaded emoji sprite sheets and data
+
+This has a huge impact on the performance of running tests in Travis
+CI; without these caches, the average test time would be several times
+longer.
+
+We have designed these caches carefully (they are also used in
+production and the Zulip development environment) to ensure that each
+is named by a hash of its dependencies, so Zulip should always be
+using the same version of dependencies it would have used had the
+cache not existed.  In practice, bugs are always possible, so be
+mindful of this possibility.
+
+A consequence of this caching is that test jobs for branches which
+modify `package.json`, `requirements/`, and other key dependencies
+will be significantly slower than normal, because they won't get to
+benefit from the cache.
+
+### Uninstalling packages
+
+In the production suite, we run `apt-get upgrade` at some point
+(effectively, because the Zulip installer does).  This carries a huge
+performance cost in Travis CI, because (1) they don't keep their test
+systems up to date and (2) literally everything is installed in their
+build workers (e.g. several copies of Postgres, Java, MySQL, etc.).
+
+In order to make Zulip's tests perform reasonably well, we
+uninstall (or mark with `apt-mark hold`) many of these dependencies
+that are irrelevant to Zulip in
+[`tools/travis/setup-production`][setup-production].
+
+[setup-production]: https://github.com/zulip/zulip/blob/master/tools/travis/setup-production
+
diff --git a/tools/travis/backend b/tools/travis/backend
index 92b4038777..d7843df46f 100755
--- a/tools/travis/backend
+++ b/tools/travis/backend
@@ -1,4 +1,8 @@
 #!/bin/bash
+# This script is very similar to tools/test-all (what one runs
+# locally).  Possibly they should be merged, though it's worth noting
+# that they are intentionally different (basically some slow stuff is not
+# worth running in `test-all`).
 
 source tools/travis/activate-venv
 
diff --git a/tools/travis/production-helper b/tools/travis/production-helper
index f5b39f24d0..01a214ddf2 100755
--- a/tools/travis/production-helper
+++ b/tools/travis/production-helper
@@ -1,4 +1,7 @@
 #!/bin/bash
+# This test installs a Zulip production environment (from the release
+# tarball from setup-production), and then runs some Nagios checks and
+# other tools to verify that everything is working properly.
 set -e
 set -x
 
diff --git a/tools/travis/setup-backend b/tools/travis/setup-backend
index 92dd59e4b5..219b35ec64 100755
--- a/tools/travis/setup-backend
+++ b/tools/travis/setup-backend
@@ -2,6 +2,13 @@
 set -e
 set -x
 
+# This is just a thin wrapper around provision.
 tools/provision --travis
+
+# Create nagios state so that we can test-run the Nagios checks
+# against the run-dev.py server, as a form of end-to-end test
+# (tools/).
+#
+# TODO: Is this actually required?  We don't seem to use it.
 sudo mkdir -p /var/lib/nagios_state
 sudo chown travis /var/lib/nagios_state
diff --git a/tools/travis/setup-production b/tools/travis/setup-production
index 0f11bd1c82..62e35db04f 100755
--- a/tools/travis/setup-production
+++ b/tools/travis/setup-production
@@ -1,9 +1,19 @@
 #!/bin/bash
+# In short, this provisions a Zulip development environment and then
+# builds a Zulip release tarball (the same way we build them for an
+# actual release).  The actual test job will then install that.
+#
+# This script is more complicated than that, basically because Travis
+# CI installs a ton of crap in its build workers, and we need to
+# remove some and reconfigure others to make things run smoothly and
+# quickly.
+#
+# More description in https://zulip.readthedocs.io/en/latest/travis.html.
 set -e
 set -x
 
 # Make /home/travis world-readable so the `zulip` user will be able to
-# read it.
+# read it, since that's where we store our caches.
 sudo chmod a+rX /home/travis
 
 # Uninstall the unnecessary extra versions of postgres that Travis CI
diff --git a/tools/travis/setup-static-analysis b/tools/travis/setup-static-analysis
index e44ebbf39a..32e83586d6 100755
--- a/tools/travis/setup-static-analysis
+++ b/tools/travis/setup-static-analysis
@@ -1,5 +1,8 @@
 #!/bin/bash
 set -e
 set -x
+
+# We only need mypy and the python 3 compatibility checkers in this
+# build, so we just install those directly, skipping provision.
 pip install --no-deps -r requirements/mypy.txt
 pip install --no-deps -r requirements/py3k.txt