#!/usr/bin/env bash
# This test installs a Zulip production environment (from the release
# tarball produced by setup-production), and then runs some Nagios checks
# and other tools to verify that everything is working properly.
set -e
set -x
ZULIP_PATH=$(mktemp -d)
tar -xf zulip-server-travis.tar.gz -C "$ZULIP_PATH" --strip-components=1
# Do an apt upgrade to start with an up-to-date machine
APT_OPTIONS=(-o 'Dpkg::Options::=--force-confdef' -o 'Dpkg::Options::=--force-confold')
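# (--force-confdef/--force-confold make dpkg keep existing config files
# instead of prompting interactively during the upgrade.)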
apt-get update
# Hold upgrades to packages which are expensive to upgrade due to size
# or computational cost (e.g. initramfs rebuilds) and aren't really
# used by Zulip in production.
apt-mark hold initramfs-tools initramfs-tools-bin udev base-files linux-firmware chromium-browser google-chrome-stable g++-4.8 gcc-4.8 cpp-4.8 linux-image-generic-lts-xenial
# Also hold tons more packages that aren't individually expensive to
# upgrade, but of which there are a lot.  This is super ugly, but since
# Travis CI's machines never update, it lets us avoid years' worth of
# package upgrades (~80s of install time).
apt-mark hold accountsservice apparmor apport apt apt-transport-https apt-utils bash bash-completion bind9-host binutils binutils-doc bsdutils bzr cloud-guest-utils cloud-init coreutils cpio dbus dnsutils dosfstools dpkg dpkg-dev e2fslibs e2fsprogs eject gcc-4.9-base git-core grub-common grub-pc grub-pc-bin grub2-common icu-devtools ifupdown imagemagick imagemagick-common init-system-helpers initscripts irqbalance isc-dhcp-client isc-dhcp-common klibc-utils krb5-locales krb5-multidev libaccountsservice0 libapparmor-perl libapparmor1 libblkid1 libc-bin libc-dev-bin libc6 libc6-dev libcdt5 libcgmanager0 libcgraph6 libcups2 libcurl3-gnutls libdbus-1-3 libdpkg-perl libdrm-intel1 libdrm-nouveau2 libdrm-radeon1 libdrm2 libevent-2.0-5 libexpat1 libexpat1-dev libgc1c2 libgcc1 libgd3 libgl1-mesa-dri libgl1-mesa-glx libglapi-mesa libgnutls-openssl27 libgraphite2-3 libgraphviz-dev libgssapi-krb5-2 libgssrpc4 libgtk2.0-0 libgtk2.0-common libgvc6 libgvpr2 libicu-dev libjasper-dev libjasper1 libk5crypto3 libkadm5clnt-mit9 libkadm5srv-mit9 libklibc libkrb5-3 libkrb5-dev libkrb5support0 liblcms2-2 liblcms2-dev libmagickwand-dev libmount1 libmysqlclient-dev libnl-3-200 libnl-genl-3-200 libnspr4 libnss3 libnss3-nssdb libnuma1 libpam-modules libpam-modules-bin libpam-runtime libpam-systemd libpam0g libpam0g-dev libpathplan4 libpci3 libpcre3 libpcre3-dev libpcsclite1 libpixman-1-0 libpixman-1-dev libpng12-0 libpng12-dev libpolkit-agent-1-0 libpolkit-backend-1-0 libpolkit-gobject-1-0 libpython3.4 libsndfile1 libss2 libssl-dev libssl1.0.0 libtasn1-6 libtiff5 libtiff5-dev libtiffxx5 libuuid1 libxdot4 libxml2 libxml2-dev libxpm4 linux-libc-dev login lsb-base lshw makedev mongodb-org mongodb-org-mongos mongodb-org-server mongodb-org-shell mongodb-org-tools mount multiarch-support mysql-client-5.7 mysql-client-core-5.7 mysql-common mysql-server-5.7 mysql-server-core-5.7 ntpdate openssh-client openssh-server openssh-sftp-server os-prober overlayroot passwd pciutils perl perl-base pgdg-keyring policykit-1 pollinate postgresql-client postgresql-client-common postgresql-common python-apt python-apt-common python-bzrlib python-urllib3 python3-apport python3-apt python3-distupgrade python3-gdbm python3-problem-report python3-software-properties python3-update-manager python3.4 python3.4-minimal rsync software-properties-common sudo sysv-rc sysvinit-utils tar tcpdump tzdata ubuntu-release-upgrader-core unzip update-manager-core usbutils util-linux uuid-runtime w3m
if ! apt-get dist-upgrade -y "${APT_OPTIONS[@]}"; then
echo "\`apt-get dist-upgrade\`: Failure occurred while trying to perform distribution upgrade, Retrying..."
apt-get dist-upgrade -y "${APT_OPTIONS[@]}"
fi
# rabbitmq doesn't support renaming a running node, and Zulip's installer
# configures a fixed rabbitmq nodename (rather than one derived from the
# hostname, which breaks when the hostname changes), so stop the existing
# rabbitmq node and wipe its data before installing.
service rabbitmq-server stop
rm -rf /var/lib/rabbitmq/mnesia/
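# (Per the commit that introduced the fixed nodename, an existing server can
# opt into the same configuration by adding something like this to
# /etc/zulip/zulip.conf:
#     [rabbitmq]
#     nodename = zulip@localhost
# )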
# Install Zulip
env TRAVIS=1 "$ZULIP_PATH"/scripts/setup/install --self-signed-cert --hostname 127.0.0.1 --email zulip-travis-admin@travis.example.com
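# (These flags let the installer run unattended in CI: a self-signed cert and
# 127.0.0.1 as the hostname, since the Travis machine has no public DNS name.)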
cat >>/etc/zulip/settings.py <<EOF
# Travis CI settings, overriding the defaults above
AUTHENTICATION_BACKENDS = ( 'zproject.backends.EmailAuthBackend', )
NOREPLY_EMAIL_ADDRESS = 'noreply@travis.example.com'
ALLOWED_HOSTS = []
EOF
echo; echo "Now testing that the supervisord jobs are running properly"; echo
sleep 15 # Ensure any healthy supervisord-managed process reaches an uptime of at least 10s, so the 0:00:0X check below won't misfire
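# Fail if any supervisord-managed process is not RUNNING, or if any process
# reports an uptime under 10 seconds (0:00:0X), which suggests it recently
# crashed and was restarted.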
if supervisorctl status | grep -vq RUNNING || supervisorctl status | sed 's/^.*uptime //' | grep -q 0:00:0; then
set +x
echo
echo "FAILURE: Supervisor output shows daemons are crashing:"
echo
supervisorctl status
echo
echo "DEBUG: printing Zulip server's error log:"
cat /var/log/zulip/errors.log
echo
echo "DEBUG: printing Zulip server's workers log:"
cat /var/log/zulip/workers.log
echo
echo "DEBUG: printing Zulip server's tornado log:"
cat /var/log/zulip/tornado.log
exit 1
fi
# TODO: Ideally this would test actually logging in, but this is a start.
echo; echo "Now testing that the newly installed server's homepage loads"; echo
wget https://localhost -O /tmp/index.html --no-check-certificate -S 2> /tmp/wget-output || true # || true so we see errors.log if this 500s
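# Strip wget -S output lines that vary between runs (dates, certificate
# details, download progress) before diffing against the known-good headers.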
grep -vi '\(Vary\|Content-Language\|expires\|issued by\|modified\|saved\|[.][.][.]\|Date\|[-][-]\)' /tmp/wget-output > /tmp/http-headers-processed
# Simplify the diff by replacing 4-5 digit length numbers with <Length>.
sed -i 's|Length: [0-9]\+\( [(][0-9]\+[.][0-9]K[)]\)\?|Length: <Length>|' /tmp/http-headers-processed
sed -i 's|Length: [0-9]\+\( [(][0-9]\+[.][0-9]K[)]\)\?|Length: <Length>|' ~/success-http-headers.txt
if ! diff -ur /tmp/http-headers-processed ~/success-http-headers.txt; then
set +x
echo
echo "FAILURE: The HTTP Headers returned from loading the homepage on the server do not match the contents of tools/ci/success-http-headers.txt. Typically, this means that the server threw a 500 when trying to load the homepage."
echo "Displaying the contents of the server's error log:"
echo
cat /var/log/zulip/errors.log
echo
echo "Displaying the contents of the main server log:"
echo
cat /var/log/zulip/server.log
exit 1
fi
# RabbitMQ queue worker checks
echo; echo "Now confirming all the RabbitMQ queue processors are correctly registered!"; echo
# These hacky shell pipelines extract the sorted lists of queue processors, running and expected.
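# (supervisorctl status lines for workers look roughly like
# "zulip-workers:zulip_events_user_activity  RUNNING  pid 1234, uptime 0:01:23";
# the pipeline below strips everything except the queue name.)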
supervisorctl status | cut -f1 -dR | cut -f2- -d: | grep events | cut -f1 -d" " | cut -f3- -d_ | cut -f1 -d- | sort -u > /tmp/running_queue_processors.txt
su zulip -c /home/zulip/deployments/current/scripts/lib/queue_workers.py | grep -v ^test$ | sort -u > /tmp/expected_queue_processors.txt
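# (The 'test' queue is excluded, since it isn't expected to run as a
# production worker.)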
if ! diff /tmp/expected_queue_processors.txt /tmp/running_queue_processors.txt >/dev/null; then
set +x
echo "FAILURE: Runnable queue processors declared in zerver/worker/queue_processors.py "
echo "do not match those in puppet/manifests/zulip/base.pp"
echo "See https://zulip.readthedocs.io/en/latest/subsystems/queuing.html for details."
echo
diff -ur /tmp/expected_queue_processors.txt /tmp/running_queue_processors.txt
exit 1
fi
echo; echo "Now running RabbitMQ consumer Nagios tests"; echo
# First run the check that usually runs in cron and populates the state files
/home/zulip/deployments/current/scripts/nagios/check-rabbitmq-consumers
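# (That check writes per-queue state files, which the check_rabbitmq_consumers
# Nagios plugin used below reads.)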
# Then, compute the list of all Django queue workers to run Nagios checks against
consumer_list=$(/home/zulip/deployments/current/scripts/lib/queue_workers.py --queue-type=consumer)
for consumer in $consumer_list; do
if ! /usr/lib/nagios/plugins/zulip_app_frontend/check_rabbitmq_consumers "$consumer"; then
set +x
echo
echo "FAILURE: Missing Nagios consumer for $consumer; displaying full consumer output:"
rabbitmqctl list_consumers
supervisorctl status
echo "EVENTS LOGS"
echo
cat /var/log/zulip/events*.log
echo
exit 1
fi
done
# Some of the Nagios tests have been temporarily disabled to work
# around a Travis CI infrastructure issue.
echo; echo "Now running additional Nagios tests"; echo
if ! /usr/lib/nagios/plugins/zulip_app_frontend/check_queue_worker_errors || \
! su zulip -c /usr/lib/nagios/plugins/zulip_postgres_appdb/check_fts_update_log; then # || \
# ! su zulip -c "/usr/lib/nagios/plugins/zulip_app_frontend/check_send_receive_time --site=https://127.0.0.1/api --nagios --insecure"; then
set +x
echo
echo "FAILURE: Nagios checks don't pass:"
echo
echo "DEBUG: printing Zulip server's error log:"
cat /var/log/zulip/errors.log
exit 1
fi
echo "Production installation test successful!"
exit 0