#!/usr/bin/env bash
# This test installs a Zulip production environment (from the release
# tarball from setup-production), and then runs some Nagios checks and
# other tools to verify that everything is working properly.
set -e
set -x
ZULIP_PATH=$(mktemp -d)
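# Unpack the release tarball directly into the temp dir, stripping the
# tarball's top-level directory.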
tar -xf zulip-server-test.tar.gz -C "$ZULIP_PATH" --strip-components=1
# Do an apt upgrade to start with an up-to-date machine
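# These dpkg options keep the upgrade noninteractive: take the package
# maintainer's default for changed config files where one exists, and
# otherwise keep the currently installed version.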
APT_OPTIONS=(-o 'Dpkg::Options::=--force-confdef' -o 'Dpkg::Options::=--force-confold')
apt-get update
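# Retry the upgrade once on failure; transient mirror/network errors are
# common in CI.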
if ! apt-get dist-upgrade -y "${APT_OPTIONS[@]}"; then
echo "\`apt-get dist-upgrade\`: Failure occurred while trying to perform distribution upgrade, Retrying..."
apt-get dist-upgrade -y "${APT_OPTIONS[@]}"
fi
# Install Zulip
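# --hostname and --email seed EXTERNAL_HOST and ZULIP_ADMINISTRATOR in the
# generated settings; --self-signed-cert lets the install proceed without a
# real TLS certificate.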
"$ZULIP_PATH"/scripts/setup/install --self-signed-cert --hostname 127.0.0.1 --email circleci@example.com
cat >>/etc/zulip/settings.py <<EOF
# CircleCI-specific overrides for the settings above
AUTHENTICATION_BACKENDS = ('zproject.backends.EmailAuthBackend',)
NOREPLY_EMAIL_ADDRESS = 'noreply@circleci.example.com'
ALLOWED_HOSTS = []
EOF
echo; echo "Now testing that the supervisord jobs are running properly"; echo
sleep 15 # Long enough that every healthy daemon's uptime passes the 0:00:0x range checked below
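# A crash-looping daemon shows up either in a non-RUNNING state, or as a
# freshly restarted process whose uptime is still under ten seconds despite
# the sleep above.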
if supervisorctl status | grep -vq RUNNING || supervisorctl status | sed 's/^.*uptime //' | grep -q 0:00:0; then
set +x
echo
echo "FAILURE: Supervisor output shows daemons are crashing:"
echo
supervisorctl status
echo
echo "DEBUG: printing Zulip server's error log:"
cat /var/log/zulip/errors.log
echo
echo "DEBUG: printing Zulip server's workers log:"
cat /var/log/zulip/workers.log
echo
echo "DEBUG: printing Zulip server's tornado log:"
cat /var/log/zulip/tornado.log
exit 1
fi
# TODO: Ideally this would test actually logging in, but this is a start.
echo; echo "Now testing that the newly installed server's homepage loads"; echo
wget https://localhost -O /tmp/index.html --no-check-certificate -S 2> /tmp/wget-output || true # || true so we see errors.log if this 500s
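# Strip the headers and wget noise that vary between runs (dates, lengths,
# certificate details, progress output) so the diff below is stable.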
grep -vi '\(Vary\|Content-Language\|expires\|issued by\|modified\|saved\|[.][.][.]\|Date\|[-][-]\)' /tmp/wget-output > /tmp/http-headers-processed
# Simplify the diff by replacing the varying Length values with <Length>.
sed -i 's|Length: [0-9]\+\( [(][0-9]\+[.][0-9]K[)]\)\?|Length: <Length>|' /tmp/http-headers-processed
sed -i 's|Length: [0-9]\+\( [(][0-9]\+[.][0-9]K[)]\)\?|Length: <Length>|' ~/success-http-headers.txt
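# e.g. "Length: 12345 (12.1K)" in either file becomes "Length: <Length>".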
if ! diff -ur /tmp/http-headers-processed ~/success-http-headers.txt; then
set +x
echo
echo "FAILURE: The HTTP Headers returned from loading the homepage on the server do not match the contents of tools/ci/success-http-headers.txt. Typically, this means that the server threw a 500 when trying to load the homepage."
echo "Displaying the contents of the server's error log:"
echo
cat /var/log/zulip/errors.log
echo
echo "Displaying the contents of the main server log:"
echo
cat /var/log/zulip/server.log
exit 1
fi
# Check that the RabbitMQ queue workers are all running
echo; echo "Now confirming all the RabbitMQ queue processors are correctly registered!"; echo
# These hacky pipelines extract the sorted lists of queue processors, both running and expected.
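# For example, a status line like (worker name illustrative)
#   zulip-workers:zulip_events_user_activity RUNNING pid 1234, uptime 0:01:23
# is reduced to just "user_activity".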
supervisorctl status | cut -f1 -dR | cut -f2- -d: | grep events | cut -f1 -d" " | cut -f3- -d_ | cut -f1 -d- | sort -u > /tmp/running_queue_processors.txt
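# queue_workers.py lists every queue worker declared in the codebase; the
# "test" queue is excluded since it is not expected to run in production.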
su zulip -c /home/zulip/deployments/current/scripts/lib/queue_workers.py | grep -v ^test$ | sort -u > /tmp/expected_queue_processors.txt
if ! diff /tmp/expected_queue_processors.txt /tmp/running_queue_processors.txt >/dev/null; then
set +x
echo "FAILURE: Runnable queue processors declared in zerver/worker/queue_processors.py "
echo "do not match those in puppet/manifests/zulip/base.pp"
echo "See https://zulip.readthedocs.io/en/latest/subsystems/queuing.html for details."
echo
diff -ur /tmp/expected_queue_processors.txt /tmp/running_queue_processors.txt
exit 1
fi
echo; echo "Now running RabbitMQ consumer Nagios tests"; echo
# First run the check that usually runs in cron and populates the state files
/home/zulip/deployments/current/scripts/nagios/check-rabbitmq-consumers
# Then, compute the list of all Django queue workers to run Nagios checks against
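# (--queue-type=consumer skips workers that do not consume a RabbitMQ queue
# and so register no consumer to check.)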
consumer_list=$(/home/zulip/deployments/current/scripts/lib/queue_workers.py --queue-type=consumer)
for consumer in $consumer_list; do
if ! /usr/lib/nagios/plugins/zulip_app_frontend/check_rabbitmq_consumers "$consumer"; then
set +x
echo
echo "FAILURE: Missing Nagios consumer for $consumer; displaying full consumer output:"
rabbitmqctl list_consumers
supervisorctl status
echo "EVENTS LOGS"
echo
cat /var/log/zulip/events*.log
echo
exit 1
fi
done
# Some of the Nagios tests have been temporarily disabled to work
# around a Travis CI infrastructure issue.
echo; echo "Now running additional Nagios tests"; echo
if ! /usr/lib/nagios/plugins/zulip_app_frontend/check_queue_worker_errors || \
! su zulip -c /usr/lib/nagios/plugins/zulip_postgres_appdb/check_fts_update_log; then # || \
# ! su zulip -c "/usr/lib/nagios/plugins/zulip_app_frontend/check_send_receive_time --site=https://127.0.0.1/api --nagios --insecure"; then
set +x
echo
echo "FAILURE: Nagios checks don't pass:"
echo
echo "DEBUG: printing Zulip server's error log:"
cat /var/log/zulip/errors.log
exit 1
fi
echo "Production installation test successful!"
exit 0