#!/usr/bin/env bash
# This tests a Zulip production environment (installed via
# production-install) with some Nagios checks and other tools to
# verify that everything is working properly.
set -e
set -x

# Append CI-specific overrides to the end of the installed settings file,
# so they take precedence over whatever the installer wrote earlier in it.
cat >>/etc/zulip/settings.py <<EOF
# CircleCI override settings above
AUTHENTICATION_BACKENDS = ( 'zproject.backends.EmailAuthBackend', )
NOREPLY_EMAIL_ADDRESS = 'noreply@circleci.example.com'
ALLOWED_HOSTS = []
EOF
# Record the distribution codename (e.g. "buster") in os_version_codename
# when /etc/os-release is available; otherwise leave the variable untouched.
if [[ -f /etc/os-release ]]; then
    # Source the file inside the command substitution's subshell so that
    # the variables it defines do not leak into this script.
    os_info=$(
        source /etc/os-release
        printf '%s\n' "$VERSION_CODENAME"
    )
    # Only the first line is kept; a failing read is deliberately ignored.
    read -r os_version_codename <<<"$os_info" || true
fi
#######################################
# Compare the processed HTTP headers from the homepage request against an
# expected-headers template, dumping the server logs and exiting on mismatch.
# Globals:
#   success_header_file (read) - path of the template; it is edited in place
#   nginx_version (read)       - text substituted for {nginx_version_string}
# Inputs:
#   /tmp/http-headers-processed - the headers actually received
# Outputs:
#   The diff (if any) and, on mismatch, diagnostics and logs on stdout.
# Returns:
#   0 when the headers match; exits the script with status 1 otherwise.
#######################################
check_header() {
    # Escape the characters that are special in a sed replacement (`\`, `&`)
    # as well as the `|` delimiter used below, so that the version string is
    # always inserted literally.
    local escaped_nginx_version
    escaped_nginx_version="$(printf '%s' "$nginx_version" | sed -e 's/[\\&|]/\\&/g')"

    # Normalize the template: mask the response length and fill in the
    # nginx version placeholder.
    sed -i \
        -e 's|Length: [0-9]\+\( [(][0-9]\+[.][0-9]K[)]\)\?|Length: <Length>|' \
        -e "s|{nginx_version_string}|$escaped_nginx_version|g" \
        "$success_header_file"

    if ! diff -ur /tmp/http-headers-processed "$success_header_file"; then
        set +x
        echo
        echo "FAILURE: The HTTP Headers returned from loading the homepage on the server do not match the contents of tools/ci/success-http-headers.template.txt. Typically, this means that the server threw a 500 when trying to load the homepage."
        echo "Displaying the contents of the server's error log:"
        echo
        # `|| true`: under `set -e` a missing log file must not abort the
        # script before the remaining diagnostics have been printed.
        cat /var/log/zulip/errors.log || true
        echo
        echo "Displaying the contents of the main server log:"
        echo
        cat /var/log/zulip/server.log || true
        exit 1
    fi
}
echo
echo "Now testing that the supervisord jobs are running properly"
echo

# Wait long enough that every job which has stayed up reports an uptime of
# at least 0:00:10. An uptime still matching "0:00:0" after the sleep then
# indicates a job that started only seconds ago (presumably because it is
# crash-looping).
sleep 15
if supervisorctl status | grep -vq RUNNING || supervisorctl status | sed 's/^.*uptime //' | grep -q 0:00:0; then
    set +x
    echo
    echo "FAILURE: Supervisor output shows daemons are crashing:"
    echo
    # The diagnostic commands below are guarded with `|| true`: under
    # `set -e`, a non-zero exit (recent Supervisor releases return one from
    # `status` when a process is not RUNNING; `cat` does for a missing log)
    # would otherwise end the script before the remaining logs are shown.
    supervisorctl status || true
    echo
    echo "DEBUG: printing Zulip server's error log:"
    cat /var/log/zulip/errors.log || true
    echo
    echo "DEBUG: printing Zulip server's workers log:"
    cat /var/log/zulip/workers.log || true
    echo
    echo "DEBUG: printing Zulip server's tornado log:"
    cat /var/log/zulip/tornado.log || true
    exit 1
fi
# TODO: Ideally this would test actually logging in, but this is a start.
echo
echo "Now testing that the newly installed server's homepage loads"
echo

# Fetch the homepage, capturing wget's stderr (which includes the response
# headers because of -S) for comparison against the expected headers.
wget https://localhost -O /tmp/index.html --no-check-certificate -S 2>/tmp/wget-output || true # || true so we see errors.log if this 500s
# Drop the lines that legitimately vary from run to run.
grep -vi '\(Vary\|Content-Language\|expires\|issued by\|modified\|saved\|[.][.][.]\|Date\|[-][-]\)' /tmp/wget-output >/tmp/http-headers-processed

# Third and fourth fields of `nginx -v`; xargs trims the trailing blank
# left behind when the fourth field is absent.
nginx_version="$(nginx -v 2>&1 | awk '{print $3, $4}' | xargs)"

# Simplify the diff by replacing the response length (and its optional
# human-readable size suffix) with <Length>.
sed -i 's|Length: [0-9]\+\( [(][0-9]\+[.][0-9]K[)]\)\?|Length: <Length>|' /tmp/http-headers-processed

# Debian releases are compared against their own expected-headers template.
if [ "$os_version_codename" = "buster" ] || [ "$os_version_codename" = "bullseye" ]; then
    success_header_file="/tmp/success-http-headers.template.debian.txt"
else
    success_header_file="/tmp/success-http-headers.template.txt"
fi
check_header
# Start the RabbitMQ queue worker related section
echo
echo "Now confirming all the RabbitMQ queue processors are correctly registered!"
echo

# These hacky shell scripts just extract the sorted list of queue processors, running and expected
supervisorctl status | cut -f1 -dR | cut -f2- -d: | grep events | cut -f1 -d" " | cut -f3- -d_ | cut -f1 -d- | sort -u >/tmp/running_queue_processors.txt
su zulip -c /home/zulip/deployments/current/scripts/lib/queue_workers.py | sort -u >/tmp/expected_queue_processors.txt
if ! diff /tmp/expected_queue_processors.txt /tmp/running_queue_processors.txt >/dev/null; then
    set +x
    echo "FAILURE: Runnable queue processors declared in zerver/worker/queue_processors.py "
    echo "do not match those in puppet/zulip/manifests/profile/base.pp"
    echo "See https://zulip.readthedocs.io/en/latest/subsystems/queuing.html for details."
    echo
    # diff exits non-zero here by construction (the files differ); the
    # script's failure status comes from the explicit exit below.
    diff -ur /tmp/expected_queue_processors.txt /tmp/running_queue_processors.txt || true
    exit 1
fi
echo
echo "Now running RabbitMQ consumer Nagios tests"
echo

# First run the check that usually runs in cron and populates the state files
/home/zulip/deployments/current/scripts/nagios/check-rabbitmq-consumers
# Then, compute the list of all Django queue workers to run Nagios checks against
consumer_list=$(/home/zulip/deployments/current/scripts/lib/queue_workers.py --queue-type=consumer)
# $consumer_list is intentionally unquoted: it is split on whitespace into
# one consumer name per iteration.
for consumer in $consumer_list; do
    if ! /usr/lib/nagios/plugins/zulip_app_frontend/check_rabbitmq_consumers "$consumer"; then
        set +x
        echo
        echo "FAILURE: Missing Nagios consumer for $consumer; displaying full consumer output:"
        set -x
        # Guard each diagnostic so that, under `set -e`, one failing command
        # does not prevent the others from being shown.
        rabbitmqctl list_consumers || true
        supervisorctl status || true
        tail -n +1 /var/log/zulip/events*.log || true
        exit 1
    fi
done
# Some of the Nagios tests have been temporarily disabled to work
# around a Travis CI infrastructure issue.
echo
echo "Now running additional Nagios tests"
echo
# The check_send_receive_time check is the one currently disabled; it is
# kept below as a comment so it can be re-added to the condition later.
if ! /usr/lib/nagios/plugins/zulip_app_frontend/check_queue_worker_errors \
    || ! su zulip -c /usr/lib/nagios/plugins/zulip_postgresql/check_fts_update_log; then # || \
    #   ! su zulip -c "/usr/lib/nagios/plugins/zulip_app_frontend/check_send_receive_time --site=https://127.0.0.1/api --nagios --insecure"; then
    set +x
    echo
    echo "FAILURE: Nagios checks don't pass:"
    echo
    echo "DEBUG: printing Zulip server's error log:"
    # `|| true` keeps the explicit exit below as the single failure path
    # even when the log file does not exist.
    cat /var/log/zulip/errors.log || true
    exit 1
fi

echo "Production installation test successful!"
exit 0