#!/usr/bin/env bash
# This tests a Zulip production environment (installed via
# production-install) with some Nagios checks and other tools to
# verify that everything is working properly.
#
# The script runs a sequence of checks; the first failing check prints
# diagnostics (server logs, supervisor/RabbitMQ state) and exits non-zero.
# It exits 0 only when every check passes.
#
# Files read/written under /tmp:
#   /tmp/wget-output                         raw output of the homepage fetch
#   /tmp/http-headers-processed              the above with volatile lines removed
#   /tmp/success-http-headers.template.txt   expected headers (edited in place)
#   /tmp/running_queue_processors.txt        queue workers seen in supervisor
#   /tmp/expected_queue_processors.txt       queue workers the code declares

set -e
set -x

# NOTE(review): this statement looks truncated in this copy of the file.
# The trailing comment talks about a request that may "500", and the rest
# of the script expects /tmp/wget-output to hold HTTP response headers, so
# presumably a heredoc appended to settings.py and a separate wget call
# produced /tmp/wget-output. Confirm against version control before
# relying on this line; it is preserved verbatim here.
cat >>/etc/zulip/settings.py </tmp/wget-output || true # || true so we see errors.log if this 500s

# Drop (case-insensitively) every line that varies between runs: dates,
# certificate/issuer details, progress dots, "--" separators, etc.
grep -vi '\(Vary\|Content-Language\|expires\|issued by\|modified\|saved\|[.][.][.]\|Date\|[-][-]\)' /tmp/wget-output >/tmp/http-headers-processed

# Fields 3 and 4 of the `nginx -v` banner (stderr is merged into stdout so
# the command substitution can capture it).
nginx_version="$(nginx -v 2>&1 | awk '{print $3, $4}')"

# Simplify the diff by replacing the numeric length (and the optional
# "(N.NK)" human-readable size) after "Length: " with the same fixed text
# in both files, so that differing byte counts never cause a mismatch.
sed -i 's|Length: [0-9]\+\( [(][0-9]\+[.][0-9]K[)]\)\?|Length: |' /tmp/http-headers-processed
# The template additionally gets its {nginx_version_string} placeholder
# filled in with the version detected above.
sed -i \
  -e 's|Length: [0-9]\+\( [(][0-9]\+[.][0-9]K[)]\)\?|Length: |' \
  -e "s|{nginx_version_string}|$nginx_version|g" \
  /tmp/success-http-headers.template.txt

if ! diff -ur /tmp/http-headers-processed /tmp/success-http-headers.template.txt; then
  set +x
  echo
  echo "FAILURE: The HTTP Headers returned from loading the homepage on the server do not match the contents of tools/ci/success-http-headers.template.txt. Typically, this means that the server threw a 500 when trying to load the homepage."
  echo "Displaying the contents of the server's error log:"
  echo
  # Best-effort: a missing log must not make `set -e` abort before the
  # remaining diagnostics are shown.
  cat /var/log/zulip/errors.log || true
  echo
  echo "Displaying the contents of the main server log:"
  echo
  cat /var/log/zulip/server.log || true
  exit 1
fi

# Start the RabbitMQ queue worker related section
echo
echo "Now confirming all the RabbitMQ queue processors are correctly registered!"
echo

# These hacky shell scripts just extract the sorted list of queue processors, running and expected
supervisorctl status \
  | cut -f1 -dR \
  | cut -f2- -d: \
  | grep events \
  | cut -f1 -d" " \
  | cut -f3- -d_ \
  | cut -f1 -d- \
  | sort -u >/tmp/running_queue_processors.txt
su zulip -c /home/zulip/deployments/current/scripts/lib/queue_workers.py | sort -u >/tmp/expected_queue_processors.txt

if ! diff /tmp/expected_queue_processors.txt /tmp/running_queue_processors.txt >/dev/null; then
  set +x
  echo "FAILURE: Runnable queue processors declared in zerver/worker/queue_processors.py "
  echo "do not match those in puppet/zulip/manifests/profile/base.pp"
  echo "See https://zulip.readthedocs.io/en/latest/subsystems/queuing.html for details."
  echo
  # The files are known to differ here, so diff exits non-zero; `|| true`
  # makes the explicit `exit 1` below the thing that ends the script.
  diff -ur /tmp/expected_queue_processors.txt /tmp/running_queue_processors.txt || true
  exit 1
fi

echo
echo "Now running RabbitMQ consumer Nagios tests"
echo

# First run the check that usually runs in cron and populates the state files
/home/zulip/deployments/current/scripts/nagios/check-rabbitmq-consumers

# Then, compute the list of all Django queue workers to run Nagios checks against
consumer_list=$(/home/zulip/deployments/current/scripts/lib/queue_workers.py --queue-type=consumer)
# Word splitting of $consumer_list is intentional: one check per name.
for consumer in $consumer_list; do
  if ! /usr/lib/nagios/plugins/zulip_app_frontend/check_rabbitmq_consumers "$consumer"; then
    set +x
    echo
    echo "FAILURE: Missing Nagios consumer for $consumer; displaying full consumer output:"
    set -x
    # Each diagnostic is best-effort; any of them may exit non-zero in an
    # unhealthy environment, and we still want the ones after it to run.
    rabbitmqctl list_consumers || true
    supervisorctl status || true
    tail -n +1 /var/log/zulip/events*.log || true
    exit 1
  fi
done

# Some of the Nagios tests have been temporarily disabled to work
# around a Travis CI infrastructure issue.
echo
echo "Now running additional Nagios tests"
echo

# Disabled third check, kept for reference:
#   ! su zulip -c "/usr/lib/nagios/plugins/zulip_app_frontend/check_send_receive_time --site=https://127.0.0.1/api --nagios --insecure"
if ! /usr/lib/nagios/plugins/zulip_app_frontend/check_queue_worker_errors \
  || ! su zulip -c /usr/lib/nagios/plugins/zulip_postgres_appdb/check_fts_update_log; then
  set +x
  echo
  echo "FAILURE: Nagios checks don't pass:"
  echo
  echo "DEBUG: printing Zulip server's error log:"
  cat /var/log/zulip/errors.log || true
  exit 1
fi

echo "Production installation test successful!"
exit 0