diff --git a/.circleci/config.yml b/.circleci/config.yml index 62264ebd98..672c1eac10 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -138,6 +138,12 @@ aliases: sudo service rabbitmq-server restart sudo mispipe "/tmp/production-install 2>&1" ts + - &verify_production + run: + name: verify install + command: | + sudo mispipe "/tmp/production-verify 2>&1" ts + - &check_xenial_provision_error run: name: check tools/provision error message on xenial @@ -290,6 +296,7 @@ jobs: - success-http-headers-bionic.txt - success-http-headers-focal.txt - production-install + - production-verify - production - production-extract-tarball - *notify_failure_status @@ -313,6 +320,7 @@ jobs: - *production_extract_tarball - *restore_cache_package_json - *install_production + - *verify_production - *save_cache_package_json - *notify_failure_status @@ -342,6 +350,7 @@ jobs: - *production_extract_tarball - *restore_cache_package_json - *install_production + - *verify_production - *save_cache_package_json - *notify_failure_status diff --git a/tools/ci/production-build b/tools/ci/production-build index d2e66e7209..d33b811433 100755 --- a/tools/ci/production-build +++ b/tools/ci/production-build @@ -35,6 +35,7 @@ cp -a \ tools/ci/success-http-headers-bionic.txt \ tools/ci/success-http-headers-focal.txt \ tools/ci/production-install \ + tools/ci/production-verify \ tools/ci/production-extract-tarball \ \ /tmp/ diff --git a/tools/ci/production-install b/tools/ci/production-install index e7c8101bed..ff6371eef6 100755 --- a/tools/ci/production-install +++ b/tools/ci/production-install @@ -1,7 +1,6 @@ #!/usr/bin/env bash # This test installs a Zulip production environment (from the release -# tarball from production-build), and then runs some Nagios checks and -# other tools to verify that everything is working properly. +# tarball from production-build). set -e set -x @@ -24,103 +23,5 @@ fi # Install Zulip "$ZULIP_PATH"/scripts/setup/install --self-signed-cert --hostname 127.0.0.1 --email circleci@example.com -cat >>/etc/zulip/settings.py < /tmp/wget-output || true # || true so we see errors.log if this 500s -grep -vi '\(Vary\|Content-Language\|expires\|issued by\|modified\|saved\|[.][.][.]\|Date\|[-][-]\)' /tmp/wget-output > /tmp/http-headers-processed - -# Simplify the diff by getting replacing 4-5 digit length numbers with . -sed -i 's|Length: [0-9]\+\( [(][0-9]\+[.][0-9]K[)]\)\?|Length: |' /tmp/http-headers-processed -sed -i 's|Length: [0-9]\+\( [(][0-9]\+[.][0-9]K[)]\)\?|Length: |' /tmp/success-http-headers-"$os_version_codename".txt -if ! diff -ur /tmp/http-headers-processed /tmp/success-http-headers-"$os_version_codename".txt; then - set +x - echo - echo "FAILURE: The HTTP Headers returned from loading the homepage on the server do not match the contents of tools/ci/success-http-headers.txt. Typically, this means that the server threw a 500 when trying to load the homepage." - echo "Displaying the contents of the server's error log:" - echo - cat /var/log/zulip/errors.log - echo - echo "Displaying the contents of the main server log:" - echo - cat /var/log/zulip/server.log - exit 1 -fi - -# Start the RabbitMQ queue worker related section -echo; echo "Now confirming all the RabbitMQ queue processors are correctly registered!"; echo -# These hacky shell scripts just extract the sorted list of queue processors, running and expected -supervisorctl status | cut -f1 -dR | cut -f2- -d: | grep events | cut -f1 -d" " | cut -f3- -d_ | cut -f1 -d- | sort -u > /tmp/running_queue_processors.txt -su zulip -c /home/zulip/deployments/current/scripts/lib/queue_workers.py | grep -v ^test$ | sort -u > /tmp/expected_queue_processors.txt -if ! diff /tmp/expected_queue_processors.txt /tmp/running_queue_processors.txt >/dev/null; then - set +x - echo "FAILURE: Runnable queue processors declared in zerver/worker/queue_processors.py " - echo "do not match those in puppet/manifests/zulip/base.pp" - echo "See https://zulip.readthedocs.io/en/latest/subsystems/queuing.html for details." - echo - diff -ur /tmp/expected_queue_processors.txt /tmp/running_queue_processors.txt - exit 1 -fi - -echo; echo "Now running RabbitMQ consumer Nagios tests"; echo -# First run the check that usually runs in cron and populates the state files -/home/zulip/deployments/current/scripts/nagios/check-rabbitmq-consumers - -# Then, compute the list of all Django queue workers to run Nagios checks against -consumer_list=$(/home/zulip/deployments/current/scripts/lib/queue_workers.py --queue-type=consumer) -for consumer in $consumer_list; do - if ! /usr/lib/nagios/plugins/zulip_app_frontend/check_rabbitmq_consumers "$consumer"; then - set +x - echo - echo "FAILURE: Missing Nagios consumer for $consumer; displaying full consumer output:" - set -x - rabbitmqctl list_consumers - supervisorctl status - tail -n +1 /var/log/zulip/events*.log - exit 1 - fi -done - -# Some of the Nagios tests have been temporarily disabled to work -# around a Travis CI infrastructure issue. -echo; echo "Now running additional Nagios tests"; echo -if ! /usr/lib/nagios/plugins/zulip_app_frontend/check_queue_worker_errors || \ - ! su zulip -c /usr/lib/nagios/plugins/zulip_postgres_appdb/check_fts_update_log; then # || \ -# ! su zulip -c "/usr/lib/nagios/plugins/zulip_app_frontend/check_send_receive_time --site=https://127.0.0.1/api --nagios --insecure"; then - set +x - echo - echo "FAILURE: Nagios checks don't pass:" - echo - echo "DEBUG: printing Zulip server's error log:" - cat /var/log/zulip/errors.log - exit 1 -fi -echo "Production installation test successful!" +echo "Production installation complete!" exit 0 diff --git a/tools/ci/production-verify b/tools/ci/production-verify new file mode 100755 index 0000000000..0625fd65ab --- /dev/null +++ b/tools/ci/production-verify @@ -0,0 +1,112 @@ +#!/usr/bin/env bash +# This tests a Zulip production environment (installed via +# production-install) with some Nagios checks and other tools to +# verify that everything is working properly. +set -e +set -x + +if [ -f /etc/os-release ]; then + os_info="$(. /etc/os-release; printf '%s\n' "$VERSION_CODENAME")" + { read -r os_version_codename || true; } <<< "$os_info" +fi + +cat >>/etc/zulip/settings.py < /tmp/wget-output || true # || true so we see errors.log if this 500s +grep -vi '\(Vary\|Content-Language\|expires\|issued by\|modified\|saved\|[.][.][.]\|Date\|[-][-]\)' /tmp/wget-output > /tmp/http-headers-processed + +# Simplify the diff by getting replacing 4-5 digit length numbers with . +sed -i 's|Length: [0-9]\+\( [(][0-9]\+[.][0-9]K[)]\)\?|Length: |' /tmp/http-headers-processed +sed -i 's|Length: [0-9]\+\( [(][0-9]\+[.][0-9]K[)]\)\?|Length: |' /tmp/success-http-headers-"$os_version_codename".txt +if ! diff -ur /tmp/http-headers-processed /tmp/success-http-headers-"$os_version_codename".txt; then + set +x + echo + echo "FAILURE: The HTTP Headers returned from loading the homepage on the server do not match the contents of tools/ci/success-http-headers.txt. Typically, this means that the server threw a 500 when trying to load the homepage." + echo "Displaying the contents of the server's error log:" + echo + cat /var/log/zulip/errors.log + echo + echo "Displaying the contents of the main server log:" + echo + cat /var/log/zulip/server.log + exit 1 +fi + +# Start the RabbitMQ queue worker related section +echo; echo "Now confirming all the RabbitMQ queue processors are correctly registered!"; echo +# These hacky shell scripts just extract the sorted list of queue processors, running and expected +supervisorctl status | cut -f1 -dR | cut -f2- -d: | grep events | cut -f1 -d" " | cut -f3- -d_ | cut -f1 -d- | sort -u > /tmp/running_queue_processors.txt +su zulip -c /home/zulip/deployments/current/scripts/lib/queue_workers.py | grep -v ^test$ | sort -u > /tmp/expected_queue_processors.txt +if ! diff /tmp/expected_queue_processors.txt /tmp/running_queue_processors.txt >/dev/null; then + set +x + echo "FAILURE: Runnable queue processors declared in zerver/worker/queue_processors.py " + echo "do not match those in puppet/manifests/zulip/base.pp" + echo "See https://zulip.readthedocs.io/en/latest/subsystems/queuing.html for details." + echo + diff -ur /tmp/expected_queue_processors.txt /tmp/running_queue_processors.txt + exit 1 +fi + +echo; echo "Now running RabbitMQ consumer Nagios tests"; echo +# First run the check that usually runs in cron and populates the state files +/home/zulip/deployments/current/scripts/nagios/check-rabbitmq-consumers + +# Then, compute the list of all Django queue workers to run Nagios checks against +consumer_list=$(/home/zulip/deployments/current/scripts/lib/queue_workers.py --queue-type=consumer) +for consumer in $consumer_list; do + if ! /usr/lib/nagios/plugins/zulip_app_frontend/check_rabbitmq_consumers "$consumer"; then + set +x + echo + echo "FAILURE: Missing Nagios consumer for $consumer; displaying full consumer output:" + set -x + rabbitmqctl list_consumers + supervisorctl status + tail -n +1 /var/log/zulip/events*.log + exit 1 + fi +done + +# Some of the Nagios tests have been temporarily disabled to work +# around a Travis CI infrastructure issue. +echo; echo "Now running additional Nagios tests"; echo +if ! /usr/lib/nagios/plugins/zulip_app_frontend/check_queue_worker_errors || \ + ! su zulip -c /usr/lib/nagios/plugins/zulip_postgres_appdb/check_fts_update_log; then # || \ +# ! su zulip -c "/usr/lib/nagios/plugins/zulip_app_frontend/check_send_receive_time --site=https://127.0.0.1/api --nagios --insecure"; then + set +x + echo + echo "FAILURE: Nagios checks don't pass:" + echo + echo "DEBUG: printing Zulip server's error log:" + cat /var/log/zulip/errors.log + exit 1 +fi +echo "Production installation test successful!" +exit 0