tornado: Move SIGTERM shutdown handler into a callback.

A SIGTERM can show up at any point in the ioloop, even in places which
are not prepared to handle it.  This results in the process ignoring
the `sys.exit` which the SIGTERM handler calls, with an uncaught
SystemExit exception:

```
2021-11-09 15:37:49.368 ERR  [tornado.application:9803] Uncaught exception
Traceback (most recent call last):
  File "/home/zulip/deployments/2021-11-08-05-10-23/zulip-py3-venv/lib/python3.6/site-packages/tornado/http1connection.py", line 238, in _read_message
    delegate.finish()
  File "/home/zulip/deployments/2021-11-08-05-10-23/zulip-py3-venv/lib/python3.6/site-packages/tornado/httpserver.py", line 314, in finish
    self.delegate.finish()
  File "/home/zulip/deployments/2021-11-08-05-10-23/zulip-py3-venv/lib/python3.6/site-packages/tornado/routing.py", line 251, in finish
    self.delegate.finish()
  File "/home/zulip/deployments/2021-11-08-05-10-23/zulip-py3-venv/lib/python3.6/site-packages/tornado/web.py", line 2097, in finish
    self.execute()
  File "/home/zulip/deployments/2021-11-08-05-10-23/zulip-py3-venv/lib/python3.6/site-packages/tornado/web.py", line 2130, in execute
    **self.path_kwargs)
  File "/home/zulip/deployments/2021-11-08-05-10-23/zulip-py3-venv/lib/python3.6/site-packages/tornado/gen.py", line 307, in wrapper
    yielded = next(result)
  File "/home/zulip/deployments/2021-11-08-05-10-23/zulip-py3-venv/lib/python3.6/site-packages/tornado/web.py", line 1510, in _execute
    result = method(*self.path_args, **self.path_kwargs)
  File "/home/zulip/deployments/2021-11-08-05-10-23/zerver/tornado/handlers.py", line 150, in get
    request = self.convert_tornado_request_to_django_request()
  File "/home/zulip/deployments/2021-11-08-05-10-23/zerver/tornado/handlers.py", line 113, in convert_tornado_request_to_django_request
    request = WSGIRequest(environ)
  File "/home/zulip/deployments/2021-11-08-05-10-23/zulip-py3-venv/lib/python3.6/site-packages/django/core/handlers/wsgi.py", line 66, in __init__
    script_name = get_script_name(environ)
  File "/home/zulip/deployments/2021-11-08-05-10-23/zerver/tornado/event_queue.py", line 611, in <lambda>
    signal.signal(signal.SIGTERM, lambda signum, stack: sys.exit(1))
SystemExit: 1
```

Supervisor then terminates the process with a SIGKILL, which results
in dropping data held in the tornado process, as it does not dump its
queue.

The only command which is safe to run in the signal handler is
`ioloop.add_callback_from_signal`, which schedules the callback to run
during the course of the normal ioloop.  This callback does an
orderly shutdown of the server and the ioloop before exiting.
This commit is contained in:
Alex Vandiver 2021-11-11 18:27:02 -08:00 committed by Tim Abbott
parent 847bf8207f
commit bc5539d871
2 changed files with 16 additions and 4 deletions

View File

@ -104,7 +104,7 @@ class Command(BaseCommand):
from zerver.tornado.ioloop_logging import logging_data from zerver.tornado.ioloop_logging import logging_data
logging_data["port"] = str(port) logging_data["port"] = str(port)
setup_event_queue(port) setup_event_queue(http_server, port)
add_client_gc_hook(missedmessage_hook) add_client_gc_hook(missedmessage_hook)
setup_tornado_rabbitmq() setup_tornado_rabbitmq()

View File

@ -22,6 +22,7 @@ from typing import (
List, List,
Mapping, Mapping,
MutableMapping, MutableMapping,
NoReturn,
Optional, Optional,
Sequence, Sequence,
Set, Set,
@ -603,12 +604,24 @@ def send_restart_events(immediate: bool = False) -> None:
client.add_event(event) client.add_event(event)
def setup_event_queue(port: int) -> None: def handle_sigterm(server: tornado.httpserver.HTTPServer) -> NoReturn:
logging.warning("Got SIGTERM, shutting down...")
server.stop()
tornado.ioloop.IOLoop.instance().stop()
sys.exit(1)
def setup_event_queue(server: tornado.httpserver.HTTPServer, port: int) -> None:
ioloop = tornado.ioloop.IOLoop.instance()
if not settings.TEST_SUITE: if not settings.TEST_SUITE:
load_event_queues(port) load_event_queues(port)
atexit.register(dump_event_queues, port) atexit.register(dump_event_queues, port)
# Make sure we dump event queues even if we exit via signal # Make sure we dump event queues even if we exit via signal
signal.signal(signal.SIGTERM, lambda signum, stack: sys.exit(1)) signal.signal(
signal.SIGTERM,
lambda signum, frame: ioloop.add_callback_from_signal(handle_sigterm, server),
)
add_reload_hook(lambda: dump_event_queues(port)) add_reload_hook(lambda: dump_event_queues(port))
try: try:
@ -617,7 +630,6 @@ def setup_event_queue(port: int) -> None:
pass pass
# Set up event queue garbage collection # Set up event queue garbage collection
ioloop = tornado.ioloop.IOLoop.instance()
pc = tornado.ioloop.PeriodicCallback( pc = tornado.ioloop.PeriodicCallback(
lambda: gc_event_queues(port), EVENT_QUEUE_GC_FREQ_MSECS, ioloop lambda: gc_event_queues(port), EVENT_QUEUE_GC_FREQ_MSECS, ioloop
) )