From b6cc21b438b339f4a8494a551144368dbc0e53da Mon Sep 17 00:00:00 2001 From: Greg Price Date: Tue, 3 Oct 2017 16:29:53 -0700 Subject: [PATCH] debug: Add facility to dump tracemalloc snapshots. Originally this used signals, namely SIGRTMIN. But in prod, the signal handler never fired; I debugged fruitlessly for a while, and suspect uwsgi was foiling it in a mysterious way (which is kind of the only way uwsgi does anything.) So, we listen on a socket. Bit more code, and a bit trickier to invoke, but it works. This was developed for the investigation of memory-bloating on chat.zulip.org that led to a331b4f64 "Optimize query_all_subs_by_stream()". For usage instructions, see docstring. --- zerver/lib/debug.py | 86 ++++++++++++++++++++++++++++++++++++++++++-- zerver/middleware.py | 2 ++ zproject/settings.py | 1 + 3 files changed, 87 insertions(+), 2 deletions(-) diff --git a/zerver/lib/debug.py b/zerver/lib/debug.py index c2c9234e4a..babc13f884 100644 --- a/zerver/lib/debug.py +++ b/zerver/lib/debug.py @@ -1,12 +1,21 @@ import code -import traceback +import gc +import logging +import os import signal - +import socket +import threading +import traceback +import tracemalloc from types import FrameType +from django.conf import settings +from django.utils.timezone import now as timezone_now from typing import Optional +logger = logging.getLogger('zulip.debug') + # Interactive debugging code from # http://stackoverflow.com/questions/132058/showing-the-stack-trace-from-a-running-python-application # (that link also points to code for an interactive remote debugger @@ -29,3 +38,76 @@ def interactive_debug(sig: int, frame: FrameType) -> None: def interactive_debug_listen() -> None: signal.signal(signal.SIGUSR1, lambda sig, stack: traceback.print_stack(stack)) signal.signal(signal.SIGUSR2, interactive_debug) + +def tracemalloc_dump(): + # type: () -> None + if not tracemalloc.is_tracing(): + logger.warning("pid {}: tracemalloc off, nothing to dump" + .format(os.getpid())) + return + # Despite our name for it, `timezone_now` always deals in UTC. + basename = "snap.{}.{}".format(os.getpid(), + timezone_now().strftime("%F-%T")) + path = os.path.join(settings.TRACEMALLOC_DUMP_DIR, basename) + os.makedirs(settings.TRACEMALLOC_DUMP_DIR, exist_ok=True) + + gc.collect() + tracemalloc.take_snapshot().dump(path) + + procstat = open('/proc/{}/stat'.format(os.getpid()), 'rb').read().split() + rss_pages = int(procstat[23]) + logger.info("tracemalloc dump: tracing {} MiB ({} MiB peak), using {} MiB; rss {} MiB; dumped {}" + .format(tracemalloc.get_traced_memory()[0] // 1048576, + tracemalloc.get_traced_memory()[1] // 1048576, + tracemalloc.get_tracemalloc_memory() // 1048576, + rss_pages // 256, + basename)) + +def tracemalloc_listen_sock(sock): + # type: (socket.socket) -> None + logger.debug('pid {}: tracemalloc_listen_sock started!'.format(os.getpid())) + while True: + sock.recv(1) + tracemalloc_dump() + +listener_pid = None # Optional[int] + +def tracemalloc_listen(): + # type: () -> None + global listener_pid + if listener_pid == os.getpid(): + # Already set up -- and in this process, not just its parent. + return + logger.debug('pid {}: tracemalloc_listen working...'.format(os.getpid())) + listener_pid = os.getpid() + + sock = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM) + path = "/tmp/tracemalloc.{}".format(os.getpid()) + sock.bind(path) + thread = threading.Thread(target=lambda: tracemalloc_listen_sock(sock), + daemon=True) + thread.start() + logger.debug('pid {}: tracemalloc_listen done: {}'.format( + os.getpid(), path)) + +def maybe_tracemalloc_listen(): + # type: () -> None + '''If tracemalloc tracing enabled, listen for requests to dump a snapshot. + + To trigger once this is listening: + echo | socat -u stdin unix-sendto:/tmp/tracemalloc.$pid + + To enable in the Zulip web server: edit /etc/zulip/uwsgi.ini , + and add e.g. ` PYTHONTRACEMALLOC=5` to the `env=` line. + This function is called in middleware, so the process will + automatically start listening. + + To enable in other contexts: see upstream docs + https://docs.python.org/3/library/tracemalloc . + You may also have to add a call to this function somewhere. + + ''' + if os.environ.get('PYTHONTRACEMALLOC'): + # If the server was started with `tracemalloc` tracing on, then + # listen for a signal to dump `tracemalloc` snapshots. + tracemalloc_listen() diff --git a/zerver/middleware.py b/zerver/middleware.py index 1d6caed89d..2628dbe0c4 100644 --- a/zerver/middleware.py +++ b/zerver/middleware.py @@ -20,6 +20,7 @@ from django.views.csrf import csrf_failure as html_csrf_failure from zerver.lib.bugdown import get_bugdown_requests, get_bugdown_time from zerver.lib.cache import get_remote_cache_requests, get_remote_cache_time +from zerver.lib.debug import maybe_tracemalloc_listen from zerver.lib.exceptions import ErrorCode, JsonableError, RateLimited from zerver.lib.queue import queue_json_publish from zerver.lib.response import json_error, json_response_from_error @@ -233,6 +234,7 @@ class LogRequests(MiddlewareMixin): # method here too def process_request(self, request): # type: (HttpRequest) -> None + maybe_tracemalloc_listen() request._log_data = dict() record_request_start_data(request._log_data) if connection.connection is not None: diff --git a/zproject/settings.py b/zproject/settings.py index 854e847099..1cd609ee8c 100644 --- a/zproject/settings.py +++ b/zproject/settings.py @@ -1188,6 +1188,7 @@ ZULIP_PATHS = [ ("ANALYTICS_LOCK_DIR", "/home/zulip/deployments/analytics-lock-dir"), ("API_KEY_ONLY_WEBHOOK_LOG_PATH", "/var/log/zulip/webhooks_errors.log"), ("SOFT_DEACTIVATION_LOG_PATH", "/var/log/zulip/soft_deactivation.log"), + ("TRACEMALLOC_DUMP_DIR", "/var/log/zulip/tracemalloc"), ] # The Event log basically logs most significant database changes,