debug: Add facility to dump tracemalloc snapshots.

Originally this used signals, namely SIGRTMIN.  But in prod, the
signal handler never fired; I debugged fruitlessly for a while, and
suspect uwsgi was foiling it in a mysterious way (which is kind of
the only way uwsgi does anything.)

So, we listen on a socket.  Bit more code, and a bit trickier to
invoke, but it works.

This was developed for the investigation of memory-bloating on
chat.zulip.org that led to a331b4f64 "Optimize query_all_subs_by_stream()".

For usage instructions, see docstring.
This commit is contained in:
Greg Price 2017-10-03 16:29:53 -07:00 committed by Tim Abbott
parent 6dd639454e
commit b6cc21b438
3 changed files with 87 additions and 2 deletions

View File

@ -1,12 +1,21 @@
import code
import traceback
import gc
import logging
import os
import signal
import socket
import threading
import traceback
import tracemalloc
from types import FrameType
from django.conf import settings
from django.utils.timezone import now as timezone_now
from typing import Optional
logger = logging.getLogger('zulip.debug')
# Interactive debugging code from
# http://stackoverflow.com/questions/132058/showing-the-stack-trace-from-a-running-python-application
# (that link also points to code for an interactive remote debugger
@ -29,3 +38,76 @@ def interactive_debug(sig: int, frame: FrameType) -> None:
def interactive_debug_listen() -> None:
signal.signal(signal.SIGUSR1, lambda sig, stack: traceback.print_stack(stack))
signal.signal(signal.SIGUSR2, interactive_debug)
def tracemalloc_dump():
# type: () -> None
if not tracemalloc.is_tracing():
logger.warning("pid {}: tracemalloc off, nothing to dump"
.format(os.getpid()))
return
# Despite our name for it, `timezone_now` always deals in UTC.
basename = "snap.{}.{}".format(os.getpid(),
timezone_now().strftime("%F-%T"))
path = os.path.join(settings.TRACEMALLOC_DUMP_DIR, basename)
os.makedirs(settings.TRACEMALLOC_DUMP_DIR, exist_ok=True)
gc.collect()
tracemalloc.take_snapshot().dump(path)
procstat = open('/proc/{}/stat'.format(os.getpid()), 'rb').read().split()
rss_pages = int(procstat[23])
logger.info("tracemalloc dump: tracing {} MiB ({} MiB peak), using {} MiB; rss {} MiB; dumped {}"
.format(tracemalloc.get_traced_memory()[0] // 1048576,
tracemalloc.get_traced_memory()[1] // 1048576,
tracemalloc.get_tracemalloc_memory() // 1048576,
rss_pages // 256,
basename))
def tracemalloc_listen_sock(sock):
# type: (socket.socket) -> None
logger.debug('pid {}: tracemalloc_listen_sock started!'.format(os.getpid()))
while True:
sock.recv(1)
tracemalloc_dump()
listener_pid = None # Optional[int]
def tracemalloc_listen():
# type: () -> None
global listener_pid
if listener_pid == os.getpid():
# Already set up -- and in this process, not just its parent.
return
logger.debug('pid {}: tracemalloc_listen working...'.format(os.getpid()))
listener_pid = os.getpid()
sock = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM)
path = "/tmp/tracemalloc.{}".format(os.getpid())
sock.bind(path)
thread = threading.Thread(target=lambda: tracemalloc_listen_sock(sock),
daemon=True)
thread.start()
logger.debug('pid {}: tracemalloc_listen done: {}'.format(
os.getpid(), path))
def maybe_tracemalloc_listen():
# type: () -> None
'''If tracemalloc tracing enabled, listen for requests to dump a snapshot.
To trigger once this is listening:
echo | socat -u stdin unix-sendto:/tmp/tracemalloc.$pid
To enable in the Zulip web server: edit /etc/zulip/uwsgi.ini ,
and add e.g. ` PYTHONTRACEMALLOC=5` to the `env=` line.
This function is called in middleware, so the process will
automatically start listening.
To enable in other contexts: see upstream docs
https://docs.python.org/3/library/tracemalloc .
You may also have to add a call to this function somewhere.
'''
if os.environ.get('PYTHONTRACEMALLOC'):
# If the server was started with `tracemalloc` tracing on, then
# listen for a signal to dump `tracemalloc` snapshots.
tracemalloc_listen()

View File

@ -20,6 +20,7 @@ from django.views.csrf import csrf_failure as html_csrf_failure
from zerver.lib.bugdown import get_bugdown_requests, get_bugdown_time
from zerver.lib.cache import get_remote_cache_requests, get_remote_cache_time
from zerver.lib.debug import maybe_tracemalloc_listen
from zerver.lib.exceptions import ErrorCode, JsonableError, RateLimited
from zerver.lib.queue import queue_json_publish
from zerver.lib.response import json_error, json_response_from_error
@ -233,6 +234,7 @@ class LogRequests(MiddlewareMixin):
# method here too
def process_request(self, request):
# type: (HttpRequest) -> None
maybe_tracemalloc_listen()
request._log_data = dict()
record_request_start_data(request._log_data)
if connection.connection is not None:

View File

@ -1188,6 +1188,7 @@ ZULIP_PATHS = [
("ANALYTICS_LOCK_DIR", "/home/zulip/deployments/analytics-lock-dir"),
("API_KEY_ONLY_WEBHOOK_LOG_PATH", "/var/log/zulip/webhooks_errors.log"),
("SOFT_DEACTIVATION_LOG_PATH", "/var/log/zulip/soft_deactivation.log"),
("TRACEMALLOC_DUMP_DIR", "/var/log/zulip/tracemalloc"),
]
# The Event log basically logs most significant database changes,