mirror of https://github.com/zulip/zulip.git
debug: Add facility to dump tracemalloc snapshots.
Originally this used signals, namely SIGRTMIN. But in prod, the
signal handler never fired; I debugged fruitlessly for a while, and
suspect uwsgi was foiling it in a mysterious way (which is kind of
the only way uwsgi does anything.)
So, we listen on a socket. Bit more code, and a bit trickier to
invoke, but it works.
This was developed for the investigation of memory-bloating on
chat.zulip.org that led to a331b4f64
"Optimize query_all_subs_by_stream()".
For usage instructions, see docstring.
This commit is contained in:
parent
6dd639454e
commit
b6cc21b438
|
@ -1,12 +1,21 @@
|
||||||
|
|
||||||
import code
|
import code
|
||||||
import traceback
|
import gc
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
import signal
|
import signal
|
||||||
|
import socket
|
||||||
|
import threading
|
||||||
|
import traceback
|
||||||
|
import tracemalloc
|
||||||
from types import FrameType
|
from types import FrameType
|
||||||
|
|
||||||
|
from django.conf import settings
|
||||||
|
from django.utils.timezone import now as timezone_now
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
|
logger = logging.getLogger('zulip.debug')
|
||||||
|
|
||||||
# Interactive debugging code from
|
# Interactive debugging code from
|
||||||
# http://stackoverflow.com/questions/132058/showing-the-stack-trace-from-a-running-python-application
|
# http://stackoverflow.com/questions/132058/showing-the-stack-trace-from-a-running-python-application
|
||||||
# (that link also points to code for an interactive remote debugger
|
# (that link also points to code for an interactive remote debugger
|
||||||
|
@ -29,3 +38,76 @@ def interactive_debug(sig: int, frame: FrameType) -> None:
|
||||||
def interactive_debug_listen() -> None:
|
def interactive_debug_listen() -> None:
|
||||||
signal.signal(signal.SIGUSR1, lambda sig, stack: traceback.print_stack(stack))
|
signal.signal(signal.SIGUSR1, lambda sig, stack: traceback.print_stack(stack))
|
||||||
signal.signal(signal.SIGUSR2, interactive_debug)
|
signal.signal(signal.SIGUSR2, interactive_debug)
|
||||||
|
|
||||||
|
def tracemalloc_dump():
|
||||||
|
# type: () -> None
|
||||||
|
if not tracemalloc.is_tracing():
|
||||||
|
logger.warning("pid {}: tracemalloc off, nothing to dump"
|
||||||
|
.format(os.getpid()))
|
||||||
|
return
|
||||||
|
# Despite our name for it, `timezone_now` always deals in UTC.
|
||||||
|
basename = "snap.{}.{}".format(os.getpid(),
|
||||||
|
timezone_now().strftime("%F-%T"))
|
||||||
|
path = os.path.join(settings.TRACEMALLOC_DUMP_DIR, basename)
|
||||||
|
os.makedirs(settings.TRACEMALLOC_DUMP_DIR, exist_ok=True)
|
||||||
|
|
||||||
|
gc.collect()
|
||||||
|
tracemalloc.take_snapshot().dump(path)
|
||||||
|
|
||||||
|
procstat = open('/proc/{}/stat'.format(os.getpid()), 'rb').read().split()
|
||||||
|
rss_pages = int(procstat[23])
|
||||||
|
logger.info("tracemalloc dump: tracing {} MiB ({} MiB peak), using {} MiB; rss {} MiB; dumped {}"
|
||||||
|
.format(tracemalloc.get_traced_memory()[0] // 1048576,
|
||||||
|
tracemalloc.get_traced_memory()[1] // 1048576,
|
||||||
|
tracemalloc.get_tracemalloc_memory() // 1048576,
|
||||||
|
rss_pages // 256,
|
||||||
|
basename))
|
||||||
|
|
||||||
|
def tracemalloc_listen_sock(sock):
|
||||||
|
# type: (socket.socket) -> None
|
||||||
|
logger.debug('pid {}: tracemalloc_listen_sock started!'.format(os.getpid()))
|
||||||
|
while True:
|
||||||
|
sock.recv(1)
|
||||||
|
tracemalloc_dump()
|
||||||
|
|
||||||
|
listener_pid = None # Optional[int]
|
||||||
|
|
||||||
|
def tracemalloc_listen():
|
||||||
|
# type: () -> None
|
||||||
|
global listener_pid
|
||||||
|
if listener_pid == os.getpid():
|
||||||
|
# Already set up -- and in this process, not just its parent.
|
||||||
|
return
|
||||||
|
logger.debug('pid {}: tracemalloc_listen working...'.format(os.getpid()))
|
||||||
|
listener_pid = os.getpid()
|
||||||
|
|
||||||
|
sock = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM)
|
||||||
|
path = "/tmp/tracemalloc.{}".format(os.getpid())
|
||||||
|
sock.bind(path)
|
||||||
|
thread = threading.Thread(target=lambda: tracemalloc_listen_sock(sock),
|
||||||
|
daemon=True)
|
||||||
|
thread.start()
|
||||||
|
logger.debug('pid {}: tracemalloc_listen done: {}'.format(
|
||||||
|
os.getpid(), path))
|
||||||
|
|
||||||
|
def maybe_tracemalloc_listen():
|
||||||
|
# type: () -> None
|
||||||
|
'''If tracemalloc tracing enabled, listen for requests to dump a snapshot.
|
||||||
|
|
||||||
|
To trigger once this is listening:
|
||||||
|
echo | socat -u stdin unix-sendto:/tmp/tracemalloc.$pid
|
||||||
|
|
||||||
|
To enable in the Zulip web server: edit /etc/zulip/uwsgi.ini ,
|
||||||
|
and add e.g. ` PYTHONTRACEMALLOC=5` to the `env=` line.
|
||||||
|
This function is called in middleware, so the process will
|
||||||
|
automatically start listening.
|
||||||
|
|
||||||
|
To enable in other contexts: see upstream docs
|
||||||
|
https://docs.python.org/3/library/tracemalloc .
|
||||||
|
You may also have to add a call to this function somewhere.
|
||||||
|
|
||||||
|
'''
|
||||||
|
if os.environ.get('PYTHONTRACEMALLOC'):
|
||||||
|
# If the server was started with `tracemalloc` tracing on, then
|
||||||
|
# listen for a signal to dump `tracemalloc` snapshots.
|
||||||
|
tracemalloc_listen()
|
||||||
|
|
|
@ -20,6 +20,7 @@ from django.views.csrf import csrf_failure as html_csrf_failure
|
||||||
|
|
||||||
from zerver.lib.bugdown import get_bugdown_requests, get_bugdown_time
|
from zerver.lib.bugdown import get_bugdown_requests, get_bugdown_time
|
||||||
from zerver.lib.cache import get_remote_cache_requests, get_remote_cache_time
|
from zerver.lib.cache import get_remote_cache_requests, get_remote_cache_time
|
||||||
|
from zerver.lib.debug import maybe_tracemalloc_listen
|
||||||
from zerver.lib.exceptions import ErrorCode, JsonableError, RateLimited
|
from zerver.lib.exceptions import ErrorCode, JsonableError, RateLimited
|
||||||
from zerver.lib.queue import queue_json_publish
|
from zerver.lib.queue import queue_json_publish
|
||||||
from zerver.lib.response import json_error, json_response_from_error
|
from zerver.lib.response import json_error, json_response_from_error
|
||||||
|
@ -233,6 +234,7 @@ class LogRequests(MiddlewareMixin):
|
||||||
# method here too
|
# method here too
|
||||||
def process_request(self, request):
|
def process_request(self, request):
|
||||||
# type: (HttpRequest) -> None
|
# type: (HttpRequest) -> None
|
||||||
|
maybe_tracemalloc_listen()
|
||||||
request._log_data = dict()
|
request._log_data = dict()
|
||||||
record_request_start_data(request._log_data)
|
record_request_start_data(request._log_data)
|
||||||
if connection.connection is not None:
|
if connection.connection is not None:
|
||||||
|
|
|
@ -1188,6 +1188,7 @@ ZULIP_PATHS = [
|
||||||
("ANALYTICS_LOCK_DIR", "/home/zulip/deployments/analytics-lock-dir"),
|
("ANALYTICS_LOCK_DIR", "/home/zulip/deployments/analytics-lock-dir"),
|
||||||
("API_KEY_ONLY_WEBHOOK_LOG_PATH", "/var/log/zulip/webhooks_errors.log"),
|
("API_KEY_ONLY_WEBHOOK_LOG_PATH", "/var/log/zulip/webhooks_errors.log"),
|
||||||
("SOFT_DEACTIVATION_LOG_PATH", "/var/log/zulip/soft_deactivation.log"),
|
("SOFT_DEACTIVATION_LOG_PATH", "/var/log/zulip/soft_deactivation.log"),
|
||||||
|
("TRACEMALLOC_DUMP_DIR", "/var/log/zulip/tracemalloc"),
|
||||||
]
|
]
|
||||||
|
|
||||||
# The Event log basically logs most significant database changes,
|
# The Event log basically logs most significant database changes,
|
||||||
|
|
Loading…
Reference in New Issue