error_notify: Remove custom email error reporting handler.

Restore the default django.utils.log.AdminEmailHandler when
ERROR_REPORTING is enabled.  Those with more sophisticated needs can
turn it off and use Sentry or a Sentry-compatible system.

Signed-off-by: Anders Kaseorg <anders@zulip.com>
This commit is contained in:
Anders Kaseorg 2023-06-09 16:01:05 -07:00 committed by Tim Abbott
parent bd2f327a25
commit b285813beb
14 changed files with 5 additions and 580 deletions

View File

@ -85,7 +85,6 @@ zulip-workers:zulip_events_email_mirror RUNNING pid 10
zulip-workers:zulip_events_email_senders RUNNING pid 10769, uptime 19:40:49
zulip-workers:zulip_events_embed_links RUNNING pid 11035, uptime 19:40:46
zulip-workers:zulip_events_embedded_bots RUNNING pid 11139, uptime 19:40:43
zulip-workers:zulip_events_error_reports RUNNING pid 11154, uptime 19:40:40
zulip-workers:zulip_events_invites RUNNING pid 11261, uptime 19:40:36
zulip-workers:zulip_events_missedmessage_emails RUNNING pid 11346, uptime 19:40:21
zulip-workers:zulip_events_missedmessage_mobile_notifications RUNNING pid 11351, uptime 19:40:19

View File

@ -21,11 +21,7 @@ The [Django][django-errors] framework provides much of the
infrastructure needed by our error reporting system:
- The ability to send emails to the server's administrators with any
500 errors, using the `mail_admins` function. We enhance these data
with extra details (like what user was involved in the error) in
`zerver/logging_handlers.py`, and then send them to the
administrator in `zerver/lib/error_notify.py` (which also supports
sending Zulips to a stream about production errors).
500 errors, using `django.utils.log.AdminEmailHandler`.
- The ability to rate-limit certain errors to avoid sending hundreds
of emails in an outage (see `_RateLimitFilter` in
`zerver/lib/logging_util.py`)
@ -270,7 +266,6 @@ a new view:
- The time when the browser was idle again after switching views
(intended to catch issues where we generate a lot of deferred work).
[django-errors]: https://docs.djangoproject.com/en/3.2/howto/error-reporting/
[python-logging]: https://docs.python.org/3/library/logging.html
[django-logging]: https://docs.djangoproject.com/en/3.2/topics/logging/
[sentry]: https://sentry.io

View File

@ -133,7 +133,6 @@ class zulip::app_frontend_base {
'email_mirror',
'embed_links',
'embedded_bots',
'error_reports',
'invites',
'email_senders',
'missedmessage_emails',

View File

@ -403,12 +403,6 @@ define service {
check_command check_rabbitmq_consumers!embedded_bots
}
define service {
use rabbitmq-consumer-service
service_description Check RabbitMQ error_reports consumers
check_command check_rabbitmq_consumers!error_reports
}
define service {
use rabbitmq-consumer-service
service_description Check RabbitMQ invites consumers

View File

@ -15,7 +15,6 @@ normal_queues = [
"email_senders",
"embed_links",
"embedded_bots",
"error_reports",
"invites",
"missedmessage_emails",
"missedmessage_mobile_notifications",

View File

@ -75,7 +75,6 @@ not_yet_fully_covered = [
"zerver/lib/bot_lib.py",
"zerver/lib/camo.py",
"zerver/lib/debug.py",
"zerver/lib/error_notify.py",
"zerver/lib/export.py",
"zerver/lib/fix_unreads.py",
"zerver/lib/import_realm.py",

View File

@ -1,4 +1,3 @@
import re
from typing import Any, Dict, Optional
from django.http import HttpRequest
@ -30,7 +29,3 @@ class ZulipExceptionReporterFilter(SafeExceptionReporterFilter):
if var in filtered_post:
filtered_post[var] = "**********"
return filtered_post
def clean_data_from_query_parameters(val: str) -> str:
return re.sub(r"([a-z_-]+=)([^&]+)([&]|$)", r"\1******\3", val)

View File

@ -1,57 +0,0 @@
# System documented in https://zulip.readthedocs.io/en/latest/subsystems/logging.html
from collections import defaultdict
from typing import Any, Dict
from django.core.mail import mail_admins
from zerver.filters import clean_data_from_query_parameters
def do_report_error(report: Dict[str, Any]) -> None:
report = defaultdict(lambda: None, report)
topic = "{node}: {message}".format(**report).replace("\n", "\\n").replace("\r", "\\r")
logger_str = "Logger {logger_name}, from module {log_module} line {log_lineno}:".format(
**report
)
if report.get("user") and report["user"].get("user_full_name"):
user_info = "{user[user_full_name]} <{user[user_email]}> ({user[user_role]})".format(
**report
)
else:
user_info = "Anonymous user (not logged in)"
user_info += " on {host} deployment".format(**report)
deployment = "Deployed code:\n"
for field, val in report["deployment_data"].items():
deployment += f"- {field}: {val}\n"
if report["has_request"]:
request_repr = """\
Request info:
- path: {path}
- {method}: {data}
""".format(
**report
)
for field in ["REMOTE_ADDR", "QUERY_STRING", "SERVER_NAME"]:
val = report.get(field.lower())
if field == "QUERY_STRING":
val = clean_data_from_query_parameters(str(val))
request_repr += f'- {field}: "{val}"\n'
else:
request_repr = "Request info: none\n"
message = f"""\
{logger_str}
Error generated by {user_info}
{report['stack_trace']}
{deployment}
{request_repr}"""
mail_admins(topic, message, fail_silently=True)

View File

@ -112,11 +112,6 @@ class ReturnTrue(logging.Filter):
return True
class ReturnEnabled(logging.Filter):
def filter(self, record: logging.LogRecord) -> bool:
return settings.LOGGING_ENABLED
class RequireReallyDeployed(logging.Filter):
def filter(self, record: logging.LogRecord) -> bool:
return settings.PRODUCTION

View File

@ -1,21 +1,7 @@
# System documented in https://zulip.readthedocs.io/en/latest/subsystems/logging.html
import logging
import os
import platform
import subprocess
import traceback
from typing import Any, Dict, Optional, Protocol, runtime_checkable
from urllib.parse import SplitResult
from django.conf import settings
from django.http import HttpRequest
from django.utils.translation import override as override_language
from django.views.debug import get_exception_reporter_filter
from sentry_sdk import capture_exception
from version import ZULIP_VERSION
from zerver.lib.logging_util import find_log_caller_module
from zerver.lib.queue import queue_json_publish
from typing import Optional
def try_git_describe() -> Optional[str]:
@ -28,139 +14,3 @@ def try_git_describe() -> Optional[str]:
).strip()
except (FileNotFoundError, subprocess.CalledProcessError): # nocoverage
return None
def add_request_metadata(report: Dict[str, Any], request: HttpRequest) -> None:
report["has_request"] = True
report["path"] = request.path
report["method"] = request.method
report["remote_addr"] = request.META.get("REMOTE_ADDR", None)
report["query_string"] = request.META.get("QUERY_STRING", None)
report["server_name"] = request.META.get("SERVER_NAME", None)
try:
from django.contrib.auth.models import AnonymousUser
user_profile = request.user
if isinstance(user_profile, AnonymousUser):
user_full_name = None
user_email = None
user_role = None
else:
user_full_name = user_profile.full_name
user_email = user_profile.email
with override_language(settings.LANGUAGE_CODE):
# str() to force the lazy-translation to apply now,
# since it won't serialize into the worker queue.
user_role = str(user_profile.get_role_name())
except Exception:
# Unexpected exceptions here should be handled gracefully
traceback.print_exc()
user_full_name = None
user_email = None
user_role = None
report["user"] = {
"user_email": user_email,
"user_full_name": user_full_name,
"user_role": user_role,
}
exception_filter = get_exception_reporter_filter(request)
try:
report["data"] = (
exception_filter.get_post_parameters(request)
if request.method == "POST"
else request.GET
)
except Exception:
# exception_filter.get_post_parameters will throw
# RequestDataTooBig if there's a really big file uploaded
report["data"] = {}
try:
report["host"] = SplitResult("", request.get_host(), "", "", "").hostname
except Exception:
# request.get_host() will throw a DisallowedHost
# exception if the host is invalid
report["host"] = platform.node()
@runtime_checkable
class HasRequest(Protocol):
request: HttpRequest
class AdminNotifyHandler(logging.Handler):
"""An logging handler that sends the log/exception to the queue to be
turned into an email and/or a Zulip message for the server admins.
"""
# adapted in part from django/utils/log.py
def __init__(self) -> None:
logging.Handler.__init__(self)
def emit(self, record: logging.LogRecord) -> None:
report: Dict[str, Any] = {}
try:
report["node"] = platform.node()
report["host"] = platform.node()
report["deployment_data"] = dict(
git=try_git_describe(),
ZULIP_VERSION=ZULIP_VERSION,
)
if record.exc_info:
stack_trace = "".join(traceback.format_exception(*record.exc_info))
message = str(record.exc_info[1])
else:
stack_trace = "No stack trace available"
message = record.getMessage()
if "\n" in message:
# Some exception code paths in queue processors
# seem to result in super-long messages
stack_trace = message
message = message.split("\n")[0]
report["stack_trace"] = stack_trace
report["message"] = message
report["logger_name"] = record.name
report["log_module"] = find_log_caller_module(record)
report["log_lineno"] = record.lineno
if isinstance(record, HasRequest):
add_request_metadata(report, record.request)
except Exception:
report["message"] = "Exception in preparing exception report!"
logging.warning(report["message"], exc_info=True)
report["stack_trace"] = "See /var/log/zulip/errors.log"
capture_exception()
if settings.DEBUG_ERROR_REPORTING: # nocoverage
logging.warning("Reporting an error to admins...")
logging.warning(
"Reporting an error to admins: %s %s %s %s %s",
record.levelname,
report["logger_name"],
report["log_module"],
report["message"],
report["stack_trace"],
)
try:
queue_json_publish(
"error_reports",
dict(
type="server",
report=report,
),
)
except Exception:
# If this breaks, complain loudly but don't pass the traceback up the stream
# However, we *don't* want to use logging.exception since that could trigger a loop.
logging.warning("Reporting an exception triggered an exception!", exc_info=True)
capture_exception()

View File

@ -1,303 +0,0 @@
import logging
import sys
from functools import wraps
from types import TracebackType
from typing import Callable, Dict, Iterator, NoReturn, Optional, Tuple, Type, Union
from unittest import mock
from unittest.mock import MagicMock, patch
from django.conf import settings
from django.contrib.auth.models import AnonymousUser
from django.http import HttpRequest, HttpResponse
from django.utils.log import AdminEmailHandler
from typing_extensions import Concatenate, ParamSpec
from zerver.lib.test_classes import ZulipTestCase
from zerver.lib.test_helpers import mock_queue_publish
from zerver.logging_handlers import AdminNotifyHandler, HasRequest
from zerver.models import UserProfile
ParamT = ParamSpec("ParamT")
captured_request: Optional[HttpRequest] = None
captured_exc_info: Optional[
Union[Tuple[Type[BaseException], BaseException, TracebackType], Tuple[None, None, None]]
] = None
def capture_and_throw(
view_func: Callable[Concatenate[HttpRequest, UserProfile, ParamT], HttpResponse]
) -> Callable[Concatenate[HttpRequest, ParamT], NoReturn]:
@wraps(view_func)
def wrapped_view(
request: HttpRequest,
/,
*args: ParamT.args,
**kwargs: ParamT.kwargs,
) -> NoReturn:
global captured_request
captured_request = request
try:
raise Exception("Request error")
except Exception as e:
global captured_exc_info
captured_exc_info = sys.exc_info()
raise e
return wrapped_view
class AdminNotifyHandlerTest(ZulipTestCase):
logger = logging.getLogger("django")
def setUp(self) -> None:
super().setUp()
self.handler = AdminNotifyHandler()
# Prevent the exceptions we're going to raise from being printed
# You may want to disable this when debugging tests
settings.LOGGING_ENABLED = False
global captured_exc_info
global captured_request
captured_request = None
captured_exc_info = None
def tearDown(self) -> None:
settings.LOGGING_ENABLED = True
super().tearDown()
def get_admin_zulip_handler(self) -> AdminNotifyHandler:
return [h for h in logging.getLogger("").handlers if isinstance(h, AdminNotifyHandler)][0]
@patch("zerver.logging_handlers.try_git_describe")
def test_basic(self, mock_function: MagicMock) -> None:
mock_function.return_value = None
"""A random exception passes happily through AdminNotifyHandler"""
handler = self.get_admin_zulip_handler()
try:
raise Exception("Testing error!")
except Exception:
exc_info = sys.exc_info()
record = self.logger.makeRecord(
"name", logging.ERROR, "function", 16, "message", {}, exc_info
)
handler.emit(record)
def simulate_error(self) -> logging.LogRecord:
self.login("hamlet")
with patch(
"zerver.lib.rest.authenticated_json_view", side_effect=capture_and_throw
) as view_decorator_patch, self.assertLogs(
"django.request", level="ERROR"
) as request_error_log, self.assertLogs(
"zerver.middleware.json_error_handler", level="ERROR"
) as json_error_handler_log, self.settings(
TEST_SUITE=False
):
result = self.client_get("/json/users")
self.assert_json_error(result, "Internal server error", status_code=500)
view_decorator_patch.assert_called_once()
self.assertEqual(
request_error_log.output, ["ERROR:django.request:Internal Server Error: /json/users"]
)
self.assertTrue(
"ERROR:zerver.middleware.json_error_handler:Traceback (most recent call last):"
in json_error_handler_log.output[0]
)
self.assertTrue("Exception: Request error" in json_error_handler_log.output[0])
record = self.logger.makeRecord(
"name",
logging.ERROR,
"function",
15,
"message",
{},
captured_exc_info,
extra={"request": captured_request},
)
return record
def run_handler(self, record: logging.LogRecord) -> Dict[str, object]:
with patch("zerver.worker.queue_processors.do_report_error") as patched_notify:
self.handler.emit(record)
patched_notify.assert_called_once()
return patched_notify.call_args[0][0]
@patch("zerver.logging_handlers.try_git_describe")
def test_long_exception_request(self, mock_function: MagicMock) -> None:
mock_function.return_value = None
"""A request with no stack and multi-line report.getMessage() is handled properly"""
record = self.simulate_error()
record.exc_info = None
record.msg = "message\nmoremessage\nmore"
report = self.run_handler(record)
self.assertIn("user", report)
assert isinstance(report["user"], dict)
self.assertIn("user_email", report["user"])
self.assertIn("user_role", report["user"])
self.assertIn("message", report)
self.assertIn("stack_trace", report)
self.assertEqual(report["stack_trace"], "message\nmoremessage\nmore")
self.assertEqual(report["message"], "message")
@patch("zerver.logging_handlers.try_git_describe")
def test_request(self, mock_function: MagicMock) -> None:
mock_function.return_value = None
"""A normal request is handled properly"""
record = self.simulate_error()
assert isinstance(record, HasRequest)
report = self.run_handler(record)
self.assertIn("user", report)
assert isinstance(report["user"], dict)
self.assertIn("user_email", report["user"])
self.assertIn("user_role", report["user"])
self.assertIn("message", report)
self.assertIn("stack_trace", report)
# Test that `add_request_metadata` throwing an exception is fine
with patch("zerver.logging_handlers.traceback.print_exc"):
with patch(
"zerver.logging_handlers.add_request_metadata",
side_effect=Exception("Unexpected exception!"),
):
report = self.run_handler(record)
self.assertNotIn("user", report)
self.assertIn("message", report)
self.assertEqual(report["stack_trace"], "See /var/log/zulip/errors.log")
# Check anonymous user is handled correctly
record.request.user = AnonymousUser()
report = self.run_handler(record)
self.assertIn("host", report)
self.assertIn("user", report)
assert isinstance(report["user"], dict)
self.assertIn("user_email", report["user"])
self.assertIn("user_role", report["user"])
self.assertIn("message", report)
self.assertIn("stack_trace", report)
# Put it back so we continue to test the non-anonymous case
record.request.user = self.example_user("hamlet")
# Now simulate a DisallowedHost exception
with mock.patch.object(
record.request, "get_host", side_effect=Exception("Get host failure!")
) as m:
report = self.run_handler(record)
self.assertIn("host", report)
self.assertIn("user", report)
assert isinstance(report["user"], dict)
self.assertIn("user_email", report["user"])
self.assertIn("user_role", report["user"])
self.assertIn("message", report)
self.assertIn("stack_trace", report)
m.assert_called_once()
# Test an exception_filter exception
with patch("zerver.logging_handlers.get_exception_reporter_filter", return_value=15):
record.request.method = "POST"
report = self.run_handler(record)
record.request.method = "GET"
self.assertIn("host", report)
self.assertIn("user", report)
assert isinstance(report["user"], dict)
self.assertIn("user_email", report["user"])
self.assertIn("user_role", report["user"])
self.assertIn("message", report)
self.assertIn("stack_trace", report)
# Test the catch-all exception handler doesn't throw
with patch(
"zerver.worker.queue_processors.do_report_error", side_effect=Exception("queue error")
):
self.handler.emit(record)
with mock_queue_publish(
"zerver.logging_handlers.queue_json_publish", side_effect=Exception("queue error")
) as m:
with patch("logging.warning") as log_mock:
self.handler.emit(record)
m.assert_called_once()
log_mock.assert_called_once_with(
"Reporting an exception triggered an exception!", exc_info=True
)
with mock_queue_publish("zerver.logging_handlers.queue_json_publish") as m:
with patch("logging.warning") as log_mock:
self.handler.emit(record)
m.assert_called_once()
log_mock.assert_not_called()
# Test no exc_info
record.exc_info = None
report = self.run_handler(record)
self.assertIn("host", report)
self.assertIn("user", report)
assert isinstance(report["user"], dict)
self.assertIn("user_email", report["user"])
self.assertIn("user_role", report["user"])
self.assertIn("message", report)
self.assertEqual(report["stack_trace"], "No stack trace available")
# Test arbitrary exceptions from request.user
del record.request.user
with patch("zerver.logging_handlers.traceback.print_exc"):
report = self.run_handler(record)
self.assertIn("host", report)
self.assertIn("user", report)
assert isinstance(report["user"], dict)
self.assertIn("user_email", report["user"])
self.assertIn("user_role", report["user"])
self.assertIn("message", report)
self.assertIn("stack_trace", report)
self.assertEqual(report["user"]["user_email"], None)
class LoggingConfigTest(ZulipTestCase):
@staticmethod
def all_loggers() -> Iterator[logging.Logger]:
# There is no documented API for enumerating the loggers; but the
# internals of `logging` haven't changed in ages, so just use them.
for logger in logging.Logger.manager.loggerDict.values():
if not isinstance(logger, logging.Logger):
continue
yield logger
def test_django_emails_disabled(self) -> None:
for logger in self.all_loggers():
# The `handlers` attribute is undocumented, but see comment on
# `all_loggers`.
for handler in logger.handlers:
assert not isinstance(handler, AdminEmailHandler)
class ErrorFiltersTest(ZulipTestCase):
def test_clean_data_from_query_parameters(self) -> None:
from zerver.filters import clean_data_from_query_parameters
self.assertEqual(
clean_data_from_query_parameters("api_key=abcdz&stream=1"),
"api_key=******&stream=******",
)
self.assertEqual(
clean_data_from_query_parameters("api_key=abcdz&stream=foo&topic=bar"),
"api_key=******&stream=******&topic=******",
)
class RateLimitFilterTest(ZulipTestCase):
# This logger has special settings configured in
# test_extra_settings.py.
logger = logging.getLogger("zulip.test_zulip_admins_handler")
def test_recursive_filter_handling(self) -> None:
def mocked_cache_get(key: str) -> int:
self.logger.error(
"Log an error to trigger recursive filter() calls in _RateLimitFilter."
)
raise Exception
with patch("zerver.lib.logging_util.cache.get", side_effect=mocked_cache_get) as m:
self.logger.error("Log an error to trigger initial _RateLimitFilter.filter() call.")
# cache.get should have only been called once, by the original filter() call:
m.assert_called_once()

View File

@ -65,7 +65,6 @@ from zerver.lib.email_mirror import (
)
from zerver.lib.email_mirror import process_message as mirror_email
from zerver.lib.email_notifications import MissedMessageData, handle_missedmessage_emails
from zerver.lib.error_notify import do_report_error
from zerver.lib.exceptions import RateLimitedError
from zerver.lib.export import export_realm_wrapper
from zerver.lib.outgoing_webhook import do_rest_call, get_outgoing_webhook_service_handler
@ -819,24 +818,6 @@ class PushNotificationsWorker(QueueProcessingWorker):
retry_event(self.queue_name, event, failure_processor)
@assign_queue("error_reports")
class ErrorReporter(QueueProcessingWorker):
def consume(self, event: Mapping[str, Any]) -> None:
error_types = ["browser", "server"]
assert event["type"] in error_types
# Drop any old remaining browser-side errors; these now use
# Sentry.
if event["type"] == "browser":
return
if not settings.ERROR_REPORTING:
return
logging.info("Processing traceback for %s", event.get("user_email"))
do_report_error(event["report"])
@assign_queue("digest_emails")
class DigestWorker(QueueProcessingWorker): # nocoverage
# Who gets a digest is entirely determined by the enqueue_digest_emails

View File

@ -692,11 +692,8 @@ if IS_WORKER:
else:
FILE_LOG_PATH = SERVER_LOG_PATH
# This is disabled in a few tests.
LOGGING_ENABLED = True
DEFAULT_ZULIP_HANDLERS = [
*(["zulip_admins"] if ERROR_REPORTING else []),
*(["mail_admins"] if ERROR_REPORTING else []),
"console",
"file",
"errors_file",
@ -744,9 +741,6 @@ LOGGING: Dict[str, Any] = {
"nop": {
"()": "zerver.lib.logging_util.ReturnTrue",
},
"require_logging_enabled": {
"()": "zerver.lib.logging_util.ReturnEnabled",
},
"require_really_deployed": {
"()": "zerver.lib.logging_util.RequireReallyDeployed",
},
@ -760,15 +754,14 @@ LOGGING: Dict[str, Any] = {
},
},
"handlers": {
"zulip_admins": {
"mail_admins": {
"level": "ERROR",
"class": "zerver.logging_handlers.AdminNotifyHandler",
"class": "django.utils.log.AdminEmailHandler",
"filters": (
["ZulipLimiter", "require_debug_false", "require_really_deployed"]
if not DEBUG_ERROR_REPORTING
else []
),
"formatter": "default",
},
"auth_file": {
"level": "DEBUG",
@ -856,7 +849,6 @@ LOGGING: Dict[str, Any] = {
# root logger
"": {
"level": "INFO",
"filters": ["require_logging_enabled"],
"handlers": DEFAULT_ZULIP_HANDLERS,
},
# Django, alphabetized
@ -967,12 +959,6 @@ LOGGING: Dict[str, Any] = {
"handlers": ["file", "errors_file"],
"propagate": False,
},
# This logger is used only for automated tests validating the
# error-handling behavior of the zulip_admins handler.
"zulip.test_zulip_admins_handler": {
"handlers": ["zulip_admins"],
"propagate": False,
},
"zulip.zerver.webhooks": {
"level": "DEBUG",
"handlers": ["file", "errors_file", "webhook_file"],

View File

@ -116,13 +116,6 @@ if not PUPPETEER_TESTS:
"BACKEND": "django.core.cache.backends.locmem.LocMemCache",
}
# This logger is used only for automated tests validating the
# error-handling behavior of the zulip_admins handler.
LOGGING["loggers"]["zulip.test_zulip_admins_handler"] = {
"handlers": ["zulip_admins"],
"propagate": False,
}
# Here we set various loggers to be less noisy for unit tests.
def set_loglevel(logger_name: str, level: str) -> None:
LOGGING["loggers"].setdefault(logger_name, {})["level"] = level