zulip/scripts/log-search

#!/usr/bin/env python3

import argparse
import calendar
import gzip
import logging
import os
import re
import signal
import sys
from datetime import datetime, timedelta
from enum import Enum, auto
from typing import List, Match, Optional, Set, TextIO, Tuple

# The directory two levels above this file; appended to sys.path so that
# the `scripts.*` imports below can be resolved.
ZULIP_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(ZULIP_PATH)

from scripts.lib.setup_path import setup_path

# NOTE(review): presumably prepares the import path for the project's
# dependencies (needed before importing django below) -- confirm against
# scripts/lib/setup_path.
setup_path()

# Must be set before `django.conf.settings` is first accessed (print_line
# reads settings.EXTERNAL_HOST).
os.environ["DJANGO_SETTINGS_MODULE"] = "zproject.settings"

from typing import Protocol

from django.conf import settings

from scripts.lib.zulip_tools import (
    BOLD,
    CYAN,
    ENDC,
    FAIL,
    GRAY,
    OKBLUE,
    get_config,
    get_config_file,
)


def parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the log search tool.

    The upper bound for ``--log-files`` is derived from the
    ``access_log_retention_days`` setting in the ``application_server``
    section of the config file (default 14), plus one for the current,
    un-rotated log file.

    Returns:
        A new ``ArgumentParser``; a fresh instance is built on every call.
    """
    arg_parser = argparse.ArgumentParser(
        description="Search logfiles, ignoring commonly-fetched URLs."
    )

    # --- Which files to read -------------------------------------------
    log_selection = arg_parser.add_argument_group("File selection")
    # -n / -A / -H are alternative ways of choosing how far back to look,
    # so at most one of them may be given.
    log_selection_options = log_selection.add_mutually_exclusive_group()
    access_log_retention_days = int(
        get_config(get_config_file(), "application_server", "access_log_retention_days", "14")
    )
    log_selection_options.add_argument(
        "--log-files",
        "-n",
        help="Number of log files to search",
        choices=range(1, access_log_retention_days + 2),
        metavar=f"[1-{access_log_retention_days+1}]",
        type=int,
    )
    log_selection_options.add_argument(
        "--all-logs",
        "-A",
        help="Parse all logfiles, not just most recent",
        action="store_true",
    )
    log_selection_options.add_argument(
        "--min-hours",
        "-H",
        help="Estimated minimum number of hours; includes previous log file, if estimated less than this",
        type=int,
        choices=range(24),
        default=3,
    )
    log_selection.add_argument(
        "--nginx",
        "-N",
        help="Parse from NGINX logs, not server.log",
        action="store_true",
    )

    # --- Which lines to show -------------------------------------------
    filtering = arg_parser.add_argument_group("Filtering")
    filtering.add_argument(
        "filter_terms",
        # parse_filters also recognizes HTTP methods, so list them here.
        help="IP address, hostname, user-id, HTTP method, path, or status code to search for; multiple are AND'ed together",
        nargs="+",
    )
    filtering.add_argument(
        "--all-lines",
        "-L",
        # Lists every per-category include flag below, including -a.
        help="Show all matching lines; equivalent to -suaemtpr",
        action="store_true",
    )
    filtering.add_argument("--static", "-s", help="Include static file paths", action="store_true")
    filtering.add_argument("--uploads", "-u", help="Include file upload paths", action="store_true")
    filtering.add_argument("--avatars", "-a", help="Include avatar paths", action="store_true")
    filtering.add_argument("--events", "-e", help="Include event fetch paths", action="store_true")
    filtering.add_argument("--messages", "-m", help="Include message paths", action="store_true")
    filtering.add_argument(
        "--typing",
        "-t",
        help="Include typing notification path",
        action="store_true",
    )
    filtering.add_argument("--presence", "-p", help="Include presence paths", action="store_true")
    filtering.add_argument(
        "--report", "-r", help="Include timing report paths", action="store_true"
    )
    filtering.add_argument(
        "--no-other", "-O", help="Exclude paths not explicitly included", action="store_true"
    )

    # --- How to show them ----------------------------------------------
    output = arg_parser.add_argument_group("Output")
    output.add_argument("--full-line", "-F", help="Show full matching line", action="store_true")
    output.add_argument("--timeline", "-T", help="Show start, end, and gaps", action="store_true")
    return arg_parser


def maybe_gzip(logfile_name: str) -> TextIO:
    """Open a log file for reading as text, decompressing it if it is gzipped.

    Files whose name ends in ``.gz`` are opened through ``gzip``; anything
    else is opened as a plain text file.  The caller is responsible for
    closing the returned handle (e.g. by using it in a ``with`` block).
    """
    if not logfile_name.endswith(".gz"):
        return open(logfile_name)  # noqa: SIM115
    return gzip.open(logfile_name, mode="rt")


# Matches one line of the nginx access log.  In order, it captures: the
# client `ip`, the `user` field, the bracketed `date` and `time` (the
# numeric UTC offset is matched but not captured), the quoted request line
# split into `method`, `path` and `http_version`, the status `code`, the
# response size in `bytes`, the quoted `referer` and `user_agent`, and
# finally the `hostname` and request `duration` as trailing fields.
#
# Compiled with re.X, so whitespace inside the pattern is insignificant.
NGINX_LOG_LINE_RE = re.compile(
    r"""
      (?P<ip> \S+ ) \s+
      - \s+
      (?P<user> \S+ ) \s+
      \[
         (?P<date> \d+/\w+/\d+ )
         :
         (?P<time> \d+:\d+:\d+ )
         \s+ [+-]\d+
      \] \s+
      "
         (?P<method> \S+ )
         \s+
         (?P<path> [^"]+ )
         \s+
         (?P<http_version> HTTP/[^"]+ )
      " \s+
      (?P<code> \d+ ) \s+
      (?P<bytes> \d+ ) \s+
      "(?P<referer> [^"]* )" \s+
      "(?P<user_agent> [^"]* )" \s+
      (?P<hostname> \S+ ) \s+
      (?P<duration> \S+ )
    """,
    re.X,
)

# Matches the per-request access lines in the Python server log: an INFO
# line tagged `[zr]` (optionally `[zr:<digits>]`, optionally preceded by a
# `pid:<digits>` field), followed by client `ip`, `method`, status `code`,
# `duration`, any number of parenthesized extras, the request `path`, and a
# trailing "(<user> via <user_agent>)" group.
#
# `user_id` and `hostname` are only populated when the user has the form
# "<digits>@<hostname>" or "unauth@<hostname>" (`user_id` is None for
# "unauth"); for the "zulip-server:...", "scim-client:..." and "internal"
# alternatives both groups are None.
#
# Compiled with re.X, so whitespace and `#` comments inside the pattern are
# insignificant.
PYTHON_LOG_LINE_RE = re.compile(
    r"""
      (?P<date> \d+-\d+-\d+ ) \s+
      (?P<time> \d+:\d+:\d+\.\d+ ) \s+
      INFO \s+  # All access log lines are INFO
      (pid:\d+ \s+) ?
      \[ (?P<source> zr(:\d+)?) \] \s+
      (?P<ip>
        \d{1,3}(\.\d{1,3}){3}
      | ([a-f0-9:]+:+){1,7}[a-f0-9]*
      ) \s+
      (?P<method> [A-Z]+ ) \s+
      (?P<code> \d+ ) \s+
      (?P<duration> \S+ ) \s+ # This can be "217ms" or "1.7s"
      ( \( [^)]+ \) \s+ )*
      (?P<path> /\S* ) \s+
      .*   # Multiple extra things can go here
      \(
        (?P<user>
           ( (?P<user_id> \d+ ) | unauth )
           @
           (?P<hostname> \S+ )
         | zulip-server:\S+
         | scim-client:\S+
         | internal
        ) \s+ via \s+ (?P<user_agent> .* )
      \)
    """,
    re.X,
)


class FilterType(Enum):
    """The kinds of value a positional filter term can be classified as.

    Each kind may be supplied at most once per invocation, and the set of
    kinds in use decides which columns are left out of the printed output.
    """

    HOSTNAME = 1  # compared against the "hostname" group
    CLIENT_IP = 2  # compared against the "ip" group
    USER_ID = 3  # compared against the "user_id" group
    METHOD = 4  # compared against the "method" group
    PATH = 5  # compared against the "path" group
    STATUS = 6  # compared against the "code" group


class FilterFunc(Protocol):
    """Call signature of a per-term filter predicate.

    ``m`` is the regex match for one log line.  ``t`` is the filter term;
    the predicates built by ``parse_filters`` bind it as a default argument,
    so callers invoke them with the match alone.
    """

    def __call__(self, m: Match[str], t: str = ...) -> bool:
        ...


def main() -> None:
    """Parse the command line, scan the selected log files, and print matches.

    Files are read oldest-first so output is in chronological order.  A
    closed output pipe exits with status 1, and Ctrl-C exits with
    ``128 + SIGINT``.
    """
    args = parser().parse_args()

    # parse_filters may flip args.nginx and rewrites args.filter_terms, so it
    # has to run before anything below that reads them.
    filter_types, filter_funcs = parse_filters(args)
    logfile_names = parse_logfile_names(args)
    if args.timeline and args.nginx:
        print("! nginx logs not suggested for timeline, due to imprecision", file=sys.stderr)

    use_color = sys.stdout.isatty()
    needles = [term.lower() for term in args.filter_terms]
    line_re = NGINX_LOG_LINE_RE if args.nginx else PYTHON_LOG_LINE_RE
    try:
        for logfile_name in logfile_names[::-1]:
            with maybe_gzip(logfile_name) as logfile:
                for logline in logfile:
                    # Cheap pre-filter: every term must occur somewhere in
                    # the line before we pay for the full regex parse.
                    haystack = logline.lower()
                    if any(needle not in haystack for needle in needles):
                        continue

                    match = line_re.match(logline)
                    if match is None:
                        # Unparseable lines are only noteworthy for nginx;
                        # the Python logfiles contain other kinds of lines.
                        if args.nginx:
                            print(f"! Failed to parse:\n{logline}", file=sys.stderr)
                    elif passes_filters(filter_funcs, match, args):
                        print_line(
                            match,
                            args,
                            filter_types=filter_types,
                            use_color=use_color,
                        )
    except BrokenPipeError:
        # The interpreter flushes stdout at exit; point it at devnull first
        # so that flush does not raise a second BrokenPipeError.
        devnull = os.open(os.devnull, os.O_WRONLY)
        os.dup2(devnull, sys.stdout.fileno())
        sys.exit(1)
    except KeyboardInterrupt:
        sys.exit(signal.SIGINT + 128)


def parse_logfile_names(args: argparse.Namespace) -> List[str]:
    """Return the paths of the log files to search, newest first.

    The first entry is always the live log (nginx access log with
    ``--nginx``, otherwise the Zulip server log).  Rotated files follow as
    ``<base>.1`` and then ``<base>.N.gz`` for N > 1.

    How many rotated files are included depends on the options:

    * ``--all-logs``: every consecutively-numbered rotated file that
      currently exists on disk.
    * ``--log-files N``: exactly N files in total, whether or not they exist.
    * otherwise: the previous file is added only if the live log looks like
      it covers less than ``--min-hours`` hours.
    """
    if args.nginx:
        base_path = "/var/log/nginx/access.log"
    else:
        base_path = "/var/log/zulip/server.log"

    def rotated_name(n: int) -> str:
        # Only the most recent rotation is expected to be uncompressed.
        return f"{base_path}.{n}" if n == 1 else f"{base_path}.{n}.gz"

    logfile_names = [base_path]

    if args.all_logs:
        # Walk the rotation sequence until the first gap, rather than
        # assuming a fixed number of files: the caller opens the oldest name
        # first, so a single nonexistent name would abort the whole search,
        # and a fixed count would also miss files on hosts configured with a
        # longer retention.
        n = 1
        while os.path.exists(rotated_name(n)):
            logfile_names.append(rotated_name(n))
            n += 1
        return logfile_names

    if args.log_files is not None:
        logfile_count = args.log_files
    else:
        # Detect if there was a logfile rotation in the last
        # (min-hours)-ish hours, and if so include the previous
        # logfile as well.  The estimate compares file sizes, treating the
        # previous file as a full 24 hours of traffic.
        logfile_count = 1
        try:
            current_size = os.path.getsize(base_path)
            past_size = os.path.getsize(base_path + ".1")
            if current_size < (args.min_hours / 24.0) * past_size:
                logfile_count = 2
        except FileNotFoundError:
            # Without both files there is nothing to compare; search only
            # the live log.
            pass

    logfile_names.extend(rotated_name(n) for n in range(1, logfile_count))
    return logfile_names


def parse_filters(
    args: argparse.Namespace,
) -> Tuple[Set[FilterType], List[FilterFunc]]:
    """Classify each positional filter term and build a predicate for it.

    Every term in ``args.filter_terms`` is recognized, by shape, as a status
    code, user-id, IP address, HTTP method, hostname, or path, and turned
    into a function that tests a parsed log line against it.

    Side effects on ``args``:

    * ``args.nginx`` is switched on if the term ``502`` is present.
    * ``args.all_lines`` is switched on if a path term is present.
    * ``args.filter_terms`` is replaced by the normalized terms (lower-cased
      hostnames, ``5xx`` reduced to ``5``) for the caller's substring
      pre-filter.

    Returns:
        The set of filter kinds in use, and the list of predicates.

    Raises:
        RuntimeError: if a term fits none of the recognized shapes.

    Terms that are incompatible with the selected log source, or two terms
    of the same kind, are reported through ``parser().error()``.
    """
    # The heuristics below are not intended to be precise -- they
    # certainly count things as "IPv4" or "IPv6" addresses that are
    # invalid.  However, we expect the input here to already be
    # reasonably well-formed.

    filter_types: Set[FilterType] = set()
    filter_funcs: List[FilterFunc] = []
    filter_terms: List[str] = []

    # Settle the log source before building any predicate.  Hostname and
    # user-id terms are handled differently for nginx and Python logs, so
    # switching sources part-way through the loop would leave predicates
    # built for earlier terms targeting the wrong log format.
    if not args.nginx and "502" in args.filter_terms:
        logging.warning("Adding --nginx -- 502's do not appear in Django logs.")
        args.nginx = True

    for filter_term in args.filter_terms:
        # Each lambda binds the term as a default argument so that it keeps
        # its own value rather than the loop variable's final one.
        if re.match(r"[1-5][0-9][0-9]$", filter_term):
            filter_func = lambda m, t=filter_term: m["code"] == t
            filter_type = FilterType.STATUS
        elif re.match(r"[1-5]xx$", filter_term):
            # "4xx" matches any status code starting with "4"
            filter_term = filter_term[0]
            filter_func = lambda m, t=filter_term: m["code"].startswith(t)
            filter_type = FilterType.STATUS
        elif re.match(r"\d+$", filter_term):
            if args.nginx:
                raise parser().error("Cannot parse user-ids with nginx logs; try without --nginx")
            filter_func = lambda m, t=filter_term: m["user_id"] == t
            filter_type = FilterType.USER_ID
        elif re.match(r"\d{1,3}(\.\d{1,3}){3}$", filter_term):
            filter_func = lambda m, t=filter_term: m["ip"] == t
            filter_type = FilterType.CLIENT_IP
        elif re.match(r"([a-f0-9:]+:+){1,7}[a-f0-9]+$", filter_term):
            filter_func = lambda m, t=filter_term: m["ip"] == t
            filter_type = FilterType.CLIENT_IP
        elif re.match(r"(DELETE|GET|HEAD|OPTIONS|PATCH|POST|PUT)$", filter_term):
            # Anchored on both ends: a term that merely starts with a method
            # name (e.g. "POSTMAN") must fall through to the hostname rules.
            filter_func = lambda m, t=filter_term: m["method"].upper() == t
            filter_type = FilterType.METHOD
        elif re.match(r"[a-z0-9]([a-z0-9-]*[a-z0-9])?$", filter_term.lower()):
            # A single DNS label
            filter_term = filter_term.lower()
            if args.nginx:
                # nginx hostnames are compared by their first label
                filter_func = lambda m, t=filter_term: m["hostname"].startswith(t + ".")
            else:
                filter_func = lambda m, t=filter_term: m["hostname"] == t
            filter_type = FilterType.HOSTNAME
        elif re.match(r"[a-z0-9-]+(\.[a-z0-9-]+)+$", filter_term.lower()) and re.search(
            r"[a-z-]", filter_term.lower()
        ):
            # A dotted name containing at least one non-digit
            if not args.nginx:
                raise parser().error("Cannot parse full domains with Python logs; try --nginx")
            filter_term = filter_term.lower()
            filter_func = lambda m, t=filter_term: m["hostname"] == t
            filter_type = FilterType.HOSTNAME
        elif re.match(r"/\S*$", filter_term):
            filter_func = lambda m, t=filter_term: m["path"] == t
            filter_type = FilterType.PATH
            # An explicit path overrides the default per-category hiding
            args.all_lines = True
        else:
            raise RuntimeError(
                f"Can't parse {filter_term} as an IP, hostname, user-id, path, or status code."
            )
        if filter_type in filter_types:
            parser().error("Supplied the same type of value more than once, which cannot match!")
        filter_types.add(filter_type)
        filter_funcs.append(filter_func)
        filter_terms.append(filter_term)

    # Push back the modified raw strings, so we can use them for fast substring searches
    args.filter_terms = filter_terms

    return (filter_types, filter_funcs)


def passes_filters(
    string_filters: List[FilterFunc],
    match: Match[str],
    args: argparse.Namespace,
) -> bool:
    """Decide whether a parsed log line should be shown.

    The line must satisfy every predicate in ``string_filters``.  Beyond
    that, unless ``args.all_lines`` is set, lines for commonly-fetched paths
    are shown only if the flag for their category is set, and all remaining
    paths are shown unless ``args.no_other`` is set.
    """
    for string_filter in string_filters:
        if not string_filter(match):
            return False

    if args.all_lines:
        return True

    path = match["path"]

    # The first category the path falls into decides the outcome.
    if path.startswith("/static/"):
        return args.static
    if path.startswith("/user_uploads/"):
        return args.uploads
    if path.startswith(("/user_avatars/", "/avatar/")):
        return args.avatars
    if re.match(r"/(json|api/v1)/events($|\?|/)", path):
        return args.events
    if path in ("/api/v1/typing", "/json/typing"):
        return args.typing
    if re.match(r"/(json|api/v1)/messages($|\?|/)", path):
        return args.messages
    if path in ("/api/v1/users/me/presence", "/json/users/me/presence"):
        return args.presence
    if path == "/error_tracing" or path.startswith(("/api/v1/report/", "/json/report/")):
        return args.report

    # Not in any known category
    return not args.no_other


# End time of the most recent line printed in --timeline mode; print_line
# updates it and uses it to report the gap before the next line.
last_match_end: Optional[datetime] = None

# Month abbreviation (as listed by calendar.month_abbr) -> zero-padded month
# number, used to turn nginx dates into ISO dates.  calendar.month_abbr
# starts with an empty placeholder, which maps to "00".
month_lookup = {
    abbreviation: format(number, "02d")
    for number, abbreviation in enumerate(calendar.month_abbr)
}


def _duration_to_ms(duration: str) -> int:
    """Convert a logged request duration to whole milliseconds.

    Accepts milliseconds with an "ms" suffix ("217ms"), seconds with an "s"
    suffix ("1.7s"), and a bare number of seconds ("0.123").
    """
    if duration.endswith("ms"):
        return int(duration[:-2])
    # Strip a unit only if one is present; a bare value would otherwise lose
    # its final digit.
    # NOTE(review): assumes nginx logs the duration as unit-less seconds --
    # confirm against the nginx log_format in use.
    seconds = duration[:-1] if duration.endswith("s") else duration
    # round() rather than int(): float error can leave the product just
    # below the intended integer (e.g. 1.001 * 1000).
    return round(float(seconds) * 1000)


def print_line(
    match: Match[str],
    args: argparse.Namespace,
    filter_types: Set[FilterType],
    use_color: bool,
) -> None:
    """Print one matched log line in condensed, aligned form.

    With ``args.full_line``, the text matched by the log-line regex is
    printed verbatim and nothing else is done.  Otherwise the output is:
    timestamp, duration, user-id, client IP, status code, method and URL.
    Columns the user already filtered on (user-id, IP, hostname) are left
    out, and the user-id column is never shown for nginx logs.

    With ``args.timeline``, the line is preceded by a note of the gap (or
    overlap) since the previous printed request ended, and by this request's
    computed start time.

    Args:
        match: A match from NGINX_LOG_LINE_RE or PYTHON_LOG_LINE_RE,
            according to ``args.nginx``.
        args: The parsed command-line options.
        filter_types: The kinds of filter in use.
        use_color: Whether to emit terminal color escape sequences.

    Updates the module-level ``last_match_end`` in timeline mode.
    """
    global last_match_end

    if args.full_line:
        print(match.group(0))
        return

    if args.nginx:
        # Rewrite "DD/Mon/YYYY" into ISO "YYYY-MM-DD"
        day_of_month, month_abbr, year = match["date"].split("/")
        date = f"{year}-{month_lookup[month_abbr]}-{day_of_month}"
    else:
        date = match["date"]

    # The date is only shown when the user asked for more than one log file.
    show_date = args.all_logs or (args.log_files is not None and args.log_files > 1)
    if show_date:
        ts = date + " " + match["time"]
    else:
        ts = match["time"]

    duration_ms = _duration_to_ms(match["duration"])

    # Flag non-success responses with a one-character marker and a color.
    code = int(match["code"])
    indicator = " "
    color = ""
    if code == 401:
        indicator = ":"
        color = CYAN
    elif code == 499:
        indicator = "-"
        color = GRAY
    elif code >= 400 and code < 499:
        indicator = ">"
        color = OKBLUE
    elif code >= 500 and code <= 599:
        indicator = "!"
        color = FAIL

    if use_color:
        url = f"{BOLD}{match['path']}"
    else:
        url = match["path"]
        color = ""

    if FilterType.HOSTNAME not in filter_types:
        hostname = match["hostname"]
        if hostname is None:
            # The log line carried no hostname
            hostname = "???." + settings.EXTERNAL_HOST
        elif not args.nginx:
            # Python logs hold only a short name; expand it to a full host.
            if hostname != "root":
                hostname += "." + settings.EXTERNAL_HOST
            elif settings.EXTERNAL_HOST == "zulipchat.com":
                hostname = "zulip.com"
            else:
                hostname = settings.EXTERNAL_HOST
        url = "https://" + hostname + url

    user_id = ""
    if not args.nginx and match["user_id"] is not None:
        user_id = match["user_id"] + "@"

    if args.timeline:
        # The logged timestamp is treated as the end of the request; the
        # start is derived by subtracting the duration.
        logline_end = datetime.fromisoformat(date + " " + match["time"])
        logline_start = logline_end - timedelta(milliseconds=duration_ms)
        if last_match_end is not None:
            gap_ms = int((logline_start - last_match_end) / timedelta(milliseconds=1))
            if gap_ms > 5000:
                print()
                print(f"========== {int(gap_ms/1000):>4} second gap ==========")
                print()
            elif gap_ms > 1000:
                print(f"============ {gap_ms:>5}ms gap ============")
            elif gap_ms > 0:
                print(f"------------ {gap_ms:>5}ms gap ------------")
            else:
                # This request started before the previous one ended
                print(f"!!!!!!!!!! {abs(gap_ms):>5}ms overlap !!!!!!!!!!")
        if show_date:
            print(logline_start.isoformat(" ", timespec="milliseconds") + " (start)")
        else:
            print(logline_start.time().isoformat(timespec="milliseconds") + " (start)")
        last_match_end = logline_end

    # None marks a column that is omitted entirely.
    parts = [
        ts,
        f"{duration_ms:>5}ms",
        f"{user_id:7}" if not args.nginx and FilterType.USER_ID not in filter_types else None,
        f"{match['ip']:39}" if FilterType.CLIENT_IP not in filter_types else None,
        indicator + match["code"],
        f"{match['method']:6}",
        url,
    ]

    print(color + " ".join(p for p in parts if p is not None) + (ENDC if use_color else ""))


# Run the search only when this file is executed as a script, not when it is
# imported.
if __name__ == "__main__":
    main()