zulip/zerver/openapi/openapi.py

# Zulip's OpenAPI-based API documentation system is documented at
#   https://zulip.readthedocs.io/en/latest/documentation/api.html
#
# This file contains helper functions to interact with the OpenAPI
# definitions and validate that Zulip's implementation matches what is
# described in our documentation.

import json
import os
import re
from collections.abc import Mapping
from typing import Any, Literal

import orjson
from openapi_core import OpenAPI
from openapi_core.protocols import Request, Response
from openapi_core.testing import MockRequest, MockResponse
from openapi_core.validation.exceptions import ValidationError as OpenAPIValidationError
from pydantic import BaseModel

OPENAPI_SPEC_PATH = os.path.abspath(
    os.path.join(os.path.dirname(__file__), "../openapi/zulip.yaml")
)

# A list of endpoint-methods such that the endpoint
# has documentation but not with this particular method.
EXCLUDE_UNDOCUMENTED_ENDPOINTS = {
    ("/users", "patch"),
}
# Consists of endpoints with some documentation remaining.
# These are skipped but return true as the validator cannot exclude objects
EXCLUDE_DOCUMENTED_ENDPOINTS: set[tuple[str, str]] = set()


# Most of our code expects allOf to be preprocessed away because that is what
# yamole did.  Its algorithm for doing so is not standards compliant, but we
# replicate it here.
def naively_merge(a: dict[str, object], b: dict[str, object]) -> dict[str, object]:
    ret: dict[str, object] = a.copy()
    for key, b_value in b.items():
        if key == "example" or key not in ret:
            ret[key] = b_value
            continue
        a_value = ret[key]
        if isinstance(b_value, list):
            assert isinstance(a_value, list)
            ret[key] = a_value + b_value
        elif isinstance(b_value, dict):
            assert isinstance(a_value, dict)
            ret[key] = naively_merge(a_value, b_value)
    return ret


def naively_merge_allOf(obj: object) -> object:
    if isinstance(obj, dict):
        return naively_merge_allOf_dict(obj)
    elif isinstance(obj, list):
        return list(map(naively_merge_allOf, obj))
    else:
        return obj


def naively_merge_allOf_dict(obj: dict[str, object]) -> dict[str, object]:
    if "allOf" in obj:
        ret = obj.copy()
        subschemas = ret.pop("allOf")
        ret = naively_merge_allOf_dict(ret)
        assert isinstance(subschemas, list)
        for subschema in subschemas:
            assert isinstance(subschema, dict)
            ret = naively_merge(ret, naively_merge_allOf_dict(subschema))
        return ret
    return {key: naively_merge_allOf(value) for key, value in obj.items()}


class OpenAPISpec:
    def __init__(self, openapi_path: str) -> None:
        self.openapi_path = openapi_path
        self.mtime: float | None = None
        self._openapi: dict[str, Any] = {}
        self._endpoints_dict: dict[str, str] = {}
        self._spec: OpenAPI | None = None

    def check_reload(self) -> None:
        # Because importing yaml takes significant time, and we only
        # use python-yaml for our API docs, importing it lazily here
        # is a significant optimization to `manage.py` startup.
        #
        # There is a bit of a race here...we may have two processes
        # accessing this module level object and both trying to
        # populate self.data at the same time.  Hopefully this will
        # only cause some extra processing at startup and not data
        # corruption.

        import yaml
        from jsonref import JsonRef

        with open(self.openapi_path) as f:
            mtime = os.fstat(f.fileno()).st_mtime
            # Using == rather than >= to cover the corner case of users placing an
            # earlier version than the current one
            if self.mtime == mtime:
                return

            openapi = yaml.load(f, Loader=yaml.CSafeLoader)

        spec = OpenAPI.from_dict(openapi)
        self._spec = spec
        self._openapi = naively_merge_allOf_dict(JsonRef.replace_refs(openapi))
        self.create_endpoints_dict()
        self.mtime = mtime

    def create_endpoints_dict(self) -> None:
        # Algorithm description:
        # We have 2 types of endpoints
        # 1.with path arguments 2. without path arguments
        # In validate_against_openapi_schema we directly check
        # if we have a without path endpoint, since it does not
        # require regex. Hence they are not part of the regex dict
        # and now we are left with only:
        # endpoint with path arguments.
        # Now for this case, the regex has been created carefully,
        # numeric arguments are matched with [0-9] only and
        # emails are matched with their regex. This is why there are zero
        # collisions. Hence if this regex matches
        # an incorrect endpoint then there is some backend problem.
        # For example if we have users/{name}/presence then it will
        # conflict with users/me/presence even in the backend.
        # Care should be taken though that if we have special strings
        # such as email they must be substituted with proper regex.

        email_regex = r"([a-zA-Z0-9_\-\.]+)@([a-zA-Z0-9_\-\.]+)\.([a-zA-Z]{2,5})"
        self._endpoints_dict = {}
        for endpoint in self._openapi["paths"]:
            if "{" not in endpoint:
                continue
            path_regex = "^" + endpoint + "$"
            # Numeric arguments have id at their end
            # so find such arguments and replace them with numeric
            # regex
            path_regex = re.sub(r"{[^}]*id}", r"[0-9]*", path_regex)
            # Email arguments end with email
            path_regex = re.sub(r"{[^}]*email}", email_regex, path_regex)
            # All other types of arguments are supposed to be
            # all-encompassing string.
            path_regex = re.sub(r"{[^}]*}", r"[^\/]*", path_regex)
            path_regex = path_regex.replace(r"/", r"\/")
            self._endpoints_dict[path_regex] = endpoint

    def openapi(self) -> dict[str, Any]:
        """Reload the OpenAPI file if it has been modified after the last time
        it was read, and then return the parsed data.
        """
        self.check_reload()
        assert len(self._openapi) > 0
        return self._openapi

    def endpoints_dict(self) -> dict[str, str]:
        """Reload the OpenAPI file if it has been modified after the last time
        it was read, and then return the parsed data.
        """
        self.check_reload()
        assert len(self._endpoints_dict) > 0
        return self._endpoints_dict

    def spec(self) -> OpenAPI:
        """Reload the OpenAPI file if it has been modified after the last time
        it was read, and then return the openapi_core validator object. Similar
        to preceding functions. Used for proper access to OpenAPI objects.
        """
        self.check_reload()
        assert self._spec is not None
        return self._spec


class SchemaError(Exception):
    pass


openapi_spec = OpenAPISpec(OPENAPI_SPEC_PATH)


def get_schema(endpoint: str, method: str, status_code: str) -> dict[str, Any]:
    if len(status_code) == 3 and (
        "oneOf"
        in openapi_spec.openapi()["paths"][endpoint][method.lower()]["responses"][status_code][
            "content"
        ]["application/json"]["schema"]
    ):
        # Currently at places where multiple schemas are defined they only
        # differ in example so either can be used.
        status_code += "_0"
    if len(status_code) == 3:
        schema = openapi_spec.openapi()["paths"][endpoint][method.lower()]["responses"][
            status_code
        ]["content"]["application/json"]["schema"]
        return schema
    else:
        subschema_index = int(status_code[4])
        status_code = status_code[0:3]
        schema = openapi_spec.openapi()["paths"][endpoint][method.lower()]["responses"][
            status_code
        ]["content"]["application/json"]["schema"]["oneOf"][subschema_index]
        return schema


def get_openapi_fixture(
    endpoint: str, method: str, status_code: str = "200"
) -> list[dict[str, Any]]:
    """Fetch a fixture from the full spec object."""
    if "example" not in get_schema(endpoint, method, status_code):
        return openapi_spec.openapi()["paths"][endpoint][method.lower()]["responses"][status_code][
            "content"
        ]["application/json"]["examples"].values()
    return [
        {
            "description": get_schema(endpoint, method, status_code)["description"],
            "value": get_schema(endpoint, method, status_code)["example"],
        }
    ]


def get_curl_include_exclude(endpoint: str, method: str) -> list[dict[str, Any]]:
    """Fetch all the kinds of parameters required for curl examples."""
    if (
        "x-curl-examples-parameters"
        not in openapi_spec.openapi()["paths"][endpoint][method.lower()]
    ):
        return [{"type": "exclude", "parameters": {"enum": [""]}}]
    return openapi_spec.openapi()["paths"][endpoint][method.lower()]["x-curl-examples-parameters"][
        "oneOf"
    ]


def check_requires_administrator(endpoint: str, method: str) -> bool:
    """Fetch if the endpoint requires admin config."""
    return openapi_spec.openapi()["paths"][endpoint][method.lower()].get(
        "x-requires-administrator", False
    )


def check_additional_imports(endpoint: str, method: str) -> list[str] | None:
    """Fetch the additional imports required for an endpoint."""
    return openapi_spec.openapi()["paths"][endpoint][method.lower()].get(
        "x-python-examples-extra-imports", None
    )


def get_responses_description(endpoint: str, method: str) -> str:
    """Fetch responses description of an endpoint."""
    return openapi_spec.openapi()["paths"][endpoint][method.lower()].get(
        "x-response-description", ""
    )


def get_parameters_description(endpoint: str, method: str) -> str:
    """Fetch parameters description of an endpoint."""
    return openapi_spec.openapi()["paths"][endpoint][method.lower()].get(
        "x-parameter-description", ""
    )


def generate_openapi_fixture(endpoint: str, method: str) -> list[str]:
    """Generate fixture to be rendered"""
    fixture = []
    for status_code in sorted(
        openapi_spec.openapi()["paths"][endpoint][method.lower()]["responses"]
    ):
        if (
            "oneOf"
            in openapi_spec.openapi()["paths"][endpoint][method.lower()]["responses"][status_code][
                "content"
            ]["application/json"]["schema"]
        ):
            subschema_count = len(
                openapi_spec.openapi()["paths"][endpoint][method.lower()]["responses"][status_code][
                    "content"
                ]["application/json"]["schema"]["oneOf"]
            )
        else:
            subschema_count = 1
        for subschema_index in range(subschema_count):
            if subschema_count != 1:
                subschema_status_code = status_code + "_" + str(subschema_index)
            else:
                subschema_status_code = status_code
            fixture_dict = get_openapi_fixture(endpoint, method, subschema_status_code)
            for example in fixture_dict:
                fixture_json = json.dumps(
                    example["value"], indent=4, sort_keys=True, separators=(",", ": ")
                )
                if "description" in example:
                    fixture.extend(example["description"].strip().splitlines())
                fixture.append("``` json")
                fixture.extend(fixture_json.splitlines())
                fixture.append("```")
    return fixture


def get_openapi_description(endpoint: str, method: str) -> str:
    """Fetch a description from the full spec object."""
    endpoint_documentation = openapi_spec.openapi()["paths"][endpoint][method.lower()]
    endpoint_description = endpoint_documentation["description"]
    check_deprecated_consistency(
        endpoint_documentation.get("deprecated", False), endpoint_description
    )
    return endpoint_description


def get_openapi_summary(endpoint: str, method: str) -> str:
    """Fetch a summary from the full spec object."""
    return openapi_spec.openapi()["paths"][endpoint][method.lower()]["summary"]


def get_endpoint_from_operationid(operationid: str) -> tuple[str, str]:
    for endpoint in openapi_spec.openapi()["paths"]:
        for method in openapi_spec.openapi()["paths"][endpoint]:
            operationId = openapi_spec.openapi()["paths"][endpoint][method].get("operationId")
            if operationId == operationid:
                return (endpoint, method)
    raise AssertionError("No such page exists in OpenAPI data.")


def get_openapi_paths() -> set[str]:
    return set(openapi_spec.openapi()["paths"].keys())


NO_EXAMPLE = object()


class Parameter(BaseModel):
    kind: Literal["query", "path", "formData"]
    name: str
    description: str
    json_encoded: bool
    value_schema: dict[str, Any]
    example: object
    required: bool
    deprecated: bool


def get_openapi_parameters(
    endpoint: str, method: str, include_url_parameters: bool = True
) -> list[Parameter]:
    operation = openapi_spec.openapi()["paths"][endpoint][method.lower()]
    parameters = []

    # We do a `.get()` for this last bit to distinguish documented
    # endpoints with no parameters (empty list) from undocumented
    # endpoints (KeyError exception).
    for parameter in operation.get("parameters", []):
        # Also, we skip parameters defined in the URL.
        if not include_url_parameters and parameter["in"] == "path":
            continue

        json_encoded = "content" in parameter
        if json_encoded:
            schema = parameter["content"]["application/json"]["schema"]
        else:
            schema = parameter["schema"]

        if "example" in parameter:
            example = parameter["example"]
        elif json_encoded and "example" in parameter["content"]["application/json"]:
            example = parameter["content"]["application/json"]["example"]
        else:
            example = schema.get("example", NO_EXAMPLE)

        parameters.append(
            Parameter(
                kind=parameter["in"],
                name=parameter["name"],
                description=parameter["description"],
                json_encoded=json_encoded,
                value_schema=schema,
                example=example,
                required=parameter.get("required", False),
                deprecated=parameter.get("deprecated", False),
            )
        )

    if "requestBody" in operation and "application/x-www-form-urlencoded" in (
        content := operation["requestBody"]["content"]
    ):
        media_type = content["application/x-www-form-urlencoded"]
        required = media_type["schema"].get("required", [])
        for key, schema in media_type["schema"]["properties"].items():
            json_encoded = (
                "encoding" in media_type
                and key in (encodings := media_type["encoding"])
                and encodings[key].get("contentType") == "application/json"
            ) or schema.get("type") == "object"

            parameters.append(
                Parameter(
                    kind="formData",
                    name=key,
                    description=schema["description"],
                    json_encoded=json_encoded,
                    value_schema=schema,
                    example=schema.get("example"),
                    required=key in required,
                    deprecated=schema.get("deprecated", False),
                )
            )

    return parameters


def get_openapi_return_values(endpoint: str, method: str) -> dict[str, Any]:
    operation = openapi_spec.openapi()["paths"][endpoint][method.lower()]
    schema = operation["responses"]["200"]["content"]["application/json"]["schema"]
    # We do not currently have documented endpoints that have multiple schemas
    # ("oneOf", "anyOf", "allOf") for success ("200") responses. If this changes,
    # then the assertion below will need to be removed, and this function updated
    # so that endpoint responses will be rendered as expected.
    assert "properties" in schema
    return schema["properties"]


def find_openapi_endpoint(path: str) -> str | None:
    for path_regex, endpoint in openapi_spec.endpoints_dict().items():
        matches = re.match(path_regex, path)
        if matches:
            return endpoint
    return None


def validate_against_openapi_schema(
    content: dict[str, Any], path: str, method: str, status_code: str
) -> bool:
    mock_request = MockRequest("http://localhost:9991/", method, "/api/v1" + path)
    mock_response = MockResponse(
        orjson.dumps(content),
        status_code=int(status_code),
    )
    return validate_test_response(mock_request, mock_response)


def validate_test_response(request: Request, response: Response) -> bool:
    """Compare a "content" dict with the defined schema for a specific method
    in an endpoint. Return true if validated and false if skipped.
    """

    if request.path.startswith("/json/"):
        path = request.path.removeprefix("/json")
    elif request.path.startswith("/api/v1/"):
        path = request.path.removeprefix("/api/v1")
    else:
        return False
    assert request.method is not None
    method = request.method.lower()
    status_code = str(response.status_code)

    # This first set of checks are primarily training wheels that we
    # hope to eliminate over time as we improve our API documentation.

    if path not in openapi_spec.openapi()["paths"]:
        endpoint = find_openapi_endpoint(path)
        # If it doesn't match it hasn't been documented yet.
        if endpoint is None:
            return False
    else:
        endpoint = path
    # Excluded endpoint/methods
    if (endpoint, method) in EXCLUDE_UNDOCUMENTED_ENDPOINTS:
        return False
    # Return true for endpoints with only response documentation remaining
    if (endpoint, method) in EXCLUDE_DOCUMENTED_ENDPOINTS:  # nocoverage
        return True
    # Code is not declared but appears in various 400 responses. If
    # common, it can be added to 400 response schema
    if status_code.startswith("4"):
        # This return statement should ideally be not here. But since
        # we have not defined 400 responses for various paths this has
        # been added as all 400 have the same schema.  When all 400
        # response have been defined this should be removed.
        return True

    try:
        openapi_spec.spec().validate_response(request, response)
    except OpenAPIValidationError as error:
        message = f"Response validation error at {method} /api/v1{path} ({status_code}):"
        message += f"\n\n{type(error).__name__}: {error}"
        message += (
            "\n\nFor help debugging these errors see: "
            "https://zulip.readthedocs.io/en/latest/documentation/api.html#debugging-schema-validation-errors"
        )
        raise SchemaError(message) from None

    return True


def validate_schema(schema: dict[str, Any]) -> None:
    """Check if opaque objects are present in the OpenAPI spec; this is an
    important part of our policy for ensuring every detail of Zulip's
    API responses is correct.

    This is done by checking for the presence of the
    `additionalProperties` attribute for all objects (dictionaries).
    """
    if "oneOf" in schema:
        for subschema in schema["oneOf"]:
            validate_schema(subschema)
    elif schema["type"] == "array":
        validate_schema(schema["items"])
    elif schema["type"] == "object":
        if "additionalProperties" not in schema:
            raise SchemaError(
                "additionalProperties needs to be defined for objects to make sure they have no"
                " additional properties left to be documented."
            )
        for property_schema in schema.get("properties", {}).values():
            validate_schema(property_schema)
        if schema["additionalProperties"]:
            validate_schema(schema["additionalProperties"])


def deprecated_note_in_description(description: str) -> bool:
    if "**Changes**: Deprecated" in description:
        return True

    return "**Deprecated**" in description


def check_deprecated_consistency(deprecated: bool, description: str) -> None:
    # Test to make sure deprecated parameters are marked so.
    if deprecated_note_in_description(description):
        assert (
            deprecated
        ), f"Missing `deprecated: true` despite being described as deprecated:\n\n{description}\n"
    if deprecated:
        assert deprecated_note_in_description(
            description
        ), f"Marked as `deprecated: true`, but changes documentation doesn't properly explain as **Deprecated** in the standard format\n\n:{description}\n"


# Skip those JSON endpoints whose query parameters are different from
# their `/api/v1` counterpart.  This is a legacy code issue that we
# plan to fix by changing the implementation.
SKIP_JSON = {
    ("/fetch_api_key", "post"),
}


def validate_request(
    url: str,
    method: str,
    data: str | bytes | Mapping[str, Any],
    http_headers: dict[str, str],
    json_url: bool,
    status_code: str,
    intentionally_undocumented: bool = False,
) -> None:
    assert isinstance(data, dict)
    mock_request = MockRequest(
        "http://localhost:9991/",
        method,
        "/api/v1" + url,
        headers=http_headers,
        args={k: str(v) for k, v in data.items()},
    )
    validate_test_request(mock_request, status_code, intentionally_undocumented)


def validate_test_request(
    request: Request,
    status_code: str,
    intentionally_undocumented: bool = False,
) -> None:
    assert request.method is not None
    method = request.method.lower()
    if request.path.startswith("/json/"):
        url = request.path.removeprefix("/json")
        # Some JSON endpoints have different parameters compared to
        # their `/api/v1` counterparts.
        if (url, method) in SKIP_JSON:
            return
    else:
        assert request.path.startswith("/api/v1/")
        url = request.path.removeprefix("/api/v1")

    # TODO: Add support for file upload endpoints that lack the /json/
    # or /api/v1/ prefix.
    if url == "/user_uploads" or url.startswith("/realm/emoji/"):
        return

    # Requests that do not validate against the OpenAPI spec must either:
    # * Have returned a 400 (bad request) error
    # * Have returned a 200 (success) with this request marked as intentionally
    # undocumented behavior.
    if status_code.startswith("4"):
        return
    if status_code.startswith("2") and intentionally_undocumented:
        return

    # Now using the openapi_core APIs, validate the request schema
    # against the OpenAPI documentation.
    try:
        openapi_spec.spec().validate_request(request)
    except OpenAPIValidationError as error:
        # Show a block error message explaining the options for fixing it.
        msg = f"""

Error!  The OpenAPI schema for {method} {url} is not consistent
with the parameters passed in this HTTP request.  Consider:

* Updating the OpenAPI schema defined in zerver/openapi/zulip.yaml
* Adjusting the test to pass valid parameters.  If the test
  fails due to intentionally_undocumented features, you need to pass
  `intentionally_undocumented=True` to self.client_{method.lower()} or
  self.api_{method.lower()} to document your intent.

See https://zulip.readthedocs.io/en/latest/documentation/api.html for help.

The error logged by the OpenAPI validator is below:
{error}
"""
        raise SchemaError(msg)