rate_limit: Add a flag to lump all TOR exit node IPs together.
TOR users are legitimate users of the system; however, that system can
also be used for abuse -- specifically, by evading IP-based
rate-limiting.
For the purposes of IP-based rate-limiting, add a
RATE_LIMIT_TOR_TOGETHER flag, defaulting to false, which lumps all
requests from TOR exit nodes into the same bucket. This may allow a
TOR user to deny other TOR users access to the find-my-account and
new-realm endpoints, but this is a low cost for cutting off a
significant potential abuse vector.
If enabled, the list of TOR exit nodes is fetched from their public
endpoint once per hour, via a cron job, and cached on disk. Django
processes load this data from disk, and cache it in memcached.
Requests are spared from the burden of checking disk on failure via a
circuitbreaker, which trips if there are two failures in a row, and
only begins trying again after 10 minutes.
2021-11-03 21:43:02 +01:00
|
|
|
import os
|
|
|
|
from argparse import ArgumentParser
|
|
|
|
from typing import Any, Set
|
|
|
|
|
|
|
|
import orjson
|
|
|
|
from django.conf import settings
|
2023-10-12 19:43:45 +02:00
|
|
|
from typing_extensions import override
|
2022-01-13 22:02:54 +01:00
|
|
|
from urllib3.util import Retry
|
rate_limit: Add a flag to lump all TOR exit node IPs together.
TOR users are legitimate users of the system; however, that system can
also be used for abuse -- specifically, by evading IP-based
rate-limiting.
For the purposes of IP-based rate-limiting, add a
RATE_LIMIT_TOR_TOGETHER flag, defaulting to false, which lumps all
requests from TOR exit nodes into the same bucket. This may allow a
TOR user to deny other TOR users access to the find-my-account and
new-realm endpoints, but this is a low cost for cutting off a
significant potential abuse vector.
If enabled, the list of TOR exit nodes is fetched from their public
endpoint once per hour, via a cron job, and cached on disk. Django
processes load this data from disk, and cache it in memcached.
Requests are spared from the burden of checking disk on failure via a
circuitbreaker, which trips if there are two failures in a row, and
only begins trying again after 10 minutes.
2021-11-03 21:43:02 +01:00
|
|
|
|
|
|
|
from zerver.lib.management import ZulipBaseCommand
|
|
|
|
from zerver.lib.outgoing_http import OutgoingSession
|
|
|
|
|
|
|
|
|
|
|
|
class TorDataSession(OutgoingSession):
    """Outgoing HTTP session (role ``tor_data``) that retries failed requests.

    Built with a 3-second timeout and a ``Retry`` policy that retries on a
    fixed set of HTTP status codes.
    """

    def __init__(self, max_retries: int) -> None:
        """Configure the retry policy and initialize the underlying session.

        Args:
            max_retries: Passed to ``Retry`` as ``total``.
        """
        # NOTE(review): this assigns an attribute on the Retry class itself,
        # not on one instance, so it is visible to every Retry in the
        # process.  Presumably it caps the backoff sleep at 64 seconds --
        # confirm against the installed urllib3 version.
        Retry.DEFAULT_BACKOFF_MAX = 64

        # Response codes which should trigger another attempt.
        retryable_statuses = {
            429,  # Too many requests (rate-limited)
            500,  # Internal server error
            502,  # Bad gateway
            503,  # Service unavailable
        }
        retry_policy = Retry(
            total=max_retries,
            backoff_factor=2.0,
            status_forcelist=retryable_statuses,
        )
        super().__init__(role="tor_data", timeout=3, max_retries=retry_policy)
|
|
|
|
|
|
|
|
|
|
|
|
class Command(ZulipBaseCommand):
    """Management command that refreshes the on-disk list of TOR exit node IPs."""

    help = """Fetch the list of TOR exit nodes, and write the list of IP addresses
to a file for access from Django for rate-limiting purposes.

Does nothing unless RATE_LIMIT_TOR_TOGETHER is enabled.
"""

    @override
    def add_arguments(self, parser: ArgumentParser) -> None:
        """Register the ``--max-retries`` option (an integer, default 10)."""
        parser.add_argument(
            "--max-retries",
            default=10,
            type=int,
            help="Number of times to retry fetching data from TOR",
        )

    @override
    def handle(self, *args: Any, **options: Any) -> None:
        """Download the exit-node list and write it to the configured file.

        Returns immediately when ``settings.RATE_LIMIT_TOR_TOGETHER`` is
        falsy.  Otherwise the exit-address list is fetched, the IP addresses
        are extracted, and the result is written as a JSON array to
        ``settings.TOR_EXIT_NODE_FILE_PATH``.

        ``response.raise_for_status()`` is called before parsing, so an
        error status from the endpoint propagates to the caller and the
        existing file is left untouched.
        """
        if not settings.RATE_LIMIT_TOR_TOGETHER:
            return

        # Optional CA bundle path; when the variable is unset this is None.
        ca_bundle = os.environ.get("CUSTOM_CA_CERTIFICATES")
        tor_session = TorDataSession(max_retries=options["max_retries"])
        response = tor_session.get(
            "https://check.torproject.org/exit-addresses",
            verify=ca_bundle,
        )
        response.raise_for_status()

        # The payload is a sequence of records shaped like:
        #     ExitNode 4273E6D162ED2717A1CF4207A254004CD3F5307B
        #     Published 2021-11-02 11:01:07
        #     LastStatus 2021-11-02 23:00:00
        #     ExitAddress 176.10.99.200 2021-11-02 23:17:02
        # Only the second field of each "ExitAddress" line is kept.
        exit_nodes: Set[str] = {
            line.split()[1]
            for line in response.text.splitlines()
            if line.startswith("ExitAddress ")
        }

        destination = settings.TOR_EXIT_NODE_FILE_PATH
        scratch = destination + ".tmp"

        # Stage the data in a sibling temporary file so that readers can
        # never observe a partially-written file...
        with open(scratch, "wb") as f:
            f.write(orjson.dumps(list(exit_nodes)))

        # ...then move it into place with an atomic rename.
        os.rename(scratch, destination)
|