From 89b3129d48bf11a7fadb13f8d67a34beb89cbd55 Mon Sep 17 00:00:00 2001 From: Anders Kaseorg Date: Tue, 2 Apr 2024 13:06:59 -0700 Subject: [PATCH] documentation_crawler: Consider status.zulip.com external. Signed-off-by: Anders Kaseorg --- .../documentation_crawler/spiders/common/spiders.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/documentation_crawler/documentation_crawler/spiders/common/spiders.py b/tools/documentation_crawler/documentation_crawler/spiders/common/spiders.py index 746acf3f71..9d56ee9d57 100644 --- a/tools/documentation_crawler/documentation_crawler/spiders/common/spiders.py +++ b/tools/documentation_crawler/documentation_crawler/spiders/common/spiders.py @@ -80,11 +80,14 @@ class BaseDocumentationSpider(scrapy.Spider): def _is_external_link(self, url: str) -> bool: split_url = urlsplit(url) - if split_url.hostname == "chat.zulip.org": + if split_url.hostname in ("chat.zulip.org", "status.zulip.com"): # Since most chat.zulip.org URLs will be links to specific # logged-in content that the spider cannot verify, or the # homepage, there's no need to check those (which can # cause errors when chat.zulip.org is being updated). + # + # status.zulip.com is externally hosted and, in a peculiar twist of + # cosmic irony, often itself offline. return True if split_url.hostname == "zulip.readthedocs.io" or f".{split_url.hostname}".endswith( (".zulip.com", ".zulip.org")