From bdc4721aee8228ffe1d43971281ce2cc82e76750 Mon Sep 17 00:00:00 2001
From: Aman Agrawal
Date: Wed, 14 Oct 2020 09:48:28 +0530
Subject: [PATCH] BaseDocumentationSpider: Don't crawl webapp.

---
 .../documentation_crawler/spiders/common/spiders.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tools/documentation_crawler/documentation_crawler/spiders/common/spiders.py b/tools/documentation_crawler/documentation_crawler/spiders/common/spiders.py
index 145199f2da..9c66f242cb 100644
--- a/tools/documentation_crawler/documentation_crawler/spiders/common/spiders.py
+++ b/tools/documentation_crawler/documentation_crawler/spiders/common/spiders.py
@@ -107,6 +107,13 @@ class BaseDocumentationSpider(scrapy.Spider):
         return callback
 
     def _make_requests(self, url: str) -> Iterator[Request]:
+        # These URLs are for Zulip's webapp, which with recent changes
+        # can be accessible without logging in to an account. While we
+        # do crawl documentation served by the webapp (e.g. /help/), we
+        # don't want to crawl the webapp itself, so we exclude these.
+        if url in ['http://localhost:9981/', 'http://localhost:9981'] or url.startswith('http://localhost:9981/#') or url.startswith('http://localhost:9981#'):
+            return
+
         callback: Callable[[Response], Optional[Iterator[Request]]] = self.parse
         dont_filter = False
         method = 'GET'
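
For context, below is a minimal standalone sketch of the exclusion check this
patch adds. The helper name `is_webapp_url` is illustrative (the patch inlines
the check directly in `_make_requests`), and the hard-coded
`http://localhost:9981` base matches the development-server URL the patch
tests against:

# Sketch of the exclusion logic above, factored into a predicate for
# illustration only; the actual patch inlines this in _make_requests.

def is_webapp_url(url: str) -> bool:
    """Return True for URLs that load the Zulip webapp itself.

    The webapp is served at the server root and at fragment URLs such
    as http://localhost:9981/#narrow/..., while documentation pages
    (e.g. /help/) live under real sub-paths and should still be
    crawled.
    """
    webapp_roots = ('http://localhost:9981/', 'http://localhost:9981')
    webapp_prefixes = ('http://localhost:9981/#', 'http://localhost:9981#')
    # str.startswith accepts a tuple of prefixes, so both fragment
    # forms (with and without the trailing slash) are covered.
    return url in webapp_roots or url.startswith(webapp_prefixes)


# Quick checks: webapp URLs are skipped, documentation URLs are not.
assert is_webapp_url('http://localhost:9981/')
assert is_webapp_url('http://localhost:9981/#narrow/stream/1')
assert not is_webapp_url('http://localhost:9981/help/')
assert not is_webapp_url('http://localhost:9981/api/')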