BaseDocumentationSpider: Don't crawl webapp.

This commit is contained in:
Aman Agrawal 2020-10-14 09:48:28 +05:30 committed by Tim Abbott
parent 72b25553b2
commit bdc4721aee
1 changed file with 7 additions and 0 deletions

View File

@ -107,6 +107,13 @@ class BaseDocumentationSpider(scrapy.Spider):
return callback
def _make_requests(self, url: str) -> Iterator[Request]:
# These URLs are for Zulip's webapp, which with recent changes
# can be accessible without logging into an account. While we do
# crawl documentation served by the webapp (e.g. /help/), we
# don't want to crawl the webapp itself, so we exclude these.
if url in ['http://localhost:9981/', 'http://localhost:9981'] or url.startswith('http://localhost:9981/#') or url.startswith('http://localhost:9981#'):
return
callback: Callable[[Response], Optional[Iterator[Request]]] = self.parse
dont_filter = False
method = 'GET'