From bdc4721aee8228ffe1d43971281ce2cc82e76750 Mon Sep 17 00:00:00 2001
From: Aman Agrawal
Date: Wed, 14 Oct 2020 09:48:28 +0530
Subject: [PATCH] BaseDocumentationSpider: Don't crawl webapp.

---
 .../documentation_crawler/spiders/common/spiders.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tools/documentation_crawler/documentation_crawler/spiders/common/spiders.py b/tools/documentation_crawler/documentation_crawler/spiders/common/spiders.py
index 145199f2da..9c66f242cb 100644
--- a/tools/documentation_crawler/documentation_crawler/spiders/common/spiders.py
+++ b/tools/documentation_crawler/documentation_crawler/spiders/common/spiders.py
@@ -107,6 +107,13 @@ class BaseDocumentationSpider(scrapy.Spider):
         return callback
 
     def _make_requests(self, url: str) -> Iterator[Request]:
+        # These URLs are for Zulip's webapp, which with recent changes
+        # can be accessible without logging in to an account. While we
+        # do crawl documentation served by the webapp (e.g. /help/), we
+        # don't want to crawl the webapp itself, so we exclude these.
+        if url in ['http://localhost:9981/', 'http://localhost:9981'] or url.startswith('http://localhost:9981/#') or url.startswith('http://localhost:9981#'):
+            return
+
         callback: Callable[[Response], Optional[Iterator[Request]]] = self.parse
         dont_filter = False
         method = 'GET'
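
For context, below is a minimal standalone sketch of the exclusion check this
patch adds. The helper name `is_webapp_url` is illustrative (the patch inlines
the check directly in `_make_requests`), and the hard-coded
`http://localhost:9981` base matches the development-server URL the patch
tests against:

# Sketch of the exclusion logic above, factored into a predicate for
# illustration only; the actual patch inlines this in _make_requests.

def is_webapp_url(url: str) -> bool:
    """Return True for URLs that load the Zulip webapp itself.

    The webapp is served at the server root and at fragment URLs such
    as http://localhost:9981/#narrow/..., while documentation pages
    (e.g. /help/) live under real sub-paths and should still be
    crawled.
    """
    webapp_roots = ('http://localhost:9981/', 'http://localhost:9981')
    webapp_prefixes = ('http://localhost:9981/#', 'http://localhost:9981#')
    # str.startswith accepts a tuple of prefixes, so both fragment
    # forms (with and without the trailing slash) are covered.
    return url in webapp_roots or url.startswith(webapp_prefixes)


# Quick checks: webapp URLs are skipped, documentation URLs are not.
assert is_webapp_url('http://localhost:9981/')
assert is_webapp_url('http://localhost:9981/#narrow/stream/1')
assert not is_webapp_url('http://localhost:9981/help/')
assert not is_webapp_url('http://localhost:9981/api/')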