diff --git a/tools/documentation_crawler/documentation_crawler/spiders/common/spiders.py b/tools/documentation_crawler/documentation_crawler/spiders/common/spiders.py
index 2d0394cfac..eed40ffcc9 100644
--- a/tools/documentation_crawler/documentation_crawler/spiders/common/spiders.py
+++ b/tools/documentation_crawler/documentation_crawler/spiders/common/spiders.py
@@ -34,6 +34,7 @@ class BaseDocumentationSpider(scrapy.Spider):
     def __init__(self, *args: Any, **kwargs: Any) -> None:
         super().__init__(*args, **kwargs)
         self.has_error = False
+        self.skip_external = kwargs.get('skip_external', None)
 
     def _set_error_state(self) -> None:
         self.has_error = True
@@ -47,6 +48,18 @@ class BaseDocumentationSpider(scrapy.Spider):
     def check_existing(self, response: Any) -> None:
         self.log(response)
 
+    def _is_external_link(self, url: str) -> bool:
+        if "zulip.readthedocs" in url or "zulipchat.com" in url or "zulip.org" in url:
+            # We want CI to check any links to Zulip sites.
+            return False
+        if (len(url) > 4 and url[:4] == "file") or ("localhost" in url):
+            # We also want CI to check any links to built documentation.
+            return False
+        if 'github.com/zulip' in url:
+            # Finally, links to our own GitHub organization should always work.
+            return False
+        return True
+
     def check_permalink(self, response: Any) -> None:
         self.log(response)
         xpath_template = "//*[@id='{permalink}' or @name='{permalink}']"
@@ -74,6 +87,9 @@ class BaseDocumentationSpider(scrapy.Spider):
             elif '#' in link.url:
                 dont_filter = True
                 callback = self.check_permalink
+            if self.skip_external is not None and self._is_external_link(link.url):
+                # The skip_external flag is set, so don't check external links.
+                continue
             yield Request(link.url, method=method, callback=callback,
                           dont_filter=dont_filter, errback=self.error_callback)
 
diff --git a/tools/test-documentation b/tools/test-documentation
index 900ee7b493..a8e9c3fdd2 100755
--- a/tools/test-documentation
+++ b/tools/test-documentation
@@ -12,6 +12,8 @@ case $1 in
     -h|--help)
         echo "--help, -h show this help message and exit"
         echo "--loglevel=LEVEL, -L LEVEL log level (default: ERROR)"
+        echo "--skip-check-links skip checking of links"
+        echo "--skip-external-links skip checking of external links"
        exit 0
        ;;
     -L|--loglevel)
@@ -20,6 +22,9 @@ case $1 in
     --skip-check-links)
        skip_check_links=1
        ;;
+    --skip-external-links)
+       skip_external_links=1
+       ;;
 esac
 
 cd "$(dirname "$0")"/../docs
@@ -36,6 +41,22 @@ if [ -n "$skip_check_links" ]; then
     exit 0
 fi
 
+if [ -n "$skip_external_links" ]; then
+    color_message 94 "Testing only internal links in documentation..."
+    cd ../tools/documentation_crawler
+    set +e
+    # Invoke the crawler directly, since we need to pass the skip_external parameter.
+    scrapy crawl documentation_crawler -a skip_external=set "${loglevel[@]}"
+    result=$?
+    if [ "$result" = 1 ]; then
+        color_message 91 "Failed!"
+        exit 1
+    else
+        color_message 92 "Passed!"
+        exit 0
+    fi
+fi
+
 color_message 94 "Testing links in documentation..."
 
 cd ../tools/documentation_crawler
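
Usage sketch (not part of the patch; assumes the commands are run from the root of a zulip checkout, which the diff does not state):

    # Check only internal links, skipping external ones:
    tools/test-documentation --skip-external-links

    # Equivalent direct crawler invocation, as the script above performs;
    # Scrapy passes "-a skip_external=set" to the spider constructor as a
    # keyword argument, which is why the spider only checks "is not None":
    cd tools/documentation_crawler
    scrapy crawl documentation_crawler -a skip_external=set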