tools: Suppress errors for github.com links.

I have suppressed errors for github.com by adding support for
excluding entire domains as well as individual URLs; this is necessary
because GitHub has marked this tool's User-Agent as a blocked crawler.

I have also suppressed recurring errors for URLs that do in fact exist.

Fixes #17928.
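For illustration, a minimal standalone sketch of the exclusion logic this
commit introduces (EXCLUDED_DOMAINS, EXCLUDED_URLS, and the urlparse-based
check come from the diff below; the should_skip helper is hypothetical and
not part of the commit):

    from urllib.parse import urlparse

    EXCLUDED_DOMAINS = ["github.com", "gist.github.com", "www.amazon.com", "gitlab.com"]
    EXCLUDED_URLS = ["https://www.transifex.com/zulip/zulip/announcements/"]

    def should_skip(url: str) -> bool:
        # Exclude whole hosts: urlparse(url).netloc is the exact host of
        # the link, so subdomains of an excluded domain are NOT matched.
        if urlparse(url).netloc in EXCLUDED_DOMAINS:
            return True
        # Exclude individual URLs known to fail the crawler's checks.
        return url in EXCLUDED_URLS

    assert should_skip("https://github.com/zulip/zulip")
    assert not should_skip("https://pages.github.io/example")  # different netloc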
Adam Birds 2021-04-03 00:58:39 +00:00 committed by Tim Abbott
parent ec10a40b1c
commit 4d9f161e0f
1 changed file with 19 additions and 4 deletions

@@ -2,6 +2,7 @@ import json
 import os
 import re
 from typing import Callable, Iterator, List, Optional, Union
+from urllib.parse import urlparse
 
 import scrapy
 from scrapy.http import Request, Response
@@ -11,6 +12,15 @@ from scrapy.spidermiddlewares.httperror import HttpError
 from scrapy.utils.url import url_has_any_extension
 from twisted.python.failure import Failure
 
+EXCLUDED_DOMAINS = [
+    # Returns 429 Rate-Limited Errors
+    "github.com",
+    "gist.github.com",
+    # Returns 503 Errors
+    "www.amazon.com",
+    "gitlab.com",
+]
+
 EXCLUDED_URLS = [
     # Google calendar returns 404s on HEAD requests unconditionally
     "https://calendar.google.com/calendar/embed?src=ktiduof4eoh47lmgcl2qunnc0o@group.calendar.google.com",
@@ -19,6 +29,8 @@ EXCLUDED_URLS = [
     # Returns 404 to HEAD requests unconditionally
     "https://www.git-tower.com/blog/command-line-cheat-sheet/",
     "https://marketplace.visualstudio.com/items?itemName=rafaelmaiolla.remote-vscode",
+    "https://www.transifex.com/zulip/zulip/announcements/",
+    "https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-ssh",
     # Requires authentication
     "https://circleci.com/gh/zulip/zulip/tree/master",
     "https://circleci.com/gh/zulip/zulip/16617",
@@ -164,6 +176,10 @@ class BaseDocumentationSpider(scrapy.Spider):
             callback = self.check_fragment
         if getattr(self, "skip_external", False) and self._is_external_link(url):
             return
+        if urlparse(url).netloc in EXCLUDED_DOMAINS:
+            return
+        if url in EXCLUDED_URLS:
+            return
         yield Request(
             url,
             method=method,
@@ -204,13 +220,12 @@ class BaseDocumentationSpider(scrapy.Spider):
         request.dont_filter = True
         yield request
 
-    def exclude_error(self, url: str) -> bool:
-        return url in EXCLUDED_URLS
-
     def error_callback(self, failure: Failure) -> Optional[Union[Failure, Iterator[Request]]]:
         if isinstance(failure.value, HttpError):
             response = failure.value.response
-            if self.exclude_error(response.url):
+            # Hack: The filtering above does not catch this URL,
+            # likely due to a redirect.
+            if urlparse(response.url).netloc == "idmsa.apple.com":
                 return None
             if response.status == 405 and response.request.method == "HEAD":
                 # Method 'HEAD' not allowed, repeat request with 'GET'
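The idmsa.apple.com check above differs from the request-time filters
because, per the diff's own comment, the URL likely only surfaces after a
redirect: by the time error_callback runs, response.url holds the final URL
Scrapy landed on, not the one originally requested. A minimal sketch of the
host comparison (the signin path here is hypothetical):

    from urllib.parse import urlparse

    # response.url after redirects can differ from the requested URL,
    # so the host has to be re-checked in the error callback.
    final_url = "https://idmsa.apple.com/signin"  # hypothetical final URL
    assert urlparse(final_url).netloc == "idmsa.apple.com"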