mirror of https://github.com/zulip/zulip.git
tools: Suppress errors for github.com links.
I have suppressed errors for github.com by adding a function to exclude domains as well as URLs; this is necessary because GitHub has marked this tool's User-Agent as a blocked crawler. I have also suppressed recurring errors for URLs that definitely do exist. Fixes #17928.
This commit is contained in:
parent
ec10a40b1c
commit
4d9f161e0f
|
@ -2,6 +2,7 @@ import json
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
from typing import Callable, Iterator, List, Optional, Union
|
from typing import Callable, Iterator, List, Optional, Union
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
import scrapy
|
import scrapy
|
||||||
from scrapy.http import Request, Response
|
from scrapy.http import Request, Response
|
||||||
|
@ -11,6 +12,15 @@ from scrapy.spidermiddlewares.httperror import HttpError
|
||||||
from scrapy.utils.url import url_has_any_extension
|
from scrapy.utils.url import url_has_any_extension
|
||||||
from twisted.python.failure import Failure
|
from twisted.python.failure import Failure
|
||||||
|
|
||||||
|
EXCLUDED_DOMAINS = [
|
||||||
|
# Returns 429 Rate-Limited Errors
|
||||||
|
"github.com",
|
||||||
|
"gist.github.com",
|
||||||
|
# Returns 503 Errors
|
||||||
|
"www.amazon.com",
|
||||||
|
"gitlab.com",
|
||||||
|
]
|
||||||
|
|
||||||
EXCLUDED_URLS = [
|
EXCLUDED_URLS = [
|
||||||
# Google calendar returns 404s on HEAD requests unconditionally
|
# Google calendar returns 404s on HEAD requests unconditionally
|
||||||
"https://calendar.google.com/calendar/embed?src=ktiduof4eoh47lmgcl2qunnc0o@group.calendar.google.com",
|
"https://calendar.google.com/calendar/embed?src=ktiduof4eoh47lmgcl2qunnc0o@group.calendar.google.com",
|
||||||
|
@ -19,6 +29,8 @@ EXCLUDED_URLS = [
|
||||||
# Returns 404 to HEAD requests unconditionally
|
# Returns 404 to HEAD requests unconditionally
|
||||||
"https://www.git-tower.com/blog/command-line-cheat-sheet/",
|
"https://www.git-tower.com/blog/command-line-cheat-sheet/",
|
||||||
"https://marketplace.visualstudio.com/items?itemName=rafaelmaiolla.remote-vscode",
|
"https://marketplace.visualstudio.com/items?itemName=rafaelmaiolla.remote-vscode",
|
||||||
|
"https://www.transifex.com/zulip/zulip/announcements/",
|
||||||
|
"https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-ssh",
|
||||||
# Requires authentication
|
# Requires authentication
|
||||||
"https://circleci.com/gh/zulip/zulip/tree/master",
|
"https://circleci.com/gh/zulip/zulip/tree/master",
|
||||||
"https://circleci.com/gh/zulip/zulip/16617",
|
"https://circleci.com/gh/zulip/zulip/16617",
|
||||||
|
@ -164,6 +176,10 @@ class BaseDocumentationSpider(scrapy.Spider):
|
||||||
callback = self.check_fragment
|
callback = self.check_fragment
|
||||||
if getattr(self, "skip_external", False) and self._is_external_link(url):
|
if getattr(self, "skip_external", False) and self._is_external_link(url):
|
||||||
return
|
return
|
||||||
|
if urlparse(url).netloc in EXCLUDED_DOMAINS:
|
||||||
|
return
|
||||||
|
if url in EXCLUDED_URLS:
|
||||||
|
return
|
||||||
yield Request(
|
yield Request(
|
||||||
url,
|
url,
|
||||||
method=method,
|
method=method,
|
||||||
|
@ -204,13 +220,12 @@ class BaseDocumentationSpider(scrapy.Spider):
|
||||||
request.dont_filter = True
|
request.dont_filter = True
|
||||||
yield request
|
yield request
|
||||||
|
|
||||||
def exclude_error(self, url: str) -> bool:
|
|
||||||
return url in EXCLUDED_URLS
|
|
||||||
|
|
||||||
def error_callback(self, failure: Failure) -> Optional[Union[Failure, Iterator[Request]]]:
|
def error_callback(self, failure: Failure) -> Optional[Union[Failure, Iterator[Request]]]:
|
||||||
if isinstance(failure.value, HttpError):
|
if isinstance(failure.value, HttpError):
|
||||||
response = failure.value.response
|
response = failure.value.response
|
||||||
if self.exclude_error(response.url):
|
# Hack: The filtering above does not catch this URL,
|
||||||
|
# likely due to a redirect.
|
||||||
|
if urlparse(response.url).netloc == "idmsa.apple.com":
|
||||||
return None
|
return None
|
||||||
if response.status == 405 and response.request.method == "HEAD":
|
if response.status == 405 and response.request.method == "HEAD":
|
||||||
# Method 'HEAD' not allowed, repeat request with 'GET'
|
# Method 'HEAD' not allowed, repeat request with 'GET'
|
||||||
|
|
Loading…
Reference in New Issue