mirror of https://github.com/zulip/zulip.git
documentation_crawler: Improve error handling.
* Remove the custom has_error logic in favor of checking whether any
  errors were logged, which gives us a much better chance at catching
  unanticipated exceptions.

* Use our error_callback for the initial requests of start_urls too.

* Clean up mypy types.

Signed-off-by: Anders Kaseorg <anders@zulipchat.com>
This commit is contained in:
parent 4c60e994c8
commit 2f547eaee4
@@ -18,8 +18,5 @@ class StatusCommand(Command):
         crawler = self.crawler_process.create_crawler(spname)
         self.crawler_process.crawl(crawler, skip_external=skip_external)
         self.crawler_process.start()
-
-        # Get exceptions quantity from crawler stat data
-        if crawler.spider.has_error:
-            # Return non-zero exit code if exceptions are contained
+        if crawler.stats.get_value("log_count/ERROR"):
             self.exitcode = 1
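For context on the new check: Scrapy's logging setup installs a
LogCounterHandler that increments a log_count/<level> stat for every
record logged during a crawl, so log_count/ERROR is non-zero whenever
anything, including an unanticipated exception, was logged at ERROR
level. A minimal sketch of the same exit-code pattern, with a
hypothetical spider and URL:

    import scrapy
    from scrapy.crawler import CrawlerProcess

    class DemoSpider(scrapy.Spider):
        name = "demo"  # hypothetical spider, not part of this commit
        start_urls = ["https://example.com"]

        def parse(self, response):
            # Any ERROR-level record increments the log_count/ERROR stat.
            self.logger.error("something went wrong on %s", response.url)

    process = CrawlerProcess(settings={"LOG_LEVEL": "ERROR"})
    crawler = process.create_crawler(DemoSpider)
    process.crawl(crawler)
    process.start()

    # Same pattern as the command above: non-zero exit if anything errored.
    if crawler.stats.get_value("log_count/ERROR"):
        raise SystemExit(1)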
@@ -1,13 +1,14 @@
-import logging
 import re
 import scrapy
 
-from scrapy import Request
+from scrapy.http import Request, Response
 from scrapy.linkextractors import IGNORED_EXTENSIONS
 from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
+from scrapy.spidermiddlewares.httperror import HttpError
 from scrapy.utils.url import url_has_any_extension
+from twisted.python.failure import Failure
 
-from typing import Any, Generator, List, Optional
+from typing import Callable, Iterable, List, Optional, Union
 
 EXCLUDED_URLS = [
     # Google calendar returns 404s on HEAD requests unconditionally
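A note on the typing changes: a generator function is a valid
implementation of a function declared to return Iterable[Request],
since Generator is a subtype of Iterable, so widening the signatures
from Generator[Request, None, None] loses nothing while reading more
simply. A tiny illustration with a hypothetical function:

    from typing import Iterable

    def make_numbers() -> Iterable[int]:
        # mypy accepts a generator body behind an Iterable return type.
        yield 1
        yield 2

    print(sum(make_numbers()))  # callers only need to iterate: 3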
@@ -34,21 +35,13 @@ class BaseDocumentationSpider(scrapy.Spider):
     tags = ('a', 'area', 'img')
     attrs = ('href', 'src')
 
-    def __init__(self, *args: Any, **kwargs: Any) -> None:
-        super().__init__(*args, **kwargs)
-        self.has_error = False
-        self.skip_external = kwargs.get('skip_external', None)
-
-    def _set_error_state(self) -> None:
-        self.has_error = True
-
     def _has_extension(self, url: str) -> bool:
         return url_has_any_extension(url, self.file_extensions)
 
     def _is_external_url(self, url: str) -> bool:
         return url.startswith('http') or self._has_extension(url)
 
-    def check_existing(self, response: Any) -> None:
+    def check_existing(self, response: Response) -> None:
         self.log(response)
 
     def _is_external_link(self, url: str) -> bool:
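Dropping the custom __init__ is safe because scrapy.Spider.__init__
copies unrecognized keyword arguments onto the instance
(self.__dict__.update(kwargs)), and the rewritten code below reads the
flag with getattr(self, 'skip_external', False), so a spider created
without the argument still works. A sketch with a hypothetical spider:

    import scrapy

    class DemoSpider(scrapy.Spider):
        name = "demo"  # hypothetical

    # No custom constructor needed: Spider.__init__ stores the kwarg.
    spider = DemoSpider(skip_external=True)
    print(getattr(spider, 'skip_external', False))  # True
    print(getattr(DemoSpider(), 'skip_external', False))  # False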
@@ -63,7 +56,7 @@ class BaseDocumentationSpider(scrapy.Spider):
             return False
         return True
 
-    def check_permalink(self, response: Any) -> None:
+    def check_permalink(self, response: Response) -> None:
         self.log(response)
         xpath_template = "//*[@id='{permalink}' or @name='{permalink}']"
         m = re.match(r".+\#(?P<permalink>.*)$", response.request.url)  # Get anchor value.
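The permalink check continued below works in two steps: a regex pulls
the #fragment off the request URL, and an XPath query then looks for an
element with a matching id or name attribute. A standalone sketch with
a hypothetical URL and page body:

    import re
    from scrapy.http import HtmlResponse

    xpath_template = "//*[@id='{permalink}' or @name='{permalink}']"
    url = "https://example.com/docs#install"  # hypothetical
    m = re.match(r".+\#(?P<permalink>.*)$", url)
    assert m is not None
    permalink = m.group('permalink')  # 'install'

    body = b"<html><body><h1 id='install'>Install</h1></body></html>"
    response = HtmlResponse(url=url, body=body)
    # A non-empty XPath result means the anchor target exists on the page.
    print(bool(response.selector.xpath(xpath_template.format(permalink=permalink))))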
@@ -72,55 +65,51 @@ class BaseDocumentationSpider(scrapy.Spider):
         permalink = m.group('permalink')
         # Check permalink existing on response page.
         if not response.selector.xpath(xpath_template.format(permalink=permalink)):
-            self._set_error_state()
-            raise Exception(
-                "Permalink #{} is not found on page {}".format(permalink, response.request.url))
+            self.logger.error(
+                "Permalink #%s is not found on page %s", permalink, response.request.url)
+
+    def _make_requests(self, url: str) -> Iterable[Request]:
+        callback = self.parse  # type: Callable[[Response], Optional[Iterable[Request]]]
+        dont_filter = False
+        method = 'GET'
+        if self._is_external_url(url):
+            callback = self.check_existing
+            method = 'HEAD'
+        elif '#' in url:
+            dont_filter = True
+            callback = self.check_permalink
+        if getattr(self, 'skip_external', False) and self._is_external_link(url):
+            return
+        yield Request(url, method=method, callback=callback, dont_filter=dont_filter,
+                      errback=self.error_callback)
+
+    def start_requests(self) -> Iterable[Request]:
+        for url in self.start_urls:
+            yield from self._make_requests(url)
 
-    def parse(self, response: Any) -> Generator[Request, None, None]:
+    def parse(self, response: Response) -> Iterable[Request]:
         self.log(response)
         for link in LxmlLinkExtractor(deny_domains=self.deny_domains, deny_extensions=['doc'],
                                       tags=self.tags, attrs=self.attrs, deny=self.deny,
                                       canonicalize=False).extract_links(response):
-            callback = self.parse  # type: Any
-            dont_filter = False
-            method = 'GET'
-            if self._is_external_url(link.url):
-                callback = self.check_existing
-                method = 'HEAD'
-            elif '#' in link.url:
-                dont_filter = True
-                callback = self.check_permalink
-            if self.skip_external:
-                if (self._is_external_link(link.url)):
-                    continue
-            yield Request(link.url, method=method, callback=callback, dont_filter=dont_filter,
-                          errback=self.error_callback)
+            yield from self._make_requests(link.url)
 
-    def retry_request_with_get(self, request: Request) -> Generator[Request, None, None]:
+    def retry_request_with_get(self, request: Request) -> Iterable[Request]:
         request.method = 'GET'
         request.dont_filter = True
         yield request
 
     def exclude_error(self, url: str) -> bool:
-        if url in EXCLUDED_URLS:
-            return True
-        return False
+        return url in EXCLUDED_URLS
 
-    def error_callback(self, failure: Any) -> Optional[Generator[Any, None, None]]:
-        if hasattr(failure.value, 'response') and failure.value.response:
+    def error_callback(self, failure: Failure) -> Optional[Union[Failure, Iterable[Request]]]:
+        if failure.check(HttpError):
             response = failure.value.response
             if self.exclude_error(response.url):
                 return None
-            if response.status == 404:
-                self._set_error_state()
-                raise Exception('Page not found: {}'.format(response))
             if response.status == 405 and response.request.method == 'HEAD':
                 # Method 'HEAD' not allowed, repeat request with 'GET'
                 return self.retry_request_with_get(response.request)
-            self.log("Error! Please check link: {}".format(response), logging.ERROR)
-        elif isinstance(failure.type, IOError):
-            self._set_error_state()
-        else:
-            self._set_error_state()
-            raise Exception(failure.value)
-        return None
+            self.logger.error("Please check link: %s", response)
+
+        return failure
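The reworked error_callback leans on two behaviors: Failure.check
returns a truthy value when the wrapped exception matches one of the
given types (here HttpError, which carries the offending response), and
returning the Failure from an errback, instead of swallowing it with
return None, propagates it so Scrapy logs it at ERROR level, which is
exactly what the log_count/ERROR exit check in the crawl command picks
up. An annotated sketch of that contract (a spider method, not the
commit's exact code):

    from typing import Iterable, Optional, Union

    from scrapy.http import Request
    from scrapy.spidermiddlewares.httperror import HttpError
    from twisted.python.failure import Failure

    def error_callback(self, failure: Failure) -> Optional[Union[Failure, Iterable[Request]]]:
        if failure.check(HttpError):           # an HTTP error status?
            response = failure.value.response  # HttpError keeps the response
            if response.status == 405 and response.request.method == 'HEAD':
                # Recover by reissuing the request as GET.
                return self.retry_request_with_get(response.request)
            self.logger.error("Please check link: %s", response)
        # Unhandled failures go back to Scrapy, which logs them and
        # thereby increments log_count/ERROR.
        return failure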