diff --git a/requirements/scrapy.txt b/requirements/scrapy.txt
new file mode 100644
index 0000000000..0b564e9515
--- /dev/null
+++ b/requirements/scrapy.txt
@@ -0,0 +1,9 @@
+Scrapy==1.2.0
+cryptography==1.5.2
+parsel==1.0.3
+pyOpenSSL==16.2.0
+w3lib==1.15.0
+PyDispatcher==2.0.5
+queuelib==1.4.2
+cffi==1.8.3
+pycparser==2.16
diff --git a/requirements/twisted.txt b/requirements/twisted.txt
index cba1b324d1..662aa38cc2 100644
--- a/requirements/twisted.txt
+++ b/requirements/twisted.txt
@@ -2,3 +2,6 @@ Twisted==16.4.1
 zope.interface==4.3.2
 service-identity==16.0.0
 attrs==16.2.0
+
+# Needed for the documentation links test
+-r scrapy.txt
diff --git a/tools/documentation_crawler/__init__.py b/tools/documentation_crawler/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tools/documentation_crawler/documentation_crawler/__init__.py b/tools/documentation_crawler/documentation_crawler/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tools/documentation_crawler/documentation_crawler/commands/__init__.py b/tools/documentation_crawler/documentation_crawler/commands/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tools/documentation_crawler/documentation_crawler/commands/crawl_with_status.py b/tools/documentation_crawler/documentation_crawler/commands/crawl_with_status.py
new file mode 100644
index 0000000000..99227f9e38
--- /dev/null
+++ b/tools/documentation_crawler/documentation_crawler/commands/crawl_with_status.py
@@ -0,0 +1,23 @@
+from scrapy.commands.crawl import Command
+from scrapy.exceptions import UsageError
+from typing import List, Any
+
+
+class StatusCommand(Command):
+    def run(self, args, opts):
+        # type: (List[str], Any) -> None
+        if len(args) < 1:
+            raise UsageError()
+        elif len(args) > 1:
+            raise UsageError(
+                "running 'scrapy crawl' with more than one spider is no longer supported")
+        spname = args[0]
+
+        crawler = self.crawler_process.create_crawler(spname)
+        self.crawler_process.crawl(crawler)
+        self.crawler_process.start()
+        # Get the number of exceptions from the crawler stats.
+        stats = crawler.stats.get_stats()
+        if stats.get('spider_exceptions/Exception') or stats.get('downloader/exception_count'):
+            # Return a non-zero exit code if any exceptions occurred.
+            self.exitcode = 1
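Scrapy looks up custom commands in the module named by COMMANDS_MODULE (set in settings.py below) and exposes each one under its module name, so the class above should be reachable as `crawl_with_status`. Below is a minimal sketch of how a test wrapper might consume the exit code set by StatusCommand; the wrapper itself, its location, and the working directory are assumptions for illustration, not part of this change:

    #!/usr/bin/env python
    # Hypothetical wrapper (not part of this diff): run the custom command from
    # the crawler project directory and propagate StatusCommand.exitcode.
    from __future__ import print_function
    import subprocess
    import sys

    def check_documentation_links():
        # type: () -> int
        # scrapy.cfg lives in tools/documentation_crawler, so run from there.
        return subprocess.call(['scrapy', 'crawl_with_status', 'documentation_crawler'],
                               cwd='tools/documentation_crawler')

    if __name__ == '__main__':
        sys.exit(check_documentation_links())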
diff --git a/tools/documentation_crawler/documentation_crawler/settings.py b/tools/documentation_crawler/documentation_crawler/settings.py
new file mode 100644
index 0000000000..c396a6d1ef
--- /dev/null
+++ b/tools/documentation_crawler/documentation_crawler/settings.py
@@ -0,0 +1,92 @@
+# -*- coding: utf-8 -*-
+
+# Scrapy settings for documentation_crawler project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     http://doc.scrapy.org/en/latest/topics/settings.html
+#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = 'documentation_crawler'
+
+SPIDER_MODULES = ['documentation_crawler.spiders']
+NEWSPIDER_MODULE = 'documentation_crawler.spiders'
+COMMANDS_MODULE = 'documentation_crawler.commands'
+LOG_LEVEL = 'ERROR'
+
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.59 Safari/537.36'
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = False
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+#DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+#COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+
+# Enable or disable spider middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'documentation_crawler.middlewares.MyCustomSpiderMiddleware': 543,
+#}
+
+# Enable or disable downloader middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'documentation_crawler.middlewares.MyCustomDownloaderMiddleware': 543,
+#}
+
+# Enable or disable extensions
+# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+
+# Configure item pipelines
+# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
+#ITEM_PIPELINES = {
+#    'documentation_crawler.pipelines.SomePipeline': 300,
+#}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
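The settings above mostly keep the stock Scrapy project template; the lines that matter here are COMMANDS_MODULE (which registers crawl_with_status), LOG_LEVEL = 'ERROR', and ROBOTSTXT_OBEY = False. As a quick sanity check that the module is actually picked up (a sketch only, not part of this diff; it must be run from tools/documentation_crawler so that scrapy.cfg is found):

    # Sketch: print a few resolved project settings.
    from __future__ import print_function
    from scrapy.utils.project import get_project_settings

    settings = get_project_settings()
    print(settings.get('BOT_NAME'))         # documentation_crawler
    print(settings.get('COMMANDS_MODULE'))  # documentation_crawler.commands
    print(settings.get('LOG_LEVEL'))        # ERROR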
diff --git a/tools/documentation_crawler/documentation_crawler/spiders/__init__.py b/tools/documentation_crawler/documentation_crawler/spiders/__init__.py
new file mode 100644
index 0000000000..ebd689ac51
--- /dev/null
+++ b/tools/documentation_crawler/documentation_crawler/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/tools/documentation_crawler/documentation_crawler/spiders/check_documentation.py b/tools/documentation_crawler/documentation_crawler/spiders/check_documentation.py
new file mode 100755
index 0000000000..6b60c2bbe3
--- /dev/null
+++ b/tools/documentation_crawler/documentation_crawler/spiders/check_documentation.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+from __future__ import print_function
+import os
+import re
+import scrapy
+import pathlib2
+from scrapy import Request
+from scrapy.linkextractors import IGNORED_EXTENSIONS
+from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
+from scrapy.utils.url import url_has_any_extension
+from typing import Any, Callable, Generator, List, Optional
+
+
+def get_start_url():
+    # type: () -> List[str]
+    # Get the index HTML file as the start URL and convert it to a file URI.
+    dir_path = os.path.dirname(os.path.realpath(__file__))
+    start_file = os.path.join(dir_path, os.path.join(*[os.pardir] * 4),
+                              "docs/_build/html/index.html")
+    return [
+        pathlib2.Path(os.path.abspath(start_file)).as_uri()
+    ]
+
+
+class DocumentationSpider(scrapy.Spider):
+    name = "documentation_crawler"
+    deny_domains = ['localhost:9991']  # Domains excluded from the crawl.
+    start_urls = get_start_url()
+    file_extensions = ['.' + ext for ext in IGNORED_EXTENSIONS]
+
+    def _has_extension(self, url):
+        # type: (str) -> bool
+        return url_has_any_extension(url, self.file_extensions)
+
+    def check_existing(self, response):
+        # type: (Any) -> None
+        self.log(response)
+
+    def check_permalink(self, response):
+        # type: (Any) -> None
+        self.log(response)
+        xpath_template = "//*[@id='{permalink}' or @name='{permalink}']"
+        m = re.match(r".+\#(?P<permalink>.*)$", response.request.url)  # Get the anchor value.
+        if not m:
+            return
+        permalink = m.group('permalink')
+        # Check that the permalink exists on the response page.
+        if not response.selector.xpath(xpath_template.format(permalink=permalink)):
+            raise Exception(
+                "Permalink #{} is not found on page {}".format(permalink, response.request.url))
+
+    def parse(self, response):
+        # type: (Any) -> Generator[Request, None, None]
+        self.log(response)
+        for link in LxmlLinkExtractor(deny_domains=self.deny_domains, deny_extensions=[],
+                                      deny='\_sources\/.*\.txt',
+                                      canonicalize=False).extract_links(response):
+            callback = self.parse  # type: Any
+            dont_filter = False
+            method = 'GET'
+            if link.url.startswith('http') or self._has_extension(link.url):
+                callback = self.check_existing
+                method = 'HEAD'
+            elif '#' in link.url:
+                dont_filter = True
+                callback = self.check_permalink
+            yield Request(link.url, method=method, callback=callback, dont_filter=dont_filter,
+                          errback=self.error_callback)
+
+    def retry_request_with_get(self, request):
+        # type: (Request) -> Generator[Request, None, None]
+        request.method = 'GET'
+        request.dont_filter = True
+        yield request
+
+    def error_callback(self, failure):
+        # type: (Any) -> Optional[Generator[Any, None, None]]
+        if hasattr(failure.value, 'response'):
+            response = failure.value.response
+            if response.status == 405 and response.request.method == 'HEAD':
+                # HEAD is not allowed; repeat the request with GET.
+                return self.retry_request_with_get(response.request)
+            raise Exception(failure.value.response)
+        else:
+            raise Exception(failure.value)
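For anchor links, check_permalink extracts the URL fragment with a regular expression and then looks for an element whose id or name attribute matches it. The same logic can be illustrated standalone with parsel (pinned in requirements/scrapy.txt); the URL and HTML below are made-up examples, not from this change:

    # Standalone illustration of the fragment check in
    # DocumentationSpider.check_permalink, using parsel directly.
    from __future__ import print_function
    import re
    from parsel import Selector

    url = 'file:///tmp/docs/index.html#installation'  # made-up example URL
    html = '<html><body><h1 id="installation">Install</h1></body></html>'

    m = re.match(r".+\#(?P<permalink>.*)$", url)
    if m:
        permalink = m.group('permalink')
        xpath = "//*[@id='{permalink}' or @name='{permalink}']".format(permalink=permalink)
        print(bool(Selector(text=html).xpath(xpath)))  # True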
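Note that get_start_url() climbs four directory levels from the spiders package up to the repository root and points at docs/_build/html/index.html, so the Sphinx output must be built before running the crawl. A small sketch of the same file-URI conversion (the path below is only an example, not taken from this change):

    # Sketch: convert a built docs page into the file:// URL form that
    # DocumentationSpider uses as its start URL.
    from __future__ import print_function
    import os
    import pathlib2

    start_file = os.path.abspath('docs/_build/html/index.html')  # example path
    print(pathlib2.Path(start_file).as_uri())
    # e.g. file:///path/to/repo/docs/_build/html/index.html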