From 2d50dcf7cceeea3ad326943329c082a23e3b827d Mon Sep 17 00:00:00 2001
From: Anders Kaseorg
Date: Mon, 22 Apr 2019 20:51:14 -0700
Subject: [PATCH] test-help-documentation: Validate HTML with vnu.jar.

The VNU_IGNORE whitelist lets in some crazy-invalid preexisting HTML,
but hopefully this will stop the problem from getting much larger.

Signed-off-by: Anders Kaseorg
---
 .../spiders/common/spiders.py    | 41 +++++++++++++++++++
 tools/test-help-documentation    | 26 +++++++++---
 2 files changed, 62 insertions(+), 5 deletions(-)

diff --git a/tools/documentation_crawler/documentation_crawler/spiders/common/spiders.py b/tools/documentation_crawler/documentation_crawler/spiders/common/spiders.py
index 81b10e23b4..fe8a18a48f 100644
--- a/tools/documentation_crawler/documentation_crawler/spiders/common/spiders.py
+++ b/tools/documentation_crawler/documentation_crawler/spiders/common/spiders.py
@@ -1,3 +1,4 @@
+import json
 import re
 import scrapy
 
@@ -24,6 +25,17 @@ EXCLUDED_URLS = [
     'https://www.linkedin.com/company/zulip-project',
 ]
 
+VNU_IGNORE = re.compile(r'|'.join([
+    # Real errors that should be fixed.
+    r'Duplicate ID “[^”]*”\.',
+    r'The first occurrence of ID “[^”]*” was here\.',
+    r'Attribute “markdown” not allowed on element “div” at this point\.',
+    r'No “p” element in scope but a “p” end tag seen\.',
+
+    # Warnings that are probably less important.
+    r'The “type” attribute is unnecessary for JavaScript resources\.',
+]))
+
 
 class BaseDocumentationSpider(scrapy.Spider):
     name = None  # type: Optional[str]
@@ -68,6 +80,24 @@ class BaseDocumentationSpider(scrapy.Spider):
             self.logger.error(
                 "Fragment #%s is not found on page %s", fragment, response.request.url)
 
+    def _vnu_callback(self, url: str) -> Callable[[Response], None]:
+        def callback(response: Response) -> None:
+            vnu_out = json.loads(response.text)
+            for message in vnu_out['messages']:
+                if not VNU_IGNORE.fullmatch(message['message']):
+                    self.logger.error(
+                        '"%s":%d.%d-%d.%d: %s: %s',
+                        url,
+                        message.get('firstLine', message['lastLine']),
+                        message.get('firstColumn', message['lastColumn']),
+                        message['lastLine'],
+                        message['lastColumn'],
+                        message['type'],
+                        message['message'],
+                    )
+
+        return callback
+
     def _make_requests(self, url: str) -> Iterable[Request]:
         callback = self.parse  # type: Callable[[Response], Optional[Iterable[Request]]]
         dont_filter = False
@@ -89,6 +119,17 @@ class BaseDocumentationSpider(scrapy.Spider):
 
     def parse(self, response: Response) -> Iterable[Request]:
         self.log(response)
+
+        if getattr(self, 'validate_html', False):
+            yield Request(
+                'http://localhost:9988/?out=json',
+                method='POST',
+                headers={'Content-Type': response.headers['Content-Type']},
+                body=response.body,
+                callback=self._vnu_callback(response.url),
+                errback=self.error_callback,
+            )
+
         for link in LxmlLinkExtractor(deny_domains=self.deny_domains, deny_extensions=['doc'],
                                       tags=self.tags, attrs=self.attrs, deny=self.deny,
                                       canonicalize=False).extract_links(response):
diff --git a/tools/test-help-documentation b/tools/test-help-documentation
index 134c6f789b..5227ec0049 100755
--- a/tools/test-help-documentation
+++ b/tools/test-help-documentation
@@ -1,9 +1,10 @@
 #!/usr/bin/env python3
 import argparse
+import contextlib
 import os
 import sys
 import subprocess
-from typing import List
+from typing import Iterator
 
 ZULIP_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 
@@ -30,13 +31,28 @@
 os.makedirs('var/help-documentation', exist_ok=True)
 
 LOG_FILE = 'var/help-documentation/server.log'
 external_host = "localhost:9981"
-extra_args = []  # type: List[str]
+extra_args = ['-a', 'validate_html=set']
 if options.skip_external_link_check:
-    extra_args = ['-a', 'skip_external=set']
+    extra_args += ['-a', 'skip_external=set']
 
-with test_server_running(options.force, external_host, log_file=LOG_FILE,
-                         dots=True, use_db=True):
+@contextlib.contextmanager
+def vnu_servlet() -> Iterator[None]:
+    with subprocess.Popen([
+        'java', '-cp',
+        os.path.join(
+            os.path.dirname(__file__),
+            '../node_modules/vnu-jar/build/dist/vnu.jar',
+        ),
+        'nu.validator.servlet.Main',
+        '9988',
+    ]) as proc:
+        yield
+        proc.terminate()
+
+with vnu_servlet(), \
+        test_server_running(options.force, external_host, log_file=LOG_FILE,
+                            dots=True, use_db=True):
     ret_help_doc = subprocess.call(['scrapy', 'crawl_with_status'] + extra_args
                                    + ['help_documentation_crawler'],
                                    cwd='tools/documentation_crawler')
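
For reference, the check that parse() now schedules for each crawled page
can also be reproduced by hand against the same servlet. The sketch below
is only illustrative: it assumes the vnu.jar servlet started by
vnu_servlet() is already listening on localhost:9988, and the check_html
helper and sample page are not part of the patch.

    import json
    import urllib.request

    def check_html(html: bytes) -> None:
        # POST the page body to the vnu.jar servlet, as parse() does,
        # and request machine-readable JSON output.
        request = urllib.request.Request(
            'http://localhost:9988/?out=json',
            data=html,
            headers={'Content-Type': 'text/html; charset=utf-8'},
            method='POST',
        )
        with urllib.request.urlopen(request) as response:
            vnu_out = json.loads(response.read().decode('utf-8'))

        # Each message carries the fields _vnu_callback reads: 'type',
        # 'message', and first/last line and column positions.
        for message in vnu_out['messages']:
            print('%d.%d-%d.%d: %s: %s' % (
                message.get('firstLine', message['lastLine']),
                message.get('firstColumn', message['lastColumn']),
                message['lastLine'],
                message['lastColumn'],
                message['type'],
                message['message'],
            ))

    # Example: a duplicate ID, one of the preexisting errors that
    # VNU_IGNORE whitelists for the crawler but that this standalone
    # check will still print.
    check_html(b'<!DOCTYPE html><html lang="en"><head><title>t</title></head>'
               b'<body><p id="x"></p><p id="x"></p></body></html>')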