mirror of https://github.com/zulip/zulip.git
python: Clean up janky URL matching code with urlsplit.
Signed-off-by: Anders Kaseorg <anders@zulip.com>
parent 1e96c6e9a2
commit 8f7a7877fe
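The hunks below replace ad-hoc substring and prefix tests on raw URL strings with structured comparisons against urlsplit results. A minimal sketch of why substring matching is janky, using hypothetical URLs that are not part of the commit:

    from urllib.parse import urlsplit

    def is_zulip_site_naive(url: str) -> bool:
        # Substring matching accepts the token anywhere in the URL,
        # including the query string or an unrelated hostname.
        return "zulip.com" in url

    def is_zulip_site(url: str) -> bool:
        # Compare only the parsed hostname; prepending "." lets a single
        # endswith() accept both the bare domain and its subdomains,
        # mirroring the spider's new check below.
        hostname = urlsplit(url).hostname
        return hostname is not None and f".{hostname}".endswith(".zulip.com")

    assert is_zulip_site_naive("https://evil.example/?goto=zulip.com")  # false positive
    assert not is_zulip_site("https://evil.example/?goto=zulip.com")
    assert is_zulip_site("https://zulip.com/features/")
    assert is_zulip_site("https://chat.zulip.com/")
    assert not is_zulip_site("https://notzulip.com/")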
@@ -2,7 +2,7 @@ import json
 import os
 import re
 from typing import Callable, Iterator, List, Optional, Union
-from urllib.parse import urlparse
+from urllib.parse import urlparse, urlsplit
 
 import scrapy
 from scrapy.http import Request, Response
@@ -56,8 +56,8 @@ VNU_IGNORE_REGEX = re.compile(r"|".join(VNU_IGNORE))
 
 DEPLOY_ROOT = os.path.abspath(os.path.join(__file__, "../../../../../.."))
 
-ZULIP_SERVER_GITHUB_FILE_URL_PREFIX = "https://github.com/zulip/zulip/blob/main"
-ZULIP_SERVER_GITHUB_DIRECTORY_URL_PREFIX = "https://github.com/zulip/zulip/tree/main"
+ZULIP_SERVER_GITHUB_FILE_PATH_PREFIX = "/zulip/zulip/blob/main"
+ZULIP_SERVER_GITHUB_DIRECTORY_PATH_PREFIX = "/zulip/zulip/tree/main"
 
 
 class BaseDocumentationSpider(scrapy.Spider):
@@ -80,24 +80,30 @@ class BaseDocumentationSpider(scrapy.Spider):
         self.log(response)
 
     def _is_external_link(self, url: str) -> bool:
-        if url.startswith("https://chat.zulip.org"):
+        split_url = urlsplit(url)
+        if split_url.hostname == "chat.zulip.org":
             # Since most chat.zulip.org URLs will be links to specific
             # logged-in content that the spider cannot verify, or the
             # homepage, there's no need to check those (which can
             # cause errors when chat.zulip.org is being updated).
             return True
-        if "zulip.readthedocs" in url or "zulip.com" in url or "zulip.org" in url:
+        if split_url.hostname == "zulip.readthedocs.io" or f".{split_url.hostname}".endswith(
+            (".zulip.com", ".zulip.org")
+        ):
             # We want CI to check any links to Zulip sites.
             return False
-        if (len(url) > 4 and url[:4] == "file") or ("localhost" in url):
+        if split_url.scheme == "file" or split_url.hostname == "localhost":
             # We also want CI to check any links to built documentation.
             return False
-        if url.startswith(ZULIP_SERVER_GITHUB_FILE_URL_PREFIX) or url.startswith(
-            ZULIP_SERVER_GITHUB_DIRECTORY_URL_PREFIX
+        if split_url.hostname == "github.com" and f"{split_url.path}/".startswith(
+            (
+                f"{ZULIP_SERVER_GITHUB_FILE_PATH_PREFIX}/",
+                f"{ZULIP_SERVER_GITHUB_DIRECTORY_PATH_PREFIX}/",
+            )
         ):
             # We can verify these links directly in the local Git repo without making any requests to GitHub servers.
             return False
-        if "github.com/zulip" in url:
+        if split_url.hostname == "github.com" and split_url.path.startswith("/zulip/"):
             # We want to check these links but due to rate limiting from GitHub, these checks often
             # fail in the CI. Thus, we should treat these as external links for now.
             # TODO: Figure out how to test github.com/zulip links in CI.
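A note on the f"{split_url.path}/".startswith(...) pattern introduced above: appending "/" to both the path and the prefix restricts the match to whole path components. A small sketch (the README path is a hypothetical example):

    FILE_PATH_PREFIX = "/zulip/zulip/blob/main"

    def matches_naive(path: str) -> bool:
        # A bare prefix test also matches sibling paths such as ".../main2".
        return path.startswith(FILE_PATH_PREFIX)

    def matches_component_wise(path: str) -> bool:
        # Appending "/" to both sides matches only whole path components.
        return f"{path}/".startswith(f"{FILE_PATH_PREFIX}/")

    assert matches_naive("/zulip/zulip/blob/main2/README.md")  # false positive
    assert not matches_component_wise("/zulip/zulip/blob/main2/README.md")
    assert matches_component_wise("/zulip/zulip/blob/main/README.md")
    assert matches_component_wise("/zulip/zulip/blob/main")  # the bare prefix itself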
@@ -107,10 +113,7 @@ class BaseDocumentationSpider(scrapy.Spider):
     def check_fragment(self, response: Response) -> None:
         self.log(response)
         xpath_template = "//*[@id='{fragment}' or @name='{fragment}']"
-        m = re.match(r".+\#(?P<fragment>.*)$", response.request.url)  # Get fragment value.
-        if not m:
-            return
-        fragment = m.group("fragment")
+        fragment = urlsplit(response.request.url).fragment
         # Check fragment existing on response page.
         if not response.selector.xpath(xpath_template.format(fragment=fragment)):
             self.logger.error(
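urlsplit exposes the fragment directly, which is what lets this hunk drop the regex and its no-match branch; a quick sketch with a hypothetical documentation URL:

    from urllib.parse import urlsplit

    assert urlsplit("http://localhost:9981/help/#getting-started").fragment == "getting-started"
    # A URL without a fragment yields "" rather than a non-match,
    # so there is no None case to handle.
    assert urlsplit("http://localhost:9981/help/").fragment == ""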
@@ -141,11 +144,8 @@ class BaseDocumentationSpider(scrapy.Spider):
         # do crawl documentation served by the web app (e.g. /help/),
         # we don't want to crawl the web app itself, so we exclude
         # these.
-        if (
-            url in ["http://localhost:9981/", "http://localhost:9981"]
-            or url.startswith("http://localhost:9981/#")
-            or url.startswith("http://localhost:9981#")
-        ):
+        split_url = urlsplit(url)
+        if split_url.netloc == "localhost:9981" and split_url.path in ["", "/"]:
             return
 
         # This page has some invisible to the user anchor links like #all
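Worth noting in the hunk above: netloc keeps the port while hostname strips it, so the netloc comparison pins both host and port, and checking path in ["", "/"] collapses all four URL spellings the old code enumerated. A quick sketch:

    from urllib.parse import urlsplit

    for url in (
        "http://localhost:9981",
        "http://localhost:9981/",
        "http://localhost:9981/#settings",
        "http://localhost:9981#settings",
    ):
        split_url = urlsplit(url)
        assert split_url.netloc == "localhost:9981"  # .hostname would be just "localhost"
        assert split_url.path in ["", "/"]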
@@ -160,29 +160,34 @@ class BaseDocumentationSpider(scrapy.Spider):
         callback = self.check_existing
         method = "HEAD"
 
-        if url.startswith(ZULIP_SERVER_GITHUB_FILE_URL_PREFIX):
-            file_path = url.replace(ZULIP_SERVER_GITHUB_FILE_URL_PREFIX, DEPLOY_ROOT)
-            hash_index = file_path.find("#")
-            if hash_index != -1:
-                file_path = file_path[:hash_index]
+        if split_url.hostname == "github.com" and f"{split_url.path}/".startswith(
+            f"{ZULIP_SERVER_GITHUB_FILE_PATH_PREFIX}/"
+        ):
+            file_path = (
+                DEPLOY_ROOT + split_url.path[len(ZULIP_SERVER_GITHUB_FILE_PATH_PREFIX) :]
+            )
             if not os.path.isfile(file_path):
                 self.logger.error(
                     "There is no local file associated with the GitHub URL: %s", url
                 )
             return
-        elif url.startswith(ZULIP_SERVER_GITHUB_DIRECTORY_URL_PREFIX):
-            dir_path = url.replace(ZULIP_SERVER_GITHUB_DIRECTORY_URL_PREFIX, DEPLOY_ROOT)
+        elif split_url.hostname == "github.com" and f"{split_url.path}/".startswith(
+            f"{ZULIP_SERVER_GITHUB_DIRECTORY_PATH_PREFIX}/"
+        ):
+            dir_path = (
+                DEPLOY_ROOT + split_url.path[len(ZULIP_SERVER_GITHUB_DIRECTORY_PATH_PREFIX) :]
+            )
             if not os.path.isdir(dir_path):
                 self.logger.error(
                     "There is no local directory associated with the GitHub URL: %s", url
                 )
             return
-        elif "#" in url:
+        elif split_url.fragment != "":
             dont_filter = True
             callback = self.check_fragment
         if getattr(self, "skip_external", False) and self._is_external_link(url):
             return
-        if urlparse(url).netloc in EXCLUDED_DOMAINS:
+        if split_url.hostname in EXCLUDED_DOMAINS:
             return
         if url in EXCLUDED_URLS:
             return
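A sketch of the URL-to-path mapping after this hunk, with DEPLOY_ROOT shortened to a hypothetical stand-in (the real value is computed from __file__). Because the fragment lives outside split_url.path, the old manual "#" stripping is no longer needed:

    from urllib.parse import urlsplit

    DEPLOY_ROOT = "/srv/zulip"  # hypothetical stand-in for illustration
    ZULIP_SERVER_GITHUB_FILE_PATH_PREFIX = "/zulip/zulip/blob/main"

    url = "https://github.com/zulip/zulip/blob/main/README.md#installing"
    split_url = urlsplit(url)
    # The fragment ("installing") is not part of split_url.path.
    file_path = DEPLOY_ROOT + split_url.path[len(ZULIP_SERVER_GITHUB_FILE_PATH_PREFIX) :]
    assert file_path == "/srv/zulip/README.md"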
@@ -1,5 +1,6 @@
 import logging
 import os
+import posixpath
 import random
 import secrets
 import shutil
@@ -8,6 +9,7 @@ import zipfile
 from collections import defaultdict
 from email.headerregistry import Address
 from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Type, TypeVar
+from urllib.parse import urlsplit
 
 import orjson
 import requests
@@ -211,11 +213,15 @@ def build_realmemoji(
     emoji_url_map = {}
     emoji_id = 0
     for emoji_name, url in custom_emoji_list.items():
-        if "emoji.slack-edge.com" in url:
+        split_url = urlsplit(url)
+        if split_url.hostname == "emoji.slack-edge.com":
             # Some of the emojis we get from the API have invalid links
             # this is to prevent errors related to them
             realmemoji = RealmEmoji(
-                name=emoji_name, id=emoji_id, file_name=os.path.basename(url), deactivated=False
+                name=emoji_name,
+                id=emoji_id,
+                file_name=posixpath.basename(split_url.path),
+                deactivated=False,
             )
 
             realmemoji_dict = model_to_dict(realmemoji, exclude=["realm", "author"])
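Why posixpath.basename(split_url.path) rather than os.path.basename(url): URL paths always use forward slashes regardless of the host OS, and taking the basename of the parsed path ignores any query string. A small sketch with a hypothetical emoji URL:

    import os.path
    import posixpath
    from urllib.parse import urlsplit

    url = "https://emoji.slack-edge.com/T012345/party/abc123.png?ver=1"  # hypothetical
    split_url = urlsplit(url)
    assert os.path.basename(url) == "abc123.png?ver=1"  # on POSIX; keeps the query string
    assert posixpath.basename(split_url.path) == "abc123.png"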
@@ -1040,8 +1046,9 @@ def process_message_files(
             continue
 
         url = fileinfo["url_private"]
+        split_url = urlsplit(url)
 
-        if "files.slack.com" in url:
+        if split_url.hostname == "files.slack.com":
             # For attachments with Slack download link
             has_attachment = True
             has_link = True
@@ -768,7 +768,7 @@ def apply_event(
         if event["op"] == "add":
             person = copy.deepcopy(person)
             if client_gravatar:
-                if person["avatar_url"].startswith("https://secure.gravatar.com"):
+                if person["avatar_url"].startswith("https://secure.gravatar.com/"):
                     person["avatar_url"] = None
             person["is_active"] = True
             if not person["is_bot"]:
@@ -859,7 +859,7 @@ def apply_event(
 
             if client_gravatar and "avatar_url" in person:
                 # Respect the client_gravatar setting in the `users` data.
-                if person["avatar_url"].startswith("https://secure.gravatar.com"):
+                if person["avatar_url"].startswith("https://secure.gravatar.com/"):
                     person["avatar_url"] = None
                     person["avatar_url_medium"] = None
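The trailing slash added in both hunks above matters for the usual substring-versus-structure reason: without it, the prefix test also accepts URLs on a different host that merely begins with the same characters. A minimal sketch with a hypothetical look-alike host:

    url = "https://secure.gravatar.com.evil.example/avatar/123"  # hypothetical
    assert url.startswith("https://secure.gravatar.com")  # false positive
    assert not url.startswith("https://secure.gravatar.com/")  # correctly rejected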
@@ -1,6 +1,7 @@
 import time
 from typing import Any, Callable, Dict, List, Mapping, Optional
 from unittest import mock
+from urllib.parse import urlsplit
 
 import orjson
 from django.conf import settings
@@ -458,7 +459,7 @@ class GetEventsTest(ZulipTestCase):
         message = get_message(apply_markdown=False, client_gravatar=False)
         self.assertEqual(message["display_recipient"], "Denmark")
         self.assertEqual(message["content"], "**hello**")
-        self.assertTrue(message["avatar_url"].startswith("https://secure.gravatar.com"))
+        self.assertEqual(urlsplit(message["avatar_url"]).hostname, "secure.gravatar.com")
 
         message = get_message(apply_markdown=True, client_gravatar=False)
         self.assertEqual(message["display_recipient"], "Denmark")
@@ -693,7 +694,8 @@ class FetchInitialStateDataTest(ZulipTestCase):
         gravatar_users_id = [
             user_dict["user_id"]
             for user_dict in raw_users.values()
-            if "avatar_url" in user_dict and "gravatar.com" in user_dict["avatar_url"]
+            if "avatar_url" in user_dict
+            and urlsplit(user_dict["avatar_url"]).hostname == "secure.gravatar.com"
         ]
 
         # Test again with client_gravatar = True
@@ -1167,7 +1167,7 @@ class SlackImporter(ZulipTestCase):
         )
         files = [
             dict(
-                url_private="files.slack.com/apple.png",
+                url_private="https://files.slack.com/apple.png",
                 title="Apple",
                 name="apple.png",
                 mimetype="image/png",
@@ -1176,7 +1176,7 @@ class SlackImporter(ZulipTestCase):
                 size=3000000,
             ),
             dict(
-                url_private="example.com/banana.zip",
+                url_private="https://example.com/banana.zip",
                 title="banana",
             ),
         ]
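Why the test fixtures gain an https:// scheme in the two hunks above: without one, urlsplit treats the whole string as a path and hostname is None, so the importer's new hostname check would no longer recognize the fixture as a Slack download link. A quick sketch:

    from urllib.parse import urlsplit

    assert urlsplit("files.slack.com/apple.png").hostname is None
    assert urlsplit("files.slack.com/apple.png").path == "files.slack.com/apple.png"
    assert urlsplit("https://files.slack.com/apple.png").hostname == "files.slack.com"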
@@ -1211,7 +1211,9 @@ class SlackImporter(ZulipTestCase):
         self.assert_length(uploads_list, 1)
 
         image_path = zerver_attachment[0]["path_id"]
-        expected_content = f"[Apple](/user_uploads/{image_path})\n[banana](example.com/banana.zip)"
+        expected_content = (
+            f"[Apple](/user_uploads/{image_path})\n[banana](https://example.com/banana.zip)"
+        )
         self.assertEqual(info["content"], expected_content)
 
         self.assertTrue(info["has_link"])