From 4361ce12469682f4ea75be1b87b2df15cf55f9d9 Mon Sep 17 00:00:00 2001 From: Alex Vandiver Date: Mon, 21 Sep 2020 18:10:16 -0700 Subject: [PATCH] markdown: Use tlds package to keep updated list of TLDs. Also remove a usage of "blacklist." --- requirements/common.in | 3 + requirements/dev.txt | 4 + requirements/prod.txt | 4 + version.py | 2 +- zerver/lib/markdown/__init__.py | 13 +- zerver/lib/markdown/tlds-alpha-by-domain.txt | 318 ------------------- 6 files changed, 17 insertions(+), 327 deletions(-) delete mode 100644 zerver/lib/markdown/tlds-alpha-by-domain.txt diff --git a/requirements/common.in b/requirements/common.in index a7cae91546..896ab23762 100644 --- a/requirements/common.in +++ b/requirements/common.in @@ -186,3 +186,6 @@ openapi-core # For reporting errors to sentry.io sentry-sdk + +# For detecting URLs to link +tlds diff --git a/requirements/dev.txt b/requirements/dev.txt index 10721a8787..5743dc9b06 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -1136,6 +1136,10 @@ tinycss2==1.0.2 \ --hash=sha256:6427d0e3faa0a5e0e8c9f6437e2de26148a7a197a8b0992789f23d9a802788cf \ --hash=sha256:9fdacc0e22d344ddd2ca053837c133900fe820ae1222f63b79617490a498507a \ # via cairosvg, cssselect2 +tlds==2020092000 \ + --hash=sha256:18ee63f6376a9d15d1d9f893ecbab06713e863e3a3364832ba843ab8b1003aea \ + --hash=sha256:842a02a5c69d645149bf57e3f8620f033aa3d0cf37aab3f1ab6e9b70ffdb9c41 \ + # via -r requirements/common.in tornado==4.5.3 \ --hash=sha256:5ef073ac6180038ccf99411fe05ae9aafb675952a2c8db60592d5daf8401f803 \ --hash=sha256:6d14e47eab0e15799cf3cdcc86b0b98279da68522caace2bd7ce644287685f0a \ diff --git a/requirements/prod.txt b/requirements/prod.txt index 9783323f08..c058832a30 100644 --- a/requirements/prod.txt +++ b/requirements/prod.txt @@ -775,6 +775,10 @@ stripe==2.51.0 \ https://github.com/zulip/talon/archive/7d8bdc4dbcfcc5a73298747293b99fe53da55315.zip#egg=talon==1.2.10.zulip1 \
--hash=sha256:21d87c437379287d09df7a2d2af7bd818d4fa00be619dff446dacbdb4338d921 \ # via -r requirements/common.in +tlds==2020092000 \ + --hash=sha256:18ee63f6376a9d15d1d9f893ecbab06713e863e3a3364832ba843ab8b1003aea \ + --hash=sha256:842a02a5c69d645149bf57e3f8620f033aa3d0cf37aab3f1ab6e9b70ffdb9c41 \ + # via -r requirements/common.in tornado==4.5.3 \ --hash=sha256:5ef073ac6180038ccf99411fe05ae9aafb675952a2c8db60592d5daf8401f803 \ --hash=sha256:6d14e47eab0e15799cf3cdcc86b0b98279da68522caace2bd7ce644287685f0a \ diff --git a/version.py b/version.py index 5bad098dda..1d4bb217c5 100644 --- a/version.py +++ b/version.py @@ -44,4 +44,4 @@ API_FEATURE_LEVEL = 33 # historical commits sharing the same major version, in which case a # minor version bump suffices. -PROVISION_VERSION = '107.1' +PROVISION_VERSION = '107.2' diff --git a/zerver/lib/markdown/__init__.py b/zerver/lib/markdown/__init__.py index 132d81c62c..3a32cf3606 100644 --- a/zerver/lib/markdown/__init__.py +++ b/zerver/lib/markdown/__init__.py @@ -4,7 +4,6 @@ import datetime import functools import html import logging -import os import re import time import urllib @@ -38,6 +37,7 @@ import requests from django.conf import settings from django.db.models import Q from markdown.extensions import codehilite, nl2br, sane_lists, tables +from tlds import tld_set from typing_extensions import TypedDict from zerver.lib import mention as mention @@ -190,7 +190,7 @@ def get_web_link_regex() -> str: https?://[\w.:@-]+? # If it has a protocol, anything goes. 
|(?: # Or, if not, be more strict to avoid false-positives (?:[\w-]+\.)+ # One or more domain components, separated by dots - (?:{tlds}) # TLDs (filled in via format from tlds-alpha-by-domain.txt) + (?:{tlds}) # TLDs ) ) (?:/ # A path, beginning with / @@ -276,13 +276,10 @@ def image_preview_enabled(message: Optional[Message]=None, return realm.inline_image_preview def list_of_tlds() -> List[str]: - # HACK we manually blacklist a few domains - blacklist = ['PY\n', "MD\n"] + # Skip a few overly-common false-positives from file extensions + common_false_positives = set(['py', 'md']) + tlds = list(tld_set - common_false_positives) - # tlds-alpha-by-domain.txt comes from https://data.iana.org/TLD/tlds-alpha-by-domain.txt - tlds_file = os.path.join(os.path.dirname(__file__), 'tlds-alpha-by-domain.txt') - tlds = [tld.lower().strip() for tld in open(tlds_file) - if tld not in blacklist and not tld[0].startswith('#')] tlds.sort(key=len, reverse=True) return tlds diff --git a/zerver/lib/markdown/tlds-alpha-by-domain.txt b/zerver/lib/markdown/tlds-alpha-by-domain.txt deleted file mode 100644 index 68833edc4a..0000000000 --- a/zerver/lib/markdown/tlds-alpha-by-domain.txt +++ /dev/null @@ -1,318 +0,0 @@ -# Version 2013040200, Last Updated Tue Apr 2 07:07:01 2013 UTC -AC -AD -AE -AERO -AF -AG -AI -AL -AM -AN -AO -AQ -AR -ARPA -AS -ASIA -AT -AU -AW -AX -AZ -BA -BB -BD -BE -BF -BG -BH -BI -BIZ -BJ -BM -BN -BO -BR -BS -BT -BV -BW -BY -BZ -CA -CAT -CC -CD -CF -CG -CH -CI -CK -CL -CM -CN -CO -COM -COOP -CR -CU -CV -CW -CX -CY -CZ -DE -DJ -DK -DM -DO -DZ -EC -EDU -EE -EG -ER -ES -ET -EU -FI -FJ -FK -FM -FO -FR -GA -GB -GD -GE -GF -GG -GH -GI -GL -GM -GN -GOV -GP -GQ -GR -GS -GT -GU -GW -GY -HK -HM -HN -HR -HT -HU -ID -IE -IL -IM -IN -INFO -INT -IO -IQ -IR -IS -IT -JE -JM -JO -JOBS -JP -KE -KG -KH -KI -KM -KN -KP -KR -KW -KY -KZ -LA -LB -LC -LI -LK -LR -LS -LT -LU -LV -LY -MA -MC -MD -ME -MG -MH -MIL -MK -ML -MM -MN -MO -MOBI -MP -MQ -MR -MS -MT -MU -MUSEUM -MV -MW -MX -MY -MZ -NA 
-NAME -NC -NE -NET -NF -NG -NI -NL -NO -NP -NR -NU -NZ -OM -ORG -PA -PE -PF -PG -PH -PK -PL -PM -PN -POST -PR -PRO -PS -PT -PW -PY -QA -RE -RO -RS -RU -RW -SA -SB -SC -SD -SE -SG -SH -SI -SJ -SK -SL -SM -SN -SO -SR -ST -SU -SV -SX -SY -SZ -TC -TD -TEL -TF -TG -TH -TJ -TK -TL -TM -TN -TO -TP -TR -TRAVEL -TT -TV -TW -TZ -UA -UG -UK -US -UY -UZ -VA -VC -VE -VG -VI -VN -VU -WF -WS -XN--0ZWM56D -XN--11B5BS3A9AJ6G -XN--3E0B707E -XN--45BRJ9C -XN--80AKHBYKNJ4F -XN--80AO21A -XN--90A3AC -XN--9T4B11YI5A -XN--CLCHC0EA0B2G2A9GCD -XN--DEBA0AD -XN--FIQS8S -XN--FIQZ9S -XN--FPCRJ9C3D -XN--FZC2C9E2C -XN--G6W251D -XN--GECRJ9C -XN--H2BRJ9C -XN--HGBK6AJ7F53BBA -XN--HLCJ6AYA9ESC7A -XN--J1AMH -XN--J6W193G -XN--JXALPDLP -XN--KGBECHTV -XN--KPRW13D -XN--KPRY57D -XN--LGBBAT1AD8J -XN--MGB9AWBF -XN--MGBAAM7A8H -XN--MGBAYH7GPA -XN--MGBBH1A71E -XN--MGBC0A9AZCG -XN--MGBERP4A5D4AR -XN--MGBX4CD0AB -XN--O3CW4H -XN--OGBPF8FL -XN--P1AI -XN--PGBS0DH -XN--S9BRJ9C -XN--WGBH1C -XN--WGBL6A -XN--XKC2AL3HYE2A -XN--XKC2DL3A5EE0H -XN--YFRO4I67O -XN--YGBI2AMMX -XN--ZCKZAH -XXX -YE -YT -ZA -ZM -ZW