markdown: Use tlds package to keep updated list of TLDs.

Also remove a usage of "blacklist."
This commit is contained in:
Alex Vandiver 2020-09-21 18:10:16 -07:00 committed by Tim Abbott
parent 29459ea61f
commit 4361ce1246
6 changed files with 17 additions and 327 deletions

View File

@ -186,3 +186,6 @@ openapi-core
# For reporting errors to sentry.io
sentry-sdk
# For detecting URLs to link
tlds

View File

@ -1136,6 +1136,10 @@ tinycss2==1.0.2 \
--hash=sha256:6427d0e3faa0a5e0e8c9f6437e2de26148a7a197a8b0992789f23d9a802788cf \
--hash=sha256:9fdacc0e22d344ddd2ca053837c133900fe820ae1222f63b79617490a498507a \
# via cairosvg, cssselect2
tlds==2020092000 \
--hash=sha256:18ee63f6376a9d15d1d9f893ecbab06713e863e3a3364832ba843ab8b1003aea \
--hash=sha256:842a02a5c69d645149bf57e3f8620f033aa3d0cf37aab3f1ab6e9b70ffdb9c41 \
# via -r requirements/common.in
tornado==4.5.3 \
--hash=sha256:5ef073ac6180038ccf99411fe05ae9aafb675952a2c8db60592d5daf8401f803 \
--hash=sha256:6d14e47eab0e15799cf3cdcc86b0b98279da68522caace2bd7ce644287685f0a \

View File

@ -775,6 +775,10 @@ stripe==2.51.0 \
https://github.com/zulip/talon/archive/7d8bdc4dbcfcc5a73298747293b99fe53da55315.zip#egg=talon==1.2.10.zulip1 \
--hash=sha256:21d87c437379287d09df7a2d2af7bd818d4fa00be619dff446dacbdb4338d921 \
# via -r requirements/common.in
tlds==2020092000 \
--hash=sha256:18ee63f6376a9d15d1d9f893ecbab06713e863e3a3364832ba843ab8b1003aea \
--hash=sha256:842a02a5c69d645149bf57e3f8620f033aa3d0cf37aab3f1ab6e9b70ffdb9c41 \
# via -r requirements/common.in
tornado==4.5.3 \
--hash=sha256:5ef073ac6180038ccf99411fe05ae9aafb675952a2c8db60592d5daf8401f803 \
--hash=sha256:6d14e47eab0e15799cf3cdcc86b0b98279da68522caace2bd7ce644287685f0a \

View File

@ -44,4 +44,4 @@ API_FEATURE_LEVEL = 33
# historical commits sharing the same major version, in which case a
# minor version bump suffices.
PROVISION_VERSION = '107.1'
PROVISION_VERSION = '107.2'

View File

@ -4,7 +4,6 @@ import datetime
import functools
import html
import logging
import os
import re
import time
import urllib
@ -38,6 +37,7 @@ import requests
from django.conf import settings
from django.db.models import Q
from markdown.extensions import codehilite, nl2br, sane_lists, tables
from tlds import tld_set
from typing_extensions import TypedDict
from zerver.lib import mention as mention
@ -190,7 +190,7 @@ def get_web_link_regex() -> str:
https?://[\w.:@-]+? # If it has a protocol, anything goes.
|(?: # Or, if not, be more strict to avoid false-positives
(?:[\w-]+\.)+ # One or more domain components, separated by dots
(?:{tlds}) # TLDs (filled in via format from tlds-alpha-by-domain.txt)
(?:{tlds}) # TLDs
)
)
(?:/ # A path, beginning with /
@ -276,13 +276,10 @@ def image_preview_enabled(message: Optional[Message]=None,
return realm.inline_image_preview
def list_of_tlds() -> List[str]:
# HACK we manually blacklist a few domains
blacklist = ['PY\n', "MD\n"]
# Skip a few overly-common false-positives from file extensions
common_false_positives = set(['py', 'md'])
tlds = list(tld_set - common_false_positives)
# tlds-alpha-by-domain.txt comes from https://data.iana.org/TLD/tlds-alpha-by-domain.txt
tlds_file = os.path.join(os.path.dirname(__file__), 'tlds-alpha-by-domain.txt')
tlds = [tld.lower().strip() for tld in open(tlds_file)
if tld not in blacklist and not tld[0].startswith('#')]
tlds.sort(key=len, reverse=True)
return tlds

View File

@ -1,318 +0,0 @@
# Version 2013040200, Last Updated Tue Apr 2 07:07:01 2013 UTC
AC
AD
AE
AERO
AF
AG
AI
AL
AM
AN
AO
AQ
AR
ARPA
AS
ASIA
AT
AU
AW
AX
AZ
BA
BB
BD
BE
BF
BG
BH
BI
BIZ
BJ
BM
BN
BO
BR
BS
BT
BV
BW
BY
BZ
CA
CAT
CC
CD
CF
CG
CH
CI
CK
CL
CM
CN
CO
COM
COOP
CR
CU
CV
CW
CX
CY
CZ
DE
DJ
DK
DM
DO
DZ
EC
EDU
EE
EG
ER
ES
ET
EU
FI
FJ
FK
FM
FO
FR
GA
GB
GD
GE
GF
GG
GH
GI
GL
GM
GN
GOV
GP
GQ
GR
GS
GT
GU
GW
GY
HK
HM
HN
HR
HT
HU
ID
IE
IL
IM
IN
INFO
INT
IO
IQ
IR
IS
IT
JE
JM
JO
JOBS
JP
KE
KG
KH
KI
KM
KN
KP
KR
KW
KY
KZ
LA
LB
LC
LI
LK
LR
LS
LT
LU
LV
LY
MA
MC
MD
ME
MG
MH
MIL
MK
ML
MM
MN
MO
MOBI
MP
MQ
MR
MS
MT
MU
MUSEUM
MV
MW
MX
MY
MZ
NA
NAME
NC
NE
NET
NF
NG
NI
NL
NO
NP
NR
NU
NZ
OM
ORG
PA
PE
PF
PG
PH
PK
PL
PM
PN
POST
PR
PRO
PS
PT
PW
PY
QA
RE
RO
RS
RU
RW
SA
SB
SC
SD
SE
SG
SH
SI
SJ
SK
SL
SM
SN
SO
SR
ST
SU
SV
SX
SY
SZ
TC
TD
TEL
TF
TG
TH
TJ
TK
TL
TM
TN
TO
TP
TR
TRAVEL
TT
TV
TW
TZ
UA
UG
UK
US
UY
UZ
VA
VC
VE
VG
VI
VN
VU
WF
WS
XN--0ZWM56D
XN--11B5BS3A9AJ6G
XN--3E0B707E
XN--45BRJ9C
XN--80AKHBYKNJ4F
XN--80AO21A
XN--90A3AC
XN--9T4B11YI5A
XN--CLCHC0EA0B2G2A9GCD
XN--DEBA0AD
XN--FIQS8S
XN--FIQZ9S
XN--FPCRJ9C3D
XN--FZC2C9E2C
XN--G6W251D
XN--GECRJ9C
XN--H2BRJ9C
XN--HGBK6AJ7F53BBA
XN--HLCJ6AYA9ESC7A
XN--J1AMH
XN--J6W193G
XN--JXALPDLP
XN--KGBECHTV
XN--KPRW13D
XN--KPRY57D
XN--LGBBAT1AD8J
XN--MGB9AWBF
XN--MGBAAM7A8H
XN--MGBAYH7GPA
XN--MGBBH1A71E
XN--MGBC0A9AZCG
XN--MGBERP4A5D4AR
XN--MGBX4CD0AB
XN--O3CW4H
XN--OGBPF8FL
XN--P1AI
XN--PGBS0DH
XN--S9BRJ9C
XN--WGBH1C
XN--WGBL6A
XN--XKC2AL3HYE2A
XN--XKC2DL3A5EE0H
XN--YFRO4I67O
XN--YGBI2AMMX
XN--ZCKZAH
XXX
YE
YT
ZA
ZM
ZW