Use IANA's TLD list for auto-linkification detecting

(imported from commit 9103fdc92405b92300a793bd1d4f493df64b5b9c)
This commit is contained in:
Leo Franchi 2013-04-02 11:08:00 -04:00
parent a5643efa14
commit 0055107cfd
3 changed files with 331 additions and 2 deletions

View File

@ -22,6 +22,17 @@ from zephyr.lib.cache import cache_with_key
# messages so that we can efficiently determine what needs to be re-rendered
version = 1
def list_of_tlds():
# HACK we manually blacklist .py
blacklist = ['PY\n', ]
# tlds-alpha-by-domain.txt comes from http://data.iana.org/TLD/tlds-alpha-by-domain.txt
tlds_file = os.path.join(os.path.dirname(__file__), 'tlds-alpha-by-domain.txt')
tlds = [tld.lower().strip() for tld in open(tlds_file, 'r')
if not tld in blacklist and not tld[0].startswith('#')]
tlds.sort(key=len, reverse=True)
return tlds
def walk_tree(root, processor, stop_after_first=False):
results = []
stack = [root]
@ -432,8 +443,7 @@ class Bugdown(markdown.Extension):
#
# This rule must come after the http_autolink rule we add above to avoid double
# linkifying.
tlds = '|'.join(['co.uk', 'com', 'co', 'biz', 'gd', 'org', 'net', 'ly', 'edu', 'mil',
'gov', 'info', 'me', 'it', '.ca', 'tv', 'fm', 'io', 'gl'])
tlds = '|'.join(list_of_tlds())
link_regex = r"\b(?P<url>[^\s]+\.(%s)(?:/[^\s()\":]*?|([^\s()\":]*\([^\s()\":]*\)[^\s()\":]*))?)(?=([:;\?\),\.\'\"]\Z|[:;\?\),\.\'\"]\s|\Z|\s))" % (tlds,)
md.inlinePatterns.add('autolink', AutoLink(link_regex), '>http_autolink')

View File

@ -0,0 +1,318 @@
# Version 2013040200, Last Updated Tue Apr 2 07:07:01 2013 UTC
AC
AD
AE
AERO
AF
AG
AI
AL
AM
AN
AO
AQ
AR
ARPA
AS
ASIA
AT
AU
AW
AX
AZ
BA
BB
BD
BE
BF
BG
BH
BI
BIZ
BJ
BM
BN
BO
BR
BS
BT
BV
BW
BY
BZ
CA
CAT
CC
CD
CF
CG
CH
CI
CK
CL
CM
CN
CO
COM
COOP
CR
CU
CV
CW
CX
CY
CZ
DE
DJ
DK
DM
DO
DZ
EC
EDU
EE
EG
ER
ES
ET
EU
FI
FJ
FK
FM
FO
FR
GA
GB
GD
GE
GF
GG
GH
GI
GL
GM
GN
GOV
GP
GQ
GR
GS
GT
GU
GW
GY
HK
HM
HN
HR
HT
HU
ID
IE
IL
IM
IN
INFO
INT
IO
IQ
IR
IS
IT
JE
JM
JO
JOBS
JP
KE
KG
KH
KI
KM
KN
KP
KR
KW
KY
KZ
LA
LB
LC
LI
LK
LR
LS
LT
LU
LV
LY
MA
MC
MD
ME
MG
MH
MIL
MK
ML
MM
MN
MO
MOBI
MP
MQ
MR
MS
MT
MU
MUSEUM
MV
MW
MX
MY
MZ
NA
NAME
NC
NE
NET
NF
NG
NI
NL
NO
NP
NR
NU
NZ
OM
ORG
PA
PE
PF
PG
PH
PK
PL
PM
PN
POST
PR
PRO
PS
PT
PW
PY
QA
RE
RO
RS
RU
RW
SA
SB
SC
SD
SE
SG
SH
SI
SJ
SK
SL
SM
SN
SO
SR
ST
SU
SV
SX
SY
SZ
TC
TD
TEL
TF
TG
TH
TJ
TK
TL
TM
TN
TO
TP
TR
TRAVEL
TT
TV
TW
TZ
UA
UG
UK
US
UY
UZ
VA
VC
VE
VG
VI
VN
VU
WF
WS
XN--0ZWM56D
XN--11B5BS3A9AJ6G
XN--3E0B707E
XN--45BRJ9C
XN--80AKHBYKNJ4F
XN--80AO21A
XN--90A3AC
XN--9T4B11YI5A
XN--CLCHC0EA0B2G2A9GCD
XN--DEBA0AD
XN--FIQS8S
XN--FIQZ9S
XN--FPCRJ9C3D
XN--FZC2C9E2C
XN--G6W251D
XN--GECRJ9C
XN--H2BRJ9C
XN--HGBK6AJ7F53BBA
XN--HLCJ6AYA9ESC7A
XN--J1AMH
XN--J6W193G
XN--JXALPDLP
XN--KGBECHTV
XN--KPRW13D
XN--KPRY57D
XN--LGBBAT1AD8J
XN--MGB9AWBF
XN--MGBAAM7A8H
XN--MGBAYH7GPA
XN--MGBBH1A71E
XN--MGBC0A9AZCG
XN--MGBERP4A5D4AR
XN--MGBX4CD0AB
XN--O3CW4H
XN--OGBPF8FL
XN--P1AI
XN--PGBS0DH
XN--S9BRJ9C
XN--WGBH1C
XN--WGBL6A
XN--XKC2AL3HYE2A
XN--XKC2DL3A5EE0H
XN--YFRO4I67O
XN--YGBI2AMMX
XN--ZCKZAH
XXX
YE
YT
ZA
ZM
ZW

View File

@ -1994,6 +1994,7 @@ int x = 3
('http://raven.io', "<p>%s</p>", 'http://raven.io'),
('at https://humbughq.com/api. Check it!', "<p>at %s. Check it!</p>", 'https://humbughq.com/api'),
('goo.gl/abc', "<p>%s</p>", 'goo.gl/abc'),
('I spent a year at ucl.ac.uk', "<p>I spent a year at %s</p>", 'ucl.ac.uk'),
('http://d.pr/i/FMXO', "<p>%s</p>", 'http://d.pr/i/FMXO'),
('http://fmota.eu/blog/test.html', "<p>%s</p>", 'http://fmota.eu/blog/test.html'),
('http://j.mp/14Hwm3X', "<p>%s</p>", 'http://j.mp/14Hwm3X'),