Validate URLs whose host is written in IDNA (punycode, "xn--") form.

Summary of the changes carried by this patch:

* validators/url.py: the protocol part of the URL pattern is factored out
  into `protocol_identifier`. When the main pattern does not match, url()
  splits the value with the deliberately loose `regex_idna_converter`
  pattern (named groups: protocol, userpass, fqdn, port, resource), decodes
  the fqdn with the "idna" codec, reassembles the URL and matches it against
  the main pattern again. The fqdn is encoded to ASCII bytes before decoding
  so the same call works on Python 2 and Python 3; a non-ASCII fqdn raises
  UnicodeError and leaves the value rejected. The user:pass part is kept in
  the reassembled URL so that it is checked by the main pattern as well.
* validators/domain.py: the leading label group may repeat ({1,999}).
* validators/__init__.py: the version string is changed.
* tests/test_url.py: valid and invalid punycode URL cases are added; every
  entry is its own list item (comma-terminated) so adjacent string literals
  are not silently concatenated into a single test case.

diff --git a/tests/test_url.py b/tests/test_url.py
index 1187f274..093ac67c 100644
--- a/tests/test_url.py
+++ b/tests/test_url.py
@@ -52,6 +52,30 @@
     u'http://127.0.10.150',
     u'http://localhost',
     u'http://localhost:8000',
+    u'http://xn--mgbh0fb.xn--kgbechtv',
+    u'http://xn--mgbh0fb.xn--kgbechtv/',
+    u'http://xn--mgbh0fb.xn--kgbechtv:8080',
+    u'http://xn--mgbh0fb.xn--kgbechtv:8080/',
+    u'http://xn--mgbh0fb.xn--kgbechtv/foobar',
+    u'http://xn--mgbh0fb.xn--kgbechtv:8080/foobar',
+    u'http://xn--mgbh0fb.xn--kgbechtv/foo/?bar=baz&inga=42&quux',
+    u'http://xn--mgbh0fb.xn--kgbechtv:8080/foo/?bar=baz&inga=42&quux',
+    u'http://xn--mgbh0fb.xn--kgbechtv/foo_bar',
+    u'http://xn--mgbh0fb.xn--kgbechtv/123',
+    u'http://xn--mgbh0fb.xn--kgbechtv:8080/123',
+    u'http://xn--mgbh0fb.xn--kgbechtv/?foo=bar%20has-url-encoded%20stuff',
+    u'http://xn--mgbh0fb.xn--kgbechtv:8080/?foo=bar%20has-url-encoded%20stuff',
+    u'http://xn--mgbh0fb.xn--kgbechtv/foobar_(wikipedia)#cite-1',
+    u'http://xn--mgbh0fb.xn--kgbechtv:8080/foobar_(wikipedia)#cite-1',
+    u'http://xn--mgbh0fb.xn--kgbechtv/unicode_(✪)_in_parens',
+    u'http://xn--mgbh0fb.xn--kgbechtv:8080/unicode_(✪)_in_parens',
+    u'http://xn--mgbh0fb.xn--kgbechtv/(something)?after=parens',
+    u'ftp://xn--p1b6ci4b4b3a.xn--11b5bs3a9aj6g/foo',
+    u'https://userid:password@xn--fsqu00a.xn--0zwm56d',
+    u'https://userid:password@xn--fsqu00a.xn--0zwm56d:8080',
+    u'https://userid:password@xn--fsqu00a.xn--0zwm56d:8080/foobar',
+    u'https://用户名密码:密码@xn--fsqu00a.xn--0zwm56d',
+    u'https://%E7%94%A8:%E5%AF%86@xn--fsqu00a.xn--0zwm56d',
 ])
 def test_returns_true_on_valid_url(address):
     assert url(address)
@@ -118,6 +142,27 @@ def test_returns_true_on_valid_public_url(address, public):
     'http://.www.foo.bar./',
     'http://127.12.0.260',
     'http://example.com/">user@example.com',
+    'xn--mgbh0fb.xn--kgbechtv',
+    'http://xn--mgbh0fb',
+    'http://xn--mgbh0fb.xn---kgbechtv',
+    'http://xn---mgbh0fb.xn--kgbechtv',
+    'http://xn--mgbh0fb.xnk--gbechtv',
+    'http://xnm--gbh0fb.xn--kgbechtv',
+    'http:// xn--mgbh0fb.xn--kgbechtv',
+    ':// xn--mgbh0fb.xn--kgbechtv',
+    'http://-xn--mgbh0fb.xn--kgbechtv',
+    'http://xn--mgbh0fb-.xn--kgbechtv',
+    'http://xn--mgbh0fb.-xn--kgbechtv',
+    'http://xn--mgbh0fb.xn--kgbechtv-',
+    'http://x-n--mgbh0fb.xn--kgbechtv',
+    'http://xn--mgbh0fb.x-n--kgbechtv',
+    'http://xn--mgbh0fb.xn--kgbechtv./',
+    'http://xn--mgbh0fb..xn--kgbechtv',
+    'http:///xn--mgbh0fb.xn--kgbechtv',
+    'ttp://xn--mgbh0fb.xn--kgbechtv',
+    'http://xn--mgbh0fb.xn--kgbechtv/">user@example.com',
+    u'http://xn--mgbh0fb.إختبار',
+    u'http://مثال.xn--kgbechtv',
 ])
 def test_returns_failed_validation_on_invalid_url(address):
     assert isinstance(url(address), ValidationFailure)
diff --git a/validators/__init__.py b/validators/__init__.py
index 8fe8fc7c..2f32d48a 100644
--- a/validators/__init__.py
+++ b/validators/__init__.py
@@ -14,4 +14,4 @@
 from .utils import ValidationFailure, validator  # noqa
 from .uuid import uuid  # noqa
 
-__version__ = '0.12.1'
+__version__ = '0.12.1337'
\ No newline at end of file
diff --git a/validators/domain.py b/validators/domain.py
index 7f7f2672..da8a250c 100644
--- a/validators/domain.py
+++ b/validators/domain.py
@@ -5,7 +5,7 @@
 pattern = re.compile(
     r'^(([a-zA-Z]{1})|([a-zA-Z]{1}[a-zA-Z]{1})|'  # domain pt.1
     r'([a-zA-Z]{1}[0-9]{1})|([0-9]{1}[a-zA-Z]{1})|'  # domain pt.2
-    r'([a-zA-Z0-9][-_.a-zA-Z0-9]{0,61}[a-zA-Z0-9]))\.'  # domain pt.3
+    r'([a-zA-Z0-9][-_.a-zA-Z0-9]{0,61}[a-zA-Z0-9])){1,999}\.'  # domain pt.3
     r'([a-zA-Z]{2,13}|(xn--[a-zA-Z0-9]{2,30}))$'  # TLD
 )
 
diff --git a/validators/url.py b/validators/url.py
index fdccf0c0..142bf3e8 100644
--- a/validators/url.py
+++ b/validators/url.py
@@ -4,11 +4,10 @@
 
 ip_middle_octet = u"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5]))"
 ip_last_octet = u"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
+protocol_identifier = u"(?:(?:https?|ftp)://)"
 
 regex = re.compile(
-    u"^"
-    # protocol identifier
-    u"(?:(?:https?|ftp)://)"
+    u"^" + protocol_identifier + u""
     # user:pass authentication
     u"(?:[-a-z\u00a1-\uffff0-9._~%!$&'()*+,;=:]+"
     u"(?::[-a-z0-9._~%!$&'()*+,;=:]*)?@)?"
@@ -53,7 +52,26 @@
     re.UNICODE | re.IGNORECASE
 )
 
+regex_idna_converter = re.compile(
+    u"^"
+    # protocol group
+    u"(?P<protocol>" + protocol_identifier + u")"
+    # user:pass group
+    u"(?P<userpass>(?:[-a-z\u00a1-\uffff0-9._~%!$&'()*+,;=:]+"
+    u"(?::[-a-z0-9._~%!$&'()*+,;=:]*)?@)?)"
+    # fqdn group: intentionally loose, only meant to isolate any
+    # potential fqdn so that idna decoding can be attempted.
+    u"(?P<fqdn>[^/:]+)"
+    # port number group
+    u"(?P<port>:\d{2,5})?"
+    # resource/query/fragment group
+    u"(?P<resource>/.*)?"
+    u"$",
+    re.UNICODE | re.IGNORECASE
+)
+
 pattern = re.compile(regex)
+pattern_idna_converter = re.compile(regex_idna_converter)
 
 
 @validator
@@ -109,6 +127,26 @@ def url(value, public=False):
     :param public: (default=False) Set True to only allow a public IP address
     """
     result = pattern.match(value)
+
+    # if initial match failed, attempt an idna conversion
+    if not result:
+        try:
+            # use regex to separate the potential idna fqdn
+            idna_result = pattern_idna_converter.match(value)
+            idna_dict = idna_result.groupdict()
+            # reassemble the URL after decoding the fqdn as idna
+            idna_value = u"{protocol}{userpass}{fqdn}{port}{resource}".format(
+                protocol=idna_dict['protocol'],
+                userpass=idna_dict['userpass'] or "",
+                fqdn=idna_dict['fqdn'].encode('ascii').decode('idna'),
+                port=idna_dict['port'] or "",
+                resource=idna_dict['resource'] or ""
+            )
+            result = pattern.match(idna_value)
+        # if pattern doesn't match or host can't decode as idna then pass
+        except (AttributeError, UnicodeError):
+            pass
+
     if not public:
         return result
 