From 39454a6f3a4f3ab8f0c6ea0c18f22b513fd5e890 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 25 Oct 2024 20:52:25 +0300 Subject: [PATCH 1/3] gh-125926: Fix urllib.parse.urljoin() for base URI with undefined authority Although this goes beyond the application of RFC 3986, urljoin() should support relative base URIs for backward compatibility. --- Lib/test/test_urlparse.py | 72 +++++++++++++++++++ Lib/urllib/parse.py | 2 +- ...-10-25-20-52-15.gh-issue-125926.pp8rtZ.rst | 4 ++ 3 files changed, 77 insertions(+), 1 deletion(-) create mode 100644 Misc/NEWS.d/next/Library/2024-10-25-20-52-15.gh-issue-125926.pp8rtZ.rst diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index d49e4388696ab4..98f86cd64ab4c2 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -623,6 +623,78 @@ def test_urljoins(self): self.checkJoin(RFC1808_BASE, 'https:;', 'https:;') self.checkJoin(RFC1808_BASE, 'https:;x', 'https:;x') + def test_urljoins_relative_base(self): + # According to RFC 3986, Section 5.1, a base URI must conform to + # the absolute-URI syntax rule (Section 4.3). But urljoin() lacks + # a context to establish missed components of the relative base URI. + # It still has to return a sensitible result for backward compatibility. + # The following tests are figments of the imagination and artifacts + # of the current implementation that are not based on any standard. 
+ self.checkJoin('', '', '') + self.checkJoin('', '//', '//', relroundtrip=False) + self.checkJoin('', '//v', '//v') + self.checkJoin('', '//v/w', '//v/w') + self.checkJoin('', '/w', '/w') + self.checkJoin('', '///w', '///w', relroundtrip=False) + self.checkJoin('', 'w', 'w') + + self.checkJoin('//', '', '//') + self.checkJoin('//', '//', '//') + self.checkJoin('//', '//v', '//v') + self.checkJoin('//', '//v/w', '//v/w') + self.checkJoin('//', '/w', '/w') + self.checkJoin('//', '///w', '///w', relroundtrip=False) + self.checkJoin('//', 'w', '/w') + + self.checkJoin('//a', '', '//a') + self.checkJoin('//a', '//', '//', relroundtrip=False) + self.checkJoin('//a', '//v', '//v') + self.checkJoin('//a', '//v/w', '//v/w') + self.checkJoin('//a', '/w', '/w') + self.checkJoin('//a', '///w', '///w', relroundtrip=False) + self.checkJoin('//a', 'w', '/w') + + for scheme in '', 'http:': + self.checkJoin('http:', scheme + '', 'http:') + self.checkJoin('http:', scheme + '//', 'http:') + self.checkJoin('http:', scheme + '//v', 'http://v') + self.checkJoin('http:', scheme + '//v/w', 'http://v/w') + self.checkJoin('http:', scheme + '/w', 'http:/w') + self.checkJoin('http:', scheme + '///w', 'http:/w') + self.checkJoin('http:', scheme + 'w', 'http:/w') + + self.checkJoin('http://', scheme + '', 'http://') + self.checkJoin('http://', scheme + '//', 'http://') + self.checkJoin('http://', scheme + '//v', 'http://v') + self.checkJoin('http://', scheme + '//v/w', 'http://v/w') + self.checkJoin('http://', scheme + '/w', 'http:///w') + self.checkJoin('http://', scheme + '///w', 'http:///w') + self.checkJoin('http://', scheme + 'w', 'http:///w') + + self.checkJoin('http://a', scheme + '', 'http://a') + self.checkJoin('http://a', scheme + '//', 'http://a') + self.checkJoin('http://a', scheme + '//v', 'http://v') + self.checkJoin('http://a', scheme + '//v/w', 'http://v/w') + self.checkJoin('http://a', scheme + '/w', 'http://a/w') + self.checkJoin('http://a', scheme + '///w', 'http://a/w') + 
self.checkJoin('http://a', scheme + 'w', 'http://a/w') + + self.checkJoin('/b/c', '', '/b/c') + self.checkJoin('/b/c', '//', '///b/c', relroundtrip=False) + self.checkJoin('/b/c', '//v', '//v/b/c') + self.checkJoin('/b/c', '//v/w', '//v/w') + self.checkJoin('/b/c', '/w', '/w') + self.checkJoin('/b/c', '///w', '///w', relroundtrip=False) + self.checkJoin('/b/c', 'w', '/b/w') + + self.checkJoin('///b/c', '', '///b/c') + self.checkJoin('///b/c', '//', '///b/c') + self.checkJoin('///b/c', '//v', '//v/b/c') + self.checkJoin('///b/c', '//v/w', '//v/w') + self.checkJoin('///b/c', '/w', '/w') + self.checkJoin('///b/c', '///w', '///w', relroundtrip=False) + self.checkJoin('///b/c', 'w', '/b/w') + def test_RFC2732(self): str_cases = [ ('http://Test.python.org:5432/foo/', 'test.python.org', 5432), diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index 5b00ab25c6b4ca..38ad0a9403fe4c 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -577,7 +577,7 @@ def urljoin(base, url, allow_fragments=True): if scheme is None: scheme = bscheme - if scheme != bscheme or scheme not in uses_relative: + if scheme != bscheme or (scheme and scheme not in uses_relative): return _coerce_result(url) if scheme in uses_netloc: if netloc: diff --git a/Misc/NEWS.d/next/Library/2024-10-25-20-52-15.gh-issue-125926.pp8rtZ.rst b/Misc/NEWS.d/next/Library/2024-10-25-20-52-15.gh-issue-125926.pp8rtZ.rst new file mode 100644 index 00000000000000..7f98bcdc38e566 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-10-25-20-52-15.gh-issue-125926.pp8rtZ.rst @@ -0,0 +1,4 @@ +Fix :func:`urllib.parse.urljoin` for base URI with undefined authority. +Although :rfc:`3986` only specifies reference resolution for an absolute +base URI, :func:`!urljoin` should continue to return a sensible result for a +relative base URI. 
From 7f548cfb7298b78afc61115441c8b5d022e476d0 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 25 Oct 2024 20:52:25 +0300 Subject: [PATCH 2/3] gh-125926: Fix urllib.parse.urljoin() for base URI with undefined authority Although this goes beyond the application of RFC 3986, urljoin() should support relative base URIs for backward compatibility. --- Lib/test/test_urlparse.py | 72 +++++++++++++++++++ Lib/urllib/parse.py | 4 +- ...-10-25-20-52-15.gh-issue-125926.pp8rtZ.rst | 4 ++ 3 files changed, 78 insertions(+), 2 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-10-25-20-52-15.gh-issue-125926.pp8rtZ.rst diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index d49e4388696ab4..dbc7ca57d6031a 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -623,6 +623,78 @@ def test_urljoins(self): self.checkJoin(RFC1808_BASE, 'https:;', 'https:;') self.checkJoin(RFC1808_BASE, 'https:;x', 'https:;x') + def test_urljoins_relative_base(self): + # According to RFC 3986, Section 5.1, a base URI must conform to + # the absolute-URI syntax rule (Section 4.3). But urljoin() lacks + # a context to establish missed components of the relative base URI. + # It still has to return a sensitible result for backward compatibility. + # The following tests are figments of the imagination and artifacts + # of the current implementation that are not based on any standard. 
+ self.checkJoin('', '', '') + self.checkJoin('', '//', '//', relroundtrip=False) + self.checkJoin('', '//v', '//v') + self.checkJoin('', '//v/w', '//v/w') + self.checkJoin('', '/w', '/w') + self.checkJoin('', '///w', '///w', relroundtrip=False) + self.checkJoin('', 'w', 'w') + + self.checkJoin('//', '', '//') + self.checkJoin('//', '//', '//') + self.checkJoin('//', '//v', '//v') + self.checkJoin('//', '//v/w', '//v/w') + self.checkJoin('//', '/w', '///w') + self.checkJoin('//', '///w', '///w') + self.checkJoin('//', 'w', '///w') + + self.checkJoin('//a', '', '//a') + self.checkJoin('//a', '//', '//a') + self.checkJoin('//a', '//v', '//v') + self.checkJoin('//a', '//v/w', '//v/w') + self.checkJoin('//a', '/w', '//a/w') + self.checkJoin('//a', '///w', '//a/w') + self.checkJoin('//a', 'w', '//a/w') + + for scheme in '', 'http:': + self.checkJoin('http:', scheme + '', 'http:') + self.checkJoin('http:', scheme + '//', 'http:') + self.checkJoin('http:', scheme + '//v', 'http://v') + self.checkJoin('http:', scheme + '//v/w', 'http://v/w') + self.checkJoin('http:', scheme + '/w', 'http:/w') + self.checkJoin('http:', scheme + '///w', 'http:/w') + self.checkJoin('http:', scheme + 'w', 'http:/w') + + self.checkJoin('http://', scheme + '', 'http://') + self.checkJoin('http://', scheme + '//', 'http://') + self.checkJoin('http://', scheme + '//v', 'http://v') + self.checkJoin('http://', scheme + '//v/w', 'http://v/w') + self.checkJoin('http://', scheme + '/w', 'http:///w') + self.checkJoin('http://', scheme + '///w', 'http:///w') + self.checkJoin('http://', scheme + 'w', 'http:///w') + + self.checkJoin('http://a', scheme + '', 'http://a') + self.checkJoin('http://a', scheme + '//', 'http://a') + self.checkJoin('http://a', scheme + '//v', 'http://v') + self.checkJoin('http://a', scheme + '//v/w', 'http://v/w') + self.checkJoin('http://a', scheme + '/w', 'http://a/w') + self.checkJoin('http://a', scheme + '///w', 'http://a/w') + self.checkJoin('http://a', scheme + 'w', 
'http://a/w') + + self.checkJoin('/b/c', '', '/b/c') + self.checkJoin('/b/c', '//', '/b/c') + self.checkJoin('/b/c', '//v', '//v') + self.checkJoin('/b/c', '//v/w', '//v/w') + self.checkJoin('/b/c', '/w', '/w') + self.checkJoin('/b/c', '///w', '/w') + self.checkJoin('/b/c', 'w', '/b/w') + + self.checkJoin('///b/c', '', '///b/c') + self.checkJoin('///b/c', '//', '///b/c') + self.checkJoin('///b/c', '//v', '//v') + self.checkJoin('///b/c', '//v/w', '//v/w') + self.checkJoin('///b/c', '/w', '///w') + self.checkJoin('///b/c', '///w', '///w') + self.checkJoin('///b/c', 'w', '///b/w') + def test_RFC2732(self): str_cases = [ ('http://Test.python.org:5432/foo/', 'test.python.org', 5432), diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index 5b00ab25c6b4ca..a721d777c82f82 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -577,9 +577,9 @@ def urljoin(base, url, allow_fragments=True): if scheme is None: scheme = bscheme - if scheme != bscheme or scheme not in uses_relative: + if scheme != bscheme or (scheme and scheme not in uses_relative): return _coerce_result(url) - if scheme in uses_netloc: + if not scheme or scheme in uses_netloc: if netloc: return _coerce_result(_urlunsplit(scheme, netloc, path, query, fragment)) diff --git a/Misc/NEWS.d/next/Library/2024-10-25-20-52-15.gh-issue-125926.pp8rtZ.rst b/Misc/NEWS.d/next/Library/2024-10-25-20-52-15.gh-issue-125926.pp8rtZ.rst new file mode 100644 index 00000000000000..7f98bcdc38e566 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-10-25-20-52-15.gh-issue-125926.pp8rtZ.rst @@ -0,0 +1,4 @@ +Fix :func:`urllib.parse.urljoin` for base URI with undefined authority. +Although :rfc:`3986` only specifies reference resolution for an absolute +base URI, :func:`!urljoin` should continue to return a sensible result for a +relative base URI. 
From 7121bf882aaee42f16ba3d9a7ef739fbcbd18c10 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 26 Oct 2024 00:21:49 +0300 Subject: [PATCH 3/3] Update Lib/test/test_urlparse.py Co-authored-by: Alex Waygood --- Lib/test/test_urlparse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index 98f86cd64ab4c2..681c2d986972bb 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -627,7 +627,7 @@ def test_urljoins_relative_base(self): # According to RFC 3986, Section 5.1, a base URI must conform to # the absolute-URI syntax rule (Section 4.3). But urljoin() lacks # a context to establish missed components of the relative base URI. - # It still has to return a sensitible result for backward compatibility. + # It still has to return a sensible result for backwards compatibility. # The following tests are figments of the imagination and artifacts # of the current implementation that are not based on any standard. self.checkJoin('', '', '')