From 5d07c55734ff244889d61a57669a2a5ab93cbad6 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sun, 18 May 2025 22:21:06 +0300 Subject: [PATCH 1/2] [3.13] gh-133890: Handle UnicodeEncodeError in tarfile (GH-134147) UnicodeEncodeError is now handled the same way as OSError during TarFile member extraction. (cherry picked from commit 9983c7d4416cac8deb2fded1ec9c7daf786c3a02) Co-authored-by: Serhiy Storchaka --- Lib/tarfile.py | 4 +- Lib/test/test_tarfile.py | 49 +++++++++++++++++-- ...-05-17-18-08-35.gh-issue-133890.onn9_X.rst | 2 + 3 files changed, 49 insertions(+), 6 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2025-05-17-18-08-35.gh-issue-133890.onn9_X.rst diff --git a/Lib/tarfile.py b/Lib/tarfile.py index ea036193411771..6846a954482b1f 100755 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -2376,7 +2376,7 @@ def _get_extract_tarinfo(self, member, filter_function, path): unfiltered = tarinfo try: tarinfo = filter_function(tarinfo, path) - except (OSError, FilterError) as e: + except (OSError, UnicodeEncodeError, FilterError) as e: self._handle_fatal_error(e) except ExtractError as e: self._handle_nonfatal_error(e) @@ -2397,7 +2397,7 @@ def _extract_one(self, tarinfo, path, set_attrs, numeric_owner): self._extract_member(tarinfo, os.path.join(path, tarinfo.name), set_attrs=set_attrs, numeric_owner=numeric_owner) - except OSError as e: + except (OSError, UnicodeEncodeError) as e: self._handle_fatal_error(e) except ExtractError as e: self._handle_nonfatal_error(e) diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py index 2e59c1d749c7cb..6c41e03708c3e4 100644 --- a/Lib/test/test_tarfile.py +++ b/Lib/test/test_tarfile.py @@ -3457,11 +3457,12 @@ class ArchiveMaker: with t.open() as tar: ... # `tar` is now a TarFile with 'filename' in it! 
""" - def __init__(self): + def __init__(self, **kwargs): self.bio = io.BytesIO() + self.tar_kwargs = dict(kwargs) def __enter__(self): - self.tar_w = tarfile.TarFile(mode='w', fileobj=self.bio) + self.tar_w = tarfile.TarFile(mode='w', fileobj=self.bio, **self.tar_kwargs) return self def __exit__(self, *exc): @@ -4040,7 +4041,10 @@ def test_tar_filter(self): # that in the test archive.) with tarfile.TarFile.open(tarname) as tar: for tarinfo in tar.getmembers(): - filtered = tarfile.tar_filter(tarinfo, '') + try: + filtered = tarfile.tar_filter(tarinfo, '') + except UnicodeEncodeError: + continue self.assertIs(filtered.name, tarinfo.name) self.assertIs(filtered.type, tarinfo.type) @@ -4051,11 +4055,48 @@ def test_data_filter(self): for tarinfo in tar.getmembers(): try: filtered = tarfile.data_filter(tarinfo, '') - except tarfile.FilterError: + except (tarfile.FilterError, UnicodeEncodeError): continue self.assertIs(filtered.name, tarinfo.name) self.assertIs(filtered.type, tarinfo.type) + @unittest.skipIf(sys.platform == 'win32', 'requires native bytes paths') + def test_filter_unencodable(self): + # Sanity check using a valid path. + tarinfo = tarfile.TarInfo(os_helper.TESTFN) + filtered = tarfile.tar_filter(tarinfo, '') + self.assertIs(filtered.name, tarinfo.name) + filtered = tarfile.data_filter(tarinfo, '') + self.assertIs(filtered.name, tarinfo.name) + + tarinfo = tarfile.TarInfo('test\x00') + self.assertRaises(ValueError, tarfile.tar_filter, tarinfo, '') + self.assertRaises(ValueError, tarfile.data_filter, tarinfo, '') + tarinfo = tarfile.TarInfo('\ud800') + self.assertRaises(UnicodeEncodeError, tarfile.tar_filter, tarinfo, '') + self.assertRaises(UnicodeEncodeError, tarfile.data_filter, tarinfo, '') + + @unittest.skipIf(sys.platform == 'win32', 'requires native bytes paths') + def test_extract_unencodable(self): + # Create a member with name \xed\xa0\x80 which is UTF-8 encoded + # lone surrogate \ud800. 
+ with ArchiveMaker(encoding='ascii', errors='surrogateescape') as arc: + arc.add('\udced\udca0\udc80') + with os_helper.temp_cwd() as tmp: + tar = arc.open(encoding='utf-8', errors='surrogatepass', + errorlevel=1) + self.assertEqual(tar.getnames(), ['\ud800']) + with self.assertRaises(UnicodeEncodeError): + tar.extractall() + self.assertEqual(os.listdir(), []) + + tar = arc.open(encoding='utf-8', errors='surrogatepass', + errorlevel=0, debug=1) + with support.captured_stderr() as stderr: + tar.extractall() + self.assertEqual(os.listdir(), []) + self.assertIn('tarfile: UnicodeEncodeError ', stderr.getvalue()) + def test_default_filter_warns(self): """Ensure the default filter warns""" with ArchiveMaker() as arc: diff --git a/Misc/NEWS.d/next/Library/2025-05-17-18-08-35.gh-issue-133890.onn9_X.rst b/Misc/NEWS.d/next/Library/2025-05-17-18-08-35.gh-issue-133890.onn9_X.rst new file mode 100644 index 00000000000000..44565a5424e65b --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-05-17-18-08-35.gh-issue-133890.onn9_X.rst @@ -0,0 +1,2 @@ +The :mod:`tarfile` module now handles :exc:`UnicodeEncodeError` in the same +way as :exc:`OSError` when it cannot extract a member. From fc08e56b47b6763f9d567f699e8b7bb79d3730fb Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 19 May 2025 13:14:09 +0300 Subject: [PATCH 2/2] Use the filter argument. 
--- Lib/test/test_tarfile.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py index 6c41e03708c3e4..604dad9ff0e751 100644 --- a/Lib/test/test_tarfile.py +++ b/Lib/test/test_tarfile.py @@ -4087,13 +4087,13 @@ def test_extract_unencodable(self): errorlevel=1) self.assertEqual(tar.getnames(), ['\ud800']) with self.assertRaises(UnicodeEncodeError): - tar.extractall() + tar.extractall(filter=tarfile.tar_filter) self.assertEqual(os.listdir(), []) tar = arc.open(encoding='utf-8', errors='surrogatepass', errorlevel=0, debug=1) with support.captured_stderr() as stderr: - tar.extractall() + tar.extractall(filter=tarfile.tar_filter) self.assertEqual(os.listdir(), []) self.assertIn('tarfile: UnicodeEncodeError ', stderr.getvalue())