@@ -747,6 +747,49 @@ def test_codec_stream_writer(self):
747747
748748class CAPICodecErrors (unittest .TestCase ):
749749
750+ @classmethod
751+ def _generate_exception_args (cls ):
752+ for objlen in range (5 ):
753+ maxind = 2 * max (2 , objlen )
754+ for start in range (- maxind , maxind + 1 ):
755+ for end in range (- maxind , maxind + 1 ):
756+ yield objlen , start , end
757+
758+ @classmethod
759+ def generate_encode_errors (cls ):
760+ return tuple (
761+ UnicodeEncodeError ('utf-8' , '0' * objlen , start , end , 'why' )
762+ for objlen , start , end in cls ._generate_exception_args ()
763+ )
764+
765+ @classmethod
766+ def generate_decode_errors (cls ):
767+ return tuple (
768+ UnicodeDecodeError ('utf-8' , b'0' * objlen , start , end , 'why' )
769+ for objlen , start , end in cls ._generate_exception_args ()
770+ )
771+
772+ @classmethod
773+ def generate_translate_errors (cls ):
774+ return tuple (
775+ UnicodeTranslateError ('0' * objlen , start , end , 'why' )
776+ for objlen , start , end in cls ._generate_exception_args ()
777+ )
778+
779+ @classmethod
780+ def setUpClass (cls ):
781+ cls .unicode_encode_errors = cls .generate_encode_errors ()
782+ cls .unicode_decode_errors = cls .generate_decode_errors ()
783+ cls .unicode_translate_errors = cls .generate_translate_errors ()
784+ cls .all_unicode_errors = (
785+ cls .unicode_encode_errors
786+ + cls .unicode_decode_errors
787+ + cls .unicode_translate_errors
788+ )
789+ cls .bad_unicode_errors = (
790+ ValueError (),
791+ )
792+
750793 def test_codec_register_error (self ):
751794 # for cleaning up between tests
752795 from _codecs import _unregister_error as _codecs_unregister_error
@@ -780,33 +823,82 @@ def test_codec_lookup_error(self):
780823 self .assertIs (codec_lookup_error ('ignore' ), codecs .ignore_errors )
781824 self .assertIs (codec_lookup_error ('replace' ), codecs .replace_errors )
782825 self .assertIs (codec_lookup_error ('xmlcharrefreplace' ), codecs .xmlcharrefreplace_errors )
826+ self .assertIs (codec_lookup_error ('backslashreplace' ), codecs .backslashreplace_errors )
783827 self .assertIs (codec_lookup_error ('namereplace' ), codecs .namereplace_errors )
784828 self .assertRaises (LookupError , codec_lookup_error , 'unknown' )
785829
786- def test_codec_error_handlers (self ):
787- exceptions = [
788- # A UnicodeError with an empty message currently crashes:
789- # See: https://github.com/python/cpython/issues/123378
790- # UnicodeEncodeError('bad', '', 0, 1, 'reason'),
791- UnicodeEncodeError ('bad' , 'x' , 0 , 1 , 'reason' ),
792- UnicodeEncodeError ('bad' , 'xyz123' , 0 , 1 , 'reason' ),
793- UnicodeEncodeError ('bad' , 'xyz123' , 1 , 4 , 'reason' ),
794- ]
795-
796- strict_handler = _testcapi .codec_strict_errors
def test_codec_strict_errors_handler(self):
    """The strict handler re-raises every exception it is given.

    Both the generated Unicode errors and the "bad" (non-Unicode)
    exceptions are expected to be raised with their own type.
    """
    handler = _testcapi.codec_strict_errors
    candidates = self.all_unicode_errors + self.bad_unicode_errors
    for exc in candidates:
        with self.subTest(handler=handler, exc=exc):
            with self.assertRaises(type(exc)):
                handler(exc)
835+
def test_codec_ignore_errors_handler(self):
    """Run the shared handler checks on the 'ignore' handler,
    treating every generated Unicode error as supported."""
    self.do_test_codec_errors_handler(
        _testcapi.codec_ignore_errors,
        self.all_unicode_errors,
    )
839+
def test_codec_replace_errors_handler(self):
    """Run the shared handler checks on the 'replace' handler,
    treating every generated Unicode error as supported."""
    self.do_test_codec_errors_handler(
        _testcapi.codec_replace_errors,
        self.all_unicode_errors,
    )
843+
def test_codec_xmlcharrefreplace_errors_handler(self):
    """Run the shared handler checks on the 'xmlcharrefreplace' handler,
    treating only the encode errors as supported."""
    self.do_test_codec_errors_handler(
        _testcapi.codec_xmlcharrefreplace_errors,
        self.unicode_encode_errors,
    )
847+
def test_codec_backslashreplace_errors_handler(self):
    """Run the shared handler checks on the 'backslashreplace' handler,
    treating every generated Unicode error as supported."""
    self.do_test_codec_errors_handler(
        _testcapi.codec_backslashreplace_errors,
        self.all_unicode_errors,
    )
851+
def test_codec_namereplace_errors_handler(self):
    """Run the shared handler checks on the 'namereplace' handler
    (taken from ``_testlimitedcapi``), treating only the encode errors
    as supported."""
    self.do_test_codec_errors_handler(
        _testlimitedcapi.codec_namereplace_errors,
        self.unicode_encode_errors,
    )
855+
856+ def do_test_codec_errors_handler (self , handler , exceptions ):
857+ at_least_one = False
797858 for exc in exceptions :
798- with self .subTest (handler = strict_handler , exc = exc ):
799- self .assertRaises (UnicodeEncodeError , strict_handler , exc )
800-
801- for handler in [
802- _testcapi .codec_ignore_errors ,
803- _testcapi .codec_replace_errors ,
804- _testcapi .codec_xmlcharrefreplace_errors ,
805- _testlimitedcapi .codec_namereplace_errors ,
806- ]:
807- for exc in exceptions :
808- with self .subTest (handler = handler , exc = exc ):
809- self .assertIsInstance (handler (exc ), tuple )
859+ # See https://github.com/python/cpython/issues/123378 and related
860+ # discussion and issues for details.
861+ if self ._exception_may_crash (exc ):
862+ continue
863+
864+ at_least_one = True
865+ with self .subTest (handler = handler , exc = exc ):
866+ # test that the handler does not crash
867+ self .assertIsInstance (handler (exc ), tuple )
868+
869+ if exceptions :
870+ self .assertTrue (at_least_one , "all exceptions are crashing" )
871+
872+ for bad_exc in (
873+ self .bad_unicode_errors
874+ + tuple (e for e in self .all_unicode_errors if e not in exceptions )
875+ ):
876+ with self .subTest ('bad type' , handler = handler , exc = bad_exc ):
877+ self .assertRaises (TypeError , handler , bad_exc )
878+
879+ @classmethod
880+ def _exception_may_crash (cls , exc ):
881+ """Indicate whether a Unicode exception might currently crash
882+ the interpreter when used by a built-in codecs error handler.
883+
884+ Until gh-123378 is fixed, we skip the tests for these exceptions.
885+
886+ This should only be used by "do_test_codec_errors_handler".
887+ """
888+ message , start , end = exc .object , exc .start , exc .end
889+ match exc :
890+ case UnicodeEncodeError ():
891+ return end < start or (end - start ) >= len (message )
892+ case UnicodeDecodeError ():
893+ # The case "end - start >= len(message)" does not crash.
894+ return end < start
895+ case UnicodeTranslateError ():
896+ # Test "end <= start" because PyCodec_ReplaceErrors checks
897+ # the Unicode kind of a 0-length string which by convention
898+ # is PyUnicode_1BYTE_KIND and not PyUnicode_2BYTE_KIND as
899+ # the handler currently expects.
900+ return end <= start or (end - start ) >= len (message )
901+ return False
810902
811903
812904if __name__ == "__main__" :
0 commit comments