From 918acdf5ddb4decd58dddb14f64200ee6227586a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Wed, 22 Jan 2025 12:21:58 +0100 Subject: [PATCH 1/7] Use `_PyUnicodeError_GetParams` for the 'surrogateescape' handler --- Python/codecs.c | 157 ++++++++++++++++++++++++++++-------------------- 1 file changed, 92 insertions(+), 65 deletions(-) diff --git a/Python/codecs.c b/Python/codecs.c index 2cb3875db35058..afb940d995bc70 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -1056,7 +1056,7 @@ PyObject *PyCodec_NameReplaceErrors(PyObject *exc) #define ENC_UTF32LE 4 static int -get_standard_encoding(const char *encoding, int *bytelength) +get_standard_encoding_impl(const char *encoding, int *bytelength) { if (Py_TOLOWER(encoding[0]) == 'u' && Py_TOLOWER(encoding[1]) == 't' && @@ -1114,6 +1114,19 @@ get_standard_encoding(const char *encoding, int *bytelength) return ENC_UNKNOWN; } + +static int +get_standard_encoding(PyObject *encoding, int *code, int *bytelength) +{ + const char *encoding_cstr = PyUnicode_AsUTF8(encoding); + if (encoding_cstr == NULL) { + return -1; + } + *code = get_standard_encoding_impl(encoding_cstr, bytelength); + return 0; +} + + /* This handler is declared static until someone demonstrates a need to call it directly. */ static PyObject * @@ -1147,7 +1160,7 @@ PyCodec_SurrogatePassErrors(PyObject *exc) Py_DECREF(encode); return NULL; } - code = get_standard_encoding(encoding, &bytelength); + code = get_standard_encoding_impl(encoding, &bytelength); Py_DECREF(encode); if (code == ENC_UNKNOWN) { /* Not supported, fail with original exception */ @@ -1226,7 +1239,7 @@ PyCodec_SurrogatePassErrors(PyObject *exc) Py_DECREF(encode); return NULL; } - code = get_standard_encoding(encoding, &bytelength); + code = get_standard_encoding_impl(encoding, &bytelength); Py_DECREF(encode); if (code == ENC_UNKNOWN) { /* Not supported, fail with original exception */ @@ -1280,76 +1293,90 @@ PyCodec_SurrogatePassErrors(PyObject *exc) } } + static PyObject * -PyCodec_SurrogateEscapeErrors(PyObject *exc) +_PyCodec_SurrogateEscapeUnicodeEncodeError(PyObject *exc) { - PyObject *restuple; - PyObject *object; - Py_ssize_t i; - Py_ssize_t start; - Py_ssize_t end; - PyObject *res; + PyObject *obj; + Py_ssize_t start, end, slen; + if (_PyUnicodeError_GetParams(exc, + &obj, NULL, + &start, &end, &slen, false) < 0) + { + return NULL; + } - if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { - char *outp; - if (PyUnicodeEncodeError_GetStart(exc, &start)) - return NULL; - if (PyUnicodeEncodeError_GetEnd(exc, &end)) - return NULL; - if (!(object = PyUnicodeEncodeError_GetObject(exc))) - return NULL; - res = PyBytes_FromStringAndSize(NULL, end-start); - if (!res) { - Py_DECREF(object); + PyObject *res = PyBytes_FromStringAndSize(NULL, slen); + if (res == NULL) { + Py_DECREF(obj); + return NULL; + } + + char *outp = PyBytes_AsString(res); + for (Py_ssize_t i = start; i < end; i++) { + /* object is guaranteed to be "ready" */ + Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i); + if (ch < 0xdc80 || ch > 0xdcff) { + Py_DECREF(obj); + Py_DECREF(res); + PyErr_SetObject(PyExceptionInstance_Class(exc), exc); return NULL; } - outp = PyBytes_AsString(res); - for (i = start; i < end; i++) { - /* object is guaranteed to be "ready" */ - Py_UCS4 ch = PyUnicode_READ_CHAR(object, i); - if (ch < 0xdc80 || ch > 0xdcff) { - /* Not a UTF-8b surrogate, fail with original exception */ - PyErr_SetObject(PyExceptionInstance_Class(exc), exc); - Py_DECREF(res); - Py_DECREF(object); - return NULL; - } - *outp++ = ch - 0xdc00; + *outp++ = ch - 0xdc00; + } + Py_DECREF(obj); + + return Py_BuildValue("(Nn)", res, end); +} + + +static PyObject * +_PyCodec_SurrogateEscapeUnicodeDecodeError(PyObject *exc) +{ + PyObject *obj; + Py_ssize_t start, end, slen; + if (_PyUnicodeError_GetParams(exc, + &obj, NULL, + &start, &end, &slen, true) < 0) + { + return NULL; + } + + Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */ + int consumed = 0; + const unsigned char *p = (const unsigned char*)PyBytes_AS_STRING(obj); + while (consumed < 4 && consumed < slen) { + /* Refuse to escape ASCII bytes. */ + if (p[start + consumed] < 128) { + break; } - restuple = Py_BuildValue("(On)", res, end); - Py_DECREF(res); - Py_DECREF(object); - return restuple; + ch[consumed] = 0xdc00 + p[start + consumed]; + consumed++; + } + Py_DECREF(obj); + + if (consumed == 0) { + /* codec complained about ASCII byte. */ + PyErr_SetObject(PyExceptionInstance_Class(exc), exc); + return NULL; + } + + PyObject *str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed); + if (str == NULL) { + return NULL; + } + return Py_BuildValue("(Nn)", str, start + consumed); +} + + +static PyObject * +PyCodec_SurrogateEscapeErrors(PyObject *exc) +{ + if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { + return _PyCodec_SurrogateEscapeUnicodeEncodeError(exc); } else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { - PyObject *str; - const unsigned char *p; - Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */ - int consumed = 0; - if (PyUnicodeDecodeError_GetStart(exc, &start)) - return NULL; - if (PyUnicodeDecodeError_GetEnd(exc, &end)) - return NULL; - if (!(object = PyUnicodeDecodeError_GetObject(exc))) - return NULL; - p = (const unsigned char*)PyBytes_AS_STRING(object); - while (consumed < 4 && consumed < end-start) { - /* Refuse to escape ASCII bytes. */ - if (p[start+consumed] < 128) - break; - ch[consumed] = 0xdc00 + p[start+consumed]; - consumed++; - } - Py_DECREF(object); - if (!consumed) { - /* codec complained about ASCII byte. */ - PyErr_SetObject(PyExceptionInstance_Class(exc), exc); - return NULL; - } - str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed); - if (str == NULL) - return NULL; - return Py_BuildValue("(Nn)", str, start+consumed); + return _PyCodec_SurrogateEscapeUnicodeDecodeError(exc); } else { wrong_exception_type(exc); From 758b84e10a3fbce3401fb64ef7105def54aeb3d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Wed, 22 Jan 2025 12:25:04 +0100 Subject: [PATCH 2/7] fixup --- Python/codecs.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/Python/codecs.c b/Python/codecs.c index afb940d995bc70..bbd2ff688458a9 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -1056,7 +1056,7 @@ PyObject *PyCodec_NameReplaceErrors(PyObject *exc) #define ENC_UTF32LE 4 static int -get_standard_encoding_impl(const char *encoding, int *bytelength) +get_standard_encoding(const char *encoding, int *bytelength) { if (Py_TOLOWER(encoding[0]) == 'u' && Py_TOLOWER(encoding[1]) == 't' && @@ -1114,19 +1114,6 @@ get_standard_encoding_impl(const char *encoding, int *bytelength) return ENC_UNKNOWN; } - -static int -get_standard_encoding(PyObject *encoding, int *code, int *bytelength) -{ - const char *encoding_cstr = PyUnicode_AsUTF8(encoding); - if (encoding_cstr == NULL) { - return -1; - } - *code = get_standard_encoding_impl(encoding_cstr, bytelength); - return 0; -} - - /* This handler is declared static until someone demonstrates a need to call it directly. */ static PyObject * @@ -1160,7 +1147,7 @@ PyCodec_SurrogatePassErrors(PyObject *exc) Py_DECREF(encode); return NULL; } - code = get_standard_encoding_impl(encoding, &bytelength); + code = get_standard_encoding(encoding, &bytelength); Py_DECREF(encode); if (code == ENC_UNKNOWN) { /* Not supported, fail with original exception */ @@ -1239,7 +1226,7 @@ PyCodec_SurrogatePassErrors(PyObject *exc) Py_DECREF(encode); return NULL; } - code = get_standard_encoding_impl(encoding, &bytelength); + code = get_standard_encoding(encoding, &bytelength); Py_DECREF(encode); if (code == ENC_UNKNOWN) { /* Not supported, fail with original exception */ From 1db1dbb308b4c861fa7f3c51fc9d3eeb84f1be6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sun, 9 Feb 2025 12:52:00 +0100 Subject: [PATCH 3/7] post-merge --- Python/codecs.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/Python/codecs.c b/Python/codecs.c index 69ffd0b6cc8086..8c1ddbe879237c 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -1320,6 +1320,8 @@ PyCodec_SurrogatePassErrors(PyObject *exc) } +// --- handler: 'surrogateescape' --------------------------------------------- + static PyObject * _PyCodec_SurrogateEscapeUnicodeEncodeError(PyObject *exc) { @@ -1370,7 +1372,7 @@ _PyCodec_SurrogateEscapeUnicodeDecodeError(PyObject *exc) Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */ int consumed = 0; - const unsigned char *p = (const unsigned char*)PyBytes_AS_STRING(obj); + const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj); while (consumed < 4 && consumed < slen) { /* Refuse to escape ASCII bytes. */ if (p[start + consumed] < 128) { @@ -1398,10 +1400,10 @@ _PyCodec_SurrogateEscapeUnicodeDecodeError(PyObject *exc) static PyObject * PyCodec_SurrogateEscapeErrors(PyObject *exc) { - if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { + if (_PyIsUnicodeEncodeError(exc)) { return _PyCodec_SurrogateEscapeUnicodeEncodeError(exc); } - else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { + else if (_PyIsUnicodeDecodeError(exc)) { return _PyCodec_SurrogateEscapeUnicodeDecodeError(exc); } else { @@ -1457,11 +1459,14 @@ static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc) return PyCodec_SurrogatePassErrors(exc); } -static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc) + +static inline PyObject * +surrogateescape_errors(PyObject *Py_UNUSED(self), PyObject *exc) { return PyCodec_SurrogateEscapeErrors(exc); } + PyStatus _PyCodec_InitRegistry(PyInterpreterState *interp) { From 067a186f560ee552061c4a73649eb90c75d333f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 14 Feb 2025 19:29:57 +0100 Subject: [PATCH 4/7] Restore deleted comment --- Python/codecs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/Python/codecs.c b/Python/codecs.c index a307a07a8d3406..cf0fbd08e09324 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -1384,6 +1384,7 @@ _PyCodec_SurrogateEscapeUnicodeEncodeError(PyObject *exc) /* object is guaranteed to be "ready" */ Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i); if (ch < 0xdc80 || ch > 0xdcff) { + /* Not a UTF-8b surrogate, fail with original exception */ Py_DECREF(obj); Py_DECREF(res); PyErr_SetObject(PyExceptionInstance_Class(exc), exc); From f88524b243fb22331aadec32537d367fb8634443 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 14 Feb 2025 19:30:46 +0100 Subject: [PATCH 5/7] comment capitalization --- Python/codecs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Python/codecs.c b/Python/codecs.c index cf0fbd08e09324..0d5ca261e33292 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -1424,7 +1424,7 @@ _PyCodec_SurrogateEscapeUnicodeDecodeError(PyObject *exc) Py_DECREF(obj); if (consumed == 0) { - /* codec complained about ASCII byte. */ + /* Codec complained about ASCII byte. */ PyErr_SetObject(PyExceptionInstance_Class(exc), exc); return NULL; } From 99204262d05b70c65e4190e2ef821c18e74f051d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 14 Feb 2025 19:31:14 +0100 Subject: [PATCH 6/7] add period at the end of a sentence --- Python/codecs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Python/codecs.c b/Python/codecs.c index 0d5ca261e33292..6edb306ecaca9d 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -1384,7 +1384,7 @@ _PyCodec_SurrogateEscapeUnicodeEncodeError(PyObject *exc) /* object is guaranteed to be "ready" */ Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i); if (ch < 0xdc80 || ch > 0xdcff) { - /* Not a UTF-8b surrogate, fail with original exception */ + /* Not a UTF-8b surrogate, fail with original exception. */ Py_DECREF(obj); Py_DECREF(res); PyErr_SetObject(PyExceptionInstance_Class(exc), exc); From e7643f0fdd6c757f1d9f2873ce96184a4c6359b5 Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Mon, 17 Feb 2025 17:59:57 +0100 Subject: [PATCH 7/7] Remove obsolete comment --- Python/codecs.c | 1 - 1 file changed, 1 deletion(-) diff --git a/Python/codecs.c b/Python/codecs.c index 6edb306ecaca9d..be019d6cda52a7 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -1381,7 +1381,6 @@ _PyCodec_SurrogateEscapeUnicodeEncodeError(PyObject *exc) char *outp = PyBytes_AsString(res); for (Py_ssize_t i = start; i < end; i++) { - /* object is guaranteed to be "ready" */ Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i); if (ch < 0xdc80 || ch > 0xdcff) { /* Not a UTF-8b surrogate, fail with original exception. */