From 918acdf5ddb4decd58dddb14f64200ee6227586a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?=
 <10796600+picnixz@users.noreply.github.com>
Date: Wed, 22 Jan 2025 12:21:58 +0100
Subject: [PATCH 1/7] Use `_PyUnicodeError_GetParams` for the 'surrogateescape'
 handler

---
 Python/codecs.c | 157 ++++++++++++++++++++++++++++--------------------
 1 file changed, 92 insertions(+), 65 deletions(-)

diff --git a/Python/codecs.c b/Python/codecs.c
index 2cb3875db35058..afb940d995bc70 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -1056,7 +1056,7 @@ PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
 #define ENC_UTF32LE     4
 
 static int
-get_standard_encoding(const char *encoding, int *bytelength)
+get_standard_encoding_impl(const char *encoding, int *bytelength)
 {
     if (Py_TOLOWER(encoding[0]) == 'u' &&
         Py_TOLOWER(encoding[1]) == 't' &&
@@ -1114,6 +1114,19 @@ get_standard_encoding(const char *encoding, int *bytelength)
     return ENC_UNKNOWN;
 }
 
+
+static int
+get_standard_encoding(PyObject *encoding, int *code, int *bytelength)
+{
+    const char *encoding_cstr = PyUnicode_AsUTF8(encoding);
+    if (encoding_cstr == NULL) {
+        return -1;
+    }
+    *code = get_standard_encoding_impl(encoding_cstr, bytelength);
+    return 0;
+}
+
+
 /* This handler is declared static until someone demonstrates
    a need to call it directly. */
 static PyObject *
@@ -1147,7 +1160,7 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
             Py_DECREF(encode);
             return NULL;
         }
-        code = get_standard_encoding(encoding, &bytelength);
+        code = get_standard_encoding_impl(encoding, &bytelength);
         Py_DECREF(encode);
         if (code == ENC_UNKNOWN) {
             /* Not supported, fail with original exception */
@@ -1226,7 +1239,7 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
             Py_DECREF(encode);
             return NULL;
         }
-        code = get_standard_encoding(encoding, &bytelength);
+        code = get_standard_encoding_impl(encoding, &bytelength);
         Py_DECREF(encode);
         if (code == ENC_UNKNOWN) {
             /* Not supported, fail with original exception */
@@ -1280,76 +1293,90 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
     }
 }
 
+
 static PyObject *
-PyCodec_SurrogateEscapeErrors(PyObject *exc)
+_PyCodec_SurrogateEscapeUnicodeEncodeError(PyObject *exc)
 {
-    PyObject *restuple;
-    PyObject *object;
-    Py_ssize_t i;
-    Py_ssize_t start;
-    Py_ssize_t end;
-    PyObject *res;
+    PyObject *obj;
+    Py_ssize_t start, end, slen;
+    if (_PyUnicodeError_GetParams(exc,
+                                  &obj, NULL,
+                                  &start, &end, &slen, false) < 0)
+    {
+        return NULL;
+    }
 
-    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
-        char *outp;
-        if (PyUnicodeEncodeError_GetStart(exc, &start))
-            return NULL;
-        if (PyUnicodeEncodeError_GetEnd(exc, &end))
-            return NULL;
-        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
-            return NULL;
-        res = PyBytes_FromStringAndSize(NULL, end-start);
-        if (!res) {
-            Py_DECREF(object);
+    PyObject *res = PyBytes_FromStringAndSize(NULL, slen);
+    if (res == NULL) {
+        Py_DECREF(obj);
+        return NULL;
+    }
+
+    char *outp = PyBytes_AsString(res);
+    for (Py_ssize_t i = start; i < end; i++) {
+        /* object is guaranteed to be "ready" */
+        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
+        if (ch < 0xdc80 || ch > 0xdcff) {
+            Py_DECREF(obj);
+            Py_DECREF(res);
+            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
             return NULL;
         }
-        outp = PyBytes_AsString(res);
-        for (i = start; i < end; i++) {
-            /* object is guaranteed to be "ready" */
-            Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
-            if (ch < 0xdc80 || ch > 0xdcff) {
-                /* Not a UTF-8b surrogate, fail with original exception */
-                PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
-                Py_DECREF(res);
-                Py_DECREF(object);
-                return NULL;
-            }
-            *outp++ = ch - 0xdc00;
+        *outp++ = ch - 0xdc00;
+    }
+    Py_DECREF(obj);
+
+    return Py_BuildValue("(Nn)", res, end);
+}
+
+
+static PyObject *
+_PyCodec_SurrogateEscapeUnicodeDecodeError(PyObject *exc)
+{
+    PyObject *obj;
+    Py_ssize_t start, end, slen;
+    if (_PyUnicodeError_GetParams(exc,
+                                  &obj, NULL,
+                                  &start, &end, &slen, true) < 0)
+    {
+        return NULL;
+    }
+
+    Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
+    int consumed = 0;
+    const unsigned char *p = (const unsigned char*)PyBytes_AS_STRING(obj);
+    while (consumed < 4 && consumed < slen) {
+        /* Refuse to escape ASCII bytes. */
+        if (p[start + consumed] < 128) {
+            break;
         }
-        restuple = Py_BuildValue("(On)", res, end);
-        Py_DECREF(res);
-        Py_DECREF(object);
-        return restuple;
+        ch[consumed] = 0xdc00 + p[start + consumed];
+        consumed++;
+    }
+    Py_DECREF(obj);
+
+    if (consumed == 0) {
+        /* codec complained about ASCII byte. */
+        PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
+        return NULL;
+    }
+
+    PyObject *str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
+    if (str == NULL) {
+        return NULL;
+    }
+    return Py_BuildValue("(Nn)", str, start + consumed);
+}
+
+
+static PyObject *
+PyCodec_SurrogateEscapeErrors(PyObject *exc)
+{
+    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
+        return _PyCodec_SurrogateEscapeUnicodeEncodeError(exc);
     }
     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
-        PyObject *str;
-        const unsigned char *p;
-        Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
-        int consumed = 0;
-        if (PyUnicodeDecodeError_GetStart(exc, &start))
-            return NULL;
-        if (PyUnicodeDecodeError_GetEnd(exc, &end))
-            return NULL;
-        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
-            return NULL;
-        p = (const unsigned char*)PyBytes_AS_STRING(object);
-        while (consumed < 4 && consumed < end-start) {
-            /* Refuse to escape ASCII bytes. */
-            if (p[start+consumed] < 128)
-                break;
-            ch[consumed] = 0xdc00 + p[start+consumed];
-            consumed++;
-        }
-        Py_DECREF(object);
-        if (!consumed) {
-            /* codec complained about ASCII byte. */
-            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
-            return NULL;
-        }
-        str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
-        if (str == NULL)
-            return NULL;
-        return Py_BuildValue("(Nn)", str, start+consumed);
+        return _PyCodec_SurrogateEscapeUnicodeDecodeError(exc);
     }
     else {
         wrong_exception_type(exc);

From 758b84e10a3fbce3401fb64ef7105def54aeb3d8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?=
 <10796600+picnixz@users.noreply.github.com>
Date: Wed, 22 Jan 2025 12:25:04 +0100
Subject: [PATCH 2/7] fixup: revert get_standard_encoding wrapper and _impl rename

---
 Python/codecs.c | 19 +++----------------
 1 file changed, 3 insertions(+), 16 deletions(-)

diff --git a/Python/codecs.c b/Python/codecs.c
index afb940d995bc70..bbd2ff688458a9 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -1056,7 +1056,7 @@ PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
 #define ENC_UTF32LE     4
 
 static int
-get_standard_encoding_impl(const char *encoding, int *bytelength)
+get_standard_encoding(const char *encoding, int *bytelength)
 {
     if (Py_TOLOWER(encoding[0]) == 'u' &&
         Py_TOLOWER(encoding[1]) == 't' &&
@@ -1114,19 +1114,6 @@ get_standard_encoding_impl(const char *encoding, int *bytelength)
     return ENC_UNKNOWN;
 }
 
-
-static int
-get_standard_encoding(PyObject *encoding, int *code, int *bytelength)
-{
-    const char *encoding_cstr = PyUnicode_AsUTF8(encoding);
-    if (encoding_cstr == NULL) {
-        return -1;
-    }
-    *code = get_standard_encoding_impl(encoding_cstr, bytelength);
-    return 0;
-}
-
-
 /* This handler is declared static until someone demonstrates
    a need to call it directly. */
 static PyObject *
@@ -1160,7 +1147,7 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
             Py_DECREF(encode);
             return NULL;
         }
-        code = get_standard_encoding_impl(encoding, &bytelength);
+        code = get_standard_encoding(encoding, &bytelength);
         Py_DECREF(encode);
         if (code == ENC_UNKNOWN) {
             /* Not supported, fail with original exception */
@@ -1239,7 +1226,7 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
             Py_DECREF(encode);
             return NULL;
         }
-        code = get_standard_encoding_impl(encoding, &bytelength);
+        code = get_standard_encoding(encoding, &bytelength);
         Py_DECREF(encode);
         if (code == ENC_UNKNOWN) {
             /* Not supported, fail with original exception */

From 1db1dbb308b4c861fa7f3c51fc9d3eeb84f1be6e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?=
 <10796600+picnixz@users.noreply.github.com>
Date: Sun, 9 Feb 2025 12:52:00 +0100
Subject: [PATCH 3/7] post-merge cleanup: use _PyIsUnicode{Encode,Decode}Error helpers

---
 Python/codecs.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/Python/codecs.c b/Python/codecs.c
index 69ffd0b6cc8086..8c1ddbe879237c 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -1320,6 +1320,8 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
 }
 
 
+// --- handler: 'surrogateescape' ---------------------------------------------
+
 static PyObject *
 _PyCodec_SurrogateEscapeUnicodeEncodeError(PyObject *exc)
 {
@@ -1370,7 +1372,7 @@ _PyCodec_SurrogateEscapeUnicodeDecodeError(PyObject *exc)
 
     Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
     int consumed = 0;
-    const unsigned char *p = (const unsigned char*)PyBytes_AS_STRING(obj);
+    const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj);
     while (consumed < 4 && consumed < slen) {
         /* Refuse to escape ASCII bytes. */
         if (p[start + consumed] < 128) {
@@ -1398,10 +1400,10 @@ _PyCodec_SurrogateEscapeUnicodeDecodeError(PyObject *exc)
 static PyObject *
 PyCodec_SurrogateEscapeErrors(PyObject *exc)
 {
-    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
+    if (_PyIsUnicodeEncodeError(exc)) {
         return _PyCodec_SurrogateEscapeUnicodeEncodeError(exc);
     }
-    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
+    else if (_PyIsUnicodeDecodeError(exc)) {
         return _PyCodec_SurrogateEscapeUnicodeDecodeError(exc);
     }
     else {
@@ -1457,11 +1459,14 @@ static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
     return PyCodec_SurrogatePassErrors(exc);
 }
 
-static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
+
+static inline PyObject *
+surrogateescape_errors(PyObject *Py_UNUSED(self), PyObject *exc)
 {
     return PyCodec_SurrogateEscapeErrors(exc);
 }
 
+
 PyStatus
 _PyCodec_InitRegistry(PyInterpreterState *interp)
 {

From 067a186f560ee552061c4a73649eb90c75d333f6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?=
 <10796600+picnixz@users.noreply.github.com>
Date: Fri, 14 Feb 2025 19:29:57 +0100
Subject: [PATCH 4/7] Restore deleted comment

---
 Python/codecs.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Python/codecs.c b/Python/codecs.c
index a307a07a8d3406..cf0fbd08e09324 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -1384,6 +1384,7 @@ _PyCodec_SurrogateEscapeUnicodeEncodeError(PyObject *exc)
         /* object is guaranteed to be "ready" */
         Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
         if (ch < 0xdc80 || ch > 0xdcff) {
+            /* Not a UTF-8b surrogate, fail with original exception */
             Py_DECREF(obj);
             Py_DECREF(res);
             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);

From f88524b243fb22331aadec32537d367fb8634443 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?=
 <10796600+picnixz@users.noreply.github.com>
Date: Fri, 14 Feb 2025 19:30:46 +0100
Subject: [PATCH 5/7] comment capitalization

---
 Python/codecs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Python/codecs.c b/Python/codecs.c
index cf0fbd08e09324..0d5ca261e33292 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -1424,7 +1424,7 @@ _PyCodec_SurrogateEscapeUnicodeDecodeError(PyObject *exc)
     Py_DECREF(obj);
 
     if (consumed == 0) {
-        /* codec complained about ASCII byte. */
+        /* Codec complained about ASCII byte. */
         PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
         return NULL;
     }

From 99204262d05b70c65e4190e2ef821c18e74f051d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?=
 <10796600+picnixz@users.noreply.github.com>
Date: Fri, 14 Feb 2025 19:31:14 +0100
Subject: [PATCH 6/7] add period at the end of a sentence

---
 Python/codecs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Python/codecs.c b/Python/codecs.c
index 0d5ca261e33292..6edb306ecaca9d 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -1384,7 +1384,7 @@ _PyCodec_SurrogateEscapeUnicodeEncodeError(PyObject *exc)
         /* object is guaranteed to be "ready" */
         Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
         if (ch < 0xdc80 || ch > 0xdcff) {
-            /* Not a UTF-8b surrogate, fail with original exception */
+            /* Not a UTF-8b surrogate, fail with original exception. */
             Py_DECREF(obj);
             Py_DECREF(res);
             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);

From e7643f0fdd6c757f1d9f2873ce96184a4c6359b5 Mon Sep 17 00:00:00 2001
From: Petr Viktorin <encukou@gmail.com>
Date: Mon, 17 Feb 2025 17:59:57 +0100
Subject: [PATCH 7/7] Remove obsolete comment

---
 Python/codecs.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/Python/codecs.c b/Python/codecs.c
index 6edb306ecaca9d..be019d6cda52a7 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -1381,7 +1381,6 @@ _PyCodec_SurrogateEscapeUnicodeEncodeError(PyObject *exc)
 
     char *outp = PyBytes_AsString(res);
     for (Py_ssize_t i = start; i < end; i++) {
-        /* object is guaranteed to be "ready" */
         Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
         if (ch < 0xdc80 || ch > 0xdcff) {
             /* Not a UTF-8b surrogate, fail with original exception. */