gh-103997: Automatically dedent the argument to "-c" (#103998)

Erotemic · sunmy2019 · Eclips4 · web-flow · commit fc0ec2988999 · 2025-04-18T17:39:30.000+09:00
Co-authored-by: sunmy2019 &lt;59365878+sunmy2019@users.noreply.github.com&gt;
Co-authored-by: Kirill Podoprigora &lt;80244920+Eclips4@users.noreply.github.com&gt;
Co-authored-by: Inada Naoki &lt;songofacandy@gmail.com&gt;
Co-authored-by: Adam Turner &lt;9087854+AA-Turner@users.noreply.github.com&gt;
Co-authored-by: Bénédikt Tran &lt;10796600+picnixz@users.noreply.github.com&gt;
diff --git a/Doc/using/cmdline.rst b/Doc/using/cmdline.rst
@@ -73,6 +73,9 @@ source.
 
    .. audit-event:: cpython.run_command command cmdoption-c
 
+   .. versionchanged:: next
+      *command* is automatically dedented before execution.
+
 .. option:: -m <module-name>
 
    Search :data:`sys.path` for the named module and execute its contents as
diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst
@@ -474,6 +474,12 @@ Other language changes
   explicitly overridden in the subclass.
   (Contributed by Tomasz Pytel in :gh:`132329`.)
 
+* The command line option :option:`-c` now automatically dedents its code
+  argument before execution. The auto-dedentation behavior mirrors
+  :func:`textwrap.dedent`.
+  (Contributed by Jon Crall and Steven Sun in :gh:`103998`.)
+
+
 .. _whatsnew314-pep765:
 
 PEP 765: Disallow return/break/continue that exit a finally block
diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h
@@ -247,6 +247,12 @@ extern Py_ssize_t _PyUnicode_InsertThousandsGrouping(
     Py_UCS4 *maxchar,
     int forward);
 
+/* Dedent a string.
+   Behaviour is expected to be an exact match of `textwrap.dedent`.
+   Return a new reference on success, NULL with exception set on error.
+   */
+extern PyObject* _PyUnicode_Dedent(PyObject *unicode);
+
 /* --- Misc functions ----------------------------------------------------- */
 
 extern PyObject* _PyUnicode_FormatLong(PyObject *, int, int, int);
diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py
@@ -17,6 +17,8 @@
     spawn_python, kill_python, assert_python_ok, assert_python_failure,
     interpreter_requires_environment
 )
+from textwrap import dedent
+
 
 if not support.has_subprocess_support:
     raise unittest.SkipTest("test module requires subprocess")
@@ -1051,6 +1053,88 @@ def test_int_max_str_digits(self):
         )
         self.assertEqual(res2int(res), (6000, 6000))
 
+    def test_cmd_dedent(self):
+        # test that -c auto-dedents its arguments
+        test_cases = [
+            (
+                """
+                    print('space-auto-dedent')
+                """,
+                "space-auto-dedent",
+            ),
+            (
+                dedent(
+                    """
+                ^^^print('tab-auto-dedent')
+                """
+                ).replace("^", "\t"),
+                "tab-auto-dedent",
+            ),
+            (
+                dedent(
+                    """
+                ^^if 1:
+                ^^^^print('mixed-auto-dedent-1')
+                ^^print('mixed-auto-dedent-2')
+                """
+                ).replace("^", "\t \t"),
+                "mixed-auto-dedent-1\nmixed-auto-dedent-2",
+            ),
+            (
+                '''
+                    data = """$
+
+                    this data has an empty newline above and a newline with spaces below $
+                                            $
+                    """$
+                    if 1:         $
+                        print(repr(data))$
+                '''.replace(
+                    "$", ""
+                ),
+                # Note: entirely blank lines are normalized to \n, even if they
+                # are part of a data string. This is consistent with
+                # textwrap.dedent behavior, but might not be intuitive.
+                "'\\n\\nthis data has an empty newline above and a newline with spaces below \\n\\n'",
+            ),
+            (
+                '',
+                '',
+            ),
+            (
+                '  \t\n\t\n \t\t\t  \t\t \t\n\t\t \n\n\n\t\t\t   ',
+                '',
+            ),
+        ]
+        for code, expected in test_cases:
+            # Run the auto-dedent case
+            args1 = sys.executable, '-c', code
+            proc1 = subprocess.run(args1, stdout=subprocess.PIPE)
+            self.assertEqual(proc1.returncode, 0, proc1)
+            output1 = proc1.stdout.strip().decode(encoding='utf-8')
+
+            # Manually dedent beforehand, check the result is the same.
+            args2 = sys.executable, '-c', dedent(code)
+            proc2 = subprocess.run(args2, stdout=subprocess.PIPE)
+            self.assertEqual(proc2.returncode, 0, proc2)
+            output2 = proc2.stdout.strip().decode(encoding='utf-8')
+
+            self.assertEqual(output1, output2)
+            self.assertEqual(output1.replace('\r\n', '\n'), expected)
+
+    def test_cmd_dedent_failcase(self):
+        # Mixing tabs and spaces is not allowed
+        from textwrap import dedent
+        template = dedent(
+            '''
+            -+if 1:
+            +-++ print('will fail')
+            ''')
+        code = template.replace('-', ' ').replace('+', '\t')
+        assert_python_failure('-c', code)
+        code = template.replace('-', '\t').replace('+', ' ')
+        assert_python_failure('-c', code)
+
     def test_cpu_count(self):
         code = "import os; print(os.cpu_count(), os.process_cpu_count())"
         res = assert_python_ok('-X', 'cpu_count=4321', '-c', code)
diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2023-04-29-23-15-38.gh-issue-103997.BS3uVt.rst b/Misc/NEWS.d/next/Core_and_Builtins/2023-04-29-23-15-38.gh-issue-103997.BS3uVt.rst
@@ -0,0 +1,4 @@
+String arguments passed to "-c" are now automatically dedented as if by
+:func:`textwrap.dedent`. This allows "python -c" invocations to be indented
+in shell scripts without causing indentation errors. (Patch by Jon Crall and
+Steven Sun)
diff --git a/Modules/main.c b/Modules/main.c
@@ -11,6 +11,7 @@
 #include "pycore_pylifecycle.h"   // _Py_PreInitializeFromPyArgv()
 #include "pycore_pystate.h"       // _PyInterpreterState_GET()
 #include "pycore_pythonrun.h"     // _PyRun_AnyFileObject()
+#include "pycore_unicodeobject.h" // _PyUnicode_Dedent()
 
 /* Includes for exit_sigint() */
 #include <stdio.h>                // perror()
@@ -244,6 +245,11 @@ pymain_run_command(wchar_t *command)
         return pymain_exit_err_print();
     }
 
+    Py_SETREF(unicode, _PyUnicode_Dedent(unicode));
+    if (unicode == NULL) {
+        goto error;
+    }
+
     bytes = PyUnicode_AsUTF8String(unicode);
     Py_DECREF(unicode);
     if (bytes == NULL) {
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
@@ -14270,6 +14270,163 @@ unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
     return Py_BuildValue("(N)", copy);
 }
 
+/*
+This function searchs the longest common leading whitespace
+of all lines in the [src, end).
+It returns the length of the common leading whitespace and sets `output` to
+point to the beginning of the common leading whitespace if length > 0.
+*/
+static Py_ssize_t
+search_longest_common_leading_whitespace(
+    const char *const src,
+    const char *const end,
+    const char **output)
+{
+    // [_start, _start + _len)
+    // describes the current longest common leading whitespace
+    const char *_start = NULL;
+    Py_ssize_t _len = 0;
+
+    for (const char *iter = src; iter < end; ++iter) {
+        const char *line_start = iter;
+        const char *leading_whitespace_end = NULL;
+
+        // scan the whole line
+        while (iter < end && *iter != '\n') {
+            if (!leading_whitespace_end && *iter != ' ' && *iter != '\t') {
+                /* `iter` points to the first non-whitespace character
+                   in this line */
+                if (iter == line_start) {
+                    // some line has no indent, fast exit!
+                    return 0;
+                }
+                leading_whitespace_end = iter;
+            }
+            ++iter;
+        }
+
+        // if this line has all white space, skip it
+        if (!leading_whitespace_end) {
+            continue;
+        }
+
+        if (!_start) {
+            // update the first leading whitespace
+            _start = line_start;
+            _len = leading_whitespace_end - line_start;
+            assert(_len > 0);
+        }
+        else {
+            /* We then compare with the current longest leading whitespace.
+
+               [line_start, leading_whitespace_end) is the leading
+               whitespace of this line,
+
+               [_start, _start + _len) is the leading whitespace of the
+               current longest leading whitespace. */
+            Py_ssize_t new_len = 0;
+            const char *_iter = _start, *line_iter = line_start;
+
+            while (_iter < _start + _len && line_iter < leading_whitespace_end
+                   && *_iter == *line_iter)
+            {
+                ++_iter;
+                ++line_iter;
+                ++new_len;
+            }
+
+            _len = new_len;
+            if (_len == 0) {
+                // No common things now, fast exit!
+                return 0;
+            }
+        }
+    }
+
+    assert(_len >= 0);
+    if (_len > 0) {
+        *output = _start;
+    }
+    return _len;
+}
+
+/* Dedent a string.
+   Behaviour is expected to be an exact match of `textwrap.dedent`.
+   Return a new reference on success, NULL with exception set on error.
+   */
+PyObject *
+_PyUnicode_Dedent(PyObject *unicode)
+{
+    Py_ssize_t src_len = 0;
+    const char *src = PyUnicode_AsUTF8AndSize(unicode, &src_len);
+    if (!src) {
+        return NULL;
+    }
+    assert(src_len >= 0);
+    if (src_len == 0) {
+        return Py_NewRef(unicode);
+    }
+
+    const char *const end = src + src_len;
+
+    // [whitespace_start, whitespace_start + whitespace_len)
+    // describes the current longest common leading whitespace
+    const char *whitespace_start = NULL;
+    Py_ssize_t whitespace_len = search_longest_common_leading_whitespace(
+        src, end, &whitespace_start);
+
+    if (whitespace_len == 0) {
+        return Py_NewRef(unicode);
+    }
+
+    // now we should trigger a dedent
+    char *dest = PyMem_Malloc(src_len);
+    if (!dest) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+    char *dest_iter = dest;
+
+    for (const char *iter = src; iter < end; ++iter) {
+        const char *line_start = iter;
+        bool in_leading_space = true;
+
+        // iterate over a line to find the end of a line
+        while (iter < end && *iter != '\n') {
+            if (in_leading_space && *iter != ' ' && *iter != '\t') {
+                in_leading_space = false;
+            }
+            ++iter;
+        }
+
+        // invariant: *iter == '\n' or iter == end
+        bool append_newline = iter < end;
+
+        // if this line has all white space, write '\n' and continue
+        if (in_leading_space && append_newline) {
+            *dest_iter++ = '\n';
+            continue;
+        }
+
+        /* copy [new_line_start + whitespace_len, iter) to buffer, then
+            conditionally append '\n' */
+
+        Py_ssize_t new_line_len = iter - line_start - whitespace_len;
+        assert(new_line_len >= 0);
+        memcpy(dest_iter, line_start + whitespace_len, new_line_len);
+
+        dest_iter += new_line_len;
+
+        if (append_newline) {
+            *dest_iter++ = '\n';
+        }
+    }
+
+    PyObject *res = PyUnicode_FromStringAndSize(dest, dest_iter - dest);
+    PyMem_Free(dest);
+    return res;
+}
+
 static PyMethodDef unicode_methods[] = {
     UNICODE_ENCODE_METHODDEF
     UNICODE_REPLACE_METHODDEF