From bdf7dd8a4f3d07b6864373a9a6d6b4132da8ccfb Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Sat, 30 Mar 2024 19:44:09 -0700 Subject: [PATCH 1/9] Use GHC calling convention on i686 and x86_64 --- Include/cpython/optimizer.h | 1 + Lib/test/test_perf_profiler.py | 2 +- Python/jit.c | 47 +++++++++++++++++++++++----------- Python/optimizer.c | 2 ++ Tools/jit/_targets.py | 40 ++++++++++++++++++++++------- Tools/jit/_writer.py | 4 +++ Tools/jit/template.c | 4 +-- Tools/jit/trampoline.c | 17 ++++++++++++ 8 files changed, 90 insertions(+), 27 deletions(-) create mode 100644 Tools/jit/trampoline.c diff --git a/Include/cpython/optimizer.h b/Include/cpython/optimizer.h index 819251a25bb242..fa10a0ee6974ed 100644 --- a/Include/cpython/optimizer.h +++ b/Include/cpython/optimizer.h @@ -101,6 +101,7 @@ typedef struct _PyExecutorObject { uint32_t code_size; size_t jit_size; void *jit_code; + void *jit_code_ghccc; _PyExitData exits[1]; } _PyExecutorObject; diff --git a/Lib/test/test_perf_profiler.py b/Lib/test/test_perf_profiler.py index 040be63da11447..e7c03b99086013 100644 --- a/Lib/test/test_perf_profiler.py +++ b/Lib/test/test_perf_profiler.py @@ -216,7 +216,7 @@ def is_unwinding_reliable(): cflags = sysconfig.get_config_var("PY_CORE_CFLAGS") if not cflags: return False - return "no-omit-frame-pointer" in cflags + return "no-omit-frame-pointer" in cflags and "_Py_JIT" not in cflags def perf_command_works(): diff --git a/Python/jit.c b/Python/jit.c index 8782adb847cfd6..c754054c3bc80b 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -382,8 +382,8 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size { // Loop once to find the total compiled size: size_t instruction_starts[UOP_MAX_TRACE_LENGTH]; - size_t code_size = 0; - size_t data_size = 0; + size_t code_size = trampoline.code.body_size; + size_t data_size = trampoline.data.body_size; for (size_t i = 0; i < length; i++) { _PyUOpInstruction *instruction = (_PyUOpInstruction *)&trace[i]; const StencilGroup *group = &stencil_groups[instruction->opcode]; @@ -405,11 +405,24 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size // Loop again to emit the code: unsigned char *code = memory; unsigned char *data = memory + code_size; + { + const StencilGroup *group = &trampoline; + // Think of patches as a dictionary mapping HoleValue to uint64_t: + uint64_t patches[] = GET_PATCHES(); + patches[HoleValue_CODE] = (uintptr_t)code; + patches[HoleValue_CONTINUE] = (uintptr_t)code + group->code.body_size; + patches[HoleValue_DATA] = (uintptr_t)data; + patches[HoleValue_EXECUTOR] = (uintptr_t)executor; + patches[HoleValue_TOP] = (uintptr_t)memory + trampoline.code.body_size; + patches[HoleValue_ZERO] = 0; + emit(group, patches); + code += group->code.body_size; + data += group->data.body_size; + } assert(trace[0].opcode == _START_EXECUTOR || trace[0].opcode == _COLD_EXIT); for (size_t i = 0; i < length; i++) { _PyUOpInstruction *instruction = (_PyUOpInstruction *)&trace[i]; const StencilGroup *group = &stencil_groups[instruction->opcode]; - // Think of patches as a dictionary mapping HoleValue to uintptr_t: uintptr_t patches[] = GET_PATCHES(); patches[HoleValue_CODE] = (uintptr_t)code; patches[HoleValue_CONTINUE] = (uintptr_t)code + group->code.body_size; @@ -451,18 +464,20 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size code += group->code.body_size; data += group->data.body_size; } - // Protect against accidental buffer overrun into data: - const StencilGroup *group = &stencil_groups[_FATAL_ERROR]; - uintptr_t patches[] = GET_PATCHES(); - patches[HoleValue_CODE] = (uintptr_t)code; - patches[HoleValue_CONTINUE] = (uintptr_t)code; - patches[HoleValue_DATA] = (uintptr_t)data; - patches[HoleValue_EXECUTOR] = (uintptr_t)executor; - patches[HoleValue_TOP] = (uintptr_t)code; - patches[HoleValue_ZERO] = 0; - emit(group, patches); - code += group->code.body_size; - data += group->data.body_size; + { + // Protect against accidental buffer overrun into data: + const StencilGroup *group = &stencil_groups[_FATAL_ERROR]; + uintptr_t patches[] = GET_PATCHES(); + patches[HoleValue_CODE] = (uintptr_t)code; + patches[HoleValue_CONTINUE] = (uintptr_t)code; + patches[HoleValue_DATA] = (uintptr_t)data; + patches[HoleValue_EXECUTOR] = (uintptr_t)executor; + patches[HoleValue_TOP] = (uintptr_t)code; + patches[HoleValue_ZERO] = 0; + emit(group, patches); + code += group->code.body_size; + data += group->data.body_size; + } assert(code == memory + code_size); assert(data == memory + code_size + data_size); if (mark_executable(memory, total_size)) { @@ -470,6 +485,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size return -1; } executor->jit_code = memory; + executor->jit_code_ghccc = memory + trampoline.code.body_size; executor->jit_size = total_size; return 0; } @@ -481,6 +497,7 @@ _PyJIT_Free(_PyExecutorObject *executor) size_t size = executor->jit_size; if (memory) { executor->jit_code = NULL; + executor->jit_code_ghccc = NULL; executor->jit_size = 0; if (jit_free(memory, size)) { PyErr_WriteUnraisable(NULL); diff --git a/Python/optimizer.c b/Python/optimizer.c index 5863336c0d9ecf..c9d597dc9f678a 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -1138,6 +1138,7 @@ make_executor_from_uops(_PyUOpInstruction *buffer, int length, const _PyBloomFil #endif #ifdef _Py_JIT executor->jit_code = NULL; + executor->jit_code_ghccc = NULL; executor->jit_size = 0; if (_PyJIT_Compile(executor, executor->trace, length)) { Py_DECREF(executor); @@ -1168,6 +1169,7 @@ init_cold_exit_executor(_PyExecutorObject *executor, int oparg) #endif #ifdef _Py_JIT executor->jit_code = NULL; + executor->jit_code_ghccc = NULL; executor->jit_size = 0; if (_PyJIT_Compile(executor, executor->trace, 1)) { return -1; diff --git a/Tools/jit/_targets.py b/Tools/jit/_targets.py index 66db358679239e..faad42ca78838d 100644 --- a/Tools/jit/_targets.py +++ b/Tools/jit/_targets.py @@ -38,6 +38,7 @@ class _Target(typing.Generic[_S, _R]): _: dataclasses.KW_ONLY alignment: int = 1 args: typing.Sequence[str] = () + ghccc: bool = False prefix: str = "" debug: bool = False force: bool = False @@ -85,7 +86,8 @@ async def _parse(self, path: pathlib.Path) -> _stencils.StencilGroup: sections: list[dict[typing.Literal["Section"], _S]] = json.loads(output) for wrapped_section in sections: self._handle_section(wrapped_section["Section"], group) - assert group.symbols["_JIT_ENTRY"] == (_stencils.HoleValue.CODE, 0) + entry_symbol = "_JIT_ENTRY" if "_JIT_ENTRY" in group.symbols else "_ENTRY" + assert group.symbols[entry_symbol] == (_stencils.HoleValue.CODE, 0) if group.data.body: line = f"0: {str(bytes(group.data.body)).removeprefix('b')}" group.data.disassembly.append(line) @@ -103,6 +105,8 @@ def _handle_relocation( async def _compile( self, opname: str, c: pathlib.Path, tempdir: pathlib.Path ) -> _stencils.StencilGroup: + if opname == "trampoline" and not self.ghccc: + return _stencils.StencilGroup() o = tempdir / f"{opname}.o" args = [ f"--target={self.triple}", @@ -130,13 +134,28 @@ async def _compile( "-fno-plt", # Don't call stack-smashing canaries that we can't find or patch: "-fno-stack-protector", - "-o", - f"{o}", "-std=c11", - f"{c}", *self.args, ] - await _llvm.run("clang", args, echo=self.verbose) + if self.ghccc: + ll = tempdir / f"{opname}.ll" + args_ll = args + ["-S", "-emit-llvm", "-o", f"{ll}", f"{c}"] + await _llvm.run("clang", args_ll, echo=self.verbose) + ir = ll.read_text() + # Hack! Use __attribute__((preserve_none)) once we have Clang 19: + ir = re.sub(r"((noalias |nonnull )*ptr @_JIT_\w+\()", r"ghccc \1", ir) + for line in ir.splitlines(): + if re.match(r"ptr @_JIT_\w+\(", line): + assert re.match(r"ghccc (\w+ )*ptr @_JIT_\w+\(", line) + ir = re.sub(r"musttail call ([^g])", r"musttail call ghccc \1", ir) + for line in ir.splitlines(): + if re.match(r"musttail", line): + assert re.match(r"musttail call ghccc", line) + ll.write_text(ir) + args_o = args + ["-Wno-unused-command-line-argument", "-o", f"{o}", f"{ll}"] + else: + args_o = args + ["-o", f"{o}", f"{c}"] + await _llvm.run("clang", args_o, echo=self.verbose) return await self._parse(o) async def _build_stencils(self) -> dict[str, _stencils.StencilGroup]: @@ -146,6 +165,8 @@ async def _build_stencils(self) -> dict[str, _stencils.StencilGroup]: with tempfile.TemporaryDirectory() as tempdir: work = pathlib.Path(tempdir).resolve() async with asyncio.TaskGroup() as group: + coro = self._compile("trampoline", TOOLS_JIT / "trampoline.c", work) + tasks.append(group.create_task(coro, name="trampoline")) for opname in opnames: coro = self._compile(opname, TOOLS_JIT_TEMPLATE_C, work) tasks.append(group.create_task(coro, name=opname)) @@ -456,12 +477,13 @@ def get_target(host: str) -> _COFF | _ELF | _MachO: return _ELF(host, alignment=8, args=args) if re.fullmatch(r"i686-pc-windows-msvc", host): args = ["-DPy_NO_ENABLE_SHARED"] - return _COFF(host, args=args, prefix="_") + return _COFF(host, args=args, ghccc=True, prefix="_") if re.fullmatch(r"x86_64-apple-darwin.*", host): - return _MachO(host, prefix="_") + args = ["-fomit-frame-pointer"] + return _MachO(host, args=args, ghccc=True, prefix="_") if re.fullmatch(r"x86_64-pc-windows-msvc", host): args = ["-fms-runtime-lib=dll"] - return _COFF(host, args=args) + return _COFF(host, ghccc=True, args=args) if re.fullmatch(r"x86_64-.*-linux-gnu", host): - return _ELF(host) + return _ELF(host, ghccc=True) raise ValueError(host) diff --git a/Tools/jit/_writer.py b/Tools/jit/_writer.py index cbc1ed2fa6543a..6b36d8a9c66a3f 100644 --- a/Tools/jit/_writer.py +++ b/Tools/jit/_writer.py @@ -53,9 +53,13 @@ def _dump_footer(opnames: typing.Iterable[str]) -> typing.Iterator[str]: yield "" yield "static const StencilGroup stencil_groups[512] = {" for opname in opnames: + if opname == "trampoline": + continue yield f" [{opname}] = INIT_STENCIL_GROUP({opname})," yield "};" yield "" + yield "static const StencilGroup trampoline = INIT_STENCIL_GROUP(trampoline);" + yield "" yield "#define GET_PATCHES() { \\" for value in _stencils.HoleValue: yield f" [HoleValue_{value.name}] = (uintptr_t)0xBADBADBADBADBADB, \\" diff --git a/Tools/jit/template.c b/Tools/jit/template.c index 228dc83254d678..af42737961c133 100644 --- a/Tools/jit/template.c +++ b/Tools/jit/template.c @@ -48,7 +48,7 @@ do { \ OPT_STAT_INC(traces_executed); \ __attribute__((musttail)) \ - return ((jit_func)((EXECUTOR)->jit_code))(frame, stack_pointer, tstate); \ + return ((jit_func)((EXECUTOR)->jit_code_ghccc))(frame, stack_pointer, tstate); \ } while (0) #undef GOTO_TIER_ONE @@ -65,7 +65,7 @@ do { \ #define PATCH_VALUE(TYPE, NAME, ALIAS) \ PyAPI_DATA(void) ALIAS; \ - TYPE NAME = (TYPE)(uint64_t)&ALIAS; + TYPE NAME = (TYPE)(uintptr_t)&ALIAS; #define PATCH_JUMP(ALIAS) \ do { \ diff --git a/Tools/jit/trampoline.c b/Tools/jit/trampoline.c new file mode 100644 index 00000000000000..97d862f1f6f696 --- /dev/null +++ b/Tools/jit/trampoline.c @@ -0,0 +1,17 @@ +#include "Python.h" + +#include "pycore_ceval.h" +#include "pycore_frame.h" +#include "pycore_jit.h" + +_Py_CODEUNIT * +_ENTRY(_PyInterpreterFrame *frame, PyObject **stack_pointer, PyThreadState *tstate) +{ + PyAPI_DATA(void) _JIT_EXECUTOR; + PyObject *executor = (PyObject *)(uintptr_t)&_JIT_EXECUTOR; + Py_INCREF(executor); + PyAPI_DATA(void) _JIT_CONTINUE; + _Py_CODEUNIT *target = ((jit_func)&_JIT_CONTINUE)(frame, stack_pointer, tstate); + Py_SETREF(tstate->previous_executor, executor); + return target; +} From 07e95c4f5011943fe66da5edcee113cfb9a1eb66 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Tue, 23 Apr 2024 11:29:57 -0700 Subject: [PATCH 2/9] Fix call to emit on 32-bit platforms --- Python/jit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Python/jit.c b/Python/jit.c index c754054c3bc80b..00c330a34a7ff6 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -407,8 +407,8 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size unsigned char *data = memory + code_size; { const StencilGroup *group = &trampoline; - // Think of patches as a dictionary mapping HoleValue to uint64_t: - uint64_t patches[] = GET_PATCHES(); + // Think of patches as a dictionary mapping HoleValue to uintptr_t: + uintptr_t patches[] = GET_PATCHES(); patches[HoleValue_CODE] = (uintptr_t)code; patches[HoleValue_CONTINUE] = (uintptr_t)code + group->code.body_size; patches[HoleValue_DATA] = (uintptr_t)data; From e3a5f8a96233715912a277a7ea4181ddf7275256 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Thu, 25 Apr 2024 08:43:40 -0700 Subject: [PATCH 3/9] Comments and cleanup --- Python/jit.c | 2 ++ Tools/jit/_targets.py | 28 +++++++++++++++++++--------- Tools/jit/trampoline.c | 8 ++++++++ 3 files changed, 29 insertions(+), 9 deletions(-) diff --git a/Python/jit.c b/Python/jit.c index 00c330a34a7ff6..470c9960b38c25 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -406,6 +406,8 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size unsigned char *code = memory; unsigned char *data = memory + code_size; { + // Compile the trampoline, which handles changing the calling convention + // (on platforms where it's not necessary, the trampoline is empty): const StencilGroup *group = &trampoline; // Think of patches as a dictionary mapping HoleValue to uintptr_t: uintptr_t patches[] = GET_PATCHES(); diff --git a/Tools/jit/_targets.py b/Tools/jit/_targets.py index faad42ca78838d..fe2eeb0190ec66 100644 --- a/Tools/jit/_targets.py +++ b/Tools/jit/_targets.py @@ -86,6 +86,9 @@ async def _parse(self, path: pathlib.Path) -> _stencils.StencilGroup: sections: list[dict[typing.Literal["Section"], _S]] = json.loads(output) for wrapped_section in sections: self._handle_section(wrapped_section["Section"], group) + # The trampoline's entry point is just named "_ENTRY", since on some + # platforms we later assume that any function starting with "_JIT_" uses + # the GHC calling convention: entry_symbol = "_JIT_ENTRY" if "_JIT_ENTRY" in group.symbols else "_ENTRY" assert group.symbols[entry_symbol] == (_stencils.HoleValue.CODE, 0) if group.data.body: @@ -105,6 +108,7 @@ def _handle_relocation( async def _compile( self, opname: str, c: pathlib.Path, tempdir: pathlib.Path ) -> _stencils.StencilGroup: + # "Compile" the trampoline to an empty stencil group if it's not needed: if opname == "trampoline" and not self.ghccc: return _stencils.StencilGroup() o = tempdir / f"{opname}.o" @@ -137,20 +141,26 @@ async def _compile( "-std=c11", *self.args, ] - if self.ghccc: + if self.ghccc:t + # So, this is mostly a giant hack (but it makes the code much + # smaller and faster, so it's worth it). We need to use the GHC + # calling convention, but Clang doesn't support it. So, we *first* + # compile the code to LLVM IR, perform some text replacements on the + # IR to change the calling convention(!), and then compile *that*. + # Once we have access to Clang 19, we can get rid of this and use + # __attribute__((preserve_none)) directly in the C code instead: ll = tempdir / f"{opname}.ll" args_ll = args + ["-S", "-emit-llvm", "-o", f"{ll}", f"{c}"] await _llvm.run("clang", args_ll, echo=self.verbose) ir = ll.read_text() - # Hack! Use __attribute__((preserve_none)) once we have Clang 19: + # This handles declarations, definitions, and calls to named symbols + # starting with "_JIT_": ir = re.sub(r"((noalias |nonnull )*ptr @_JIT_\w+\()", r"ghccc \1", ir) - for line in ir.splitlines(): - if re.match(r"ptr @_JIT_\w+\(", line): - assert re.match(r"ghccc (\w+ )*ptr @_JIT_\w+\(", line) - ir = re.sub(r"musttail call ([^g])", r"musttail call ghccc \1", ir) - for line in ir.splitlines(): - if re.match(r"musttail", line): - assert re.match(r"musttail call ghccc", line) + # This handles calls to anonymous callees, since anything with + # "musttail" needs to use the same calling convention: + ir = ir.replace("musttail call", "musttail call ghccc") + # Sometimes *both* replacements happen at the same site, so fix it: + ir = ir.replace("ghccc ghccc", "ghccc") ll.write_text(ir) args_o = args + ["-Wno-unused-command-line-argument", "-o", f"{o}", f"{ll}"] else: diff --git a/Tools/jit/trampoline.c b/Tools/jit/trampoline.c index 97d862f1f6f696..f3fd5624d0f518 100644 --- a/Tools/jit/trampoline.c +++ b/Tools/jit/trampoline.c @@ -4,12 +4,20 @@ #include "pycore_frame.h" #include "pycore_jit.h" +// This is where the calling convention changes, on platforms that require it. +// The actual change is hacked in while the JIT compiler is being built, in +// Tools/jit/_targets.py. On other platforms, this function compiles to nothing. _Py_CODEUNIT * _ENTRY(_PyInterpreterFrame *frame, PyObject **stack_pointer, PyThreadState *tstate) { + // This is subtle. The actual trace will return to us once it exits, so we + // need to make sure that we stay alive until then. If our trace side-exits + // into another trace, and this trace is then invalidated, the code for + // *this function* will be freed and we'll crash upon return: PyAPI_DATA(void) _JIT_EXECUTOR; PyObject *executor = (PyObject *)(uintptr_t)&_JIT_EXECUTOR; Py_INCREF(executor); + // Note that this is *not* a tail call: PyAPI_DATA(void) _JIT_CONTINUE; _Py_CODEUNIT *target = ((jit_func)&_JIT_CONTINUE)(frame, stack_pointer, tstate); Py_SETREF(tstate->previous_executor, executor); From 0311f2fa65c03c3406b12c5de3c7d144e25325d1 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Thu, 25 Apr 2024 08:45:01 -0700 Subject: [PATCH 4/9] Is this really needed? --- Tools/jit/_targets.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Tools/jit/_targets.py b/Tools/jit/_targets.py index fe2eeb0190ec66..7f5b93f37bd2c8 100644 --- a/Tools/jit/_targets.py +++ b/Tools/jit/_targets.py @@ -489,11 +489,10 @@ def get_target(host: str) -> _COFF | _ELF | _MachO: args = ["-DPy_NO_ENABLE_SHARED"] return _COFF(host, args=args, ghccc=True, prefix="_") if re.fullmatch(r"x86_64-apple-darwin.*", host): - args = ["-fomit-frame-pointer"] - return _MachO(host, args=args, ghccc=True, prefix="_") + return _MachO(host, ghccc=True, prefix="_") if re.fullmatch(r"x86_64-pc-windows-msvc", host): args = ["-fms-runtime-lib=dll"] - return _COFF(host, ghccc=True, args=args) + return _COFF(host, args=args, ghccc=True) if re.fullmatch(r"x86_64-.*-linux-gnu", host): return _ELF(host, ghccc=True) raise ValueError(host) From 89764fd88ebc1862f89e0b7f14278cc2b9637686 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Thu, 25 Apr 2024 08:48:12 -0700 Subject: [PATCH 5/9] t --- Tools/jit/_targets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tools/jit/_targets.py b/Tools/jit/_targets.py index 7f5b93f37bd2c8..ce8d9200224b47 100644 --- a/Tools/jit/_targets.py +++ b/Tools/jit/_targets.py @@ -141,7 +141,7 @@ async def _compile( "-std=c11", *self.args, ] - if self.ghccc:t + if self.ghccc: # So, this is mostly a giant hack (but it makes the code much # smaller and faster, so it's worth it). We need to use the GHC # calling convention, but Clang doesn't support it. So, we *first* From 76ec119b75562d048c64755f937f62d07a66c4a4 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Thu, 25 Apr 2024 08:55:32 -0700 Subject: [PATCH 6/9] ...guess so. --- Tools/jit/_targets.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Tools/jit/_targets.py b/Tools/jit/_targets.py index ce8d9200224b47..8cc5ce92b62d5d 100644 --- a/Tools/jit/_targets.py +++ b/Tools/jit/_targets.py @@ -150,7 +150,11 @@ async def _compile( # Once we have access to Clang 19, we can get rid of this and use # __attribute__((preserve_none)) directly in the C code instead: ll = tempdir / f"{opname}.ll" - args_ll = args + ["-S", "-emit-llvm", "-o", f"{ll}", f"{c}"] + args_ll = args + [ + # -fomit-frame-pointer is necessary because the GHC calling + # convention uses RBP to pass arguments: + "-S", "-emit-llvm", "-fomit-frame-pointer", "-o", f"{ll}", f"{c}" + ] await _llvm.run("clang", args_ll, echo=self.verbose) ir = ll.read_text() # This handles declarations, definitions, and calls to named symbols From 159d318eb834ebc38b46e834f70eb8d6992fdbe1 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Thu, 25 Apr 2024 11:56:49 -0700 Subject: [PATCH 7/9] Wording --- Tools/jit/_targets.py | 4 ++-- Tools/jit/trampoline.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Tools/jit/_targets.py b/Tools/jit/_targets.py index 8cc5ce92b62d5d..bba9826f23e472 100644 --- a/Tools/jit/_targets.py +++ b/Tools/jit/_targets.py @@ -142,8 +142,8 @@ async def _compile( *self.args, ] if self.ghccc: - # So, this is mostly a giant hack (but it makes the code much - # smaller and faster, so it's worth it). We need to use the GHC + # This is a bit of an ugly workaround, but it makes the code much + # smaller and faster, so it's worth it. We want to use the GHC # calling convention, but Clang doesn't support it. So, we *first* # compile the code to LLVM IR, perform some text replacements on the # IR to change the calling convention(!), and then compile *that*. diff --git a/Tools/jit/trampoline.c b/Tools/jit/trampoline.c index f3fd5624d0f518..01b3d63a6790ba 100644 --- a/Tools/jit/trampoline.c +++ b/Tools/jit/trampoline.c @@ -5,7 +5,7 @@ #include "pycore_jit.h" // This is where the calling convention changes, on platforms that require it. -// The actual change is hacked in while the JIT compiler is being built, in +// The actual change is patched in while the JIT compiler is being built, in // Tools/jit/_targets.py. On other platforms, this function compiles to nothing. _Py_CODEUNIT * _ENTRY(_PyInterpreterFrame *frame, PyObject **stack_pointer, PyThreadState *tstate) From 66db030baa77cd9bc439dbd6547930a9c4bebfe6 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Tue, 30 Apr 2024 22:30:08 -0700 Subject: [PATCH 8/9] jit_code_ghccc -> jit_side_entry --- Include/cpython/optimizer.h | 2 +- Python/jit.c | 4 ++-- Python/optimizer.c | 4 ++-- Tools/jit/template.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Include/cpython/optimizer.h b/Include/cpython/optimizer.h index fa10a0ee6974ed..a31314b70536c9 100644 --- a/Include/cpython/optimizer.h +++ b/Include/cpython/optimizer.h @@ -101,7 +101,7 @@ typedef struct _PyExecutorObject { uint32_t code_size; size_t jit_size; void *jit_code; - void *jit_code_ghccc; + void *jit_side_entry; _PyExitData exits[1]; } _PyExecutorObject; diff --git a/Python/jit.c b/Python/jit.c index e563db250c78ed..8432162adb8c53 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -490,7 +490,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size return -1; } executor->jit_code = memory; - executor->jit_code_ghccc = memory + trampoline.code.body_size; + executor->jit_side_entry = memory + trampoline.code.body_size; executor->jit_size = total_size; return 0; } @@ -502,7 +502,7 @@ _PyJIT_Free(_PyExecutorObject *executor) size_t size = executor->jit_size; if (memory) { executor->jit_code = NULL; - executor->jit_code_ghccc = NULL; + executor->jit_side_entry = NULL; executor->jit_size = 0; if (jit_free(memory, size)) { PyErr_WriteUnraisable(NULL); diff --git a/Python/optimizer.c b/Python/optimizer.c index 884d5403deb00e..d1b23a89837f70 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -1166,7 +1166,7 @@ make_executor_from_uops(_PyUOpInstruction *buffer, int length, const _PyBloomFil #endif #ifdef _Py_JIT executor->jit_code = NULL; - executor->jit_code_ghccc = NULL; + executor->jit_side_entry = NULL; executor->jit_size = 0; if (_PyJIT_Compile(executor, executor->trace, length)) { Py_DECREF(executor); @@ -1197,7 +1197,7 @@ init_cold_exit_executor(_PyExecutorObject *executor, int oparg) #endif #ifdef _Py_JIT executor->jit_code = NULL; - executor->jit_code_ghccc = NULL; + executor->jit_side_entry = NULL; executor->jit_size = 0; if (_PyJIT_Compile(executor, executor->trace, 1)) { return -1; diff --git a/Tools/jit/template.c b/Tools/jit/template.c index fc303b16db8f65..0dd0744f7aec9c 100644 --- a/Tools/jit/template.c +++ b/Tools/jit/template.c @@ -48,7 +48,7 @@ do { \ OPT_STAT_INC(traces_executed); \ __attribute__((musttail)) \ - return ((jit_func)((EXECUTOR)->jit_code_ghccc))(frame, stack_pointer, tstate); \ + return ((jit_func)((EXECUTOR)->jit_side_entry))(frame, stack_pointer, tstate); \ } while (0) #undef GOTO_TIER_ONE From d03f22741981e4c1ffc054490a2a98a522164236 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Tue, 30 Apr 2024 22:43:59 -0700 Subject: [PATCH 9/9] Clarify a few comments --- Python/jit.c | 7 +++++-- Tools/jit/_targets.py | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/Python/jit.c b/Python/jit.c index 8432162adb8c53..df14e48c564447 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -409,8 +409,11 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size unsigned char *code = memory; unsigned char *data = memory + code_size; { - // Compile the trampoline, which handles changing the calling convention - // (on platforms where it's not necessary, the trampoline is empty): + // Compile the trampoline, which handles converting between the native + // calling convention and the calling convention used by jitted code + // (which may be different for efficiency reasons). On platforms where + // we don't change calling conventions, the trampoline is empty and + // nothing is emitted here: const StencilGroup *group = &trampoline; // Think of patches as a dictionary mapping HoleValue to uintptr_t: uintptr_t patches[] = GET_PATCHES(); diff --git a/Tools/jit/_targets.py b/Tools/jit/_targets.py index 24f46e513f0a30..274d17bcf38deb 100644 --- a/Tools/jit/_targets.py +++ b/Tools/jit/_targets.py @@ -480,6 +480,7 @@ def _handle_relocation( def get_target(host: str) -> _COFF | _ELF | _MachO: """Build a _Target for the given host "triple" and options.""" + # ghccc currently crashes Clang when combined with musttail on aarch64. :( if re.fullmatch(r"aarch64-apple-darwin.*", host): return _MachO(host, alignment=8, prefix="_") if re.fullmatch(r"aarch64-pc-windows-msvc", host):