From 9415686fede57340570b0165ff09ccf798c3e2a2 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 3 Sep 2025 17:47:41 -0700 Subject: [PATCH 01/11] Avoid heap allocation for function calls with a small number of arguments We don't have access to llvm::SmallVector or similar, but given the limited subset of the `std::vector` API that `function_call::args{,_convert}` need and the "reserve-then-fill" usage pattern, it is relatively straightforward to implement custom containers that get the job done. Seems to improves time to call the collatz function in pybind/pybind11_benchmark significantly; numbers are a little noisy but there's a clear improvement from "about 60 ns per call" to "about 45 ns per call" on my machine (M4 Max Mac), as measured with `timeit.repeat('collatz(4)', 'from pybind11_benchmark import collatz')`. --- CMakeLists.txt | 1 + include/pybind11/cast.h | 7 +- include/pybind11/detail/argument_vector.h | 320 ++++++++++++++++++ include/pybind11/pybind11.h | 4 +- tests/extra_python_package/test_files.py | 1 + tests/test_embed/CMakeLists.txt | 3 +- tests/test_embed/test_args_convert_vector.cpp | 81 +++++ tests/test_embed/test_argument_vector.cpp | 93 +++++ 8 files changed, 505 insertions(+), 5 deletions(-) create mode 100644 include/pybind11/detail/argument_vector.h create mode 100644 tests/test_embed/test_args_convert_vector.cpp create mode 100644 tests/test_embed/test_argument_vector.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index ba5f665a2f..a6d619bbdd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -180,6 +180,7 @@ if(PYBIND11_MASTER_PROJECT) endif() set(PYBIND11_HEADERS + include/pybind11/detail/argument_vector.h include/pybind11/detail/class.h include/pybind11/detail/common.h include/pybind11/detail/cpp_conduit.h diff --git a/include/pybind11/cast.h b/include/pybind11/cast.h index c635791fee..88e88d4e62 100644 --- a/include/pybind11/cast.h +++ b/include/pybind11/cast.h @@ -10,6 +10,7 @@ #pragma once +#include "detail/argument_vector.h" #include "detail/common.h" #include "detail/descr.h" #include "detail/native_enum_data.h" @@ -2045,10 +2046,12 @@ struct function_call { const function_record &func; /// Arguments passed to the function: - std::vector args; + /// (Inline size chosen mostly arbitrarily; 5 should pad function_call out to two cache lines + /// (16 pointers) in size.) + argument_vector<5> args; /// The `convert` value the arguments should be loaded with - std::vector args_convert; + args_convert_vector<5> args_convert; /// Extra references for the optional `py::args` and/or `py::kwargs` arguments (which, if /// present, are also in `args` but without a reference). diff --git a/include/pybind11/detail/argument_vector.h b/include/pybind11/detail/argument_vector.h new file mode 100644 index 0000000000..3aa9687462 --- /dev/null +++ b/include/pybind11/detail/argument_vector.h @@ -0,0 +1,320 @@ +/* + pybind11/detail/argument_vector.h: small_vector-like containers to + avoid heap allocation of arguments during function call dispatch. + + Copyright (c) Meta Platforms, Inc. and affiliates. + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. +*/ + +#pragma once + +#include + +#include "common.h" + +#include +#include +#include +#include +#include +#include +#include + +PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) + +PYBIND11_WARNING_DISABLE_MSVC(4127) + +PYBIND11_NAMESPACE_BEGIN(detail) + +// Shared implementation utility for our small_vector-like containers. +// We support C++11 and C++14, so we cannot use +// std::variant. Union with the tag packed next to the inline +// array's size is smaller anyway, allowing 1 extra handle of +// inline storage for free. Compare the layouts (1 line per +// size_t/void*): +// With variant, total is N + 2 for N >= 2: +// - variant tag (cannot be packed with the array size) +// - array size (or first pointer of 3 in std::vector) +// - N pointers of inline storage (or 2 remaining pointers of std::vector) +// Custom union, total is N + 1 for N >= 3: +// - variant tag & array size if applicable +// - N pointers of inline storage (or 3 pointers of std::vector) +// +// NOTE: this is a low-level representational convenience; the two +// use cases of this union are materially different and in particular +// have different semantics for inline_array::size. All that is being +// shared is the memory management behavior. +template +union inline_array_or_vector { + struct inline_array { + bool is_inline = true; + std::uint32_t size = 0; + std::array arr; + }; + struct heap_vector { + bool is_inline = false; + std::vector vec; + + heap_vector() = default; + heap_vector(std::size_t count, VectorT value) : vec(count, value) {} + }; + + inline_array array; + heap_vector vector; + + static_assert(std::is_trivially_move_constructible::value, + "ArrayT must be trivially move constructible"); + static_assert(std::is_trivially_destructible::value, + "ArrayT must be trivially destructible"); + + inline_array_or_vector() : array() {} + ~inline_array_or_vector() { + if (!is_inline()) { + vector.~heap_vector(); + } + } + inline_array_or_vector(const inline_array_or_vector &) = delete; + inline_array_or_vector &operator=(const inline_array_or_vector &) = delete; + + inline_array_or_vector(inline_array_or_vector &&rhs) noexcept { + if (rhs.is_inline()) { + std::memcpy(&array, &rhs.array, sizeof(array)); + } else { + new (&vector) heap_vector(std::move(rhs.vector)); + } + assert(is_inline() == rhs.is_inline()); + } + + inline_array_or_vector &operator=(inline_array_or_vector &&rhs) noexcept { + if (this == &rhs) { + return *this; + } + + if (rhs.is_inline()) { + if (!is_inline()) { + vector.~heap_vector(); + } + std::memcpy(&array, &rhs.array, sizeof(array)); + } else { + if (is_inline()) { + new (&vector) heap_vector(std::move(rhs.vector)); + } else { + vector = std::move(rhs.vector); + } + } + return *this; + } + + bool is_inline() const { + // It is undefined behavior to access the inactive member of a + // union directly. However, it is well-defined to reinterpret_cast any + // pointer into a pointer to char and examine it as an array + // of bytes. See + // https://dev-discuss.pytorch.org/t/unionizing-for-profit-how-to-exploit-the-power-of-unions-in-c/444#the-memcpy-loophole-4 + bool result = false; + std::memcpy(&result, reinterpret_cast(this), sizeof(bool)); + return result; + } +}; + +// small_vector-like container to avoid heap allocation for N or fewer +// arguments. +template +struct argument_vector { +public: + argument_vector() = default; + + argument_vector(const argument_vector &) = delete; + argument_vector &operator=(const argument_vector &) = delete; + argument_vector(argument_vector &&) noexcept = default; + argument_vector &operator=(argument_vector &&) noexcept = default; + + std::size_t size() const { + if (is_inline()) { + return m_repr.array.size; + } else { + return m_repr.vector.vec.size(); + } + } + + handle &operator[](std::size_t idx) { + assert(idx < size()); + if (is_inline()) { + return m_repr.array.arr[idx]; + } else { + return m_repr.vector.vec[idx]; + } + } + + handle operator[](std::size_t idx) const { + assert(idx < size()); + if (is_inline()) { + return m_repr.array.arr[idx]; + } else { + return m_repr.vector.vec[idx]; + } + } + + void push_back(handle x) { + if (is_inline()) { + auto &ha = m_repr.array; + if (ha.size == N) { + move_to_vector_with_reserved_size(N + 1); + m_repr.vector.vec.push_back(x); + } else { + ha.arr[ha.size++] = x; + } + } else { + m_repr.vector.vec.push_back(x); + } + } + + template + void emplace_back(Arg &&x) { + push_back(handle(x)); + } + + void reserve(std::size_t sz) { + if (is_inline()) { + if (sz > N) { + move_to_vector_with_reserved_size(sz); + } + } else { + m_repr.vector.vec.reserve(sz); + } + } + +private: + using repr_type = inline_array_or_vector; + repr_type m_repr; + + void move_to_vector_with_reserved_size(std::size_t reserved_size) { + assert(is_inline()); + auto &ha = m_repr.array; + using heap_vector = typename repr_type::heap_vector; + heap_vector hv; + hv.vec.reserve(reserved_size); + std::copy(ha.arr.begin(), ha.arr.begin() + ha.size, std::back_inserter(hv.vec)); + new (&m_repr.vector) heap_vector(std::move(hv)); + } + + bool is_inline() const { return m_repr.is_inline(); } +}; + +// small_vector-like container to avoid heap allocation for N or fewer +// arguments. +template +struct args_convert_vector { +private: +public: + args_convert_vector() = default; + + args_convert_vector(const args_convert_vector &) = delete; + args_convert_vector &operator=(const args_convert_vector &) = delete; + args_convert_vector(args_convert_vector &&) noexcept = default; + args_convert_vector &operator=(args_convert_vector &&) noexcept = default; + + args_convert_vector(std::size_t count, bool value) { + if (count > kInlineSize) { + new (&m_repr.vector) typename repr_type::heap_vector(count, value); + } else { + auto &inline_arr = m_repr.array; + inline_arr.arr.fill(value ? std::size_t(-1) : 0); + inline_arr.size = static_cast(count); + } + } + + std::size_t size() const { + if (is_inline()) { + return m_repr.array.size; + } else { + return m_repr.vector.vec.size(); + } + } + + void reserve(std::size_t sz) { + if (is_inline()) { + if (sz > kInlineSize) { + move_to_vector_with_reserved_size(sz); + } + } else { + m_repr.vector.vec.reserve(sz); + } + } + + bool operator[](std::size_t idx) const { + if (is_inline()) { + return inline_index(idx); + } else { + assert(idx < m_repr.vector.vec.size()); + return m_repr.vector.vec[idx]; + } + } + + void push_back(bool b) { + if (is_inline()) { + auto &ha = m_repr.array; + if (ha.size == kInlineSize) { + move_to_vector_with_reserved_size(kInlineSize + 1); + m_repr.vector.vec.push_back(b); + } else { + assert(ha.size < kInlineSize); + const auto wbi = word_and_bit_index(ha.size++); + assert(wbi.word < kWords); + assert(wbi.bit < kBitsPerWord); + if (b) { + ha.arr[wbi.word] |= (std::size_t(1) << wbi.bit); + } else { + ha.arr[wbi.word] &= ~(std::size_t(1) << wbi.bit); + } + assert(operator[](ha.size - 1) == b); + } + } else { + m_repr.vector.vec.push_back(b); + } + } + + void swap(args_convert_vector &rhs) { std::swap(m_repr, rhs.m_repr); } + +private: + struct WordAndBitIndex { + std::size_t word; + std::size_t bit; + }; + + static WordAndBitIndex word_and_bit_index(std::size_t idx) { + return WordAndBitIndex{idx / kBitsPerWord, idx % kBitsPerWord}; + } + + bool inline_index(std::size_t idx) const { + const auto wbi = word_and_bit_index(idx); + assert(wbi.word < kWords); + assert(wbi.bit < kBitsPerWord); + return m_repr.array.arr[wbi.word] & (std::size_t(1) << wbi.bit); + } + + void move_to_vector_with_reserved_size(std::size_t reserved_size) { + auto &inline_arr = m_repr.array; + using heap_vector = typename repr_type::heap_vector; + heap_vector hv; + hv.vec.reserve(reserved_size); + for (std::size_t ii = 0; ii < inline_arr.size; ++ii) { + hv.vec.push_back(inline_index(ii)); + } + new (&m_repr.vector) heap_vector(std::move(hv)); + } + + static constexpr auto kBitsPerWord = 8 * sizeof(std::size_t); + static constexpr auto kWords = (kRequestedInlineSize + kBitsPerWord - 1) / kBitsPerWord; + static constexpr auto kInlineSize = kWords * kBitsPerWord; + + using repr_type = inline_array_or_vector; + repr_type m_repr; + + bool is_inline() const { return m_repr.is_inline(); } +}; + +PYBIND11_NAMESPACE_END(detail) +PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/include/pybind11/pybind11.h b/include/pybind11/pybind11.h index 117fecabf0..dcd245e981 100644 --- a/include/pybind11/pybind11.h +++ b/include/pybind11/pybind11.h @@ -1048,12 +1048,12 @@ class cpp_function : public function { } #endif - std::vector second_pass_convert; + args_convert_vector<5> second_pass_convert; if (overloaded) { // We're in the first no-convert pass, so swap out the conversion flags for a // set of all-false flags. If the call fails, we'll swap the flags back in for // the conversion-allowed call below. - second_pass_convert.resize(func.nargs, false); + second_pass_convert = args_convert_vector<5>(func.nargs, false); call.args_convert.swap(second_pass_convert); } diff --git a/tests/extra_python_package/test_files.py b/tests/extra_python_package/test_files.py index 63e59f65a6..d3452aa383 100644 --- a/tests/extra_python_package/test_files.py +++ b/tests/extra_python_package/test_files.py @@ -76,6 +76,7 @@ } detail_headers = { + "include/pybind11/detail/argument_vector.h", "include/pybind11/detail/class.h", "include/pybind11/detail/common.h", "include/pybind11/detail/cpp_conduit.h", diff --git a/tests/test_embed/CMakeLists.txt b/tests/test_embed/CMakeLists.txt index 23c3336437..14dda54df2 100644 --- a/tests/test_embed/CMakeLists.txt +++ b/tests/test_embed/CMakeLists.txt @@ -33,7 +33,8 @@ if(PYBIND11_TEST_SMART_HOLDER) -DPYBIND11_RUN_TESTING_WITH_SMART_HOLDER_AS_DEFAULT_BUT_NEVER_USE_IN_PRODUCTION_PLEASE) endif() -add_executable(test_embed catch.cpp test_interpreter.cpp test_subinterpreter.cpp) +add_executable(test_embed catch.cpp test_args_convert_vector.cpp test_argument_vector.cpp + test_interpreter.cpp test_subinterpreter.cpp) pybind11_enable_warnings(test_embed) target_link_libraries(test_embed PRIVATE pybind11::embed Catch2::Catch2 Threads::Threads) diff --git a/tests/test_embed/test_args_convert_vector.cpp b/tests/test_embed/test_args_convert_vector.cpp new file mode 100644 index 0000000000..1db4d120f0 --- /dev/null +++ b/tests/test_embed/test_args_convert_vector.cpp @@ -0,0 +1,81 @@ +#include "pybind11/pybind11.h" +#include "catch.hpp" + +namespace py = pybind11; + +// 5 is chosen to match the production value. It doesn't really matter +// what we pick because the actual inline size is going to be 8 * +// sizeof(size_t) given the implementation at the time of writing. +using args_convert_vector = py::detail::args_convert_vector<5>; + +namespace { +template +std::vector get_sample_vectors() { + std::vector result; + result.emplace_back(); + for (const auto sz : {0, 4, 5, 6, 31, 32, 33, 63, 64, 65}) { + for (const bool b : {false, true}) { + result.emplace_back(static_cast(sz), b); + } + } + return result; +} + +void require_vector_matches_sample(const args_convert_vector &actual, + const std::vector &expected) { + REQUIRE(actual.size() == expected.size()); + for (size_t ii = 0; ii < actual.size(); ++ii) { + REQUIRE(actual[ii] == expected[ii]); + } +} + +template +void mutation_test_with_samples(ActualMutationFunc actual_mutation_func, + ExpectedMutationFunc expected_mutation_func) { + auto sample_contents = get_sample_vectors>(); + auto samples = get_sample_vectors(); + for (size_t ii = 0; ii < samples.size(); ++ii) { + auto &actual = samples[ii]; + auto &expected = sample_contents[ii]; + + actual_mutation_func(actual); + expected_mutation_func(expected); + require_vector_matches_sample(actual, expected); + } +} +} // namespace + +// I would like to write [capture](auto& vec) block inline, but we +// have to work with C++11, which doesn't have generic lambdas. +#define MUTATION_LAMBDA(capture, block) \ + [capture](args_convert_vector & vec) block, [capture](std::vector & vec) block + +// For readability, rather than having ugly empty arguments. +#define NO_CAPTURE + +TEST_CASE("check sample args_convert_vector contents") { + mutation_test_with_samples(MUTATION_LAMBDA(NO_CAPTURE, { (void) vec; })); +} + +TEST_CASE("args_convert_vector push_back") { + for (const bool b : {false, true}) { + mutation_test_with_samples(MUTATION_LAMBDA(b, { vec.push_back(b); })); + } +} + +TEST_CASE("args_convert_vector reserve") { + for (std::size_t ii = 0; ii < 4; ++ii) { + mutation_test_with_samples(MUTATION_LAMBDA(ii, { vec.reserve(ii); })); + } +} + +TEST_CASE("args_convert_vector reserve then push_back") { + for (std::size_t ii = 0; ii < 4; ++ii) { + for (const bool b : {false, true}) { + mutation_test_with_samples(MUTATION_LAMBDA(=, { + vec.reserve(ii); + vec.push_back(b); + })); + } + } +} diff --git a/tests/test_embed/test_argument_vector.cpp b/tests/test_embed/test_argument_vector.cpp new file mode 100644 index 0000000000..bfa64d0e75 --- /dev/null +++ b/tests/test_embed/test_argument_vector.cpp @@ -0,0 +1,93 @@ +#include "pybind11/pybind11.h" +#include "catch.hpp" + +namespace py = pybind11; + +// 2 is chosen because it is the smallest number (keeping tests short) +// where we can create non-empty vectors whose size is the inline size +// plus or minus 1. +using argument_vector = py::detail::argument_vector<2>; + +namespace { +argument_vector to_argument_vector(const std::vector &v) { + argument_vector result; + result.reserve(v.size()); + for (const auto x : v) { + result.push_back(x); + } + return result; +} + +std::vector> get_sample_argument_vector_contents() { + return std::vector>{ + {}, + {py::handle(Py_None)}, + {py::handle(Py_None), py::handle(Py_False)}, + {py::handle(Py_None), py::handle(Py_False), py::handle(Py_True)}, + }; +} + +std::vector get_sample_argument_vectors() { + std::vector result; + for (const auto &vec : get_sample_argument_vector_contents()) { + result.push_back(to_argument_vector(vec)); + } + return result; +} + +void require_vector_matches_sample(const argument_vector &actual, + const std::vector &expected) { + REQUIRE(actual.size() == expected.size()); + for (size_t ii = 0; ii < actual.size(); ++ii) { + REQUIRE(actual[ii].ptr() == expected[ii].ptr()); + } +} + +template +void mutation_test_with_samples(ActualMutationFunc actual_mutation_func, + ExpectedMutationFunc expected_mutation_func) { + auto sample_contents = get_sample_argument_vector_contents(); + auto samples = get_sample_argument_vectors(); + for (size_t ii = 0; ii < samples.size(); ++ii) { + auto &actual = samples[ii]; + auto &expected = sample_contents[ii]; + + actual_mutation_func(actual); + expected_mutation_func(expected); + require_vector_matches_sample(actual, expected); + } +} + +} // namespace + +// I would like to write [capture](auto& vec) block inline, but we +// have to work with C++11, which doesn't have generic lambdas. +#define MUTATION_LAMBDA(capture, block) \ + [capture](argument_vector & vec) block, [capture](std::vector & vec) block + +// For readability, rather than having ugly empty arguments. +#define NO_CAPTURE + +TEST_CASE("check sample argument_vector contents") { + mutation_test_with_samples(MUTATION_LAMBDA(NO_CAPTURE, { (void) vec; })); +} + +TEST_CASE("argument_vector push_back") { + mutation_test_with_samples( + MUTATION_LAMBDA(NO_CAPTURE, { vec.push_back(py::handle(Py_None)); })); +} + +TEST_CASE("argument_vector reserve") { + for (std::size_t ii = 0; ii < 4; ++ii) { + mutation_test_with_samples(MUTATION_LAMBDA(ii, { vec.reserve(ii); })); + } +} + +TEST_CASE("argument_vector reserve then push_back") { + for (std::size_t ii = 0; ii < 4; ++ii) { + mutation_test_with_samples(MUTATION_LAMBDA(ii, { + vec.reserve(ii); + vec.push_back(py::handle(Py_True)); + })); + } +} From 09fc719485b74d95c24c79a2f54223c77655a51e Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Thu, 4 Sep 2025 13:11:44 -0700 Subject: [PATCH 02/11] clang-tidy --- include/pybind11/detail/argument_vector.h | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/include/pybind11/detail/argument_vector.h b/include/pybind11/detail/argument_vector.h index 3aa9687462..f8a0023443 100644 --- a/include/pybind11/detail/argument_vector.h +++ b/include/pybind11/detail/argument_vector.h @@ -134,27 +134,24 @@ struct argument_vector { std::size_t size() const { if (is_inline()) { return m_repr.array.size; - } else { - return m_repr.vector.vec.size(); } + return m_repr.vector.vec.size(); } handle &operator[](std::size_t idx) { assert(idx < size()); if (is_inline()) { return m_repr.array.arr[idx]; - } else { - return m_repr.vector.vec[idx]; } + return m_repr.vector.vec[idx]; } handle operator[](std::size_t idx) const { assert(idx < size()); if (is_inline()) { return m_repr.array.arr[idx]; - } else { - return m_repr.vector.vec[idx]; } + return m_repr.vector.vec[idx]; } void push_back(handle x) { @@ -229,9 +226,8 @@ struct args_convert_vector { std::size_t size() const { if (is_inline()) { return m_repr.array.size; - } else { - return m_repr.vector.vec.size(); } + return m_repr.vector.vec.size(); } void reserve(std::size_t sz) { @@ -247,10 +243,9 @@ struct args_convert_vector { bool operator[](std::size_t idx) const { if (is_inline()) { return inline_index(idx); - } else { - assert(idx < m_repr.vector.vec.size()); - return m_repr.vector.vec[idx]; } + assert(idx < m_repr.vector.vec.size()); + return m_repr.vector.vec[idx]; } void push_back(bool b) { @@ -276,7 +271,7 @@ struct args_convert_vector { } } - void swap(args_convert_vector &rhs) { std::swap(m_repr, rhs.m_repr); } + void swap(args_convert_vector &rhs) noexcept { std::swap(m_repr, rhs.m_repr); } private: struct WordAndBitIndex { From 8dd2eabb6651dc0f968fb06ea84ca3b4c9745dc7 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Thu, 4 Sep 2025 15:51:17 -0700 Subject: [PATCH 03/11] more clang-tidy --- tests/test_embed/test_args_convert_vector.cpp | 1 + tests/test_embed/test_argument_vector.cpp | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/test_embed/test_args_convert_vector.cpp b/tests/test_embed/test_args_convert_vector.cpp index 1db4d120f0..a24020756d 100644 --- a/tests/test_embed/test_args_convert_vector.cpp +++ b/tests/test_embed/test_args_convert_vector.cpp @@ -47,6 +47,7 @@ void mutation_test_with_samples(ActualMutationFunc actual_mutation_func, // I would like to write [capture](auto& vec) block inline, but we // have to work with C++11, which doesn't have generic lambdas. +// NOLINTNEXTLINE(bugprone-macro-parentheses) #define MUTATION_LAMBDA(capture, block) \ [capture](args_convert_vector & vec) block, [capture](std::vector & vec) block diff --git a/tests/test_embed/test_argument_vector.cpp b/tests/test_embed/test_argument_vector.cpp index bfa64d0e75..42e290d329 100644 --- a/tests/test_embed/test_argument_vector.cpp +++ b/tests/test_embed/test_argument_vector.cpp @@ -62,6 +62,7 @@ void mutation_test_with_samples(ActualMutationFunc actual_mutation_func, // I would like to write [capture](auto& vec) block inline, but we // have to work with C++11, which doesn't have generic lambdas. +// NOLINTNEXTLINE(bugprone-macro-parentheses) #define MUTATION_LAMBDA(capture, block) \ [capture](argument_vector & vec) block, [capture](std::vector & vec) block @@ -73,8 +74,7 @@ TEST_CASE("check sample argument_vector contents") { } TEST_CASE("argument_vector push_back") { - mutation_test_with_samples( - MUTATION_LAMBDA(NO_CAPTURE, { vec.push_back(py::handle(Py_None)); })); + mutation_test_with_samples(MUTATION_LAMBDA(NO_CAPTURE, { vec.emplace_back(Py_None); })); } TEST_CASE("argument_vector reserve") { @@ -87,7 +87,7 @@ TEST_CASE("argument_vector reserve then push_back") { for (std::size_t ii = 0; ii < 4; ++ii) { mutation_test_with_samples(MUTATION_LAMBDA(ii, { vec.reserve(ii); - vec.push_back(py::handle(Py_True)); + vec.emplace_back(Py_True); })); } } From 8219f6615fde12915773e0981fa916115bfb839d Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Thu, 4 Sep 2025 16:20:16 -0700 Subject: [PATCH 04/11] clang-tidy NOLINTBEGIN/END instead of NOLINTNEXTLINE --- tests/test_embed/test_args_convert_vector.cpp | 3 ++- tests/test_embed/test_argument_vector.cpp | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_embed/test_args_convert_vector.cpp b/tests/test_embed/test_args_convert_vector.cpp index a24020756d..e1dbbb40d4 100644 --- a/tests/test_embed/test_args_convert_vector.cpp +++ b/tests/test_embed/test_args_convert_vector.cpp @@ -47,9 +47,10 @@ void mutation_test_with_samples(ActualMutationFunc actual_mutation_func, // I would like to write [capture](auto& vec) block inline, but we // have to work with C++11, which doesn't have generic lambdas. -// NOLINTNEXTLINE(bugprone-macro-parentheses) +// NOLINTBEGIN(bugprone-macro-parentheses) #define MUTATION_LAMBDA(capture, block) \ [capture](args_convert_vector & vec) block, [capture](std::vector & vec) block +// NOLINTEND(bugprone-macro-parentheses) // For readability, rather than having ugly empty arguments. #define NO_CAPTURE diff --git a/tests/test_embed/test_argument_vector.cpp b/tests/test_embed/test_argument_vector.cpp index 42e290d329..9cf302a9b1 100644 --- a/tests/test_embed/test_argument_vector.cpp +++ b/tests/test_embed/test_argument_vector.cpp @@ -62,9 +62,10 @@ void mutation_test_with_samples(ActualMutationFunc actual_mutation_func, // I would like to write [capture](auto& vec) block inline, but we // have to work with C++11, which doesn't have generic lambdas. -// NOLINTNEXTLINE(bugprone-macro-parentheses) +// NOLINTBEGIN(bugprone-macro-parentheses) #define MUTATION_LAMBDA(capture, block) \ [capture](argument_vector & vec) block, [capture](std::vector & vec) block +// NOLINTEND(bugprone-macro-parentheses) // For readability, rather than having ugly empty arguments. #define NO_CAPTURE From e4c68beec9a79f59e48cde981a68f7bb1842e37c Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Thu, 4 Sep 2025 16:53:50 -0700 Subject: [PATCH 05/11] forgot to increase inline size after removing std::variant --- include/pybind11/cast.h | 4 ++-- include/pybind11/pybind11.h | 4 ++-- tests/test_embed/test_args_convert_vector.cpp | 9 +++++---- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/include/pybind11/cast.h b/include/pybind11/cast.h index 88e88d4e62..0935921dd0 100644 --- a/include/pybind11/cast.h +++ b/include/pybind11/cast.h @@ -2048,10 +2048,10 @@ struct function_call { /// Arguments passed to the function: /// (Inline size chosen mostly arbitrarily; 5 should pad function_call out to two cache lines /// (16 pointers) in size.) - argument_vector<5> args; + argument_vector<6> args; /// The `convert` value the arguments should be loaded with - args_convert_vector<5> args_convert; + args_convert_vector<6> args_convert; /// Extra references for the optional `py::args` and/or `py::kwargs` arguments (which, if /// present, are also in `args` but without a reference). diff --git a/include/pybind11/pybind11.h b/include/pybind11/pybind11.h index dcd245e981..8569c2657e 100644 --- a/include/pybind11/pybind11.h +++ b/include/pybind11/pybind11.h @@ -1048,12 +1048,12 @@ class cpp_function : public function { } #endif - args_convert_vector<5> second_pass_convert; + args_convert_vector<6> second_pass_convert; if (overloaded) { // We're in the first no-convert pass, so swap out the conversion flags for a // set of all-false flags. If the call fails, we'll swap the flags back in for // the conversion-allowed call below. - second_pass_convert = args_convert_vector<5>(func.nargs, false); + second_pass_convert = args_convert_vector<6>(func.nargs, false); call.args_convert.swap(second_pass_convert); } diff --git a/tests/test_embed/test_args_convert_vector.cpp b/tests/test_embed/test_args_convert_vector.cpp index e1dbbb40d4..848e6fd5cd 100644 --- a/tests/test_embed/test_args_convert_vector.cpp +++ b/tests/test_embed/test_args_convert_vector.cpp @@ -3,10 +3,11 @@ namespace py = pybind11; -// 5 is chosen to match the production value. It doesn't really matter -// what we pick because the actual inline size is going to be 8 * -// sizeof(size_t) given the implementation at the time of writing. -using args_convert_vector = py::detail::args_convert_vector<5>; +// Inline size is chosen to match the production value. It doesn't +// really matter what we pick because the actual inline size is going +// to be 8 * sizeof(size_t) given the implementation at the time of +// writing. +using args_convert_vector = py::detail::args_convert_vector<6>; namespace { template From 3c435f1bf77ff05cc13a906d284188d2991399a2 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 5 Sep 2025 15:46:30 -0700 Subject: [PATCH 06/11] constexpr arg_vector_small_size, use move instead of swap to hopefully clarify second_pass_convert --- include/pybind11/cast.h | 8 +++++--- include/pybind11/pybind11.h | 7 ++++--- tests/test_embed/test_args_convert_vector.cpp | 6 +----- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/include/pybind11/cast.h b/include/pybind11/cast.h index 0935921dd0..c68b5fa648 100644 --- a/include/pybind11/cast.h +++ b/include/pybind11/cast.h @@ -2038,6 +2038,8 @@ using is_pos_only = std::is_same, pos_only>; // forward declaration (definition in attr.h) struct function_record; +constexpr std::size_t arg_vector_small_size = 6; + /// Internal data associated with a single function call struct function_call { function_call(const function_record &f, handle p); // Implementation in attr.h @@ -2046,12 +2048,12 @@ struct function_call { const function_record &func; /// Arguments passed to the function: - /// (Inline size chosen mostly arbitrarily; 5 should pad function_call out to two cache lines + /// (Inline size chosen mostly arbitrarily; 6 should pad function_call out to two cache lines /// (16 pointers) in size.) - argument_vector<6> args; + argument_vector args; /// The `convert` value the arguments should be loaded with - args_convert_vector<6> args_convert; + args_convert_vector args_convert; /// Extra references for the optional `py::args` and/or `py::kwargs` arguments (which, if /// present, are also in `args` but without a reference). diff --git a/include/pybind11/pybind11.h b/include/pybind11/pybind11.h index 8569c2657e..515aab1414 100644 --- a/include/pybind11/pybind11.h +++ b/include/pybind11/pybind11.h @@ -1048,13 +1048,14 @@ class cpp_function : public function { } #endif - args_convert_vector<6> second_pass_convert; + args_convert_vector second_pass_convert; if (overloaded) { // We're in the first no-convert pass, so swap out the conversion flags for a // set of all-false flags. If the call fails, we'll swap the flags back in for // the conversion-allowed call below. - second_pass_convert = args_convert_vector<6>(func.nargs, false); - call.args_convert.swap(second_pass_convert); + second_pass_convert = std::move(call.args_convert); + call.args_convert + = args_convert_vector(func.nargs, false); } // 6. Call the function. diff --git a/tests/test_embed/test_args_convert_vector.cpp b/tests/test_embed/test_args_convert_vector.cpp index 848e6fd5cd..7ce2d713fa 100644 --- a/tests/test_embed/test_args_convert_vector.cpp +++ b/tests/test_embed/test_args_convert_vector.cpp @@ -3,11 +3,7 @@ namespace py = pybind11; -// Inline size is chosen to match the production value. It doesn't -// really matter what we pick because the actual inline size is going -// to be 8 * sizeof(size_t) given the implementation at the time of -// writing. -using args_convert_vector = py::detail::args_convert_vector<6>; +using args_convert_vector = py::detail::args_convert_vector; namespace { template From f7bd49e31bddb1193145208d4c61fc145d3d6252 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Mon, 8 Sep 2025 09:52:07 -0700 Subject: [PATCH 07/11] rename test_embed to test_low_level --- tests/CMakeLists.txt | 4 ++-- .../subdirectory_embed/CMakeLists.txt | 6 +++--- tests/{test_embed => test_low_level}/CMakeLists.txt | 12 ++++++------ tests/{test_embed => test_low_level}/catch.cpp | 2 +- .../external_module.cpp | 0 .../test_args_convert_vector.cpp | 0 .../test_argument_vector.cpp | 0 .../test_interpreter.cpp | 2 +- .../test_interpreter.py | 0 .../test_subinterpreter.cpp | 0 .../test_trampoline.py | 0 11 files changed, 13 insertions(+), 13 deletions(-) rename tests/{test_embed => test_low_level}/CMakeLists.txt (84%) rename tests/{test_embed => test_low_level}/catch.cpp (93%) rename tests/{test_embed => test_low_level}/external_module.cpp (100%) rename tests/{test_embed => test_low_level}/test_args_convert_vector.cpp (100%) rename tests/{test_embed => test_low_level}/test_argument_vector.cpp (100%) rename tests/{test_embed => test_low_level}/test_interpreter.cpp (99%) rename tests/{test_embed => test_low_level}/test_interpreter.py (100%) rename tests/{test_embed => test_low_level}/test_subinterpreter.cpp (100%) rename tests/{test_embed => test_low_level}/test_trampoline.py (100%) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 34b22a57ae..96c12f954e 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -647,8 +647,8 @@ if(NOT PYBIND11_CUDA_TESTS) # Test pure C++ code (not depending on Python). Provides the `test_pure_cpp` target. add_subdirectory(pure_cpp) - # Test embedding the interpreter. Provides the `cpptest` target. - add_subdirectory(test_embed) + # Test C++ code that depends on Python, such as embedding the interpreter. Provides the `cpptest` target. + add_subdirectory(test_low_level) # Test CMake build using functions and targets from subdirectory or installed location add_subdirectory(test_cmake_build) diff --git a/tests/test_cmake_build/subdirectory_embed/CMakeLists.txt b/tests/test_cmake_build/subdirectory_embed/CMakeLists.txt index d84916ea77..786511d5e5 100644 --- a/tests/test_cmake_build/subdirectory_embed/CMakeLists.txt +++ b/tests/test_cmake_build/subdirectory_embed/CMakeLists.txt @@ -26,11 +26,11 @@ add_custom_target( DEPENDS test_subdirectory_embed) # Test custom export group -- PYBIND11_EXPORT_NAME -add_library(test_embed_lib ../embed.cpp) -target_link_libraries(test_embed_lib PRIVATE pybind11::embed) +add_library(test_low_level_lib ../embed.cpp) +target_link_libraries(test_low_level_lib PRIVATE pybind11::embed) install( - TARGETS test_embed_lib + TARGETS test_low_level_lib EXPORT test_export ARCHIVE DESTINATION bin LIBRARY DESTINATION lib diff --git a/tests/test_embed/CMakeLists.txt b/tests/test_low_level/CMakeLists.txt similarity index 84% rename from tests/test_embed/CMakeLists.txt rename to tests/test_low_level/CMakeLists.txt index 14dda54df2..6b0be26792 100644 --- a/tests/test_embed/CMakeLists.txt +++ b/tests/test_low_level/CMakeLists.txt @@ -33,11 +33,11 @@ if(PYBIND11_TEST_SMART_HOLDER) -DPYBIND11_RUN_TESTING_WITH_SMART_HOLDER_AS_DEFAULT_BUT_NEVER_USE_IN_PRODUCTION_PLEASE) endif() -add_executable(test_embed catch.cpp test_args_convert_vector.cpp test_argument_vector.cpp - test_interpreter.cpp test_subinterpreter.cpp) -pybind11_enable_warnings(test_embed) +add_executable(test_low_level catch.cpp test_args_convert_vector.cpp test_argument_vector.cpp + test_interpreter.cpp test_subinterpreter.cpp) +pybind11_enable_warnings(test_low_level) -target_link_libraries(test_embed PRIVATE pybind11::embed Catch2::Catch2 Threads::Threads) +target_link_libraries(test_low_level PRIVATE pybind11::embed Catch2::Catch2 Threads::Threads) if(NOT CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_CURRENT_BINARY_DIR) file(COPY test_interpreter.py test_trampoline.py DESTINATION "${CMAKE_CURRENT_BINARY_DIR}") @@ -45,8 +45,8 @@ endif() add_custom_target( cpptest - COMMAND "$" - DEPENDS test_embed + COMMAND "$" + DEPENDS test_low_level WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}") pybind11_add_module(external_module THIN_LTO external_module.cpp) diff --git a/tests/test_embed/catch.cpp b/tests/test_low_level/catch.cpp similarity index 93% rename from tests/test_embed/catch.cpp rename to tests/test_low_level/catch.cpp index 558a7a35e5..cd3f8f4fb8 100644 --- a/tests/test_embed/catch.cpp +++ b/tests/test_low_level/catch.cpp @@ -19,7 +19,7 @@ namespace py = pybind11; int main(int argc, char *argv[]) { // Setup for TEST_CASE in test_interpreter.cpp, tagging on a large random number: - std::string updated_pythonpath("pybind11_test_embed_PYTHONPATH_2099743835476552"); + std::string updated_pythonpath("pybind11_test_low_level_PYTHONPATH_2099743835476552"); const char *preexisting_pythonpath = getenv("PYTHONPATH"); if (preexisting_pythonpath != nullptr) { #if defined(_WIN32) diff --git a/tests/test_embed/external_module.cpp b/tests/test_low_level/external_module.cpp similarity index 100% rename from tests/test_embed/external_module.cpp rename to tests/test_low_level/external_module.cpp diff --git a/tests/test_embed/test_args_convert_vector.cpp b/tests/test_low_level/test_args_convert_vector.cpp similarity index 100% rename from tests/test_embed/test_args_convert_vector.cpp rename to tests/test_low_level/test_args_convert_vector.cpp diff --git a/tests/test_embed/test_argument_vector.cpp b/tests/test_low_level/test_argument_vector.cpp similarity index 100% rename from tests/test_embed/test_argument_vector.cpp rename to tests/test_low_level/test_argument_vector.cpp diff --git a/tests/test_embed/test_interpreter.cpp b/tests/test_low_level/test_interpreter.cpp similarity index 99% rename from tests/test_embed/test_interpreter.cpp rename to tests/test_low_level/test_interpreter.cpp index 0e6c17a777..a4e1b68d01 100644 --- a/tests/test_embed/test_interpreter.cpp +++ b/tests/test_low_level/test_interpreter.cpp @@ -95,7 +95,7 @@ TEST_CASE("PYTHONPATH is used to update sys.path") { // The setup for this TEST_CASE is in catch.cpp! auto sys_path = py::str(py::module_::import("sys").attr("path")).cast(); REQUIRE_THAT(sys_path, - Catch::Matchers::Contains("pybind11_test_embed_PYTHONPATH_2099743835476552")); + Catch::Matchers::Contains("pybind11_test_low_level_PYTHONPATH_2099743835476552")); } TEST_CASE("Pass classes and data between modules defined in C++ and Python") { diff --git a/tests/test_embed/test_interpreter.py b/tests/test_low_level/test_interpreter.py similarity index 100% rename from tests/test_embed/test_interpreter.py rename to tests/test_low_level/test_interpreter.py diff --git a/tests/test_embed/test_subinterpreter.cpp b/tests/test_low_level/test_subinterpreter.cpp similarity index 100% rename from tests/test_embed/test_subinterpreter.cpp rename to tests/test_low_level/test_subinterpreter.cpp diff --git a/tests/test_embed/test_trampoline.py b/tests/test_low_level/test_trampoline.py similarity index 100% rename from tests/test_embed/test_trampoline.py rename to tests/test_low_level/test_trampoline.py From 6e2da02652ba5529a37b2d7bd124ef1446e344fd Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Mon, 8 Sep 2025 16:31:51 -0700 Subject: [PATCH 08/11] rename test_low_level to test_with_catch --- tests/CMakeLists.txt | 2 +- .../subdirectory_embed/CMakeLists.txt | 6 +++--- .../CMakeLists.txt | 12 ++++++------ tests/{test_low_level => test_with_catch}/catch.cpp | 2 +- .../external_module.cpp | 0 .../test_args_convert_vector.cpp | 0 .../test_argument_vector.cpp | 0 .../test_interpreter.cpp | 5 +++-- .../test_interpreter.py | 0 .../test_subinterpreter.cpp | 0 .../test_trampoline.py | 0 11 files changed, 14 insertions(+), 13 deletions(-) rename tests/{test_low_level => test_with_catch}/CMakeLists.txt (84%) rename tests/{test_low_level => test_with_catch}/catch.cpp (93%) rename tests/{test_low_level => test_with_catch}/external_module.cpp (100%) rename tests/{test_low_level => test_with_catch}/test_args_convert_vector.cpp (100%) rename tests/{test_low_level => test_with_catch}/test_argument_vector.cpp (100%) rename tests/{test_low_level => test_with_catch}/test_interpreter.cpp (99%) rename tests/{test_low_level => test_with_catch}/test_interpreter.py (100%) rename tests/{test_low_level => test_with_catch}/test_subinterpreter.cpp (100%) rename tests/{test_low_level => test_with_catch}/test_trampoline.py (100%) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 96c12f954e..8ac236d404 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -648,7 +648,7 @@ if(NOT PYBIND11_CUDA_TESTS) add_subdirectory(pure_cpp) # Test C++ code that depends on Python, such as embedding the interpreter. Provides the `cpptest` target. - add_subdirectory(test_low_level) + add_subdirectory(test_with_catch) # Test CMake build using functions and targets from subdirectory or installed location add_subdirectory(test_cmake_build) diff --git a/tests/test_cmake_build/subdirectory_embed/CMakeLists.txt b/tests/test_cmake_build/subdirectory_embed/CMakeLists.txt index 786511d5e5..bb83a6e6ef 100644 --- a/tests/test_cmake_build/subdirectory_embed/CMakeLists.txt +++ b/tests/test_cmake_build/subdirectory_embed/CMakeLists.txt @@ -26,11 +26,11 @@ add_custom_target( DEPENDS test_subdirectory_embed) # Test custom export group -- PYBIND11_EXPORT_NAME -add_library(test_low_level_lib ../embed.cpp) -target_link_libraries(test_low_level_lib PRIVATE pybind11::embed) +add_library(test_with_catch_lib ../embed.cpp) +target_link_libraries(test_with_catch_lib PRIVATE pybind11::embed) install( - TARGETS test_low_level_lib + TARGETS test_with_catch_lib EXPORT test_export ARCHIVE DESTINATION bin LIBRARY DESTINATION lib diff --git a/tests/test_low_level/CMakeLists.txt b/tests/test_with_catch/CMakeLists.txt similarity index 84% rename from tests/test_low_level/CMakeLists.txt rename to tests/test_with_catch/CMakeLists.txt index 6b0be26792..136537e67f 100644 --- a/tests/test_low_level/CMakeLists.txt +++ b/tests/test_with_catch/CMakeLists.txt @@ -33,11 +33,11 @@ if(PYBIND11_TEST_SMART_HOLDER) -DPYBIND11_RUN_TESTING_WITH_SMART_HOLDER_AS_DEFAULT_BUT_NEVER_USE_IN_PRODUCTION_PLEASE) endif() -add_executable(test_low_level catch.cpp test_args_convert_vector.cpp test_argument_vector.cpp - test_interpreter.cpp test_subinterpreter.cpp) -pybind11_enable_warnings(test_low_level) +add_executable(test_with_catch catch.cpp test_args_convert_vector.cpp test_argument_vector.cpp + test_interpreter.cpp test_subinterpreter.cpp) +pybind11_enable_warnings(test_with_catch) -target_link_libraries(test_low_level PRIVATE pybind11::embed Catch2::Catch2 Threads::Threads) +target_link_libraries(test_with_catch PRIVATE pybind11::embed Catch2::Catch2 Threads::Threads) if(NOT CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_CURRENT_BINARY_DIR) file(COPY test_interpreter.py test_trampoline.py DESTINATION "${CMAKE_CURRENT_BINARY_DIR}") @@ -45,8 +45,8 @@ endif() add_custom_target( cpptest - COMMAND "$" - DEPENDS test_low_level + COMMAND "$" + DEPENDS test_with_catch WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}") pybind11_add_module(external_module THIN_LTO external_module.cpp) diff --git a/tests/test_low_level/catch.cpp b/tests/test_with_catch/catch.cpp similarity index 93% rename from tests/test_low_level/catch.cpp rename to tests/test_with_catch/catch.cpp index cd3f8f4fb8..5bd8b3880e 100644 --- a/tests/test_low_level/catch.cpp +++ b/tests/test_with_catch/catch.cpp @@ -19,7 +19,7 @@ namespace py = pybind11; int main(int argc, char *argv[]) { // Setup for TEST_CASE in test_interpreter.cpp, tagging on a large random number: - std::string updated_pythonpath("pybind11_test_low_level_PYTHONPATH_2099743835476552"); + std::string updated_pythonpath("pybind11_test_with_catch_PYTHONPATH_2099743835476552"); const char *preexisting_pythonpath = getenv("PYTHONPATH"); if (preexisting_pythonpath != nullptr) { #if defined(_WIN32) diff --git a/tests/test_low_level/external_module.cpp b/tests/test_with_catch/external_module.cpp similarity index 100% rename from tests/test_low_level/external_module.cpp rename to tests/test_with_catch/external_module.cpp diff --git a/tests/test_low_level/test_args_convert_vector.cpp b/tests/test_with_catch/test_args_convert_vector.cpp similarity index 100% rename from tests/test_low_level/test_args_convert_vector.cpp rename to tests/test_with_catch/test_args_convert_vector.cpp diff --git a/tests/test_low_level/test_argument_vector.cpp b/tests/test_with_catch/test_argument_vector.cpp similarity index 100% rename from tests/test_low_level/test_argument_vector.cpp rename to tests/test_with_catch/test_argument_vector.cpp diff --git a/tests/test_low_level/test_interpreter.cpp b/tests/test_with_catch/test_interpreter.cpp similarity index 99% rename from tests/test_low_level/test_interpreter.cpp rename to tests/test_with_catch/test_interpreter.cpp index a4e1b68d01..d227eecddc 100644 --- a/tests/test_low_level/test_interpreter.cpp +++ b/tests/test_with_catch/test_interpreter.cpp @@ -94,8 +94,9 @@ PYBIND11_EMBEDDED_MODULE(throw_error_already_set, ) { TEST_CASE("PYTHONPATH is used to update sys.path") { // The setup for this TEST_CASE is in catch.cpp! auto sys_path = py::str(py::module_::import("sys").attr("path")).cast(); - REQUIRE_THAT(sys_path, - Catch::Matchers::Contains("pybind11_test_low_level_PYTHONPATH_2099743835476552")); + REQUIRE_THAT( + sys_path, + Catch::Matchers::Contains("pybind11_test_with_catch_PYTHONPATH_2099743835476552")); } TEST_CASE("Pass classes and data between modules defined in C++ and Python") { diff --git a/tests/test_low_level/test_interpreter.py b/tests/test_with_catch/test_interpreter.py similarity index 100% rename from tests/test_low_level/test_interpreter.py rename to tests/test_with_catch/test_interpreter.py diff --git a/tests/test_low_level/test_subinterpreter.cpp b/tests/test_with_catch/test_subinterpreter.cpp similarity index 100% rename from tests/test_low_level/test_subinterpreter.cpp rename to tests/test_with_catch/test_subinterpreter.cpp diff --git a/tests/test_low_level/test_trampoline.py b/tests/test_with_catch/test_trampoline.py similarity index 100% rename from tests/test_low_level/test_trampoline.py rename to tests/test_with_catch/test_trampoline.py From 40aaf36c63da8aac23b1bf4c649238f096bbb73f Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 12 Sep 2025 16:49:32 -0700 Subject: [PATCH 09/11] Be careful to NOINLINE slow paths --- include/pybind11/detail/argument_vector.h | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/include/pybind11/detail/argument_vector.h b/include/pybind11/detail/argument_vector.h index f8a0023443..915fa00694 100644 --- a/include/pybind11/detail/argument_vector.h +++ b/include/pybind11/detail/argument_vector.h @@ -159,12 +159,12 @@ struct argument_vector { auto &ha = m_repr.array; if (ha.size == N) { move_to_vector_with_reserved_size(N + 1); - m_repr.vector.vec.push_back(x); + push_back_slow_path(x); } else { ha.arr[ha.size++] = x; } } else { - m_repr.vector.vec.push_back(x); + push_back_slow_path(x); } } @@ -179,7 +179,7 @@ struct argument_vector { move_to_vector_with_reserved_size(sz); } } else { - m_repr.vector.vec.reserve(sz); + reserve_slow_path(sz); } } @@ -187,7 +187,7 @@ struct argument_vector { using repr_type = inline_array_or_vector; repr_type m_repr; - void move_to_vector_with_reserved_size(std::size_t reserved_size) { + PYBIND11_NOINLINE void move_to_vector_with_reserved_size(std::size_t reserved_size) { assert(is_inline()); auto &ha = m_repr.array; using heap_vector = typename repr_type::heap_vector; @@ -197,6 +197,10 @@ struct argument_vector { new (&m_repr.vector) heap_vector(std::move(hv)); } + PYBIND11_NOINLINE void push_back_slow_path(handle x) { m_repr.vector.vec.push_back(x); } + + PYBIND11_NOINLINE void reserve_slow_path(std::size_t sz) { m_repr.vector.vec.reserve(sz); } + bool is_inline() const { return m_repr.is_inline(); } }; @@ -253,7 +257,7 @@ struct args_convert_vector { auto &ha = m_repr.array; if (ha.size == kInlineSize) { move_to_vector_with_reserved_size(kInlineSize + 1); - m_repr.vector.vec.push_back(b); + push_back_slow_path(b); } else { assert(ha.size < kInlineSize); const auto wbi = word_and_bit_index(ha.size++); @@ -267,7 +271,7 @@ struct args_convert_vector { assert(operator[](ha.size - 1) == b); } } else { - m_repr.vector.vec.push_back(b); + push_back_slow_path(b); } } @@ -290,7 +294,7 @@ struct args_convert_vector { return m_repr.array.arr[wbi.word] & (std::size_t(1) << wbi.bit); } - void move_to_vector_with_reserved_size(std::size_t reserved_size) { + PYBIND11_NOINLINE void move_to_vector_with_reserved_size(std::size_t reserved_size) { auto &inline_arr = m_repr.array; using heap_vector = typename repr_type::heap_vector; heap_vector hv; @@ -301,6 +305,8 @@ struct args_convert_vector { new (&m_repr.vector) heap_vector(std::move(hv)); } + PYBIND11_NOINLINE void push_back_slow_path(bool b) { m_repr.vector.vec.push_back(b); } + static constexpr auto kBitsPerWord = 8 * sizeof(std::size_t); static constexpr auto kWords = (kRequestedInlineSize + kBitsPerWord - 1) / kBitsPerWord; static constexpr auto kInlineSize = kWords * kBitsPerWord; From a2c8422ff9014a76c4ad85e48649e6b8fd2a1168 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Thu, 18 Sep 2025 09:52:58 -0700 Subject: [PATCH 10/11] rename array/vector members to iarray/hvector. Move comment per request. Add static_asserts for our untagged union implementation per request. --- include/pybind11/cast.h | 4 +- include/pybind11/detail/argument_vector.h | 93 +++++++++++++---------- 2 files changed, 55 insertions(+), 42 deletions(-) diff --git a/include/pybind11/cast.h b/include/pybind11/cast.h index c68b5fa648..4dcbf26236 100644 --- a/include/pybind11/cast.h +++ b/include/pybind11/cast.h @@ -2038,6 +2038,8 @@ using is_pos_only = std::is_same, pos_only>; // forward declaration (definition in attr.h) struct function_record; +/// (Inline size chosen mostly arbitrarily; 6 should pad function_call out to two cache lines +/// (16 pointers) in size.) constexpr std::size_t arg_vector_small_size = 6; /// Internal data associated with a single function call @@ -2048,8 +2050,6 @@ struct function_call { const function_record &func; /// Arguments passed to the function: - /// (Inline size chosen mostly arbitrarily; 6 should pad function_call out to two cache lines - /// (16 pointers) in size.) argument_vector args; /// The `convert` value the arguments should be loaded with diff --git a/include/pybind11/detail/argument_vector.h b/include/pybind11/detail/argument_vector.h index 915fa00694..1cfce06024 100644 --- a/include/pybind11/detail/argument_vector.h +++ b/include/pybind11/detail/argument_vector.h @@ -16,9 +16,11 @@ #include #include +#include #include #include #include +#include #include #include @@ -33,7 +35,7 @@ PYBIND11_NAMESPACE_BEGIN(detail) // std::variant. Union with the tag packed next to the inline // array's size is smaller anyway, allowing 1 extra handle of // inline storage for free. Compare the layouts (1 line per -// size_t/void*): +// size_t/void*, assuming a 64-bit machine): // With variant, total is N + 2 for N >= 2: // - variant tag (cannot be packed with the array size) // - array size (or first pointer of 3 in std::vector) @@ -61,28 +63,29 @@ union inline_array_or_vector { heap_vector(std::size_t count, VectorT value) : vec(count, value) {} }; - inline_array array; - heap_vector vector; + inline_array iarray; + heap_vector hvector; static_assert(std::is_trivially_move_constructible::value, "ArrayT must be trivially move constructible"); static_assert(std::is_trivially_destructible::value, "ArrayT must be trivially destructible"); - inline_array_or_vector() : array() {} + inline_array_or_vector() : iarray() {} ~inline_array_or_vector() { if (!is_inline()) { - vector.~heap_vector(); + hvector.~heap_vector(); } } + // Disable copy ctor and assignment. inline_array_or_vector(const inline_array_or_vector &) = delete; inline_array_or_vector &operator=(const inline_array_or_vector &) = delete; inline_array_or_vector(inline_array_or_vector &&rhs) noexcept { if (rhs.is_inline()) { - std::memcpy(&array, &rhs.array, sizeof(array)); + std::memcpy(&iarray, &rhs.iarray, sizeof(iarray)); } else { - new (&vector) heap_vector(std::move(rhs.vector)); + new (&hvector) heap_vector(std::move(rhs.hvector)); } assert(is_inline() == rhs.is_inline()); } @@ -94,14 +97,14 @@ union inline_array_or_vector { if (rhs.is_inline()) { if (!is_inline()) { - vector.~heap_vector(); + hvector.~heap_vector(); } - std::memcpy(&array, &rhs.array, sizeof(array)); + std::memcpy(&iarray, &rhs.iarray, sizeof(iarray)); } else { if (is_inline()) { - new (&vector) heap_vector(std::move(rhs.vector)); + new (&hvector) heap_vector(std::move(rhs.hvector)); } else { - vector = std::move(rhs.vector); + hvector = std::move(rhs.hvector); } } return *this; @@ -114,6 +117,14 @@ union inline_array_or_vector { // of bytes. See // https://dev-discuss.pytorch.org/t/unionizing-for-profit-how-to-exploit-the-power-of-unions-in-c/444#the-memcpy-loophole-4 bool result = false; + static_assert(std::is_standard_layout::value, + "untagged union implementation relies on this"); + static_assert(std::is_standard_layout::value, + "untagged union implementation relies on this"); + static_assert(offsetof(inline_array, is_inline) == 0, + "untagged union implementation relies on this"); + static_assert(offsetof(heap_vector, is_inline) == 0, + "untagged union implementation relies on this"); std::memcpy(&result, reinterpret_cast(this), sizeof(bool)); return result; } @@ -126,6 +137,7 @@ struct argument_vector { public: argument_vector() = default; + // Disable copy ctor and assignment. argument_vector(const argument_vector &) = delete; argument_vector &operator=(const argument_vector &) = delete; argument_vector(argument_vector &&) noexcept = default; @@ -133,32 +145,32 @@ struct argument_vector { std::size_t size() const { if (is_inline()) { - return m_repr.array.size; + return m_repr.iarray.size; } - return m_repr.vector.vec.size(); + return m_repr.hvector.vec.size(); } handle &operator[](std::size_t idx) { assert(idx < size()); if (is_inline()) { - return m_repr.array.arr[idx]; + return m_repr.iarray.arr[idx]; } - return m_repr.vector.vec[idx]; + return m_repr.hvector.vec[idx]; } handle operator[](std::size_t idx) const { assert(idx < size()); if (is_inline()) { - return m_repr.array.arr[idx]; + return m_repr.iarray.arr[idx]; } - return m_repr.vector.vec[idx]; + return m_repr.hvector.vec[idx]; } void push_back(handle x) { if (is_inline()) { - auto &ha = m_repr.array; + auto &ha = m_repr.iarray; if (ha.size == N) { - move_to_vector_with_reserved_size(N + 1); + move_to_heap_vector_with_reserved_size(N + 1); push_back_slow_path(x); } else { ha.arr[ha.size++] = x; @@ -176,7 +188,7 @@ struct argument_vector { void reserve(std::size_t sz) { if (is_inline()) { if (sz > N) { - move_to_vector_with_reserved_size(sz); + move_to_heap_vector_with_reserved_size(sz); } } else { reserve_slow_path(sz); @@ -187,19 +199,19 @@ struct argument_vector { using repr_type = inline_array_or_vector; repr_type m_repr; - PYBIND11_NOINLINE void move_to_vector_with_reserved_size(std::size_t reserved_size) { + PYBIND11_NOINLINE void move_to_heap_vector_with_reserved_size(std::size_t reserved_size) { assert(is_inline()); - auto &ha = m_repr.array; + auto &ha = m_repr.iarray; using heap_vector = typename repr_type::heap_vector; heap_vector hv; hv.vec.reserve(reserved_size); std::copy(ha.arr.begin(), ha.arr.begin() + ha.size, std::back_inserter(hv.vec)); - new (&m_repr.vector) heap_vector(std::move(hv)); + new (&m_repr.hvector) heap_vector(std::move(hv)); } - PYBIND11_NOINLINE void push_back_slow_path(handle x) { m_repr.vector.vec.push_back(x); } + PYBIND11_NOINLINE void push_back_slow_path(handle x) { m_repr.hvector.vec.push_back(x); } - PYBIND11_NOINLINE void reserve_slow_path(std::size_t sz) { m_repr.vector.vec.reserve(sz); } + PYBIND11_NOINLINE void reserve_slow_path(std::size_t sz) { m_repr.hvector.vec.reserve(sz); } bool is_inline() const { return m_repr.is_inline(); } }; @@ -212,6 +224,7 @@ struct args_convert_vector { public: args_convert_vector() = default; + // Disable copy ctor and assignment. args_convert_vector(const args_convert_vector &) = delete; args_convert_vector &operator=(const args_convert_vector &) = delete; args_convert_vector(args_convert_vector &&) noexcept = default; @@ -219,9 +232,9 @@ struct args_convert_vector { args_convert_vector(std::size_t count, bool value) { if (count > kInlineSize) { - new (&m_repr.vector) typename repr_type::heap_vector(count, value); + new (&m_repr.hvector) typename repr_type::heap_vector(count, value); } else { - auto &inline_arr = m_repr.array; + auto &inline_arr = m_repr.iarray; inline_arr.arr.fill(value ? std::size_t(-1) : 0); inline_arr.size = static_cast(count); } @@ -229,18 +242,18 @@ struct args_convert_vector { std::size_t size() const { if (is_inline()) { - return m_repr.array.size; + return m_repr.iarray.size; } - return m_repr.vector.vec.size(); + return m_repr.hvector.vec.size(); } void reserve(std::size_t sz) { if (is_inline()) { if (sz > kInlineSize) { - move_to_vector_with_reserved_size(sz); + move_to_heap_vector_with_reserved_size(sz); } } else { - m_repr.vector.vec.reserve(sz); + m_repr.hvector.vec.reserve(sz); } } @@ -248,15 +261,15 @@ struct args_convert_vector { if (is_inline()) { return inline_index(idx); } - assert(idx < m_repr.vector.vec.size()); - return m_repr.vector.vec[idx]; + assert(idx < m_repr.hvector.vec.size()); + return m_repr.hvector.vec[idx]; } void push_back(bool b) { if (is_inline()) { - auto &ha = m_repr.array; + auto &ha = m_repr.iarray; if (ha.size == kInlineSize) { - move_to_vector_with_reserved_size(kInlineSize + 1); + move_to_heap_vector_with_reserved_size(kInlineSize + 1); push_back_slow_path(b); } else { assert(ha.size < kInlineSize); @@ -291,21 +304,21 @@ struct args_convert_vector { const auto wbi = word_and_bit_index(idx); assert(wbi.word < kWords); assert(wbi.bit < kBitsPerWord); - return m_repr.array.arr[wbi.word] & (std::size_t(1) << wbi.bit); + return m_repr.iarray.arr[wbi.word] & (std::size_t(1) << wbi.bit); } - PYBIND11_NOINLINE void move_to_vector_with_reserved_size(std::size_t reserved_size) { - auto &inline_arr = m_repr.array; + PYBIND11_NOINLINE void move_to_heap_vector_with_reserved_size(std::size_t reserved_size) { + auto &inline_arr = m_repr.iarray; using heap_vector = typename repr_type::heap_vector; heap_vector hv; hv.vec.reserve(reserved_size); for (std::size_t ii = 0; ii < inline_arr.size; ++ii) { hv.vec.push_back(inline_index(ii)); } - new (&m_repr.vector) heap_vector(std::move(hv)); + new (&m_repr.hvector) heap_vector(std::move(hv)); } - PYBIND11_NOINLINE void push_back_slow_path(bool b) { m_repr.vector.vec.push_back(b); } + PYBIND11_NOINLINE void push_back_slow_path(bool b) { m_repr.hvector.vec.push_back(b); } static constexpr auto kBitsPerWord = 8 * sizeof(std::size_t); static constexpr auto kWords = (kRequestedInlineSize + kBitsPerWord - 1) / kBitsPerWord; From a2dddb42c5bf185bd233947c684197b9b3b010fd Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Thu, 18 Sep 2025 10:10:53 -0700 Subject: [PATCH 11/11] drop is_standard_layout assertions; see https://github.com/pybind/pybind11/pull/5824#issuecomment-3308616072 --- include/pybind11/detail/argument_vector.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/include/pybind11/detail/argument_vector.h b/include/pybind11/detail/argument_vector.h index 1cfce06024..e9bfe064d4 100644 --- a/include/pybind11/detail/argument_vector.h +++ b/include/pybind11/detail/argument_vector.h @@ -117,10 +117,6 @@ union inline_array_or_vector { // of bytes. See // https://dev-discuss.pytorch.org/t/unionizing-for-profit-how-to-exploit-the-power-of-unions-in-c/444#the-memcpy-loophole-4 bool result = false; - static_assert(std::is_standard_layout::value, - "untagged union implementation relies on this"); - static_assert(std::is_standard_layout::value, - "untagged union implementation relies on this"); static_assert(offsetof(inline_array, is_inline) == 0, "untagged union implementation relies on this"); static_assert(offsetof(heap_vector, is_inline) == 0,