From 9415686fede57340570b0165ff09ccf798c3e2a2 Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Wed, 3 Sep 2025 17:47:41 -0700
Subject: [PATCH 01/11] Avoid heap allocation for function calls with a small
 number of arguments

We don't have access to llvm::SmallVector or similar, but given the
limited subset of the `std::vector` API that
`function_call::args{,_convert}` need and the "reserve-then-fill"
usage pattern, it is relatively straightforward to implement custom
containers that get the job done.

Seems to improves time to call the collatz function in
pybind/pybind11_benchmark significantly; numbers are a little noisy
but there's a clear improvement from "about 60 ns per call" to "about
45 ns per call" on my machine (M4 Max Mac), as measured with
`timeit.repeat('collatz(4)', 'from pybind11_benchmark import
collatz')`.
---
 CMakeLists.txt                                |   1 +
 include/pybind11/cast.h                       |   7 +-
 include/pybind11/detail/argument_vector.h     | 320 ++++++++++++++++++
 include/pybind11/pybind11.h                   |   4 +-
 tests/extra_python_package/test_files.py      |   1 +
 tests/test_embed/CMakeLists.txt               |   3 +-
 tests/test_embed/test_args_convert_vector.cpp |  81 +++++
 tests/test_embed/test_argument_vector.cpp     |  93 +++++
 8 files changed, 505 insertions(+), 5 deletions(-)
 create mode 100644 include/pybind11/detail/argument_vector.h
 create mode 100644 tests/test_embed/test_args_convert_vector.cpp
 create mode 100644 tests/test_embed/test_argument_vector.cpp
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ba5f665a2f..a6d619bbdd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -180,6 +180,7 @@ if(PYBIND11_MASTER_PROJECT)
 endif()
 
 set(PYBIND11_HEADERS
+    include/pybind11/detail/argument_vector.h
     include/pybind11/detail/class.h
     include/pybind11/detail/common.h
     include/pybind11/detail/cpp_conduit.h
diff --git a/include/pybind11/cast.h b/include/pybind11/cast.h
index c635791fee..88e88d4e62 100644
--- a/include/pybind11/cast.h
+++ b/include/pybind11/cast.h
@@ -10,6 +10,7 @@
 
 #pragma once
 
+#include "detail/argument_vector.h"
 #include "detail/common.h"
 #include "detail/descr.h"
 #include "detail/native_enum_data.h"
@@ -2045,10 +2046,12 @@ struct function_call {
     const function_record &func;
 
     /// Arguments passed to the function:
-    std::vector<handle> args;
+    /// (Inline size chosen mostly arbitrarily; 5 should pad function_call out to two cache lines
+    /// (16 pointers) in size.)
+    argument_vector<5> args;
 
     /// The `convert` value the arguments should be loaded with
-    std::vector<bool> args_convert;
+    args_convert_vector<5> args_convert;
 
     /// Extra references for the optional `py::args` and/or `py::kwargs` arguments (which, if
     /// present, are also in `args` but without a reference).
diff --git a/include/pybind11/detail/argument_vector.h b/include/pybind11/detail/argument_vector.h
new file mode 100644
index 0000000000..3aa9687462
--- /dev/null
+++ b/include/pybind11/detail/argument_vector.h
@@ -0,0 +1,320 @@
+/*
+    pybind11/detail/argument_vector.h: small_vector-like containers to
+    avoid heap allocation of arguments during function call dispatch.
+
+    Copyright (c) Meta Platforms, Inc. and affiliates.
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include <pybind11/pytypes.h>
+
+#include "common.h"
+
+#include <algorithm>
+#include <array>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+#include <utility>
+#include <vector>
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+
+PYBIND11_WARNING_DISABLE_MSVC(4127)
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+// Shared implementation utility for our small_vector-like containers.
+// We support C++11 and C++14, so we cannot use
+// std::variant. Union with the tag packed next to the inline
+// array's size is smaller anyway, allowing 1 extra handle of
+// inline storage for free. Compare the layouts (1 line per
+// size_t/void*):
+// With variant, total is N + 2 for N >= 2:
+// - variant tag (cannot be packed with the array size)
+// - array size (or first pointer of 3 in std::vector)
+// - N pointers of inline storage (or 2 remaining pointers of std::vector)
+// Custom union, total is N + 1 for N >= 3:
+// - variant tag & array size if applicable
+// - N pointers of inline storage (or 3 pointers of std::vector)
+//
+// NOTE: this is a low-level representational convenience; the two
+// use cases of this union are materially different and in particular
+// have different semantics for inline_array::size. All that is being
+// shared is the memory management behavior.
+template <typename ArrayT, std::size_t InlineSize, typename VectorT = ArrayT>
+union inline_array_or_vector {
+    struct inline_array {
+        bool is_inline = true;
+        std::uint32_t size = 0;
+        std::array<ArrayT, InlineSize> arr;
+    };
+    struct heap_vector {
+        bool is_inline = false;
+        std::vector<VectorT> vec;
+
+        heap_vector() = default;
+        heap_vector(std::size_t count, VectorT value) : vec(count, value) {}
+    };
+
+    inline_array array;
+    heap_vector vector;
+
+    static_assert(std::is_trivially_move_constructible<ArrayT>::value,
+                  "ArrayT must be trivially move constructible");
+    static_assert(std::is_trivially_destructible<ArrayT>::value,
+                  "ArrayT must be trivially destructible");
+
+    inline_array_or_vector() : array() {}
+    ~inline_array_or_vector() {
+        if (!is_inline()) {
+            vector.~heap_vector();
+        }
+    }
+    inline_array_or_vector(const inline_array_or_vector &) = delete;
+    inline_array_or_vector &operator=(const inline_array_or_vector &) = delete;
+
+    inline_array_or_vector(inline_array_or_vector &&rhs) noexcept {
+        if (rhs.is_inline()) {
+            std::memcpy(&array, &rhs.array, sizeof(array));
+        } else {
+            new (&vector) heap_vector(std::move(rhs.vector));
+        }
+        assert(is_inline() == rhs.is_inline());
+    }
+
+    inline_array_or_vector &operator=(inline_array_or_vector &&rhs) noexcept {
+        if (this == &rhs) {
+            return *this;
+        }
+
+        if (rhs.is_inline()) {
+            if (!is_inline()) {
+                vector.~heap_vector();
+            }
+            std::memcpy(&array, &rhs.array, sizeof(array));
+        } else {
+            if (is_inline()) {
+                new (&vector) heap_vector(std::move(rhs.vector));
+            } else {
+                vector = std::move(rhs.vector);
+            }
+        }
+        return *this;
+    }
+
+    bool is_inline() const {
+        // It is undefined behavior to access the inactive member of a
+        // union directly. However, it is well-defined to reinterpret_cast any
+        // pointer into a pointer to char and examine it as an array
+        // of bytes. See
+        // https://dev-discuss.pytorch.org/t/unionizing-for-profit-how-to-exploit-the-power-of-unions-in-c/444#the-memcpy-loophole-4
+        bool result = false;
+        std::memcpy(&result, reinterpret_cast<const char *>(this), sizeof(bool));
+        return result;
+    }
+};
+
+// small_vector-like container to avoid heap allocation for N or fewer
+// arguments.
+template <std::size_t N>
+struct argument_vector {
+public:
+    argument_vector() = default;
+
+    argument_vector(const argument_vector &) = delete;
+    argument_vector &operator=(const argument_vector &) = delete;
+    argument_vector(argument_vector &&) noexcept = default;
+    argument_vector &operator=(argument_vector &&) noexcept = default;
+
+    std::size_t size() const {
+        if (is_inline()) {
+            return m_repr.array.size;
+        } else {
+            return m_repr.vector.vec.size();
+        }
+    }
+
+    handle &operator[](std::size_t idx) {
+        assert(idx < size());
+        if (is_inline()) {
+            return m_repr.array.arr[idx];
+        } else {
+            return m_repr.vector.vec[idx];
+        }
+    }
+
+    handle operator[](std::size_t idx) const {
+        assert(idx < size());
+        if (is_inline()) {
+            return m_repr.array.arr[idx];
+        } else {
+            return m_repr.vector.vec[idx];
+        }
+    }
+
+    void push_back(handle x) {
+        if (is_inline()) {
+            auto &ha = m_repr.array;
+            if (ha.size == N) {
+                move_to_vector_with_reserved_size(N + 1);
+                m_repr.vector.vec.push_back(x);
+            } else {
+                ha.arr[ha.size++] = x;
+            }
+        } else {
+            m_repr.vector.vec.push_back(x);
+        }
+    }
+
+    template <typename Arg>
+    void emplace_back(Arg &&x) {
+        push_back(handle(x));
+    }
+
+    void reserve(std::size_t sz) {
+        if (is_inline()) {
+            if (sz > N) {
+                move_to_vector_with_reserved_size(sz);
+            }
+        } else {
+            m_repr.vector.vec.reserve(sz);
+        }
+    }
+
+private:
+    using repr_type = inline_array_or_vector<handle, N>;
+    repr_type m_repr;
+
+    void move_to_vector_with_reserved_size(std::size_t reserved_size) {
+        assert(is_inline());
+        auto &ha = m_repr.array;
+        using heap_vector = typename repr_type::heap_vector;
+        heap_vector hv;
+        hv.vec.reserve(reserved_size);
+        std::copy(ha.arr.begin(), ha.arr.begin() + ha.size, std::back_inserter(hv.vec));
+        new (&m_repr.vector) heap_vector(std::move(hv));
+    }
+
+    bool is_inline() const { return m_repr.is_inline(); }
+};
+
+// small_vector-like container to avoid heap allocation for N or fewer
+// arguments.
+template <std::size_t kRequestedInlineSize>
+struct args_convert_vector {
+private:
+public:
+    args_convert_vector() = default;
+
+    args_convert_vector(const args_convert_vector &) = delete;
+    args_convert_vector &operator=(const args_convert_vector &) = delete;
+    args_convert_vector(args_convert_vector &&) noexcept = default;
+    args_convert_vector &operator=(args_convert_vector &&) noexcept = default;
+
+    args_convert_vector(std::size_t count, bool value) {
+        if (count > kInlineSize) {
+            new (&m_repr.vector) typename repr_type::heap_vector(count, value);
+        } else {
+            auto &inline_arr = m_repr.array;
+            inline_arr.arr.fill(value ? std::size_t(-1) : 0);
+            inline_arr.size = static_cast<decltype(inline_arr.size)>(count);
+        }
+    }
+
+    std::size_t size() const {
+        if (is_inline()) {
+            return m_repr.array.size;
+        } else {
+            return m_repr.vector.vec.size();
+        }
+    }
+
+    void reserve(std::size_t sz) {
+        if (is_inline()) {
+            if (sz > kInlineSize) {
+                move_to_vector_with_reserved_size(sz);
+            }
+        } else {
+            m_repr.vector.vec.reserve(sz);
+        }
+    }
+
+    bool operator[](std::size_t idx) const {
+        if (is_inline()) {
+            return inline_index(idx);
+        } else {
+            assert(idx < m_repr.vector.vec.size());
+            return m_repr.vector.vec[idx];
+        }
+    }
+
+    void push_back(bool b) {
+        if (is_inline()) {
+            auto &ha = m_repr.array;
+            if (ha.size == kInlineSize) {
+                move_to_vector_with_reserved_size(kInlineSize + 1);
+                m_repr.vector.vec.push_back(b);
+            } else {
+                assert(ha.size < kInlineSize);
+                const auto wbi = word_and_bit_index(ha.size++);
+                assert(wbi.word < kWords);
+                assert(wbi.bit < kBitsPerWord);
+                if (b) {
+                    ha.arr[wbi.word] |= (std::size_t(1) << wbi.bit);
+                } else {
+                    ha.arr[wbi.word] &= ~(std::size_t(1) << wbi.bit);
+                }
+                assert(operator[](ha.size - 1) == b);
+            }
+        } else {
+            m_repr.vector.vec.push_back(b);
+        }
+    }
+
+    void swap(args_convert_vector &rhs) { std::swap(m_repr, rhs.m_repr); }
+
+private:
+    struct WordAndBitIndex {
+        std::size_t word;
+        std::size_t bit;
+    };
+
+    static WordAndBitIndex word_and_bit_index(std::size_t idx) {
+        return WordAndBitIndex{idx / kBitsPerWord, idx % kBitsPerWord};
+    }
+
+    bool inline_index(std::size_t idx) const {
+        const auto wbi = word_and_bit_index(idx);
+        assert(wbi.word < kWords);
+        assert(wbi.bit < kBitsPerWord);
+        return m_repr.array.arr[wbi.word] & (std::size_t(1) << wbi.bit);
+    }
+
+    void move_to_vector_with_reserved_size(std::size_t reserved_size) {
+        auto &inline_arr = m_repr.array;
+        using heap_vector = typename repr_type::heap_vector;
+        heap_vector hv;
+        hv.vec.reserve(reserved_size);
+        for (std::size_t ii = 0; ii < inline_arr.size; ++ii) {
+            hv.vec.push_back(inline_index(ii));
+        }
+        new (&m_repr.vector) heap_vector(std::move(hv));
+    }
+
+    static constexpr auto kBitsPerWord = 8 * sizeof(std::size_t);
+    static constexpr auto kWords = (kRequestedInlineSize + kBitsPerWord - 1) / kBitsPerWord;
+    static constexpr auto kInlineSize = kWords * kBitsPerWord;
+
+    using repr_type = inline_array_or_vector<std::size_t, kWords, bool>;
+    repr_type m_repr;
+
+    bool is_inline() const { return m_repr.is_inline(); }
+};
+
+PYBIND11_NAMESPACE_END(detail)
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/include/pybind11/pybind11.h b/include/pybind11/pybind11.h
index 117fecabf0..dcd245e981 100644
--- a/include/pybind11/pybind11.h
+++ b/include/pybind11/pybind11.h
@@ -1048,12 +1048,12 @@ class cpp_function : public function {
                 }
 #endif
 
-                std::vector<bool> second_pass_convert;
+                args_convert_vector<5> second_pass_convert;
                 if (overloaded) {
                     // We're in the first no-convert pass, so swap out the conversion flags for a
                     // set of all-false flags.  If the call fails, we'll swap the flags back in for
                     // the conversion-allowed call below.
-                    second_pass_convert.resize(func.nargs, false);
+                    second_pass_convert = args_convert_vector<5>(func.nargs, false);
                     call.args_convert.swap(second_pass_convert);
                 }
 
diff --git a/tests/extra_python_package/test_files.py b/tests/extra_python_package/test_files.py
index 63e59f65a6..d3452aa383 100644
--- a/tests/extra_python_package/test_files.py
+++ b/tests/extra_python_package/test_files.py
@@ -76,6 +76,7 @@
 }
 
 detail_headers = {
+    "include/pybind11/detail/argument_vector.h",
     "include/pybind11/detail/class.h",
     "include/pybind11/detail/common.h",
     "include/pybind11/detail/cpp_conduit.h",
diff --git a/tests/test_embed/CMakeLists.txt b/tests/test_embed/CMakeLists.txt
index 23c3336437..14dda54df2 100644
--- a/tests/test_embed/CMakeLists.txt
+++ b/tests/test_embed/CMakeLists.txt
@@ -33,7 +33,8 @@ if(PYBIND11_TEST_SMART_HOLDER)
     -DPYBIND11_RUN_TESTING_WITH_SMART_HOLDER_AS_DEFAULT_BUT_NEVER_USE_IN_PRODUCTION_PLEASE)
 endif()
 
-add_executable(test_embed catch.cpp test_interpreter.cpp test_subinterpreter.cpp)
+add_executable(test_embed catch.cpp test_args_convert_vector.cpp test_argument_vector.cpp
+                          test_interpreter.cpp test_subinterpreter.cpp)
 pybind11_enable_warnings(test_embed)
 
 target_link_libraries(test_embed PRIVATE pybind11::embed Catch2::Catch2 Threads::Threads)
diff --git a/tests/test_embed/test_args_convert_vector.cpp b/tests/test_embed/test_args_convert_vector.cpp
new file mode 100644
index 0000000000..1db4d120f0
--- /dev/null
+++ b/tests/test_embed/test_args_convert_vector.cpp
@@ -0,0 +1,81 @@
+#include "pybind11/pybind11.h"
+#include "catch.hpp"
+
+namespace py = pybind11;
+
+// 5 is chosen to match the production value. It doesn't really matter
+// what we pick because the actual inline size is going to be 8 *
+// sizeof(size_t) given the implementation at the time of writing.
+using args_convert_vector = py::detail::args_convert_vector<5>;
+
+namespace {
+template <typename Container>
+std::vector<Container> get_sample_vectors() {
+    std::vector<Container> result;
+    result.emplace_back();
+    for (const auto sz : {0, 4, 5, 6, 31, 32, 33, 63, 64, 65}) {
+        for (const bool b : {false, true}) {
+            result.emplace_back(static_cast<std::size_t>(sz), b);
+        }
+    }
+    return result;
+}
+
+void require_vector_matches_sample(const args_convert_vector &actual,
+                                   const std::vector<bool> &expected) {
+    REQUIRE(actual.size() == expected.size());
+    for (size_t ii = 0; ii < actual.size(); ++ii) {
+        REQUIRE(actual[ii] == expected[ii]);
+    }
+}
+
+template <typename ActualMutationFunc, typename ExpectedMutationFunc>
+void mutation_test_with_samples(ActualMutationFunc actual_mutation_func,
+                                ExpectedMutationFunc expected_mutation_func) {
+    auto sample_contents = get_sample_vectors<std::vector<bool>>();
+    auto samples = get_sample_vectors<args_convert_vector>();
+    for (size_t ii = 0; ii < samples.size(); ++ii) {
+        auto &actual = samples[ii];
+        auto &expected = sample_contents[ii];
+
+        actual_mutation_func(actual);
+        expected_mutation_func(expected);
+        require_vector_matches_sample(actual, expected);
+    }
+}
+} // namespace
+
+// I would like to write [capture](auto& vec) block inline, but we
+// have to work with C++11, which doesn't have generic lambdas.
+#define MUTATION_LAMBDA(capture, block)                                                           \
+    [capture](args_convert_vector & vec) block, [capture](std::vector<bool> & vec) block
+
+// For readability, rather than having ugly empty arguments.
+#define NO_CAPTURE
+
+TEST_CASE("check sample args_convert_vector contents") {
+    mutation_test_with_samples(MUTATION_LAMBDA(NO_CAPTURE, { (void) vec; }));
+}
+
+TEST_CASE("args_convert_vector push_back") {
+    for (const bool b : {false, true}) {
+        mutation_test_with_samples(MUTATION_LAMBDA(b, { vec.push_back(b); }));
+    }
+}
+
+TEST_CASE("args_convert_vector reserve") {
+    for (std::size_t ii = 0; ii < 4; ++ii) {
+        mutation_test_with_samples(MUTATION_LAMBDA(ii, { vec.reserve(ii); }));
+    }
+}
+
+TEST_CASE("args_convert_vector reserve then push_back") {
+    for (std::size_t ii = 0; ii < 4; ++ii) {
+        for (const bool b : {false, true}) {
+            mutation_test_with_samples(MUTATION_LAMBDA(=, {
+                vec.reserve(ii);
+                vec.push_back(b);
+            }));
+        }
+    }
+}
diff --git a/tests/test_embed/test_argument_vector.cpp b/tests/test_embed/test_argument_vector.cpp
new file mode 100644
index 0000000000..bfa64d0e75
--- /dev/null
+++ b/tests/test_embed/test_argument_vector.cpp
@@ -0,0 +1,93 @@
+#include "pybind11/pybind11.h"
+#include "catch.hpp"
+
+namespace py = pybind11;
+
+// 2 is chosen because it is the smallest number (keeping tests short)
+// where we can create non-empty vectors whose size is the inline size
+// plus or minus 1.
+using argument_vector = py::detail::argument_vector<2>;
+
+namespace {
+argument_vector to_argument_vector(const std::vector<py::handle> &v) {
+    argument_vector result;
+    result.reserve(v.size());
+    for (const auto x : v) {
+        result.push_back(x);
+    }
+    return result;
+}
+
+std::vector<std::vector<py::handle>> get_sample_argument_vector_contents() {
+    return std::vector<std::vector<py::handle>>{
+        {},
+        {py::handle(Py_None)},
+        {py::handle(Py_None), py::handle(Py_False)},
+        {py::handle(Py_None), py::handle(Py_False), py::handle(Py_True)},
+    };
+}
+
+std::vector<argument_vector> get_sample_argument_vectors() {
+    std::vector<argument_vector> result;
+    for (const auto &vec : get_sample_argument_vector_contents()) {
+        result.push_back(to_argument_vector(vec));
+    }
+    return result;
+}
+
+void require_vector_matches_sample(const argument_vector &actual,
+                                   const std::vector<py::handle> &expected) {
+    REQUIRE(actual.size() == expected.size());
+    for (size_t ii = 0; ii < actual.size(); ++ii) {
+        REQUIRE(actual[ii].ptr() == expected[ii].ptr());
+    }
+}
+
+template <typename ActualMutationFunc, typename ExpectedMutationFunc>
+void mutation_test_with_samples(ActualMutationFunc actual_mutation_func,
+                                ExpectedMutationFunc expected_mutation_func) {
+    auto sample_contents = get_sample_argument_vector_contents();
+    auto samples = get_sample_argument_vectors();
+    for (size_t ii = 0; ii < samples.size(); ++ii) {
+        auto &actual = samples[ii];
+        auto &expected = sample_contents[ii];
+
+        actual_mutation_func(actual);
+        expected_mutation_func(expected);
+        require_vector_matches_sample(actual, expected);
+    }
+}
+
+} // namespace
+
+// I would like to write [capture](auto& vec) block inline, but we
+// have to work with C++11, which doesn't have generic lambdas.
+#define MUTATION_LAMBDA(capture, block)                                                           \
+    [capture](argument_vector & vec) block, [capture](std::vector<py::handle> & vec) block
+
+// For readability, rather than having ugly empty arguments.
+#define NO_CAPTURE
+
+TEST_CASE("check sample argument_vector contents") {
+    mutation_test_with_samples(MUTATION_LAMBDA(NO_CAPTURE, { (void) vec; }));
+}
+
+TEST_CASE("argument_vector push_back") {
+    mutation_test_with_samples(
+        MUTATION_LAMBDA(NO_CAPTURE, { vec.push_back(py::handle(Py_None)); }));
+}
+
+TEST_CASE("argument_vector reserve") {
+    for (std::size_t ii = 0; ii < 4; ++ii) {
+        mutation_test_with_samples(MUTATION_LAMBDA(ii, { vec.reserve(ii); }));
+    }
+}
+
+TEST_CASE("argument_vector reserve then push_back") {
+    for (std::size_t ii = 0; ii < 4; ++ii) {
+        mutation_test_with_samples(MUTATION_LAMBDA(ii, {
+            vec.reserve(ii);
+            vec.push_back(py::handle(Py_True));
+        }));
+    }
+}

From 09fc719485b74d95c24c79a2f54223c77655a51e Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Thu, 4 Sep 2025 13:11:44 -0700
Subject: [PATCH 02/11] clang-tidy

---
 include/pybind11/detail/argument_vector.h | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/include/pybind11/detail/argument_vector.h b/include/pybind11/detail/argument_vector.h
index 3aa9687462..f8a0023443 100644
--- a/include/pybind11/detail/argument_vector.h
+++ b/include/pybind11/detail/argument_vector.h
@@ -134,27 +134,24 @@ struct argument_vector {
     std::size_t size() const {
         if (is_inline()) {
             return m_repr.array.size;
-        } else {
-            return m_repr.vector.vec.size();
         }
+        return m_repr.vector.vec.size();
     }
 
     handle &operator[](std::size_t idx) {
         assert(idx < size());
         if (is_inline()) {
             return m_repr.array.arr[idx];
-        } else {
-            return m_repr.vector.vec[idx];
         }
+        return m_repr.vector.vec[idx];
     }
 
     handle operator[](std::size_t idx) const {
         assert(idx < size());
         if (is_inline()) {
             return m_repr.array.arr[idx];
-        } else {
-            return m_repr.vector.vec[idx];
         }
+        return m_repr.vector.vec[idx];
     }
 
     void push_back(handle x) {
@@ -229,9 +226,8 @@ struct args_convert_vector {
     std::size_t size() const {
         if (is_inline()) {
             return m_repr.array.size;
-        } else {
-            return m_repr.vector.vec.size();
         }
+        return m_repr.vector.vec.size();
     }
 
     void reserve(std::size_t sz) {
@@ -247,10 +243,9 @@ struct args_convert_vector {
     bool operator[](std::size_t idx) const {
         if (is_inline()) {
             return inline_index(idx);
-        } else {
-            assert(idx < m_repr.vector.vec.size());
-            return m_repr.vector.vec[idx];
         }
+        assert(idx < m_repr.vector.vec.size());
+        return m_repr.vector.vec[idx];
     }
 
     void push_back(bool b) {
@@ -276,7 +271,7 @@ struct args_convert_vector {
         }
     }
 
-    void swap(args_convert_vector &rhs) { std::swap(m_repr, rhs.m_repr); }
+    void swap(args_convert_vector &rhs) noexcept { std::swap(m_repr, rhs.m_repr); }
 
 private:
     struct WordAndBitIndex {

From 8dd2eabb6651dc0f968fb06ea84ca3b4c9745dc7 Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Thu, 4 Sep 2025 15:51:17 -0700
Subject: [PATCH 03/11] more clang-tidy

---
 tests/test_embed/test_args_convert_vector.cpp | 1 +
 tests/test_embed/test_argument_vector.cpp     | 6 +++---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/test_embed/test_args_convert_vector.cpp b/tests/test_embed/test_args_convert_vector.cpp
index 1db4d120f0..a24020756d 100644
--- a/tests/test_embed/test_args_convert_vector.cpp
+++ b/tests/test_embed/test_args_convert_vector.cpp
@@ -47,6 +47,7 @@ void mutation_test_with_samples(ActualMutationFunc actual_mutation_func,
 
 // I would like to write [capture](auto& vec) block inline, but we
 // have to work with C++11, which doesn't have generic lambdas.
+// NOLINTNEXTLINE(bugprone-macro-parentheses)
 #define MUTATION_LAMBDA(capture, block)                                                           \
     [capture](args_convert_vector & vec) block, [capture](std::vector<bool> & vec) block
 
diff --git a/tests/test_embed/test_argument_vector.cpp b/tests/test_embed/test_argument_vector.cpp
index bfa64d0e75..42e290d329 100644
--- a/tests/test_embed/test_argument_vector.cpp
+++ b/tests/test_embed/test_argument_vector.cpp
@@ -62,6 +62,7 @@ void mutation_test_with_samples(ActualMutationFunc actual_mutation_func,
 
 // I would like to write [capture](auto& vec) block inline, but we
 // have to work with C++11, which doesn't have generic lambdas.
+// NOLINTNEXTLINE(bugprone-macro-parentheses)
 #define MUTATION_LAMBDA(capture, block)                                                           \
     [capture](argument_vector & vec) block, [capture](std::vector<py::handle> & vec) block
 
@@ -73,8 +74,7 @@ TEST_CASE("check sample argument_vector contents") {
 }
 
 TEST_CASE("argument_vector push_back") {
-    mutation_test_with_samples(
-        MUTATION_LAMBDA(NO_CAPTURE, { vec.push_back(py::handle(Py_None)); }));
+    mutation_test_with_samples(MUTATION_LAMBDA(NO_CAPTURE, { vec.emplace_back(Py_None); }));
 }
 
 TEST_CASE("argument_vector reserve") {
@@ -87,7 +87,7 @@ TEST_CASE("argument_vector reserve then push_back") {
     for (std::size_t ii = 0; ii < 4; ++ii) {
         mutation_test_with_samples(MUTATION_LAMBDA(ii, {
             vec.reserve(ii);
-            vec.push_back(py::handle(Py_True));
+            vec.emplace_back(Py_True);
         }));
     }
 }

From 8219f6615fde12915773e0981fa916115bfb839d Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Thu, 4 Sep 2025 16:20:16 -0700
Subject: [PATCH 04/11] clang-tidy NOLINTBEGIN/END instead of NOLINTNEXTLINE

---
 tests/test_embed/test_args_convert_vector.cpp | 3 ++-
 tests/test_embed/test_argument_vector.cpp     | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/test_embed/test_args_convert_vector.cpp b/tests/test_embed/test_args_convert_vector.cpp
index a24020756d..e1dbbb40d4 100644
--- a/tests/test_embed/test_args_convert_vector.cpp
+++ b/tests/test_embed/test_args_convert_vector.cpp
@@ -47,9 +47,10 @@ void mutation_test_with_samples(ActualMutationFunc actual_mutation_func,
 
 // I would like to write [capture](auto& vec) block inline, but we
 // have to work with C++11, which doesn't have generic lambdas.
-// NOLINTNEXTLINE(bugprone-macro-parentheses)
+// NOLINTBEGIN(bugprone-macro-parentheses)
 #define MUTATION_LAMBDA(capture, block)                                                           \
     [capture](args_convert_vector & vec) block, [capture](std::vector<bool> & vec) block
+// NOLINTEND(bugprone-macro-parentheses)
 
 // For readability, rather than having ugly empty arguments.
 #define NO_CAPTURE
diff --git a/tests/test_embed/test_argument_vector.cpp b/tests/test_embed/test_argument_vector.cpp
index 42e290d329..9cf302a9b1 100644
--- a/tests/test_embed/test_argument_vector.cpp
+++ b/tests/test_embed/test_argument_vector.cpp
@@ -62,9 +62,10 @@ void mutation_test_with_samples(ActualMutationFunc actual_mutation_func,
 
 // I would like to write [capture](auto& vec) block inline, but we
 // have to work with C++11, which doesn't have generic lambdas.
-// NOLINTNEXTLINE(bugprone-macro-parentheses)
+// NOLINTBEGIN(bugprone-macro-parentheses)
 #define MUTATION_LAMBDA(capture, block)                                                           \
     [capture](argument_vector & vec) block, [capture](std::vector<py::handle> & vec) block
+// NOLINTEND(bugprone-macro-parentheses)
 
 // For readability, rather than having ugly empty arguments.
 #define NO_CAPTURE

From e4c68beec9a79f59e48cde981a68f7bb1842e37c Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Thu, 4 Sep 2025 16:53:50 -0700
Subject: [PATCH 05/11] forgot to increase inline size after removing
 std::variant

---
 include/pybind11/cast.h                       | 4 ++--
 include/pybind11/pybind11.h                   | 4 ++--
 tests/test_embed/test_args_convert_vector.cpp | 9 +++++----
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/include/pybind11/cast.h b/include/pybind11/cast.h
index 88e88d4e62..0935921dd0 100644
--- a/include/pybind11/cast.h
+++ b/include/pybind11/cast.h
@@ -2048,10 +2048,10 @@ struct function_call {
     /// Arguments passed to the function:
     /// (Inline size chosen mostly arbitrarily; 5 should pad function_call out to two cache lines
     /// (16 pointers) in size.)
-    argument_vector<5> args;
+    argument_vector<6> args;
 
     /// The `convert` value the arguments should be loaded with
-    args_convert_vector<5> args_convert;
+    args_convert_vector<6> args_convert;
 
     /// Extra references for the optional `py::args` and/or `py::kwargs` arguments (which, if
     /// present, are also in `args` but without a reference).
diff --git a/include/pybind11/pybind11.h b/include/pybind11/pybind11.h
index dcd245e981..8569c2657e 100644
--- a/include/pybind11/pybind11.h
+++ b/include/pybind11/pybind11.h
@@ -1048,12 +1048,12 @@ class cpp_function : public function {
                 }
 #endif
 
-                args_convert_vector<5> second_pass_convert;
+                args_convert_vector<6> second_pass_convert;
                 if (overloaded) {
                     // We're in the first no-convert pass, so swap out the conversion flags for a
                     // set of all-false flags.  If the call fails, we'll swap the flags back in for
                     // the conversion-allowed call below.
-                    second_pass_convert = args_convert_vector<5>(func.nargs, false);
+                    second_pass_convert = args_convert_vector<6>(func.nargs, false);
                     call.args_convert.swap(second_pass_convert);
                 }
 
diff --git a/tests/test_embed/test_args_convert_vector.cpp b/tests/test_embed/test_args_convert_vector.cpp
index e1dbbb40d4..848e6fd5cd 100644
--- a/tests/test_embed/test_args_convert_vector.cpp
+++ b/tests/test_embed/test_args_convert_vector.cpp
@@ -3,10 +3,11 @@
 
 namespace py = pybind11;
 
-// 5 is chosen to match the production value. It doesn't really matter
-// what we pick because the actual inline size is going to be 8 *
-// sizeof(size_t) given the implementation at the time of writing.
-using args_convert_vector = py::detail::args_convert_vector<5>;
+// Inline size is chosen to match the production value. It doesn't
+// really matter what we pick because the actual inline size is going
+// to be 8 * sizeof(size_t) given the implementation at the time of
+// writing.
+using args_convert_vector = py::detail::args_convert_vector<6>;
 
 namespace {
 template <typename Container>

From 3c435f1bf77ff05cc13a906d284188d2991399a2 Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Fri, 5 Sep 2025 15:46:30 -0700
Subject: [PATCH 06/11] constexpr arg_vector_small_size, use move instead of
 swap to hopefully clarify second_pass_convert

---
 include/pybind11/cast.h                       | 8 +++++---
 include/pybind11/pybind11.h                   | 7 ++++---
 tests/test_embed/test_args_convert_vector.cpp | 6 +-----
 3 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/include/pybind11/cast.h b/include/pybind11/cast.h
index 0935921dd0..c68b5fa648 100644
--- a/include/pybind11/cast.h
+++ b/include/pybind11/cast.h
@@ -2038,6 +2038,8 @@ using is_pos_only = std::is_same<intrinsic_t<T>, pos_only>;
 // forward declaration (definition in attr.h)
 struct function_record;
 
+constexpr std::size_t arg_vector_small_size = 6;
+
 /// Internal data associated with a single function call
 struct function_call {
     function_call(const function_record &f, handle p); // Implementation in attr.h
@@ -2046,12 +2048,12 @@ struct function_call {
     const function_record &func;
 
     /// Arguments passed to the function:
-    /// (Inline size chosen mostly arbitrarily; 5 should pad function_call out to two cache lines
+    /// (Inline size chosen mostly arbitrarily; 6 should pad function_call out to two cache lines
     /// (16 pointers) in size.)
-    argument_vector<6> args;
+    argument_vector<arg_vector_small_size> args;
 
     /// The `convert` value the arguments should be loaded with
-    args_convert_vector<6> args_convert;
+    args_convert_vector<arg_vector_small_size> args_convert;
 
     /// Extra references for the optional `py::args` and/or `py::kwargs` arguments (which, if
     /// present, are also in `args` but without a reference).
diff --git a/include/pybind11/pybind11.h b/include/pybind11/pybind11.h
index 8569c2657e..515aab1414 100644
--- a/include/pybind11/pybind11.h
+++ b/include/pybind11/pybind11.h
@@ -1048,13 +1048,14 @@ class cpp_function : public function {
                 }
 #endif
 
-                args_convert_vector<6> second_pass_convert;
+                args_convert_vector<arg_vector_small_size> second_pass_convert;
                 if (overloaded) {
                     // We're in the first no-convert pass, so swap out the conversion flags for a
                     // set of all-false flags.  If the call fails, we'll swap the flags back in for
                     // the conversion-allowed call below.
-                    second_pass_convert = args_convert_vector<6>(func.nargs, false);
-                    call.args_convert.swap(second_pass_convert);
+                    second_pass_convert = std::move(call.args_convert);
+                    call.args_convert
+                        = args_convert_vector<arg_vector_small_size>(func.nargs, false);
                 }
 
                 // 6. Call the function.
diff --git a/tests/test_embed/test_args_convert_vector.cpp b/tests/test_embed/test_args_convert_vector.cpp
index 848e6fd5cd..7ce2d713fa 100644
--- a/tests/test_embed/test_args_convert_vector.cpp
+++ b/tests/test_embed/test_args_convert_vector.cpp
@@ -3,11 +3,7 @@
 
 namespace py = pybind11;
 
-// Inline size is chosen to match the production value. It doesn't
-// really matter what we pick because the actual inline size is going
-// to be 8 * sizeof(size_t) given the implementation at the time of
-// writing.
-using args_convert_vector = py::detail::args_convert_vector<6>;
+using args_convert_vector = py::detail::args_convert_vector<py::detail::arg_vector_small_size>;
 
 namespace {
 template <typename Container>

From f7bd49e31bddb1193145208d4c61fc145d3d6252 Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Mon, 8 Sep 2025 09:52:07 -0700
Subject: [PATCH 07/11] rename test_embed to test_low_level

---
 tests/CMakeLists.txt                                 |  4 ++--
 .../subdirectory_embed/CMakeLists.txt                |  6 +++---
 tests/{test_embed => test_low_level}/CMakeLists.txt  | 12 ++++++------
 tests/{test_embed => test_low_level}/catch.cpp       |  2 +-
 .../external_module.cpp                              |  0
 .../test_args_convert_vector.cpp                     |  0
 .../test_argument_vector.cpp                         |  0
 .../test_interpreter.cpp                             |  2 +-
 .../test_interpreter.py                              |  0
 .../test_subinterpreter.cpp                          |  0
 .../test_trampoline.py                               |  0
 11 files changed, 13 insertions(+), 13 deletions(-)
 rename tests/{test_embed => test_low_level}/CMakeLists.txt (84%)
 rename tests/{test_embed => test_low_level}/catch.cpp (93%)
 rename tests/{test_embed => test_low_level}/external_module.cpp (100%)
 rename tests/{test_embed => test_low_level}/test_args_convert_vector.cpp (100%)
 rename tests/{test_embed => test_low_level}/test_argument_vector.cpp (100%)
 rename tests/{test_embed => test_low_level}/test_interpreter.cpp (99%)
 rename tests/{test_embed => test_low_level}/test_interpreter.py (100%)
 rename tests/{test_embed => test_low_level}/test_subinterpreter.cpp (100%)
 rename tests/{test_embed => test_low_level}/test_trampoline.py (100%)

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 34b22a57ae..96c12f954e 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -647,8 +647,8 @@ if(NOT PYBIND11_CUDA_TESTS)
   # Test pure C++ code (not depending on Python). Provides the `test_pure_cpp` target.
   add_subdirectory(pure_cpp)
 
-  # Test embedding the interpreter. Provides the `cpptest` target.
-  add_subdirectory(test_embed)
+  # Test C++ code that depends on Python, such as embedding the interpreter. Provides the `cpptest` target.
+  add_subdirectory(test_low_level)
 
   # Test CMake build using functions and targets from subdirectory or installed location
   add_subdirectory(test_cmake_build)
diff --git a/tests/test_cmake_build/subdirectory_embed/CMakeLists.txt b/tests/test_cmake_build/subdirectory_embed/CMakeLists.txt
index d84916ea77..786511d5e5 100644
--- a/tests/test_cmake_build/subdirectory_embed/CMakeLists.txt
+++ b/tests/test_cmake_build/subdirectory_embed/CMakeLists.txt
@@ -26,11 +26,11 @@ add_custom_target(
   DEPENDS test_subdirectory_embed)
 
 # Test custom export group -- PYBIND11_EXPORT_NAME
-add_library(test_embed_lib ../embed.cpp)
-target_link_libraries(test_embed_lib PRIVATE pybind11::embed)
+add_library(test_low_level_lib ../embed.cpp)
+target_link_libraries(test_low_level_lib PRIVATE pybind11::embed)
 
 install(
-  TARGETS test_embed_lib
+  TARGETS test_low_level_lib
   EXPORT test_export
   ARCHIVE DESTINATION bin
   LIBRARY DESTINATION lib
diff --git a/tests/test_embed/CMakeLists.txt b/tests/test_low_level/CMakeLists.txt
similarity index 84%
rename from tests/test_embed/CMakeLists.txt
rename to tests/test_low_level/CMakeLists.txt
index 14dda54df2..6b0be26792 100644
--- a/tests/test_embed/CMakeLists.txt
+++ b/tests/test_low_level/CMakeLists.txt
@@ -33,11 +33,11 @@ if(PYBIND11_TEST_SMART_HOLDER)
     -DPYBIND11_RUN_TESTING_WITH_SMART_HOLDER_AS_DEFAULT_BUT_NEVER_USE_IN_PRODUCTION_PLEASE)
 endif()
 
-add_executable(test_embed catch.cpp test_args_convert_vector.cpp test_argument_vector.cpp
-                          test_interpreter.cpp test_subinterpreter.cpp)
-pybind11_enable_warnings(test_embed)
+add_executable(test_low_level catch.cpp test_args_convert_vector.cpp test_argument_vector.cpp
+                              test_interpreter.cpp test_subinterpreter.cpp)
+pybind11_enable_warnings(test_low_level)
 
-target_link_libraries(test_embed PRIVATE pybind11::embed Catch2::Catch2 Threads::Threads)
+target_link_libraries(test_low_level PRIVATE pybind11::embed Catch2::Catch2 Threads::Threads)
 
 if(NOT CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_CURRENT_BINARY_DIR)
   file(COPY test_interpreter.py test_trampoline.py DESTINATION "${CMAKE_CURRENT_BINARY_DIR}")
@@ -45,8 +45,8 @@ endif()
 
 add_custom_target(
   cpptest
-  COMMAND "$<TARGET_FILE:test_embed>"
-  DEPENDS test_embed
+  COMMAND "$<TARGET_FILE:test_low_level>"
+  DEPENDS test_low_level
   WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}")
 
 pybind11_add_module(external_module THIN_LTO external_module.cpp)
diff --git a/tests/test_embed/catch.cpp b/tests/test_low_level/catch.cpp
similarity index 93%
rename from tests/test_embed/catch.cpp
rename to tests/test_low_level/catch.cpp
index 558a7a35e5..cd3f8f4fb8 100644
--- a/tests/test_embed/catch.cpp
+++ b/tests/test_low_level/catch.cpp
@@ -19,7 +19,7 @@ namespace py = pybind11;
 
 int main(int argc, char *argv[]) {
     // Setup for TEST_CASE in test_interpreter.cpp, tagging on a large random number:
-    std::string updated_pythonpath("pybind11_test_embed_PYTHONPATH_2099743835476552");
+    std::string updated_pythonpath("pybind11_test_low_level_PYTHONPATH_2099743835476552");
     const char *preexisting_pythonpath = getenv("PYTHONPATH");
     if (preexisting_pythonpath != nullptr) {
 #if defined(_WIN32)
diff --git a/tests/test_embed/external_module.cpp b/tests/test_low_level/external_module.cpp
similarity index 100%
rename from tests/test_embed/external_module.cpp
rename to tests/test_low_level/external_module.cpp
diff --git a/tests/test_embed/test_args_convert_vector.cpp b/tests/test_low_level/test_args_convert_vector.cpp
similarity index 100%
rename from tests/test_embed/test_args_convert_vector.cpp
rename to tests/test_low_level/test_args_convert_vector.cpp
diff --git a/tests/test_embed/test_argument_vector.cpp b/tests/test_low_level/test_argument_vector.cpp
similarity index 100%
rename from tests/test_embed/test_argument_vector.cpp
rename to tests/test_low_level/test_argument_vector.cpp
diff --git a/tests/test_embed/test_interpreter.cpp b/tests/test_low_level/test_interpreter.cpp
similarity index 99%
rename from tests/test_embed/test_interpreter.cpp
rename to tests/test_low_level/test_interpreter.cpp
index 0e6c17a777..a4e1b68d01 100644
--- a/tests/test_embed/test_interpreter.cpp
+++ b/tests/test_low_level/test_interpreter.cpp
@@ -95,7 +95,7 @@ TEST_CASE("PYTHONPATH is used to update sys.path") {
     // The setup for this TEST_CASE is in catch.cpp!
     auto sys_path = py::str(py::module_::import("sys").attr("path")).cast<std::string>();
     REQUIRE_THAT(sys_path,
-                 Catch::Matchers::Contains("pybind11_test_embed_PYTHONPATH_2099743835476552"));
+                 Catch::Matchers::Contains("pybind11_test_low_level_PYTHONPATH_2099743835476552"));
 }
 
 TEST_CASE("Pass classes and data between modules defined in C++ and Python") {
diff --git a/tests/test_embed/test_interpreter.py b/tests/test_low_level/test_interpreter.py
similarity index 100%
rename from tests/test_embed/test_interpreter.py
rename to tests/test_low_level/test_interpreter.py
diff --git a/tests/test_embed/test_subinterpreter.cpp b/tests/test_low_level/test_subinterpreter.cpp
similarity index 100%
rename from tests/test_embed/test_subinterpreter.cpp
rename to tests/test_low_level/test_subinterpreter.cpp
diff --git a/tests/test_embed/test_trampoline.py b/tests/test_low_level/test_trampoline.py
similarity index 100%
rename from tests/test_embed/test_trampoline.py
rename to tests/test_low_level/test_trampoline.py

From 6e2da02652ba5529a37b2d7bd124ef1446e344fd Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Mon, 8 Sep 2025 16:31:51 -0700
Subject: [PATCH 08/11] rename test_low_level to test_with_catch

---
 tests/CMakeLists.txt                                 |  2 +-
 .../subdirectory_embed/CMakeLists.txt                |  6 +++---
 .../CMakeLists.txt                                   | 12 ++++++------
 tests/{test_low_level => test_with_catch}/catch.cpp  |  2 +-
 .../external_module.cpp                              |  0
 .../test_args_convert_vector.cpp                     |  0
 .../test_argument_vector.cpp                         |  0
 .../test_interpreter.cpp                             |  5 +++--
 .../test_interpreter.py                              |  0
 .../test_subinterpreter.cpp                          |  0
 .../test_trampoline.py                               |  0
 11 files changed, 14 insertions(+), 13 deletions(-)
 rename tests/{test_low_level => test_with_catch}/CMakeLists.txt (84%)
 rename tests/{test_low_level => test_with_catch}/catch.cpp (93%)
 rename tests/{test_low_level => test_with_catch}/external_module.cpp (100%)
 rename tests/{test_low_level => test_with_catch}/test_args_convert_vector.cpp (100%)
 rename tests/{test_low_level => test_with_catch}/test_argument_vector.cpp (100%)
 rename tests/{test_low_level => test_with_catch}/test_interpreter.cpp (99%)
 rename tests/{test_low_level => test_with_catch}/test_interpreter.py (100%)
 rename tests/{test_low_level => test_with_catch}/test_subinterpreter.cpp (100%)
 rename tests/{test_low_level => test_with_catch}/test_trampoline.py (100%)

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 96c12f954e..8ac236d404 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -648,7 +648,7 @@ if(NOT PYBIND11_CUDA_TESTS)
   add_subdirectory(pure_cpp)
 
   # Test C++ code that depends on Python, such as embedding the interpreter. Provides the `cpptest` target.
-  add_subdirectory(test_low_level)
+  add_subdirectory(test_with_catch)
 
   # Test CMake build using functions and targets from subdirectory or installed location
   add_subdirectory(test_cmake_build)
diff --git a/tests/test_cmake_build/subdirectory_embed/CMakeLists.txt b/tests/test_cmake_build/subdirectory_embed/CMakeLists.txt
index 786511d5e5..bb83a6e6ef 100644
--- a/tests/test_cmake_build/subdirectory_embed/CMakeLists.txt
+++ b/tests/test_cmake_build/subdirectory_embed/CMakeLists.txt
@@ -26,11 +26,11 @@ add_custom_target(
   DEPENDS test_subdirectory_embed)
 
 # Test custom export group -- PYBIND11_EXPORT_NAME
-add_library(test_low_level_lib ../embed.cpp)
-target_link_libraries(test_low_level_lib PRIVATE pybind11::embed)
+add_library(test_with_catch_lib ../embed.cpp)
+target_link_libraries(test_with_catch_lib PRIVATE pybind11::embed)
 
 install(
-  TARGETS test_low_level_lib
+  TARGETS test_with_catch_lib
   EXPORT test_export
   ARCHIVE DESTINATION bin
   LIBRARY DESTINATION lib
diff --git a/tests/test_low_level/CMakeLists.txt b/tests/test_with_catch/CMakeLists.txt
similarity index 84%
rename from tests/test_low_level/CMakeLists.txt
rename to tests/test_with_catch/CMakeLists.txt
index 6b0be26792..136537e67f 100644
--- a/tests/test_low_level/CMakeLists.txt
+++ b/tests/test_with_catch/CMakeLists.txt
@@ -33,11 +33,11 @@ if(PYBIND11_TEST_SMART_HOLDER)
     -DPYBIND11_RUN_TESTING_WITH_SMART_HOLDER_AS_DEFAULT_BUT_NEVER_USE_IN_PRODUCTION_PLEASE)
 endif()
 
-add_executable(test_low_level catch.cpp test_args_convert_vector.cpp test_argument_vector.cpp
-                              test_interpreter.cpp test_subinterpreter.cpp)
-pybind11_enable_warnings(test_low_level)
+add_executable(test_with_catch catch.cpp test_args_convert_vector.cpp test_argument_vector.cpp
+                               test_interpreter.cpp test_subinterpreter.cpp)
+pybind11_enable_warnings(test_with_catch)
 
-target_link_libraries(test_low_level PRIVATE pybind11::embed Catch2::Catch2 Threads::Threads)
+target_link_libraries(test_with_catch PRIVATE pybind11::embed Catch2::Catch2 Threads::Threads)
 
 if(NOT CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_CURRENT_BINARY_DIR)
   file(COPY test_interpreter.py test_trampoline.py DESTINATION "${CMAKE_CURRENT_BINARY_DIR}")
@@ -45,8 +45,8 @@ endif()
 
 add_custom_target(
   cpptest
-  COMMAND "$<TARGET_FILE:test_low_level>"
-  DEPENDS test_low_level
+  COMMAND "$<TARGET_FILE:test_with_catch>"
+  DEPENDS test_with_catch
   WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}")
 
 pybind11_add_module(external_module THIN_LTO external_module.cpp)
diff --git a/tests/test_low_level/catch.cpp b/tests/test_with_catch/catch.cpp
similarity index 93%
rename from tests/test_low_level/catch.cpp
rename to tests/test_with_catch/catch.cpp
index cd3f8f4fb8..5bd8b3880e 100644
--- a/tests/test_low_level/catch.cpp
+++ b/tests/test_with_catch/catch.cpp
@@ -19,7 +19,7 @@ namespace py = pybind11;
 
 int main(int argc, char *argv[]) {
     // Setup for TEST_CASE in test_interpreter.cpp, tagging on a large random number:
-    std::string updated_pythonpath("pybind11_test_low_level_PYTHONPATH_2099743835476552");
+    std::string updated_pythonpath("pybind11_test_with_catch_PYTHONPATH_2099743835476552");
     const char *preexisting_pythonpath = getenv("PYTHONPATH");
     if (preexisting_pythonpath != nullptr) {
 #if defined(_WIN32)
diff --git a/tests/test_low_level/external_module.cpp b/tests/test_with_catch/external_module.cpp
similarity index 100%
rename from tests/test_low_level/external_module.cpp
rename to tests/test_with_catch/external_module.cpp
diff --git a/tests/test_low_level/test_args_convert_vector.cpp b/tests/test_with_catch/test_args_convert_vector.cpp
similarity index 100%
rename from tests/test_low_level/test_args_convert_vector.cpp
rename to tests/test_with_catch/test_args_convert_vector.cpp
diff --git a/tests/test_low_level/test_argument_vector.cpp b/tests/test_with_catch/test_argument_vector.cpp
similarity index 100%
rename from tests/test_low_level/test_argument_vector.cpp
rename to tests/test_with_catch/test_argument_vector.cpp
diff --git a/tests/test_low_level/test_interpreter.cpp b/tests/test_with_catch/test_interpreter.cpp
similarity index 99%
rename from tests/test_low_level/test_interpreter.cpp
rename to tests/test_with_catch/test_interpreter.cpp
index a4e1b68d01..d227eecddc 100644
--- a/tests/test_low_level/test_interpreter.cpp
+++ b/tests/test_with_catch/test_interpreter.cpp
@@ -94,8 +94,9 @@ PYBIND11_EMBEDDED_MODULE(throw_error_already_set, ) {
 TEST_CASE("PYTHONPATH is used to update sys.path") {
     // The setup for this TEST_CASE is in catch.cpp!
     auto sys_path = py::str(py::module_::import("sys").attr("path")).cast<std::string>();
-    REQUIRE_THAT(sys_path,
-                 Catch::Matchers::Contains("pybind11_test_low_level_PYTHONPATH_2099743835476552"));
+    REQUIRE_THAT(
+        sys_path,
+        Catch::Matchers::Contains("pybind11_test_with_catch_PYTHONPATH_2099743835476552"));
 }
 
 TEST_CASE("Pass classes and data between modules defined in C++ and Python") {
diff --git a/tests/test_low_level/test_interpreter.py b/tests/test_with_catch/test_interpreter.py
similarity index 100%
rename from tests/test_low_level/test_interpreter.py
rename to tests/test_with_catch/test_interpreter.py
diff --git a/tests/test_low_level/test_subinterpreter.cpp b/tests/test_with_catch/test_subinterpreter.cpp
similarity index 100%
rename from tests/test_low_level/test_subinterpreter.cpp
rename to tests/test_with_catch/test_subinterpreter.cpp
diff --git a/tests/test_low_level/test_trampoline.py b/tests/test_with_catch/test_trampoline.py
similarity index 100%
rename from tests/test_low_level/test_trampoline.py
rename to tests/test_with_catch/test_trampoline.py

From 40aaf36c63da8aac23b1bf4c649238f096bbb73f Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Fri, 12 Sep 2025 16:49:32 -0700
Subject: [PATCH 09/11] Be careful to NOINLINE slow paths

---
 include/pybind11/detail/argument_vector.h | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/include/pybind11/detail/argument_vector.h b/include/pybind11/detail/argument_vector.h
index f8a0023443..915fa00694 100644
--- a/include/pybind11/detail/argument_vector.h
+++ b/include/pybind11/detail/argument_vector.h
@@ -159,12 +159,12 @@ struct argument_vector {
             auto &ha = m_repr.array;
             if (ha.size == N) {
                 move_to_vector_with_reserved_size(N + 1);
-                m_repr.vector.vec.push_back(x);
+                push_back_slow_path(x);
             } else {
                 ha.arr[ha.size++] = x;
             }
         } else {
-            m_repr.vector.vec.push_back(x);
+            push_back_slow_path(x);
         }
     }
 
@@ -179,7 +179,7 @@ struct argument_vector {
                 move_to_vector_with_reserved_size(sz);
             }
         } else {
-            m_repr.vector.vec.reserve(sz);
+            reserve_slow_path(sz);
         }
     }
 
@@ -187,7 +187,7 @@ struct argument_vector {
     using repr_type = inline_array_or_vector<handle, N>;
     repr_type m_repr;
 
-    void move_to_vector_with_reserved_size(std::size_t reserved_size) {
+    PYBIND11_NOINLINE void move_to_vector_with_reserved_size(std::size_t reserved_size) {
         assert(is_inline());
         auto &ha = m_repr.array;
         using heap_vector = typename repr_type::heap_vector;
@@ -197,6 +197,10 @@ struct argument_vector {
         new (&m_repr.vector) heap_vector(std::move(hv));
     }
 
+    PYBIND11_NOINLINE void push_back_slow_path(handle x) { m_repr.vector.vec.push_back(x); }
+
+    PYBIND11_NOINLINE void reserve_slow_path(std::size_t sz) { m_repr.vector.vec.reserve(sz); }
+
     bool is_inline() const { return m_repr.is_inline(); }
 };
 
@@ -253,7 +257,7 @@ struct args_convert_vector {
             auto &ha = m_repr.array;
             if (ha.size == kInlineSize) {
                 move_to_vector_with_reserved_size(kInlineSize + 1);
-                m_repr.vector.vec.push_back(b);
+                push_back_slow_path(b);
             } else {
                 assert(ha.size < kInlineSize);
                 const auto wbi = word_and_bit_index(ha.size++);
@@ -267,7 +271,7 @@ struct args_convert_vector {
                 assert(operator[](ha.size - 1) == b);
             }
         } else {
-            m_repr.vector.vec.push_back(b);
+            push_back_slow_path(b);
         }
     }
 
@@ -290,7 +294,7 @@ struct args_convert_vector {
         return m_repr.array.arr[wbi.word] & (std::size_t(1) << wbi.bit);
     }
 
-    void move_to_vector_with_reserved_size(std::size_t reserved_size) {
+    PYBIND11_NOINLINE void move_to_vector_with_reserved_size(std::size_t reserved_size) {
         auto &inline_arr = m_repr.array;
         using heap_vector = typename repr_type::heap_vector;
         heap_vector hv;
@@ -301,6 +305,8 @@ struct args_convert_vector {
         new (&m_repr.vector) heap_vector(std::move(hv));
     }
 
+    PYBIND11_NOINLINE void push_back_slow_path(bool b) { m_repr.vector.vec.push_back(b); }
+
     static constexpr auto kBitsPerWord = 8 * sizeof(std::size_t);
     static constexpr auto kWords = (kRequestedInlineSize + kBitsPerWord - 1) / kBitsPerWord;
     static constexpr auto kInlineSize = kWords * kBitsPerWord;

From a2c8422ff9014a76c4ad85e48649e6b8fd2a1168 Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Thu, 18 Sep 2025 09:52:58 -0700
Subject: [PATCH 10/11] rename array/vector members to iarray/hvector. Move
 comment per request. Add static_asserts for our untagged union implementation
 per request.

---
 include/pybind11/cast.h                   |  4 +-
 include/pybind11/detail/argument_vector.h | 93 +++++++++++++----------
 2 files changed, 55 insertions(+), 42 deletions(-)

diff --git a/include/pybind11/cast.h b/include/pybind11/cast.h
index c68b5fa648..4dcbf26236 100644
--- a/include/pybind11/cast.h
+++ b/include/pybind11/cast.h
@@ -2038,6 +2038,8 @@ using is_pos_only = std::is_same<intrinsic_t<T>, pos_only>;
 // forward declaration (definition in attr.h)
 struct function_record;
 
+/// (Inline size chosen mostly arbitrarily; 6 should pad function_call out to two cache lines
+/// (16 pointers) in size.)
 constexpr std::size_t arg_vector_small_size = 6;
 
 /// Internal data associated with a single function call
@@ -2048,8 +2050,6 @@ struct function_call {
     const function_record &func;
 
     /// Arguments passed to the function:
-    /// (Inline size chosen mostly arbitrarily; 6 should pad function_call out to two cache lines
-    /// (16 pointers) in size.)
     argument_vector<arg_vector_small_size> args;
 
     /// The `convert` value the arguments should be loaded with
diff --git a/include/pybind11/detail/argument_vector.h b/include/pybind11/detail/argument_vector.h
index 915fa00694..1cfce06024 100644
--- a/include/pybind11/detail/argument_vector.h
+++ b/include/pybind11/detail/argument_vector.h
@@ -16,9 +16,11 @@
 
 #include <algorithm>
 #include <array>
+#include <cstddef>
 #include <cstdint>
 #include <cstring>
 #include <iterator>
+#include <type_traits>
 #include <utility>
 #include <vector>
 
@@ -33,7 +35,7 @@ PYBIND11_NAMESPACE_BEGIN(detail)
 // std::variant. Union with the tag packed next to the inline
 // array's size is smaller anyway, allowing 1 extra handle of
 // inline storage for free. Compare the layouts (1 line per
-// size_t/void*):
+// size_t/void*, assuming a 64-bit machine):
 // With variant, total is N + 2 for N >= 2:
 // - variant tag (cannot be packed with the array size)
 // - array size (or first pointer of 3 in std::vector)
@@ -61,28 +63,29 @@ union inline_array_or_vector {
         heap_vector(std::size_t count, VectorT value) : vec(count, value) {}
     };
 
-    inline_array array;
-    heap_vector vector;
+    inline_array iarray;
+    heap_vector hvector;
 
     static_assert(std::is_trivially_move_constructible<ArrayT>::value,
                   "ArrayT must be trivially move constructible");
     static_assert(std::is_trivially_destructible<ArrayT>::value,
                   "ArrayT must be trivially destructible");
 
-    inline_array_or_vector() : array() {}
+    inline_array_or_vector() : iarray() {}
     ~inline_array_or_vector() {
         if (!is_inline()) {
-            vector.~heap_vector();
+            hvector.~heap_vector();
         }
     }
+    // Disable copy ctor and assignment.
     inline_array_or_vector(const inline_array_or_vector &) = delete;
     inline_array_or_vector &operator=(const inline_array_or_vector &) = delete;
 
     inline_array_or_vector(inline_array_or_vector &&rhs) noexcept {
         if (rhs.is_inline()) {
-            std::memcpy(&array, &rhs.array, sizeof(array));
+            std::memcpy(&iarray, &rhs.iarray, sizeof(iarray));
         } else {
-            new (&vector) heap_vector(std::move(rhs.vector));
+            new (&hvector) heap_vector(std::move(rhs.hvector));
         }
         assert(is_inline() == rhs.is_inline());
     }
@@ -94,14 +97,14 @@ union inline_array_or_vector {
 
         if (rhs.is_inline()) {
             if (!is_inline()) {
-                vector.~heap_vector();
+                hvector.~heap_vector();
             }
-            std::memcpy(&array, &rhs.array, sizeof(array));
+            std::memcpy(&iarray, &rhs.iarray, sizeof(iarray));
         } else {
             if (is_inline()) {
-                new (&vector) heap_vector(std::move(rhs.vector));
+                new (&hvector) heap_vector(std::move(rhs.hvector));
             } else {
-                vector = std::move(rhs.vector);
+                hvector = std::move(rhs.hvector);
             }
         }
         return *this;
@@ -114,6 +117,14 @@ union inline_array_or_vector {
         // of bytes. See
         // https://dev-discuss.pytorch.org/t/unionizing-for-profit-how-to-exploit-the-power-of-unions-in-c/444#the-memcpy-loophole-4
         bool result = false;
+        static_assert(std::is_standard_layout<inline_array>::value,
+                      "untagged union implementation relies on this");
+        static_assert(std::is_standard_layout<heap_vector>::value,
+                      "untagged union implementation relies on this");
+        static_assert(offsetof(inline_array, is_inline) == 0,
+                      "untagged union implementation relies on this");
+        static_assert(offsetof(heap_vector, is_inline) == 0,
+                      "untagged union implementation relies on this");
         std::memcpy(&result, reinterpret_cast<const char *>(this), sizeof(bool));
         return result;
     }
@@ -126,6 +137,7 @@ struct argument_vector {
 public:
     argument_vector() = default;
 
+    // Disable copy ctor and assignment.
     argument_vector(const argument_vector &) = delete;
     argument_vector &operator=(const argument_vector &) = delete;
     argument_vector(argument_vector &&) noexcept = default;
@@ -133,32 +145,32 @@ struct argument_vector {
 
     std::size_t size() const {
         if (is_inline()) {
-            return m_repr.array.size;
+            return m_repr.iarray.size;
         }
-        return m_repr.vector.vec.size();
+        return m_repr.hvector.vec.size();
     }
 
     handle &operator[](std::size_t idx) {
         assert(idx < size());
         if (is_inline()) {
-            return m_repr.array.arr[idx];
+            return m_repr.iarray.arr[idx];
         }
-        return m_repr.vector.vec[idx];
+        return m_repr.hvector.vec[idx];
     }
 
     handle operator[](std::size_t idx) const {
         assert(idx < size());
         if (is_inline()) {
-            return m_repr.array.arr[idx];
+            return m_repr.iarray.arr[idx];
         }
-        return m_repr.vector.vec[idx];
+        return m_repr.hvector.vec[idx];
     }
 
     void push_back(handle x) {
         if (is_inline()) {
-            auto &ha = m_repr.array;
+            auto &ha = m_repr.iarray;
             if (ha.size == N) {
-                move_to_vector_with_reserved_size(N + 1);
+                move_to_heap_vector_with_reserved_size(N + 1);
                 push_back_slow_path(x);
             } else {
                 ha.arr[ha.size++] = x;
@@ -176,7 +188,7 @@ struct argument_vector {
     void reserve(std::size_t sz) {
         if (is_inline()) {
             if (sz > N) {
-                move_to_vector_with_reserved_size(sz);
+                move_to_heap_vector_with_reserved_size(sz);
             }
         } else {
             reserve_slow_path(sz);
@@ -187,19 +199,19 @@ struct argument_vector {
     using repr_type = inline_array_or_vector<handle, N>;
     repr_type m_repr;
 
-    PYBIND11_NOINLINE void move_to_vector_with_reserved_size(std::size_t reserved_size) {
+    PYBIND11_NOINLINE void move_to_heap_vector_with_reserved_size(std::size_t reserved_size) {
         assert(is_inline());
-        auto &ha = m_repr.array;
+        auto &ha = m_repr.iarray;
         using heap_vector = typename repr_type::heap_vector;
         heap_vector hv;
         hv.vec.reserve(reserved_size);
         std::copy(ha.arr.begin(), ha.arr.begin() + ha.size, std::back_inserter(hv.vec));
-        new (&m_repr.vector) heap_vector(std::move(hv));
+        new (&m_repr.hvector) heap_vector(std::move(hv));
     }
 
-    PYBIND11_NOINLINE void push_back_slow_path(handle x) { m_repr.vector.vec.push_back(x); }
+    PYBIND11_NOINLINE void push_back_slow_path(handle x) { m_repr.hvector.vec.push_back(x); }
 
-    PYBIND11_NOINLINE void reserve_slow_path(std::size_t sz) { m_repr.vector.vec.reserve(sz); }
+    PYBIND11_NOINLINE void reserve_slow_path(std::size_t sz) { m_repr.hvector.vec.reserve(sz); }
 
     bool is_inline() const { return m_repr.is_inline(); }
 };
@@ -212,6 +224,7 @@ struct args_convert_vector {
 public:
     args_convert_vector() = default;
 
+    // Disable copy ctor and assignment.
     args_convert_vector(const args_convert_vector &) = delete;
     args_convert_vector &operator=(const args_convert_vector &) = delete;
     args_convert_vector(args_convert_vector &&) noexcept = default;
@@ -219,9 +232,9 @@ struct args_convert_vector {
 
     args_convert_vector(std::size_t count, bool value) {
         if (count > kInlineSize) {
-            new (&m_repr.vector) typename repr_type::heap_vector(count, value);
+            new (&m_repr.hvector) typename repr_type::heap_vector(count, value);
         } else {
-            auto &inline_arr = m_repr.array;
+            auto &inline_arr = m_repr.iarray;
             inline_arr.arr.fill(value ? std::size_t(-1) : 0);
             inline_arr.size = static_cast<decltype(inline_arr.size)>(count);
         }
@@ -229,18 +242,18 @@ struct args_convert_vector {
 
     std::size_t size() const {
         if (is_inline()) {
-            return m_repr.array.size;
+            return m_repr.iarray.size;
         }
-        return m_repr.vector.vec.size();
+        return m_repr.hvector.vec.size();
     }
 
     void reserve(std::size_t sz) {
         if (is_inline()) {
             if (sz > kInlineSize) {
-                move_to_vector_with_reserved_size(sz);
+                move_to_heap_vector_with_reserved_size(sz);
             }
         } else {
-            m_repr.vector.vec.reserve(sz);
+            m_repr.hvector.vec.reserve(sz);
         }
     }
 
@@ -248,15 +261,15 @@ struct args_convert_vector {
         if (is_inline()) {
             return inline_index(idx);
         }
-        assert(idx < m_repr.vector.vec.size());
-        return m_repr.vector.vec[idx];
+        assert(idx < m_repr.hvector.vec.size());
+        return m_repr.hvector.vec[idx];
     }
 
     void push_back(bool b) {
         if (is_inline()) {
-            auto &ha = m_repr.array;
+            auto &ha = m_repr.iarray;
             if (ha.size == kInlineSize) {
-                move_to_vector_with_reserved_size(kInlineSize + 1);
+                move_to_heap_vector_with_reserved_size(kInlineSize + 1);
                 push_back_slow_path(b);
             } else {
                 assert(ha.size < kInlineSize);
@@ -291,21 +304,21 @@ struct args_convert_vector {
         const auto wbi = word_and_bit_index(idx);
         assert(wbi.word < kWords);
         assert(wbi.bit < kBitsPerWord);
-        return m_repr.array.arr[wbi.word] & (std::size_t(1) << wbi.bit);
+        return m_repr.iarray.arr[wbi.word] & (std::size_t(1) << wbi.bit);
     }
 
-    PYBIND11_NOINLINE void move_to_vector_with_reserved_size(std::size_t reserved_size) {
-        auto &inline_arr = m_repr.array;
+    PYBIND11_NOINLINE void move_to_heap_vector_with_reserved_size(std::size_t reserved_size) {
+        auto &inline_arr = m_repr.iarray;
         using heap_vector = typename repr_type::heap_vector;
         heap_vector hv;
         hv.vec.reserve(reserved_size);
         for (std::size_t ii = 0; ii < inline_arr.size; ++ii) {
             hv.vec.push_back(inline_index(ii));
         }
-        new (&m_repr.vector) heap_vector(std::move(hv));
+        new (&m_repr.hvector) heap_vector(std::move(hv));
     }
 
-    PYBIND11_NOINLINE void push_back_slow_path(bool b) { m_repr.vector.vec.push_back(b); }
+    PYBIND11_NOINLINE void push_back_slow_path(bool b) { m_repr.hvector.vec.push_back(b); }
 
     static constexpr auto kBitsPerWord = 8 * sizeof(std::size_t);
     static constexpr auto kWords = (kRequestedInlineSize + kBitsPerWord - 1) / kBitsPerWord;

From a2dddb42c5bf185bd233947c684197b9b3b010fd Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Thu, 18 Sep 2025 10:10:53 -0700
Subject: [PATCH 11/11] drop is_standard_layout assertions; see
 https://github.com/pybind/pybind11/pull/5824#issuecomment-3308616072

---
 include/pybind11/detail/argument_vector.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/include/pybind11/detail/argument_vector.h b/include/pybind11/detail/argument_vector.h
index 1cfce06024..e9bfe064d4 100644
--- a/include/pybind11/detail/argument_vector.h
+++ b/include/pybind11/detail/argument_vector.h
@@ -117,10 +117,6 @@ union inline_array_or_vector {
         // of bytes. See
         // https://dev-discuss.pytorch.org/t/unionizing-for-profit-how-to-exploit-the-power-of-unions-in-c/444#the-memcpy-loophole-4
         bool result = false;
-        static_assert(std::is_standard_layout<inline_array>::value,
-                      "untagged union implementation relies on this");
-        static_assert(std::is_standard_layout<heap_vector>::value,
-                      "untagged union implementation relies on this");
         static_assert(offsetof(inline_array, is_inline) == 0,
                       "untagged union implementation relies on this");
         static_assert(offsetof(heap_vector, is_inline) == 0,