From b3e1b435d6dc5c5de64a12123665a45ee269b181 Mon Sep 17 00:00:00 2001 From: Luc Jaulmes Date: Mon, 12 May 2025 11:21:34 +0100 Subject: [PATCH 01/16] Rename CurrentNumThreads to AvailableNumThreads --- quest/src/api/environment.cpp | 4 ++-- quest/src/core/autodeployer.cpp | 2 +- quest/src/cpu/cpu_config.cpp | 12 +++++++++++- quest/src/cpu/cpu_config.hpp | 4 +++- quest/src/cpu/cpu_subroutines.cpp | 2 +- 5 files changed, 18 insertions(+), 6 deletions(-) diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp index 6eef515c4..541491899 100644 --- a/quest/src/api/environment.cpp +++ b/quest/src/api/environment.cpp @@ -225,7 +225,7 @@ void printCpuInfo() { "cpu", { {"numCpuCores", printer_toStr(std::thread::hardware_concurrency()) + pm}, {"numOmpProcs", (cpu_isOpenmpCompiled())? printer_toStr(cpu_getNumOpenmpProcessors()) + pm : na}, - {"numOmpThrds", (cpu_isOpenmpCompiled())? printer_toStr(cpu_getCurrentNumThreads()) + pn : na}, + {"numOmpThrds", (cpu_isOpenmpCompiled())? printer_toStr(cpu_getAvailableNumThreads()) + pn : na}, {"cpuMemory", ram}, {"cpuMemoryFree", un}, }); @@ -494,7 +494,7 @@ void getEnvironmentString(char str[200]) { QuESTEnv env = getQuESTEnv(); - int numThreads = cpu_isOpenmpCompiled()? cpu_getCurrentNumThreads() : 1; + int numThreads = cpu_isOpenmpCompiled()? cpu_getAvailableNumThreads() : 1; int cuQuantum = env.isGpuAccelerated && gpu_isCuQuantumCompiled(); int gpuDirect = env.isGpuAccelerated && gpu_isDirectGpuCommPossible(); diff --git a/quest/src/core/autodeployer.cpp b/quest/src/core/autodeployer.cpp index 2b6645e42..27c412687 100644 --- a/quest/src/core/autodeployer.cpp +++ b/quest/src/core/autodeployer.cpp @@ -36,7 +36,7 @@ void autodep_chooseQuESTEnvDeployment(int &useDistrib, int &useGpuAccel, int &us // and we require more than 1 thread available at QuESTEnv creation if (useMultithread == modeflag::USE_AUTO) - useMultithread = (cpu_isOpenmpCompiled())? 
(cpu_getCurrentNumThreads() > 1) : 0; + useMultithread = (cpu_isOpenmpCompiled())? (cpu_getAvailableNumThreads() > 1) : 0; } diff --git a/quest/src/cpu/cpu_config.cpp b/quest/src/cpu/cpu_config.cpp index f27471d38..a818f5be4 100644 --- a/quest/src/cpu/cpu_config.cpp +++ b/quest/src/cpu/cpu_config.cpp @@ -46,11 +46,12 @@ bool cpu_isOpenmpCompiled() { } -int cpu_getCurrentNumThreads() { +int cpu_getAvailableNumThreads() { #if COMPILE_OPENMP int n = -1; #pragma omp parallel shared(n) + #pragma omp single n = omp_get_num_threads(); return n; @@ -90,6 +91,15 @@ int cpu_getOpenmpThreadInd() { } +int cpu_getCurrentNumThreads() { +#if COMPILE_OPENMP + return omp_get_num_threads(); +#else + return 1; +#endif +} + + /* * MEMORY ALLOCATION diff --git a/quest/src/cpu/cpu_config.hpp b/quest/src/cpu/cpu_config.hpp index 48f54b44f..982fc8a2f 100644 --- a/quest/src/cpu/cpu_config.hpp +++ b/quest/src/cpu/cpu_config.hpp @@ -23,7 +23,7 @@ using std::vector; bool cpu_isOpenmpCompiled(); -int cpu_getCurrentNumThreads(); +int cpu_getAvailableNumThreads(); int cpu_getNumOpenmpProcessors(); @@ -35,6 +35,8 @@ int cpu_getNumOpenmpProcessors(); int cpu_getOpenmpThreadInd(); +int cpu_getCurrentNumThreads(); + /* diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp index 9c90e08c1..0b60d85ca 100644 --- a/quest/src/cpu/cpu_subroutines.cpp +++ b/quest/src/cpu/cpu_subroutines.cpp @@ -873,7 +873,7 @@ void cpu_statevector_anyCtrlPauliTensorOrGadget_subA( // whenever each thread has at least 1 iteration for itself. And of course // we serialise both inner and outer loops when qureg multithreading is off. 
- if (!qureg.isMultithreaded || numOuterIts >= cpu_getCurrentNumThreads()) { + if (!qureg.isMultithreaded || numOuterIts >= cpu_getAvailableNumThreads()) { // parallel #pragma omp parallel for if(qureg.isMultithreaded) From 08b7db228b6b01a0ea450e0e1d9c182e2e2eef7b Mon Sep 17 00:00:00 2001 From: Luc Jaulmes Date: Tue, 6 May 2025 10:57:14 +0200 Subject: [PATCH 02/16] Assign init work in parallel to enforce first-touch --- quest/src/core/utilities.hpp | 18 ++++++++++++++++++ quest/src/cpu/cpu_subroutines.cpp | 9 ++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/quest/src/core/utilities.hpp b/quest/src/core/utilities.hpp index 6d741312b..3fd47c43a 100644 --- a/quest/src/core/utilities.hpp +++ b/quest/src/core/utilities.hpp @@ -343,6 +343,24 @@ bool util_areAnyVectorElemsWithinNode(int rank, qindex numElemsPerNode, qindex s util_VectorIndexRange util_getLocalIndRangeOfVectorElemsWithinNode(int rank, qindex numElemsPerNode, qindex elemStartInd, qindex numInds); +// Generic function to split a workload fairly, with granularity >= block_size +static inline +std::pair +util_distribute(const qindex work, const qindex block, const int id, const int n) { + // ASSUME(work % block == 0); + const qindex blocks = work / block; + + qindex spread = blocks / n; + qindex extra = blocks % n; + + qindex prev_extra = (id * extra) / n; + qindex prev_shift = (id * extra) % n; + qindex here_extra = (prev_shift + extra) >= n; + + qindex pos = id * spread + prev_extra; + return std::make_pair(pos * block, (pos + spread + here_extra) * block); +} + /* * GATE PARAMETERS diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp index 0b60d85ca..87e9e47a9 100644 --- a/quest/src/cpu/cpu_subroutines.cpp +++ b/quest/src/cpu/cpu_subroutines.cpp @@ -2357,7 +2357,14 @@ void cpu_statevec_initUniformState_sub(Qureg qureg, qcomp amp) { // faster on average (though perhaps not for large quregs) // than a custom multithreaded loop - std::fill(qureg.cpuAmps, 
qureg.cpuAmps + qureg.numAmpsPerNode, amp); + #pragma omp parallel if(qureg.isMultithreaded) + { + + // Distribute number of tasks and convert to indexes. 4kB page standard? + const auto [start, end] = util_distribute(qureg.numAmpsPerNode, 4096 / sizeof(qcomp), + cpu_getOpenmpThreadInd(), cpu_getCurrentNumThreads()); + std::fill(qureg.cpuAmps + start, qureg.cpuAmps + end, amp); + } } From 3adba93c99b8b0aa7facce01fbbd62dd1c5c38dd Mon Sep 17 00:00:00 2001 From: Luc Jaulmes Date: Tue, 6 May 2025 11:46:31 +0200 Subject: [PATCH 03/16] Numa aware alloc --- CMakeLists.txt | 15 ++++++ quest/src/api/qureg.cpp | 4 +- quest/src/core/errors.cpp | 12 +++++ quest/src/core/errors.hpp | 3 ++ quest/src/cpu/cpu_config.cpp | 96 +++++++++++++++++++++++++++++++----- quest/src/cpu/cpu_config.hpp | 3 ++ 6 files changed, 118 insertions(+), 15 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 933e23086..a5a69e761 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -262,6 +262,21 @@ if (ENABLE_MULTITHREADING) OpenMP::OpenMP_C ) + # Find NUMA - location of NUMA headers + if (WIN32) + compile_option(NUMA_AWARE 0) + message(WARNING "Building on Windows, QuEST will not be aware of numa locality") + else() + include(FindPkgConfig) + pkg_search_module(NUMA numa IMPORTED_TARGET GLOBAL) + compile_option(NUMA_AWARE ${NUMA_FOUND}) + if (${NUMA_FOUND}) + target_link_libraries(QuEST PRIVATE PkgConfig::NUMA) + else() + message(WARNING "libnuma not found, QuEST will not be aware of numa locality") + endif() + endif() + if (VERBOSE_LIB_NAME) string(CONCAT LIB_NAME ${LIB_NAME} "+mt") endif() diff --git a/quest/src/api/qureg.cpp b/quest/src/api/qureg.cpp index 98068f079..7d68528a1 100644 --- a/quest/src/api/qureg.cpp +++ b/quest/src/api/qureg.cpp @@ -154,7 +154,7 @@ Qureg validateAndCreateCustomQureg(int numQubits, int isDensMatr, int useDistrib Qureg qureg = qureg_populateNonHeapFields(numQubits, isDensMatr, useDistrib, useGpuAccel, useMultithread); // always allocate CPU memory - 
qureg.cpuAmps = cpu_allocArray(qureg.numAmpsPerNode); // nullptr if failed + qureg.cpuAmps = cpu_allocNumaArray(qureg.numAmpsPerNode); // nullptr if failed // conditionally allocate GPU memory and communication buffers (even if numNodes == 1). // note that in distributed settings but where useDistrib=false, each node will have a @@ -334,7 +334,7 @@ void destroyQureg(Qureg qureg) { validate_quregFields(qureg, __func__); // free CPU memory - cpu_deallocArray(qureg.cpuAmps); + cpu_deallocNumaArray(qureg.cpuAmps, qureg.numAmpsPerNode); // free CPU communication buffer if (qureg.isDistributed) diff --git a/quest/src/core/errors.cpp b/quest/src/core/errors.cpp index 2f44127c8..afcd8c014 100644 --- a/quest/src/core/errors.cpp +++ b/quest/src/core/errors.cpp @@ -94,6 +94,18 @@ void error_allocOfQuESTEnvFailed() { } +void error_gettingPageSizeFailed() { + + raiseInternalError("Failed to get the page size."); +} + + +void error_gettingNumaNodesFailed() { + + raiseInternalError("Failed to get the numa node count"); +} + + /* * MEMORY ERRORS diff --git a/quest/src/core/errors.hpp b/quest/src/core/errors.hpp index ce8f7e68c..88cffeb50 100644 --- a/quest/src/core/errors.hpp +++ b/quest/src/core/errors.hpp @@ -42,6 +42,9 @@ void error_validationListUniquenessCheckExceededMaskSize(); void error_allocOfQuESTEnvFailed(); +void error_gettingPageSizeFailed(); + +void error_gettingNumaNodesFailed(); /* diff --git a/quest/src/cpu/cpu_config.cpp b/quest/src/cpu/cpu_config.cpp index a818f5be4..385efa1a2 100644 --- a/quest/src/cpu/cpu_config.cpp +++ b/quest/src/cpu/cpu_config.cpp @@ -14,6 +14,7 @@ #include #include #include +#include using std::vector; @@ -34,6 +35,12 @@ using std::vector; #include #endif +#if NUMA_AWARE + #include + #include + #include + #include +#endif // NUMA_AWARE /* @@ -105,23 +112,69 @@ int cpu_getCurrentNumThreads() { * MEMORY ALLOCATION */ +#if NUMA_AWARE +unsigned long getPageSize() { + static unsigned long page_size = 0; + if (!page_size) { + page_size = 
sysconf(_SC_PAGESIZE); + if (page_size == ~0UL) { + error_gettingPageSizeFailed(); + } + } + return page_size; +} + +unsigned long getNumaNodes() { + static int n_nodes = 0; + if (!n_nodes) { + n_nodes = numa_num_configured_nodes(); + if (n_nodes < 1) { + error_gettingNumaNodesFailed(); + } + } + return n_nodes; +} +#endif qcomp* cpu_allocArray(qindex length) { + return (qcomp*) calloc(length, sizeof(qcomp)); +} - /// @todo - /// here, we calloc the entire array in a serial setting, rather than one malloc - /// followed by threads subsequently memset'ing their own partitions. The latter - /// approach would distribute the array pages across NUMA nodes, accelerating - /// their subsequent access by the same threads (via NUMA's first-touch policy). - /// We have so far foregone this optimisation since a thread's memory-access pattern - /// in many of the QuEST functions is non-trivial, and likely to be inconsistent - /// with the memset pattern. As such, I expect the benefit is totally occluded - /// and only introduces potential new bugs - but this should be tested and confirmed! 
- - // we call calloc over malloc in order to fail immediately if mem isn't available; - // caller must handle nullptr result - return (qcomp*) calloc(length, sizeof(qcomp)); +qcomp* cpu_allocNumaArray(qindex length) { +#if !NUMA_AWARE + return cpu_allocArray(length); +#else + unsigned long page_size = getPageSize(); + int n_nodes = getNumaNodes(); + + qindex size = length * sizeof(qcomp); + int pages = (size + page_size - 1) / page_size; + void *addr = mmap(NULL, pages * page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (n_nodes == 1) { + return reinterpret_cast(addr); + } + + // distribution strategy: floor_pages per node, distribute remain_pages as spread out as possible + int floor_pages = pages / n_nodes; + int spread_pages = pages % n_nodes; + + uintptr_t pos = (uintptr_t)addr; + for (int node = 0, shift = n_nodes; node < n_nodes; ++node) { + shift -= spread_pages; + int node_pages = floor_pages + (shift <= 0); + + unsigned long node_mask = 1UL << node; + mbind((void*)pos, node_pages * page_size, MPOL_BIND, &node_mask, sizeof(node_mask) * 8, 0); + + pos += node_pages * page_size; + if (shift <= 0) { + shift += n_nodes; + } + } + + return reinterpret_cast(addr); +#endif // NUMA_AWARE } @@ -132,6 +185,23 @@ void cpu_deallocArray(qcomp* arr) { } +void cpu_deallocNumaArray(qcomp* arr, qindex length) { + if (arr == nullptr) { + return; + } + +#if !NUMA_AWARE + return cpu_deallocArray(arr); +#else + unsigned long page_size = getPageSize(); + qindex size = length * sizeof(qcomp); + int pages = (size + page_size - 1) / page_size; + + munmap(arr, pages * page_size); +#endif // NUMA_AWARE +} + + qcomp** cpu_allocAndInitMatrixWrapper(qcomp* arr, qindex dim) { // do not allocate if arr alloc failed (caller will handle) diff --git a/quest/src/cpu/cpu_config.hpp b/quest/src/cpu/cpu_config.hpp index 982fc8a2f..ba55a785f 100644 --- a/quest/src/cpu/cpu_config.hpp +++ b/quest/src/cpu/cpu_config.hpp @@ -46,6 +46,9 @@ int cpu_getCurrentNumThreads(); 
qcomp* cpu_allocArray(qindex length); void cpu_deallocArray(qcomp* arr); +qcomp* cpu_allocNumaArray(qindex length); +void cpu_deallocNumaArray(qcomp* arr, qindex length); + qcomp** cpu_allocAndInitMatrixWrapper(qcomp* arr, qindex dim); void cpu_deallocMatrixWrapper(qcomp** wrapper); From 827cd09e62cada3849543870be9997bbc45bee35 Mon Sep 17 00:00:00 2001 From: Luc Jaulmes Date: Mon, 23 Jun 2025 14:24:41 +0100 Subject: [PATCH 04/16] Move util_distribute() out of headers --- quest/src/core/utilities.cpp | 15 +++++++++++++++ quest/src/core/utilities.hpp | 17 +---------------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/quest/src/core/utilities.cpp b/quest/src/core/utilities.cpp index 1c22d43d8..2ad1a1309 100644 --- a/quest/src/core/utilities.cpp +++ b/quest/src/core/utilities.cpp @@ -903,6 +903,21 @@ util_VectorIndexRange util_getLocalIndRangeOfVectorElemsWithinNode(int rank, qin } +std::pair +util_distribute(const qindex work, const qindex block, const int id, const int n) { + // ASSUME(work % block == 0); + const qindex blocks = work / block; + + qindex spread = blocks / n; + qindex extra = blocks % n; + + qindex prev_extra = (id * extra) / n; + qindex prev_shift = (id * extra) % n; + qindex here_extra = (prev_shift + extra) >= n; + + qindex pos = id * spread + prev_extra; + return std::make_pair(pos * block, (pos + spread + here_extra) * block); +} /* * GATE PARAMETERS diff --git a/quest/src/core/utilities.hpp b/quest/src/core/utilities.hpp index 3fd47c43a..36f051595 100644 --- a/quest/src/core/utilities.hpp +++ b/quest/src/core/utilities.hpp @@ -344,22 +344,7 @@ util_VectorIndexRange util_getLocalIndRangeOfVectorElemsWithinNode(int rank, qin // Generic function to split a workload fairly, with granularity >= block_size -static inline -std::pair -util_distribute(const qindex work, const qindex block, const int id, const int n) { - // ASSUME(work % block == 0); - const qindex blocks = work / block; - - qindex spread = blocks / n; - qindex extra = 
blocks % n; - - qindex prev_extra = (id * extra) / n; - qindex prev_shift = (id * extra) % n; - qindex here_extra = (prev_shift + extra) >= n; - - qindex pos = id * spread + prev_extra; - return std::make_pair(pos * block, (pos + spread + here_extra) * block); -} +std::pair util_distribute(const qindex work, const qindex block, const int id, const int n); /* From 7960aebaf38d9a2a32ddf9c1c146bae1bcf21f70 Mon Sep 17 00:00:00 2001 From: Luc Jaulmes Date: Sun, 29 Jun 2025 11:18:07 +0100 Subject: [PATCH 05/16] Make util_distribute() work on toy examples --- quest/src/core/utilities.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/quest/src/core/utilities.cpp b/quest/src/core/utilities.cpp index 2ad1a1309..9377be4ee 100644 --- a/quest/src/core/utilities.cpp +++ b/quest/src/core/utilities.cpp @@ -905,8 +905,9 @@ util_VectorIndexRange util_getLocalIndRangeOfVectorElemsWithinNode(int rank, qin std::pair util_distribute(const qindex work, const qindex block, const int id, const int n) { - // ASSUME(work % block == 0); const qindex blocks = work / block; + // really should not happen except when work < block as work is power of 2 and block should be too. + const qindex last = work % block; qindex spread = blocks / n; qindex extra = blocks % n; @@ -916,7 +917,8 @@ util_distribute(const qindex work, const qindex block, const int id, const int n qindex here_extra = (prev_shift + extra) >= n; qindex pos = id * spread + prev_extra; - return std::make_pair(pos * block, (pos + spread + here_extra) * block); + qindex end = pos + spread + here_extra; + return std::make_pair(pos * block, end * block + (id == n - 1 ? 
last : 0)); } /* From 98c50a4d962a717d311b05e9de3d35e9ca65b5d2 Mon Sep 17 00:00:00 2001 From: Luc Jaulmes Date: Sun, 29 Jun 2025 11:26:39 +0100 Subject: [PATCH 06/16] Clean up hardcoded 4k page size --- quest/src/core/memory.hpp | 6 ++++++ quest/src/cpu/cpu_config.cpp | 15 ++++++++++----- quest/src/cpu/cpu_config.hpp | 2 ++ quest/src/cpu/cpu_subroutines.cpp | 4 +--- 4 files changed, 19 insertions(+), 8 deletions(-) diff --git a/quest/src/core/memory.hpp b/quest/src/core/memory.hpp index 7b112b027..cd20d867b 100644 --- a/quest/src/core/memory.hpp +++ b/quest/src/core/memory.hpp @@ -21,6 +21,12 @@ +/* + * Not necessarily the actual page size, but a sensible default for page-sized things. + */ +#define FALLBACK_PAGE_SIZE 4096 + + /* * HARDWARE QUERYING */ diff --git a/quest/src/cpu/cpu_config.cpp b/quest/src/cpu/cpu_config.cpp index 385efa1a2..9aa640fe8 100644 --- a/quest/src/cpu/cpu_config.cpp +++ b/quest/src/cpu/cpu_config.cpp @@ -9,6 +9,7 @@ #include "quest/include/types.h" #include "quest/include/paulis.h" +#include "quest/src/core/memory.hpp" #include "quest/src/core/errors.hpp" #include @@ -112,8 +113,8 @@ int cpu_getCurrentNumThreads() { * MEMORY ALLOCATION */ +unsigned long cpu_getPageSize() { #if NUMA_AWARE -unsigned long getPageSize() { static unsigned long page_size = 0; if (!page_size) { page_size = sysconf(_SC_PAGESIZE); @@ -122,9 +123,13 @@ unsigned long getPageSize() { } } return page_size; +#else + return FALLBACK_PAGE_SIZE; +#endif } -unsigned long getNumaNodes() { +#if NUMA_AWARE +unsigned long cpu_getNumaNodes() { static int n_nodes = 0; if (!n_nodes) { n_nodes = numa_num_configured_nodes(); @@ -145,8 +150,8 @@ qcomp* cpu_allocNumaArray(qindex length) { #if !NUMA_AWARE return cpu_allocArray(length); #else - unsigned long page_size = getPageSize(); - int n_nodes = getNumaNodes(); + unsigned long page_size = cpu_getPageSize(); + int n_nodes = cpu_getNumaNodes(); qindex size = length * sizeof(qcomp); int pages = (size + page_size - 1) / page_size; @@ 
-193,7 +198,7 @@ void cpu_deallocNumaArray(qcomp* arr, qindex length) { #if !NUMA_AWARE return cpu_deallocArray(arr); #else - unsigned long page_size = getPageSize(); + unsigned long page_size = cpu_getPageSize(); qindex size = length * sizeof(qcomp); int pages = (size + page_size - 1) / page_size; diff --git a/quest/src/cpu/cpu_config.hpp b/quest/src/cpu/cpu_config.hpp index ba55a785f..0d8a4e416 100644 --- a/quest/src/cpu/cpu_config.hpp +++ b/quest/src/cpu/cpu_config.hpp @@ -65,6 +65,8 @@ PauliStr* cpu_allocPauliStrings(qindex numStrings); void cpu_deallocPauliStrings(PauliStr* strings); +unsigned long cpu_getPageSize(); + /* * MEMORY MOVEMENT diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp index 87e9e47a9..ad5266e08 100644 --- a/quest/src/cpu/cpu_subroutines.cpp +++ b/quest/src/cpu/cpu_subroutines.cpp @@ -2359,9 +2359,7 @@ void cpu_statevec_initUniformState_sub(Qureg qureg, qcomp amp) { // than a custom multithreaded loop #pragma omp parallel if(qureg.isMultithreaded) { - - // Distribute number of tasks and convert to indexes. 4kB page standard? - const auto [start, end] = util_distribute(qureg.numAmpsPerNode, 4096 / sizeof(qcomp), + const auto [start, end] = util_distribute(qureg.numAmpsPerNode, cpu_getPageSize() / sizeof(qcomp), cpu_getOpenmpThreadInd(), cpu_getCurrentNumThreads()); std::fill(qureg.cpuAmps + start, qureg.cpuAmps + end, amp); } From 99e2705dab6e56dd2f09d74504b5eefc75c51f33 Mon Sep 17 00:00:00 2001 From: Luc Jaulmes Date: Sun, 29 Jun 2025 11:39:21 +0100 Subject: [PATCH 07/16] Fix cmake to handle missing NUMA_FOUND variable In the case libnuma is not found, NUMA_FOUND is (can be?) not defined instead of being set to 0. 
--- CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a5a69e761..9e629de55 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -269,10 +269,11 @@ if (ENABLE_MULTITHREADING) else() include(FindPkgConfig) pkg_search_module(NUMA numa IMPORTED_TARGET GLOBAL) - compile_option(NUMA_AWARE ${NUMA_FOUND}) if (${NUMA_FOUND}) + compile_option(NUMA_AWARE ${NUMA_FOUND}) target_link_libraries(QuEST PRIVATE PkgConfig::NUMA) else() + compile_option(NUMA_AWARE 0) message(WARNING "libnuma not found, QuEST will not be aware of numa locality") endif() endif() From 9e9e2308948f9dff4f4e74fd0e1cae7d3646ae84 Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Thu, 3 Jul 2025 22:44:56 +0200 Subject: [PATCH 08/16] clarified util_distribute to util_getBlockMultipleSubRange --- quest/src/core/utilities.cpp | 46 ++++++++++++++++++++++--------- quest/src/core/utilities.hpp | 5 ++-- quest/src/cpu/cpu_subroutines.cpp | 15 +++++++--- 3 files changed, 47 insertions(+), 19 deletions(-) diff --git a/quest/src/core/utilities.cpp b/quest/src/core/utilities.cpp index 9377be4ee..7aa4a257f 100644 --- a/quest/src/core/utilities.cpp +++ b/quest/src/core/utilities.cpp @@ -5,6 +5,7 @@ * logic, matrix algebra, and channel parameters. * * @author Tyson Jones + * @author Luc Jaulmes (distributing ranges over blocks) */ #include "quest/include/types.h" @@ -25,6 +26,7 @@ #include #include +#include #include #include #include @@ -902,25 +904,43 @@ util_VectorIndexRange util_getLocalIndRangeOfVectorElemsWithinNode(int rank, qin return out; } +std::pair util_getBlockMultipleSubRange( + qindex rangeLen, qindex blockLen, int idSubRange, int numSubRanges +) { + // divides a range into whole blocks (and a single leftover sub-block) and + // attempts to uniformly distribute the blocks across the specified number of + // sub-ranges. When the blocks do not divide evenly between sub-ranges, the + // leftover blocks are spread apart across sub-ranges. 
When the range does not + // divide evenly into blocks, the overflow is given to the final sub-range. -std::pair -util_distribute(const qindex work, const qindex block, const int id, const int n) { - const qindex blocks = work / block; - // really should not happen except when work < block as work is power of 2 and block should be too. - const qindex last = work % block; + qindex numFullBlocks = rangeLen / blockLen; // floors + qindex subBlockLen = rangeLen % blockLen; - qindex spread = blocks / n; - qindex extra = blocks % n; + qindex baseNumBlocksPerSubRange = numFullBlocks / numSubRanges; + qindex numExtraBlocks = numFullBlocks % numSubRanges; - qindex prev_extra = (id * extra) / n; - qindex prev_shift = (id * extra) % n; - qindex here_extra = (prev_shift + extra) >= n; + // determine how many extra blocks this subrange should contain + qindex prevExtra = (idSubRange * numExtraBlocks) / numSubRanges; + qindex prevShift = (idSubRange * numExtraBlocks) % numSubRanges; + bool hereExtra = (prevShift + numExtraBlocks) >= numSubRanges; - qindex pos = id * spread + prev_extra; - qindex end = pos + spread + here_extra; - return std::make_pair(pos * block, end * block + (id == n - 1 ? 
last : 0)); + // allocate blocks to this sub-range + qindex startBlockInd = idSubRange * baseNumBlocksPerSubRange + prevExtra; + qindex endBlockInd = startBlockInd + baseNumBlocksPerSubRange + hereExtra; + + // find this sub-range indices within [0, rangeLen) + qindex startInd = startBlockInd * blockLen; + qindex endInd = endBlockInd * blockLen; // exclusive + + // arbitrarily allocate the leftover sub-block to the final sub-range + if (idSubRange == numSubRanges - 1) + endInd += subBlockLen; + + return std::make_pair(startInd, endInd); } + + /* * GATE PARAMETERS */ diff --git a/quest/src/core/utilities.hpp b/quest/src/core/utilities.hpp index 36f051595..56691c2f4 100644 --- a/quest/src/core/utilities.hpp +++ b/quest/src/core/utilities.hpp @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -342,9 +343,8 @@ bool util_areAnyVectorElemsWithinNode(int rank, qindex numElemsPerNode, qindex s util_VectorIndexRange util_getLocalIndRangeOfVectorElemsWithinNode(int rank, qindex numElemsPerNode, qindex elemStartInd, qindex numInds); +std::pair util_getBlockMultipleSubRange(qindex rangeLen, qindex blockLen, int idSubRange, int numSubRanges); -// Generic function to split a workload fairly, with granularity >= block_size -std::pair util_distribute(const qindex work, const qindex block, const int id, const int n); /* @@ -355,6 +355,7 @@ qreal util_getPhaseFromGateAngle(qreal angle); qcomp util_getPhaseFromGateAngle(qcomp angle); + /* * DECOHERENCE FACTORS */ diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp index ad5266e08..b267df03a 100644 --- a/quest/src/cpu/cpu_subroutines.cpp +++ b/quest/src/cpu/cpu_subroutines.cpp @@ -9,6 +9,7 @@ * * @author Tyson Jones * @author Oliver Brown (OpenMP 'if' clauses) + * @author Luc Jaulmes (optimised initUniformState) * @author Richard Meister (helped patch on LLVM) * @author Kshitij Chhabra (patched v3 clauses with gcc9) * @author Ania (Anna) Brown (developed QuEST v1 logic) @@ 
-2355,12 +2356,18 @@ INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, cpu_densmatr_multiQubitProjector void cpu_statevec_initUniformState_sub(Qureg qureg, qcomp amp) { - // faster on average (though perhaps not for large quregs) - // than a custom multithreaded loop + // approx-uniformly distribute modified memory pages across threads, + // in the hope that each std::fill() will touch only memory within + // the thread's corresponding NUMA node, for best performance + + int numAmpsPerPage = cpu_getPageSize() / sizeof(qcomp); // divides evenly + #pragma omp parallel if(qureg.isMultithreaded) { - const auto [start, end] = util_distribute(qureg.numAmpsPerNode, cpu_getPageSize() / sizeof(qcomp), - cpu_getOpenmpThreadInd(), cpu_getCurrentNumThreads()); + const auto [start, end] = util_getBlockMultipleSubRange( + qureg.numAmpsPerNode, numAmpsPerPage, + cpu_getOpenmpThreadInd(), cpu_getCurrentNumThreads()); + std::fill(qureg.cpuAmps + start, qureg.cpuAmps + end, amp); } } From 311b4d135055603634131d61ce221b71a527255a Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Thu, 3 Jul 2025 22:49:35 +0200 Subject: [PATCH 09/16] added Windows getPageSize --- quest/src/cpu/cpu_config.cpp | 65 +++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 26 deletions(-) diff --git a/quest/src/cpu/cpu_config.cpp b/quest/src/cpu/cpu_config.cpp index 9aa640fe8..778eae882 100644 --- a/quest/src/cpu/cpu_config.cpp +++ b/quest/src/cpu/cpu_config.cpp @@ -3,6 +3,7 @@ * configuration, and allocating and copying RAM data. * * @author Tyson Jones + * @author Luc Jaulmes (NUMA awareness) */ #include "quest/include/modes.h" @@ -32,16 +33,35 @@ using std::vector; #endif +/// @todo +/// Windows provides a NUMA API we could access in theory, although we +/// forego the hassle for now - who is running QuEST on big multi-core +/// Windows? 
This validation protects against enabling NUMA awareness +/// on Windows but silently recieving no benefit due to no NUMA API calls + +#if NUMA_AWARE && defined(_WIN32) + #error "NUMA awareness is not currently supported on non-POSIX systems like Windows." +#endif + + #if COMPILE_OPENMP #include #endif -#if NUMA_AWARE +#if NUMA_AWARE && ! defined(_WIN32) #include - #include #include #include -#endif // NUMA_AWARE +#endif + +#if defined(_WIN32) + #define NOMINMAX + #define WIN32_LEAN_AND_MEAN + #include +#else + #include +#endif + /* @@ -113,33 +133,26 @@ int cpu_getCurrentNumThreads() { * MEMORY ALLOCATION */ -unsigned long cpu_getPageSize() { -#if NUMA_AWARE - static unsigned long page_size = 0; - if (!page_size) { - page_size = sysconf(_SC_PAGESIZE); - if (page_size == ~0UL) { - error_gettingPageSizeFailed(); - } - } - return page_size; +long cpu_getPageSize() { + + // avoid repeated queries to this fixed value + static long pageSize = 0; + if (pageSize > 0) + return pageSize; + + // obtain pageSize for the first time +#if defined(_WIN32) + SYSTEM_INFO sysInfo; + GetSystemInfo(&sysInfo); + pageSize = sysInfo.dwPageSize; #else - return FALLBACK_PAGE_SIZE; + pageSize = sysconf(_SC_PAGESIZE); #endif -} -#if NUMA_AWARE -unsigned long cpu_getNumaNodes() { - static int n_nodes = 0; - if (!n_nodes) { - n_nodes = numa_num_configured_nodes(); - if (n_nodes < 1) { - error_gettingNumaNodesFailed(); - } - } - return n_nodes; + + return pageSize; } -#endif + qcomp* cpu_allocArray(qindex length) { return (qcomp*) calloc(length, sizeof(qcomp)); From 1ffd8041cb3bfaa6dfe5aac922c172ced3a3cbcb Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Thu, 3 Jul 2025 22:52:00 +0200 Subject: [PATCH 10/16] clarified NUMA alloc --- quest/src/core/memory.hpp | 6 -- quest/src/cpu/cpu_config.cpp | 118 ++++++++++++++++++++++++----------- 2 files changed, 81 insertions(+), 43 deletions(-) diff --git a/quest/src/core/memory.hpp b/quest/src/core/memory.hpp index cd20d867b..7b112b027 100644 --- 
a/quest/src/core/memory.hpp +++ b/quest/src/core/memory.hpp @@ -21,12 +21,6 @@ -/* - * Not necessarily the actual page size, but a sensible default for page-sized things. - */ -#define FALLBACK_PAGE_SIZE 4096 - - /* * HARDWARE QUERYING */ diff --git a/quest/src/cpu/cpu_config.cpp b/quest/src/cpu/cpu_config.cpp index 778eae882..a831869bd 100644 --- a/quest/src/cpu/cpu_config.cpp +++ b/quest/src/cpu/cpu_config.cpp @@ -133,6 +133,14 @@ int cpu_getCurrentNumThreads() { * MEMORY ALLOCATION */ + +qindex getNumPagesToContainArray(long pageLen, qindex arrLen) { + + // round up to the nearest page + return static_cast(std::ceil(arrLen / (qreal) pageLen)); +} + + long cpu_getPageSize() { // avoid repeated queries to this fixed value @@ -160,39 +168,65 @@ qcomp* cpu_allocArray(qindex length) { qcomp* cpu_allocNumaArray(qindex length) { -#if !NUMA_AWARE +#if ! NUMA_AWARE return cpu_allocArray(length); -#else - unsigned long page_size = cpu_getPageSize(); - int n_nodes = cpu_getNumaNodes(); - - qindex size = length * sizeof(qcomp); - int pages = (size + page_size - 1) / page_size; - void *addr = mmap(NULL, pages * page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (n_nodes == 1) { - return reinterpret_cast(addr); - } - - // distribution strategy: floor_pages per node, distribute remain_pages as spread out as possible - int floor_pages = pages / n_nodes; - int spread_pages = pages % n_nodes; - - uintptr_t pos = (uintptr_t)addr; - for (int node = 0, shift = n_nodes; node < n_nodes; ++node) { - shift -= spread_pages; - int node_pages = floor_pages + (shift <= 0); - unsigned long node_mask = 1UL << node; - mbind((void*)pos, node_pages * page_size, MPOL_BIND, &node_mask, sizeof(node_mask) * 8, 0); +#elif defined(_WIN32) + error_numaAllocOrDeallocAttemptedOnWindows(); - pos += node_pages * page_size; - if (shift <= 0) { - shift += n_nodes; - } +#else + // we will divide array's memory into pages + long pageSize = cpu_getPageSize(); + qindex arraySize = 
length * sizeof(qcomp); // gauranteed no overflow + + // if entire array fits within a single page, alloc like normal + if (arraySize <= pageSize) + return cpu_allocArray(length); + + // otherwise we will bind pages across NUMA nodes + static int numNodes = numa_num_configured_nodes(); + if (numNodes < 1) + error_gettingNumNumaNodesFailed(); + + qindex numPages = getNumPagesToContainArray(pageSize, arraySize); + qindex numBytes = numPages * pageSize; // prior validation gaurantees no overflow + + // allocate memory, potentially more than arraySize (depending on page divisibility) + void *rawAddr = mmap(NULL, numBytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + // if there is only a single NUMA node, then all memory access will occur within it + qcomp* outAddr = reinterpret_cast(rawAddr); + if (numNodes == 1) + return outAddr; + + // otherwise, we bind continguous pages to NUMA nodes, distributing the pages + // attemptedly uniformly and spreading remaining pages maximally apart + qindex baseNumPagesPerNode = numPages / numNodes; // floors + qindex remainingNumPagesTotal = numPages % numNodes; + + // use integer type for safe address arithmetic below + uintptr_t offsetAddr = reinterpret_cast(rawAddr); + + for (int node=0, shift=numNodes; node < numNodes; ++node) { + + // decide number of pages to bind to NUMA node + shift -= remainingNumPagesTotal; + qindex numPagesInNode = baseNumPagesPerNode + (shift <= 0); + qindex numBytesInNode = numPagesInNode * pageSize; // validation prevents overflow + + // bind those pages from the offset address to the node (identified by mask) + unsigned long nodeMask = 1UL << node; + void* nodeAddr = reinterpret_cast(offsetAddr); + long success = mbind(nodeAddr, numBytesInNode, MPOL_BIND, &nodeMask, numNodes, 0); + + // prepare next node's address + offsetAddr += numPagesInNode * pageSize; + if (shift <= 0) + shift += numNodes; } - return reinterpret_cast(addr); -#endif // NUMA_AWARE + return outAddr; +#endif } @@ 
-204,19 +238,29 @@ void cpu_deallocNumaArray(qcomp* arr, qindex length) { - if (arr == nullptr) { + + // mustn't pass nullptr to munmap() below + if (arr == nullptr) return; - } -#if !NUMA_AWARE - return cpu_deallocArray(arr); +#if ! NUMA_AWARE + cpu_deallocArray(arr); + +#elif defined(_WIN32) + error_numaAllocOrDeallocAttemptedOnWindows(); + #else - unsigned long page_size = cpu_getPageSize(); - qindex size = length * sizeof(qcomp); - int pages = (size + page_size - 1) / page_size; + qindex arrSize = length * sizeof(qcomp); + unsigned long pageSize = cpu_getPageSize(); - munmap(arr, pages * page_size); -#endif // NUMA_AWARE + // sub-page arrays were allocated with calloc() + if (arrSize <= pageSize) + return cpu_deallocArray(length); + + qindex numPages = getNumPagesToContainArray(pageSize, arraySize); + qindex numBytes = numPages * pageSize; // gauranteed no overflow + int success = munmap(arr, numBytes); +#endif } From 872336ed1a6f16241c266798eb9f4ec5bb53270d Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Thu, 3 Jul 2025 22:52:41 +0200 Subject: [PATCH 11/16] added error checking --- quest/src/core/errors.cpp | 38 +++++++++++++++++++++++++++++------- quest/src/core/errors.hpp | 17 ++++++++++++++-- quest/src/cpu/cpu_config.cpp | 26 ++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 9 deletions(-) diff --git a/quest/src/core/errors.cpp b/quest/src/core/errors.cpp index afcd8c014..112e94230 100644 --- a/quest/src/core/errors.cpp +++ b/quest/src/core/errors.cpp @@ -5,6 +5,7 @@ * deployment is consistent with the compiled deployment modes.
* * @author Tyson Jones + * @author Luc Jaulmes (NUMA & pagesize errors) */ #include "quest/include/types.h" @@ -94,26 +95,49 @@ void error_allocOfQuESTEnvFailed() { } + +/* + * MEMORY ERRORS + */ + +void error_memSizeQueriedButWouldOverflow() { + + raiseInternalError("Attempted to obtain memory necessary to allocate a distributed object's single-node partition but it overflowed size_t despite prior validation."); +} + void error_gettingPageSizeFailed() { raiseInternalError("Failed to get the page size."); } +void error_pageSizeNotAPowerOf2() { + + raiseInternalError("The discovered page size was not a power of 2. Get Dr Denning on the phone."); +} -void error_gettingNumaNodesFailed() { +void error_pageSizeNotAMultipleOfQcomp() { - raiseInternalError("Failed to get the numa node count"); + raiseInternalError("The page size was indivisible by the number of bytes in a qcomp."); } +void error_gettingNumNumaNodesFailed() { + raiseInternalError("Failed to get the NUMA node count"); +} -/* - * MEMORY ERRORS - */ +void error_numaAllocOrDeallocAttemptedOnWindows() { -void error_memSizeQueriedButWouldOverflow() { + raiseInternalError("NUMA-aware memory allocation or deallocation was attempted on Windows though this is not yet implemented, indicating a potential build issue."); +} - raiseInternalError("Attempted to obtain memory necessary to allocate a distributed object's single-node partition but it overflowed size_t despite prior validation."); +void error_numaBindingFailed() { + + raiseInternalError("The binding of memory pages to NUMA nodes (with mbind) unexpectedly failed, despite prior reservation (with mmap) succeeding."); +} + +void error_numaUnmappingFailed() { + + raiseInternalError("NUMA-aware memory deallocation unexpectedly failed."); } diff --git a/quest/src/core/errors.hpp b/quest/src/core/errors.hpp index 88cffeb50..a7f3615b5 100644 --- a/quest/src/core/errors.hpp +++ b/quest/src/core/errors.hpp @@ -5,6 +5,7 @@ * deployment is consistent with the compiled 
deployment modes. * * @author Tyson Jones + * @author Luc Jaulmes (NUMA & pagesize errors) */ #ifndef ERRORS_HPP @@ -42,9 +43,7 @@ void error_validationListUniquenessCheckExceededMaskSize(); void error_allocOfQuESTEnvFailed(); -void error_gettingPageSizeFailed(); -void error_gettingNumaNodesFailed(); /* @@ -53,6 +52,20 @@ void error_gettingNumaNodesFailed(); void error_memSizeQueriedButWouldOverflow(); +void error_gettingPageSizeFailed(); + +void error_pageSizeNotAPowerOf2(); + +void error_pageSizeNotAMultipleOfQcomp(); + +void error_gettingNumNumaNodesFailed(); + +void error_numaAllocOrDeallocAttemptedOnWindows(); + +void error_numaBindingFailed(); + +void error_numaUnmappingFailed(); + /* diff --git a/quest/src/cpu/cpu_config.cpp b/quest/src/cpu/cpu_config.cpp index a831869bd..3339f7772 100644 --- a/quest/src/cpu/cpu_config.cpp +++ b/quest/src/cpu/cpu_config.cpp @@ -12,6 +12,7 @@ #include "quest/src/core/memory.hpp" #include "quest/src/core/errors.hpp" +#include "quest/src/core/bitwise.hpp" #include #include @@ -157,6 +158,19 @@ long cpu_getPageSize() { pageSize = sysconf(_SC_PAGESIZE); #endif + // rigorously check the found pagesize is valid + // and consistent with preconditions assumed by + // callers, to avoid extremely funky bugs on + // esoteric future systems + + if (pageSize <= 0) + error_gettingPageSizeFailed(); + + if (!isPowerOf2(pageSize)) + error_pageSizeNotAPowerOf2(); + + if (pageSize % sizeof(qcomp) != 0) + error_pageSizeNotAMultipleOfQcomp(); return pageSize; } @@ -194,6 +208,10 @@ qcomp* cpu_allocNumaArray(qindex length) { // allocate memory, potentially more than arraySize (depending on page divisibility) void *rawAddr = mmap(NULL, numBytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + // indicate memory alloc failure to caller (no NUMA-specific validation error message) + if (rawAddr == MAP_FAILED) + return nullptr; + // if there is only a single NUMA node, then all memory access will occur within it qcomp* outAddr = 
reinterpret_cast(rawAddr); if (numNodes == 1) @@ -219,6 +237,11 @@ qcomp* cpu_allocNumaArray(qindex length) { void* nodeAddr = reinterpret_cast(offsetAddr); long success = mbind(nodeAddr, numBytesInNode, MPOL_BIND, &nodeMask, numNodes, 0); + // treat bind failure as internal error (even though it can result from insufficient kernel mem), + // rather than permitting silent fallback to non-NUMA awareness which might be astonishingly slow + if (success == -1) + error_numaBindingFailed(); + // prepare next node's address offsetAddr += numPagesInNode * pageSize; if (shift <= 0) @@ -260,6 +283,9 @@ void cpu_deallocNumaArray(qcomp* arr, qindex length) { qindex numPages = getNumPagesToContainArray(pageSize, arraySize); qindex numBytes = numPages * pageSize; // gauranteed no overflow int success = munmap(arr, numBytes); + + if (success == -1) + error_numaUnmappingFailed(); #endif } From 99583117f577b1806556d6d885cf68a5b1057c46 Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Thu, 3 Jul 2025 22:59:09 +0200 Subject: [PATCH 12/16] updating authorlists and speeding up Windows compilation --- CMakeLists.txt | 2 +- quest/src/core/errors.hpp | 1 - quest/src/core/memory.cpp | 1 + 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9e629de55..a05a70e44 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ # @author Oliver Thomson Brown # @author Erich Essmann (patches including MSVC support) # @author Tyson Jones (patches including clang multithreading) -# @author Luc Jaulmes (patching install) +# @author Luc Jaulmes (NUMA awareness, patching install) # # Contributions to previous builds from: # - Ania Brown diff --git a/quest/src/core/errors.hpp b/quest/src/core/errors.hpp index a7f3615b5..d5b2ddadf 100644 --- a/quest/src/core/errors.hpp +++ b/quest/src/core/errors.hpp @@ -45,7 +45,6 @@ void error_allocOfQuESTEnvFailed(); - /* * MEMORY ERRORS */ diff --git a/quest/src/core/memory.cpp b/quest/src/core/memory.cpp index 
79d4301a4..c8b81fc88 100644 --- a/quest/src/core/memory.cpp +++ b/quest/src/core/memory.cpp @@ -30,6 +30,7 @@ #include #elif defined(_WIN32) #define NOMINMAX + #define WIN32_LEAN_AND_MEAN #include #endif From eacba2db3f7135c3c91d388e4a81d628e2a28430 Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Thu, 3 Jul 2025 23:06:50 +0200 Subject: [PATCH 13/16] patching cpu_getPageSize() return type --- quest/src/cpu/cpu_config.cpp | 2 +- quest/src/cpu/cpu_config.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/quest/src/cpu/cpu_config.cpp b/quest/src/cpu/cpu_config.cpp index 3339f7772..5e93114a0 100644 --- a/quest/src/cpu/cpu_config.cpp +++ b/quest/src/cpu/cpu_config.cpp @@ -274,7 +274,7 @@ void cpu_deallocNumaArray(qcomp* arr, qindex length) { #else qindex arrSize = length * sizeof(qcomp); - unsigned long pageSize = cpu_getPageSize(); + long pageSize = cpu_getPageSize(); // sub-page arrays were allocated with calloc() if (arrSize <= pageSize) diff --git a/quest/src/cpu/cpu_config.hpp b/quest/src/cpu/cpu_config.hpp index 0d8a4e416..21d39e359 100644 --- a/quest/src/cpu/cpu_config.hpp +++ b/quest/src/cpu/cpu_config.hpp @@ -65,7 +65,7 @@ PauliStr* cpu_allocPauliStrings(qindex numStrings); void cpu_deallocPauliStrings(PauliStr* strings); -unsigned long cpu_getPageSize(); +long cpu_getPageSize(); /* From 76cffba574460198b76a9640c2251a8d20c2b05d Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Thu, 3 Jul 2025 23:42:34 +0200 Subject: [PATCH 14/16] embarrassing really --- quest/src/cpu/cpu_config.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/quest/src/cpu/cpu_config.cpp b/quest/src/cpu/cpu_config.cpp index 5e93114a0..bada05bca 100644 --- a/quest/src/cpu/cpu_config.cpp +++ b/quest/src/cpu/cpu_config.cpp @@ -278,9 +278,9 @@ void cpu_deallocNumaArray(qcomp* arr, qindex length) { // sub-page arrays were allocated with calloc() if (arrSize <= pageSize) - return cpu_deallocArray(length); + return cpu_deallocArray(arr); - qindex numPages 
= getNumPagesToContainArray(pageSize, arraySize); + qindex numPages = getNumPagesToContainArray(pageSize, arrSize); qindex numBytes = numPages * pageSize; // gauranteed no overflow int success = munmap(arr, numBytes); From 223447c37c15ce13a12a85131dacc3438a5cbf55 Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Sat, 5 Jul 2025 01:13:19 +0200 Subject: [PATCH 15/16] patched mbind args --- CMakeLists.txt | 1 + quest/src/cpu/cpu_config.cpp | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a05a70e44..e2e52d85e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -272,6 +272,7 @@ if (ENABLE_MULTITHREADING) if (${NUMA_FOUND}) compile_option(NUMA_AWARE ${NUMA_FOUND}) target_link_libraries(QuEST PRIVATE PkgConfig::NUMA) + message(STATUS "NUMA awareness is enabled.") else() compile_option(NUMA_AWARE 0) message(WARNING "libnuma not found, QuEST will not be aware of numa locality") diff --git a/quest/src/cpu/cpu_config.cpp b/quest/src/cpu/cpu_config.cpp index bada05bca..b24a34e9e 100644 --- a/quest/src/cpu/cpu_config.cpp +++ b/quest/src/cpu/cpu_config.cpp @@ -234,8 +234,9 @@ qcomp* cpu_allocNumaArray(qindex length) { // bind those pages from the offset address to the node (identified by mask) unsigned long nodeMask = 1UL << node; + unsigned long numBitsInMask = 8 * nodeMask; void* nodeAddr = reinterpret_cast(offsetAddr); - long success = mbind(nodeAddr, numBytesInNode, MPOL_BIND, &nodeMask, numNodes, 0); + long success = mbind(nodeAddr, numBytesInNode, MPOL_BIND, &nodeMask, numBitsInMask, 0); // treat bind failure as internal error (even though it can result from insufficient kernel mem), // rather than permitting silent fallback to non-NUMA awareness which might be astonishingly slow From 2227b0186fd64371bc3d3b2805d1a537d400a6be Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Sat, 5 Jul 2025 01:15:22 +0200 Subject: [PATCH 16/16] absolutely shameful --- quest/src/cpu/cpu_config.cpp | 2 +- 1 file changed, 1 insertion(+), 
1 deletion(-) diff --git a/quest/src/cpu/cpu_config.cpp b/quest/src/cpu/cpu_config.cpp index b24a34e9e..e488e6a9c 100644 --- a/quest/src/cpu/cpu_config.cpp +++ b/quest/src/cpu/cpu_config.cpp @@ -234,7 +234,7 @@ qcomp* cpu_allocNumaArray(qindex length) { // bind those pages from the offset address to the node (identified by mask) unsigned long nodeMask = 1UL << node; - unsigned long numBitsInMask = 8 * nodeMask; + unsigned long numBitsInMask = 8 * sizeof(nodeMask); void* nodeAddr = reinterpret_cast(offsetAddr); long success = mbind(nodeAddr, numBytesInNode, MPOL_BIND, &nodeMask, numBitsInMask, 0);