QuEST-Kit · TysonRayJones · Jul 9, 2025 · May 12, 2025 · May 6, 2025 · May 6, 2025
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,7 +1,7 @@
 # @author Oliver Thomson Brown
 # @author Erich Essmann (patches including MSVC support)
 # @author Tyson Jones (patches including clang multithreading)
-# @author Luc Jaulmes (patching install)
+# @author Luc Jaulmes (NUMA awareness, patching install)
 #
 # Contributions to previous builds from:
 #  - Ania Brown
@@ -262,6 +262,23 @@ if (ENABLE_MULTITHREADING)
     OpenMP::OpenMP_C
   )
 
+  # Set the NUMA_AWARE compile option according to whether libnuma is discoverable via pkg-config
+  if (WIN32)
+    compile_option(NUMA_AWARE 0) # libnuma is never searched for on Windows
+    message(WARNING "Building on Windows, QuEST will not be aware of numa locality")
+  else()
+    include(FindPkgConfig) # defines pkg_search_module() even when pkg-config itself is absent
+    pkg_search_module(NUMA numa IMPORTED_TARGET GLOBAL)
+    if (NUMA_FOUND) # tested by name (not ${...}) so an unset or empty result is simply false
+      compile_option(NUMA_AWARE 1)
+      target_link_libraries(QuEST PRIVATE PkgConfig::NUMA)
+      message(STATUS "NUMA awareness is enabled.")
+    else()
+      compile_option(NUMA_AWARE 0)
+      message(WARNING "libnuma not found, QuEST will not be aware of numa locality")
+    endif()
+  endif()
+
   if (VERBOSE_LIB_NAME)
     string(CONCAT LIB_NAME ${LIB_NAME} "+mt")
   endif()

diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
@@ -225,7 +225,7 @@ void printCpuInfo() {
         "cpu", {
         {"numCpuCores",   printer_toStr(std::thread::hardware_concurrency()) + pm},
         {"numOmpProcs",   (cpu_isOpenmpCompiled())? printer_toStr(cpu_getNumOpenmpProcessors()) + pm : na},
-        {"numOmpThrds",   (cpu_isOpenmpCompiled())? printer_toStr(cpu_getCurrentNumThreads()) + pn : na},
+        {"numOmpThrds",   (cpu_isOpenmpCompiled())? printer_toStr(cpu_getAvailableNumThreads()) + pn : na},
         {"cpuMemory",     ram},
         {"cpuMemoryFree", un},
     });
@@ -494,7 +494,7 @@ void getEnvironmentString(char str[200]) {
 
     QuESTEnv env = getQuESTEnv();
 
-    int numThreads = cpu_isOpenmpCompiled()? cpu_getCurrentNumThreads() : 1;
+    int numThreads = cpu_isOpenmpCompiled()? cpu_getAvailableNumThreads() : 1;
     int cuQuantum = env.isGpuAccelerated && gpu_isCuQuantumCompiled();
     int gpuDirect = env.isGpuAccelerated && gpu_isDirectGpuCommPossible();
 

diff --git a/quest/src/api/qureg.cpp b/quest/src/api/qureg.cpp
@@ -154,7 +154,7 @@ Qureg validateAndCreateCustomQureg(int numQubits, int isDensMatr, int useDistrib
     Qureg qureg = qureg_populateNonHeapFields(numQubits, isDensMatr, useDistrib, useGpuAccel, useMultithread);
 
     // always allocate CPU memory
-    qureg.cpuAmps = cpu_allocArray(qureg.numAmpsPerNode); // nullptr if failed
+    qureg.cpuAmps = cpu_allocNumaArray(qureg.numAmpsPerNode); // nullptr if failed
 
     // conditionally allocate GPU memory and communication buffers (even if numNodes == 1).
     // note that in distributed settings but where useDistrib=false, each node will have a
@@ -334,7 +334,7 @@ void destroyQureg(Qureg qureg) {
     validate_quregFields(qureg, __func__);
 
     // free CPU memory
-    cpu_deallocArray(qureg.cpuAmps);
+    cpu_deallocNumaArray(qureg.cpuAmps, qureg.numAmpsPerNode);
 
     // free CPU communication buffer
     if (qureg.isDistributed)

diff --git a/quest/src/core/autodeployer.cpp b/quest/src/core/autodeployer.cpp
@@ -36,7 +36,7 @@ void autodep_chooseQuESTEnvDeployment(int &useDistrib, int &useGpuAccel, int &us
 
     // and we require more than 1 thread available at QuESTEnv creation
     if (useMultithread == modeflag::USE_AUTO)
-        useMultithread = (cpu_isOpenmpCompiled())? (cpu_getCurrentNumThreads() > 1) : 0;
+        useMultithread = (cpu_isOpenmpCompiled())? (cpu_getAvailableNumThreads() > 1) : 0;
 }
 
 

diff --git a/quest/src/core/errors.cpp b/quest/src/core/errors.cpp
@@ -5,6 +5,7 @@
  * deployment is consistent with the compiled deployment modes.
  * 
  * @author Tyson Jones
+ * @author Luc Jaulmes (NUMA & pagesize errors)
  */
 
 #include "quest/include/types.h"
@@ -104,6 +105,41 @@ void error_memSizeQueriedButWouldOverflow() {
     raiseInternalError("Attempted to obtain memory necessary to allocate a distributed object's single-node partition but it overflowed size_t despite prior validation.");
 }
 
+void error_gettingPageSizeFailed() {
+    // reports (via raiseInternalError) that the page size could not be obtained
+    raiseInternalError("Failed to get the page size.");
+}
+
+void error_pageSizeNotAPowerOf2() {
+    // reports (via raiseInternalError) that the discovered page size is not a power of 2
+    raiseInternalError("The discovered page size was not a power of 2. Get Dr Denning on the phone.");
+}
+
+void error_pageSizeNotAMultipleOfQcomp() {
+    // reports (via raiseInternalError) that the page size is not a whole multiple of a qcomp's byte size
+    raiseInternalError("The page size was indivisible by the number of bytes in a qcomp.");
+}
+
+void error_gettingNumNumaNodesFailed() {
+    // reports (via raiseInternalError) that the number of NUMA nodes could not be obtained
+    raiseInternalError("Failed to get the NUMA node count");
+}
+
+void error_numaAllocOrDeallocAttemptedOnWindows() {
+    // reports (via raiseInternalError) that NUMA-aware (de)allocation was attempted on Windows, where it is unimplemented
+    raiseInternalError("NUMA-aware memory allocation or deallocation was attempted on Windows though this is not yet implemented, indicating a potential build issue.");
+}
+
+void error_numaBindingFailed() {
+    // reports (via raiseInternalError) that binding pages to NUMA nodes (mbind) failed after the mmap reservation succeeded
+    raiseInternalError("The binding of memory pages to NUMA nodes (with mbind) unexpectedly failed, despite prior reservation (with mmap) succeeding.");
+}
+
+void error_numaUnmappingFailed() {
+    // reports (via raiseInternalError) that NUMA-aware memory deallocation failed
+    raiseInternalError("NUMA-aware memory deallocation unexpectedly failed.");
+}
+
 
 
 /*

diff --git a/quest/src/core/errors.hpp b/quest/src/core/errors.hpp
@@ -5,6 +5,7 @@
  * deployment is consistent with the compiled deployment modes.
  * 
  * @author Tyson Jones
+ * @author Luc Jaulmes (NUMA & pagesize errors)
  */
 
 #ifndef ERRORS_HPP
@@ -50,6 +51,20 @@ void error_allocOfQuESTEnvFailed();
 
 void error_memSizeQueriedButWouldOverflow();
 
+void error_gettingPageSizeFailed();
+
+void error_pageSizeNotAPowerOf2();
+
+void error_pageSizeNotAMultipleOfQcomp();
+
+void error_gettingNumNumaNodesFailed();
+
+void error_numaAllocOrDeallocAttemptedOnWindows();
+
+void error_numaBindingFailed();
+
+void error_numaUnmappingFailed();
+
 
 
 /*

diff --git a/quest/src/core/memory.cpp b/quest/src/core/memory.cpp
@@ -30,6 +30,7 @@
     #include <sys/sysctl.h>
 #elif defined(_WIN32)
     #define NOMINMAX
+    #define WIN32_LEAN_AND_MEAN
     #include <windows.h>
 #endif
 

diff --git a/quest/src/core/utilities.cpp b/quest/src/core/utilities.cpp
@@ -5,6 +5,7 @@
  * logic, matrix algebra, and channel parameters.
  * 
  * @author Tyson Jones
+ * @author Luc Jaulmes (distributing ranges over blocks)
  */
 
 #include "quest/include/types.h"
@@ -25,6 +26,7 @@
 
 #include <functional>
 #include <algorithm>
+#include <utility>
 #include <complex>
 #include <cmath>
 #include <vector>
@@ -902,6 +904,41 @@ util_VectorIndexRange util_getLocalIndRangeOfVectorElemsWithinNode(int rank, qin
     return out;
 }
 
+std::pair<qindex, qindex> util_getBlockMultipleSubRange(
+    qindex rangeLen, qindex blockLen, int idSubRange, int numSubRanges
+) {
+    // Partitions [0, rangeLen) into numSubRanges contiguous pieces whose
+    // boundaries fall on multiples of blockLen, and returns the piece with
+    // index idSubRange as a (start, exclusive end) pair. Whole blocks which
+    // cannot be shared equally are interleaved among the sub-ranges, and any
+    // trailing partial block is appended to the last sub-range.
+
+    const qindex wholeBlocks = rangeLen / blockLen; // integer division
+    const qindex tailLen = rangeLen % blockLen;
+
+    const qindex blocksEach = wholeBlocks / numSubRanges;
+    const qindex spareBlocks = wholeBlocks % numSubRanges;
+
+    // spare blocks granted to earlier sub-ranges, and whether one lands here
+    const qindex spareBefore = (idSubRange * spareBlocks) / numSubRanges;
+    const qindex spareCarry = (idSubRange * spareBlocks) % numSubRanges;
+    const int spareHere = (spareCarry + spareBlocks >= numSubRanges)? 1 : 0;
+
+    // interval of blocks [firstBlock, lastBlock) owned by this sub-range
+    const qindex firstBlock = idSubRange * blocksEach + spareBefore;
+    const qindex lastBlock = firstBlock + blocksEach + spareHere;
+
+    // convert block indices into element indices within [0, rangeLen)
+    const qindex startInd = firstBlock * blockLen;
+    qindex endInd = lastBlock * blockLen; // exclusive
+
+    // the final sub-range additionally absorbs the trailing partial block
+    const bool isFinal = (idSubRange == numSubRanges - 1);
+    endInd += (isFinal? tailLen : 0);
+
+    return {startInd, endInd};
+}
+
 
 
 /*

diff --git a/quest/src/core/utilities.hpp b/quest/src/core/utilities.hpp
@@ -21,6 +21,7 @@
 
 #include <type_traits>
 #include <functional>
+#include <utility>
 #include <string>
 #include <vector>
 #include <array>
@@ -342,6 +343,8 @@ bool util_areAnyVectorElemsWithinNode(int rank, qindex numElemsPerNode, qindex s
 
 util_VectorIndexRange util_getLocalIndRangeOfVectorElemsWithinNode(int rank, qindex numElemsPerNode, qindex elemStartInd, qindex numInds);
 
+std::pair<qindex, qindex> util_getBlockMultipleSubRange(qindex rangeLen, qindex blockLen, int idSubRange, int numSubRanges);
+
 
 
 /*
@@ -352,6 +355,7 @@ qreal util_getPhaseFromGateAngle(qreal angle);
 qcomp util_getPhaseFromGateAngle(qcomp angle);
 
 
+
 /*
  * DECOHERENCE FACTORS
  */