From b3e1b435d6dc5c5de64a12123665a45ee269b181 Mon Sep 17 00:00:00 2001 From: Luc Jaulmes Date: Mon, 12 May 2025 11:21:34 +0100 Subject: [PATCH 01/16] Rename CurrentNumThreads to AvailableNumThreads --- quest/src/api/environment.cpp | 4 ++-- quest/src/core/autodeployer.cpp | 2 +- quest/src/cpu/cpu_config.cpp | 12 +++++++++++- quest/src/cpu/cpu_config.hpp | 4 +++- quest/src/cpu/cpu_subroutines.cpp | 2 +- 5 files changed, 18 insertions(+), 6 deletions(-) diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp index 6eef515c4..541491899 100644 --- a/quest/src/api/environment.cpp +++ b/quest/src/api/environment.cpp @@ -225,7 +225,7 @@ void printCpuInfo() { "cpu", { {"numCpuCores", printer_toStr(std::thread::hardware_concurrency()) + pm}, {"numOmpProcs", (cpu_isOpenmpCompiled())? printer_toStr(cpu_getNumOpenmpProcessors()) + pm : na}, - {"numOmpThrds", (cpu_isOpenmpCompiled())? printer_toStr(cpu_getCurrentNumThreads()) + pn : na}, + {"numOmpThrds", (cpu_isOpenmpCompiled())? printer_toStr(cpu_getAvailableNumThreads()) + pn : na}, {"cpuMemory", ram}, {"cpuMemoryFree", un}, }); @@ -494,7 +494,7 @@ void getEnvironmentString(char str[200]) { QuESTEnv env = getQuESTEnv(); - int numThreads = cpu_isOpenmpCompiled()? cpu_getCurrentNumThreads() : 1; + int numThreads = cpu_isOpenmpCompiled()? cpu_getAvailableNumThreads() : 1; int cuQuantum = env.isGpuAccelerated && gpu_isCuQuantumCompiled(); int gpuDirect = env.isGpuAccelerated && gpu_isDirectGpuCommPossible(); diff --git a/quest/src/core/autodeployer.cpp b/quest/src/core/autodeployer.cpp index 2b6645e42..27c412687 100644 --- a/quest/src/core/autodeployer.cpp +++ b/quest/src/core/autodeployer.cpp @@ -36,7 +36,7 @@ void autodep_chooseQuESTEnvDeployment(int &useDistrib, int &useGpuAccel, int &us // and we require more than 1 thread available at QuESTEnv creation if (useMultithread == modeflag::USE_AUTO) - useMultithread = (cpu_isOpenmpCompiled())? 
(cpu_getCurrentNumThreads() > 1) : 0; + useMultithread = (cpu_isOpenmpCompiled())? (cpu_getAvailableNumThreads() > 1) : 0; } diff --git a/quest/src/cpu/cpu_config.cpp b/quest/src/cpu/cpu_config.cpp index f27471d38..a818f5be4 100644 --- a/quest/src/cpu/cpu_config.cpp +++ b/quest/src/cpu/cpu_config.cpp @@ -46,11 +46,12 @@ bool cpu_isOpenmpCompiled() { } -int cpu_getCurrentNumThreads() { +int cpu_getAvailableNumThreads() { #if COMPILE_OPENMP int n = -1; #pragma omp parallel shared(n) + #pragma omp single n = omp_get_num_threads(); return n; @@ -90,6 +91,15 @@ int cpu_getOpenmpThreadInd() { } +int cpu_getCurrentNumThreads() { +#if COMPILE_OPENMP + return omp_get_num_threads(); +#else + return 1; +#endif +} + + /* * MEMORY ALLOCATION diff --git a/quest/src/cpu/cpu_config.hpp b/quest/src/cpu/cpu_config.hpp index 48f54b44f..982fc8a2f 100644 --- a/quest/src/cpu/cpu_config.hpp +++ b/quest/src/cpu/cpu_config.hpp @@ -23,7 +23,7 @@ using std::vector; bool cpu_isOpenmpCompiled(); -int cpu_getCurrentNumThreads(); +int cpu_getAvailableNumThreads(); int cpu_getNumOpenmpProcessors(); @@ -35,6 +35,8 @@ int cpu_getNumOpenmpProcessors(); int cpu_getOpenmpThreadInd(); +int cpu_getCurrentNumThreads(); + /* diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp index 9c90e08c1..0b60d85ca 100644 --- a/quest/src/cpu/cpu_subroutines.cpp +++ b/quest/src/cpu/cpu_subroutines.cpp @@ -873,7 +873,7 @@ void cpu_statevector_anyCtrlPauliTensorOrGadget_subA( // whenever each thread has at least 1 iteration for itself. And of course // we serialise both inner and outer loops when qureg multithreading is off. 
- if (!qureg.isMultithreaded || numOuterIts >= cpu_getCurrentNumThreads()) { + if (!qureg.isMultithreaded || numOuterIts >= cpu_getAvailableNumThreads()) { // parallel #pragma omp parallel for if(qureg.isMultithreaded) From 08b7db228b6b01a0ea450e0e1d9c182e2e2eef7b Mon Sep 17 00:00:00 2001 From: Luc Jaulmes Date: Tue, 6 May 2025 10:57:14 +0200 Subject: [PATCH 02/16] Assign init work in parallel to enforce first-touch --- quest/src/core/utilities.hpp | 18 ++++++++++++++++++ quest/src/cpu/cpu_subroutines.cpp | 9 ++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/quest/src/core/utilities.hpp b/quest/src/core/utilities.hpp index 6d741312b..3fd47c43a 100644 --- a/quest/src/core/utilities.hpp +++ b/quest/src/core/utilities.hpp @@ -343,6 +343,24 @@ bool util_areAnyVectorElemsWithinNode(int rank, qindex numElemsPerNode, qindex s util_VectorIndexRange util_getLocalIndRangeOfVectorElemsWithinNode(int rank, qindex numElemsPerNode, qindex elemStartInd, qindex numInds); +// Generic function to split a workload fairly, with granularity >= block_size +static inline +std::pair +util_distribute(const qindex work, const qindex block, const int id, const int n) { + // ASSUME(work % block == 0); + const qindex blocks = work / block; + + qindex spread = blocks / n; + qindex extra = blocks % n; + + qindex prev_extra = (id * extra) / n; + qindex prev_shift = (id * extra) % n; + qindex here_extra = (prev_shift + extra) >= n; + + qindex pos = id * spread + prev_extra; + return std::make_pair(pos * block, (pos + spread + here_extra) * block); +} + /* * GATE PARAMETERS diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp index 0b60d85ca..87e9e47a9 100644 --- a/quest/src/cpu/cpu_subroutines.cpp +++ b/quest/src/cpu/cpu_subroutines.cpp @@ -2357,7 +2357,14 @@ void cpu_statevec_initUniformState_sub(Qureg qureg, qcomp amp) { // faster on average (though perhaps not for large quregs) // than a custom multithreaded loop - std::fill(qureg.cpuAmps, 
qureg.cpuAmps + qureg.numAmpsPerNode, amp); + #pragma omp parallel if(qureg.isMultithreaded) + { + + // Distribute number of tasks and convert to indexes. 4kB page standard? + const auto [start, end] = util_distribute(qureg.numAmpsPerNode, 4096 / sizeof(qcomp), + cpu_getOpenmpThreadInd(), cpu_getCurrentNumThreads()); + std::fill(qureg.cpuAmps + start, qureg.cpuAmps + end, amp); + } } From 3adba93c99b8b0aa7facce01fbbd62dd1c5c38dd Mon Sep 17 00:00:00 2001 From: Luc Jaulmes Date: Tue, 6 May 2025 11:46:31 +0200 Subject: [PATCH 03/16] Numa aware alloc --- CMakeLists.txt | 15 ++++++ quest/src/api/qureg.cpp | 4 +- quest/src/core/errors.cpp | 12 +++++ quest/src/core/errors.hpp | 3 ++ quest/src/cpu/cpu_config.cpp | 96 +++++++++++++++++++++++++++++++----- quest/src/cpu/cpu_config.hpp | 3 ++ 6 files changed, 118 insertions(+), 15 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 933e23086..a5a69e761 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -262,6 +262,21 @@ if (ENABLE_MULTITHREADING) OpenMP::OpenMP_C ) + # Find NUMA - location of NUMA headers + if (WIN32) + compile_option(NUMA_AWARE 0) + message(WARNING "Building on Windows, QuEST will not be aware of numa locality") + else() + include(FindPkgConfig) + pkg_search_module(NUMA numa IMPORTED_TARGET GLOBAL) + compile_option(NUMA_AWARE ${NUMA_FOUND}) + if (${NUMA_FOUND}) + target_link_libraries(QuEST PRIVATE PkgConfig::NUMA) + else() + message(WARNING "libnuma not found, QuEST will not be aware of numa locality") + endif() + endif() + if (VERBOSE_LIB_NAME) string(CONCAT LIB_NAME ${LIB_NAME} "+mt") endif() diff --git a/quest/src/api/qureg.cpp b/quest/src/api/qureg.cpp index 98068f079..7d68528a1 100644 --- a/quest/src/api/qureg.cpp +++ b/quest/src/api/qureg.cpp @@ -154,7 +154,7 @@ Qureg validateAndCreateCustomQureg(int numQubits, int isDensMatr, int useDistrib Qureg qureg = qureg_populateNonHeapFields(numQubits, isDensMatr, useDistrib, useGpuAccel, useMultithread); // always allocate CPU memory - 
qureg.cpuAmps = cpu_allocArray(qureg.numAmpsPerNode); // nullptr if failed + qureg.cpuAmps = cpu_allocNumaArray(qureg.numAmpsPerNode); // nullptr if failed // conditionally allocate GPU memory and communication buffers (even if numNodes == 1). // note that in distributed settings but where useDistrib=false, each node will have a @@ -334,7 +334,7 @@ void destroyQureg(Qureg qureg) { validate_quregFields(qureg, __func__); // free CPU memory - cpu_deallocArray(qureg.cpuAmps); + cpu_deallocNumaArray(qureg.cpuAmps, qureg.numAmpsPerNode); // free CPU communication buffer if (qureg.isDistributed) diff --git a/quest/src/core/errors.cpp b/quest/src/core/errors.cpp index 2f44127c8..afcd8c014 100644 --- a/quest/src/core/errors.cpp +++ b/quest/src/core/errors.cpp @@ -94,6 +94,18 @@ void error_allocOfQuESTEnvFailed() { } +void error_gettingPageSizeFailed() { + + raiseInternalError("Failed to get the page size."); +} + + +void error_gettingNumaNodesFailed() { + + raiseInternalError("Failed to get the numa node count"); +} + + /* * MEMORY ERRORS diff --git a/quest/src/core/errors.hpp b/quest/src/core/errors.hpp index ce8f7e68c..88cffeb50 100644 --- a/quest/src/core/errors.hpp +++ b/quest/src/core/errors.hpp @@ -42,6 +42,9 @@ void error_validationListUniquenessCheckExceededMaskSize(); void error_allocOfQuESTEnvFailed(); +void error_gettingPageSizeFailed(); + +void error_gettingNumaNodesFailed(); /* diff --git a/quest/src/cpu/cpu_config.cpp b/quest/src/cpu/cpu_config.cpp index a818f5be4..385efa1a2 100644 --- a/quest/src/cpu/cpu_config.cpp +++ b/quest/src/cpu/cpu_config.cpp @@ -14,6 +14,7 @@ #include #include #include +#include using std::vector; @@ -34,6 +35,12 @@ using std::vector; #include #endif +#if NUMA_AWARE + #include + #include + #include + #include +#endif // NUMA_AWARE /* @@ -105,23 +112,69 @@ int cpu_getCurrentNumThreads() { * MEMORY ALLOCATION */ +#if NUMA_AWARE +unsigned long getPageSize() { + static unsigned long page_size = 0; + if (!page_size) { + page_size = 
sysconf(_SC_PAGESIZE); + if (page_size == ~0UL) { + error_gettingPageSizeFailed(); + } + } + return page_size; +} + +unsigned long getNumaNodes() { + static int n_nodes = 0; + if (!n_nodes) { + n_nodes = numa_num_configured_nodes(); + if (n_nodes < 1) { + error_gettingNumaNodesFailed(); + } + } + return n_nodes; +} +#endif qcomp* cpu_allocArray(qindex length) { + return (qcomp*) calloc(length, sizeof(qcomp)); +} - /// @todo - /// here, we calloc the entire array in a serial setting, rather than one malloc - /// followed by threads subsequently memset'ing their own partitions. The latter - /// approach would distribute the array pages across NUMA nodes, accelerating - /// their subsequent access by the same threads (via NUMA's first-touch policy). - /// We have so far foregone this optimisation since a thread's memory-access pattern - /// in many of the QuEST functions is non-trivial, and likely to be inconsistent - /// with the memset pattern. As such, I expect the benefit is totally occluded - /// and only introduces potential new bugs - but this should be tested and confirmed! 
- - // we call calloc over malloc in order to fail immediately if mem isn't available; - // caller must handle nullptr result - return (qcomp*) calloc(length, sizeof(qcomp)); +qcomp* cpu_allocNumaArray(qindex length) { +#if !NUMA_AWARE + return cpu_allocArray(length); +#else + unsigned long page_size = getPageSize(); + int n_nodes = getNumaNodes(); + + qindex size = length * sizeof(qcomp); + int pages = (size + page_size - 1) / page_size; + void *addr = mmap(NULL, pages * page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (n_nodes == 1) { + return reinterpret_cast(addr); + } + + // distribution strategy: floor_pages per node, distribute remain_pages as spread out as possible + int floor_pages = pages / n_nodes; + int spread_pages = pages % n_nodes; + + uintptr_t pos = (uintptr_t)addr; + for (int node = 0, shift = n_nodes; node < n_nodes; ++node) { + shift -= spread_pages; + int node_pages = floor_pages + (shift <= 0); + + unsigned long node_mask = 1UL << node; + mbind((void*)pos, node_pages * page_size, MPOL_BIND, &node_mask, sizeof(node_mask) * 8, 0); + + pos += node_pages * page_size; + if (shift <= 0) { + shift += n_nodes; + } + } + + return reinterpret_cast(addr); +#endif // NUMA_AWARE } @@ -132,6 +185,23 @@ void cpu_deallocArray(qcomp* arr) { } +void cpu_deallocNumaArray(qcomp* arr, qindex length) { + if (arr == nullptr) { + return; + } + +#if !NUMA_AWARE + return cpu_deallocArray(arr); +#else + unsigned long page_size = getPageSize(); + qindex size = length * sizeof(qcomp); + int pages = (size + page_size - 1) / page_size; + + munmap(arr, pages * page_size); +#endif // NUMA_AWARE +} + + qcomp** cpu_allocAndInitMatrixWrapper(qcomp* arr, qindex dim) { // do not allocate if arr alloc failed (caller will handle) diff --git a/quest/src/cpu/cpu_config.hpp b/quest/src/cpu/cpu_config.hpp index 982fc8a2f..ba55a785f 100644 --- a/quest/src/cpu/cpu_config.hpp +++ b/quest/src/cpu/cpu_config.hpp @@ -46,6 +46,9 @@ int cpu_getCurrentNumThreads(); 
qcomp* cpu_allocArray(qindex length); void cpu_deallocArray(qcomp* arr); +qcomp* cpu_allocNumaArray(qindex length); +void cpu_deallocNumaArray(qcomp* arr, qindex length); + qcomp** cpu_allocAndInitMatrixWrapper(qcomp* arr, qindex dim); void cpu_deallocMatrixWrapper(qcomp** wrapper); From 827cd09e62cada3849543870be9997bbc45bee35 Mon Sep 17 00:00:00 2001 From: Luc Jaulmes Date: Mon, 23 Jun 2025 14:24:41 +0100 Subject: [PATCH 04/16] Move util_distribute() out of headers --- quest/src/core/utilities.cpp | 15 +++++++++++++++ quest/src/core/utilities.hpp | 17 +---------------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/quest/src/core/utilities.cpp b/quest/src/core/utilities.cpp index 1c22d43d8..2ad1a1309 100644 --- a/quest/src/core/utilities.cpp +++ b/quest/src/core/utilities.cpp @@ -903,6 +903,21 @@ util_VectorIndexRange util_getLocalIndRangeOfVectorElemsWithinNode(int rank, qin } +std::pair +util_distribute(const qindex work, const qindex block, const int id, const int n) { + // ASSUME(work % block == 0); + const qindex blocks = work / block; + + qindex spread = blocks / n; + qindex extra = blocks % n; + + qindex prev_extra = (id * extra) / n; + qindex prev_shift = (id * extra) % n; + qindex here_extra = (prev_shift + extra) >= n; + + qindex pos = id * spread + prev_extra; + return std::make_pair(pos * block, (pos + spread + here_extra) * block); +} /* * GATE PARAMETERS diff --git a/quest/src/core/utilities.hpp b/quest/src/core/utilities.hpp index 3fd47c43a..36f051595 100644 --- a/quest/src/core/utilities.hpp +++ b/quest/src/core/utilities.hpp @@ -344,22 +344,7 @@ util_VectorIndexRange util_getLocalIndRangeOfVectorElemsWithinNode(int rank, qin // Generic function to split a workload fairly, with granularity >= block_size -static inline -std::pair -util_distribute(const qindex work, const qindex block, const int id, const int n) { - // ASSUME(work % block == 0); - const qindex blocks = work / block; - - qindex spread = blocks / n; - qindex extra = 
blocks % n; - - qindex prev_extra = (id * extra) / n; - qindex prev_shift = (id * extra) % n; - qindex here_extra = (prev_shift + extra) >= n; - - qindex pos = id * spread + prev_extra; - return std::make_pair(pos * block, (pos + spread + here_extra) * block); -} +std::pair util_distribute(const qindex work, const qindex block, const int id, const int n); /* From 7960aebaf38d9a2a32ddf9c1c146bae1bcf21f70 Mon Sep 17 00:00:00 2001 From: Luc Jaulmes Date: Sun, 29 Jun 2025 11:18:07 +0100 Subject: [PATCH 05/16] Make util_distribute() work on toy examples --- quest/src/core/utilities.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/quest/src/core/utilities.cpp b/quest/src/core/utilities.cpp index 2ad1a1309..9377be4ee 100644 --- a/quest/src/core/utilities.cpp +++ b/quest/src/core/utilities.cpp @@ -905,8 +905,9 @@ util_VectorIndexRange util_getLocalIndRangeOfVectorElemsWithinNode(int rank, qin std::pair util_distribute(const qindex work, const qindex block, const int id, const int n) { - // ASSUME(work % block == 0); const qindex blocks = work / block; + // really should not happen except when work < block as work is power of 2 and block should be too. + const qindex last = work % block; qindex spread = blocks / n; qindex extra = blocks % n; @@ -916,7 +917,8 @@ util_distribute(const qindex work, const qindex block, const int id, const int n qindex here_extra = (prev_shift + extra) >= n; qindex pos = id * spread + prev_extra; - return std::make_pair(pos * block, (pos + spread + here_extra) * block); + qindex end = pos + spread + here_extra; + return std::make_pair(pos * block, end * block + (id == n - 1 ? 
last : 0)); } /* From 98c50a4d962a717d311b05e9de3d35e9ca65b5d2 Mon Sep 17 00:00:00 2001 From: Luc Jaulmes Date: Sun, 29 Jun 2025 11:26:39 +0100 Subject: [PATCH 06/16] Clean up hardcoded 4k page size --- quest/src/core/memory.hpp | 6 ++++++ quest/src/cpu/cpu_config.cpp | 15 ++++++++++----- quest/src/cpu/cpu_config.hpp | 2 ++ quest/src/cpu/cpu_subroutines.cpp | 4 +--- 4 files changed, 19 insertions(+), 8 deletions(-) diff --git a/quest/src/core/memory.hpp b/quest/src/core/memory.hpp index 7b112b027..cd20d867b 100644 --- a/quest/src/core/memory.hpp +++ b/quest/src/core/memory.hpp @@ -21,6 +21,12 @@ +/* + * Not necessarily the actual page size, but a sensible default for page-sized things. + */ +#define FALLBACK_PAGE_SIZE 4096 + + /* * HARDWARE QUERYING */ diff --git a/quest/src/cpu/cpu_config.cpp b/quest/src/cpu/cpu_config.cpp index 385efa1a2..9aa640fe8 100644 --- a/quest/src/cpu/cpu_config.cpp +++ b/quest/src/cpu/cpu_config.cpp @@ -9,6 +9,7 @@ #include "quest/include/types.h" #include "quest/include/paulis.h" +#include "quest/src/core/memory.hpp" #include "quest/src/core/errors.hpp" #include @@ -112,8 +113,8 @@ int cpu_getCurrentNumThreads() { * MEMORY ALLOCATION */ +unsigned long cpu_getPageSize() { #if NUMA_AWARE -unsigned long getPageSize() { static unsigned long page_size = 0; if (!page_size) { page_size = sysconf(_SC_PAGESIZE); @@ -122,9 +123,13 @@ unsigned long getPageSize() { } } return page_size; +#else + return FALLBACK_PAGE_SIZE; +#endif } -unsigned long getNumaNodes() { +#if NUMA_AWARE +unsigned long cpu_getNumaNodes() { static int n_nodes = 0; if (!n_nodes) { n_nodes = numa_num_configured_nodes(); @@ -145,8 +150,8 @@ qcomp* cpu_allocNumaArray(qindex length) { #if !NUMA_AWARE return cpu_allocArray(length); #else - unsigned long page_size = getPageSize(); - int n_nodes = getNumaNodes(); + unsigned long page_size = cpu_getPageSize(); + int n_nodes = cpu_getNumaNodes(); qindex size = length * sizeof(qcomp); int pages = (size + page_size - 1) / page_size; @@ 
-193,7 +198,7 @@ void cpu_deallocNumaArray(qcomp* arr, qindex length) { #if !NUMA_AWARE return cpu_deallocArray(arr); #else - unsigned long page_size = getPageSize(); + unsigned long page_size = cpu_getPageSize(); qindex size = length * sizeof(qcomp); int pages = (size + page_size - 1) / page_size; diff --git a/quest/src/cpu/cpu_config.hpp b/quest/src/cpu/cpu_config.hpp index ba55a785f..0d8a4e416 100644 --- a/quest/src/cpu/cpu_config.hpp +++ b/quest/src/cpu/cpu_config.hpp @@ -65,6 +65,8 @@ PauliStr* cpu_allocPauliStrings(qindex numStrings); void cpu_deallocPauliStrings(PauliStr* strings); +unsigned long cpu_getPageSize(); + /* * MEMORY MOVEMENT diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp index 87e9e47a9..ad5266e08 100644 --- a/quest/src/cpu/cpu_subroutines.cpp +++ b/quest/src/cpu/cpu_subroutines.cpp @@ -2359,9 +2359,7 @@ void cpu_statevec_initUniformState_sub(Qureg qureg, qcomp amp) { // than a custom multithreaded loop #pragma omp parallel if(qureg.isMultithreaded) { - - // Distribute number of tasks and convert to indexes. 4kB page standard? - const auto [start, end] = util_distribute(qureg.numAmpsPerNode, 4096 / sizeof(qcomp), + const auto [start, end] = util_distribute(qureg.numAmpsPerNode, cpu_getPageSize() / sizeof(qcomp), cpu_getOpenmpThreadInd(), cpu_getCurrentNumThreads()); std::fill(qureg.cpuAmps + start, qureg.cpuAmps + end, amp); } From 99e2705dab6e56dd2f09d74504b5eefc75c51f33 Mon Sep 17 00:00:00 2001 From: Luc Jaulmes Date: Sun, 29 Jun 2025 11:39:21 +0100 Subject: [PATCH 07/16] Fix cmake to handle missing NUMA_FOUND variable In the case libnuma is not found, NUMA_FOUND is (can be?) not defined instead of being set to 0. 
--- CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a5a69e761..9e629de55 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -269,10 +269,11 @@ if (ENABLE_MULTITHREADING) else() include(FindPkgConfig) pkg_search_module(NUMA numa IMPORTED_TARGET GLOBAL) - compile_option(NUMA_AWARE ${NUMA_FOUND}) if (${NUMA_FOUND}) + compile_option(NUMA_AWARE ${NUMA_FOUND}) target_link_libraries(QuEST PRIVATE PkgConfig::NUMA) else() + compile_option(NUMA_AWARE 0) message(WARNING "libnuma not found, QuEST will not be aware of numa locality") endif() endif() From 9e9e2308948f9dff4f4e74fd0e1cae7d3646ae84 Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Thu, 3 Jul 2025 22:44:56 +0200 Subject: [PATCH 08/16] clarified util_distribute to util_getBlockMultipleSubRange --- quest/src/core/utilities.cpp | 46 ++++++++++++++++++++++--------- quest/src/core/utilities.hpp | 5 ++-- quest/src/cpu/cpu_subroutines.cpp | 15 +++++++--- 3 files changed, 47 insertions(+), 19 deletions(-) diff --git a/quest/src/core/utilities.cpp b/quest/src/core/utilities.cpp index 9377be4ee..7aa4a257f 100644 --- a/quest/src/core/utilities.cpp +++ b/quest/src/core/utilities.cpp @@ -5,6 +5,7 @@ * logic, matrix algebra, and channel parameters. * * @author Tyson Jones + * @author Luc Jaulmes (distributing ranges over blocks) */ #include "quest/include/types.h" @@ -25,6 +26,7 @@ #include #include +#include #include #include #include @@ -902,25 +904,43 @@ util_VectorIndexRange util_getLocalIndRangeOfVectorElemsWithinNode(int rank, qin return out; } +std::pair util_getBlockMultipleSubRange( + qindex rangeLen, qindex blockLen, int idSubRange, int numSubRanges +) { + // divides a range into whole blocks (and a single leftover sub-block) and + // attempts to uniformly distribute the blocks across the specified number of + // sub-ranges. When the blocks do not divide evenly between sub-ranges, the + // leftover blocks are spread apart across sub-ranges. 
When the range does not + // divide evenly into blocks, the overflow is given to the final sub-range. -std::pair -util_distribute(const qindex work, const qindex block, const int id, const int n) { - const qindex blocks = work / block; - // really should not happen except when work < block as work is power of 2 and block should be too. - const qindex last = work % block; + qindex numFullBlocks = rangeLen / blockLen; // floors + qindex subBlockLen = rangeLen % blockLen; - qindex spread = blocks / n; - qindex extra = blocks % n; + qindex baseNumBlocksPerSubRange = numFullBlocks / numSubRanges; + qindex numExtraBlocks = numFullBlocks % numSubRanges; - qindex prev_extra = (id * extra) / n; - qindex prev_shift = (id * extra) % n; - qindex here_extra = (prev_shift + extra) >= n; + // determine how many extra blocks this subrange should contain + qindex prevExtra = (idSubRange * numExtraBlocks) / numSubRanges; + qindex prevShift = (idSubRange * numExtraBlocks) % numSubRanges; + bool hereExtra = (prevShift + numExtraBlocks) >= numSubRanges; - qindex pos = id * spread + prev_extra; - qindex end = pos + spread + here_extra; - return std::make_pair(pos * block, end * block + (id == n - 1 ? 
last : 0)); + // allocate blocks to this sub-range + qindex startBlockInd = idSubRange * baseNumBlocksPerSubRange + prevExtra; + qindex endBlockInd = startBlockInd + baseNumBlocksPerSubRange + hereExtra; + + // find this sub-range indices within [0, rangeLen) + qindex startInd = startBlockInd * blockLen; + qindex endInd = endBlockInd * blockLen; // exclusive + + // arbitrarily allocate the leftover sub-block to the final sub-range + if (idSubRange == numSubRanges - 1) + endInd += subBlockLen; + + return std::make_pair(startInd, endInd); } + + /* * GATE PARAMETERS */ diff --git a/quest/src/core/utilities.hpp b/quest/src/core/utilities.hpp index 36f051595..56691c2f4 100644 --- a/quest/src/core/utilities.hpp +++ b/quest/src/core/utilities.hpp @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -342,9 +343,8 @@ bool util_areAnyVectorElemsWithinNode(int rank, qindex numElemsPerNode, qindex s util_VectorIndexRange util_getLocalIndRangeOfVectorElemsWithinNode(int rank, qindex numElemsPerNode, qindex elemStartInd, qindex numInds); +std::pair util_getBlockMultipleSubRange(qindex rangeLen, qindex blockLen, int idSubRange, int numSubRanges); -// Generic function to split a workload fairly, with granularity >= block_size -std::pair util_distribute(const qindex work, const qindex block, const int id, const int n); /* @@ -355,6 +355,7 @@ qreal util_getPhaseFromGateAngle(qreal angle); qcomp util_getPhaseFromGateAngle(qcomp angle); + /* * DECOHERENCE FACTORS */ diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp index ad5266e08..b267df03a 100644 --- a/quest/src/cpu/cpu_subroutines.cpp +++ b/quest/src/cpu/cpu_subroutines.cpp @@ -9,6 +9,7 @@ * * @author Tyson Jones * @author Oliver Brown (OpenMP 'if' clauses) + * @author Luc Jaulmes (optimised initUniformState) * @author Richard Meister (helped patch on LLVM) * @author Kshitij Chhabra (patched v3 clauses with gcc9) * @author Ania (Anna) Brown (developed QuEST v1 logic) @@ 
-2355,12 +2356,18 @@ INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, cpu_densmatr_multiQubitProjector void cpu_statevec_initUniformState_sub(Qureg qureg, qcomp amp) { - // faster on average (though perhaps not for large quregs) - // than a custom multithreaded loop + // approx-uniformly distribute modified memory pages across threads, + // in the hope that each std::fill() will touch only memory within + // the thread's corresponding NUMA node, for best performance + + int numAmpsPerPage = cpu_getPageSize() / sizeof(qcomp); // divides evenly + #pragma omp parallel if(qureg.isMultithreaded) { - const auto [start, end] = util_distribute(qureg.numAmpsPerNode, cpu_getPageSize() / sizeof(qcomp), - cpu_getOpenmpThreadInd(), cpu_getCurrentNumThreads()); + const auto [start, end] = util_getBlockMultipleSubRange( + qureg.numAmpsPerNode, numAmpsPerPage, + cpu_getOpenmpThreadInd(), cpu_getCurrentNumThreads()); + std::fill(qureg.cpuAmps + start, qureg.cpuAmps + end, amp); } } From 311b4d135055603634131d61ce221b71a527255a Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Thu, 3 Jul 2025 22:49:35 +0200 Subject: [PATCH 09/16] added Windows getPageSize --- quest/src/cpu/cpu_config.cpp | 65 +++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 26 deletions(-) diff --git a/quest/src/cpu/cpu_config.cpp b/quest/src/cpu/cpu_config.cpp index 9aa640fe8..778eae882 100644 --- a/quest/src/cpu/cpu_config.cpp +++ b/quest/src/cpu/cpu_config.cpp @@ -3,6 +3,7 @@ * configuration, and allocating and copying RAM data. * * @author Tyson Jones + * @author Luc Jaulmes (NUMA awareness) */ #include "quest/include/modes.h" @@ -32,16 +33,35 @@ using std::vector; #endif +/// @todo +/// Windows provides a NUMA API we could access in theory, although we +/// forego the hassle for now - who is running QuEST on big multi-core +/// Windows? 
This validation protects against enabling NUMA awareness +/// on Windows but silently recieving no benefit due to no NUMA API calls + +#if NUMA_AWARE && defined(_WIN32) + #error "NUMA awareness is not currently supported on non-POSIX systems like Windows." +#endif + + #if COMPILE_OPENMP #include #endif -#if NUMA_AWARE +#if NUMA_AWARE && ! defined(_WIN32) #include - #include #include #include -#endif // NUMA_AWARE +#endif + +#if defined(_WIN32) + #define NOMINMAX + #define WIN32_LEAN_AND_MEAN + #include +#else + #include +#endif + /* @@ -113,33 +133,26 @@ int cpu_getCurrentNumThreads() { * MEMORY ALLOCATION */ -unsigned long cpu_getPageSize() { -#if NUMA_AWARE - static unsigned long page_size = 0; - if (!page_size) { - page_size = sysconf(_SC_PAGESIZE); - if (page_size == ~0UL) { - error_gettingPageSizeFailed(); - } - } - return page_size; +long cpu_getPageSize() { + + // avoid repeated queries to this fixed value + static long pageSize = 0; + if (pageSize > 0) + return pageSize; + + // obtain pageSize for the first time +#if defined(_WIN32) + SYSTEM_INFO sysInfo; + GetSystemInfo(&sysInfo); + pageSize = sysInfo.dwPageSize; #else - return FALLBACK_PAGE_SIZE; + pageSize = sysconf(_SC_PAGESIZE); #endif -} -#if NUMA_AWARE -unsigned long cpu_getNumaNodes() { - static int n_nodes = 0; - if (!n_nodes) { - n_nodes = numa_num_configured_nodes(); - if (n_nodes < 1) { - error_gettingNumaNodesFailed(); - } - } - return n_nodes; + + return pageSize; } -#endif + qcomp* cpu_allocArray(qindex length) { return (qcomp*) calloc(length, sizeof(qcomp)); From 1ffd8041cb3bfaa6dfe5aac922c172ced3a3cbcb Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Thu, 3 Jul 2025 22:52:00 +0200 Subject: [PATCH 10/16] clarified NUMA alloc --- quest/src/core/memory.hpp | 6 -- quest/src/cpu/cpu_config.cpp | 118 ++++++++++++++++++++++++----------- 2 files changed, 81 insertions(+), 43 deletions(-) diff --git a/quest/src/core/memory.hpp b/quest/src/core/memory.hpp index cd20d867b..7b112b027 100644 --- 
a/quest/src/core/memory.hpp +++ b/quest/src/core/memory.hpp @@ -21,12 +21,6 @@ -/* - * Not necessarily the actual page size, but a sensible default for page-sized things. - */ -#define FALLBACK_PAGE_SIZE 4096 - - /* * HARDWARE QUERYING */ diff --git a/quest/src/cpu/cpu_config.cpp b/quest/src/cpu/cpu_config.cpp index 778eae882..a831869bd 100644 --- a/quest/src/cpu/cpu_config.cpp +++ b/quest/src/cpu/cpu_config.cpp @@ -133,6 +133,14 @@ int cpu_getCurrentNumThreads() { * MEMORY ALLOCATION */ + +qindex getNumPagesToContainArray(long pageLen, qindex arrLen) { + + // round up to the nearest page + return static_cast(std::ceil(arrLen / (qreal) pageLen)); +} + + long cpu_getPageSize() { // avoid repeated queries to this fixed value @@ -160,39 +168,65 @@ qcomp* cpu_allocArray(qindex length) { qcomp* cpu_allocNumaArray(qindex length) { -#if !NUMA_AWARE +#if ! NUMA_AWARE return cpu_allocArray(length); -#else - unsigned long page_size = cpu_getPageSize(); - int n_nodes = cpu_getNumaNodes(); - - qindex size = length * sizeof(qcomp); - int pages = (size + page_size - 1) / page_size; - void *addr = mmap(NULL, pages * page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (n_nodes == 1) { - return reinterpret_cast(addr); - } - - // distribution strategy: floor_pages per node, distribute remain_pages as spread out as possible - int floor_pages = pages / n_nodes; - int spread_pages = pages % n_nodes; - - uintptr_t pos = (uintptr_t)addr; - for (int node = 0, shift = n_nodes; node < n_nodes; ++node) { - shift -= spread_pages; - int node_pages = floor_pages + (shift <= 0); - unsigned long node_mask = 1UL << node; - mbind((void*)pos, node_pages * page_size, MPOL_BIND, &node_mask, sizeof(node_mask) * 8, 0); +#elif defined(_WIN32) + error_numaAllocOrDeallocAttemptedOnWindows(); - pos += node_pages * page_size; - if (shift <= 0) { - shift += n_nodes; - } +#else + // we will divide array's memory into pages + long pageSize = cpu_getPageSize(); + qindex arraySize = 
length * sizeof(qcomp); // gauranteed no overflow + + // if entire array fits within a single page, alloc like normal + if (arraySize <= pageSize) + return cpu_allocArray(length); + + // otherwise we will bind pages across NUMA nodes + static int numNodes = numa_num_configured_nodes(); + if (numNodes < 1) + error_gettingNumNumaNodesFailed(); + + qindex numPages = getNumPagesToContainArray(pageSize, arraySize); + qindex numBytes = numPages * pageSize; // prior validation gaurantees no overflow + + // allocate memory, potentially more than arraySize (depending on page divisibility) + void *rawAddr = mmap(NULL, numBytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + // if there is only a single NUMA node, then all memory access will occur within it + qcomp* outAddr = reinterpret_cast(rawAddr); + if (numNodes == 1) + return outAddr; + + // otherwise, we bind continguous pages to NUMA nodes, distributing the pages + // attemptedly uniformly and spreading remaining pages maximally apart + qindex baseNumPagesPerNode = numPages / numNodes; // floors + qindex remainingNumPagesTotal = numPages % numNodes; + + // use integer type for safe address arithmetic below + uintptr_t offsetAddr = reinterpret_cast(rawAddr); + + for (int node=0, shift=numNodes; node < numNodes; ++node) { + + // decide number of pages to bind to NUMA node + shift -= remainingNumPagesTotal; + qindex numPagesInNode = baseNumPagesPerNode + (shift <= 0); + qindex numBytesInNode = numPagesInNode * pageSize; // validation prevents overflow + + // bind those pages from the offset address to the node (identified by mask) + unsigned long nodeMask = 1UL << node; + void* nodeAddr = reinterpret_cast(offsetAddr); + long success = mbind(nodeAddr, numBytesInNode, MPOL_BIND, &nodeMask, numNodes, 0); + + // prepare next node's address + offsetAddr += numPagesInNode * pageSize; + if (shift <= 0) + shift += numNodes; } - return reinterpret_cast(addr); -#endif // NUMA_AWARE + return outAddr; +#endif } @@ 
-204,19 +238,29 @@ void cpu_deallocNumaArray(qcomp* arr, qindex length) { - if (arr == nullptr) { + + // mustn't pass nullptr to munmap() below + if (arr == nullptr) return; - } -#if !NUMA_AWARE - return cpu_deallocArray(arr); +#if ! NUMA_AWARE + cpu_deallocArray(arr); + +#elif defined(_WIN32) + error_numaAllocOrDeallocAttemptedOnWindows(); + #else - unsigned long page_size = cpu_getPageSize(); - qindex size = length * sizeof(qcomp); - int pages = (size + page_size - 1) / page_size; + qindex arrSize = length * sizeof(qcomp); + unsigned long pageSize = cpu_getPageSize(); - munmap(arr, pages * page_size); -#endif // NUMA_AWARE + // sub-page arrays were allocated with calloc() + if (arrSize <= pageSize) + return cpu_deallocArray(length); + + qindex numPages = getNumPagesToContainArray(pageSize, arraySize); + qindex numBytes = numPages * pageSize; // gauranteed no overflow + int success = munmap(arr, numBytes); +#endif } From 872336ed1a6f16241c266798eb9f4ec5bb53270d Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Thu, 3 Jul 2025 22:52:41 +0200 Subject: [PATCH 11/16] added error checking --- quest/src/core/errors.cpp | 38 +++++++++++++++++++++++++++++------- quest/src/core/errors.hpp | 17 ++++++++++++++-- quest/src/cpu/cpu_config.cpp | 26 ++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 9 deletions(-) diff --git a/quest/src/core/errors.cpp b/quest/src/core/errors.cpp index afcd8c014..112e94230 100644 --- a/quest/src/core/errors.cpp +++ b/quest/src/core/errors.cpp @@ -5,6 +5,7 @@ * deployment is consistent with the compiled deployment modes.
* * @author Tyson Jones + * @author Luc Jaulmes (NUMA & pagesize errors) */ #include "quest/include/types.h" @@ -94,26 +95,49 @@ void error_allocOfQuESTEnvFailed() { } + +/* + * MEMORY ERRORS + */ + +void error_memSizeQueriedButWouldOverflow() { + + raiseInternalError("Attempted to obtain memory necessary to allocate a distributed object's single-node partition but it overflowed size_t despite prior validation."); +} + void error_gettingPageSizeFailed() { raiseInternalError("Failed to get the page size."); } +void error_pageSizeNotAPowerOf2() { + + raiseInternalError("The discovered page size was not a power of 2. Get Dr Denning on the phone."); +} -void error_gettingNumaNodesFailed() { +void error_pageSizeNotAMultipleOfQcomp() { - raiseInternalError("Failed to get the numa node count"); + raiseInternalError("The page size was indivisible by the number of bytes in a qcomp."); } +void error_gettingNumNumaNodesFailed() { + raiseInternalError("Failed to get the NUMA node count"); +} -/* - * MEMORY ERRORS - */ +void error_numaAllocOrDeallocAttemptedOnWindows() { -void error_memSizeQueriedButWouldOverflow() { + raiseInternalError("NUMA-aware memory allocation or deallocation was attempted on Windows though this is not yet implemented, indicating a potential build issue."); +} - raiseInternalError("Attempted to obtain memory necessary to allocate a distributed object's single-node partition but it overflowed size_t despite prior validation."); +void error_numaBindingFailed() { + + raiseInternalError("The binding of memory pages to NUMA nodes (with mbind) unexpectedly failed, despite prior reservation (with mmap) succeeding."); +} + +void error_numaUnmappingFailed() { + + raiseInternalError("NUMA-aware memory deallocation unexpectedly failed."); } diff --git a/quest/src/core/errors.hpp b/quest/src/core/errors.hpp index 88cffeb50..a7f3615b5 100644 --- a/quest/src/core/errors.hpp +++ b/quest/src/core/errors.hpp @@ -5,6 +5,7 @@ * deployment is consistent with the compiled 
deployment modes. * * @author Tyson Jones + * @author Luc Jaulmes (NUMA & pagesize errors) */ #ifndef ERRORS_HPP @@ -42,9 +43,7 @@ void error_validationListUniquenessCheckExceededMaskSize(); void error_allocOfQuESTEnvFailed(); -void error_gettingPageSizeFailed(); -void error_gettingNumaNodesFailed(); /* @@ -53,6 +52,20 @@ void error_gettingNumaNodesFailed(); void error_memSizeQueriedButWouldOverflow(); +void error_gettingPageSizeFailed(); + +void error_pageSizeNotAPowerOf2(); + +void error_pageSizeNotAMultipleOfQcomp(); + +void error_gettingNumNumaNodesFailed(); + +void error_numaAllocOrDeallocAttemptedOnWindows(); + +void error_numaBindingFailed(); + +void error_numaUnmappingFailed(); + /* diff --git a/quest/src/cpu/cpu_config.cpp b/quest/src/cpu/cpu_config.cpp index a831869bd..3339f7772 100644 --- a/quest/src/cpu/cpu_config.cpp +++ b/quest/src/cpu/cpu_config.cpp @@ -12,6 +12,7 @@ #include "quest/src/core/memory.hpp" #include "quest/src/core/errors.hpp" +#include "quest/src/core/bitwise.hpp" #include #include @@ -157,6 +158,19 @@ long cpu_getPageSize() { pageSize = sysconf(_SC_PAGESIZE); #endif + // rigorously check the found pagesize is valid + // and consistent with preconditions assumed by + // callers, to avoid extremely funky bugs on + // esoteric future systems + + if (pageSize <= 0) + error_gettingPageSizeFailed(); + + if (!isPowerOf2(pageSize)) + error_pageSizeNotAPowerOf2(); + + if (pageSize % sizeof(qcomp) != 0) + error_pageSizeNotAMultipleOfQcomp(); return pageSize; } @@ -194,6 +208,10 @@ qcomp* cpu_allocNumaArray(qindex length) { // allocate memory, potentially more than arraySize (depending on page divisibility) void *rawAddr = mmap(NULL, numBytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + // indicate memory alloc failure to caller (no NUMA-specific validation error message) + if (rawAddr == MAP_FAILED) + return nullptr; + // if there is only a single NUMA node, then all memory access will occur within it qcomp* outAddr = 
reinterpret_cast(rawAddr); if (numNodes == 1) @@ -219,6 +237,11 @@ qcomp* cpu_allocNumaArray(qindex length) { void* nodeAddr = reinterpret_cast(offsetAddr); long success = mbind(nodeAddr, numBytesInNode, MPOL_BIND, &nodeMask, numNodes, 0); + // treat bind failure as internal error (even though it can result from insufficient kernel mem), + // rather than permitting silent fallback to non-NUMA awareness which might be astonishingly slow + if (success == -1) + error_numaBindingFailed(); + // prepare next node's address offsetAddr += numPagesInNode * pageSize; if (shift <= 0) @@ -260,6 +283,9 @@ void cpu_deallocNumaArray(qcomp* arr, qindex length) { qindex numPages = getNumPagesToContainArray(pageSize, arraySize); qindex numBytes = numPages * pageSize; // gauranteed no overflow int success = munmap(arr, numBytes); + + if (success == -1) + error_numaUnmappingFailed(); #endif } From 99583117f577b1806556d6d885cf68a5b1057c46 Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Thu, 3 Jul 2025 22:59:09 +0200 Subject: [PATCH 12/16] updating authorlists and speeding up Windows compilation --- CMakeLists.txt | 2 +- quest/src/core/errors.hpp | 1 - quest/src/core/memory.cpp | 1 + 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9e629de55..a05a70e44 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ # @author Oliver Thomson Brown # @author Erich Essmann (patches including MSVC support) # @author Tyson Jones (patches including clang multithreading) -# @author Luc Jaulmes (patching install) +# @author Luc Jaulmes (NUMA awareness, patching install) # # Contributions to previous builds from: # - Ania Brown diff --git a/quest/src/core/errors.hpp b/quest/src/core/errors.hpp index a7f3615b5..d5b2ddadf 100644 --- a/quest/src/core/errors.hpp +++ b/quest/src/core/errors.hpp @@ -45,7 +45,6 @@ void error_allocOfQuESTEnvFailed(); - /* * MEMORY ERRORS */ diff --git a/quest/src/core/memory.cpp b/quest/src/core/memory.cpp index 
79d4301a4..c8b81fc88 100644 --- a/quest/src/core/memory.cpp +++ b/quest/src/core/memory.cpp @@ -30,6 +30,7 @@ #include #elif defined(_WIN32) #define NOMINMAX + #define WIN32_LEAN_AND_MEAN #include #endif From eacba2db3f7135c3c91d388e4a81d628e2a28430 Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Thu, 3 Jul 2025 23:06:50 +0200 Subject: [PATCH 13/16] patching cpu_getPageSize() return type --- quest/src/cpu/cpu_config.cpp | 2 +- quest/src/cpu/cpu_config.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/quest/src/cpu/cpu_config.cpp b/quest/src/cpu/cpu_config.cpp index 3339f7772..5e93114a0 100644 --- a/quest/src/cpu/cpu_config.cpp +++ b/quest/src/cpu/cpu_config.cpp @@ -274,7 +274,7 @@ void cpu_deallocNumaArray(qcomp* arr, qindex length) { #else qindex arrSize = length * sizeof(qcomp); - unsigned long pageSize = cpu_getPageSize(); + long pageSize = cpu_getPageSize(); // sub-page arrays were allocated with calloc() if (arrSize <= pageSize) diff --git a/quest/src/cpu/cpu_config.hpp b/quest/src/cpu/cpu_config.hpp index 0d8a4e416..21d39e359 100644 --- a/quest/src/cpu/cpu_config.hpp +++ b/quest/src/cpu/cpu_config.hpp @@ -65,7 +65,7 @@ PauliStr* cpu_allocPauliStrings(qindex numStrings); void cpu_deallocPauliStrings(PauliStr* strings); -unsigned long cpu_getPageSize(); +long cpu_getPageSize(); /* From 76cffba574460198b76a9640c2251a8d20c2b05d Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Thu, 3 Jul 2025 23:42:34 +0200 Subject: [PATCH 14/16] embarrassing really --- quest/src/cpu/cpu_config.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/quest/src/cpu/cpu_config.cpp b/quest/src/cpu/cpu_config.cpp index 5e93114a0..bada05bca 100644 --- a/quest/src/cpu/cpu_config.cpp +++ b/quest/src/cpu/cpu_config.cpp @@ -278,9 +278,9 @@ void cpu_deallocNumaArray(qcomp* arr, qindex length) { // sub-page arrays were allocated with calloc() if (arrSize <= pageSize) - return cpu_deallocArray(length); + return cpu_deallocArray(arr); - qindex numPages 
= getNumPagesToContainArray(pageSize, arraySize); + qindex numPages = getNumPagesToContainArray(pageSize, arrSize); qindex numBytes = numPages * pageSize; // gauranteed no overflow int success = munmap(arr, numBytes); From 223447c37c15ce13a12a85131dacc3438a5cbf55 Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Sat, 5 Jul 2025 01:13:19 +0200 Subject: [PATCH 15/16] patched mbind args --- CMakeLists.txt | 1 + quest/src/cpu/cpu_config.cpp | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a05a70e44..e2e52d85e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -272,6 +272,7 @@ if (ENABLE_MULTITHREADING) if (${NUMA_FOUND}) compile_option(NUMA_AWARE ${NUMA_FOUND}) target_link_libraries(QuEST PRIVATE PkgConfig::NUMA) + message(STATUS "NUMA awareness is enabled.") else() compile_option(NUMA_AWARE 0) message(WARNING "libnuma not found, QuEST will not be aware of numa locality") diff --git a/quest/src/cpu/cpu_config.cpp b/quest/src/cpu/cpu_config.cpp index bada05bca..b24a34e9e 100644 --- a/quest/src/cpu/cpu_config.cpp +++ b/quest/src/cpu/cpu_config.cpp @@ -234,8 +234,9 @@ qcomp* cpu_allocNumaArray(qindex length) { // bind those pages from the offset address to the node (identified by mask) unsigned long nodeMask = 1UL << node; + unsigned long numBitsInMask = 8 * nodeMask; void* nodeAddr = reinterpret_cast(offsetAddr); - long success = mbind(nodeAddr, numBytesInNode, MPOL_BIND, &nodeMask, numNodes, 0); + long success = mbind(nodeAddr, numBytesInNode, MPOL_BIND, &nodeMask, numBitsInMask, 0); // treat bind failure as internal error (even though it can result from insufficient kernel mem), // rather than permitting silent fallback to non-NUMA awareness which might be astonishingly slow From 2227b0186fd64371bc3d3b2805d1a537d400a6be Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Sat, 5 Jul 2025 01:15:22 +0200 Subject: [PATCH 16/16] absolutely shameful --- quest/src/cpu/cpu_config.cpp | 2 +- 1 file changed, 1 insertion(+), 
1 deletion(-) diff --git a/quest/src/cpu/cpu_config.cpp b/quest/src/cpu/cpu_config.cpp index b24a34e9e..e488e6a9c 100644 --- a/quest/src/cpu/cpu_config.cpp +++ b/quest/src/cpu/cpu_config.cpp @@ -234,7 +234,7 @@ qcomp* cpu_allocNumaArray(qindex length) { // bind those pages from the offset address to the node (identified by mask) unsigned long nodeMask = 1UL << node; - unsigned long numBitsInMask = 8 * nodeMask; + unsigned long numBitsInMask = 8 * sizeof(nodeMask); void* nodeAddr = reinterpret_cast(offsetAddr); long success = mbind(nodeAddr, numBytesInNode, MPOL_BIND, &nodeMask, numBitsInMask, 0);