diff --git a/CMakeLists.txt b/CMakeLists.txt
index 933e23086..9e629de55 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -262,6 +262,22 @@ if (ENABLE_MULTITHREADING)
         OpenMP::OpenMP_C
     )
 
+    # Find NUMA - location of NUMA headers
+    if (WIN32)
+        compile_option(NUMA_AWARE 0)
+        message(WARNING "Building on Windows, QuEST will not be aware of numa locality")
+    else()
+        include(FindPkgConfig)
+        pkg_search_module(NUMA numa IMPORTED_TARGET GLOBAL)
+        if (${NUMA_FOUND})
+            compile_option(NUMA_AWARE ${NUMA_FOUND})
+            target_link_libraries(QuEST PRIVATE PkgConfig::NUMA)
+        else()
+            compile_option(NUMA_AWARE 0)
+            message(WARNING "libnuma not found, QuEST will not be aware of numa locality")
+        endif()
+    endif()
+
     if (VERBOSE_LIB_NAME)
         string(CONCAT LIB_NAME ${LIB_NAME} "+mt")
     endif()
diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
index 6eef515c4..541491899 100644
--- a/quest/src/api/environment.cpp
+++ b/quest/src/api/environment.cpp
@@ -225,7 +225,7 @@ void printCpuInfo() {
         "cpu", {
             {"numCpuCores", printer_toStr(std::thread::hardware_concurrency()) + pm},
             {"numOmpProcs", (cpu_isOpenmpCompiled())? printer_toStr(cpu_getNumOpenmpProcessors()) + pm : na},
-            {"numOmpThrds", (cpu_isOpenmpCompiled())? printer_toStr(cpu_getCurrentNumThreads()) + pn : na},
+            {"numOmpThrds", (cpu_isOpenmpCompiled())? printer_toStr(cpu_getAvailableNumThreads()) + pn : na},
             {"cpuMemory", ram},
             {"cpuMemoryFree", un},
         });
@@ -494,7 +494,7 @@ void getEnvironmentString(char str[200]) {
 
     QuESTEnv env = getQuESTEnv();
 
-    int numThreads = cpu_isOpenmpCompiled()? cpu_getCurrentNumThreads() : 1;
+    int numThreads = cpu_isOpenmpCompiled()? cpu_getAvailableNumThreads() : 1;
     int cuQuantum = env.isGpuAccelerated && gpu_isCuQuantumCompiled();
     int gpuDirect = env.isGpuAccelerated && gpu_isDirectGpuCommPossible();
 
diff --git a/quest/src/api/qureg.cpp b/quest/src/api/qureg.cpp
index 98068f079..7d68528a1 100644
--- a/quest/src/api/qureg.cpp
+++ b/quest/src/api/qureg.cpp
@@ -154,7 +154,7 @@ Qureg validateAndCreateCustomQureg(int numQubits, int isDensMatr, int useDistrib
     Qureg qureg = qureg_populateNonHeapFields(numQubits, isDensMatr, useDistrib, useGpuAccel, useMultithread);
 
     // always allocate CPU memory
-    qureg.cpuAmps = cpu_allocArray(qureg.numAmpsPerNode); // nullptr if failed
+    qureg.cpuAmps = cpu_allocNumaArray(qureg.numAmpsPerNode); // nullptr if failed
 
     // conditionally allocate GPU memory and communication buffers (even if numNodes == 1).
     // note that in distributed settings but where useDistrib=false, each node will have a
@@ -334,7 +334,7 @@ void destroyQureg(Qureg qureg) {
     validate_quregFields(qureg, __func__);
 
     // free CPU memory
-    cpu_deallocArray(qureg.cpuAmps);
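+    // (the length is needed so the NUMA-aware path can recompute the mmap'd size)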
+    cpu_deallocNumaArray(qureg.cpuAmps, qureg.numAmpsPerNode);
 
     // free CPU communication buffer
     if (qureg.isDistributed)
diff --git a/quest/src/core/autodeployer.cpp b/quest/src/core/autodeployer.cpp
index 2b6645e42..27c412687 100644
--- a/quest/src/core/autodeployer.cpp
+++ b/quest/src/core/autodeployer.cpp
@@ -36,7 +36,7 @@ void autodep_chooseQuESTEnvDeployment(int &useDistrib, int &useGpuAccel, int &us
 
     // and we require more than 1 thread available at QuESTEnv creation
     if (useMultithread == modeflag::USE_AUTO)
-        useMultithread = (cpu_isOpenmpCompiled())? (cpu_getCurrentNumThreads() > 1) : 0;
+        useMultithread = (cpu_isOpenmpCompiled())? (cpu_getAvailableNumThreads() > 1) : 0;
 }
 
 
diff --git a/quest/src/core/errors.cpp b/quest/src/core/errors.cpp
index 2f44127c8..afcd8c014 100644
--- a/quest/src/core/errors.cpp
+++ b/quest/src/core/errors.cpp
@@ -94,6 +94,18 @@ void error_allocOfQuESTEnvFailed() {
 }
 
 
 
+void error_gettingPageSizeFailed() {
+
+    raiseInternalError("Failed to get the page size.");
+}
+
+
+void error_gettingNumaNodesFailed() {
+
+    raiseInternalError("Failed to get the NUMA node count.");
+}
+
+
 /*
  * MEMORY ERRORS
diff --git a/quest/src/core/errors.hpp b/quest/src/core/errors.hpp
index ce8f7e68c..88cffeb50 100644
--- a/quest/src/core/errors.hpp
+++ b/quest/src/core/errors.hpp
@@ -42,6 +42,9 @@ void error_validationListUniquenessCheckExceededMaskSize();
 
 void error_allocOfQuESTEnvFailed();
 
+void error_gettingPageSizeFailed();
+
+void error_gettingNumaNodesFailed();
 
 
 /*
diff --git a/quest/src/core/memory.hpp b/quest/src/core/memory.hpp
index 7b112b027..cd20d867b 100644
--- a/quest/src/core/memory.hpp
+++ b/quest/src/core/memory.hpp
@@ -21,6 +21,12 @@
 
 
 
+/*
+ * Not necessarily the actual page size, but a sensible default for page-sized allocations.
+ */
+#define FALLBACK_PAGE_SIZE 4096
+
+
 /*
  * HARDWARE QUERYING
  */
diff --git a/quest/src/core/utilities.cpp b/quest/src/core/utilities.cpp
index 1c22d43d8..9377be4ee 100644
--- a/quest/src/core/utilities.cpp
+++ b/quest/src/core/utilities.cpp
@@ -903,6 +903,23 @@ util_VectorIndexRange util_getLocalIndRangeOfVectorElemsWithinNode(int rank, qin
 }
 
 
+std::pair<qindex, qindex>
+util_distribute(const qindex work, const qindex block, const int id, const int n) {
+    const qindex blocks = work / block;
+    // the sub-block remainder; non-zero only when work < block, since work is a power of 2 and block should be too
+    const qindex last = work % block;
+
+    qindex spread = blocks / n;
+    qindex extra = blocks % n;
+
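+    // each of the n workers receives 'spread' whole blocks plus at most one of the
+    // 'extra' leftover blocks; prev_extra counts the leftovers given to workers 0..id-1,
+    // and here_extra is 1 exactly when this worker takes one, spacing the extras evenly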
last : 0)); +} /* * GATE PARAMETERS diff --git a/quest/src/core/utilities.hpp b/quest/src/core/utilities.hpp index 6d741312b..36f051595 100644 --- a/quest/src/core/utilities.hpp +++ b/quest/src/core/utilities.hpp @@ -343,6 +343,9 @@ bool util_areAnyVectorElemsWithinNode(int rank, qindex numElemsPerNode, qindex s util_VectorIndexRange util_getLocalIndRangeOfVectorElemsWithinNode(int rank, qindex numElemsPerNode, qindex elemStartInd, qindex numInds); +// Generic function to split a workload fairly, with granularity >= block_size +std::pair util_distribute(const qindex work, const qindex block, const int id, const int n); + /* * GATE PARAMETERS diff --git a/quest/src/cpu/cpu_config.cpp b/quest/src/cpu/cpu_config.cpp index f27471d38..9aa640fe8 100644 --- a/quest/src/cpu/cpu_config.cpp +++ b/quest/src/cpu/cpu_config.cpp @@ -9,11 +9,13 @@ #include "quest/include/types.h" #include "quest/include/paulis.h" +#include "quest/src/core/memory.hpp" #include "quest/src/core/errors.hpp" #include #include #include +#include using std::vector; @@ -34,6 +36,12 @@ using std::vector; #include #endif +#if NUMA_AWARE + #include + #include + #include + #include +#endif // NUMA_AWARE /* @@ -46,11 +54,12 @@ bool cpu_isOpenmpCompiled() { } -int cpu_getCurrentNumThreads() { +int cpu_getAvailableNumThreads() { #if COMPILE_OPENMP int n = -1; #pragma omp parallel shared(n) + #pragma omp single n = omp_get_num_threads(); return n; @@ -90,28 +99,87 @@ int cpu_getOpenmpThreadInd() { } +int cpu_getCurrentNumThreads() { +#if COMPILE_OPENMP + return omp_get_num_threads(); +#else + return 1; +#endif +} + + /* * MEMORY ALLOCATION */ +unsigned long cpu_getPageSize() { +#if NUMA_AWARE + static unsigned long page_size = 0; + if (!page_size) { + page_size = sysconf(_SC_PAGESIZE); + if (page_size == ~0UL) { + error_gettingPageSizeFailed(); + } + } + return page_size; +#else + return FALLBACK_PAGE_SIZE; +#endif +} + +#if NUMA_AWARE +unsigned long cpu_getNumaNodes() { + static int n_nodes = 0; + if (!n_nodes) { + n_nodes = numa_num_configured_nodes(); + if (n_nodes < 1) { + error_gettingNumaNodesFailed(); + } + } + return n_nodes; +} +#endif qcomp* cpu_allocArray(qindex length) { + return (qcomp*) calloc(length, sizeof(qcomp)); +} - /// @todo - /// here, we calloc the entire array in a serial setting, rather than one malloc - /// followed by threads subsequently memset'ing their own partitions. The latter - /// approach would distribute the array pages across NUMA nodes, accelerating - /// their subsequent access by the same threads (via NUMA's first-touch policy). - /// We have so far foregone this optimisation since a thread's memory-access pattern - /// in many of the QuEST functions is non-trivial, and likely to be inconsistent - /// with the memset pattern. As such, I expect the benefit is totally occluded - /// and only introduces potential new bugs - but this should be tested and confirmed! 
+    for (int node = 0, shift = n_nodes; node < n_nodes; ++node) {
+        shift -= spread_pages;
+        int node_pages = floor_pages + (shift <= 0);
+
+        unsigned long node_mask = 1UL << node;
+        mbind((void*)pos, node_pages * page_size, MPOL_BIND, &node_mask, sizeof(node_mask) * 8, 0);
+
+        pos += node_pages * page_size;
+        if (shift <= 0) {
+            shift += n_nodes;
+        }
+    }
+
+    return reinterpret_cast<qcomp*>(addr);
+#endif // NUMA_AWARE
 }
@@ -122,6 +190,23 @@ void cpu_deallocArray(qcomp* arr) {
 }
 
 
+void cpu_deallocNumaArray(qcomp* arr, qindex length) {
+    if (arr == nullptr) {
+        return;
+    }
+
+#if !NUMA_AWARE
+    return cpu_deallocArray(arr);
+#else
+    unsigned long page_size = cpu_getPageSize();
+    qindex size = length * sizeof(qcomp);
+    int pages = (size + page_size - 1) / page_size;
+
+    munmap(arr, pages * page_size);
+#endif // NUMA_AWARE
+}
+
+
 qcomp** cpu_allocAndInitMatrixWrapper(qcomp* arr, qindex dim) {
 
     // do not allocate if arr alloc failed (caller will handle)
diff --git a/quest/src/cpu/cpu_config.hpp b/quest/src/cpu/cpu_config.hpp
index 48f54b44f..0d8a4e416 100644
--- a/quest/src/cpu/cpu_config.hpp
+++ b/quest/src/cpu/cpu_config.hpp
@@ -23,7 +23,7 @@ using std::vector;
 
 bool cpu_isOpenmpCompiled();
 
-int cpu_getCurrentNumThreads();
+int cpu_getAvailableNumThreads();
 
 int cpu_getNumOpenmpProcessors();
 
@@ -35,6 +35,8 @@ int cpu_getNumOpenmpProcessors();
 
 int cpu_getOpenmpThreadInd();
 
+int cpu_getCurrentNumThreads();
+
 
 
 /*
@@ -44,6 +46,9 @@ int cpu_getOpenmpThreadInd();
 qcomp* cpu_allocArray(qindex length);
 void cpu_deallocArray(qcomp* arr);
 
+qcomp* cpu_allocNumaArray(qindex length);
+void cpu_deallocNumaArray(qcomp* arr, qindex length);
+
 qcomp** cpu_allocAndInitMatrixWrapper(qcomp* arr, qindex dim);
 void cpu_deallocMatrixWrapper(qcomp** wrapper);
 
@@ -60,6 +65,8 @@ PauliStr* cpu_allocPauliStrings(qindex numStrings);
 
 void cpu_deallocPauliStrings(PauliStr* strings);
 
+unsigned long cpu_getPageSize();
+
 
 
 /*
  * MEMORY MOVEMENT
diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp
index 9c90e08c1..ad5266e08 100644
--- a/quest/src/cpu/cpu_subroutines.cpp
+++ b/quest/src/cpu/cpu_subroutines.cpp
@@ -873,7 +873,7 @@ void cpu_statevector_anyCtrlPauliTensorOrGadget_subA(
     // whenever each thread has at least 1 iteration for itself. And of course
     // we serialise both inner and outer loops when qureg multithreading is off.
 
-    if (!qureg.isMultithreaded || numOuterIts >= cpu_getCurrentNumThreads()) {
+    if (!qureg.isMultithreaded || numOuterIts >= cpu_getAvailableNumThreads()) {
 
         // parallel
         #pragma omp parallel for if(qureg.isMultithreaded)
@@ -2357,7 +2357,12 @@ void cpu_statevec_initUniformState_sub(Qureg qureg, qcomp amp) {
 
     // faster on average (though perhaps not for large quregs)
    // than a custom multithreaded loop
-    std::fill(qureg.cpuAmps, qureg.cpuAmps + qureg.numAmpsPerNode, amp);
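+    // split the fill into page-sized chunks so that each page is initialised by a single
+    // thread, matching the page-granular placement used by cpu_allocNumaArray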
+    #pragma omp parallel if(qureg.isMultithreaded)
+    {
+        const auto [start, end] = util_distribute(qureg.numAmpsPerNode, cpu_getPageSize() / sizeof(qcomp),
+                                                  cpu_getOpenmpThreadInd(), cpu_getCurrentNumThreads());
+        std::fill(qureg.cpuAmps + start, qureg.cpuAmps + end, amp);
+    }
 }