diff --git a/CMakeLists.txt b/CMakeLists.txt
index 933e23086..9e629de55 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -262,6 +262,22 @@ if (ENABLE_MULTITHREADING)
         OpenMP::OpenMP_C
     )
 
+    # Find NUMA - location of NUMA headers
+    if (WIN32)
+        compile_option(NUMA_AWARE 0)
+        message(WARNING "Building on Windows, QuEST will not be aware of numa locality")
+    else()
+        include(FindPkgConfig)
+        pkg_search_module(NUMA numa IMPORTED_TARGET GLOBAL)
+        if (${NUMA_FOUND})
+            compile_option(NUMA_AWARE ${NUMA_FOUND})
+            target_link_libraries(QuEST PRIVATE PkgConfig::NUMA)
+        else()
+            compile_option(NUMA_AWARE 0)
+            message(WARNING "libnuma not found, QuEST will not be aware of numa locality")
+        endif()
+    endif()
+
     if (VERBOSE_LIB_NAME)
         string(CONCAT LIB_NAME ${LIB_NAME} "+mt")
     endif()
diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
index 6eef515c4..541491899 100644
--- a/quest/src/api/environment.cpp
+++ b/quest/src/api/environment.cpp
@@ -225,7 +225,7 @@ void printCpuInfo() {
         "cpu", {
             {"numCpuCores", printer_toStr(std::thread::hardware_concurrency()) + pm},
             {"numOmpProcs", (cpu_isOpenmpCompiled())? printer_toStr(cpu_getNumOpenmpProcessors()) + pm : na},
-            {"numOmpThrds", (cpu_isOpenmpCompiled())? printer_toStr(cpu_getCurrentNumThreads()) + pn : na},
+            {"numOmpThrds", (cpu_isOpenmpCompiled())? printer_toStr(cpu_getAvailableNumThreads()) + pn : na},
             {"cpuMemory", ram},
             {"cpuMemoryFree", un},
         });
@@ -494,7 +494,7 @@ void getEnvironmentString(char str[200]) {
 
     QuESTEnv env = getQuESTEnv();
 
-    int numThreads = cpu_isOpenmpCompiled()? cpu_getCurrentNumThreads() : 1;
+    int numThreads = cpu_isOpenmpCompiled()? cpu_getAvailableNumThreads() : 1;
     int cuQuantum = env.isGpuAccelerated && gpu_isCuQuantumCompiled();
     int gpuDirect = env.isGpuAccelerated && gpu_isDirectGpuCommPossible();
 
diff --git a/quest/src/api/qureg.cpp b/quest/src/api/qureg.cpp
index 98068f079..7d68528a1 100644
--- a/quest/src/api/qureg.cpp
+++ b/quest/src/api/qureg.cpp
@@ -154,7 +154,7 @@ Qureg validateAndCreateCustomQureg(int numQubits, int isDensMatr, int useDistrib
     Qureg qureg = qureg_populateNonHeapFields(numQubits, isDensMatr, useDistrib, useGpuAccel, useMultithread);
 
     // always allocate CPU memory
-    qureg.cpuAmps = cpu_allocArray(qureg.numAmpsPerNode); // nullptr if failed
+    qureg.cpuAmps = cpu_allocNumaArray(qureg.numAmpsPerNode); // nullptr if failed
 
     // conditionally allocate GPU memory and communication buffers (even if numNodes == 1).
     // note that in distributed settings but where useDistrib=false, each node will have a
@@ -334,7 +334,7 @@ void destroyQureg(Qureg qureg) {
     validate_quregFields(qureg, __func__);
 
     // free CPU memory
-    cpu_deallocArray(qureg.cpuAmps);
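+    // (the length is needed so the NUMA-aware path can recompute the mmap'd size)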
+    cpu_deallocNumaArray(qureg.cpuAmps, qureg.numAmpsPerNode);
 
     // free CPU communication buffer
     if (qureg.isDistributed)
diff --git a/quest/src/core/autodeployer.cpp b/quest/src/core/autodeployer.cpp
index 2b6645e42..27c412687 100644
--- a/quest/src/core/autodeployer.cpp
+++ b/quest/src/core/autodeployer.cpp
@@ -36,7 +36,7 @@ void autodep_chooseQuESTEnvDeployment(int &useDistrib, int &useGpuAccel, int &us
 
     // and we require more than 1 thread available at QuESTEnv creation
     if (useMultithread == modeflag::USE_AUTO)
-        useMultithread = (cpu_isOpenmpCompiled())? (cpu_getCurrentNumThreads() > 1) : 0;
+        useMultithread = (cpu_isOpenmpCompiled())? (cpu_getAvailableNumThreads() > 1) : 0;
 }
 
 
diff --git a/quest/src/core/errors.cpp b/quest/src/core/errors.cpp
index 2f44127c8..afcd8c014 100644
--- a/quest/src/core/errors.cpp
+++ b/quest/src/core/errors.cpp
@@ -94,6 +94,18 @@ void error_allocOfQuESTEnvFailed() {
 }
 
 
 
+void error_gettingPageSizeFailed() {
+
+    raiseInternalError("Failed to get the page size.");
+}
+
+
+void error_gettingNumaNodesFailed() {
+
+    raiseInternalError("Failed to get the NUMA node count.");
+}
+
+
 /*
  * MEMORY ERRORS
diff --git a/quest/src/core/errors.hpp b/quest/src/core/errors.hpp
index ce8f7e68c..88cffeb50 100644
--- a/quest/src/core/errors.hpp
+++ b/quest/src/core/errors.hpp
@@ -42,6 +42,9 @@ void error_validationListUniquenessCheckExceededMaskSize();
 
 void error_allocOfQuESTEnvFailed();
 
+void error_gettingPageSizeFailed();
+
+void error_gettingNumaNodesFailed();
 
 
 /*
diff --git a/quest/src/core/memory.hpp b/quest/src/core/memory.hpp
index 7b112b027..cd20d867b 100644
--- a/quest/src/core/memory.hpp
+++ b/quest/src/core/memory.hpp
@@ -21,6 +21,12 @@
 
 
 
+/*
+ * Not necessarily the actual page size, but a sensible default for page-sized allocations.
+ */
+#define FALLBACK_PAGE_SIZE 4096
+
+
 /*
  * HARDWARE QUERYING
  */
diff --git a/quest/src/core/utilities.cpp b/quest/src/core/utilities.cpp
index 1c22d43d8..9377be4ee 100644
--- a/quest/src/core/utilities.cpp
+++ b/quest/src/core/utilities.cpp
@@ -903,6 +903,23 @@ util_VectorIndexRange util_getLocalIndRangeOfVectorElemsWithinNode(int rank, qin
 }
 
 
+std::pair<qindex, qindex>
+util_distribute(const qindex work, const qindex block, const int id, const int n) {
+    const qindex blocks = work / block;
+    // the sub-block remainder; non-zero only when work < block, since work is a power of 2 and block should be too
+    const qindex last = work % block;
+
+    qindex spread = blocks / n;
+    qindex extra = blocks % n;
+
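+    // each of the n workers receives 'spread' whole blocks plus at most one of the
+    // 'extra' leftover blocks; prev_extra counts the leftovers given to workers 0..id-1,
+    // and here_extra is 1 exactly when this worker takes one, spacing the extras evenly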
last : 0)); +} /* * GATE PARAMETERS diff --git a/quest/src/core/utilities.hpp b/quest/src/core/utilities.hpp index 6d741312b..36f051595 100644 --- a/quest/src/core/utilities.hpp +++ b/quest/src/core/utilities.hpp @@ -343,6 +343,9 @@ bool util_areAnyVectorElemsWithinNode(int rank, qindex numElemsPerNode, qindex s util_VectorIndexRange util_getLocalIndRangeOfVectorElemsWithinNode(int rank, qindex numElemsPerNode, qindex elemStartInd, qindex numInds); +// Generic function to split a workload fairly, with granularity >= block_size +std::pair util_distribute(const qindex work, const qindex block, const int id, const int n); + /* * GATE PARAMETERS diff --git a/quest/src/cpu/cpu_config.cpp b/quest/src/cpu/cpu_config.cpp index f27471d38..9aa640fe8 100644 --- a/quest/src/cpu/cpu_config.cpp +++ b/quest/src/cpu/cpu_config.cpp @@ -9,11 +9,13 @@ #include "quest/include/types.h" #include "quest/include/paulis.h" +#include "quest/src/core/memory.hpp" #include "quest/src/core/errors.hpp" #include #include #include +#include using std::vector; @@ -34,6 +36,12 @@ using std::vector; #include #endif +#if NUMA_AWARE + #include + #include + #include + #include +#endif // NUMA_AWARE /* @@ -46,11 +54,12 @@ bool cpu_isOpenmpCompiled() { } -int cpu_getCurrentNumThreads() { +int cpu_getAvailableNumThreads() { #if COMPILE_OPENMP int n = -1; #pragma omp parallel shared(n) + #pragma omp single n = omp_get_num_threads(); return n; @@ -90,28 +99,87 @@ int cpu_getOpenmpThreadInd() { } +int cpu_getCurrentNumThreads() { +#if COMPILE_OPENMP + return omp_get_num_threads(); +#else + return 1; +#endif +} + + /* * MEMORY ALLOCATION */ +unsigned long cpu_getPageSize() { +#if NUMA_AWARE + static unsigned long page_size = 0; + if (!page_size) { + page_size = sysconf(_SC_PAGESIZE); + if (page_size == ~0UL) { + error_gettingPageSizeFailed(); + } + } + return page_size; +#else + return FALLBACK_PAGE_SIZE; +#endif +} + +#if NUMA_AWARE +unsigned long cpu_getNumaNodes() { + static int n_nodes = 0; + if (!n_nodes) { + n_nodes = numa_num_configured_nodes(); + if (n_nodes < 1) { + error_gettingNumaNodesFailed(); + } + } + return n_nodes; +} +#endif qcomp* cpu_allocArray(qindex length) { + return (qcomp*) calloc(length, sizeof(qcomp)); +} - /// @todo - /// here, we calloc the entire array in a serial setting, rather than one malloc - /// followed by threads subsequently memset'ing their own partitions. The latter - /// approach would distribute the array pages across NUMA nodes, accelerating - /// their subsequent access by the same threads (via NUMA's first-touch policy). - /// We have so far foregone this optimisation since a thread's memory-access pattern - /// in many of the QuEST functions is non-trivial, and likely to be inconsistent - /// with the memset pattern. As such, I expect the benefit is totally occluded - /// and only introduces potential new bugs - but this should be tested and confirmed! 
+    for (int node = 0, shift = n_nodes; node < n_nodes; ++node) {
+        shift -= spread_pages;
+        int node_pages = floor_pages + (shift <= 0);
+
+        unsigned long node_mask = 1UL << node;
+        mbind((void*)pos, node_pages * page_size, MPOL_BIND, &node_mask, sizeof(node_mask) * 8, 0);
+
+        pos += node_pages * page_size;
+        if (shift <= 0) {
+            shift += n_nodes;
+        }
+    }
+
+    return reinterpret_cast<qcomp*>(addr);
+#endif // NUMA_AWARE
 }
@@ -122,6 +190,23 @@ void cpu_deallocArray(qcomp* arr) {
 }
 
 
+void cpu_deallocNumaArray(qcomp* arr, qindex length) {
+    if (arr == nullptr) {
+        return;
+    }
+
+#if !NUMA_AWARE
+    return cpu_deallocArray(arr);
+#else
+    unsigned long page_size = cpu_getPageSize();
+    qindex size = length * sizeof(qcomp);
+    int pages = (size + page_size - 1) / page_size;
+
+    munmap(arr, pages * page_size);
+#endif // NUMA_AWARE
+}
+
+
 qcomp** cpu_allocAndInitMatrixWrapper(qcomp* arr, qindex dim) {
 
     // do not allocate if arr alloc failed (caller will handle)
diff --git a/quest/src/cpu/cpu_config.hpp b/quest/src/cpu/cpu_config.hpp
index 48f54b44f..0d8a4e416 100644
--- a/quest/src/cpu/cpu_config.hpp
+++ b/quest/src/cpu/cpu_config.hpp
@@ -23,7 +23,7 @@ using std::vector;
 
 bool cpu_isOpenmpCompiled();
 
-int cpu_getCurrentNumThreads();
+int cpu_getAvailableNumThreads();
 
 int cpu_getNumOpenmpProcessors();
 
@@ -35,6 +35,8 @@ int cpu_getNumOpenmpProcessors();
 
 int cpu_getOpenmpThreadInd();
 
+int cpu_getCurrentNumThreads();
+
 
 
 /*
@@ -44,6 +46,9 @@ int cpu_getOpenmpThreadInd();
 qcomp* cpu_allocArray(qindex length);
 void cpu_deallocArray(qcomp* arr);
 
+qcomp* cpu_allocNumaArray(qindex length);
+void cpu_deallocNumaArray(qcomp* arr, qindex length);
+
 qcomp** cpu_allocAndInitMatrixWrapper(qcomp* arr, qindex dim);
 void cpu_deallocMatrixWrapper(qcomp** wrapper);
 
@@ -60,6 +65,8 @@ PauliStr* cpu_allocPauliStrings(qindex numStrings);
 
 void cpu_deallocPauliStrings(PauliStr* strings);
 
+unsigned long cpu_getPageSize();
+
 
 
 /*
  * MEMORY MOVEMENT
diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp
index 9c90e08c1..ad5266e08 100644
--- a/quest/src/cpu/cpu_subroutines.cpp
+++ b/quest/src/cpu/cpu_subroutines.cpp
@@ -873,7 +873,7 @@ void cpu_statevector_anyCtrlPauliTensorOrGadget_subA(
     // whenever each thread has at least 1 iteration for itself. And of course
     // we serialise both inner and outer loops when qureg multithreading is off.
 
-    if (!qureg.isMultithreaded || numOuterIts >= cpu_getCurrentNumThreads()) {
+    if (!qureg.isMultithreaded || numOuterIts >= cpu_getAvailableNumThreads()) {
 
         // parallel
         #pragma omp parallel for if(qureg.isMultithreaded)
@@ -2357,7 +2357,12 @@ void cpu_statevec_initUniformState_sub(Qureg qureg, qcomp amp) {
 
     // faster on average (though perhaps not for large quregs)
    // than a custom multithreaded loop
-    std::fill(qureg.cpuAmps, qureg.cpuAmps + qureg.numAmpsPerNode, amp);
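+    // split the fill into page-sized chunks so that each page is initialised by a single
+    // thread, matching the page-granular placement used by cpu_allocNumaArray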
+    #pragma omp parallel if(qureg.isMultithreaded)
+    {
+        const auto [start, end] = util_distribute(qureg.numAmpsPerNode, cpu_getPageSize() / sizeof(qcomp),
+                                                  cpu_getOpenmpThreadInd(), cpu_getCurrentNumThreads());
+        std::fill(qureg.cpuAmps + start, qureg.cpuAmps + end, amp);
+    }
 }