diff --git a/CMakeLists.txt b/CMakeLists.txt
index 933e23086..e2e52d85e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,7 @@
 # @author Oliver Thomson Brown
 # @author Erich Essmann (patches including MSVC support)
 # @author Tyson Jones (patches including clang multithreading)
-# @author Luc Jaulmes (patching install)
+# @author Luc Jaulmes (NUMA awareness, patching install)
 #
 # Contributions to previous builds from:
 # - Ania Brown
@@ -262,6 +262,23 @@ if (ENABLE_MULTITHREADING)
         OpenMP::OpenMP_C
     )
 
+    # Find NUMA - location of NUMA headers
+    if (WIN32)
+        compile_option(NUMA_AWARE 0)
+        message(WARNING "Building on Windows, QuEST will not be aware of NUMA locality")
+    else()
+        include(FindPkgConfig)
+        pkg_search_module(NUMA numa IMPORTED_TARGET GLOBAL)
+        if (${NUMA_FOUND})
+            compile_option(NUMA_AWARE ${NUMA_FOUND})
+            target_link_libraries(QuEST PRIVATE PkgConfig::NUMA)
+            message(STATUS "NUMA awareness is enabled.")
+        else()
+            compile_option(NUMA_AWARE 0)
+            message(WARNING "libnuma not found, QuEST will not be aware of NUMA locality")
+        endif()
+    endif()
+
     if (VERBOSE_LIB_NAME)
         string(CONCAT LIB_NAME ${LIB_NAME} "+mt")
     endif()
diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
index 6eef515c4..541491899 100644
--- a/quest/src/api/environment.cpp
+++ b/quest/src/api/environment.cpp
@@ -225,7 +225,7 @@ void printCpuInfo() {
         "cpu", {
             {"numCpuCores", printer_toStr(std::thread::hardware_concurrency()) + pm},
             {"numOmpProcs", (cpu_isOpenmpCompiled())? printer_toStr(cpu_getNumOpenmpProcessors()) + pm : na},
-            {"numOmpThrds", (cpu_isOpenmpCompiled())? printer_toStr(cpu_getCurrentNumThreads()) + pn : na},
+            {"numOmpThrds", (cpu_isOpenmpCompiled())? printer_toStr(cpu_getAvailableNumThreads()) + pn : na},
            {"cpuMemory", ram},
            {"cpuMemoryFree", un},
        });
@@ -494,7 +494,7 @@ void getEnvironmentString(char str[200]) {
 
     QuESTEnv env = getQuESTEnv();
-    int numThreads = cpu_isOpenmpCompiled()? cpu_getCurrentNumThreads() : 1;
+    int numThreads = cpu_isOpenmpCompiled()? cpu_getAvailableNumThreads() : 1;
     int cuQuantum = env.isGpuAccelerated && gpu_isCuQuantumCompiled();
     int gpuDirect = env.isGpuAccelerated && gpu_isDirectGpuCommPossible();
diff --git a/quest/src/api/qureg.cpp b/quest/src/api/qureg.cpp
index 98068f079..7d68528a1 100644
--- a/quest/src/api/qureg.cpp
+++ b/quest/src/api/qureg.cpp
@@ -154,7 +154,7 @@ Qureg validateAndCreateCustomQureg(int numQubits, int isDensMatr, int useDistrib
     Qureg qureg = qureg_populateNonHeapFields(numQubits, isDensMatr, useDistrib, useGpuAccel, useMultithread);
 
     // always allocate CPU memory
-    qureg.cpuAmps = cpu_allocArray(qureg.numAmpsPerNode); // nullptr if failed
+    qureg.cpuAmps = cpu_allocNumaArray(qureg.numAmpsPerNode); // nullptr if failed
 
     // conditionally allocate GPU memory and communication buffers (even if numNodes == 1).
     // note that in distributed settings but where useDistrib=false, each node will have a
@@ -334,7 +334,7 @@ void destroyQureg(Qureg qureg) {
     validate_quregFields(qureg, __func__);
 
     // free CPU memory
-    cpu_deallocArray(qureg.cpuAmps);
+    cpu_deallocNumaArray(qureg.cpuAmps, qureg.numAmpsPerNode);
 
     // free CPU communication buffer
     if (qureg.isDistributed)
diff --git a/quest/src/core/autodeployer.cpp b/quest/src/core/autodeployer.cpp
index 2b6645e42..27c412687 100644
--- a/quest/src/core/autodeployer.cpp
+++ b/quest/src/core/autodeployer.cpp
@@ -36,7 +36,7 @@ void autodep_chooseQuESTEnvDeployment(int &useDistrib, int &useGpuAccel, int &us
 
     // and we require more than 1 thread available at QuESTEnv creation
     if (useMultithread == modeflag::USE_AUTO)
-        useMultithread = (cpu_isOpenmpCompiled())? (cpu_getCurrentNumThreads() > 1) : 0;
+        useMultithread = (cpu_isOpenmpCompiled())? (cpu_getAvailableNumThreads() > 1) : 0;
 }
diff --git a/quest/src/core/errors.cpp b/quest/src/core/errors.cpp
index 2f44127c8..112e94230 100644
--- a/quest/src/core/errors.cpp
+++ b/quest/src/core/errors.cpp
@@ -5,6 +5,7 @@
  * deployment is consistent with the compiled deployment modes.
  *
  * @author Tyson Jones
+ * @author Luc Jaulmes (NUMA & pagesize errors)
  */
 
 #include "quest/include/types.h"
@@ -104,6 +105,41 @@ void error_memSizeQueriedButWouldOverflow() {
     raiseInternalError("Attempted to obtain memory necessary to allocate a distributed object's single-node partition but it overflowed size_t despite prior validation.");
 }
 
+void error_gettingPageSizeFailed() {
+
+    raiseInternalError("Failed to get the page size.");
+}
+
+void error_pageSizeNotAPowerOf2() {
+
+    raiseInternalError("The discovered page size was not a power of 2. Get Dr Denning on the phone.");
+}
+
+void error_pageSizeNotAMultipleOfQcomp() {
+
+    raiseInternalError("The page size was indivisible by the number of bytes in a qcomp.");
+}
+
+void error_gettingNumNumaNodesFailed() {
+
+    raiseInternalError("Failed to get the NUMA node count.");
+}
+
+void error_numaAllocOrDeallocAttemptedOnWindows() {
+
+    raiseInternalError("NUMA-aware memory allocation or deallocation was attempted on Windows though this is not yet implemented, indicating a potential build issue.");
+}
+
+void error_numaBindingFailed() {
+
+    raiseInternalError("The binding of memory pages to NUMA nodes (with mbind) unexpectedly failed, despite prior reservation (with mmap) succeeding.");
+}
+
+void error_numaUnmappingFailed() {
+
+    raiseInternalError("NUMA-aware memory deallocation unexpectedly failed.");
+}
+
 
 /*
diff --git a/quest/src/core/errors.hpp b/quest/src/core/errors.hpp
index ce8f7e68c..d5b2ddadf 100644
--- a/quest/src/core/errors.hpp
+++ b/quest/src/core/errors.hpp
@@ -5,6 +5,7 @@
  * deployment is consistent with the compiled deployment modes.
  *
  * @author Tyson Jones
+ * @author Luc Jaulmes (NUMA & pagesize errors)
  */
 
 #ifndef ERRORS_HPP
@@ -50,6 +51,20 @@ void error_allocOfQuESTEnvFailed();
 
 void error_memSizeQueriedButWouldOverflow();
 
+void error_gettingPageSizeFailed();
+
+void error_pageSizeNotAPowerOf2();
+
+void error_pageSizeNotAMultipleOfQcomp();
+
+void error_gettingNumNumaNodesFailed();
+
+void error_numaAllocOrDeallocAttemptedOnWindows();
+
+void error_numaBindingFailed();
+
+void error_numaUnmappingFailed();
+
 
 /*
diff --git a/quest/src/core/memory.cpp b/quest/src/core/memory.cpp
index 79d4301a4..c8b81fc88 100644
--- a/quest/src/core/memory.cpp
+++ b/quest/src/core/memory.cpp
@@ -30,6 +30,7 @@
     #include
 #elif defined(_WIN32)
     #define NOMINMAX
+    #define WIN32_LEAN_AND_MEAN
     #include
 #endif
diff --git a/quest/src/core/utilities.cpp b/quest/src/core/utilities.cpp
index 1c22d43d8..7aa4a257f 100644
--- a/quest/src/core/utilities.cpp
+++ b/quest/src/core/utilities.cpp
@@ -5,6 +5,7 @@
  * logic, matrix algebra, and channel parameters.
  *
  * @author Tyson Jones
+ * @author Luc Jaulmes (distributing ranges over blocks)
  */
 
 #include "quest/include/types.h"
@@ -25,6 +26,7 @@
 #include
 #include
+#include <utility>
 #include
 #include
 #include
@@ -902,6 +904,41 @@ util_VectorIndexRange util_getLocalIndRangeOfVectorElemsWithinNode(int rank, qin
     return out;
 }
 
+std::pair<qindex,qindex> util_getBlockMultipleSubRange(
+    qindex rangeLen, qindex blockLen, int idSubRange, int numSubRanges
+) {
+    // divides a range into whole blocks (and a single leftover sub-block) and
+    // attempts to uniformly distribute the blocks across the specified number of
+    // sub-ranges. When the blocks do not divide evenly between sub-ranges, the
+    // leftover blocks are spread apart across sub-ranges. When the range does not
+    // divide evenly into blocks, the overflow is given to the final sub-range.
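+    //
+    // e.g. (illustrative only) rangeLen=22, blockLen=4, numSubRanges=3 yields
+    // the sub-ranges [0,4), [4,12) and [12,22): sub-ranges 1 and 2 each receive
+    // an extra whole block, and the final sub-range absorbs the 2-element
+    // leftover sub-block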
+
+    qindex numFullBlocks = rangeLen / blockLen; // floors
+    qindex subBlockLen = rangeLen % blockLen;
+
+    qindex baseNumBlocksPerSubRange = numFullBlocks / numSubRanges;
+    qindex numExtraBlocks = numFullBlocks % numSubRanges;
+
+    // determine how many extra blocks this subrange should contain
+    qindex prevExtra = (idSubRange * numExtraBlocks) / numSubRanges;
+    qindex prevShift = (idSubRange * numExtraBlocks) % numSubRanges;
+    bool hereExtra = (prevShift + numExtraBlocks) >= numSubRanges;
+
+    // allocate blocks to this sub-range
+    qindex startBlockInd = idSubRange * baseNumBlocksPerSubRange + prevExtra;
+    qindex endBlockInd = startBlockInd + baseNumBlocksPerSubRange + hereExtra;
+
+    // find this sub-range's indices within [0, rangeLen)
+    qindex startInd = startBlockInd * blockLen;
+    qindex endInd = endBlockInd * blockLen; // exclusive
+
+    // arbitrarily allocate the leftover sub-block to the final sub-range
+    if (idSubRange == numSubRanges - 1)
+        endInd += subBlockLen;
+
+    return std::make_pair(startInd, endInd);
+}
+
 
 /*
diff --git a/quest/src/core/utilities.hpp b/quest/src/core/utilities.hpp
index 6d741312b..56691c2f4 100644
--- a/quest/src/core/utilities.hpp
+++ b/quest/src/core/utilities.hpp
@@ -21,6 +21,7 @@
 #include
 #include
+#include <utility>
 #include
 #include
 #include
@@ -342,6 +343,8 @@ bool util_areAnyVectorElemsWithinNode(int rank, qindex numElemsPerNode, qindex s
 
 util_VectorIndexRange util_getLocalIndRangeOfVectorElemsWithinNode(int rank, qindex numElemsPerNode, qindex elemStartInd, qindex numInds);
 
+std::pair<qindex,qindex> util_getBlockMultipleSubRange(qindex rangeLen, qindex blockLen, int idSubRange, int numSubRanges);
+
 
 /*
@@ -352,6 +355,7 @@ qreal util_getPhaseFromGateAngle(qreal angle);
 
 qcomp util_getPhaseFromGateAngle(qcomp angle);
 
+
 /*
  * DECOHERENCE FACTORS
  */
diff --git a/quest/src/cpu/cpu_config.cpp b/quest/src/cpu/cpu_config.cpp
index f27471d38..e488e6a9c 100644
--- a/quest/src/cpu/cpu_config.cpp
+++ b/quest/src/cpu/cpu_config.cpp
@@ -3,17 +3,21 @@
  * configuration, and allocating and copying RAM data.
  *
  * @author Tyson Jones
+ * @author Luc Jaulmes (NUMA awareness)
  */
 
 #include "quest/include/modes.h"
 #include "quest/include/types.h"
 #include "quest/include/paulis.h"
 
+#include "quest/src/core/memory.hpp"
 #include "quest/src/core/errors.hpp"
+#include "quest/src/core/bitwise.hpp"
 
 #include
 #include
 #include
+#include
 
 using std::vector;
@@ -30,10 +34,35 @@ using std::vector;
 #endif
 
 
+/// @todo
+/// Windows provides a NUMA API we could access in theory, although we
+/// forego the hassle for now - who is running QuEST on big multi-core
+/// Windows? This validation protects against enabling NUMA awareness
+/// on Windows but silently receiving no benefit due to no NUMA API calls
+
+#if NUMA_AWARE && defined(_WIN32)
+    #error "NUMA awareness is not currently supported on non-POSIX systems like Windows."
+#endif
+
+
 #if COMPILE_OPENMP
     #include <omp.h>
 #endif
 
+#if NUMA_AWARE && !defined(_WIN32)
+    #include <numa.h>
+    #include <numaif.h>
+    #include <sys/mman.h>
+#endif
+
+#if defined(_WIN32)
+    #define NOMINMAX
+    #define WIN32_LEAN_AND_MEAN
+    #include <windows.h>
+#else
+    #include <unistd.h>
+#endif
+
 
 /*
@@ -46,11 +75,12 @@ bool cpu_isOpenmpCompiled() {
 }
 
 
-int cpu_getCurrentNumThreads() {
+int cpu_getAvailableNumThreads() {
 #if COMPILE_OPENMP
     int n = -1;
     #pragma omp parallel shared(n)
+    #pragma omp single
     n = omp_get_num_threads();
 
     return n;
@@ -90,31 +120,140 @@ int cpu_getOpenmpThreadInd() {
 }
 
 
+int cpu_getCurrentNumThreads() {
+#if COMPILE_OPENMP
+    return omp_get_num_threads();
+#else
+    return 1;
+#endif
+}
+
+
 /*
  * MEMORY ALLOCATION
  */
 
 
-qcomp* cpu_allocArray(qindex length) {
+qindex getNumPagesToContainArray(long pageLen, qindex arrLen) {
 
-    /// @todo
-    /// here, we calloc the entire array in a serial setting, rather than one malloc
-    /// followed by threads subsequently memset'ing their own partitions. The latter
-    /// approach would distribute the array pages across NUMA nodes, accelerating
-    /// their subsequent access by the same threads (via NUMA's first-touch policy).
-    /// We have so far foregone this optimisation since a thread's memory-access pattern
-    /// in many of the QuEST functions is non-trivial, and likely to be inconsistent
-    /// with the memset pattern. As such, I expect the benefit is totally occluded
-    /// and only introduces potential new bugs - but this should be tested and confirmed!
-
-    // we call calloc over malloc in order to fail immediately if mem isn't available;
-    // caller must handle nullptr result
+    // round up to the nearest page
+    return static_cast<qindex>(std::ceil(arrLen / (qreal) pageLen));
+}
+
+
+long cpu_getPageSize() {
+
+    // avoid repeated queries to this fixed value
+    static long pageSize = 0;
+    if (pageSize > 0)
+        return pageSize;
+
+    // obtain pageSize for the first time
+#if defined(_WIN32)
+    SYSTEM_INFO sysInfo;
+    GetSystemInfo(&sysInfo);
+    pageSize = sysInfo.dwPageSize;
+#else
+    pageSize = sysconf(_SC_PAGESIZE);
+#endif
+
+    // rigorously check the found pagesize is valid
+    // and consistent with preconditions assumed by
+    // callers, to avoid extremely funky bugs on
+    // esoteric future systems
+
+    if (pageSize <= 0)
+        error_gettingPageSizeFailed();
+
+    if (!isPowerOf2(pageSize))
+        error_pageSizeNotAPowerOf2();
+    if (pageSize % sizeof(qcomp) != 0)
+        error_pageSizeNotAMultipleOfQcomp();
+
+    return pageSize;
+}
+
+
+qcomp* cpu_allocArray(qindex length) {
 
     return (qcomp*) calloc(length, sizeof(qcomp));
 }
 
 
+qcomp* cpu_allocNumaArray(qindex length) {
+#if !NUMA_AWARE
+    return cpu_allocArray(length);
+
+#elif defined(_WIN32)
+    error_numaAllocOrDeallocAttemptedOnWindows();
+
+#else
+    // we will divide array's memory into pages
+    long pageSize = cpu_getPageSize();
+    qindex arraySize = length * sizeof(qcomp); // guaranteed no overflow
+
+    // if entire array fits within a single page, alloc like normal
+    if (arraySize <= pageSize)
+        return cpu_allocArray(length);
+
+    // otherwise we will bind pages across NUMA nodes
+    static int numNodes = numa_num_configured_nodes();
+    if (numNodes < 1)
+        error_gettingNumNumaNodesFailed();
+
+    qindex numPages = getNumPagesToContainArray(pageSize, arraySize);
+    qindex numBytes = numPages * pageSize; // prior validation guarantees no overflow
+
+    // allocate memory, potentially more than arraySize (depending on page divisibility)
+    void *rawAddr = mmap(NULL, numBytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+    // indicate memory alloc failure to caller (no NUMA-specific validation error message)
+    if (rawAddr == MAP_FAILED)
+        return nullptr;
+
+    // if there is only a single NUMA node, then all memory access will occur within it
+    qcomp* outAddr = reinterpret_cast<qcomp*>(rawAddr);
+    if (numNodes == 1)
+        return outAddr;
+
+    // otherwise, we bind contiguous pages to NUMA nodes, distributing the pages
+    // as uniformly as possible and spreading any remaining pages maximally apart
+    qindex baseNumPagesPerNode = numPages / numNodes; // floors
+    qindex remainingNumPagesTotal = numPages % numNodes;
+
+    // use integer type for safe address arithmetic below
+    uintptr_t offsetAddr = reinterpret_cast<uintptr_t>(rawAddr);
+
+    for (int node=0, shift=numNodes; node < numNodes; ++node) {
+
+        // decide number of pages to bind to NUMA node
+        shift -= remainingNumPagesTotal;
+        qindex numPagesInNode = baseNumPagesPerNode + (shift <= 0);
+        qindex numBytesInNode = numPagesInNode * pageSize; // validation prevents overflow
+
+        // bind those pages from the offset address to the node (identified by mask)
+        unsigned long nodeMask = 1UL << node;
+        unsigned long numBitsInMask = 8 * sizeof(nodeMask);
+        void* nodeAddr = reinterpret_cast<void*>(offsetAddr);
+        long success = mbind(nodeAddr, numBytesInNode, MPOL_BIND, &nodeMask, numBitsInMask, 0);
+
+        // treat bind failure as internal error (even though it can result from insufficient kernel mem),
+        // rather than permitting silent fallback to non-NUMA awareness which might be astonishingly slow
+        if (success == -1)
+            error_numaBindingFailed();
+
+        // prepare next node's address
+        offsetAddr += numPagesInNode * pageSize;
+        if (shift <= 0)
+            shift += numNodes;
+    }
+
+    return outAddr;
+#endif
+}
+
+
 void cpu_deallocArray(qcomp* arr) {
 
     // arr can safely be nullptr
@@ -122,6 +261,36 @@ void cpu_deallocArray(qcomp* arr) {
 }
 
 
+void cpu_deallocNumaArray(qcomp* arr, qindex length) {
+
+    // mustn't pass nullptr to munmap() below
+    if (arr == nullptr)
+        return;
+
+#if !NUMA_AWARE
+    cpu_deallocArray(arr);
+
+#elif defined(_WIN32)
+    error_numaAllocOrDeallocAttemptedOnWindows();
+
+#else
+    qindex arrSize = length * sizeof(qcomp);
+    long pageSize = cpu_getPageSize();
+
+    // sub-page arrays were allocated with calloc()
+    if (arrSize <= pageSize)
+        return cpu_deallocArray(arr);
+
+    qindex numPages = getNumPagesToContainArray(pageSize, arrSize);
+    qindex numBytes = numPages * pageSize; // guaranteed no overflow
+    int success = munmap(arr, numBytes);
+
+    if (success == -1)
+        error_numaUnmappingFailed();
+#endif
+}
+
+
 qcomp** cpu_allocAndInitMatrixWrapper(qcomp* arr, qindex dim) {
 
     // do not allocate if arr alloc failed (caller will handle)
diff --git a/quest/src/cpu/cpu_config.hpp b/quest/src/cpu/cpu_config.hpp
index 48f54b44f..21d39e359 100644
--- a/quest/src/cpu/cpu_config.hpp
+++ b/quest/src/cpu/cpu_config.hpp
@@ -23,7 +23,7 @@ using std::vector;
 
 bool cpu_isOpenmpCompiled();
 
-int cpu_getCurrentNumThreads();
+int cpu_getAvailableNumThreads();
 
 int cpu_getNumOpenmpProcessors();
 
@@ -35,6 +35,8 @@ int cpu_getNumOpenmpProcessors();
 
 int cpu_getOpenmpThreadInd();
 
+int cpu_getCurrentNumThreads();
+
 
 /*
@@ -44,6 +46,9 @@ int cpu_getOpenmpThreadInd();
 qcomp* cpu_allocArray(qindex length);
 void cpu_deallocArray(qcomp* arr);
 
+qcomp* cpu_allocNumaArray(qindex length);
+void cpu_deallocNumaArray(qcomp* arr, qindex length);
+
 qcomp** cpu_allocAndInitMatrixWrapper(qcomp* arr, qindex dim);
 void cpu_deallocMatrixWrapper(qcomp** wrapper);
 
@@ -60,6 +65,8 @@ PauliStr* cpu_allocPauliStrings(qindex numStrings);
 
 void cpu_deallocPauliStrings(PauliStr* strings);
 
+long cpu_getPageSize();
+
 
 /*
  * MEMORY MOVEMENT
diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp
index 9c90e08c1..b267df03a 100644
--- a/quest/src/cpu/cpu_subroutines.cpp
+++ b/quest/src/cpu/cpu_subroutines.cpp
@@ -9,6 +9,7 @@
  *
  * @author Tyson Jones
  * @author Oliver Brown (OpenMP 'if' clauses)
+ * @author Luc Jaulmes (optimised initUniformState)
 * @author Richard Meister (helped patch on LLVM)
 * @author Kshitij Chhabra (patched v3 clauses with gcc9)
 * @author Ania (Anna) Brown (developed QuEST v1 logic)
@@ -873,7 +874,7 @@ void cpu_statevector_anyCtrlPauliTensorOrGadget_subA(
     // whenever each thread has at least 1 iteration for itself. And of course
     // we serialise both inner and outer loops when qureg multithreading is off.
-    if (!qureg.isMultithreaded || numOuterIts >= cpu_getCurrentNumThreads()) {
+    if (!qureg.isMultithreaded || numOuterIts >= cpu_getAvailableNumThreads()) {
 
         // parallel
         #pragma omp parallel for if(qureg.isMultithreaded)
@@ -2355,9 +2356,20 @@ INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, cpu_densmatr_multiQubitProjector
 
 void cpu_statevec_initUniformState_sub(Qureg qureg, qcomp amp) {
 
-    // faster on average (though perhaps not for large quregs)
-    // than a custom multithreaded loop
-    std::fill(qureg.cpuAmps, qureg.cpuAmps + qureg.numAmpsPerNode, amp);
+    // approx-uniformly distribute modified memory pages across threads,
+    // in the hope that each std::fill() will touch only memory within
+    // the thread's corresponding NUMA node, for best performance
+
+    int numAmpsPerPage = cpu_getPageSize() / sizeof(qcomp); // divides evenly
+
+    #pragma omp parallel if(qureg.isMultithreaded)
+    {
+        const auto [start, end] = util_getBlockMultipleSubRange(
+            qureg.numAmpsPerNode, numAmpsPerPage,
+            cpu_getOpenmpThreadInd(), cpu_getCurrentNumThreads());
+
+        std::fill(qureg.cpuAmps + start, qureg.cpuAmps + end, amp);
+    }
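
Below is a minimal, self-contained sketch (illustrative only, not part of the patch) of the per-thread, page-aligned fill pattern adopted in the final hunk above. The helper splitIntoBlockMultiples and the type aliases are hypothetical stand-ins for util_getBlockMultipleSubRange and QuEST's qcomp/qindex, and it uses a plain even split of whole blocks rather than the spread-apart distribution implemented by the patch.

// illustrative sketch only: per-thread, page-aligned std::fill (compile with -fopenmp)
#include <omp.h>
#include <unistd.h>
#include <algorithm>
#include <complex>
#include <utility>
#include <vector>

using qcomp  = std::complex<double>;   // stand-in for QuEST's qcomp
using qindex = long long;              // stand-in for QuEST's qindex

// simplified stand-in for util_getBlockMultipleSubRange: splits a range into
// whole blocks evenly, giving any leftover sub-block to the final sub-range
std::pair<qindex,qindex> splitIntoBlockMultiples(qindex rangeLen, qindex blockLen, int id, int numIds) {
    qindex numBlocks = rangeLen / blockLen;
    qindex start = (numBlocks *  id      / numIds) * blockLen;
    qindex end   = (numBlocks * (id + 1) / numIds) * blockLen;
    if (id == numIds - 1)
        end = rangeLen;
    return {start, end};
}

int main() {
    qindex numAmps = qindex(1) << 24;
    std::vector<qcomp> amps(numAmps);
    qindex ampsPerPage = sysconf(_SC_PAGESIZE) / sizeof(qcomp);

    // each thread fills only its own page-aligned sub-range, so its writes
    // stay within the pages placed on (or bound to) its own NUMA node
    #pragma omp parallel
    {
        auto [start, end] = splitIntoBlockMultiples(
            numAmps, ampsPerPage, omp_get_thread_num(), omp_get_num_threads());
        std::fill(amps.begin() + start, amps.begin() + end, qcomp(1, 0));
    }
}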