Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,22 @@ if (ENABLE_MULTITHREADING)
OpenMP::OpenMP_C
)

# Find NUMA - location of NUMA headers
if (WIN32)
compile_option(NUMA_AWARE 0)
message(WARNING "Building on Windows, QuEST will not be aware of numa locality")
else()
include(FindPkgConfig)
pkg_search_module(NUMA numa IMPORTED_TARGET GLOBAL)
if (${NUMA_FOUND})
compile_option(NUMA_AWARE ${NUMA_FOUND})
target_link_libraries(QuEST PRIVATE PkgConfig::NUMA)
else()
compile_option(NUMA_AWARE 0)
message(WARNING "libnuma not found, QuEST will not be aware of numa locality")
endif()
endif()

if (VERBOSE_LIB_NAME)
string(CONCAT LIB_NAME ${LIB_NAME} "+mt")
endif()
Expand Down
4 changes: 2 additions & 2 deletions quest/src/api/environment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ void printCpuInfo() {
"cpu", {
{"numCpuCores", printer_toStr(std::thread::hardware_concurrency()) + pm},
{"numOmpProcs", (cpu_isOpenmpCompiled())? printer_toStr(cpu_getNumOpenmpProcessors()) + pm : na},
{"numOmpThrds", (cpu_isOpenmpCompiled())? printer_toStr(cpu_getCurrentNumThreads()) + pn : na},
{"numOmpThrds", (cpu_isOpenmpCompiled())? printer_toStr(cpu_getAvailableNumThreads()) + pn : na},
{"cpuMemory", ram},
{"cpuMemoryFree", un},
});
Expand Down Expand Up @@ -494,7 +494,7 @@ void getEnvironmentString(char str[200]) {

QuESTEnv env = getQuESTEnv();

int numThreads = cpu_isOpenmpCompiled()? cpu_getCurrentNumThreads() : 1;
int numThreads = cpu_isOpenmpCompiled()? cpu_getAvailableNumThreads() : 1;
int cuQuantum = env.isGpuAccelerated && gpu_isCuQuantumCompiled();
int gpuDirect = env.isGpuAccelerated && gpu_isDirectGpuCommPossible();

Expand Down
4 changes: 2 additions & 2 deletions quest/src/api/qureg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ Qureg validateAndCreateCustomQureg(int numQubits, int isDensMatr, int useDistrib
Qureg qureg = qureg_populateNonHeapFields(numQubits, isDensMatr, useDistrib, useGpuAccel, useMultithread);

// always allocate CPU memory
qureg.cpuAmps = cpu_allocArray(qureg.numAmpsPerNode); // nullptr if failed
qureg.cpuAmps = cpu_allocNumaArray(qureg.numAmpsPerNode); // nullptr if failed

// conditionally allocate GPU memory and communication buffers (even if numNodes == 1).
// note that in distributed settings but where useDistrib=false, each node will have a
Expand Down Expand Up @@ -334,7 +334,7 @@ void destroyQureg(Qureg qureg) {
validate_quregFields(qureg, __func__);

// free CPU memory
cpu_deallocArray(qureg.cpuAmps);
cpu_deallocNumaArray(qureg.cpuAmps, qureg.numAmpsPerNode);

// free CPU communication buffer
if (qureg.isDistributed)
Expand Down
2 changes: 1 addition & 1 deletion quest/src/core/autodeployer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ void autodep_chooseQuESTEnvDeployment(int &useDistrib, int &useGpuAccel, int &us

// and we require more than 1 thread available at QuESTEnv creation
if (useMultithread == modeflag::USE_AUTO)
useMultithread = (cpu_isOpenmpCompiled())? (cpu_getCurrentNumThreads() > 1) : 0;
useMultithread = (cpu_isOpenmpCompiled())? (cpu_getAvailableNumThreads() > 1) : 0;
}


Expand Down
12 changes: 12 additions & 0 deletions quest/src/core/errors.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,18 @@ void error_allocOfQuESTEnvFailed() {
}


void error_gettingPageSizeFailed() {

raiseInternalError("Failed to get the page size.");
}


void error_gettingNumaNodesFailed() {

raiseInternalError("Failed to get the numa node count");
}



/*
* MEMORY ERRORS
Expand Down
3 changes: 3 additions & 0 deletions quest/src/core/errors.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ void error_validationListUniquenessCheckExceededMaskSize();

void error_allocOfQuESTEnvFailed();

void error_gettingPageSizeFailed();

void error_gettingNumaNodesFailed();


/*
Expand Down
6 changes: 6 additions & 0 deletions quest/src/core/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,12 @@



/*
* Not necessarily the actual page size, but a sensible default for page-sized things.
*/
#define FALLBACK_PAGE_SIZE 4096


/*
* HARDWARE QUERYING
*/
Expand Down
17 changes: 17 additions & 0 deletions quest/src/core/utilities.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -903,6 +903,23 @@ util_VectorIndexRange util_getLocalIndRangeOfVectorElemsWithinNode(int rank, qin
}


std::pair<qindex, qindex>
util_distribute(const qindex work, const qindex block, const int id, const int n) {
const qindex blocks = work / block;
// really should not happen except when work < block as work is power of 2 and block should be too.
const qindex last = work % block;

qindex spread = blocks / n;
qindex extra = blocks % n;

qindex prev_extra = (id * extra) / n;
qindex prev_shift = (id * extra) % n;
qindex here_extra = (prev_shift + extra) >= n;

qindex pos = id * spread + prev_extra;
qindex end = pos + spread + here_extra;
return std::make_pair(pos * block, end * block + (id == n - 1 ? last : 0));
}

/*
* GATE PARAMETERS
Expand Down
3 changes: 3 additions & 0 deletions quest/src/core/utilities.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,9 @@ bool util_areAnyVectorElemsWithinNode(int rank, qindex numElemsPerNode, qindex s
util_VectorIndexRange util_getLocalIndRangeOfVectorElemsWithinNode(int rank, qindex numElemsPerNode, qindex elemStartInd, qindex numInds);


// Generic function to split a workload fairly, with granularity >= block_size
std::pair<qindex, qindex> util_distribute(const qindex work, const qindex block, const int id, const int n);


/*
* GATE PARAMETERS
Expand Down
113 changes: 99 additions & 14 deletions quest/src/cpu/cpu_config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,13 @@
#include "quest/include/types.h"
#include "quest/include/paulis.h"

#include "quest/src/core/memory.hpp"
#include "quest/src/core/errors.hpp"

#include <vector>
#include <cstring>
#include <cstdlib>
#include <cstdint>

using std::vector;

Expand All @@ -34,6 +36,12 @@ using std::vector;
#include <omp.h>
#endif

#if NUMA_AWARE
#include <sys/mman.h>
#include <unistd.h>
#include <numaif.h>
#include <numa.h>
#endif // NUMA_AWARE


/*
Expand All @@ -46,11 +54,12 @@ bool cpu_isOpenmpCompiled() {
}


int cpu_getCurrentNumThreads() {
int cpu_getAvailableNumThreads() {
#if COMPILE_OPENMP
int n = -1;

#pragma omp parallel shared(n)
#pragma omp single
n = omp_get_num_threads();

return n;
Expand Down Expand Up @@ -90,28 +99,87 @@ int cpu_getOpenmpThreadInd() {
}


int cpu_getCurrentNumThreads() {
#if COMPILE_OPENMP
return omp_get_num_threads();
#else
return 1;
#endif
}



/*
* MEMORY ALLOCATION
*/

unsigned long cpu_getPageSize() {
#if NUMA_AWARE
static unsigned long page_size = 0;
if (!page_size) {
page_size = sysconf(_SC_PAGESIZE);
if (page_size == ~0UL) {
error_gettingPageSizeFailed();
}
}
return page_size;
#else
return FALLBACK_PAGE_SIZE;
#endif
}

#if NUMA_AWARE
unsigned long cpu_getNumaNodes() {
static int n_nodes = 0;
if (!n_nodes) {
n_nodes = numa_num_configured_nodes();
if (n_nodes < 1) {
error_gettingNumaNodesFailed();
}
}
return n_nodes;
}
#endif

qcomp* cpu_allocArray(qindex length) {
return (qcomp*) calloc(length, sizeof(qcomp));
}

/// @todo
/// here, we calloc the entire array in a serial setting, rather than one malloc
/// followed by threads subsequently memset'ing their own partitions. The latter
/// approach would distribute the array pages across NUMA nodes, accelerating
/// their subsequent access by the same threads (via NUMA's first-touch policy).
/// We have so far foregone this optimisation since a thread's memory-access pattern
/// in many of the QuEST functions is non-trivial, and likely to be inconsistent
/// with the memset pattern. As such, I expect the benefit is totally occluded
/// and only introduces potential new bugs - but this should be tested and confirmed!

// we call calloc over malloc in order to fail immediately if mem isn't available;
// caller must handle nullptr result

return (qcomp*) calloc(length, sizeof(qcomp));
qcomp* cpu_allocNumaArray(qindex length) {
#if !NUMA_AWARE
return cpu_allocArray(length);
#else
unsigned long page_size = cpu_getPageSize();
int n_nodes = cpu_getNumaNodes();

qindex size = length * sizeof(qcomp);
int pages = (size + page_size - 1) / page_size;
void *addr = mmap(NULL, pages * page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (n_nodes == 1) {
return reinterpret_cast<qcomp*>(addr);
}

// distribution strategy: floor_pages per node, distribute remain_pages as spread out as possible
int floor_pages = pages / n_nodes;
int spread_pages = pages % n_nodes;

uintptr_t pos = (uintptr_t)addr;
for (int node = 0, shift = n_nodes; node < n_nodes; ++node) {
shift -= spread_pages;
int node_pages = floor_pages + (shift <= 0);

unsigned long node_mask = 1UL << node;
mbind((void*)pos, node_pages * page_size, MPOL_BIND, &node_mask, sizeof(node_mask) * 8, 0);

pos += node_pages * page_size;
if (shift <= 0) {
shift += n_nodes;
}
}

return reinterpret_cast<qcomp*>(addr);
#endif // NUMA_AWARE
}


Expand All @@ -122,6 +190,23 @@ void cpu_deallocArray(qcomp* arr) {
}


void cpu_deallocNumaArray(qcomp* arr, qindex length) {
if (arr == nullptr) {
return;
}

#if !NUMA_AWARE
return cpu_deallocArray(arr);
#else
unsigned long page_size = cpu_getPageSize();
qindex size = length * sizeof(qcomp);
int pages = (size + page_size - 1) / page_size;

munmap(arr, pages * page_size);
#endif // NUMA_AWARE
}


qcomp** cpu_allocAndInitMatrixWrapper(qcomp* arr, qindex dim) {

// do not allocate if arr alloc failed (caller will handle)
Expand Down
9 changes: 8 additions & 1 deletion quest/src/cpu/cpu_config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ using std::vector;

bool cpu_isOpenmpCompiled();

int cpu_getCurrentNumThreads();
int cpu_getAvailableNumThreads();

int cpu_getNumOpenmpProcessors();

Expand All @@ -35,6 +35,8 @@ int cpu_getNumOpenmpProcessors();

int cpu_getOpenmpThreadInd();

int cpu_getCurrentNumThreads();



/*
Expand All @@ -44,6 +46,9 @@ int cpu_getOpenmpThreadInd();
qcomp* cpu_allocArray(qindex length);
void cpu_deallocArray(qcomp* arr);

qcomp* cpu_allocNumaArray(qindex length);
void cpu_deallocNumaArray(qcomp* arr, qindex length);

qcomp** cpu_allocAndInitMatrixWrapper(qcomp* arr, qindex dim);
void cpu_deallocMatrixWrapper(qcomp** wrapper);

Expand All @@ -60,6 +65,8 @@ PauliStr* cpu_allocPauliStrings(qindex numStrings);
void cpu_deallocPauliStrings(PauliStr* strings);


unsigned long cpu_getPageSize();


/*
* MEMORY MOVEMENT
Expand Down
9 changes: 7 additions & 2 deletions quest/src/cpu/cpu_subroutines.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -873,7 +873,7 @@ void cpu_statevector_anyCtrlPauliTensorOrGadget_subA(
// whenever each thread has at least 1 iteration for itself. And of course
// we serialise both inner and outer loops when qureg multithreading is off.

if (!qureg.isMultithreaded || numOuterIts >= cpu_getCurrentNumThreads()) {
if (!qureg.isMultithreaded || numOuterIts >= cpu_getAvailableNumThreads()) {

// parallel
#pragma omp parallel for if(qureg.isMultithreaded)
Expand Down Expand Up @@ -2357,7 +2357,12 @@ void cpu_statevec_initUniformState_sub(Qureg qureg, qcomp amp) {

// faster on average (though perhaps not for large quregs)
// than a custom multithreaded loop
std::fill(qureg.cpuAmps, qureg.cpuAmps + qureg.numAmpsPerNode, amp);
#pragma omp parallel if(qureg.isMultithreaded)
{
const auto [start, end] = util_distribute(qureg.numAmpsPerNode, cpu_getPageSize() / sizeof(qcomp),
cpu_getOpenmpThreadInd(), cpu_getCurrentNumThreads());
std::fill(qureg.cpuAmps + start, qureg.cpuAmps + end, amp);
}
}


Expand Down