Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .github/workflows/test_paid.yml
Original file line number Diff line number Diff line change
Expand Up @@ -257,12 +257,15 @@ jobs:
-DCMAKE_CUDA_ARCHITECTURES=${{ env.cuda_arch }}
-DTEST_ALL_DEPLOYMENTS=${{ env.test_all_deploys }}
-DTEST_NUM_MIXED_DEPLOYMENT_REPETITIONS=${{ env.test_repetitions }}
-DPERMIT_NODES_TO_SHARE_GPU=${{ env.mpi_share_gpu }}
-DCMAKE_CXX_FLAGS=${{ matrix.mpi == 'ON' && matrix.cuda == 'ON' && '-fno-lto' || '' }}

- name: Compile
run: cmake --build ${{ env.build_dir }} --parallel

# permit use of a single GPU by multiple MPI processes (degrades performance)
- name: Set env-var to permit GPU sharing
run: echo "PERMIT_NODES_TO_SHARE_GPU=${{ env.mpi_share_gpu }}" >> $GITHUB_ENV

# cannot use ctests when distributed, grr!
- name: Run GPU + distributed v4 mixed tests (4 nodes sharing 1 GPU)
run: |
Expand Down
19 changes: 1 addition & 18 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -231,15 +231,6 @@ if ((ENABLE_CUDA OR ENABLE_HIP) AND FLOAT_PRECISION STREQUAL 4)
message(FATAL_ERROR "Quad precision is not supported on GPU. Please disable GPU acceleration or lower precision.")
endif()

option(
PERMIT_NODES_TO_SHARE_GPU
"Whether to permit multiple distributed nodes to share a single GPU at the detriment of performance. Turned OFF by default."
OFF
)
if (ENABLE_DISTRIBUTION AND (ENABLE_CUDA OR ENABLE_HIP))
message(STATUS "Permitting nodes to share GPUs is turned ${PERMIT_NODES_TO_SHARE_GPU}. Set PERMIT_NODES_TO_SHARE_GPU to modify.")
endif()

# Deprecated API
option(
ENABLE_DEPRECATED_API
Expand Down Expand Up @@ -318,7 +309,7 @@ if (ENABLE_MULTITHREADING)
if (NOT OpenMP_FOUND)
set(ErrorMsg "Could not find OpenMP, necessary for enabling multithreading.")
if (APPLE AND CMAKE_CXX_COMPILER_ID MATCHES "Clang")
string(APPEND ErrorMsg " Try first calling `brew install libomp` then `export OpenMP_ROOT=$(brew --prefix)/opt/libomp`")
string(APPEND ErrorMsg " Try first calling \n\tbrew install libomp\nthen\n\texport OpenMP_ROOT=$(brew --prefix)/opt/libomp")
endif()
message(FATAL_ERROR ${ErrorMsg})
endif()
Expand Down Expand Up @@ -434,14 +425,6 @@ else()
endif()


if (ENABLE_DISTRIBUTION AND (ENABLE_CUDA OR ENABLE_HIP))
target_compile_definitions(
QuEST PRIVATE
PERMIT_NODES_TO_SHARE_GPU=$<IF:$<BOOL:${PERMIT_NODES_TO_SHARE_GPU}>,1,0>
)
endif()


# add math library
if (NOT MSVC)
target_link_libraries(QuEST PRIVATE ${MATH_LIBRARY})
Expand Down
3 changes: 3 additions & 0 deletions quest/include/environment.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ typedef struct {
// deployment modes which cannot be directly changed after compilation
int isCuQuantumEnabled;

// deployment configurations which can be changed via environment variables
int isGpuSharingEnabled;

// distributed configuration
int rank;
int numNodes;
Expand Down
46 changes: 37 additions & 9 deletions quest/include/modes.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,6 @@

// define optional-macro defaults (mostly to list them)

#ifndef PERMIT_NODES_TO_SHARE_GPU
#define PERMIT_NODES_TO_SHARE_GPU 0
#endif

#ifndef INCLUDE_DEPRECATED_FUNCTIONS
#define INCLUDE_DEPRECATED_FUNCTIONS 0
#endif
Expand All @@ -93,11 +89,6 @@
#if 0


/// @notyetdoced
/// @macrodoc
const int PERMIT_NODES_TO_SHARE_GPU = 0;


/// @notyetdoced
/// @macrodoc
const int INCLUDE_DEPRECATED_FUNCTIONS = 0;
Expand All @@ -112,6 +103,43 @@



// document environment variables

// spoof env-vars as consts to doc (hackily and hopefully temporarily)
#if 0


/** @envvardoc
*
* Specifies whether to permit multiple MPI processes to deploy to the same GPU.
*
* @attention
* This environment variable has no effect when either (or both) of distribution or
* GPU-acceleration are disabled.
*
* In multi-GPU execution, which combines distribution with GPU-acceleration, it is
* prudent to assign each GPU to at most one MPI process in order to avoid superfluous
* slowdown. Hence by default, initQuESTEnv() will forbid assigning multiple MPI processes
* to the same GPU. This environment variable can be set to `1` to disable this validation,
* permitting sharing of a single GPU, as is often useful for debugging or unit testing
* (for example, testing multi-GPU execution when only a single GPU is available).
*
* @warning
* Permitting GPU sharing may cause unintended behaviour when additionally using cuQuantum.
*
* @par Values
* - forbid sharing: @p 0, @p '0', @p '' (empty), or unspecified
* - permit sharing: @p 1, @p '1'
*
* @author Tyson Jones
*/
const int PERMIT_NODES_TO_SHARE_GPU = 0;


#endif



// user flags for choosing automatic deployment; only accessible by C++
// backend and C++ users; C users must hardcode -1

Expand Down
37 changes: 23 additions & 14 deletions quest/src/api/environment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

#include "quest/src/core/errors.hpp"
#include "quest/src/core/memory.hpp"
#include "quest/src/core/parser.hpp"
#include "quest/src/core/printer.hpp"
#include "quest/src/core/autodeployer.hpp"
#include "quest/src/core/validation.hpp"
Expand Down Expand Up @@ -102,12 +103,18 @@ void validateAndInitCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMulti
if (useGpuAccel)
gpu_bindLocalGPUsToNodes();

// each MPI process must use a unique GPU. This is critical when
// initializing cuQuantum, so we don't re-init cuStateVec on any
// particular GPU (causing runtime error), but still ensures we
// keep good performance in our custom backend GPU code; there is
// no reason to use multi-nodes-per-GPU except for dev/debugging.
if (useGpuAccel && useDistrib && ! PERMIT_NODES_TO_SHARE_GPU)
// consult environment variable to decide whether to allow GPU sharing
// (default 'no'=0) which informs whether below validation is triggered
bool permitGpuSharing = parser_validateAndParseOptionalBoolEnvVar(
"PERMIT_NODES_TO_SHARE_GPU", false, caller);

// each MPI process should ordinarily use a unique GPU. This is
// critical when initializing cuQuantum so that we don't re-init
// cuStateVec on any particular GPU (which can apparently cause a
// so-far-unwitnessed runtime error), and is otherwise essential
// for good performance. GPU sharing is however useful for unit
// testing, since it permits a single GPU to test CUDA+MPI deployment
if (useGpuAccel && useDistrib && ! permitGpuSharing)
validate_newEnvNodesEachHaveUniqueGpu(caller);

/// @todo
Expand All @@ -132,10 +139,11 @@ void validateAndInitCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMulti
error_allocOfQuESTEnvFailed();

// bind deployment info to global instance
globalEnvPtr->isMultithreaded = useMultithread;
globalEnvPtr->isGpuAccelerated = useGpuAccel;
globalEnvPtr->isDistributed = useDistrib;
globalEnvPtr->isCuQuantumEnabled = useCuQuantum;
globalEnvPtr->isMultithreaded = useMultithread;
globalEnvPtr->isGpuAccelerated = useGpuAccel;
globalEnvPtr->isDistributed = useDistrib;
globalEnvPtr->isCuQuantumEnabled = useCuQuantum;
globalEnvPtr->isGpuSharingEnabled = permitGpuSharing;

// bind distributed info
globalEnvPtr->rank = (useDistrib)? comm_getRank() : 0;
Expand Down Expand Up @@ -188,10 +196,11 @@ void printDeploymentInfo() {

print_table(
"deployment", {
{"isMpiEnabled", globalEnvPtr->isDistributed},
{"isGpuEnabled", globalEnvPtr->isGpuAccelerated},
{"isOmpEnabled", globalEnvPtr->isMultithreaded},
{"isCuQuantumEnabled", globalEnvPtr->isCuQuantumEnabled},
{"isMpiEnabled", globalEnvPtr->isDistributed},
{"isGpuEnabled", globalEnvPtr->isGpuAccelerated},
{"isOmpEnabled", globalEnvPtr->isMultithreaded},
{"isCuQuantumEnabled", globalEnvPtr->isCuQuantumEnabled},
{"isGpuSharingEnabled", globalEnvPtr->isGpuSharingEnabled},
});
}

Expand Down
6 changes: 6 additions & 0 deletions quest/src/comm/comm_routines.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,12 @@ using std::vector;
*
* - look into UCX CUDA multi-rail:
* https://docs.nvidia.com/networking/display/hpcxv215/unified+communication+-+x+framework+library#src-119764120_UnifiedCommunicationXFrameworkLibrary-Multi-RailMulti-Rail
*
* - by default, we validate to prevent sharing a GPU between multiple MPI processes, since doing so
* is easy to do unintentionally, is rarely necessary (outside of unit testing), and can severely
* degrade performance. If a strong non-testing use-case emerges, however, we could improve
* performance through use of CUDA's Multi-Process Service (MPS), which will prevent
* serialisation of memcpy to distinct memory partitions and improve kernel scheduling.
*/


Expand Down
28 changes: 28 additions & 0 deletions quest/src/core/parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <string>
#include <sstream>
#include <fstream>
#include <cstdlib>
#include <stdexcept>
#include <algorithm>

Expand Down Expand Up @@ -443,3 +444,30 @@ string parser_loadFile(string fn) {
buffer << file.rdbuf();
return buffer.str();
}



/*
* ENVIRONMENT VARIABLES
*/


/*
 * Reports whether a C-string carries no content, i.e. it is either a
 * null pointer or points directly at the terminating character.
 * Strings containing only whitespace are NOT considered empty.
 */
bool parser_isStrEmpty(const char* str) {

    // a null pointer (e.g. an unset environment variable) counts as empty
    if (str == nullptr)
        return true;

    // otherwise empty only when the very first character is the terminator
    return *str == '\0';
}


/*
 * Reads the environment variable named varName and interprets it as an
 * optional boolean. When the variable is unset or set to an empty string,
 * defaultVal is returned. Any other value is first passed through
 * validate_envVarIsBoolean() (with caller forwarded for error reporting)
 * and then parsed: a leading '0' yields false, anything else yields true.
 */
bool parser_validateAndParseOptionalBoolEnvVar(string varName, bool defaultVal, const char* caller) {

    // std::getenv gives a null pointer when the variable is not set
    const char* rawValue = std::getenv(varName.c_str());

    // an explicitly given (non-empty) value must be validated before parsing
    if (!parser_isStrEmpty(rawValue)) {
        validate_envVarIsBoolean(varName, rawValue, caller);
        return rawValue[0] != '0';
    }

    // unset or empty falls back to the default
    return defaultVal;
}
8 changes: 8 additions & 0 deletions quest/src/core/parser.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,5 +42,13 @@ bool parser_canReadFile(string fn);
string parser_loadFile(string fn);


/*
* ENVIRONMENT VARIABLES
*/

bool parser_isStrEmpty(const char* str);

bool parser_validateAndParseOptionalBoolEnvVar(string varName, bool defaultVal, const char* caller);


#endif // PARSER_HPP
48 changes: 41 additions & 7 deletions quest/src/core/validation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include <algorithm>
#include <iostream>
#include <cstdlib>
#include <cstring>
#include <string>
#include <vector>
#include <map>
Expand Down Expand Up @@ -1066,6 +1067,19 @@ namespace report {
string TEMP_ALLOC_FAILED =
"A temporary allocation of ${NUM_ELEMS} elements (each of ${NUM_BYTES_PER_ELEM} bytes) failed, possibly because of insufficient memory.";


/*
* ENVIRONMENT VARIABLES
*/

string COMPULSORY_ENV_VAR_WAS_NOT_SPECIFIED_OR_EMPTY =
"A compulsory (but alas here unspecified) environment variable was not set, or was set to an empty string.";

string INVALID_BOOLEAN_ENVIRONMENT_VARIABLE =
"A boolean environment variable (alas here unspecified) was given a value other than '0' or '1'.";

string INVALID_PERMIT_NODES_TO_SHARE_GPU_ENV_VAR =
"The optional, boolean PERMIT_NODES_TO_SHARE_GPU environment variable was specified to a value other than '', '0' or '1'.";
}


Expand Down Expand Up @@ -1364,13 +1378,8 @@ void validate_newEnvDistributedBetweenPower2Nodes(const char* caller) {

/*
 * Asserts (in agreement across all nodes) that no two nodes are bound to
 * the same GPU, reporting MULTIPLE_NODES_BOUND_TO_SAME_GPU on failure.
 *
 * This function validates unconditionally; it does not itself consult any
 * GPU-sharing permission. A caller wishing to permit GPU sharing must
 * simply not invoke this function.
 */
void validate_newEnvNodesEachHaveUniqueGpu(const char* caller) {

    // query the GPU bindings once, and assert once
    bool sharedGpus = gpu_areAnyNodesBoundToSameGpu();
    assertAllNodesAgreeThat(!sharedGpus, report::MULTIPLE_NODES_BOUND_TO_SAME_GPU, caller);
}

void validate_gpuIsCuQuantumCompatible(const char* caller) {
Expand Down Expand Up @@ -4165,3 +4174,28 @@ void validate_tempAllocSucceeded(bool succeeded, qindex numElems, qindex numByte

assertThat(succeeded, report::TEMP_ALLOC_FAILED, vars, caller);
}



/*
* ENVIRONMENT VARIABLES
*/

/*
 * Asserts that varStr, the value of the environment variable named varName,
 * is exactly the single character '0' or '1'. An unset or empty value is
 * treated as an error here; optional variables are expected to handle the
 * empty case before calling this function. caller is forwarded to the
 * assertions for error reporting.
 */
void validate_envVarIsBoolean(string varName, const char* varStr, const char* caller) {

    // reaching here with an empty value means a compulsory variable was not given
    assertThat(!parser_isStrEmpty(varStr), report::COMPULSORY_ENV_VAR_WAS_NOT_SPECIFIED_OR_EMPTY, caller);

    // varStr[1] is only inspected when varStr[0] is a non-terminal character
    // (via short-circuiting), so the second read stays within the string
    bool startsWithBit = (varStr[0] == '0') || (varStr[0] == '1');
    bool isSingleBit = startsWithBit && (varStr[1] == '\0');

    /// @todo include 'varName' in printed vars once tokenSubs can support strings
    // until then, the only currently supported environment variable receives a
    // dedicated message so that its name is featured in the reported error
    string errMsg;
    if (varName == "PERMIT_NODES_TO_SHARE_GPU")
        errMsg = report::INVALID_PERMIT_NODES_TO_SHARE_GPU_ENV_VAR;
    else
        errMsg = report::INVALID_BOOLEAN_ENVIRONMENT_VARIABLE;

    /// @todo include 'varStr' in printed vars once tokenSubs can support strings
    assertThat(isSingleBit, errMsg, caller);
}
9 changes: 8 additions & 1 deletion quest/src/core/validation.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -488,7 +488,6 @@ void validate_densMatrExpecDiagMatrValueIsReal(qcomp value, qcomp exponent, cons
* PARTIAL TRACE
*/


void validate_quregCanBeReduced(Qureg qureg, int numTraceQubits, const char* caller);

void validate_quregCanBeSetToReducedDensMatr(Qureg out, Qureg in, int numTraceQubits, const char* caller);
Expand All @@ -511,4 +510,12 @@ void validate_tempAllocSucceeded(bool succeeded, qindex numElems, qindex numByte



/*
* ENVIRONMENT VARIABLES
*/

void validate_envVarIsBoolean(std::string varName, const char* varStr, const char* caller);



#endif // VALIDATION_HPP
15 changes: 8 additions & 7 deletions tests/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,13 +88,14 @@ class startListener : public Catch::EventListenerBase {
QuESTEnv env = getQuESTEnv();
std::cout << std::endl;
std::cout << "QuEST execution environment:" << std::endl;
std::cout << " precision: " << FLOAT_PRECISION << std::endl;
std::cout << " multithreaded: " << env.isMultithreaded << std::endl;
std::cout << " distributed: " << env.isDistributed << std::endl;
std::cout << " GPU-accelerated: " << env.isGpuAccelerated << std::endl;
std::cout << " cuQuantum: " << env.isCuQuantumEnabled << std::endl;
std::cout << " num nodes: " << env.numNodes << std::endl;
std::cout << " num qubits: " << getNumCachedQubits() << std::endl;
std::cout << " precision: " << FLOAT_PRECISION << std::endl;
std::cout << " multithreaded: " << env.isMultithreaded << std::endl;
std::cout << " distributed: " << env.isDistributed << std::endl;
std::cout << " GPU-accelerated: " << env.isGpuAccelerated << std::endl;
std::cout << " GPU-sharing ok: " << env.isGpuSharingEnabled << std::endl;
std::cout << " cuQuantum: " << env.isCuQuantumEnabled << std::endl;
std::cout << " num nodes: " << env.numNodes << std::endl;
std::cout << " num qubits: " << getNumCachedQubits() << std::endl;
std::cout << " num qubit perms: " << TEST_MAX_NUM_QUBIT_PERMUTATIONS << std::endl;
std::cout << std::endl;

Expand Down
Loading