From 52a3b4bd2368d3915cda6c1efa343f3c86fc8ed8 Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Sat, 14 Jun 2025 19:24:14 +0200 Subject: [PATCH 1/2] changed PERMIT_NODES_TO_SHARE_GPU from macro to env-var This allows it to be changed post-compilation/installation, pre-execution, as per the machination in #645 Additionally inserted whitespaces into cmake error message about MacOS multithreading to make the advised commands clearer --- .github/workflows/test_paid.yml | 5 +++- CMakeLists.txt | 19 +------------ quest/include/environment.h | 3 ++ quest/include/modes.h | 43 ++++++++++++++++++++++------ quest/src/api/environment.cpp | 37 ++++++++++++++---------- quest/src/comm/comm_routines.cpp | 6 ++++ quest/src/core/parser.cpp | 28 +++++++++++++++++++ quest/src/core/parser.hpp | 8 ++++++ quest/src/core/validation.cpp | 48 +++++++++++++++++++++++++++----- quest/src/core/validation.hpp | 9 +++++- tests/main.cpp | 15 +++++----- tests/unit/environment.cpp | 16 ++++++++--- utils/docs/Doxyfile | 1 + 13 files changed, 177 insertions(+), 61 deletions(-) diff --git a/.github/workflows/test_paid.yml b/.github/workflows/test_paid.yml index 878336f7f..c8fe34c03 100644 --- a/.github/workflows/test_paid.yml +++ b/.github/workflows/test_paid.yml @@ -257,12 +257,15 @@ jobs: -DCMAKE_CUDA_ARCHITECTURES=${{ env.cuda_arch }} -DTEST_ALL_DEPLOYMENTS=${{ env.test_all_deploys }} -DTEST_NUM_MIXED_DEPLOYMENT_REPETITIONS=${{ env.test_repetitions }} - -DPERMIT_NODES_TO_SHARE_GPU=${{ env.mpi_share_gpu }} -DCMAKE_CXX_FLAGS=${{ matrix.mpi == 'ON' && matrix.cuda == 'ON' && '-fno-lto' || '' }} - name: Compile run: cmake --build ${{ env.build_dir }} --parallel + # permit use of single GPU by multiple MPI processes (detriments performance) + - name: Set env-var to permit GPU sharing + run: echo "PERMIT_NODES_TO_SHARE_GPU=${{ env.mpi_share_gpu }}" >> $GITHUB_ENV + # cannot use ctests when distributed, grr! - name: Run GPU + distributed v4 mixed tests (4 nodes sharing 1 GPU) run: | diff --git a/CMakeLists.txt b/CMakeLists.txt index 1080c55fc..8418b5ba5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -231,15 +231,6 @@ if ((ENABLE_CUDA OR ENABLE_HIP) AND FLOAT_PRECISION STREQUAL 4) message(FATAL_ERROR "Quad precision is not supported on GPU. Please disable GPU acceleration or lower precision.") endif() -option( - PERMIT_NODES_TO_SHARE_GPU - "Whether to permit multiple distributed nodes to share a single GPU at the detriment of performance. Turned OFF by default." - OFF -) -if (ENABLE_DISTRIBUTION AND (ENABLE_CUDA OR ENABLE_HIP)) - message(STATUS "Permitting nodes to share GPUs is turned ${PERMIT_NODES_TO_SHARE_GPU}. Set PERMIT_NODES_TO_SHARE_GPU to modify.") -endif() - # Deprecated API option( ENABLE_DEPRECATED_API @@ -318,7 +309,7 @@ if (ENABLE_MULTITHREADING) if (NOT OpenMP_FOUND) set(ErrorMsg "Could not find OpenMP, necessary for enabling multithreading.") if (APPLE AND CMAKE_CXX_COMPILER_ID MATCHES "Clang") - string(APPEND ErrorMsg " Try first calling `brew install libomp` then `export OpenMP_ROOT=$(brew --prefix)/opt/libomp`") + string(APPEND ErrorMsg " Try first calling \n\tbrew install libomp\nthen\n\texport OpenMP_ROOT=$(brew --prefix)/opt/libomp") endif() message(FATAL_ERROR ${ErrorMsg}) endif() @@ -434,14 +425,6 @@ else() endif() -if (ENABLE_DISTRIBUTION AND (ENABLE_CUDA OR ENABLE_HIP)) - target_compile_definitions( - QuEST PRIVATE - PERMIT_NODES_TO_SHARE_GPU=$,1,0> - ) -endif() - - # add math library if (NOT MSVC) target_link_libraries(QuEST PRIVATE ${MATH_LIBRARY}) diff --git a/quest/include/environment.h b/quest/include/environment.h index a71454f0e..04f24bfe2 100644 --- a/quest/include/environment.h +++ b/quest/include/environment.h @@ -40,6 +40,9 @@ typedef struct { // deployment modes which cannot be directly changed after compilation int isCuQuantumEnabled; + // deployment configurations which can be changed via environment variables + int isGpuSharingEnabled; + // distributed configuration int rank; int numNodes; diff --git a/quest/include/modes.h b/quest/include/modes.h index b90797acd..3d8ccc622 100644 --- a/quest/include/modes.h +++ b/quest/include/modes.h @@ -75,10 +75,6 @@ // define optional-macro defaults (mostly to list them) -#ifndef PERMIT_NODES_TO_SHARE_GPU -#define PERMIT_NODES_TO_SHARE_GPU 0 -#endif - #ifndef INCLUDE_DEPRECATED_FUNCTIONS #define INCLUDE_DEPRECATED_FUNCTIONS 0 #endif @@ -93,11 +89,6 @@ #if 0 - /// @notyetdoced - /// @macrodoc - const int PERMIT_NODES_TO_SHARE_GPU = 0; - - /// @notyetdoced /// @macrodoc const int INCLUDE_DEPRECATED_FUNCTIONS = 0; @@ -112,6 +103,40 @@ +// document environment variables + +// spoof env-vars as consts to doc (hackily and hopefully temporarily) +#if 0 + + + /** @envvardoc + * + * Specifies whether to permit multiple MPI processes to deploy to the same GPU. + * + * @attention + * This environment variable has no effect when either (or both) of distribution or + * GPU-acceleration are disabled. + * + * In multi-GPU execution, which combines distribution with GPU-acceleration, it is + * prudent to assign each GPU to at most one MPI process in order to avoid superfluous + * slowdown. Hence by default, initQuESTEnv() will forbid assigning multiple MPI processes + * to the same GPU. This environment variable can be set to `1` to disable this validation, + * permitting sharing of a single GPU, as is often useful for debugging or unit testing + * (for example, testing multi-GPU execution when only a single GPU is available). + * + * @par Values + * - forbid sharing: @p 0, @p '0', @p '', @p , (unspecified) + * - permit sharing: @p 1, @p '1' + * + * @author Tyson Jones + */ + const int PERMIT_NODES_TO_SHARE_GPU = 0; + + +#endif + + + // user flags for choosing automatic deployment; only accessible by C++ // backend and C++ users; C users must hardcode -1 diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp index 63b6f41ef..959e93693 100644 --- a/quest/src/api/environment.cpp +++ b/quest/src/api/environment.cpp @@ -11,6 +11,7 @@ #include "quest/src/core/errors.hpp" #include "quest/src/core/memory.hpp" +#include "quest/src/core/parser.hpp" #include "quest/src/core/printer.hpp" #include "quest/src/core/autodeployer.hpp" #include "quest/src/core/validation.hpp" @@ -102,12 +103,18 @@ void validateAndInitCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMulti if (useGpuAccel) gpu_bindLocalGPUsToNodes(); - // each MPI process must use a unique GPU. This is critical when - // initializing cuQuantum, so we don't re-init cuStateVec on any - // paticular GPU (causing runtime error), but still ensures we - // keep good performance in our custom backend GPU code; there is - // no reason to use multi-nodes-per-GPU except for dev/debugging. - if (useGpuAccel && useDistrib && ! PERMIT_NODES_TO_SHARE_GPU) + // consult environment variable to decide whether to allow GPU sharing + // (default 'no'=0) which informs whether below validation is triggered + bool permitGpuSharing = parser_validateAndParseOptionalBoolEnvVar( + "PERMIT_NODES_TO_SHARE_GPU", false, caller); + + // each MPI process should ordinarily use a unique GPU. This is + // critical when initializing cuQuantum so that we don't re-init + // cuStateVec on any paticular GPU (which can apparently cause a + // so-far-unwitnessed runtime error), but is otherwise essential + // for good performance. GPU sharing is useful for unit testing + // however permitting a single GPU to test CUDA+MPI deployment + if (useGpuAccel && useDistrib && ! permitGpuSharing) validate_newEnvNodesEachHaveUniqueGpu(caller); /// @todo @@ -132,10 +139,11 @@ void validateAndInitCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMulti error_allocOfQuESTEnvFailed(); // bind deployment info to global instance - globalEnvPtr->isMultithreaded = useMultithread; - globalEnvPtr->isGpuAccelerated = useGpuAccel; - globalEnvPtr->isDistributed = useDistrib; - globalEnvPtr->isCuQuantumEnabled = useCuQuantum; + globalEnvPtr->isMultithreaded = useMultithread; + globalEnvPtr->isGpuAccelerated = useGpuAccel; + globalEnvPtr->isDistributed = useDistrib; + globalEnvPtr->isCuQuantumEnabled = useCuQuantum; + globalEnvPtr->isGpuSharingEnabled = permitGpuSharing; // bind distributed info globalEnvPtr->rank = (useDistrib)? comm_getRank() : 0; @@ -188,10 +196,11 @@ void printDeploymentInfo() { print_table( "deployment", { - {"isMpiEnabled", globalEnvPtr->isDistributed}, - {"isGpuEnabled", globalEnvPtr->isGpuAccelerated}, - {"isOmpEnabled", globalEnvPtr->isMultithreaded}, - {"isCuQuantumEnabled", globalEnvPtr->isCuQuantumEnabled}, + {"isMpiEnabled", globalEnvPtr->isDistributed}, + {"isGpuEnabled", globalEnvPtr->isGpuAccelerated}, + {"isOmpEnabled", globalEnvPtr->isMultithreaded}, + {"isCuQuantumEnabled", globalEnvPtr->isCuQuantumEnabled}, + {"isGpuSharingEnabled", globalEnvPtr->isGpuSharingEnabled}, }); } diff --git a/quest/src/comm/comm_routines.cpp b/quest/src/comm/comm_routines.cpp index 3c03d23f6..6e161db18 100644 --- a/quest/src/comm/comm_routines.cpp +++ b/quest/src/comm/comm_routines.cpp @@ -76,6 +76,12 @@ using std::vector; * * - look into UCX CUDA multi-rail: * https://docs.nvidia.com/networking/display/hpcxv215/unified+communication+-+x+framework+library#src-119764120_UnifiedCommunicationXFrameworkLibrary-Multi-RailMulti-Rail + * + * - by default, we validate to prevent sharing a GPU between multiple MPI processes since it is + * easy to do unintentionally yet is rarely necessary (outside of unit testing) and can severely + * degrade performance. If we motivated a strong non-testing use-case for this however, we could + * improve performance through use of CUDA's Multi-Process Service (MPS) which will prevent + * serialisation of memcpy to distinct memory partitions and improve kernel scheduling. */ diff --git a/quest/src/core/parser.cpp b/quest/src/core/parser.cpp index 31dad4e5f..c83afcba0 100644 --- a/quest/src/core/parser.cpp +++ b/quest/src/core/parser.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -443,3 +444,30 @@ string parser_loadFile(string fn) { buffer << file.rdbuf(); return buffer.str(); } + + + +/* + * ENVIRONMENT VARIABLES + */ + + +bool parser_isStrEmpty(const char* str) { + + // str can be unallocated or empty, but not e.g. whitespace + return (str == nullptr) || (str[0] == '\0'); +} + + +bool parser_validateAndParseOptionalBoolEnvVar(string varName, bool defaultVal, const char* caller) { + + const char* varStr = std::getenv(varName.c_str()); + + // permit specifying no or empty environment variable (triggering default) + if (parser_isStrEmpty(varStr)) + return defaultVal; + + // otherwise it must be precisely 0 or 1 without whitespace + validate_envVarIsBoolean(varName, varStr, caller); + return (varStr[0] == '0')? 0 : 1; +} diff --git a/quest/src/core/parser.hpp b/quest/src/core/parser.hpp index 3e9d18c11..4aa1e8726 100644 --- a/quest/src/core/parser.hpp +++ b/quest/src/core/parser.hpp @@ -42,5 +42,13 @@ bool parser_canReadFile(string fn); string parser_loadFile(string fn); +/* + * ENVIRONMENT VARIABLES + */ + +bool parser_isStrEmpty(const char* str); + +bool parser_validateAndParseOptionalBoolEnvVar(string varName, bool defaultVal, const char* caller); + #endif // PARSER_HPP \ No newline at end of file diff --git a/quest/src/core/validation.cpp b/quest/src/core/validation.cpp index 4b3406975..97d399f11 100644 --- a/quest/src/core/validation.cpp +++ b/quest/src/core/validation.cpp @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -1066,6 +1067,19 @@ namespace report { string TEMP_ALLOC_FAILED = "A temporary allocation of ${NUM_ELEMS} elements (each of ${NUM_BYTES_PER_ELEM} bytes) failed, possibly because of insufficient memory."; + + /* + * ENVIRONMENT VARIABLES + */ + + string COMPULSORY_ENV_VAR_WAS_NOT_SPECIFIED_OR_EMPTY = + "A compulsory (but alas here unspecified) environment variable was not set, or was set to an empty string."; + + string INVALID_BOOLEAN_ENVIRONMENT_VARIABLE = + "A boolean environment variable (alas here unspecified) was given a value other than '0' or '1'."; + + string INVALID_PERMIT_NODES_TO_SHARE_GPU_ENV_VAR = + "The optional, boolean PERMIT_NODES_TO_SHARE_GPU environment variable was specified to a value other than '', '0' or '1'."; } @@ -1364,13 +1378,8 @@ void validate_newEnvDistributedBetweenPower2Nodes(const char* caller) { void validate_newEnvNodesEachHaveUniqueGpu(const char* caller) { - // this validation can be disabled for debugging/dev purposes - // (caller should explicitly check this preprocessor too for clarity) - if (PERMIT_NODES_TO_SHARE_GPU) - return; - - bool uniqueGpus = ! gpu_areAnyNodesBoundToSameGpu(); - assertAllNodesAgreeThat(uniqueGpus, report::MULTIPLE_NODES_BOUND_TO_SAME_GPU, caller); + bool sharedGpus = gpu_areAnyNodesBoundToSameGpu(); + assertAllNodesAgreeThat(!sharedGpus, report::MULTIPLE_NODES_BOUND_TO_SAME_GPU, caller); } void validate_gpuIsCuQuantumCompatible(const char* caller) { @@ -4165,3 +4174,28 @@ void validate_tempAllocSucceeded(bool succeeded, qindex numElems, qindex numByte assertThat(succeeded, report::TEMP_ALLOC_FAILED, vars, caller); } + + + +/* + * ENVIRONMENT VARIABLES + */ + +void validate_envVarIsBoolean(string varName, const char* varStr, const char* caller) { + + // empty non-compulsory environment vars never reach this validation function + assertThat(!parser_isStrEmpty(varStr), report::COMPULSORY_ENV_VAR_WAS_NOT_SPECIFIED_OR_EMPTY, caller); + + // value must be a single 0 or 1 character (below expr works even when str has no terminal) + bool isValid = (varStr[0] == '0' || varStr[0] == '1') && (varStr[1] == '\0'); + + /// @todo include 'varName' in printed vars once tokenSubs can support strings + // hackily ensure "PERMIT_NODES_TO_SHARE_GPU" is featured in the error message as + // the only currently supported environment variable and is important to specify + string errMsg = (varName == "PERMIT_NODES_TO_SHARE_GPU")? + report::INVALID_PERMIT_NODES_TO_SHARE_GPU_ENV_VAR : + report::INVALID_BOOLEAN_ENVIRONMENT_VARIABLE; + + /// @todo include 'varStr' in printed vars once tokenSubs can support strings + assertThat(isValid, errMsg, caller); +} diff --git a/quest/src/core/validation.hpp b/quest/src/core/validation.hpp index 0bf48b409..cf8b62535 100644 --- a/quest/src/core/validation.hpp +++ b/quest/src/core/validation.hpp @@ -488,7 +488,6 @@ void validate_densMatrExpecDiagMatrValueIsReal(qcomp value, qcomp exponent, cons * PARTIAL TRACE */ - void validate_quregCanBeReduced(Qureg qureg, int numTraceQubits, const char* caller); void validate_quregCanBeSetToReducedDensMatr(Qureg out, Qureg in, int numTraceQubits, const char* caller); @@ -511,4 +510,12 @@ void validate_tempAllocSucceeded(bool succeeded, qindex numElems, qindex numByte +/* + * ENVIRONMENT VARIABLES + */ + +void validate_envVarIsBoolean(std::string varName, const char* varStr, const char* caller); + + + #endif // VALIDATION_HPP \ No newline at end of file diff --git a/tests/main.cpp b/tests/main.cpp index 03294857a..29647753e 100644 --- a/tests/main.cpp +++ b/tests/main.cpp @@ -88,13 +88,14 @@ class startListener : public Catch::EventListenerBase { QuESTEnv env = getQuESTEnv(); std::cout << std::endl; std::cout << "QuEST execution environment:" << std::endl; - std::cout << " precision: " << FLOAT_PRECISION << std::endl; - std::cout << " multithreaded: " << env.isMultithreaded << std::endl; - std::cout << " distributed: " << env.isDistributed << std::endl; - std::cout << " GPU-accelerated: " << env.isGpuAccelerated << std::endl; - std::cout << " cuQuantum: " << env.isCuQuantumEnabled << std::endl; - std::cout << " num nodes: " << env.numNodes << std::endl; - std::cout << " num qubits: " << getNumCachedQubits() << std::endl; + std::cout << " precision: " << FLOAT_PRECISION << std::endl; + std::cout << " multithreaded: " << env.isMultithreaded << std::endl; + std::cout << " distributed: " << env.isDistributed << std::endl; + std::cout << " GPU-accelerated: " << env.isGpuAccelerated << std::endl; + std::cout << " GPU-sharing ok: " << env.isGpuSharingEnabled << std::endl; + std::cout << " cuQuantum: " << env.isCuQuantumEnabled << std::endl; + std::cout << " num nodes: " << env.numNodes << std::endl; + std::cout << " num qubits: " << getNumCachedQubits() << std::endl; std::cout << " num qubit perms: " << TEST_MAX_NUM_QUBIT_PERMUTATIONS << std::endl; std::cout << std::endl; diff --git a/tests/unit/environment.cpp b/tests/unit/environment.cpp index db9a1516c..6d4efb80d 100644 --- a/tests/unit/environment.cpp +++ b/tests/unit/environment.cpp @@ -54,6 +54,13 @@ TEST_CASE( "initQuESTEnv", TEST_CATEGORY ) { SECTION( LABEL_VALIDATION ) { REQUIRE_THROWS_WITH( initQuESTEnv(), ContainsSubstring( "already been initialised") ); + + // cannot automatically check other validations, such as: + // - has env been previously initialised then finalised? + // - is env distributed over power-of-2 nodes? + // - are environment-variables valid? + // - is max 1 MPI process bound to each GPU? + // - is GPU compatible with cuQuantum (if enabled)? } } @@ -133,10 +140,11 @@ TEST_CASE( "getQuESTEnv", TEST_CATEGORY ) { QuESTEnv env = getQuESTEnv(); - REQUIRE( (env.isMultithreaded == 0 || env.isMultithreaded == 1) ); - REQUIRE( (env.isGpuAccelerated == 0 || env.isGpuAccelerated == 1) ); - REQUIRE( (env.isDistributed == 0 || env.isDistributed == 1) ); - REQUIRE( (env.isCuQuantumEnabled == 0 || env.isCuQuantumEnabled == 1) ); + REQUIRE( (env.isMultithreaded == 0 || env.isMultithreaded == 1) ); + REQUIRE( (env.isGpuAccelerated == 0 || env.isGpuAccelerated == 1) ); + REQUIRE( (env.isDistributed == 0 || env.isDistributed == 1) ); + REQUIRE( (env.isCuQuantumEnabled == 0 || env.isCuQuantumEnabled == 1) ); + REQUIRE( (env.isGpuSharingEnabled == 0 || env.isGpuSharingEnabled == 1) ); REQUIRE( env.rank >= 0 ); REQUIRE( env.numNodes >= 0 ); diff --git a/utils/docs/Doxyfile b/utils/docs/Doxyfile index 4eb40b524..f113eaba5 100644 --- a/utils/docs/Doxyfile +++ b/utils/docs/Doxyfile @@ -301,6 +301,7 @@ ALIASES += "notyetdoced=@note Documentation for this function or struct is under ALIASES += "cpponly=@remark This function is only available in C++." ALIASES += "conly=@remark This function is only available in C." ALIASES += "macrodoc=@note This entity is actually a macro." +ALIASES += "envvardoc=@note This entity is actually an environment variable." ALIASES += "neverdoced=@warning This entity is a macro, undocumented directly due to a Doxygen limitation. If you see this doc rendered, contact the devs!" ALIASES += "myexample=@par Example" ALIASES += "equivalences=@par Equivalences" From fd49f55b149095e93b6ff33abd5918baac248274 Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Sat, 14 Jun 2025 19:27:57 +0200 Subject: [PATCH 2/2] added cuQuantum warning --- quest/include/modes.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/quest/include/modes.h b/quest/include/modes.h index 3d8ccc622..b29dc626a 100644 --- a/quest/include/modes.h +++ b/quest/include/modes.h @@ -124,6 +124,9 @@ * permitting sharing of a single GPU, as is often useful for debugging or unit testing * (for example, testing multi-GPU execution when only a single GPU is available). * + * @warning + * Permitting GPU sharing may cause unintended behaviour when additionally using cuQuantum. + * * @par Values * - forbid sharing: @p 0, @p '0', @p '', @p , (unspecified) * - permit sharing: @p 1, @p '1'