QuEST-Kit · TysonRayJones · Jun 14, 2025 · Jun 14, 2025
diff --git a/.github/workflows/test_paid.yml b/.github/workflows/test_paid.yml
@@ -257,12 +257,15 @@ jobs:
           -DCMAKE_CUDA_ARCHITECTURES=${{ env.cuda_arch }}
           -DTEST_ALL_DEPLOYMENTS=${{ env.test_all_deploys }}
           -DTEST_NUM_MIXED_DEPLOYMENT_REPETITIONS=${{ env.test_repetitions }}
-          -DPERMIT_NODES_TO_SHARE_GPU=${{ env.mpi_share_gpu }}
           -DCMAKE_CXX_FLAGS=${{ matrix.mpi == 'ON' && matrix.cuda == 'ON' && '-fno-lto' || '' }}
 
       - name: Compile
         run: cmake --build ${{ env.build_dir }} --parallel
 
+      # permit use of single GPU by multiple MPI processes (detriments performance)
+      - name: Set env-var to permit GPU sharing
+        run: echo "PERMIT_NODES_TO_SHARE_GPU=${{ env.mpi_share_gpu }}" >> $GITHUB_ENV
+
       # cannot use ctests when distributed, grr!
       - name: Run GPU + distributed v4 mixed tests (4 nodes sharing 1 GPU)
         run: |

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -231,15 +231,6 @@ if ((ENABLE_CUDA OR ENABLE_HIP) AND FLOAT_PRECISION STREQUAL 4)
   message(FATAL_ERROR "Quad precision is not supported on GPU. Please disable GPU acceleration or lower precision.")
 endif()
 
-option(
-  PERMIT_NODES_TO_SHARE_GPU
-  "Whether to permit multiple distributed nodes to share a single GPU at the detriment of performance. Turned OFF by default."
-  OFF
-)
-if (ENABLE_DISTRIBUTION AND (ENABLE_CUDA OR ENABLE_HIP))
-  message(STATUS "Permitting nodes to share GPUs is turned ${PERMIT_NODES_TO_SHARE_GPU}. Set PERMIT_NODES_TO_SHARE_GPU to modify.")
-endif()
-
 # Deprecated API
 option(
   ENABLE_DEPRECATED_API
@@ -318,7 +309,7 @@ if (ENABLE_MULTITHREADING)
   if (NOT OpenMP_FOUND)
     set(ErrorMsg "Could not find OpenMP, necessary for enabling multithreading.")
     if (APPLE AND CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-      string(APPEND ErrorMsg " Try first calling `brew install libomp` then `export OpenMP_ROOT=$(brew --prefix)/opt/libomp`")
+      string(APPEND ErrorMsg " Try first calling \n\tbrew install libomp\nthen\n\texport OpenMP_ROOT=$(brew --prefix)/opt/libomp")
     endif()
     message(FATAL_ERROR ${ErrorMsg})
   endif()
@@ -434,14 +425,6 @@ else()
 endif()
 
 
-if (ENABLE_DISTRIBUTION AND (ENABLE_CUDA OR ENABLE_HIP))
-  target_compile_definitions(
-    QuEST PRIVATE 
-    PERMIT_NODES_TO_SHARE_GPU=$<IF:$<BOOL:${PERMIT_NODES_TO_SHARE_GPU}>,1,0>
-  )
-endif()
-
-
 # add math library
 if (NOT MSVC)
   target_link_libraries(QuEST PRIVATE ${MATH_LIBRARY})

diff --git a/quest/include/environment.h b/quest/include/environment.h
@@ -40,6 +40,9 @@ typedef struct {
     // deployment modes which cannot be directly changed after compilation
     int isCuQuantumEnabled;
 
+    // deployment configurations which can be changed via environment variables
+    int isGpuSharingEnabled;
+
     // distributed configuration
     int rank;
     int numNodes;

diff --git a/quest/include/modes.h b/quest/include/modes.h
@@ -75,10 +75,6 @@
 
 // define optional-macro defaults (mostly to list them)
 
-#ifndef PERMIT_NODES_TO_SHARE_GPU
-#define PERMIT_NODES_TO_SHARE_GPU 0
-#endif
-
 #ifndef INCLUDE_DEPRECATED_FUNCTIONS
 #define INCLUDE_DEPRECATED_FUNCTIONS 0
 #endif
@@ -93,11 +89,6 @@
 #if 0
 
 
-    /// @notyetdoced
-    /// @macrodoc
-    const int PERMIT_NODES_TO_SHARE_GPU = 0;
-
-
     /// @notyetdoced
     /// @macrodoc
     const int INCLUDE_DEPRECATED_FUNCTIONS = 0;
@@ -112,6 +103,43 @@
 
 
 
+// document environment variables
+
+// spoof env-vars as consts to doc (hackily and hopefully temporarily)
+#if 0
+
+
+    /** @envvardoc
+     * 
+     * Specifies whether to permit multiple MPI processes to deploy to the same GPU.
+     * 
+     * @attention 
+     * This environment variable has no effect when either (or both) of distribution or 
+     * GPU-acceleration are disabled.
+     * 
+     * In multi-GPU execution, which combines distribution with GPU-acceleration, it is 
+     * prudent to assign each GPU to at most one MPI process in order to avoid superfluous 
+     * slowdown. Hence by default, initQuESTEnv() will forbid assigning multiple MPI processes 
+     * to the same GPU. This environment variable can be set to `1` to disable this validation, 
+     * permitting sharing of a single GPU, as is often useful for debugging or unit testing 
+     * (for example, testing multi-GPU execution when only a single GPU is available).
+     * 
+     * @warning
+     * Permitting GPU sharing may cause unintended behaviour when additionally using cuQuantum.
+     * 
+     * @par Values
+     *  - forbid sharing: @p 0, @p '0', @p '', @p , (unspecified)
+     *  - permit sharing: @p 1, @p '1'
+     * 
+     * @author Tyson Jones
+     */
+    const int PERMIT_NODES_TO_SHARE_GPU = 0;
+
+
+#endif
+
+
+
 // user flags for choosing automatic deployment; only accessible by C++ 
 // backend and C++ users; C users must hardcode -1 
 

diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
@@ -11,6 +11,7 @@
 
 #include "quest/src/core/errors.hpp"
 #include "quest/src/core/memory.hpp"
+#include "quest/src/core/parser.hpp"
 #include "quest/src/core/printer.hpp"
 #include "quest/src/core/autodeployer.hpp"
 #include "quest/src/core/validation.hpp"
@@ -102,12 +103,18 @@ void validateAndInitCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMulti
     if (useGpuAccel)
         gpu_bindLocalGPUsToNodes();
 
-    // each MPI process must use a unique GPU. This is critical when
-    // initializing cuQuantum, so we don't re-init cuStateVec on any
-    // paticular GPU (causing runtime error), but still ensures we 
-    // keep good performance in our custom backend GPU code; there is
-    // no reason to use multi-nodes-per-GPU except for dev/debugging.
-    if (useGpuAccel && useDistrib && ! PERMIT_NODES_TO_SHARE_GPU)
+    // consult environment variable to decide whether to allow GPU sharing 
+    // (default 'no'=0) which informs whether below validation is triggered
+    bool permitGpuSharing = parser_validateAndParseOptionalBoolEnvVar(
+        "PERMIT_NODES_TO_SHARE_GPU", false, caller);
+
+    // each MPI process should ordinarily use a unique GPU. This is 
+    // critical when initializing cuQuantum so that we don't re-init 
+    // cuStateVec on any paticular GPU (which can apparently cause a
+    // so-far-unwitnessed runtime error), but is otherwise essential
+    // for good performance. GPU sharing is useful for unit testing
+    // however permitting a single GPU to test CUDA+MPI deployment
+    if (useGpuAccel && useDistrib && ! permitGpuSharing)
         validate_newEnvNodesEachHaveUniqueGpu(caller);
 
     /// @todo
@@ -132,10 +139,11 @@ void validateAndInitCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMulti
         error_allocOfQuESTEnvFailed();
 
     // bind deployment info to global instance
-    globalEnvPtr->isMultithreaded    = useMultithread;
-    globalEnvPtr->isGpuAccelerated   = useGpuAccel;
-    globalEnvPtr->isDistributed      = useDistrib;
-    globalEnvPtr->isCuQuantumEnabled = useCuQuantum;
+    globalEnvPtr->isMultithreaded     = useMultithread;
+    globalEnvPtr->isGpuAccelerated    = useGpuAccel;
+    globalEnvPtr->isDistributed       = useDistrib;
+    globalEnvPtr->isCuQuantumEnabled  = useCuQuantum;
+    globalEnvPtr->isGpuSharingEnabled = permitGpuSharing;
 
     // bind distributed info
     globalEnvPtr->rank     = (useDistrib)? comm_getRank()     : 0;
@@ -188,10 +196,11 @@ void printDeploymentInfo() {
 
     print_table(
         "deployment", {
-        {"isMpiEnabled",       globalEnvPtr->isDistributed},
-        {"isGpuEnabled",       globalEnvPtr->isGpuAccelerated},
-        {"isOmpEnabled",       globalEnvPtr->isMultithreaded},
-        {"isCuQuantumEnabled", globalEnvPtr->isCuQuantumEnabled},
+        {"isMpiEnabled",        globalEnvPtr->isDistributed},
+        {"isGpuEnabled",        globalEnvPtr->isGpuAccelerated},
+        {"isOmpEnabled",        globalEnvPtr->isMultithreaded},
+        {"isCuQuantumEnabled",  globalEnvPtr->isCuQuantumEnabled},
+        {"isGpuSharingEnabled", globalEnvPtr->isGpuSharingEnabled},
     });
 }
 

diff --git a/quest/src/comm/comm_routines.cpp b/quest/src/comm/comm_routines.cpp
@@ -76,6 +76,12 @@ using std::vector;
  * 
  * - look into UCX CUDA multi-rail:
  *   https://docs.nvidia.com/networking/display/hpcxv215/unified+communication+-+x+framework+library#src-119764120_UnifiedCommunicationXFrameworkLibrary-Multi-RailMulti-Rail 
+ * 
+ * - by default, we validate to prevent sharing a GPU between multiple MPI processes since it is
+ *   easy to do unintentionally yet is rarely necessary (outside of unit testing) and can severely 
+ *   degrade performance. If we motivated a strong non-testing use-case for this however, we could
+ *   improve performance through use of CUDA's Multi-Process Service (MPS) which will prevent
+ *   serialisation of memcpy to distinct memory partitions and improve kernel scheduling.  
  */
 
 

diff --git a/quest/src/core/parser.cpp b/quest/src/core/parser.cpp
@@ -22,6 +22,7 @@
 #include <string>
 #include <sstream>
 #include <fstream>
+#include <cstdlib>
 #include <stdexcept>
 #include <algorithm>
 
@@ -443,3 +444,30 @@ string parser_loadFile(string fn) {
     buffer << file.rdbuf();
     return buffer.str();
 }
+
+
+
+/*
+ * ENVIRONMENT VARIABLES
+ */
+
+
+bool parser_isStrEmpty(const char* str) {
+
+    // str can be unallocated or empty, but not e.g. whitespace
+    return (str == nullptr) || (str[0] == '\0');
+}
+
+
+bool parser_validateAndParseOptionalBoolEnvVar(string varName, bool defaultVal, const char* caller) {
+
+    const char* varStr = std::getenv(varName.c_str());
+
+    // permit specifying no or empty environment variable (triggering default)
+    if (parser_isStrEmpty(varStr))
+        return defaultVal;
+
+    // otherwise it must be precisely 0 or 1 without whitespace
+    validate_envVarIsBoolean(varName, varStr, caller);
+    return (varStr[0] == '0')? 0 : 1;
+}
diff --git a/quest/src/core/parser.hpp b/quest/src/core/parser.hpp
@@ -42,5 +42,13 @@ bool parser_canReadFile(string fn);
 string parser_loadFile(string fn);
 
 
+/*
+ * ENVIRONMENT VARIABLES
+ */
+
+bool parser_isStrEmpty(const char* str);
+
+bool parser_validateAndParseOptionalBoolEnvVar(string varName, bool defaultVal, const char* caller);
+
 
 #endif // PARSER_HPP
diff --git a/quest/src/core/validation.cpp b/quest/src/core/validation.cpp
@@ -31,6 +31,7 @@
 #include <algorithm>
 #include <iostream>
 #include <cstdlib>
+#include <cstring>
 #include <string>
 #include <vector>
 #include <map>
@@ -1066,6 +1067,19 @@ namespace report {
     string TEMP_ALLOC_FAILED =
         "A temporary allocation of ${NUM_ELEMS} elements (each of ${NUM_BYTES_PER_ELEM} bytes) failed, possibly because of insufficient memory.";
 
+
+    /*
+     * ENVIRONMENT VARIABLES
+     */
+
+    string COMPULSORY_ENV_VAR_WAS_NOT_SPECIFIED_OR_EMPTY =
+        "A compulsory (but alas here unspecified) environment variable was not set, or was set to an empty string.";
+
+    string INVALID_BOOLEAN_ENVIRONMENT_VARIABLE =
+        "A boolean environment variable (alas here unspecified) was given a value other than '0' or '1'.";
+
+    string INVALID_PERMIT_NODES_TO_SHARE_GPU_ENV_VAR =
+        "The optional, boolean PERMIT_NODES_TO_SHARE_GPU environment variable was specified to a value other than '', '0' or '1'.";
 }
 
 
@@ -1364,13 +1378,8 @@ void validate_newEnvDistributedBetweenPower2Nodes(const char* caller) {
 
 void validate_newEnvNodesEachHaveUniqueGpu(const char* caller) {
 
-    // this validation can be disabled for debugging/dev purposes
-    // (caller should explicitly check this preprocessor too for clarity)
-    if (PERMIT_NODES_TO_SHARE_GPU)
-        return;
-
-    bool uniqueGpus = ! gpu_areAnyNodesBoundToSameGpu();
-    assertAllNodesAgreeThat(uniqueGpus, report::MULTIPLE_NODES_BOUND_TO_SAME_GPU, caller);
+    bool sharedGpus = gpu_areAnyNodesBoundToSameGpu();
+    assertAllNodesAgreeThat(!sharedGpus, report::MULTIPLE_NODES_BOUND_TO_SAME_GPU, caller);
 }
 
 void validate_gpuIsCuQuantumCompatible(const char* caller) {
@@ -4165,3 +4174,28 @@ void validate_tempAllocSucceeded(bool succeeded, qindex numElems, qindex numByte
 
     assertThat(succeeded, report::TEMP_ALLOC_FAILED, vars, caller);
 }
+
+
+
+/*
+ * ENVIRONMENT VARIABLES
+ */
+
+void validate_envVarIsBoolean(string varName, const char* varStr, const char* caller) {
+
+    // empty non-compulsory environment vars never reach this validation function
+    assertThat(!parser_isStrEmpty(varStr), report::COMPULSORY_ENV_VAR_WAS_NOT_SPECIFIED_OR_EMPTY, caller);
+
+    // value must be a single 0 or 1 character (below expr works even when str has no terminal)
+    bool isValid = (varStr[0] == '0' || varStr[0] == '1') && (varStr[1] == '\0');
+
+    /// @todo include 'varName' in printed vars once tokenSubs can support strings 
+    // hackily ensure "PERMIT_NODES_TO_SHARE_GPU" is featured in the error message as
+    // the only currently supported environment variable and is important to specify
+    string errMsg = (varName == "PERMIT_NODES_TO_SHARE_GPU")?
+        report::INVALID_PERMIT_NODES_TO_SHARE_GPU_ENV_VAR :
+        report::INVALID_BOOLEAN_ENVIRONMENT_VARIABLE;
+
+    /// @todo include 'varStr' in printed vars once tokenSubs can support strings
+    assertThat(isValid, errMsg, caller);
+}
diff --git a/quest/src/core/validation.hpp b/quest/src/core/validation.hpp
@@ -488,7 +488,6 @@ void validate_densMatrExpecDiagMatrValueIsReal(qcomp value, qcomp exponent, cons
  * PARTIAL TRACE
  */
 
-
 void validate_quregCanBeReduced(Qureg qureg, int numTraceQubits, const char* caller);
 
 void validate_quregCanBeSetToReducedDensMatr(Qureg out, Qureg in, int numTraceQubits, const char* caller);
@@ -511,4 +510,12 @@ void validate_tempAllocSucceeded(bool succeeded, qindex numElems, qindex numByte
 
 
 
+/*
+ * ENVIRONMENT VARIABLES
+ */
+
+void validate_envVarIsBoolean(std::string varName, const char* varStr, const char* caller);
+
+
+
 #endif // VALIDATION_HPP
diff --git a/tests/main.cpp b/tests/main.cpp
@@ -88,13 +88,14 @@ class startListener : public Catch::EventListenerBase {
         QuESTEnv env = getQuESTEnv();
         std::cout << std::endl;
         std::cout << "QuEST execution environment:" << std::endl;
-        std::cout << "  precision:       " << FLOAT_PRECISION        << std::endl;
-        std::cout << "  multithreaded:   " << env.isMultithreaded    << std::endl;
-        std::cout << "  distributed:     " << env.isDistributed      << std::endl;
-        std::cout << "  GPU-accelerated: " << env.isGpuAccelerated   << std::endl;
-        std::cout << "  cuQuantum:       " << env.isCuQuantumEnabled << std::endl;
-        std::cout << "  num nodes:       " << env.numNodes           << std::endl;
-        std::cout << "  num qubits:      " << getNumCachedQubits()   << std::endl;
+        std::cout << "  precision:       " << FLOAT_PRECISION         << std::endl;
+        std::cout << "  multithreaded:   " << env.isMultithreaded     << std::endl;
+        std::cout << "  distributed:     " << env.isDistributed       << std::endl;
+        std::cout << "  GPU-accelerated: " << env.isGpuAccelerated    << std::endl;
+        std::cout << "  GPU-sharing ok:  " << env.isGpuSharingEnabled << std::endl;
+        std::cout << "  cuQuantum:       " << env.isCuQuantumEnabled  << std::endl;
+        std::cout << "  num nodes:       " << env.numNodes            << std::endl;
+        std::cout << "  num qubits:      " << getNumCachedQubits()    << std::endl;
         std::cout << "  num qubit perms: " << TEST_MAX_NUM_QUBIT_PERMUTATIONS << std::endl;
         std::cout << std::endl;