Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .github/workflows/test_paid.yml
Original file line number Diff line number Diff line change
Expand Up @@ -257,12 +257,15 @@ jobs:
-DCMAKE_CUDA_ARCHITECTURES=${{ env.cuda_arch }}
-DTEST_ALL_DEPLOYMENTS=${{ env.test_all_deploys }}
-DTEST_NUM_MIXED_DEPLOYMENT_REPETITIONS=${{ env.test_repetitions }}
-DPERMIT_NODES_TO_SHARE_GPU=${{ env.mpi_share_gpu }}
-DCMAKE_CXX_FLAGS=${{ matrix.mpi == 'ON' && matrix.cuda == 'ON' && '-fno-lto' || '' }}

- name: Compile
run: cmake --build ${{ env.build_dir }} --parallel

# permit use of single GPU by multiple MPI processes (degrades performance)
- name: Set env-var to permit GPU sharing
run: echo "PERMIT_NODES_TO_SHARE_GPU=${{ env.mpi_share_gpu }}" >> $GITHUB_ENV

# cannot use ctests when distributed, grr!
- name: Run GPU + distributed v4 mixed tests (4 nodes sharing 1 GPU)
run: |
Expand Down
19 changes: 1 addition & 18 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -231,15 +231,6 @@ if ((ENABLE_CUDA OR ENABLE_HIP) AND FLOAT_PRECISION STREQUAL 4)
message(FATAL_ERROR "Quad precision is not supported on GPU. Please disable GPU acceleration or lower precision.")
endif()

# Boolean cache option (default OFF) controlling whether multiple distributed
# nodes may share a single GPU; its help string notes the performance cost.
option(
PERMIT_NODES_TO_SHARE_GPU
"Whether to permit multiple distributed nodes to share a single GPU at the detriment of performance. Turned OFF by default."
OFF
)
# Only report the option's value when distribution is combined with a GPU
# backend (CUDA or HIP), so that other configurations are not told about it.
if (ENABLE_DISTRIBUTION AND (ENABLE_CUDA OR ENABLE_HIP))
message(STATUS "Permitting nodes to share GPUs is turned ${PERMIT_NODES_TO_SHARE_GPU}. Set PERMIT_NODES_TO_SHARE_GPU to modify.")
endif()

# Deprecated API
option(
ENABLE_DEPRECATED_API
Expand Down Expand Up @@ -318,7 +309,7 @@ if (ENABLE_MULTITHREADING)
if (NOT OpenMP_FOUND)
set(ErrorMsg "Could not find OpenMP, necessary for enabling multithreading.")
if (APPLE AND CMAKE_CXX_COMPILER_ID MATCHES "Clang")
string(APPEND ErrorMsg " Try first calling `brew install libomp` then `export OpenMP_ROOT=$(brew --prefix)/opt/libomp`")
string(APPEND ErrorMsg " Try first calling \n\tbrew install libomp\nthen\n\texport OpenMP_ROOT=$(brew --prefix)/opt/libomp")
endif()
message(FATAL_ERROR ${ErrorMsg})
endif()
Expand Down Expand Up @@ -434,14 +425,6 @@ else()
endif()


# When distribution is combined with a GPU backend (CUDA or HIP), pass the
# PERMIT_NODES_TO_SHARE_GPU option to the QuEST target's own sources as a
# preprocessor definition. The generator expression normalises any CMake
# boolean spelling (ON/OFF, TRUE/FALSE, ...) to a literal 1 or 0. Being
# PRIVATE, the definition is not propagated to targets which link to QuEST.
if (ENABLE_DISTRIBUTION AND (ENABLE_CUDA OR ENABLE_HIP))
target_compile_definitions(
QuEST PRIVATE
PERMIT_NODES_TO_SHARE_GPU=$<IF:$<BOOL:${PERMIT_NODES_TO_SHARE_GPU}>,1,0>
)
endif()


# add math library
if (NOT MSVC)
target_link_libraries(QuEST PRIVATE ${MATH_LIBRARY})
Expand Down
24 changes: 20 additions & 4 deletions docs/launch.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,18 +22,19 @@ Launching your [compiled](compile.md) QuEST application can be as straightforwar
> - <a href="#launch_tests">Tests</a>
> * <a href="#launch_v4">v4</a>
> * <a href="#launch_v3">v3</a>
> - <a href="#launch_configuring">Configuring</a>
> - <a href="#launch_multithreading">Multithreading</a>
> * <a href="#launch_choosing-threads">Choosing threads</a>
> * <a href="#launch_monitoring-utilisation">Monitoring utilisation</a>
> * <a href="#launch_improving-performance">Improving performance</a>
> - <a href="#launch_gpu-acceleration">GPU-acceleration</a>
> * <a href="#launch_launching">Launching</a>
> * <a href="#launch_monitoring">Monitoring</a>
> * <a href="#launch_configuring">Configuring</a>
> * <a href="#launch_configuring-1">Configuring</a>
> * <a href="#launch_benchmarking">Benchmarking</a>
> - <a href="#launch_distribution">Distribution</a>
> * <a href="#launch_launching-1">Launching</a>
> * <a href="#launch_configuring-1">Configuring</a>
> * <a href="#launch_configuring-2">Configuring</a>
> * <a href="#launch_benchmarking-1">Benchmarking</a>
> - <a href="#launch_multi-gpu">Multi-GPU</a>
> - <a href="#launch_supercomputers">Supercomputers</a>
Expand Down Expand Up @@ -243,6 +244,21 @@ ctest



---------------------

<!-- permit doxygen to reference section -->
<a id="launch_configuring"></a>

## Configuring

QuEST execution can be configured prior to runtime using the below [environment variables](https://en.wikipedia.org/wiki/Environment_variable).

- [`PERMIT_NODES_TO_SHARE_GPU`](https://quest-kit.github.io/QuEST/group__modes.html#ga7e12922138caa68ddaa6221e40f62dda)
- [`DEFAULT_VALIDATION_EPSILON`](https://quest-kit.github.io/QuEST/group__modes.html#ga55810d6f3d23de810cd9b12a2bbb8cc2)




---------------------


Expand Down Expand Up @@ -429,7 +445,7 @@ Usage of GPU-acceleration can be (inadvisably) forced using [`createForcedQureg(


<!-- permit doxygen to reference section -->
<a id="launch_configuring"></a>
<a id="launch_configuring-1"></a>

### Configuring

Expand Down Expand Up @@ -514,7 +530,7 @@ mpirun -np 1024 --oversubscribe ./mytests


<!-- permit doxygen to reference section -->
<a id="launch_configuring-1"></a>
<a id="launch_configuring-2"></a>

### Configuring

Expand Down
3 changes: 3 additions & 0 deletions quest/include/environment.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ typedef struct {
// deployment modes which cannot be directly changed after compilation
int isCuQuantumEnabled;

// deployment configurations which can be changed via environment variables
int isGpuSharingEnabled;

// distributed configuration
int rank;
int numNodes;
Expand Down
76 changes: 67 additions & 9 deletions quest/include/modes.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,6 @@

// define optional-macro defaults (mostly to list them)

#ifndef PERMIT_NODES_TO_SHARE_GPU
#define PERMIT_NODES_TO_SHARE_GPU 0
#endif

#ifndef INCLUDE_DEPRECATED_FUNCTIONS
#define INCLUDE_DEPRECATED_FUNCTIONS 0
#endif
Expand All @@ -93,11 +89,6 @@
#if 0


/// @notyetdoced
/// @macrodoc
const int PERMIT_NODES_TO_SHARE_GPU = 0;


/// @notyetdoced
/// @macrodoc
const int INCLUDE_DEPRECATED_FUNCTIONS = 0;
Expand All @@ -112,6 +103,73 @@



// document environment variables

// spoof env-vars as consts to doc (hackily and hopefully temporarily)
#if 0


/** @envvardoc
*
* Specifies whether to permit multiple MPI processes to deploy to the same GPU.
*
* @attention
* This environment variable has no effect when either (or both) of distribution or
* GPU-acceleration are disabled.
*
* In multi-GPU execution, which combines distribution with GPU-acceleration, it is
* prudent to assign each GPU to at most one MPI process in order to avoid superfluous
* slowdown. Hence by default, initQuESTEnv() will forbid assigning multiple MPI processes
* to the same GPU. This environment variable can be set to `1` to disable this validation,
* permitting sharing of a single GPU, as is often useful for debugging or unit testing
* (for example, testing multi-GPU execution when only a single GPU is available).
*
* @warning
* Permitting GPU sharing may cause unintended behaviour when additionally using cuQuantum.
*
* @envvarvalues
* - forbid sharing: @p 0, @p '0', @p '' (or left unspecified)
* - permit sharing: @p 1, @p '1'
*
* @author Tyson Jones
*/
const int PERMIT_NODES_TO_SHARE_GPU = 0;


/** @envvardoc
*
* Specifies the default validation epsilon.
*
* Specifying `DEFAULT_VALIDATION_EPSILON` to a positive, real number overrides the
* precision-specific default (`1E-5`, `1E-12`, `1E-13` for single, double and quadruple
* precision respectively). The specified epsilon is used by QuEST for numerical validation
* unless overridden at runtime via setValidationEpsilon(), in which case it can be
* restored to that specified by this environment variable using setValidationEpsilonToDefault().
*
* @envvarvalues
* - setting @p DEFAULT_VALIDATION_EPSILON=0 disables numerical validation, as if the value
* were instead infinity.
* - setting @p DEFAULT_VALIDATION_EPSILON='' is equivalent to _not_ specifying the variable,
* adopting instead the precision-specific default above.
* - setting @p DEFAULT_VALIDATION_EPSILON=x where `x` is a positive, valid `qreal` in any
* format accepted by `C` or `C++` (e.g. `0.01`, `1E-2`, `+1e-2`) will use `x` as the
* default validation epsilon.
*
* @constraints
* The function initQuESTEnv() will throw a validation error unless:
* - The specified epsilon is `0` or positive.
* - The specified epsilon does not exceed the maximum or minimum value which can be stored
* in a `qreal`, which is specific to its precision.
*
* @author Tyson Jones
*/
const qreal DEFAULT_VALIDATION_EPSILON = 0;


#endif



// user flags for choosing automatic deployment; only accessible by C++
// backend and C++ users; C users must hardcode -1

Expand Down
31 changes: 8 additions & 23 deletions quest/include/precision.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,34 +121,19 @@


/*
* RE-CONFIGURABLE DEFAULT VALIDATION PRECISION
* DEFAULT VALIDATION PRECISION
*
* which is compile-time overridable by pre-defining DEFAULT_VALIDATION_EPSILON (e.g.
* in user code before importing QuEST, or passed as a preprocessor constant by the
* compiler using argument -D), and runtime overridable using setValidationEpsilon()
* which is pre-run-time overridable by specifying the corresponding environment variable.
*/

#ifndef DEFAULT_VALIDATION_EPSILON

#if FLOAT_PRECISION == 1
#define DEFAULT_VALIDATION_EPSILON 1E-5

#elif FLOAT_PRECISION == 2
#define DEFAULT_VALIDATION_EPSILON 1E-12

#elif FLOAT_PRECISION == 4
#define DEFAULT_VALIDATION_EPSILON 1E-13

#endif

#endif
#if FLOAT_PRECISION == 1
#define UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-5

// spoofing above macros as typedefs and consts to doc
#if 0
#elif FLOAT_PRECISION == 2
#define UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-12

/// @notyetdoced
/// @macrodoc
const qreal DEFAULT_VALIDATION_EPSILON = 1E-12;
#elif FLOAT_PRECISION == 4
#define UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-13

#endif

Expand Down
40 changes: 26 additions & 14 deletions quest/src/api/environment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@

#include "quest/src/core/errors.hpp"
#include "quest/src/core/memory.hpp"
#include "quest/src/core/parser.hpp"
#include "quest/src/core/printer.hpp"
#include "quest/src/core/envvars.hpp"
#include "quest/src/core/autodeployer.hpp"
#include "quest/src/core/validation.hpp"
#include "quest/src/core/randomiser.hpp"
Expand Down Expand Up @@ -75,6 +77,9 @@ void validateAndInitCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMulti
// this leads to undefined behaviour in distributed mode, as per the MPI
validate_envNeverInit(globalEnvPtr != nullptr, hasEnvBeenFinalized, caller);

envvars_validateAndLoadEnvVars(caller);
validateconfig_setEpsilonToDefault();

// ensure the chosen deployment is compiled and supported by hardware.
// note that these error messages will be printed by every node because
// validation occurs before comm_init() below, so all processes spawned
Expand Down Expand Up @@ -102,12 +107,17 @@ void validateAndInitCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMulti
if (useGpuAccel)
gpu_bindLocalGPUsToNodes();

// each MPI process must use a unique GPU. This is critical when
// initializing cuQuantum, so we don't re-init cuStateVec on any
// particular GPU (causing runtime error), but still ensures we
// keep good performance in our custom backend GPU code; there is
// no reason to use multi-nodes-per-GPU except for dev/debugging.
if (useGpuAccel && useDistrib && ! PERMIT_NODES_TO_SHARE_GPU)
// consult environment variable to decide whether to allow GPU sharing
// (default = false) which informs whether below validation is triggered
bool permitGpuSharing = envvars_getWhetherGpuSharingIsPermitted();

// each MPI process should ordinarily use a unique GPU. This is
// critical when initializing cuQuantum so that we don't re-init
// cuStateVec on any particular GPU (which can apparently cause a
// so-far-unwitnessed runtime error), but is otherwise essential
// for good performance. GPU sharing is useful for unit testing,
// however, since it permits a single GPU to test CUDA+MPI deployment
if (useGpuAccel && useDistrib && ! permitGpuSharing)
validate_newEnvNodesEachHaveUniqueGpu(caller);

/// @todo
Expand All @@ -132,10 +142,11 @@ void validateAndInitCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMulti
error_allocOfQuESTEnvFailed();

// bind deployment info to global instance
globalEnvPtr->isMultithreaded = useMultithread;
globalEnvPtr->isGpuAccelerated = useGpuAccel;
globalEnvPtr->isDistributed = useDistrib;
globalEnvPtr->isCuQuantumEnabled = useCuQuantum;
globalEnvPtr->isMultithreaded = useMultithread;
globalEnvPtr->isGpuAccelerated = useGpuAccel;
globalEnvPtr->isDistributed = useDistrib;
globalEnvPtr->isCuQuantumEnabled = useCuQuantum;
globalEnvPtr->isGpuSharingEnabled = permitGpuSharing;

// bind distributed info
globalEnvPtr->rank = (useDistrib)? comm_getRank() : 0;
Expand Down Expand Up @@ -188,10 +199,11 @@ void printDeploymentInfo() {

print_table(
"deployment", {
{"isMpiEnabled", globalEnvPtr->isDistributed},
{"isGpuEnabled", globalEnvPtr->isGpuAccelerated},
{"isOmpEnabled", globalEnvPtr->isMultithreaded},
{"isCuQuantumEnabled", globalEnvPtr->isCuQuantumEnabled},
{"isMpiEnabled", globalEnvPtr->isDistributed},
{"isGpuEnabled", globalEnvPtr->isGpuAccelerated},
{"isOmpEnabled", globalEnvPtr->isMultithreaded},
{"isCuQuantumEnabled", globalEnvPtr->isCuQuantumEnabled},
{"isGpuSharingEnabled", globalEnvPtr->isGpuSharingEnabled},
});
}

Expand Down
6 changes: 6 additions & 0 deletions quest/src/comm/comm_routines.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,12 @@ using std::vector;
*
* - look into UCX CUDA multi-rail:
* https://docs.nvidia.com/networking/display/hpcxv215/unified+communication+-+x+framework+library#src-119764120_UnifiedCommunicationXFrameworkLibrary-Multi-RailMulti-Rail
*
* - by default, we validate to prevent sharing a GPU between multiple MPI processes since it is
* easy to do unintentionally yet is rarely necessary (outside of unit testing) and can severely
* degrade performance. If we motivated a strong non-testing use-case for this however, we could
* improve performance through use of CUDA's Multi-Process Service (MPS) which will prevent
* serialisation of memcpy to distinct memory partitions and improve kernel scheduling.
*/


Expand Down
1 change: 1 addition & 0 deletions quest/src/core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ target_sources(QuEST
PRIVATE
accelerator.cpp
autodeployer.cpp
envvars.cpp
errors.cpp
localiser.cpp
memory.cpp
Expand Down
Loading