-
Couldn't load subscription status.
- Fork 165
Description
In v4, the core algorithms of practically all QuEST routines were fundamentally changed, as described in this manuscript. For comparison, consider this v3.6 routine:
QuEST/QuEST/src/CPU/QuEST_cpu.c
Lines 2196 to 2262 in cc19264
| void statevec_multiControlledUnitaryLocal( | |
| Qureg qureg, int targetQubit, | |
| long long int ctrlQubitsMask, long long int ctrlFlipMask, | |
| ComplexMatrix2 u) | |
| { | |
| long long int sizeBlock, sizeHalfBlock; | |
| long long int thisBlock, // current block | |
| indexUp,indexLo; // current index and corresponding index in lower half block | |
| qreal stateRealUp,stateRealLo,stateImagUp,stateImagLo; | |
| long long int thisTask; | |
| long long int numTasks=qureg.numAmpsPerChunk>>1; | |
| long long int chunkSize=qureg.numAmpsPerChunk; | |
| long long int chunkId=qureg.chunkId; | |
| // set dimensions | |
| sizeHalfBlock = 1LL << targetQubit; | |
| sizeBlock = 2LL * sizeHalfBlock; | |
| // Can't use qureg.stateVec as a private OMP var | |
| qreal *stateVecReal = qureg.stateVec.real; | |
| qreal *stateVecImag = qureg.stateVec.imag; | |
| # ifdef _OPENMP | |
| # pragma omp parallel \ | |
| default (none) \ | |
| shared (sizeBlock,sizeHalfBlock, stateVecReal,stateVecImag, u, ctrlQubitsMask,ctrlFlipMask, \ | |
| numTasks,chunkId,chunkSize) \ | |
| private (thisTask,thisBlock, indexUp,indexLo, stateRealUp,stateImagUp,stateRealLo,stateImagLo) | |
| # endif | |
| { | |
| # ifdef _OPENMP | |
| # pragma omp for schedule (static) | |
| # endif | |
| for (thisTask=0; thisTask<numTasks; thisTask++) { | |
| thisBlock = thisTask / sizeHalfBlock; | |
| indexUp = thisBlock*sizeBlock + thisTask%sizeHalfBlock; | |
| indexLo = indexUp + sizeHalfBlock; | |
| // take the basis index, flip the designated (XOR) 'control' bits, AND with the controls. | |
| // if this equals the control mask, the control qubits have the desired values in the basis index | |
| if (ctrlQubitsMask == (ctrlQubitsMask & ((indexUp+chunkId*chunkSize) ^ ctrlFlipMask))) { | |
| // store current state vector values in temp variables | |
| stateRealUp = stateVecReal[indexUp]; | |
| stateImagUp = stateVecImag[indexUp]; | |
| stateRealLo = stateVecReal[indexLo]; | |
| stateImagLo = stateVecImag[indexLo]; | |
| // state[indexUp] = u00 * state[indexUp] + u01 * state[indexLo] | |
| stateVecReal[indexUp] = u.real[0][0]*stateRealUp - u.imag[0][0]*stateImagUp | |
| + u.real[0][1]*stateRealLo - u.imag[0][1]*stateImagLo; | |
| stateVecImag[indexUp] = u.real[0][0]*stateImagUp + u.imag[0][0]*stateRealUp | |
| + u.real[0][1]*stateImagLo + u.imag[0][1]*stateRealLo; | |
| // state[indexLo] = u10 * state[indexUp] + u11 * state[indexLo] | |
| stateVecReal[indexLo] = u.real[1][0]*stateRealUp - u.imag[1][0]*stateImagUp | |
| + u.real[1][1]*stateRealLo - u.imag[1][1]*stateImagLo; | |
| stateVecImag[indexLo] = u.real[1][0]*stateImagUp + u.imag[1][0]*stateRealUp | |
| + u.real[1][1]*stateImagLo + u.imag[1][1]*stateRealLo; | |
| } | |
| } | |
| } | |
| } |
to its v4 equivalent:
QuEST/quest/src/cpu/cpu_subroutines.cpp
Lines 361 to 390 in 4d44ec8
| template <int NumCtrls> | |
| void cpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ, CompMatr1 matr) { | |
| assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls); | |
| // each control qubit halves the needed iterations, and each iteration modifies two amplitudes | |
| qindex numIts = qureg.numAmpsPerNode / powerOf2(ctrls.size() + 1); | |
| auto sortedQubits = util_getSorted(ctrls, {targ}); | |
| auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {0}); | |
| // use template param to compile-time unroll loop in insertBits() | |
| SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrls.size()); | |
| int numQubitBits = numCtrlBits + 1; | |
| #pragma omp parallel for if(qureg.isMultithreaded) | |
| for (qindex n=0; n<numIts; n++) { | |
| // i0 = nth local index where ctrl bits are in specified states and targ is 0 | |
| qindex i0 = insertBitsWithMaskedValues(n, sortedQubits.data(), numQubitBits, qubitStateMask); | |
| qindex i1 = flipBit(i0, targ); | |
| // note the two amplitudes are likely strided and not adjacent (separated by 2^t) | |
| qcomp amp0 = qureg.cpuAmps[i0]; | |
| qcomp amp1 = qureg.cpuAmps[i1]; | |
| qureg.cpuAmps[i0] = matr.elems[0][0]*amp0 + matr.elems[0][1]*amp1; | |
| qureg.cpuAmps[i1] = matr.elems[1][0]*amp0 + matr.elems[1][1]*amp1; | |
| } | |
| } |
The v4 optimisations focus on avoiding redundant enumeration, avoiding branching within hot loops, and replacing integer arithmetic (which informs the updated amplitude indices) with bitwise operations. The code was also cleaned to remove default and superfluous OpenMP boilerplate. As a consequence, the order of accessed amplitudes in the state has substantially changed: from conditional albeit contiguous access to unconditional but potentially unpredictable access (from the perspective of a compiler or branch-predictor).
These changes are a benefit to GPU simulation, which is detrimentally affected by warp-level looped branching. Alas, despite isolated testing during development, some changed functions may have regressed CPU performance, as per #631 (and potentially #616). This might be remedied in multithreaded, multi-NUMA-node settings by distributing over nodes, or by the impending memory-pinning support. But in any case, the potential damage to single-CPU performance remains.
TODO: get to the bottom of this! A quick isolated test shows that v4's bitwise, branch-free enumeration still beats v3's logic (at least on my Apple M1 Max). Figure out which functions have regressed, and why!