@@ -178,11 +178,109 @@ int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
      false);
 }
 
+/// Mostly like _v2 but with the built-in assumption that we have no more than
+/// num_of_records (by default 1024) teams.
+int32_t __kmpc_nvptx_teams_reduce_nowait_v3(
+    IdentTy *Loc, int32_t TId, void *__restrict__ GlobalBuffer,
+    uint32_t num_of_records, void *reduce_data, ShuffleReductFnTy shflFct,
+    InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct,
+    ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct) {
+  // Terminate all threads in non-SPMD mode except for the main thread.
+  uint32_t ThreadId = mapping::getThreadIdInBlock();
+  if (mapping::isGenericMode()) {
+    if (!mapping::isMainThreadInGenericMode())
+      return 0;
+    ThreadId = 0;
+  }
+
+  uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt;
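+  // ReductionCnt is a counter provided by the kernel launch environment and
+  // shared by all teams of this kernel launch.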
+
+  // In non-generic mode all workers participate in the teams reduction.
+  // In generic mode only the team main participates in the teams
+  // reduction because the workers are waiting for parallel work.
+  uint32_t NumThreads = omp_get_num_threads();
+  uint32_t TeamId = omp_get_team_num();
+  uint32_t NumTeams = omp_get_num_teams();
+  static unsigned SHARED(ChunkTeamCount);
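+  // SHARED places ChunkTeamCount in team-local shared memory, so the count
+  // written by the main thread below is visible to every thread in the team
+  // after the aligned barrier.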
+
+  // Block progress for teams greater than the current upper
+  // limit. We only ever allow a number of teams less than or equal
+  // to the number of slots in the buffer.
+  bool IsMain = (ThreadId == 0);
+
+  if (IsMain) {
+    lgcpyFct(GlobalBuffer, TeamId, reduce_data);
+
+    // Propagate the memory writes above to the world.
+    fence::kernel(atomic::release);
+
+    // Increment team counter.
+    // This counter is incremented by all teams in the current
+    // BUFFER_SIZE chunk.
+    ChunkTeamCount = atomic::inc(&Cnt, NumTeams, atomic::acq_rel,
+                                 atomic::MemScopeTy::device);
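+    // atomic::inc returns the value of Cnt before the increment, so the last
+    // team to arrive in this reduction observes NumTeams - 1.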
+  }
+
+  // Synchronize in SPMD mode as in generic mode all but one thread is in the
+  // state machine.
+  if (mapping::isSPMDMode())
+    synchronize::threadsAligned(atomic::acq_rel);
+
+  // Each thread will have a local struct containing the values to be
+  // reduced:
+  //      1. do reduction within each warp.
+  //      2. do reduction across warps.
+  //      3. write the final result to the main reduction variable
+  //         by returning 1 in the thread holding the reduction result.
+
+  // Check if this is the very last team.
+  if (ChunkTeamCount != NumTeams - 1)
+    return 0;
+
+  // Last team processing.
+  NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumTeams));
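+  // Only enough threads to cover all team slots, rounded up to a full warp,
+  // take part in the final reduction; the remaining threads are done.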
+  if (ThreadId >= NumThreads)
+    return 0;
+
+  // Ensure we see the global memory writes by other teams.
+  fence::kernel(atomic::aquire);
+
+  // Load from buffer and reduce.
+  glcpyFct(GlobalBuffer, ThreadId, reduce_data);
+  for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads)
+    glredFct(GlobalBuffer, i, reduce_data);
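+  // Each participating thread now holds the partial result of the team slots
+  // ThreadId, ThreadId + NumThreads, ThreadId + 2 * NumThreads, ...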
+
+  // Reduce across warps to the warp main.
+  gpu_regular_warp_reduce(reduce_data, shflFct);
+
+  uint32_t ActiveThreads = kmpcMin(NumTeams, NumThreads);
+  uint32_t WarpsNeeded =
+      (ActiveThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
+  // Gather all the reduced values from each warp
+  // to the first warp.
+  cpyFct(reduce_data, WarpsNeeded);
+
+  if (mapping::getWarpIdInBlock() == 0)
+    gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
+
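+  // Only the main thread of the last team returns 1; the compiler-emitted
+  // caller uses that to store the final result into the reduction variable.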
+  return IsMain;
+}
+
 int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
     IdentTy *Loc, int32_t TId, void *GlobalBuffer, uint32_t num_of_records,
     void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct,
     ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct,
     ListGlobalFnTy glredFct) {
+  // The first check is a compile-time constant, the second one a runtime
+  // check. If either one succeeds we use the specialized version.
+  if ((state::getKernelEnvironment().Configuration.MaxTeams >= 0 &&
+       state::getKernelEnvironment().Configuration.MaxTeams <= num_of_records &&
+       num_of_records == 1024) ||
+      (omp_get_num_teams() <= num_of_records))
+    return __kmpc_nvptx_teams_reduce_nowait_v3(
+        Loc, TId, GlobalBuffer, num_of_records, reduce_data, shflFct, cpyFct,
+        lgcpyFct, lgredFct, glcpyFct, glredFct);
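+  // Otherwise fall through to the original chunked algorithm below, which
+  // also handles more teams than the buffer has slots.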
+
   // Terminate all threads in non-SPMD mode except for the master thread.
   uint32_t ThreadId = mapping::getThreadIdInBlock();
   if (mapping::isGenericMode()) {