Reduction specialization llvm#70766

jdoerfert · jdoerfert · commit ab9157c871e8 · 2024-04-08T14:50:20.000-07:00
diff --git a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
@@ -175,11 +175,116 @@ int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
   return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct);
 }
 
+/// Teams reduction like _v2, but specialized on the assumption that there are
+/// at most num_of_records (by default 1024) teams, i.e. one slot per team.
+int32_t __kmpc_nvptx_teams_reduce_nowait_v3(
+    IdentTy *Loc, void *__restrict__ GlobalBuffer, uint32_t num_of_records,
+    void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct,
+    ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct,
+    ListGlobalFnTy glredFct) {
+  // In generic mode only the main thread proceeds (as thread 0); others exit.
+  uint32_t ThreadId = mapping::getThreadIdInBlock();
+  if (mapping::isGenericMode()) {
+    if (!mapping::isMainThreadInGenericMode())
+      return 0;
+    ThreadId = 0;
+  }
+
+  uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt;
+
+  // Outside generic mode all threads participate in the teams reduction; in
+  // generic mode only the team main does, as workers wait for parallel work.
+  // ChunkTeamCount hands the counter value read by thread 0 to its team.
+  uint32_t NumThreads = omp_get_num_threads();
+  uint32_t TeamId = omp_get_team_num();
+  uint32_t NumTeams = omp_get_num_teams();
+  static unsigned SHARED(ChunkTeamCount);
+
+  // No team has to wait for a free buffer slot here: the caller only
+  // dispatches to this function if the number of teams is less or equal
+  // to the number of slots in the buffer, so TeamId selects the slot directly.
+  bool IsMain = (ThreadId == 0);
+
+  if (IsMain) {
+    lgcpyFct(GlobalBuffer, TeamId, reduce_data);
+
+    // Propagate the memory writes above to the world.
+    fence::kernel(atomic::release);
+
+    // Increment team counter and remember the value it had before.
+    // NOTE(review): assumes atomic::inc returns the old value and wraps to 0
+    // once it reaches NumTeams - 1 (atomicInc-style) -- confirm.
+    ChunkTeamCount = atomic::inc(&Cnt, NumTeams - 1, atomic::seq_cst,
+                                 atomic::MemScopeTy::device);
+  }
+
+  // Synchronize in SPMD mode so every thread sees ChunkTeamCount. In generic
+  // mode all threads but the main one are in the state machine.
+  if (mapping::isSPMDMode())
+    synchronize::threadsAligned(atomic::acq_rel);
+
+  // The team that arrives last combines the per-team values in the buffer.
+  // Each of its participating threads keeps its values in reduce_data and:
+  //      1. loads one buffer slot and folds in further slots (strided).
+  //      2. does reduction within each warp, then across warps.
+  //      3. thread 0 returns 1 so the final result can be written to the
+  //         main reduction variable; every other thread returns 0.
+
+  // Only the team that observed the final counter value continues.
+  if (ChunkTeamCount != NumTeams - 1)
+    return 0;
+
+  if (ThreadId >= NumTeams)
+    return 0;
+
+  // Last team processing: use at most NumTeams threads (via roundToWarpsize).
+  NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumTeams));
+  if (ThreadId >= NumThreads)
+    return 0;
+
+  // Ensure we see the global memory writes by other teams.
+  fence::kernel(atomic::aquire);
+
+  // Load slot ThreadId, then reduce in every NumThreads-th slot after it.
+  glcpyFct(GlobalBuffer, ThreadId, reduce_data);
+  for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads)
+    glredFct(GlobalBuffer, i, reduce_data);
+
+  // Reduce within each warp to the warp main.
+  if (NumThreads > 1)
+    gpu_regular_warp_reduce(reduce_data, shflFct);
+
+  uint32_t ActiveThreads = kmpcMin(NumTeams, NumThreads);
+  uint32_t WarpsNeeded =
+      (ActiveThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
+  if (ActiveThreads > mapping::getWarpSize()) {
+    // Gather all the reduced values from each warp
+    // to the first warp, which then reduces them.
+    cpyFct(reduce_data, WarpsNeeded);
+
+    if (mapping::getWarpIdInBlock() == 0)
+      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
+  }
+
+  return IsMain;
+}
+
 int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
     IdentTy *Loc, void *GlobalBuffer, uint32_t num_of_records,
     uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct,
     InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct,
     ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct) {
+
+  // The first check is a compile time constant, the second one a runtime check.
+  // If the first one succeeds we will use the specialized version.
+  if ((state::getKernelEnvironment().Configuration.MaxTeams >= 0 &&
+       state::getKernelEnvironment().Configuration.MaxTeams <= num_of_records &&
+       num_of_records == 1024) ||
+      (omp_get_num_teams() <= num_of_records))
+    return __kmpc_nvptx_teams_reduce_nowait_v3(
+        Loc, GlobalBuffer, num_of_records, reduce_data, shflFct, cpyFct,
+        lgcpyFct, lgredFct, glcpyFct, glredFct);
+
   // Terminate all threads in non-SPMD mode except for the master thread.
   uint32_t ThreadId = mapping::getThreadIdInBlock();
   if (mapping::isGenericMode()) {
diff --git a/openmp/libomptarget/test/api/omp_device_managed_memory.c b/openmp/libomptarget/test/api/omp_device_managed_memory.c
@@ -26,4 +26,5 @@ int main() {
   // CHECK: PASS
   if (sum == N)
     printf("PASS\n");
+  printf("%i : %i\n", sum, N);
 }

Original file line number	Diff line number	Diff line change
`@@ -26,4 +26,5 @@ int main() {`
`26`	`26`	`// CHECK: PASS`
`27`	`27`	`if (sum == N)`
`28`	`28`	`printf("PASS\n");`
	`29`	`+ printf("%i : %i\n", sum, N);`
`29`	`30`	`}`