@@ -220,17 +220,20 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
220220 } else
221221 lgredFct (GlobalBuffer, ModBockId, reduce_data);
222222
223+ // Propagate the memory writes above to the world.
224+ fence::kernel (atomic::release);
225+
223226 // Increment team counter.
224227 // This counter is incremented by all teams in the current
225- // BUFFER_SIZE chunk.
228+ // num_of_records chunk.
226229 ChunkTeamCount = atomic::inc (&Cnt, num_of_records - 1u , atomic::seq_cst,
227230 atomic::MemScopeTy::device);
228231 }
229- // Synchronize
232+
233+ // Synchronize only in SPMD mode: in generic mode all but one thread are in
234+ // the state machine.
230235 if (mapping::isSPMDMode ())
231236 synchronize::threadsAligned (atomic::acq_rel);
232- else
233- fence::kernel (atomic::acq_rel);
234237
235238 // reduce_data is global or shared so before being reduced within the
236239 // warp we need to bring it in local memory:
@@ -257,6 +260,9 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
257260 // Check if this is the very last team.
258261 unsigned NumRecs = kmpcMin (NumTeams, uint32_t (num_of_records));
259262 if (ChunkTeamCount == NumTeams - Bound - 1 ) {
263+ // Ensure we see the global memory writes by other teams
264+ fence::kernel (atomic::aquire);
265+
260266 //
261267 // Last team processing.
262268 //
0 commit comments