From 71f378d28cb89dd80379dbad570849b297594cde Mon Sep 17 00:00:00 2001 From: Luke Robison Date: Wed, 14 Feb 2024 21:14:29 +0000 Subject: [PATCH] btl/smcuda: Add atomic_wmb() before sm_fifo_write This change fixes https://github.com/open-mpi/ompi/issues/12270 Testing on c7g instance type (arm64) confirms this change eliminates hangs and crashes that were previously observed in 1 in 30 runs of the IMB alltoall benchmark. Tested with over 300 runs and no failures. The write memory barrier prevents other CPUs from observing the fifo get updated before they observe the updated contents of the header itself. Without the barrier, uninitialized header contents caused the crashes and invalid data. Signed-off-by: Luke Robison --- opal/mca/btl/smcuda/btl_smcuda_fifo.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/opal/mca/btl/smcuda/btl_smcuda_fifo.h b/opal/mca/btl/smcuda/btl_smcuda_fifo.h index 08ca8acd1ad..278cbeafbf7 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_fifo.h +++ b/opal/mca/btl/smcuda/btl_smcuda_fifo.h @@ -85,6 +85,8 @@ static void add_pending(struct mca_btl_base_endpoint_t *ep, void *data, bool res #define MCA_BTL_SMCUDA_FIFO_WRITE(endpoint_peer, my_smp_rank, peer_smp_rank, hdr, resend, \ retry_pending_sends, rc) \ do { \ + /* memory barrier: ensure writes to the hdr have completed */ \ + opal_atomic_wmb(); \ sm_fifo_t *_fifo = &(mca_btl_smcuda_component.fifo[peer_smp_rank][FIFO_MAP(my_smp_rank)]);\ \ if (retry_pending_sends) { \