From 2c8db515b5b38a726900611bdc18c92fb4a83034 Mon Sep 17 00:00:00 2001 From: Wenduo Wang Date: Wed, 6 Mar 2024 07:35:48 -0800 Subject: [PATCH] mtl/ofi: bail gracefully if completion error context is null According to the libfabric API, fi_cq_readerr also reports errors for requests that did not require completion, and associates a null context with the error entry. This patch adds a null check and bails gracefully so as to avoid invalid memory access. bot:notacherrypick Signed-off-by: Wenduo Wang --- ompi/mca/mtl/ofi/mtl_ofi.h | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/ompi/mca/mtl/ofi/mtl_ofi.h b/ompi/mca/mtl/ofi/mtl_ofi.h index 53f6a34362e..2e7b238dfa3 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi.h +++ b/ompi/mca/mtl/ofi/mtl_ofi.h @@ -125,8 +125,7 @@ ompi_mtl_ofi_context_progress(int ctxt_id) opal_output(0, "%s:%d: Error returned by request event callback: %zd.\n" "*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n", __FILE__, __LINE__, ret); - fflush(stderr); - exit(1); + goto bail; } } } @@ -148,16 +147,23 @@ ompi_mtl_ofi_context_progress(int ctxt_id) * thread fetches the entry while others get -FI_EAGAIN * indicating an empty queue, which is not erroneous. */ - if (ret == -FI_EAGAIN) + if (ret == -FI_EAGAIN) { return count; + } opal_output(0, "%s:%d: Error returned from fi_cq_readerr: %s(%zd).\n" "*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n", __FILE__, __LINE__, fi_strerror(-ret), ret); - fflush(stderr); - exit(1); + goto bail; + } + + if (!error.op_context) { + opal_output(0, "%s:%d: Error returned from fi_cq_readerr with null context. 
" + "Completion flags: %016lx\n" + "*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n", + __FILE__, __LINE__, error.flags); + goto bail; } - assert(error.op_context); ofi_req = TO_OFI_REQ(error.op_context); assert(ofi_req); ret = ofi_req->error_callback(&error, ofi_req); @@ -165,8 +171,7 @@ ompi_mtl_ofi_context_progress(int ctxt_id) opal_output(0, "%s:%d: Error returned by request error callback: %zd.\n" "*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n", __FILE__, __LINE__, ret); - fflush(stderr); - exit(1); + goto bail; } } else { if (ret == -FI_EAGAIN || ret == -EINTR) { @@ -175,13 +180,16 @@ ompi_mtl_ofi_context_progress(int ctxt_id) opal_output(0, "%s:%d: Error returned from fi_cq_read: %s(%zd).\n" "*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n", __FILE__, __LINE__, fi_strerror(-ret), ret); - fflush(stderr); - exit(1); + goto bail; } } } return count; + +bail: + fflush(stderr); + exit(1); } __opal_attribute_always_inline__ static inline int