From 216af9d17dfce408418dd583c3691f7c86ad0bca Mon Sep 17 00:00:00 2001 From: Michael Heinz Date: Mon, 23 Sep 2019 13:09:09 -0400 Subject: [PATCH] REF6976 Silent failure of OMPI over OFI with large messages sizes INTERNAL: STL-59403 The OFI (libfabric) MTL does not respect the maximum message size parameter that OFI provides in the fi_info data. This patch adds this missing max_msg_size field to the mca_ofi_module_t structure and adds a length check to the low-level send routines. (cherry-picked from commit 3aca4af548a3d781b6b52f89f4d6c7e66d379609) Change-Id: I65ab130794574393edab4a318365c1c125e50712 Signed-off-by: Michael Heinz Reviewed-by: Adam Goldman Reviewed-by: Brendan Cunningham --- ompi/mca/mtl/ofi/help-mtl-ofi.txt | 3 +++ ompi/mca/mtl/ofi/mtl_ofi.h | 11 +++++++++-- ompi/mca/mtl/ofi/mtl_ofi_component.c | 3 ++- ompi/mca/mtl/ofi/mtl_ofi_types.h | 3 +++ 4 files changed, 17 insertions(+), 3 deletions(-) diff --git a/ompi/mca/mtl/ofi/help-mtl-ofi.txt b/ompi/mca/mtl/ofi/help-mtl-ofi.txt index 8131766ae00..58f55a42daf 100644 --- a/ompi/mca/mtl/ofi/help-mtl-ofi.txt +++ b/ompi/mca/mtl/ofi/help-mtl-ofi.txt @@ -16,3 +16,6 @@ unusual; your job may behave unpredictably (and/or abort) after this. Local host: %s Location: %s:%d Error: %s (%zd) +# +[message too big] +Message size %llu bigger than supported by selected transport. Max = %llu diff --git a/ompi/mca/mtl/ofi/mtl_ofi.h b/ompi/mca/mtl/ofi/mtl_ofi.h index 8a6918cf78f..77261e4bc21 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi.h +++ b/ompi/mca/mtl/ofi/mtl_ofi.h @@ -247,13 +247,20 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl, endpoint = ompi_mtl_ofi_get_endpoint(mtl, ompi_proc); ompi_ret = ompi_mtl_datatype_pack(convertor, &start, &length, &free_after); - if (OMPI_SUCCESS != ompi_ret) return ompi_ret; + if (OPAL_UNLIKELY(OMPI_SUCCESS != ompi_ret)) { + return ompi_ret; + } ofi_req->buffer = (free_after) ? start : NULL; ofi_req->length = length; ofi_req->status.MPI_ERROR = OMPI_SUCCESS; - if (OPAL_UNLIKELY(MCA_PML_BASE_SEND_SYNCHRONOUS == mode)) { + if (OPAL_UNLIKELY(length > endpoint->mtl_ofi_module->max_msg_size)) { + opal_show_help("help-mtl-ofi.txt", + "message too big", false, + length, endpoint->mtl_ofi_module->max_msg_size); + return OMPI_ERROR; + } else if (OPAL_UNLIKELY(MCA_PML_BASE_SEND_SYNCHRONOUS == mode)) { ack_req = malloc(sizeof(ompi_mtl_ofi_request_t)); assert(ack_req); ack_req->parent = ofi_req; diff --git a/ompi/mca/mtl/ofi/mtl_ofi_component.c b/ompi/mca/mtl/ofi/mtl_ofi_component.c index b5d3959837b..56a68c13d08 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_component.c +++ b/ompi/mca/mtl/ofi/mtl_ofi_component.c @@ -466,9 +466,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, } /** - * Save the maximum inject size. + * Save the maximum sizes. */ ompi_mtl_ofi.max_inject_size = prov->tx_attr->inject_size; + ompi_mtl_ofi.max_msg_size = prov->ep_attr->max_msg_size; /** * Create the objects that will be bound to the endpoint. diff --git a/ompi/mca/mtl/ofi/mtl_ofi_types.h b/ompi/mca/mtl/ofi/mtl_ofi_types.h index e8c3f21b53e..f0b36feefae 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_types.h +++ b/ompi/mca/mtl/ofi/mtl_ofi_types.h @@ -49,6 +49,9 @@ typedef struct mca_mtl_ofi_module_t { /** Maximum inject size */ size_t max_inject_size; + /** Largest message that can be sent in a single send. */ + size_t max_msg_size; + /** Maximum number of CQ events to read in OFI Progress */ int ofi_progress_event_count;