From 00a5108d1796de5cf73227c4d46d25aa164e47fe Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Sat, 18 Sep 2021 16:56:37 +0000 Subject: [PATCH 1/5] ofi: Cruft Cleanup in common code Remove cruft from common_ofi, including unnecessary includes, declarations, and OPAL_DECLSPEC annotations on function definitions (they are only needed on declarations). Also move one function back into the C_DECLS block and properly OPAL_DECLSPEC the function. Signed-off-by: Brian Barrett --- opal/mca/common/ofi/common_ofi.c | 33 +++++++++++++++++++------------- opal/mca/common/ofi/common_ofi.h | 32 +++---------------------------- 2 files changed, 23 insertions(+), 42 deletions(-) diff --git a/opal/mca/common/ofi/common_ofi.c b/opal/mca/common/ofi/common_ofi.c index 1a5d0fb72f5..3da66b05554 100644 --- a/opal/mca/common/ofi/common_ofi.c +++ b/opal/mca/common/ofi/common_ofi.c @@ -15,23 +15,30 @@ * $HEADER$ */ + +#include "opal_config.h" + #include #include +#include +#if OPAL_OFI_IMPORT_MONITOR_SUPPORT +#include +#endif -#include "opal_config.h" #include "common_ofi.h" #include "opal/constants.h" #include "opal/mca/base/mca_base_framework.h" #include "opal/mca/base/mca_base_var.h" #include "opal/mca/hwloc/base/base.h" +#include "opal/mca/memory/base/base.h" #include "opal/mca/pmix/base/base.h" #include "opal/util/argv.h" #include "opal/util/show_help.h" -OPAL_DECLSPEC opal_common_ofi_module_t opal_common_ofi = {.prov_include = NULL, - .prov_exclude = NULL, - .registered = 0, - .verbose = 0}; +opal_common_ofi_module_t opal_common_ofi = {.prov_include = NULL, + .prov_exclude = NULL, + .registered = 0, + .verbose = 0}; static const char default_prov_exclude_list[] = "shm,sockets,tcp,udp,rstream,usnic"; static opal_mutex_t opal_common_ofi_mutex = OPAL_MUTEX_STATIC_INIT; @@ -75,15 +82,15 @@ static struct fi_ops_mem_monitor opal_common_ofi_export_ops = { .valid = opal_common_ofi_monitor_valid, }; -OPAL_DECLSPEC void opal_common_ofi_mem_release_cb(void *buf, size_t length, - void *cbdata, bool from_alloc) +static void opal_common_ofi_mem_release_cb(void *buf, size_t length, + void *cbdata, bool from_alloc) { opal_common_ofi_monitor->import_ops->notify(opal_common_ofi_monitor, buf, length); } #endif /* OPAL_OFI_IMPORT_MONITOR_SUPPORT */ -OPAL_DECLSPEC int opal_common_ofi_init(void) +int opal_common_ofi_init(void) { int ret; @@ -135,7 +142,7 @@ OPAL_DECLSPEC int opal_common_ofi_init(void) #endif } -OPAL_DECLSPEC int opal_common_ofi_fini(void) +int opal_common_ofi_fini(void) { if (opal_common_ofi_initialized && !--opal_common_ofi_init_ref_cnt) { #if OPAL_OFI_IMPORT_MONITOR_SUPPORT @@ -150,7 +157,7 @@ OPAL_DECLSPEC int opal_common_ofi_fini(void) return OPAL_SUCCESS; } -OPAL_DECLSPEC int opal_common_ofi_is_in_list(char **list, char *item) +int opal_common_ofi_is_in_list(char **list, char *item) { int i = 0; @@ -169,7 +176,7 @@ OPAL_DECLSPEC int opal_common_ofi_is_in_list(char **list, char *item) return 0; } -OPAL_DECLSPEC int opal_common_ofi_register_mca_variables(const mca_base_component_t *component) +int opal_common_ofi_register_mca_variables(const mca_base_component_t *component) { static int include_index; static int exclude_index; @@ -253,7 +260,7 @@ OPAL_DECLSPEC int opal_common_ofi_register_mca_variables(const mca_base_componen return OPAL_SUCCESS; } -OPAL_DECLSPEC void opal_common_ofi_mca_register(void) +void opal_common_ofi_mca_register(void) { opal_common_ofi.registered++; if (opal_common_ofi.registered > 1) { @@ -265,7 +272,7 @@ OPAL_DECLSPEC void opal_common_ofi_mca_register(void) opal_output_set_verbosity(opal_common_ofi.output, opal_common_ofi.verbose); } -OPAL_DECLSPEC void opal_common_ofi_mca_deregister(void) +void opal_common_ofi_mca_deregister(void) { /* unregister only on last deregister */ opal_common_ofi.registered--; diff --git a/opal/mca/common/ofi/common_ofi.h b/opal/mca/common/ofi/common_ofi.h index d2e17c57ebe..274c8d11832 100644 --- a/opal/mca/common/ofi/common_ofi.h +++ b/opal/mca/common/ofi/common_ofi.h @@ -18,15 +18,8 @@ #ifndef OPAL_MCA_COMMON_OFI_H #define OPAL_MCA_COMMON_OFI_H -#include "opal_config.h" -#include "opal/mca/base/mca_base_framework.h" -#include "opal/mca/base/mca_base_var.h" #include "opal/util/proc.h" #include "opal/memoryhooks/memory.h" -#include -#if OPAL_OFI_IMPORT_MONITOR_SUPPORT -#include -#endif BEGIN_C_DECLS @@ -39,7 +32,6 @@ typedef struct opal_common_ofi_module { } opal_common_ofi_module_t; extern opal_common_ofi_module_t opal_common_ofi; -extern mca_base_framework_t opal_memory_base_framework; OPAL_DECLSPEC int opal_common_ofi_register_mca_variables(const mca_base_component_t *component); OPAL_DECLSPEC void opal_common_ofi_mca_register(void); @@ -61,24 +53,6 @@ OPAL_DECLSPEC void opal_common_ofi_mca_deregister(void); */ OPAL_DECLSPEC int opal_common_ofi_is_in_list(char **list, char *item); -#if OPAL_OFI_IMPORT_MONITOR_SUPPORT -/* - * @param buf (IN) Pointer to the start of the allocation - * @param length (IN) Length of the allocation - * @param cbdata (IN) Data passed to memory hooks when callback - * was registered - * @param from_alloc (IN) True if the callback is caused by a call to the - * general allocation routines (malloc, calloc, free, - * etc.) or directly from the user (mmap, munmap, etc.) - * - * Callback function triggered when memory is about to be freed. - * is about to be freed. The callback will be triggered according to - * the note in opal_mem_hooks_register_release(). - * - */ -OPAL_DECLSPEC void opal_common_ofi_mem_release_cb(void *buf, size_t length, void *cbdata, bool from_alloc); -#endif /* OPAL_OFI_IMPORT_MONITOR_SUPPORT */ - /* * Initializes common objects for libfabric */ @@ -89,9 +63,9 @@ OPAL_DECLSPEC int opal_common_ofi_init(void); */ OPAL_DECLSPEC int opal_common_ofi_fini(void); -END_C_DECLS +OPAL_DECLSPEC struct fi_info *opal_mca_common_ofi_select_provider(struct fi_info *provider_list, + opal_process_info_t *process_info); -struct fi_info *opal_mca_common_ofi_select_provider(struct fi_info *provider_list, - opal_process_info_t *process_info); +END_C_DECLS #endif /* OPAL_MCA_COMMON_OFI_H */ From 947b3fe016500f06cff6c34ccf232554793b4966 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Tue, 21 Sep 2021 19:23:39 +0000 Subject: [PATCH 2/5] ofi: Improve memory monitor detection Test for the features needed to support injecting a memory monitor in Libfabric, rather than testing for a Libfabric version. Signed-off-by: Brian Barrett --- config/opal_check_ofi.m4 | 37 ++++++++++++++------------------ opal/mca/common/ofi/common_ofi.c | 8 +++---- 2 files changed, 20 insertions(+), 25 deletions(-) diff --git a/config/opal_check_ofi.m4 b/config/opal_check_ofi.m4 index d3d4185ae9e..b7761e262f0 100644 --- a/config/opal_check_ofi.m4 +++ b/config/opal_check_ofi.m4 @@ -126,19 +126,26 @@ AC_DEFUN([_OPAL_CHECK_OFI],[ CPPFLAGS="$CPPFLAGS $opal_ofi_CPPFLAGS" AS_IF([test $opal_ofi_happy = yes], - [AC_CHECK_MEMBER([struct fi_info.nic], + [AC_CHECK_HEADERS([rdma/fi_ext.h]) + + AC_CHECK_MEMBER([struct fi_info.nic], [opal_check_fi_info_pci=1], [opal_check_fi_info_pci=0], - [[#include ]])]) + [[#include ]]) + + AC_DEFINE_UNQUOTED([OPAL_OFI_PCI_DATA_AVAILABLE], + [$opal_check_fi_info_pci], + [check if pci data is available in ofi]) - AC_DEFINE_UNQUOTED([OPAL_OFI_PCI_DATA_AVAILABLE], - [$opal_check_fi_info_pci], - [check if pci data is available in ofi]) + AC_CHECK_DECLS([PMIX_PACKAGE_RANK], + [], + [], + [#include ]) - AC_CHECK_DECLS([PMIX_PACKAGE_RANK], - [], - [], - [#include ]) + AC_CHECK_TYPES([struct fi_ops_mem_monitor], [], [], + [#ifdef HAVE_RDMA_FI_EXT_H +#include +#endif])]) CPPFLAGS=$opal_check_ofi_save_CPPFLAGS LDFLAGS=$opal_check_ofi_save_LDFLAGS @@ -157,18 +164,6 @@ AC_DEFUN([_OPAL_CHECK_OFI],[ [AC_MSG_WARN([OFI libfabric support requested (via --with-ofi or --with-libfabric), but not found.]) AC_MSG_ERROR([Cannot continue.])]) ]) - opal_ofi_import_monitor=no - AS_IF([test $opal_ofi_happy = "yes"], - [OPAL_CHECK_OFI_VERSION_GE([1,14], - [opal_ofi_import_monitor=yes], - [opal_ofi_import_monitor=no])]) - - -if test "$opal_ofi_import_monitor" = "yes"; then - AC_DEFINE_UNQUOTED([OPAL_OFI_IMPORT_MONITOR_SUPPORT],1, - [Whether libfabric supports monitor import]) -fi - ])dnl diff --git a/opal/mca/common/ofi/common_ofi.c b/opal/mca/common/ofi/common_ofi.c index 3da66b05554..08e01971820 100644 --- a/opal/mca/common/ofi/common_ofi.c +++ b/opal/mca/common/ofi/common_ofi.c @@ -21,7 +21,7 @@ #include #include #include -#if OPAL_OFI_IMPORT_MONITOR_SUPPORT +#ifdef HAVE_RDMA_FI_EXT_H #include #endif @@ -45,7 +45,7 @@ static opal_mutex_t opal_common_ofi_mutex = OPAL_MUTEX_STATIC_INIT; static bool opal_common_ofi_initialized = false; static int opal_common_ofi_init_ref_cnt = 0; -#if OPAL_OFI_IMPORT_MONITOR_SUPPORT +#ifdef HAVE_STRUCT_FI_OPS_MEM_MONITOR static int opal_common_ofi_monitor_start(struct fid_mem_monitor *monitor) { @@ -88,7 +88,7 @@ static void opal_common_ofi_mem_release_cb(void *buf, size_t length, opal_common_ofi_monitor->import_ops->notify(opal_common_ofi_monitor, buf, length); } -#endif /* OPAL_OFI_IMPORT_MONITOR_SUPPORT */ +#endif /* HAVE_STRUCT_FI_OPS_MEM_MONITOR */ int opal_common_ofi_init(void) { @@ -98,7 +98,7 @@ int opal_common_ofi_init(void) if (opal_common_ofi_initialized) { return OPAL_SUCCESS; } -#if OPAL_OFI_IMPORT_MONITOR_SUPPORT +#ifdef HAVE_STRUCT_FI_OPS_MEM_MONITOR mca_base_framework_open(&opal_memory_base_framework, 0); if ((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT) From 348152ac79efc5b1e34ae8a5b7dae229b4752d90 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Tue, 21 Sep 2021 19:58:09 +0000 Subject: [PATCH 3/5] ofi: Update OFI common documentation The documentation in the header file for the ofi common code had been neglected, so show some love. Also add some helpful comments in understanding the memory monitor injection code. Signed-off-by: Brian Barrett --- opal/mca/common/ofi/common_ofi.c | 77 +++++++---------------- opal/mca/common/ofi/common_ofi.h | 103 ++++++++++++++++++++++++++++--- 2 files changed, 118 insertions(+), 62 deletions(-) diff --git a/opal/mca/common/ofi/common_ofi.c b/opal/mca/common/ofi/common_ofi.c index 08e01971820..b0fc47a91a9 100644 --- a/opal/mca/common/ofi/common_ofi.c +++ b/opal/mca/common/ofi/common_ofi.c @@ -47,24 +47,32 @@ static int opal_common_ofi_init_ref_cnt = 0; #ifdef HAVE_STRUCT_FI_OPS_MEM_MONITOR +/* + * These no-op functions are necessary since libfabric does not allow null + * function pointers here. + */ static int opal_common_ofi_monitor_start(struct fid_mem_monitor *monitor) { return 0; } + static void opal_common_ofi_monitor_stop(struct fid_mem_monitor *monitor) { return; } + static int opal_common_ofi_monitor_subscribe(struct fid_mem_monitor *monitor, const void *addr, size_t len) { return 0; } + static void opal_common_ofi_monitor_unsubscribe(struct fid_mem_monitor *monitor, const void *addr, size_t len) { return; } + static bool opal_common_ofi_monitor_valid(struct fid_mem_monitor *monitor, const void *addr, size_t len) { @@ -88,6 +96,7 @@ static void opal_common_ofi_mem_release_cb(void *buf, size_t length, opal_common_ofi_monitor->import_ops->notify(opal_common_ofi_monitor, buf, length); } + #endif /* HAVE_STRUCT_FI_OPS_MEM_MONITOR */ int opal_common_ofi_init(void) @@ -107,6 +116,12 @@ int opal_common_ofi_init(void) return OPAL_SUCCESS; } + /* + * This cache object doesn't do much, but is necessary for the API to work. + * It is required to call the fi_import_fid API. This API was introduced in + * libfabric version 1.13.0 and "mr_cache" is a "well known" name (documented + * in libfabric) to indicate the type of object that we are trying to open. + */ ret = fi_open(FI_VERSION(1,13), "mr_cache", NULL, 0, 0, &opal_common_ofi_cache_fid, NULL); if (ret) { goto err; @@ -119,6 +134,13 @@ int opal_common_ofi_init(void) opal_common_ofi_monitor->fid.fclass = FI_CLASS_MEM_MONITOR; opal_common_ofi_monitor->export_ops = &opal_common_ofi_export_ops; + /* + * This import_fid call must occur before the libfabric provider creates + * its memory registration cache. This will typically occur during domain + * open as it is a domain level object. We put it early in initialization + * to guarantee this and share the import monitor between the ofi btl + * and ofi mtl. + */ ret = fi_import_fid(opal_common_ofi_cache_fid, &opal_common_ofi_monitor->fid, 0); if (ret) { goto err; @@ -491,61 +513,6 @@ static uint32_t get_package_rank(opal_process_info_t *process_info) return (uint32_t) package_ranks[process_info->my_local_rank]; } -/* Selects a NIC based on hardware locality between process cpuset and device BDF. - * - * Initializes opal_hwloc_topology to access hardware topology if not previously - * initialized - * - * There are 3 main cases that this covers: - * - * 1. If the first provider passed into this function is the only valid - * provider, this provider is returned. - * - * 2. If there is more than 1 provider that matches the type of the first - * provider in the list, and the BDF data - * is available then a provider is selected based on locality of device - * cpuset and process cpuset and tries to ensure that processes are distributed - * evenly across NICs. This has two separate cases: - * - * i. There is one or more provider local to the process: - * - * (local rank % number of providers of the same type that share the process cpuset) - * is used to select one of these providers. - * - * ii. There is no provider that is local to the process: - * - * (local rank % number of providers of the same type) - * is used to select one of these providers - * - * 3. If there is more than 1 providers of the same type in the list, and the BDF data - * is not available (the ofi version does not support fi_info.nic or the - * provider does not support BDF) then (local rank % number of providers of the same type) - * is used to select one of these providers - * - * @param provider_list (IN) struct fi_info* An initially selected - * provider NIC. The provider name and - * attributes are used to restrict NIC - * selection. This provider is returned if the - * NIC selection fails. - * - * @param package_rank (IN) uint32_t The rank of the process. Used to - * select one valid NIC if there is a case - * where more than one can be selected. This - * could occur when more than one provider - * shares the same cpuset as the process. - * This could either be a package_rank if one is - * successfully calculated, or the process id. - * - * @param provider (OUT) struct fi_info* object with the selected - * provider if the selection succeeds - * if the selection fails, returns the fi_info - * object that was initially provided. - * - * All errors should be recoverable and will return the initially provided - * provider. However, if an error occurs we can no longer guarantee - * that the provider returned is local to the process or that the processes will - * balance across available NICs. - */ struct fi_info *opal_mca_common_ofi_select_provider(struct fi_info *provider_list, opal_process_info_t *process_info) { diff --git a/opal/mca/common/ofi/common_ofi.h b/opal/mca/common/ofi/common_ofi.h index 274c8d11832..42ced038355 100644 --- a/opal/mca/common/ofi/common_ofi.h +++ b/opal/mca/common/ofi/common_ofi.h @@ -33,11 +33,47 @@ typedef struct opal_common_ofi_module { extern opal_common_ofi_module_t opal_common_ofi; + +/** + * Register component-specialized MCA variables + * + * Register MCA variables common to all OFI components on behalf of + * the calling component. Expected to be called during + * component_register for all OFI-related components. + * + * @param component (IN) OFI component being initialized + * + * @returns OPAL_SUCCESS on success, OPAL error code on failure + */ OPAL_DECLSPEC int opal_common_ofi_register_mca_variables(const mca_base_component_t *component); + +/** + * Common MCA registration + * + * Common MCA registration handlinge. After calling this function, + * \code opal_common_ofi.output will be properly initialized. + * + * @returns OPAL_SUCCESS on success, OPAL error code on failure + */ OPAL_DECLSPEC void opal_common_ofi_mca_register(void); + +/** + * Common MCA cleanup + * + * Cleanup for any resources registered during \code + * opal_common_ofi_mca_register(). + * + * @returns OPAL_SUCCESS on success, OPAL error code on failure + */ OPAL_DECLSPEC void opal_common_ofi_mca_deregister(void); -/* +/** + * Search function for provider names + * + * This function will take a provider name string and a list of lower + * provider name strings as inputs. It will return true if the lower + * provider in the item string matches a lower provider in the list. + * * @param list (IN) List of strings corresponding to lower providers. * @param item (IN) Single string corresponding to a provider. * @@ -46,23 +82,76 @@ OPAL_DECLSPEC void opal_common_ofi_mca_deregister(void); * @return 1 The lower provider of the item string matches * a string in the item list. * - * This function will take a provider name string and a list of lower - * provider name strings as inputs. It will return true if the lower - * provider in the item string matches a lower provider in the list. - * */ OPAL_DECLSPEC int opal_common_ofi_is_in_list(char **list, char *item); -/* +/** * Initializes common objects for libfabric + * + * @note This function is not thread safe and must be called in a + * serial portion of the code. */ OPAL_DECLSPEC int opal_common_ofi_init(void); -/* +/** * Cleans up common objects for libfabric + * + * @note This function is not thread safe and must be called in a + * serial portion of the code. */ OPAL_DECLSPEC int opal_common_ofi_fini(void); +/** + * Selects NIC (provider) based on hardware locality + * + * In multi-nic situations, use hardware topology to pick the "best" + * of the selected NICs. + * There are 3 main cases that this covers: + * + * 1. If the first provider passed into this function is the only valid + * provider, this provider is returned. + * + * 2. If there is more than 1 provider that matches the type of the first + * provider in the list, and the BDF data + * is available then a provider is selected based on locality of device + * cpuset and process cpuset and tries to ensure that processes + * are distributed evenly across NICs. This has two separate + * cases: + * + * i. There is one or more provider local to the process: + * + * (local rank % number of providers of the same type + * that share the process cpuset) is used to select one + * of these providers. + * + * ii. There is no provider that is local to the process: + * + * (local rank % number of providers of the same type) + * is used to select one of these providers + * + * 3. If there is more than 1 providers of the same type in the + * list, and the BDF data is not available (the ofi version does + * not support fi_info.nic or the provider does not support BDF) + * then (local rank % number of providers of the same type) is + * used to select one of these providers + * + * @param provider_list (IN) struct fi_info* An initially selected + * provider NIC. The provider name and + * attributes are used to restrict NIC + * selection. This provider is returned if the + * NIC selection fails. + * + * @param provider (OUT) struct fi_info* object with the selected + * provider if the selection succeeds + * if the selection fails, returns the fi_info + * object that was initially provided. + * + * All errors should be recoverable and will return the initially provided + * provider. However, if an error occurs we can no longer guarantee + * that the provider returned is local to the process or that the processes will + * balance across available NICs. + * + */ OPAL_DECLSPEC struct fi_info *opal_mca_common_ofi_select_provider(struct fi_info *provider_list, opal_process_info_t *process_info); From bf36d24cc55d3ad4077cee72af030be8fd550f41 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Wed, 22 Sep 2021 02:27:28 +0000 Subject: [PATCH 4/5] ofi: match common and MCA interfaces Rename initialization calls to match the MCA terminology and restructure code to match the calling pattern of normal components (so during component_register, OFI components should call common_ofi_mca_register). Signed-off-by: Brian Barrett --- ompi/mca/mtl/ofi/mtl_ofi_component.c | 12 ++--- opal/mca/btl/ofi/btl_ofi_component.c | 16 +++--- opal/mca/common/ofi/common_ofi.c | 74 ++++++++++++---------------- opal/mca/common/ofi/common_ofi.h | 52 +++++++------------ 4 files changed, 62 insertions(+), 92 deletions(-) diff --git a/ompi/mca/mtl/ofi/mtl_ofi_component.c b/ompi/mca/mtl/ofi/mtl_ofi_component.c index bbe32213bd2..8d273c65d5b 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_component.c +++ b/ompi/mca/mtl/ofi/mtl_ofi_component.c @@ -254,9 +254,7 @@ ompi_mtl_ofi_component_register(void) MCA_BASE_VAR_SCOPE_READONLY, &ompi_mtl_ofi.num_ofi_contexts); - opal_common_ofi_register_mca_variables(&mca_mtl_ofi_component.super.mtl_version); - - return OMPI_SUCCESS; + return opal_common_ofi_mca_register(&mca_mtl_ofi_component.super.mtl_version); } @@ -285,7 +283,7 @@ ompi_mtl_ofi_component_open(void) "provider_exclude")) { return OMPI_ERR_NOT_AVAILABLE; } - return opal_common_ofi_init(); + return opal_common_ofi_open(); } static int @@ -302,9 +300,7 @@ ompi_mtl_ofi_component_close(void) #if OPAL_CUDA_SUPPORT mca_common_cuda_fini(); #endif - opal_common_ofi_mca_deregister(); - opal_common_ofi_fini(); - return OMPI_SUCCESS; + return opal_common_ofi_close(); } int @@ -582,8 +578,6 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, int universe_size; char *univ_size_str; - opal_common_ofi_mca_register(); - opal_output_verbose(1, opal_common_ofi.output, "%s:%d: mtl:ofi:provider_include = \"%s\"\n", __FILE__, __LINE__, *opal_common_ofi.prov_include); diff --git a/opal/mca/btl/ofi/btl_ofi_component.c b/opal/mca/btl/ofi/btl_ofi_component.c index cd1816b273a..279dd25d438 100644 --- a/opal/mca/btl/ofi/btl_ofi_component.c +++ b/opal/mca/btl/ofi/btl_ofi_component.c @@ -123,6 +123,7 @@ static int validate_info(struct fi_info *info, uint64_t required_caps, char **in /* Register the MCA parameters */ static int mca_btl_ofi_component_register(void) { + int ret; char *msg; mca_btl_ofi_module_t *module = &mca_btl_ofi_module_template; @@ -191,7 +192,10 @@ static int mca_btl_ofi_component_register(void) /* for now we want this component to lose to the MTL. */ module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH - 50; - opal_common_ofi_register_mca_variables(&mca_btl_ofi_component.super.btl_version); + ret = opal_common_ofi_mca_register(&mca_btl_ofi_component.super.btl_version); + if (OPAL_SUCCESS != ret) { + return ret; + } return mca_btl_base_param_register(&mca_btl_ofi_component.super.btl_version, &module->super); } @@ -199,7 +203,7 @@ static int mca_btl_ofi_component_register(void) static int mca_btl_ofi_component_open(void) { mca_btl_ofi_component.module_count = 0; - return opal_common_ofi_init(); + return opal_common_ofi_open(); } /* @@ -207,11 +211,11 @@ static int mca_btl_ofi_component_open(void) */ static int mca_btl_ofi_component_close(void) { - opal_common_ofi_mca_deregister(); - opal_common_ofi_fini(); + int ret; + ret = opal_common_ofi_close(); /* If we don't sleep, sockets provider freaks out. Ummm this is a scary comment */ sleep(1); - return OPAL_SUCCESS; + return ret; } void mca_btl_ofi_exit(void) @@ -259,8 +263,6 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init(int *num_btl_modules, struct fi_domain_attr domain_attr = {0}; uint64_t required_caps; - opal_common_ofi_mca_register(); - switch (mca_btl_ofi_component.mode) { case MCA_BTL_OFI_MODE_TWO_SIDED: diff --git a/opal/mca/common/ofi/common_ofi.c b/opal/mca/common/ofi/common_ofi.c index b0fc47a91a9..4890c52c4f8 100644 --- a/opal/mca/common/ofi/common_ofi.c +++ b/opal/mca/common/ofi/common_ofi.c @@ -37,12 +37,9 @@ opal_common_ofi_module_t opal_common_ofi = {.prov_include = NULL, .prov_exclude = NULL, - .registered = 0, - .verbose = 0}; - + .output = -1}; static const char default_prov_exclude_list[] = "shm,sockets,tcp,udp,rstream,usnic"; static opal_mutex_t opal_common_ofi_mutex = OPAL_MUTEX_STATIC_INIT; -static bool opal_common_ofi_initialized = false; static int opal_common_ofi_init_ref_cnt = 0; #ifdef HAVE_STRUCT_FI_OPS_MEM_MONITOR @@ -99,12 +96,11 @@ static void opal_common_ofi_mem_release_cb(void *buf, size_t length, #endif /* HAVE_STRUCT_FI_OPS_MEM_MONITOR */ -int opal_common_ofi_init(void) +int opal_common_ofi_open(void) { int ret; - opal_common_ofi_init_ref_cnt++; - if (opal_common_ofi_initialized) { + if ((opal_common_ofi_init_ref_cnt++) > 0) { return OPAL_SUCCESS; } #ifdef HAVE_STRUCT_FI_OPS_MEM_MONITOR @@ -146,7 +142,6 @@ int opal_common_ofi_init(void) goto err; } opal_mem_hooks_register_release(opal_common_ofi_mem_release_cb, NULL); - opal_common_ofi_initialized = true; return OPAL_SUCCESS; err: @@ -157,23 +152,35 @@ int opal_common_ofi_init(void) free(opal_common_ofi_monitor); } + opal_common_ofi_init_ref_cnt--; + return OPAL_ERROR; #else - opal_common_ofi_initialized = true; return OPAL_SUCCESS; #endif } -int opal_common_ofi_fini(void) +int opal_common_ofi_close(void) { - if (opal_common_ofi_initialized && !--opal_common_ofi_init_ref_cnt) { -#if OPAL_OFI_IMPORT_MONITOR_SUPPORT - opal_mem_hooks_unregister_release(opal_common_ofi_mem_release_cb); - fi_close(opal_common_ofi_cache_fid); - fi_close(&opal_common_ofi_monitor->fid); - free(opal_common_ofi_monitor); + int ret; + + if ((--opal_common_ofi_init_ref_cnt) > 0) { + return OPAL_SUCCESS; + } + +#ifdef HAVE_STRUCT_FI_OPS_MEM_MONITOR + opal_mem_hooks_unregister_release(opal_common_ofi_mem_release_cb); + fi_close(opal_common_ofi_cache_fid); + fi_close(&opal_common_ofi_monitor->fid); + free(opal_common_ofi_monitor); #endif - opal_common_ofi_initialized = false; + + if (opal_common_ofi.output != -1) { + opal_output_close(opal_common_ofi.output); + opal_common_ofi.output = -1; + if (OPAL_SUCCESS != ret) { + return ret; + } } return OPAL_SUCCESS; @@ -198,11 +205,12 @@ int opal_common_ofi_is_in_list(char **list, char *item) return 0; } -int opal_common_ofi_register_mca_variables(const mca_base_component_t *component) +int opal_common_ofi_mca_register(const mca_base_component_t *component) { static int include_index; static int exclude_index; static int verbose_index; + int verbose; int param; if (fi_version() < FI_VERSION(1, 0)) { @@ -260,7 +268,7 @@ int opal_common_ofi_register_mca_variables(const mca_base_component_t *component MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL, - &opal_common_ofi.verbose); + &verbose); } else { verbose_index = param; } @@ -277,32 +285,14 @@ int opal_common_ofi_register_mca_variables(const mca_base_component_t *component "verbose", 0); } - OPAL_THREAD_UNLOCK(&opal_common_ofi_mutex); - - return OPAL_SUCCESS; -} - -void opal_common_ofi_mca_register(void) -{ - opal_common_ofi.registered++; - if (opal_common_ofi.registered > 1) { - opal_output_set_verbosity(opal_common_ofi.output, opal_common_ofi.verbose); - return; + if (opal_common_ofi.output == -1) { + opal_common_ofi.output = opal_output_open(NULL); + opal_output_set_verbosity(opal_common_ofi.output, verbose); } - opal_common_ofi.output = opal_output_open(NULL); - opal_output_set_verbosity(opal_common_ofi.output, opal_common_ofi.verbose); -} + OPAL_THREAD_UNLOCK(&opal_common_ofi_mutex); -void opal_common_ofi_mca_deregister(void) -{ - /* unregister only on last deregister */ - opal_common_ofi.registered--; - assert(opal_common_ofi.registered >= 0); - if (opal_common_ofi.registered) { - return; - } - opal_output_close(opal_common_ofi.output); + return OPAL_SUCCESS; } /* check that the tx attributes match */ diff --git a/opal/mca/common/ofi/common_ofi.h b/opal/mca/common/ofi/common_ofi.h index 42ced038355..01096fdbf35 100644 --- a/opal/mca/common/ofi/common_ofi.h +++ b/opal/mca/common/ofi/common_ofi.h @@ -26,46 +26,46 @@ BEGIN_C_DECLS typedef struct opal_common_ofi_module { char **prov_include; char **prov_exclude; - int verbose; - int registered; int output; } opal_common_ofi_module_t; extern opal_common_ofi_module_t opal_common_ofi; - /** - * Register component-specialized MCA variables + * Common MCA registration * - * Register MCA variables common to all OFI components on behalf of - * the calling component. Expected to be called during - * component_register for all OFI-related components. + * Common MCA registration handlinge. After calling this function, + * \code opal_common_ofi.output will be properly initialized. * * @param component (IN) OFI component being initialized * * @returns OPAL_SUCCESS on success, OPAL error code on failure */ -OPAL_DECLSPEC int opal_common_ofi_register_mca_variables(const mca_base_component_t *component); +OPAL_DECLSPEC int opal_common_ofi_mca_register(const mca_base_component_t *component); /** - * Common MCA registration + * Initializes common objects for libfabric * - * Common MCA registration handlinge. After calling this function, - * \code opal_common_ofi.output will be properly initialized. + * Initialize common libfabric interface. This should be called from + * any other OFI component's component_open() call. * - * @returns OPAL_SUCCESS on success, OPAL error code on failure + * @note This function is not thread safe and must be called in a + * serial portion of the code. */ -OPAL_DECLSPEC void opal_common_ofi_mca_register(void); +OPAL_DECLSPEC int opal_common_ofi_open(void); /** - * Common MCA cleanup + * Cleans up common objects for libfabric * - * Cleanup for any resources registered during \code - * opal_common_ofi_mca_register(). + * Clean up common libfabric interface. This should be called from + * any other OFI component's component_close() call. Resource cleanup + * is reference counted, so any successful call to + * opal_common_ofi_init(). * - * @returns OPAL_SUCCESS on success, OPAL error code on failure + * @note This function is not thread safe and must be called in a + * serial portion of the code. */ -OPAL_DECLSPEC void opal_common_ofi_mca_deregister(void); +OPAL_DECLSPEC int opal_common_ofi_close(void); /** * Search function for provider names @@ -85,22 +85,6 @@ OPAL_DECLSPEC void opal_common_ofi_mca_deregister(void); */ OPAL_DECLSPEC int opal_common_ofi_is_in_list(char **list, char *item); -/** - * Initializes common objects for libfabric - * - * @note This function is not thread safe and must be called in a - * serial portion of the code. - */ -OPAL_DECLSPEC int opal_common_ofi_init(void); - -/** - * Cleans up common objects for libfabric - * - * @note This function is not thread safe and must be called in a - * serial portion of the code. - */ -OPAL_DECLSPEC int opal_common_ofi_fini(void); - /** * Selects NIC (provider) based on hardware locality * From 2349f6b853cad0e7c452fc27c7f3f4f231b8a266 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Wed, 22 Sep 2021 16:31:11 +0000 Subject: [PATCH 5/5] ofi: Delay monitor initialization Delay initializing the import memory monitor from component open to a function which should be called immediately before the Libfabric domain is initialized. Registering the import memory monitor requires initializing the patcher memory hooks, which is not necessary if the OFI fabrics were not selected. Signed-off-by: Brian Barrett --- ompi/mca/mtl/ofi/mtl_ofi_component.c | 14 +++ opal/mca/btl/ofi/btl_ofi_component.c | 13 +++ opal/mca/common/ofi/common_ofi.c | 139 +++++++++++++++++++-------- opal/mca/common/ofi/common_ofi.h | 12 +++ 4 files changed, 136 insertions(+), 42 deletions(-) diff --git a/ompi/mca/mtl/ofi/mtl_ofi_component.c b/ompi/mca/mtl/ofi/mtl_ofi_component.c index 8d273c65d5b..4f93c86879c 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_component.c +++ b/ompi/mca/mtl/ofi/mtl_ofi_component.c @@ -887,6 +887,20 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, } } + /* this must be called during single threaded part of the code and + * before Libfabric configures its memory monitors. Easiest to do + * that before domain open. Silently ignore not-supported errors, + * as they are not critical to program correctness, but only + * indicate that LIbfabric will have to pick a different, possibly + * less optimial, monitor. */ + ret = opal_common_ofi_export_memory_monitor(); + if (0 != ret && -FI_ENOSYS != ret) { + opal_output_verbose(1, opal_common_ofi.output, + "Failed to inject Libfabric memory monitor: %s", + fi_strerror(-ret)); + } + + /** * Open fabric * The getinfo struct returns a fabric attribute struct that can be used to diff --git a/opal/mca/btl/ofi/btl_ofi_component.c b/opal/mca/btl/ofi/btl_ofi_component.c index 279dd25d438..e2f5512ae93 100644 --- a/opal/mca/btl/ofi/btl_ofi_component.c +++ b/opal/mca/btl/ofi/btl_ofi_component.c @@ -446,6 +446,19 @@ static int mca_btl_ofi_init_device(struct fi_info *info) * to prevent races. */ mca_btl_ofi_rcache_init(module); + /* for similar reasons to the rcache call, this must be called + * during single threaded part of the code and before Libfabric + * configures its memory monitors. Easiest to do that before + * domain open. Silently ignore not-supported errors, as they + * are not critical to program correctness, but only indicate + * that LIbfabric will have to pick a different, possibly less + * optimial, monitor. */ + rc = opal_common_ofi_export_memory_monitor(); + if (0 != rc && -FI_ENOSYS != rc) { + BTL_VERBOSE(("Failed to inject Libfabric memory monitor: %s", + fi_strerror(-rc))); + } + linux_device_name = info->domain_attr->name; BTL_VERBOSE( ("initializing dev:%s provider:%s", linux_device_name, info->fabric_attr->prov_name)); diff --git a/opal/mca/common/ofi/common_ofi.c b/opal/mca/common/ofi/common_ofi.c index 4890c52c4f8..a9880151e25 100644 --- a/opal/mca/common/ofi/common_ofi.c +++ b/opal/mca/common/ofi/common_ofi.c @@ -41,12 +41,21 @@ opal_common_ofi_module_t opal_common_ofi = {.prov_include = NULL, static const char default_prov_exclude_list[] = "shm,sockets,tcp,udp,rstream,usnic"; static opal_mutex_t opal_common_ofi_mutex = OPAL_MUTEX_STATIC_INIT; static int opal_common_ofi_init_ref_cnt = 0; +static bool opal_common_ofi_installed_memory_monitor = false; #ifdef HAVE_STRUCT_FI_OPS_MEM_MONITOR /* - * These no-op functions are necessary since libfabric does not allow null - * function pointers here. + * Monitor object to export into Libfabric to provide memory release + * notifications using our own memory hooks framework. Monitors may + * use the subscribe/unsubscribe notifications to reduce unnecessary + * notifications, but are not required to do so. Because patcher + * notifies about all releases, it is cheaper for us to not filter and + * this monitor can safely ignore subscribe/unsubscribe notifications. + * + * Libfabric requires the object to be fully defined. Unlike most of + * Open MPI, it does not have NULL function pointer checks in calling + * code. */ static int opal_common_ofi_monitor_start(struct fid_mem_monitor *monitor) { @@ -76,8 +85,8 @@ static bool opal_common_ofi_monitor_valid(struct fid_mem_monitor *monitor, return true; } -static struct fid_mem_monitor *opal_common_ofi_monitor; -static struct fid *opal_common_ofi_cache_fid; +static struct fid_mem_monitor *opal_common_ofi_monitor = NULL; +static struct fid *opal_common_ofi_cache_fid = NULL; static struct fi_ops_mem_monitor opal_common_ofi_export_ops = { .size = sizeof(struct fi_ops_mem_monitor), .start = opal_common_ofi_monitor_start, @@ -87,6 +96,12 @@ static struct fi_ops_mem_monitor opal_common_ofi_export_ops = { .valid = opal_common_ofi_monitor_valid, }; +/** + * Callback function from Open MPI memory monitor + * + * Translation function between the callback function from Open MPI's + * memory notifier to the Libfabric memory monitor. + */ static void opal_common_ofi_mem_release_cb(void *buf, size_t length, void *cbdata, bool from_alloc) { @@ -96,68 +111,110 @@ static void opal_common_ofi_mem_release_cb(void *buf, size_t length, #endif /* HAVE_STRUCT_FI_OPS_MEM_MONITOR */ -int opal_common_ofi_open(void) +int opal_common_ofi_export_memory_monitor(void) { - int ret; + int ret = -FI_ENOSYS; - if ((opal_common_ofi_init_ref_cnt++) > 0) { - return OPAL_SUCCESS; - } #ifdef HAVE_STRUCT_FI_OPS_MEM_MONITOR + OPAL_THREAD_LOCK(&opal_common_ofi_mutex); + + if (NULL != opal_common_ofi_cache_fid) { + return 0; + } + + /* + * While the memory import functionality was introduced in 1.13, + * some deadlock bugs exist in the 1.13 series. Require version + * 1.14 before this code is activated. Not activating the code + * should not break any functionality directly, but may lead to + * sub-optimal memory monitors being used in Libfabric, as Open + * MPI will almost certainly install a patcher first. + */ + if (FI_VERSION_LT(fi_version(), FI_VERSION(1, 14))) { + ret = -FI_ENOSYS; + goto err; + } - mca_base_framework_open(&opal_memory_base_framework, 0); + ret = mca_base_framework_open(&opal_memory_base_framework, 0); + if (OPAL_SUCCESS != ret) { + ret = -FI_ENOSYS; + goto err; + } if ((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT) != (((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT)) & opal_mem_hooks_support_level())) { - return OPAL_SUCCESS; + ret = -FI_ENOSYS; + goto err; } /* - * This cache object doesn't do much, but is necessary for the API to work. - * It is required to call the fi_import_fid API. This API was introduced in - * libfabric version 1.13.0 and "mr_cache" is a "well known" name (documented - * in libfabric) to indicate the type of object that we are trying to open. + * The monitor import object has the well known name "mr_cache" + * and was introduced in Libfabric 1.13 */ - ret = fi_open(FI_VERSION(1,13), "mr_cache", NULL, 0, 0, &opal_common_ofi_cache_fid, NULL); - if (ret) { + ret = fi_open(FI_VERSION(1,13), "mr_cache", NULL, 0, 0, + &opal_common_ofi_cache_fid, NULL); + if (0 != ret) { goto err; } opal_common_ofi_monitor = calloc(1, sizeof(*opal_common_ofi_monitor)); - if (!opal_common_ofi_monitor) { + if (NULL == opal_common_ofi_monitor) { + ret = -FI_ENOMEM; goto err; } opal_common_ofi_monitor->fid.fclass = FI_CLASS_MEM_MONITOR; opal_common_ofi_monitor->export_ops = &opal_common_ofi_export_ops; - /* - * This import_fid call must occur before the libfabric provider creates - * its memory registration cache. This will typically occur during domain - * open as it is a domain level object. We put it early in initialization - * to guarantee this and share the import monitor between the ofi btl - * and ofi mtl. - */ - ret = fi_import_fid(opal_common_ofi_cache_fid, &opal_common_ofi_monitor->fid, 0); - if (ret) { + ret = fi_import_fid(opal_common_ofi_cache_fid, + &opal_common_ofi_monitor->fid, 0); + if (0 != ret) { goto err; } opal_mem_hooks_register_release(opal_common_ofi_mem_release_cb, NULL); + opal_common_ofi_installed_memory_monitor = true; + + ret = 0; - return OPAL_SUCCESS; err: - if (opal_common_ofi_cache_fid) { - fi_close(opal_common_ofi_cache_fid); + if (0 != ret) { + if (NULL != opal_common_ofi_cache_fid) { + fi_close(opal_common_ofi_cache_fid); + } + if (NULL != opal_common_ofi_monitor) { + free(opal_common_ofi_monitor); + } } - if (opal_common_ofi_monitor) { + + opal_common_ofi_installed_memory_monitor = false; + + OPAL_THREAD_UNLOCK(&opal_common_ofi_mutex); +#endif + + return ret; +} + +static int opal_common_ofi_remove_memory_monitor(void) +{ +#ifdef HAVE_STRUCT_FI_OPS_MEM_MONITOR + if (opal_common_ofi_installed_memory_monitor) { + opal_mem_hooks_unregister_release(opal_common_ofi_mem_release_cb); + fi_close(opal_common_ofi_cache_fid); + fi_close(&opal_common_ofi_monitor->fid); free(opal_common_ofi_monitor); + opal_common_ofi_installed_memory_monitor = false; } +#endif - opal_common_ofi_init_ref_cnt--; + return OPAL_SUCCESS; +} + +int opal_common_ofi_open(void) +{ + if ((opal_common_ofi_init_ref_cnt++) > 0) { + return OPAL_SUCCESS; + } - return OPAL_ERROR; -#else return OPAL_SUCCESS; -#endif } int opal_common_ofi_close(void) @@ -168,14 +225,12 @@ int opal_common_ofi_close(void) return OPAL_SUCCESS; } -#ifdef HAVE_STRUCT_FI_OPS_MEM_MONITOR - opal_mem_hooks_unregister_release(opal_common_ofi_mem_release_cb); - fi_close(opal_common_ofi_cache_fid); - fi_close(&opal_common_ofi_monitor->fid); - free(opal_common_ofi_monitor); -#endif + ret = opal_common_ofi_remove_memory_monitor(); + if (OPAL_SUCCESS != ret) { + return ret; + } - if (opal_common_ofi.output != -1) { + if (-1 != opal_common_ofi.output) { opal_output_close(opal_common_ofi.output); opal_common_ofi.output = -1; if (OPAL_SUCCESS != ret) { diff --git a/opal/mca/common/ofi/common_ofi.h b/opal/mca/common/ofi/common_ofi.h index 01096fdbf35..ec21fd732b6 100644 --- a/opal/mca/common/ofi/common_ofi.h +++ b/opal/mca/common/ofi/common_ofi.h @@ -67,6 +67,18 @@ OPAL_DECLSPEC int opal_common_ofi_open(void); */ OPAL_DECLSPEC int opal_common_ofi_close(void); +/** + * Export our memory hooks into Libfabric monitor + * + * Use Open MPI's memory hooks to provide monitor notifications to + * Libfabric via the external mr_cache facility. This must be called + * before any domain is initialized (ie, before any Libfabric memory + * monitor is configured). + * + * @returns A libfabric error code is returned on error + */ +OPAL_DECLSPEC int opal_common_ofi_export_memory_monitor(void); + /** * Search function for provider names *