Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 16 additions & 21 deletions config/opal_check_ofi.m4
Original file line number Diff line number Diff line change
Expand Up @@ -126,19 +126,26 @@ AC_DEFUN([_OPAL_CHECK_OFI],[
CPPFLAGS="$CPPFLAGS $opal_ofi_CPPFLAGS"

AS_IF([test $opal_ofi_happy = yes],
[AC_CHECK_MEMBER([struct fi_info.nic],
[AC_CHECK_HEADERS([rdma/fi_ext.h])

AC_CHECK_MEMBER([struct fi_info.nic],
[opal_check_fi_info_pci=1],
[opal_check_fi_info_pci=0],
[[#include <rdma/fabric.h>]])])
[[#include <rdma/fabric.h>]])

AC_DEFINE_UNQUOTED([OPAL_OFI_PCI_DATA_AVAILABLE],
[$opal_check_fi_info_pci],
[check if pci data is available in ofi])

AC_DEFINE_UNQUOTED([OPAL_OFI_PCI_DATA_AVAILABLE],
[$opal_check_fi_info_pci],
[check if pci data is available in ofi])
AC_CHECK_DECLS([PMIX_PACKAGE_RANK],
[],
[],
[#include <pmix.h>])

AC_CHECK_DECLS([PMIX_PACKAGE_RANK],
[],
[],
[#include <pmix.h>])
AC_CHECK_TYPES([struct fi_ops_mem_monitor], [], [],
[#ifdef HAVE_RDMA_FI_EXT_H
#include <rdma/fi_ext.h>
#endif])])

CPPFLAGS=$opal_check_ofi_save_CPPFLAGS
LDFLAGS=$opal_check_ofi_save_LDFLAGS
Expand All @@ -157,18 +164,6 @@ AC_DEFUN([_OPAL_CHECK_OFI],[
[AC_MSG_WARN([OFI libfabric support requested (via --with-ofi or --with-libfabric), but not found.])
AC_MSG_ERROR([Cannot continue.])])
])
opal_ofi_import_monitor=no
AS_IF([test $opal_ofi_happy = "yes"],
[OPAL_CHECK_OFI_VERSION_GE([1,14],
[opal_ofi_import_monitor=yes],
[opal_ofi_import_monitor=no])])


if test "$opal_ofi_import_monitor" = "yes"; then
AC_DEFINE_UNQUOTED([OPAL_OFI_IMPORT_MONITOR_SUPPORT],1,
[Whether libfabric supports monitor import])
fi

])dnl


Expand Down
26 changes: 17 additions & 9 deletions ompi/mca/mtl/ofi/mtl_ofi_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -254,9 +254,7 @@ ompi_mtl_ofi_component_register(void)
MCA_BASE_VAR_SCOPE_READONLY,
&ompi_mtl_ofi.num_ofi_contexts);

opal_common_ofi_register_mca_variables(&mca_mtl_ofi_component.super.mtl_version);

return OMPI_SUCCESS;
return opal_common_ofi_mca_register(&mca_mtl_ofi_component.super.mtl_version);
}


Expand Down Expand Up @@ -285,7 +283,7 @@ ompi_mtl_ofi_component_open(void)
"provider_exclude")) {
return OMPI_ERR_NOT_AVAILABLE;
}
return opal_common_ofi_init();
return opal_common_ofi_open();
}

static int
Expand All @@ -302,9 +300,7 @@ ompi_mtl_ofi_component_close(void)
#if OPAL_CUDA_SUPPORT
mca_common_cuda_fini();
#endif
opal_common_ofi_mca_deregister();
opal_common_ofi_fini();
return OMPI_SUCCESS;
return opal_common_ofi_close();
}

int
Expand Down Expand Up @@ -582,8 +578,6 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
int universe_size;
char *univ_size_str;

opal_common_ofi_mca_register();

opal_output_verbose(1, opal_common_ofi.output,
"%s:%d: mtl:ofi:provider_include = \"%s\"\n",
__FILE__, __LINE__, *opal_common_ofi.prov_include);
Expand Down Expand Up @@ -893,6 +887,20 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
}
}

/* this must be called during single threaded part of the code and
* before Libfabric configures its memory monitors. Easiest to do
* that before domain open. Silently ignore not-supported errors,
* as they are not critical to program correctness, but only
* indicate that Libfabric will have to pick a different, possibly
* less optimal, monitor. */
ret = opal_common_ofi_export_memory_monitor();
if (0 != ret && -FI_ENOSYS != ret) {
opal_output_verbose(1, opal_common_ofi.output,
"Failed to inject Libfabric memory monitor: %s",
fi_strerror(-ret));
}


/**
* Open fabric
* The getinfo struct returns a fabric attribute struct that can be used to
Expand Down
29 changes: 22 additions & 7 deletions opal/mca/btl/ofi/btl_ofi_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ static int validate_info(struct fi_info *info, uint64_t required_caps, char **in
/* Register the MCA parameters */
static int mca_btl_ofi_component_register(void)
{
int ret;
char *msg;
mca_btl_ofi_module_t *module = &mca_btl_ofi_module_template;

Expand Down Expand Up @@ -191,27 +192,30 @@ static int mca_btl_ofi_component_register(void)
/* for now we want this component to lose to the MTL. */
module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH - 50;

opal_common_ofi_register_mca_variables(&mca_btl_ofi_component.super.btl_version);
ret = opal_common_ofi_mca_register(&mca_btl_ofi_component.super.btl_version);
if (OPAL_SUCCESS != ret) {
return ret;
}

return mca_btl_base_param_register(&mca_btl_ofi_component.super.btl_version, &module->super);
}

static int mca_btl_ofi_component_open(void)
{
mca_btl_ofi_component.module_count = 0;
return opal_common_ofi_init();
return opal_common_ofi_open();
}

/*
* component cleanup - sanity checking of queue lengths
*/
static int mca_btl_ofi_component_close(void)
{
opal_common_ofi_mca_deregister();
opal_common_ofi_fini();
int ret;
ret = opal_common_ofi_close();
/* If we don't sleep, sockets provider freaks out. Ummm this is a scary comment */
sleep(1);
return OPAL_SUCCESS;
return ret;
}

void mca_btl_ofi_exit(void)
Expand Down Expand Up @@ -259,8 +263,6 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init(int *num_btl_modules,
struct fi_domain_attr domain_attr = {0};
uint64_t required_caps;

opal_common_ofi_mca_register();

switch (mca_btl_ofi_component.mode) {

case MCA_BTL_OFI_MODE_TWO_SIDED:
Expand Down Expand Up @@ -444,6 +446,19 @@ static int mca_btl_ofi_init_device(struct fi_info *info)
* to prevent races. */
mca_btl_ofi_rcache_init(module);

/* for similar reasons to the rcache call, this must be called
* during single threaded part of the code and before Libfabric
* configures its memory monitors. Easiest to do that before
* domain open. Silently ignore not-supported errors, as they
* are not critical to program correctness, but only indicate
* that Libfabric will have to pick a different, possibly less
* optimal, monitor. */
rc = opal_common_ofi_export_memory_monitor();
if (0 != rc && -FI_ENOSYS != rc) {
BTL_VERBOSE(("Failed to inject Libfabric memory monitor: %s",
fi_strerror(-rc)));
}

linux_device_name = info->domain_attr->name;
BTL_VERBOSE(
("initializing dev:%s provider:%s", linux_device_name, info->fabric_attr->prov_name));
Expand Down
Loading