diff --git a/opal/mca/common/ofi/common_ofi.c b/opal/mca/common/ofi/common_ofi.c index bc4d304404a..4dd818d91cd 100644 --- a/opal/mca/common/ofi/common_ofi.c +++ b/opal/mca/common/ofi/common_ofi.c @@ -445,214 +445,63 @@ static int check_provider_attr(struct fi_info *provider_info, struct fi_info *pr } } -/** - * Calculate device distances - * - * Calculate the distances between the current thread and all devices of - * type OPENFABRICS or NETWORK. - * - * The shortest distances are the nearest and therefore most efficient - * devices to use. - * - * Return an array of all the distances. Each entry is of type - * pmix_device_distance_t - * - * This function is used if there is no PMIx server running. - * - * @param distances (OUT) distances array - * @param ndist (OUT) number of entries in the distances array - * - * @return 0 on success. Error otherwise. - * - */ -static int compute_dev_distances(pmix_device_distance_t **distances, - size_t *ndist) -{ - int ret = 0; - size_t ninfo; - pmix_info_t *info; - pmix_cpuset_t cpuset; - pmix_topology_t *pmix_topo; - pmix_device_type_t type = PMIX_DEVTYPE_OPENFABRICS | - PMIX_DEVTYPE_NETWORK; - - PMIX_CPUSET_CONSTRUCT(&cpuset); - ret = PMIx_Get_cpuset(&cpuset, PMIX_CPUBIND_THREAD); - if (PMIX_SUCCESS != ret) { - goto out; - } - - /* load the PMIX topology */ - PMIX_TOPOLOGY_CREATE(pmix_topo, 1); - ret = PMIx_Load_topology(pmix_topo); - if (PMIX_SUCCESS != ret) { - goto out; - } - - ninfo = 1; - PMIX_INFO_CREATE(info, ninfo); - PMIX_INFO_LOAD(&info[0], PMIX_DEVICE_TYPE, &type, PMIX_DEVTYPE); - ret = PMIx_Compute_distances(pmix_topo, &cpuset, info, ninfo, distances, - ndist); - PMIX_INFO_FREE(info, ninfo); - - PMIX_TOPOLOGY_FREE(pmix_topo, 1); -out: - return ret; -} - -/** - * Find the nearest devices to the current thread - * - * Use the PMIx server or calculate the device distances, then out of the set of - * returned distances find the subset of the nearest devices. This can be - * 1 or more. 
- * - * @param num_distances (OUT) number of entries in the returned array - * - * @return An array of device distances which are nearest this thread - * or NULL if we fail to get the distances. In this case we will just - * revert to round robin. - * - */ -static pmix_device_distance_t *get_nearest_nics(int *num_distances) -{ - size_t ndist; - pmix_topology_t *topo; - int ret, i, idx = 0; - pmix_data_array_t *dptr; - uint16_t near = USHRT_MAX; - pmix_info_t directive; - pmix_value_t *val = NULL; - pmix_device_distance_t *distances, *nearest = NULL; - - PMIX_INFO_LOAD(&directive, PMIX_OPTIONAL, NULL, PMIX_BOOL); - ret = PMIx_Get(&opal_process_info.myprocid, - PMIX_DEVICE_DISTANCES, &directive, 1, &val); - PMIX_INFO_DESTRUCT(&directive); - if (ret != PMIX_SUCCESS || !val) { - ret = compute_dev_distances(&distances, &ndist); - if (ret) - goto out; - goto find_nearest; - } - - if (PMIX_DATA_ARRAY != val->type) { - goto out; - } - dptr = val->data.darray; - if (NULL == dptr) { - goto out; - } - if (PMIX_DEVICE_DIST != dptr->type) { - goto out; - } - - distances = (pmix_device_distance_t*)dptr->array; - ndist = dptr->size; - -find_nearest: - nearest = calloc(sizeof(*distances), ndist); - if (!nearest) - goto out; - - for (i = 0; i < ndist; i++) { - if (distances[i].mindist < near) { - idx = 0; - near = distances[i].mindist; - nearest[idx] = distances[i]; - idx++; - } else if (distances[i].mindist == near) { - nearest[idx] = distances[i]; - idx++; - } - } - - *num_distances = idx; - -out: - if (val) - PMIX_VALUE_RELEASE(val); - return nearest; -} - #if OPAL_OFI_PCI_DATA_AVAILABLE -/** - * Determine if a device is nearest - * - * Given a device distances array of the nearest pci devices, - * determine if one of these device distances refers to the pci - * device passed in +/* Check if a process and a pci device share the same cpuset + * @param (IN) pci struct fi_pci_attr pci device attributes, + * used to find hwloc object for device. 
* - * @param distances (IN) distances array - * @param num_distances (IN) number of entries in the distances array - * @param topology (IN) topology of the node - * @param pci (IN) PCI device being examined + * @param (IN) topology hwloc_topology_t topology to get the cpusets + * from * - * @return true if the PCI device is in the distances array or if the - * distances array is not provided. False otherwise. + * @return true if cpusets match and false if + * cpusets do not match or an error prevents comparison * + * Uses a pci device to find an ancestor that contains a cpuset, and + * determines if it intersects with the cpuset that the process is bound to. + * If the process is not bound, or if a cpuset is unavailable for whatever + * reason, returns false. Otherwise, returns the result of + * hwloc_bitmap_intersects() */ -static bool is_near(pmix_device_distance_t *distances, - int num_distances, - hwloc_topology_t topology, - struct fi_pci_attr pci) +static bool compare_cpusets(hwloc_topology_t topology, struct fi_pci_attr pci) { - hwloc_obj_t pcidev, osdev; - - /* if we failed to find any distances, then we consider all interfaces - * to be of equal distances and let the caller decide how to handle - * them - */ - if (!distances) - return true; + bool result = false; + int ret; + hwloc_bitmap_t proc_cpuset; + hwloc_obj_t obj = NULL; - pcidev = hwloc_get_pcidev_by_busid(topology, pci.domain_id, - pci.bus_id, pci.device_id, - pci.function_id); - if (!pcidev) + /* Cannot find topology info if no topology is found */ + if (NULL == topology) { return false; + } - for(osdev = pcidev->io_first_child; osdev != NULL; osdev = osdev->next_sibling) { - int i; - - if (osdev->attr->osdev.type == HWLOC_OBJ_OSDEV_OPENFABRICS) { - const char *nguid = hwloc_obj_get_info_by_name(osdev,"NodeGUID"); - const char *sguid = hwloc_obj_get_info_by_name(osdev, "SysImageGUID"); + /* Allocate memory for proc_cpuset */ + proc_cpuset = hwloc_bitmap_alloc(); + if (NULL == 
proc_cpuset) { + return false; + } - if (!nguid && !sguid) - continue; + /* Fill cpuset with the collection of cpu cores that the process runs on */ + ret = hwloc_get_cpubind(topology, proc_cpuset, HWLOC_CPUBIND_PROCESS); + if (0 > ret) { + goto error; + } - for (i = 0; i < num_distances; i++) { - char lsguid[256], lnguid[256]; - int ret; + /* Get the pci device from bdf */ + obj = hwloc_get_pcidev_by_busid(topology, pci.domain_id, pci.bus_id, pci.device_id, + pci.function_id); + if (NULL == obj) { + goto error; + } - ret = sscanf(distances[i].uuid, "fab://%256s::%256s", lnguid, lsguid); - if (ret != 2) - continue; - if (0 == strcasecmp(lnguid, nguid)) { - return true; - } else if (0 == strcasecmp(lsguid, sguid)) { - return true; - } - } - } else if (osdev->attr->osdev.type == HWLOC_OBJ_OSDEV_NETWORK) { - const char *address = hwloc_obj_get_info_by_name(osdev, "Address"); - if (!address) - continue; - for (i = 0; i < num_distances; i++) { - char *addr = strstr(distances[i].uuid, "://"); - if (!addr || addr + 3 > distances[i].uuid - + strlen(distances[i].uuid)) - continue; - if (!strcmp(addr+3, address)) { - return true; - } - } - } + /* pcidev objects don't have cpusets so find the first non-io object above */ + obj = hwloc_get_non_io_ancestor_obj(topology, obj); + if (NULL != obj) { + result = hwloc_bitmap_intersects(proc_cpuset, obj->cpuset); } - return false; +error: + hwloc_bitmap_free(proc_cpuset); + return result; } #endif @@ -765,10 +614,7 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list, struct fi_info *provider = provider_list, *current_provider = provider_list; struct fi_info **provider_table; #if OPAL_OFI_PCI_DATA_AVAILABLE - pmix_device_distance_t *distances = NULL; struct fi_pci_attr pci; - int num_distances = 0; - bool near; #endif int ret; unsigned int num_provider = 0, provider_limit = 0; @@ -793,30 +639,25 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list, return provider_list; } -#if 
OPAL_OFI_PCI_DATA_AVAILABLE - /* find all the nearest devices to this thread, then out of these - * determine which device we should bind to. - */ - distances = get_nearest_nics(&num_distances); -#endif - current_provider = provider; /* Cycle through remaining fi_info objects, looking for alike providers */ while (NULL != current_provider) { if (!check_provider_attr(provider, current_provider)) { - near = false; + cpusets_match = false; #if OPAL_OFI_PCI_DATA_AVAILABLE if (NULL != current_provider->nic && NULL != current_provider->nic->bus_attr && current_provider->nic->bus_attr->bus_type == FI_BUS_PCI) { pci = current_provider->nic->bus_attr->attr.pci; - near = is_near(distances, num_distances, - opal_hwloc_topology, pci); + cpusets_match = compare_cpusets(opal_hwloc_topology, pci); } #endif - /* We could have multiple near providers */ - if (near && !provider_found) { + + /* Reset the list if the cpusets match and no other provider was + * found on the same cpuset as the process. + */ + if (cpusets_match && !provider_found) { provider_found = true; num_provider = 0; } @@ -824,7 +665,7 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list, /* Add the provider to the provider list if the cpusets match or if * no other provider was found on the same cpuset as the process. 
*/ - if (near || !provider_found) { + if (cpusets_match || !provider_found) { provider_table[num_provider] = current_provider; num_provider++; } @@ -846,22 +687,17 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list, && NULL != provider->nic->bus_attr && provider->nic->bus_attr->bus_type == FI_BUS_PCI) { pci = provider->nic->bus_attr->attr.pci; - near = is_near(distances, num_distances, - opal_hwloc_topology, pci); + cpusets_match = compare_cpusets(opal_hwloc_topology, pci); } #endif #if OPAL_ENABLE_DEBUG opal_output_verbose(1, opal_common_ofi.output, - "package rank: %d device: %s near: %s\n", package_rank, - provider->domain_attr->name, near ? "true" : "false"); + "package rank: %d device: %s cpusets match: %s\n", package_rank, + provider->domain_attr->name, cpusets_match ? "true" : "false"); #endif free(provider_table); -#if OPAL_OFI_PCI_DATA_AVAILABLE - if (distances) - free(distances); -#endif return provider; }