Skip to content

Commit cf15573

Browse files
mtl/ofi: NIC selection update
The existing code in compare_cpusets() assumed that some non-IO ancestor of a PCI object would intersect with the cpuset of the process. However, this is not always true. The non-IO ancestor can be an L3 cache: if there are two L3s on the same NUMA node and the process is bound to one L3 while the PCI object is attached to the other, compare_cpusets() returns false even though the device is local to the process. A better way to determine the optimal interface is to compute the distance of each interface from the current process and select the interface(s) nearest to it. Use the PMIx distance generation for this purpose. Signed-off-by: Amir Shehata <[email protected]>
1 parent dd6b875 commit cf15573

File tree

1 file changed

+173
-58
lines changed

1 file changed

+173
-58
lines changed

opal/mca/common/ofi/common_ofi.c

Lines changed: 173 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -445,63 +445,164 @@ static int check_provider_attr(struct fi_info *provider_info, struct fi_info *pr
445445
}
446446
}
447447

448-
#if OPAL_OFI_PCI_DATA_AVAILABLE
449-
/* Check if a process and a pci device share the same cpuset
450-
* @param (IN) pci struct fi_pci_attr pci device attributes,
451-
* used to find hwloc object for device.
452-
*
453-
* @param (IN) topology hwloc_topology_t topology to get the cpusets
454-
* from
455-
*
456-
* @param (OUT) returns true if cpusets match and false if
457-
* cpusets do not match or an error prevents comparison
458-
*
459-
* Uses a pci device to find an ancestor that contains a cpuset, and
460-
* determines if it intersects with the cpuset that the process is bound to.
461-
* if the process is not bound, or if a cpuset is unavailable for whatever
462-
* reason, returns false. Otherwise, returns the result of
463-
* hwloc_cpuset_intersects()
464-
*/
465-
static bool compare_cpusets(hwloc_topology_t topology, struct fi_pci_attr pci)
448+
/**
 * Ask PMIx to compute the distances between the calling thread's CPU binding
 * and every OpenFabrics / network device in the local topology.
 *
 * @param (OUT) distances  array of device distances produced by
 *                         PMIx_Compute_distances(); only meaningful when
 *                         PMIX_SUCCESS is returned. Ownership passes to the
 *                         caller.
 * @param (OUT) ndist      number of entries in @p distances
 *
 * @return PMIX_SUCCESS on success, otherwise the PMIx error code of the first
 *         step that failed.
 *
 * Every temporary acquired here (cpuset, topology container, info array) is
 * released on all exit paths, including the error paths.
 */
static int calculate_distances(pmix_device_distance_t **distances,
                               size_t *ndist)
{
    int ret;
    size_t ninfo = 1;
    pmix_info_t *info = NULL;
    pmix_cpuset_t cpuset;
    pmix_topology_t *pmix_topo = NULL;
    pmix_device_type_t type = PMIX_DEVTYPE_OPENFABRICS |
                              PMIX_DEVTYPE_NETWORK;

    PMIX_CPUSET_CONSTRUCT(&cpuset);
    ret = PMIx_Get_cpuset(&cpuset, PMIX_CPUBIND_THREAD);
    if (PMIX_SUCCESS != ret) {
        goto out;
    }

    /* load the PMIX topology */
    PMIX_TOPOLOGY_CREATE(pmix_topo, 1);
    if (NULL == pmix_topo) {
        ret = PMIX_ERR_NOMEM;
        goto out;
    }
    ret = PMIx_Load_topology(pmix_topo);
    if (PMIX_SUCCESS != ret) {
        goto out;
    }

    /* restrict the computation to fabric and network devices */
    PMIX_INFO_CREATE(info, ninfo);
    if (NULL == info) {
        ret = PMIX_ERR_NOMEM;
        goto out;
    }
    PMIX_INFO_LOAD(&info[0], PMIX_DEVICE_TYPE, &type, PMIX_DEVTYPE);
    ret = PMIx_Compute_distances(pmix_topo, &cpuset, info, ninfo, distances,
                                 ndist);

out:
    if (NULL != info) {
        PMIX_INFO_FREE(info, ninfo);
    }
    /* NOTE(review): the loaded topology is freed here exactly as the previous
     * code did on its success path; confirm against the PMIx version in use
     * that PMIx_Load_topology() hands ownership to the caller.
     */
    if (NULL != pmix_topo) {
        PMIX_TOPOLOGY_FREE(pmix_topo, 1);
    }
    PMIX_CPUSET_DESTRUCT(&cpuset);
    return ret;
}
471483

472-
/* Cannot find topology info if no topology is found */
473-
if (NULL == topology) {
474-
return false;
484+
/* Copy the NUL-terminated string src to *cursor, advance *cursor past the
 * copy, and return the address of the copy. Returns NULL when src is NULL.
 */
static char *stash_string(char **cursor, const char *src)
{
    char *dst;
    size_t len;

    if (NULL == src) {
        return NULL;
    }
    len = strlen(src) + 1;
    dst = *cursor;
    memcpy(dst, src, len);
    *cursor += len;
    return dst;
}

/**
 * Build the list of devices that are nearest to this process.
 *
 * The device distances are taken from the PMIX_DEVICE_DISTANCES key when the
 * runtime published it, and computed locally with calculate_distances()
 * otherwise. Every device whose mindist equals the smallest mindist seen is
 * returned.
 *
 * @param (IN)  val            unused; kept so the signature stays unchanged.
 *                             It is passed by value, so a value fetched here
 *                             could never be handed back to the caller; this
 *                             function therefore owns and releases whatever
 *                             it fetches.
 * @param (OUT) num_distances  number of entries in the returned array
 *                             (0 when NULL is returned)
 *
 * @return a malloc'ed array of the nearest devices, or NULL if no distance
 *         information could be obtained. The uuid/osname strings of each
 *         entry live in the same allocation as the array, so the caller
 *         releases everything with one free() and must not destruct the
 *         entries individually.
 */
static pmix_device_distance_t *get_nearest_nics(pmix_value_t *val,
                                                int *num_distances)
{
    pmix_device_distance_t *distances = NULL, *nearest = NULL;
    pmix_value_t *fetched = NULL;
    pmix_data_array_t *dptr;
    pmix_info_t directive;
    size_t ndist = 0, nnear = 0, strbytes = 0;
    size_t i, k;
    uint16_t near = USHRT_MAX;
    bool computed = false;
    char *strings;
    int ret;

    (void) val;
    *num_distances = 0;

    PMIX_INFO_LOAD(&directive, PMIX_OPTIONAL, NULL, PMIX_BOOL);
    ret = PMIx_Get(&opal_process_info.myprocid,
                   PMIX_DEVICE_DISTANCES, &directive, 1, &fetched);
    PMIX_INFO_DESTRUCT(&directive);

    if (PMIX_SUCCESS == ret && NULL != fetched) {
        /* the runtime published the distances: validate the payload */
        if (PMIX_DATA_ARRAY != fetched->type) {
            goto out;
        }
        dptr = fetched->data.darray;
        if (NULL == dptr || PMIX_DEVICE_DIST != dptr->type) {
            goto out;
        }
        distances = (pmix_device_distance_t *) dptr->array;
        ndist = dptr->size;
    } else {
        /* nothing published: compute the distances ourselves */
        if (PMIX_SUCCESS != calculate_distances(&distances, &ndist)) {
            distances = NULL;
            goto out;
        }
        computed = true;
    }

    if (NULL == distances || 0 == ndist) {
        goto out;
    }

    /* pass 1: smallest mindist over all devices */
    for (i = 0; i < ndist; i++) {
        if (distances[i].mindist < near) {
            near = distances[i].mindist;
        }
    }

    /* pass 2: size the result (entries plus their strings) */
    for (i = 0; i < ndist; i++) {
        if (distances[i].mindist != near) {
            continue;
        }
        nnear++;
        if (NULL != distances[i].uuid) {
            strbytes += strlen(distances[i].uuid) + 1;
        }
        if (NULL != distances[i].osname) {
            strbytes += strlen(distances[i].osname) + 1;
        }
    }

    nearest = calloc(1, nnear * sizeof(*nearest) + strbytes);
    if (NULL == nearest) {
        goto out;
    }

    /* pass 3: deep-copy the nearest entries; the strings are stored right
     * behind the array so the source buffers can be released below
     */
    strings = (char *) (nearest + nnear);
    for (i = 0, k = 0; i < ndist; i++) {
        if (distances[i].mindist != near) {
            continue;
        }
        nearest[k] = distances[i];
        nearest[k].uuid = stash_string(&strings, distances[i].uuid);
        nearest[k].osname = stash_string(&strings, distances[i].osname);
        k++;
    }

    *num_distances = (int) nnear;

out:
    if (computed && NULL != distances) {
        PMIX_DEVICE_DIST_FREE(distances, ndist);
    }
    if (NULL != fetched) {
        PMIX_VALUE_RELEASE(fetched);
    }
    return nearest;
}
544+
545+
#if OPAL_OFI_PCI_DATA_AVAILABLE
546+
static bool is_near(pmix_device_distance_t *distances,
547+
int num_distances,
548+
hwloc_topology_t topology,
549+
struct fi_pci_attr pci)
550+
{
551+
hwloc_obj_t pcidev, osdev;
552+
553+
/* if we failed to find any distances, then we consider all interfaces
554+
* to be of equal distances and let the caller decide how to handle
555+
* them
556+
*/
557+
if (!distances)
558+
return true;
559+
560+
pcidev = hwloc_get_pcidev_by_busid(topology, pci.domain_id,
561+
pci.bus_id, pci.device_id,
562+
pci.function_id);
563+
if (!pcidev)
480564
return false;
481-
}
482565

483-
/* Fill cpuset with the collection of cpu cores that the process runs on */
484-
ret = hwloc_get_cpubind(topology, proc_cpuset, HWLOC_CPUBIND_PROCESS);
485-
if (0 > ret) {
486-
goto error;
487-
}
566+
for(osdev = pcidev->io_first_child; osdev != NULL; osdev = osdev->next_sibling) {
567+
int i;
488568

489-
/* Get the pci device from bdf */
490-
obj = hwloc_get_pcidev_by_busid(topology, pci.domain_id, pci.bus_id, pci.device_id,
491-
pci.function_id);
492-
if (NULL == obj) {
493-
goto error;
494-
}
569+
if (osdev->attr->osdev.type == HWLOC_OBJ_OSDEV_OPENFABRICS) {
570+
const char *nguid = hwloc_obj_get_info_by_name(osdev,"NodeGUID");
571+
const char *sguid = hwloc_obj_get_info_by_name(osdev, "SysImageGUID");
572+
573+
if (!nguid && !sguid)
574+
continue;
575+
576+
for (i = 0; i < num_distances; i++) {
577+
char lsguid[256], lnguid[256];
578+
int ret;
495579

496-
/* pcidev objects don't have cpusets so find the first non-io object above */
497-
obj = hwloc_get_non_io_ancestor_obj(topology, obj);
498-
if (NULL != obj) {
499-
result = hwloc_bitmap_intersects(proc_cpuset, obj->cpuset);
580+
ret = scanf(distances[i].uuid, "fab://%256s::%256s", lnguid, lsguid);
581+
if (ret != 2)
582+
continue;
583+
if (0 == strcasecmp(lnguid, nguid)) {
584+
return true;
585+
} else if (0 == strcasecmp(lsguid, sguid)) {
586+
return true;
587+
}
588+
}
589+
} else if (osdev->attr->osdev.type == HWLOC_OBJ_OSDEV_NETWORK) {
590+
const char *address = hwloc_obj_get_info_by_name(osdev, "Address");
591+
if (!address)
592+
continue;
593+
for (i = 0; i < num_distances; i++) {
594+
char *addr = strstr(distances[i].uuid, "://");
595+
if (!addr || addr + 3 > distances[i].uuid
596+
+ strlen(distances[i].uuid))
597+
continue;
598+
if (!strcmp(addr+3, address)) {
599+
return true;
600+
}
601+
}
602+
}
500603
}
501604

502-
error:
503-
hwloc_bitmap_free(proc_cpuset);
504-
return result;
605+
return false;
505606
}
506607
#endif
507608

@@ -614,7 +715,11 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
614715
struct fi_info *provider = provider_list, *current_provider = provider_list;
615716
struct fi_info **provider_table;
616717
#if OPAL_OFI_PCI_DATA_AVAILABLE
718+
pmix_device_distance_t *distances = NULL;
719+
pmix_value_t *pmix_val = NULL;
617720
struct fi_pci_attr pci;
721+
int num_distances = 0;
722+
bool near;
618723
#endif
619724
int ret;
620725
unsigned int num_provider = 0, provider_limit = 0;
@@ -639,33 +744,38 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
639744
return provider_list;
640745
}
641746

747+
#if OPAL_OFI_PCI_DATA_AVAILABLE
748+
distances = get_nearest_nics(pmix_val, &num_distances);
749+
if (pmix_val) {
750+
PMIX_VALUE_RELEASE(pmix_val);
751+
}
752+
#endif
753+
642754
current_provider = provider;
643755

644756
/* Cycle through remaining fi_info objects, looking for alike providers */
645757
while (NULL != current_provider) {
646758
if (!check_provider_attr(provider, current_provider)) {
647-
cpusets_match = false;
759+
near = false;
648760
#if OPAL_OFI_PCI_DATA_AVAILABLE
649761
if (NULL != current_provider->nic
650762
&& NULL != current_provider->nic->bus_attr
651763
&& current_provider->nic->bus_attr->bus_type == FI_BUS_PCI) {
652764
pci = current_provider->nic->bus_attr->attr.pci;
653-
cpusets_match = compare_cpusets(opal_hwloc_topology, pci);
765+
near = is_near(distances, num_distances,
766+
opal_hwloc_topology, pci);
654767
}
655768
#endif
656-
657-
/* Reset the list if the cpusets match and no other provider was
658-
* found on the same cpuset as the process.
659-
*/
660-
if (cpusets_match && !provider_found) {
769+
/* We could have multiple near providers */
770+
if (near && !provider_found) {
661771
provider_found = true;
662772
num_provider = 0;
663773
}
664774

665775
/* Add the provider to the provider list if the cpusets match or if
666776
* no other provider was found on the same cpuset as the process.
667777
*/
668-
if (cpusets_match || !provider_found) {
778+
if (near || !provider_found) {
669779
provider_table[num_provider] = current_provider;
670780
num_provider++;
671781
}
@@ -687,17 +797,22 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
687797
&& NULL != provider->nic->bus_attr
688798
&& provider->nic->bus_attr->bus_type == FI_BUS_PCI) {
689799
pci = provider->nic->bus_attr->attr.pci;
690-
cpusets_match = compare_cpusets(opal_hwloc_topology, pci);
800+
near = is_near(distances, num_distances,
801+
opal_hwloc_topology, pci);
691802
}
692803
#endif
693804

694805
#if OPAL_ENABLE_DEBUG
695806
opal_output_verbose(1, opal_common_ofi.output,
696-
"package rank: %d device: %s cpusets match: %s\n", package_rank,
697-
provider->domain_attr->name, cpusets_match ? "true" : "false");
807+
"package rank: %d device: %s near: %s\n", package_rank,
808+
provider->domain_attr->name, near ? "true" : "false");
698809
#endif
699810

700811
free(provider_table);
812+
#if OPAL_OFI_PCI_DATA_AVAILABLE
813+
if (distances)
814+
free(distances);
815+
#endif
701816
return provider;
702817
}
703818

0 commit comments

Comments
 (0)