Skip to content

Commit 649236a

Browse files
ofi: NIC selection update
The existing code in compare_cpusets() assumed that some non-I/O ancestor of a PCI object should intersect with the cpuset of the process. However, this is not always true. The non-I/O ancestor can be an L3 cache: if there exist two L3s on the same NUMA node and the process is bound to one L3 while the PCI object is connected to the other, compare_cpusets() returns false. A better way to determine the optimal interface is to find the distances of the interfaces from the current process, determine which of these interfaces is nearest to the process, and select it. Use the PMIx distance generation for this purpose. Move away from using the deprecated PMIx macros and use the functions directly instead. Signed-off-by: Amir Shehata <[email protected]>
1 parent df7cf53 commit 649236a

File tree

3 files changed

+224
-57
lines changed

3 files changed

+224
-57
lines changed

3rd-party/openpmix

Submodule openpmix updated 136 files

3rd-party/prrte

Submodule prrte updated 142 files

opal/mca/common/ofi/common_ofi.c

Lines changed: 222 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -445,63 +445,216 @@ static int check_provider_attr(struct fi_info *provider_info, struct fi_info *pr
445445
}
446446
}
447447

448-
#if OPAL_OFI_PCI_DATA_AVAILABLE
449-
/* Check if a process and a pci device share the same cpuset
450-
* @param (IN) pci struct fi_pci_attr pci device attributes,
451-
* used to find hwloc object for device.
448+
/**
449+
* Calculate device distances
450+
*
451+
* Calculate the distances between the current thread and all devices of
452+
* type OPENFABRICS or NETWORK.
453+
*
454+
* The shortest distances are the nearest and therefore most efficient
455+
* devices to use.
452456
*
453-
* @param (IN) topology hwloc_topology_t topology to get the cpusets
454-
* from
457+
* Return an array of all the distances. Each entry is of type
458+
* pmix_device_distance_t
455459
*
456-
* @param (OUT) returns true if cpusets match and false if
457-
* cpusets do not match or an error prevents comparison
460+
* This function is used if there is no PMIx server running.
461+
*
462+
* @param distances (OUT) distances array
463+
* @param ndist (OUT) number of entries in the distances array
464+
*
465+
* @return 0 on success. Error otherwise.
458466
*
459-
* Uses a pci device to find an ancestor that contains a cpuset, and
460-
* determines if it intersects with the cpuset that the process is bound to.
461-
* if the process is not bound, or if a cpuset is unavailable for whatever
462-
* reason, returns false. Otherwise, returns the result of
463-
* hwloc_cpuset_intersects()
464467
*/
465-
static bool compare_cpusets(hwloc_topology_t topology, struct fi_pci_attr pci)
468+
static int compute_dev_distances(pmix_device_distance_t **distances,
469+
size_t *ndist)
466470
{
467-
bool result = false;
468-
int ret;
469-
hwloc_bitmap_t proc_cpuset;
470-
hwloc_obj_t obj = NULL;
471+
int ret = 0;
472+
size_t ninfo;
473+
pmix_info_t *info;
474+
pmix_cpuset_t cpuset;
475+
pmix_topology_t *pmix_topo;
476+
pmix_device_type_t type = PMIX_DEVTYPE_OPENFABRICS |
477+
PMIX_DEVTYPE_NETWORK;
478+
479+
PMIX_CPUSET_CONSTRUCT(&cpuset);
480+
ret = PMIx_Get_cpuset(&cpuset, PMIX_CPUBIND_THREAD);
481+
if (PMIX_SUCCESS != ret) {
482+
goto out;
483+
}
484+
485+
/* load the PMIX topology */
486+
PMIx_Topology_free(pmix_topo, 1);
487+
ret = PMIx_Load_topology(pmix_topo);
488+
if (PMIX_SUCCESS != ret) {
489+
goto out;
490+
}
491+
492+
ninfo = 1;
493+
info = PMIx_Info_create(ninfo);
494+
PMIx_Info_load(&info[0], PMIX_DEVICE_TYPE, &type, PMIX_DEVTYPE);
495+
ret = PMIx_Compute_distances(pmix_topo, &cpuset, info, ninfo, distances,
496+
ndist);
497+
PMIx_Info_free(info, ninfo);
498+
499+
PMIx_Topology_free(pmix_topo, 1);
500+
out:
501+
return ret;
502+
}
471503

472-
/* Cannot find topology info if no topology is found */
473-
if (NULL == topology) {
474-
return false;
504+
/**
505+
* Find the nearest devices to the current thread
506+
*
507+
* Use the PMIx server or calculate the device distances, then out of the set of
508+
* returned distances find the subset of the nearest devices. This can be
509+
* 1 or more.
510+
*
511+
* @param num_distances (OUT) number of entries in the returned array
512+
*
513+
* @return An array of device distances which are nearest this thread
514+
* or NULL if we fail to get the distances. In this case we will just
515+
* revert to round robin.
516+
*
517+
*/
518+
static pmix_device_distance_t *
519+
get_nearest_nics(int *num_distances, pmix_value_t **valin)
520+
{
521+
size_t ndist, i;
522+
int ret, idx = 0;
523+
pmix_data_array_t *dptr;
524+
uint16_t near = USHRT_MAX;
525+
pmix_info_t directive;
526+
pmix_value_t *val = NULL;
527+
pmix_device_distance_t *distances, *nearest = NULL;
528+
529+
PMIx_Info_load(&directive, PMIX_OPTIONAL, NULL, PMIX_BOOL);
530+
ret = PMIx_Get(&opal_process_info.myprocid,
531+
PMIX_DEVICE_DISTANCES, &directive, 1, &val);
532+
PMIx_Info_destruct(&directive);
533+
if (ret != PMIX_SUCCESS || !val) {
534+
ret = compute_dev_distances(&distances, &ndist);
535+
if (ret)
536+
goto out;
537+
goto find_nearest;
538+
}
539+
540+
if (PMIX_DATA_ARRAY != val->type) {
541+
goto out;
542+
}
543+
dptr = val->data.darray;
544+
if (NULL == dptr) {
545+
goto out;
546+
}
547+
if (PMIX_DEVICE_DIST != dptr->type) {
548+
goto out;
549+
}
550+
551+
distances = (pmix_device_distance_t*)dptr->array;
552+
ndist = dptr->size;
553+
554+
find_nearest:
555+
nearest = calloc(sizeof(*distances), ndist);
556+
if (!nearest)
557+
goto out;
558+
559+
for (i = 0; i < ndist; i++) {
560+
if (distances[i].type != PMIX_DEVTYPE_NETWORK &&
561+
distances[i].type != PMIX_DEVTYPE_OPENFABRICS)
562+
continue;
563+
if (distances[i].mindist < near) {
564+
idx = 0;
565+
near = distances[i].mindist;
566+
nearest[idx] = distances[i];
567+
idx++;
568+
} else if (distances[i].mindist == near) {
569+
nearest[idx] = distances[i];
570+
idx++;
571+
}
475572
}
476573

477-
/* Allocate memory for proc_cpuset */
478-
proc_cpuset = hwloc_bitmap_alloc();
479-
if (NULL == proc_cpuset) {
574+
*num_distances = idx;
575+
576+
out:
577+
*valin = val;
578+
return nearest;
579+
}
580+
581+
#if OPAL_OFI_PCI_DATA_AVAILABLE
582+
/**
583+
* Determine if a device is nearest
584+
*
585+
* Given a device distances array of the nearest pci devices,
586+
* determine if one of these device distances refers to the pci
587+
* device passed in
588+
*
589+
* @param distances (IN) distances array
590+
* @param num_distances (IN) number of entries in the distances array
591+
* @param topology (IN) topology of the node
592+
* @param pci (IN) PCI device being examined
593+
*
594+
* @return true if the PCI device is in the distances array or if the
595+
* distances array is not provided. False otherwise.
596+
*
597+
*/
598+
static bool is_near(pmix_device_distance_t *distances,
599+
int num_distances,
600+
hwloc_topology_t topology,
601+
struct fi_pci_attr pci)
602+
{
603+
hwloc_obj_t pcidev, osdev;
604+
605+
/* if we failed to find any distances, then we consider all interfaces
606+
* to be of equal distances and let the caller decide how to handle
607+
* them
608+
*/
609+
if (!distances)
610+
return true;
611+
612+
pcidev = hwloc_get_pcidev_by_busid(topology, pci.domain_id,
613+
pci.bus_id, pci.device_id,
614+
pci.function_id);
615+
if (!pcidev)
480616
return false;
481-
}
482617

483-
/* Fill cpuset with the collection of cpu cores that the process runs on */
484-
ret = hwloc_get_cpubind(topology, proc_cpuset, HWLOC_CPUBIND_PROCESS);
485-
if (0 > ret) {
486-
goto error;
487-
}
618+
for(osdev = pcidev->io_first_child; osdev != NULL; osdev = osdev->next_sibling) {
619+
int i;
488620

489-
/* Get the pci device from bdf */
490-
obj = hwloc_get_pcidev_by_busid(topology, pci.domain_id, pci.bus_id, pci.device_id,
491-
pci.function_id);
492-
if (NULL == obj) {
493-
goto error;
494-
}
621+
if (osdev->attr->osdev.type == HWLOC_OBJ_OSDEV_OPENFABRICS) {
622+
const char *nguid = hwloc_obj_get_info_by_name(osdev,"NodeGUID");
623+
const char *sguid = hwloc_obj_get_info_by_name(osdev, "SysImageGUID");
624+
625+
if (!nguid && !sguid)
626+
continue;
495627

496-
/* pcidev objects don't have cpusets so find the first non-io object above */
497-
obj = hwloc_get_non_io_ancestor_obj(topology, obj);
498-
if (NULL != obj) {
499-
result = hwloc_bitmap_intersects(proc_cpuset, obj->cpuset);
628+
for (i = 0; i < num_distances; i++) {
629+
char lsguid[256], lnguid[256];
630+
int ret;
631+
632+
ret = sscanf(distances[i].uuid, "fab://%256s::%256s", lnguid, lsguid);
633+
if (ret != 2)
634+
continue;
635+
if (0 == strcasecmp(lnguid, nguid)) {
636+
return true;
637+
} else if (0 == strcasecmp(lsguid, sguid)) {
638+
return true;
639+
}
640+
}
641+
} else if (osdev->attr->osdev.type == HWLOC_OBJ_OSDEV_NETWORK) {
642+
const char *address = hwloc_obj_get_info_by_name(osdev, "Address");
643+
if (!address)
644+
continue;
645+
for (i = 0; i < num_distances; i++) {
646+
char *addr = strstr(distances[i].uuid, "://");
647+
if (!addr || addr + 3 > distances[i].uuid
648+
+ strlen(distances[i].uuid))
649+
continue;
650+
if (!strcmp(addr+3, address)) {
651+
return true;
652+
}
653+
}
654+
}
500655
}
501656

502-
error:
503-
hwloc_bitmap_free(proc_cpuset);
504-
return result;
657+
return false;
505658
}
506659
#endif
507660

@@ -614,11 +767,15 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
614767
struct fi_info *provider = provider_list, *current_provider = provider_list;
615768
struct fi_info **provider_table;
616769
#if OPAL_OFI_PCI_DATA_AVAILABLE
770+
pmix_device_distance_t *distances = NULL;
771+
pmix_value_t *pmix_val;
617772
struct fi_pci_attr pci;
773+
int num_distances = 0;
774+
bool near;
618775
#endif
619776
int ret;
620777
unsigned int num_provider = 0, provider_limit = 0;
621-
bool provider_found = false, cpusets_match = false;
778+
bool provider_found = false;
622779

623780
/* Initialize opal_hwloc_topology if it is not already */
624781
ret = opal_hwloc_base_get_topology();
@@ -639,33 +796,38 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
639796
return provider_list;
640797
}
641798

799+
#if OPAL_OFI_PCI_DATA_AVAILABLE
800+
/* find all the nearest devices to this thread, then out of these
801+
* determine which device we should bind to.
802+
*/
803+
distances = get_nearest_nics(&num_distances, &pmix_val);
804+
#endif
805+
642806
current_provider = provider;
643807

644808
/* Cycle through remaining fi_info objects, looking for alike providers */
645809
while (NULL != current_provider) {
646810
if (!check_provider_attr(provider, current_provider)) {
647-
cpusets_match = false;
811+
near = false;
648812
#if OPAL_OFI_PCI_DATA_AVAILABLE
649813
if (NULL != current_provider->nic
650814
&& NULL != current_provider->nic->bus_attr
651815
&& current_provider->nic->bus_attr->bus_type == FI_BUS_PCI) {
652816
pci = current_provider->nic->bus_attr->attr.pci;
653-
cpusets_match = compare_cpusets(opal_hwloc_topology, pci);
817+
near = is_near(distances, num_distances,
818+
opal_hwloc_topology, pci);
654819
}
655820
#endif
656-
657-
/* Reset the list if the cpusets match and no other provider was
658-
* found on the same cpuset as the process.
659-
*/
660-
if (cpusets_match && !provider_found) {
821+
/* We could have multiple near providers */
822+
if (near && !provider_found) {
661823
provider_found = true;
662824
num_provider = 0;
663825
}
664826

665827
/* Add the provider to the provider list if the cpusets match or if
666828
* no other provider was found on the same cpuset as the process.
667829
*/
668-
if (cpusets_match || !provider_found) {
830+
if (near || !provider_found) {
669831
provider_table[num_provider] = current_provider;
670832
num_provider++;
671833
}
@@ -687,17 +849,22 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
687849
&& NULL != provider->nic->bus_attr
688850
&& provider->nic->bus_attr->bus_type == FI_BUS_PCI) {
689851
pci = provider->nic->bus_attr->attr.pci;
690-
cpusets_match = compare_cpusets(opal_hwloc_topology, pci);
852+
near = is_near(distances, num_distances,
853+
opal_hwloc_topology, pci);
691854
}
692855
#endif
693856

694857
#if OPAL_ENABLE_DEBUG
695858
opal_output_verbose(1, opal_common_ofi.output,
696-
"package rank: %d device: %s cpusets match: %s\n", package_rank,
697-
provider->domain_attr->name, cpusets_match ? "true" : "false");
859+
"package rank: %d device: %s near: %s\n", package_rank,
860+
provider->domain_attr->name, near ? "true" : "false");
698861
#endif
699862

700863
free(provider_table);
864+
#if OPAL_OFI_PCI_DATA_AVAILABLE
865+
if (pmix_val)
866+
PMIx_Value_free(pmix_val, 1);
867+
#endif
701868
return provider;
702869
}
703870

0 commit comments

Comments
 (0)