@@ -445,63 +445,122 @@ static int check_provider_attr(struct fi_info *provider_info, struct fi_info *pr
445445 }
446446}
447447
448+ static pmix_device_distance_t * get_nearest_nics (int * num_distances )
449+ {
450+ size_t ndist ;
451+ pmix_value_t * val ;
452+ int ret , i , idx = 0 ;
453+ pmix_data_array_t * dptr ;
454+ uint16_t near = USHRT_MAX ;
455+ pmix_info_t directive ;
456+ pmix_device_distance_t * distances , * nearest = NULL ;
457+ pmix_device_type_t type = PMIX_DEVTYPE_OPENFABRICS |
458+ PMIX_DEVTYPE_NETWORK |
459+ PMIX_DEVTYPE_COPROC |
460+ PMIX_DEVTYPE_GPU |
461+ PMIX_DEVTYPE_BLOCK |
462+ PMIX_DEVTYPE_DMA ;
463+
464+ PMIX_INFO_LOAD (& directive , PMIX_OPTIONAL , NULL , PMIX_BOOL );
465+ ret = PMIx_Get (& opal_process_info .myprocid ,
466+ PMIX_DEVICE_DISTANCES , & directive , 1 , & val );
467+ PMIX_INFO_DESTRUCT (& directive );
468+ if (ret != PMIX_SUCCESS ) {
469+ goto out ;
470+ }
471+
472+ if (PMIX_DATA_ARRAY != val -> type ) {
473+ goto release ;
474+ }
475+ dptr = val -> data .darray ;
476+ if (NULL == dptr ) {
477+ goto release ;
478+ }
479+ if (PMIX_DEVICE_DIST != dptr -> type ) {
480+ goto release ;
481+ }
482+ distances = (pmix_device_distance_t * )dptr -> array ;
483+
484+ nearest = calloc (sizeof (* distances ), dptr -> size );
485+ if (!nearest )
486+ goto release ;
487+
488+ for (i = 0 ; i < dptr -> size ; i ++ ) {
489+ if (distances [i ].mindist < near ) {
490+ idx = 0 ;
491+ near = distances [i ].mindist ;
492+ nearest [idx ] = distances [i ];
493+ idx ++ ;
494+ } else if (distances [i ].mindist == near ) {
495+ nearest [idx ] = distances [i ];
496+ idx ++ ;
497+ }
498+ }
499+
500+ * num_distances = idx ;
501+
502+ goto out ;
503+
504+ release :
505+ PMIX_VALUE_RELEASE (val );
506+ out :
507+ return nearest ;
508+ }
509+
448510#if OPAL_OFI_PCI_DATA_AVAILABLE
449- /* Check if a process and a pci device share the same cpuset
450- * @param (IN) pci struct fi_pci_attr pci device attributes,
451- * used to find hwloc object for device.
452- *
453- * @param (IN) topology hwloc_topology_t topology to get the cpusets
454- * from
455- *
456- * @param (OUT) returns true if cpusets match and false if
457- * cpusets do not match or an error prevents comparison
458- *
459- * Uses a pci device to find an ancestor that contains a cpuset, and
460- * determines if it intersects with the cpuset that the process is bound to.
461- * if the process is not bound, or if a cpuset is unavailable for whatever
462- * reason, returns false. Otherwise, returns the result of
463- * hwloc_cpuset_intersects()
464- */
465- static bool compare_cpusets (hwloc_topology_t topology , struct fi_pci_attr pci )
511+ static bool is_near (pmix_device_distance_t * distances ,
512+ int num_distances ,
513+ hwloc_topology_t topology ,
514+ struct fi_pci_attr pci )
466515{
467- bool result = false;
468- int ret ;
469- hwloc_bitmap_t proc_cpuset ;
470- hwloc_obj_t obj = NULL ;
516+ hwloc_obj_t pcidev , osdev ;
471517
472- /* Cannot find topology info if no topology is found */
473- if (NULL == topology ) {
518+ pcidev = hwloc_get_pcidev_by_busid (topology , pci .domain_id ,
519+ pci .bus_id , pci .device_id ,
520+ pci .function_id );
521+ if (!pcidev )
474522 return false;
475- }
476523
477- /* Allocate memory for proc_cpuset */
478- proc_cpuset = hwloc_bitmap_alloc ();
479- if (NULL == proc_cpuset ) {
480- return false;
481- }
524+ for (osdev = pcidev -> io_first_child ; osdev != NULL ; osdev = osdev -> next_sibling ) {
525+ int i ;
482526
483- /* Fill cpuset with the collection of cpu cores that the process runs on */
484- ret = hwloc_get_cpubind (topology , proc_cpuset , HWLOC_CPUBIND_PROCESS );
485- if (0 > ret ) {
486- goto error ;
487- }
527+ if (osdev -> attr -> osdev .type == HWLOC_OBJ_OSDEV_OPENFABRICS ) {
528+ const char * nguid = hwloc_obj_get_info_by_name (osdev ,"NodeGUID" );
529+ const char * sguid = hwloc_obj_get_info_by_name (osdev , "SysImageGUID" );
488530
489- /* Get the pci device from bdf */
490- obj = hwloc_get_pcidev_by_busid (topology , pci .domain_id , pci .bus_id , pci .device_id ,
491- pci .function_id );
492- if (NULL == obj ) {
493- goto error ;
494- }
531+ if (!nguid && !sguid )
532+ continue ;
495533
496- /* pcidev objects don't have cpusets so find the first non-io object above */
497- obj = hwloc_get_non_io_ancestor_obj (topology , obj );
498- if (NULL != obj ) {
499- result = hwloc_bitmap_intersects (proc_cpuset , obj -> cpuset );
534+ for (i = 0 ; i < num_distances ; i ++ ) {
535+ char lsguid [256 ], lnguid [256 ];
536+ int ret ;
537+
538+ ret = scanf (distances [i ].uuid , "fab://%256s::%256s" , lnguid , lsguid );
539+ if (ret != 2 )
540+ continue ;
541+ if (0 == strcasecmp (lnguid , nguid )) {
542+ return true;
543+ } else if (0 == strcasecmp (lsguid , sguid )) {
544+ return true;
545+ }
546+ }
547+ } else if (osdev -> attr -> osdev .type == HWLOC_OBJ_OSDEV_NETWORK ) {
548+ const char * address = hwloc_obj_get_info_by_name (osdev , "Address" );
549+ if (!address )
550+ continue ;
551+ for (i = 0 ; i < num_distances ; i ++ ) {
552+ char * addr = strstr (distances [i ].uuid , "://" );
553+ if (!addr || addr + 3 > distances [i ].uuid
554+ + strlen (distances [i ].uuid ))
555+ continue ;
556+ if (!strcmp (addr + 3 , address )) {
557+ return true;
558+ }
559+ }
560+ }
500561 }
501562
502- error :
503- hwloc_bitmap_free (proc_cpuset );
504- return result ;
563+ return false;
505564}
506565#endif
507566
@@ -613,10 +672,13 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
613672{
614673 struct fi_info * provider = provider_list , * current_provider = provider_list ;
615674 struct fi_info * * provider_table ;
675+ pmix_device_distance_t * distances = NULL ;
616676#if OPAL_OFI_PCI_DATA_AVAILABLE
617677 struct fi_pci_attr pci ;
678+ bool near ;
618679#endif
619680 int ret ;
681+ int num_distances = 0 ;
620682 unsigned int num_provider = 0 , provider_limit = 0 ;
621683 bool provider_found = false, cpusets_match = false;
622684
@@ -641,31 +703,30 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
641703
642704 current_provider = provider ;
643705
706+ distances = get_nearest_nics (& num_distances );
644707 /* Cycle through remaining fi_info objects, looking for alike providers */
645708 while (NULL != current_provider ) {
646709 if (!check_provider_attr (provider , current_provider )) {
647- cpusets_match = false;
710+ near = false;
648711#if OPAL_OFI_PCI_DATA_AVAILABLE
649712 if (NULL != current_provider -> nic
650713 && NULL != current_provider -> nic -> bus_attr
651714 && current_provider -> nic -> bus_attr -> bus_type == FI_BUS_PCI ) {
652715 pci = current_provider -> nic -> bus_attr -> attr .pci ;
653- cpusets_match = compare_cpusets (opal_hwloc_topology , pci );
716+ near = is_near (distances , num_distances ,
717+ opal_hwloc_topology , pci );
654718 }
655719#endif
656-
657- /* Reset the list if the cpusets match and no other provider was
658- * found on the same cpuset as the process.
659- */
660- if (cpusets_match && !provider_found ) {
720+ /* We could have multiple near providers */
721+ if (near && !provider_found ) {
661722 provider_found = true;
662723 num_provider = 0 ;
663724 }
664725
665726 /* Add the provider to the provider list if the cpusets match or if
666727 * no other provider was found on the same cpuset as the process.
667728 */
668- if (cpusets_match || !provider_found ) {
729+ if (near || !provider_found ) {
669730 provider_table [num_provider ] = current_provider ;
670731 num_provider ++ ;
671732 }
@@ -687,17 +748,20 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
687748 && NULL != provider -> nic -> bus_attr
688749 && provider -> nic -> bus_attr -> bus_type == FI_BUS_PCI ) {
689750 pci = provider -> nic -> bus_attr -> attr .pci ;
690- cpusets_match = compare_cpusets (opal_hwloc_topology , pci );
751+ near = is_near (distances , num_distances ,
752+ opal_hwloc_topology , pci );
691753 }
692754#endif
693755
694756#if OPAL_ENABLE_DEBUG
695757 opal_output_verbose (1 , opal_common_ofi .output ,
696- "package rank: %d device: %s cpusets match : %s\n" , package_rank ,
697- provider -> domain_attr -> name , cpusets_match ? "true" : "false" );
758+ "package rank: %d device: %s near : %s\n" , package_rank ,
759+ provider -> domain_attr -> name , near ? "true" : "false" );
698760#endif
699761
700762 free (provider_table );
763+ if (distances )
764+ free (distances );
701765 return provider ;
702766}
703767
0 commit comments