@@ -446,63 +446,116 @@ static int check_provider_attr(struct fi_info *provider_info, struct fi_info *pr
446446}
447447
448448#if OPAL_OFI_PCI_DATA_AVAILABLE
449- /* Check if a process and a pci device share the same cpuset
450- * @param (IN) pci struct fi_pci_attr pci device attributes,
451- * used to find hwloc object for device.
452- *
453- * @param (IN) topology hwloc_topology_t topology to get the cpusets
454- * from
455- *
456- * @param (OUT) returns true if cpusets match and false if
457- * cpusets do not match or an error prevents comparison
458- *
459- * Uses a pci device to find an ancestor that contains a cpuset, and
460- * determines if it intersects with the cpuset that the process is bound to.
461- * if the process is not bound, or if a cpuset is unavailable for whatever
462- * reason, returns false. Otherwise, returns the result of
463- * hwloc_cpuset_intersects()
464- */
465- static bool compare_cpusets (hwloc_topology_t topology , struct fi_pci_attr pci )
449+
450+ #define MAX_NUM_NUMA 8
451+ static pmix_device_distance_t * get_nearest_nics (int * num_distances )
466452{
467- bool result = false;
468- int ret ;
453+ size_t ndist ;
454+ pmix_value_t * val ;
455+ int ret , i , idx = 0 ;
456+ pmix_cpuset_t cpuset ;
457+ pmix_topology_t * topo ;
458+ pmix_data_array_t * dptr ;
459+ uint16_t near = USHRT_MAX ;
469460 hwloc_bitmap_t proc_cpuset ;
470- hwloc_obj_t obj = NULL ;
471-
472- /* Cannot find topology info if no topology is found */
473- if (NULL == topology ) {
474- return false;
475- }
476-
477- /* Allocate memory for proc_cpuset */
478- proc_cpuset = hwloc_bitmap_alloc ();
479- if (NULL == proc_cpuset ) {
480- return false;
481- }
482-
483- /* Fill cpuset with the collection of cpu cores that the process runs on */
484- ret = hwloc_get_cpubind (topology , proc_cpuset , HWLOC_CPUBIND_PROCESS );
485- if (0 > ret ) {
486- goto error ;
487- }
488-
489- /* Get the pci device from bdf */
490- obj = hwloc_get_pcidev_by_busid (topology , pci .domain_id , pci .bus_id , pci .device_id ,
491- pci .function_id );
492- if (NULL == obj ) {
493- goto error ;
494- }
461+ pmix_info_t * info , directive ;
462+ pmix_device_distance_t * distances , * nearest = NULL ;
463+ pmix_device_type_t type = PMIX_DEVTYPE_OPENFABRICS |
464+ PMIX_DEVTYPE_NETWORK |
465+ PMIX_DEVTYPE_COPROC |
466+ PMIX_DEVTYPE_GPU |
467+ PMIX_DEVTYPE_BLOCK |
468+ PMIX_DEVTYPE_DMA ;
469+
470+ PMIX_INFO_LOAD (& directive , PMIX_OPTIONAL , NULL , PMIX_BOOL );
471+ ret = PMIx_Get (& opal_process_info .myprocid ,
472+ PMIX_DEVICE_DISTANCES , & directive , 1 , & val );
473+ PMIX_INFO_DESTRUCT (& directive );
474+ if (ret != PMIX_SUCCESS ) {
475+ goto out ;
476+ }
477+
478+ if (PMIX_DATA_ARRAY != val -> type ) {
479+ PMIX_VALUE_RELEASE (val );
480+ goto release ;
481+ }
482+ dptr = val -> data .darray ;
483+ if (NULL == dptr ) {
484+ PMIX_VALUE_RELEASE (val );
485+ goto release ;
486+ }
487+ if (PMIX_DEVICE_DIST != dptr -> type ) {
488+ PMIX_VALUE_RELEASE (val );
489+ goto release ;
490+ }
491+ distances = (pmix_device_distance_t * )dptr -> array ;
492+
493+ for (i = 0 ; i < dptr -> size ; i ++ )
494+ fprintf (stderr , "%d: %d:%s:%d:%d\n" , getpid (), i , distances [i ].uuid ,
495+ distances [i ].mindist , distances [i ].maxdist );
496+
497+ nearest = calloc (sizeof (* distances ), dptr -> size );
498+ if (!nearest )
499+ goto release ;
500+
501+ for (i = 0 ; i < dptr -> size ; i ++ ) {
502+ if (distances [i ].mindist < near ) {
503+ idx = 0 ;
504+ near = distances [i ].mindist ;
505+ nearest [idx ] = distances [i ];
506+ idx ++ ;
507+ } else if (distances [i ].mindist == near ) {
508+ nearest [idx ] = distances [i ];
509+ idx ++ ;
510+ }
511+ }
512+
513+ * num_distances = idx ;
514+
515+ goto out ;
516+
517+ release :
518+ PMIX_VALUE_RELEASE (val );
519+ out :
520+ return nearest ;
521+ }
495522
496- /* pcidev objects don't have cpusets so find the first non-io object above */
497- obj = hwloc_get_non_io_ancestor_obj (topology , obj );
498- if (NULL != obj ) {
499- result = hwloc_bitmap_intersects (proc_cpuset , obj -> cpuset );
500- }
523+ #if OPAL_OFI_PCI_DATA_AVAILABLE
501524
502- error :
503- hwloc_bitmap_free (proc_cpuset );
504- return result ;
525+ static bool is_near (pmix_device_distance_t * distances ,
526+ int num_distances ,
527+ hwloc_topology_t topology ,
528+ struct fi_pci_attr pci )
529+ {
530+ hwloc_obj_t pcidev , osdev ;
531+
532+ pcidev = hwloc_get_pcidev_by_busid (topology , pci .domain_id ,
533+ pci .bus_id , pci .device_id ,
534+ pci .function_id );
535+ if (!pcidev )
536+ return false;
537+
538+ for (osdev = pcidev -> io_first_child ; osdev != NULL ; osdev = osdev -> next_sibling ) {
539+ int i ;
540+ const char * address = hwloc_obj_get_info_by_name (osdev , "Address" );
541+ if (!address )
542+ continue ;
543+ for (i = 0 ; i < num_distances ; i ++ ) {
544+ char * addr = strstr (distances [i ].uuid , "://" );
545+ if (!addr || addr + 3 > distances [i ].uuid
546+ + strlen (distances [i ].uuid ))
547+ continue ;
548+ if (!strcmp (addr + 3 , address )) {
549+ fprintf (stderr , "%d matched distance addr %s with %s\n" ,
550+ getpid (), addr + 3 , address );
551+ return true;
552+ }
553+ }
554+ }
555+
556+ return false;
505557}
558+
506559#endif
507560
508561/* Count providers returns the number of providers present in an fi_info list
@@ -613,10 +666,13 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
613666{
614667 struct fi_info * provider = provider_list , * current_provider = provider_list ;
615668 struct fi_info * * provider_table ;
669+ pmix_device_distance_t * distances = NULL ;
616670#if OPAL_OFI_PCI_DATA_AVAILABLE
617671 struct fi_pci_attr pci ;
672+ bool near ;
618673#endif
619674 int ret ;
675+ int num_distances = 0 ;
620676 unsigned int num_provider = 0 , provider_limit = 0 ;
621677 bool provider_found = false, cpusets_match = false;
622678
@@ -641,31 +697,30 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
641697
642698 current_provider = provider ;
643699
700+ distances = get_nearest_nics (& num_distances );
644701 /* Cycle through remaining fi_info objects, looking for alike providers */
645702 while (NULL != current_provider ) {
646703 if (!check_provider_attr (provider , current_provider )) {
647- cpusets_match = false;
704+ near = false;
648705#if OPAL_OFI_PCI_DATA_AVAILABLE
649706 if (NULL != current_provider -> nic
650707 && NULL != current_provider -> nic -> bus_attr
651708 && current_provider -> nic -> bus_attr -> bus_type == FI_BUS_PCI ) {
652709 pci = current_provider -> nic -> bus_attr -> attr .pci ;
653- cpusets_match = compare_cpusets (opal_hwloc_topology , pci );
710+ near = is_near (distances , num_distances ,
711+ opal_hwloc_topology , pci );
654712 }
655713#endif
656-
657- /* Reset the list if the cpusets match and no other provider was
658- * found on the same cpuset as the process.
659- */
660- if (cpusets_match && !provider_found ) {
714+ /* We could have multiple near providers */
715+ if (near && !provider_found ) {
661716 provider_found = true;
662717 num_provider = 0 ;
663718 }
664719
665720 /* Add the provider to the provider list if the cpusets match or if
666721 * no other provider was found on the same cpuset as the process.
667722 */
668- if (cpusets_match || !provider_found ) {
723+ if (near || !provider_found ) {
669724 provider_table [num_provider ] = current_provider ;
670725 num_provider ++ ;
671726 }
@@ -687,17 +742,19 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
687742 && NULL != provider -> nic -> bus_attr
688743 && provider -> nic -> bus_attr -> bus_type == FI_BUS_PCI ) {
689744 pci = provider -> nic -> bus_attr -> attr .pci ;
690- cpusets_match = compare_cpusets ( opal_hwloc_topology , pci );
745+ near = is_near ( distances , num_distances , opal_hwloc_topology , pci );
691746 }
692747#endif
693748
694749#if OPAL_ENABLE_DEBUG
695750 opal_output_verbose (1 , opal_common_ofi .output ,
696- "package rank: %d device: %s cpusets match : %s\n" , package_rank ,
697- provider -> domain_attr -> name , cpusets_match ? "true" : "false" );
751+ "package rank: %d device: %s near : %s\n" , package_rank ,
752+ provider -> domain_attr -> name , near ? "true" : "false" );
698753#endif
699754
700755 free (provider_table );
756+ if (distances )
757+ free (distances );
701758 return provider ;
702759}
703760
0 commit comments