@@ -445,63 +445,164 @@ static int check_provider_attr(struct fi_info *provider_info, struct fi_info *pr
445445 }
446446}
447447
448- #if OPAL_OFI_PCI_DATA_AVAILABLE
449- /* Check if a process and a pci device share the same cpuset
450- * @param (IN) pci struct fi_pci_attr pci device attributes,
451- * used to find hwloc object for device.
452- *
453- * @param (IN) topology hwloc_topology_t topology to get the cpusets
454- * from
455- *
456- * @param (OUT) returns true if cpusets match and false if
457- * cpusets do not match or an error prevents comparison
458- *
459- * Uses a pci device to find an ancestor that contains a cpuset, and
460- * determines if it intersects with the cpuset that the process is bound to.
461- * if the process is not bound, or if a cpuset is unavailable for whatever
462- * reason, returns false. Otherwise, returns the result of
463- * hwloc_cpuset_intersects()
464- */
465- static bool compare_cpusets (hwloc_topology_t topology , struct fi_pci_attr pci )
448+ static int calculate_distances (pmix_device_distance_t * * distances ,
449+ size_t * ndist )
466450{
467- bool result = false;
468- int ret ;
469- hwloc_bitmap_t proc_cpuset ;
470- hwloc_obj_t obj = NULL ;
451+ int ret = 0 ;
452+ size_t ninfo ;
453+ pmix_info_t * info ;
454+ pmix_cpuset_t cpuset ;
455+ pmix_topology_t * pmix_topo ;
456+ pmix_device_type_t type = PMIX_DEVTYPE_OPENFABRICS |
457+ PMIX_DEVTYPE_NETWORK ;
458+
459+ PMIX_CPUSET_CONSTRUCT (& cpuset );
460+ ret = PMIx_Get_cpuset (& cpuset , PMIX_CPUBIND_THREAD );
461+ if (PMIX_SUCCESS != ret ) {
462+ goto out ;
463+ }
464+
465+ /* load the PMIX topology */
466+ PMIX_TOPOLOGY_CREATE (pmix_topo , 1 );
467+ ret = PMIx_Load_topology (pmix_topo );
468+ if (PMIX_SUCCESS != ret ) {
469+ goto out ;
470+ }
471+
472+ ninfo = 1 ;
473+ PMIX_INFO_CREATE (info , ninfo );
474+ PMIX_INFO_LOAD (& info [0 ], PMIX_DEVICE_TYPE , & type , PMIX_DEVTYPE );
475+ ret = PMIx_Compute_distances (pmix_topo , & cpuset , info , ninfo , distances ,
476+ ndist );
477+ PMIX_INFO_FREE (info , ninfo );
478+
479+ PMIX_TOPOLOGY_FREE (pmix_topo , 1 );
480+ out :
481+ return ret ;
482+ }
471483
472- /* Cannot find topology info if no topology is found */
473- if (NULL == topology ) {
474- return false;
484+ static pmix_device_distance_t * get_nearest_nics (pmix_value_t * val ,
485+ int * num_distances )
486+ {
487+ size_t ndist ;
488+ pmix_topology_t * topo ;
489+ int ret , i , idx = 0 ;
490+ pmix_data_array_t * dptr ;
491+ uint16_t near = USHRT_MAX ;
492+ pmix_info_t directive ;
493+ pmix_device_distance_t * distances , * nearest = NULL ;
494+
495+ PMIX_INFO_LOAD (& directive , PMIX_OPTIONAL , NULL , PMIX_BOOL );
496+ ret = PMIx_Get (& opal_process_info .myprocid ,
497+ PMIX_DEVICE_DISTANCES , & directive , 1 , & val );
498+ PMIX_INFO_DESTRUCT (& directive );
499+ if (ret != PMIX_SUCCESS || !val ) {
500+ ret = calculate_distances (& distances , & ndist );
501+ if (ret )
502+ goto out ;
503+ goto find_nearest ;
504+ }
505+
506+ if (PMIX_DATA_ARRAY != val -> type ) {
507+ goto out ;
508+ }
509+ dptr = val -> data .darray ;
510+ if (NULL == dptr ) {
511+ goto out ;
512+ }
513+ if (PMIX_DEVICE_DIST != dptr -> type ) {
514+ goto out ;
515+ }
516+
517+ distances = (pmix_device_distance_t * )dptr -> array ;
518+ ndist = dptr -> size ;
519+
520+ find_nearest :
521+ nearest = calloc (sizeof (* distances ), ndist );
522+ if (!nearest )
523+ goto out ;
524+
525+ for (i = 0 ; i < ndist ; i ++ ) {
526+ if (distances [i ].mindist < near ) {
527+ idx = 0 ;
528+ near = distances [i ].mindist ;
529+ nearest [idx ] = distances [i ];
530+ idx ++ ;
531+ } else if (distances [i ].mindist == near ) {
532+ nearest [idx ] = distances [i ];
533+ idx ++ ;
534+ }
475535 }
476536
477- /* Allocate memory for proc_cpuset */
478- proc_cpuset = hwloc_bitmap_alloc ();
479- if (NULL == proc_cpuset ) {
537+ * num_distances = idx ;
538+
539+ goto out ;
540+
541+ out :
542+ return nearest ;
543+ }
544+
545+ #if OPAL_OFI_PCI_DATA_AVAILABLE
546+ static bool is_near (pmix_device_distance_t * distances ,
547+ int num_distances ,
548+ hwloc_topology_t topology ,
549+ struct fi_pci_attr pci )
550+ {
551+ hwloc_obj_t pcidev , osdev ;
552+
553+ /* if we failed to find any distances, then we consider all interfaces
554+ * to be of equal distances and let the caller decide how to handle
555+ * them
556+ */
557+ if (!distances )
558+ return true;
559+
560+ pcidev = hwloc_get_pcidev_by_busid (topology , pci .domain_id ,
561+ pci .bus_id , pci .device_id ,
562+ pci .function_id );
563+ if (!pcidev )
480564 return false;
481- }
482565
483- /* Fill cpuset with the collection of cpu cores that the process runs on */
484- ret = hwloc_get_cpubind (topology , proc_cpuset , HWLOC_CPUBIND_PROCESS );
485- if (0 > ret ) {
486- goto error ;
487- }
566+ for (osdev = pcidev -> io_first_child ; osdev != NULL ; osdev = osdev -> next_sibling ) {
567+ int i ;
488568
489- /* Get the pci device from bdf */
490- obj = hwloc_get_pcidev_by_busid (topology , pci .domain_id , pci .bus_id , pci .device_id ,
491- pci .function_id );
492- if (NULL == obj ) {
493- goto error ;
494- }
569+ if (osdev -> attr -> osdev .type == HWLOC_OBJ_OSDEV_OPENFABRICS ) {
570+ const char * nguid = hwloc_obj_get_info_by_name (osdev ,"NodeGUID" );
571+ const char * sguid = hwloc_obj_get_info_by_name (osdev , "SysImageGUID" );
572+
573+ if (!nguid && !sguid )
574+ continue ;
575+
576+ for (i = 0 ; i < num_distances ; i ++ ) {
577+ char lsguid [256 ], lnguid [256 ];
578+ int ret ;
495579
496- /* pcidev objects don't have cpusets so find the first non-io object above */
497- obj = hwloc_get_non_io_ancestor_obj (topology , obj );
498- if (NULL != obj ) {
499- result = hwloc_bitmap_intersects (proc_cpuset , obj -> cpuset );
580+ ret = scanf (distances [i ].uuid , "fab://%256s::%256s" , lnguid , lsguid );
581+ if (ret != 2 )
582+ continue ;
583+ if (0 == strcasecmp (lnguid , nguid )) {
584+ return true;
585+ } else if (0 == strcasecmp (lsguid , sguid )) {
586+ return true;
587+ }
588+ }
589+ } else if (osdev -> attr -> osdev .type == HWLOC_OBJ_OSDEV_NETWORK ) {
590+ const char * address = hwloc_obj_get_info_by_name (osdev , "Address" );
591+ if (!address )
592+ continue ;
593+ for (i = 0 ; i < num_distances ; i ++ ) {
594+ char * addr = strstr (distances [i ].uuid , "://" );
595+ if (!addr || addr + 3 > distances [i ].uuid
596+ + strlen (distances [i ].uuid ))
597+ continue ;
598+ if (!strcmp (addr + 3 , address )) {
599+ return true;
600+ }
601+ }
602+ }
500603 }
501604
502- error :
503- hwloc_bitmap_free (proc_cpuset );
504- return result ;
605+ return false;
505606}
506607#endif
507608
@@ -614,7 +715,11 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
614715 struct fi_info * provider = provider_list , * current_provider = provider_list ;
615716 struct fi_info * * provider_table ;
616717#if OPAL_OFI_PCI_DATA_AVAILABLE
718+ pmix_device_distance_t * distances = NULL ;
719+ pmix_value_t * pmix_val = NULL ;
617720 struct fi_pci_attr pci ;
721+ int num_distances = 0 ;
722+ bool near ;
618723#endif
619724 int ret ;
620725 unsigned int num_provider = 0 , provider_limit = 0 ;
@@ -639,33 +744,38 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
639744 return provider_list ;
640745 }
641746
747+ #if OPAL_OFI_PCI_DATA_AVAILABLE
748+ distances = get_nearest_nics (pmix_val , & num_distances );
749+ if (pmix_val ) {
750+ PMIX_VALUE_RELEASE (pmix_val );
751+ }
752+ #endif
753+
642754 current_provider = provider ;
643755
644756 /* Cycle through remaining fi_info objects, looking for alike providers */
645757 while (NULL != current_provider ) {
646758 if (!check_provider_attr (provider , current_provider )) {
647- cpusets_match = false;
759+ near = false;
648760#if OPAL_OFI_PCI_DATA_AVAILABLE
649761 if (NULL != current_provider -> nic
650762 && NULL != current_provider -> nic -> bus_attr
651763 && current_provider -> nic -> bus_attr -> bus_type == FI_BUS_PCI ) {
652764 pci = current_provider -> nic -> bus_attr -> attr .pci ;
653- cpusets_match = compare_cpusets (opal_hwloc_topology , pci );
765+ near = is_near (distances , num_distances ,
766+ opal_hwloc_topology , pci );
654767 }
655768#endif
656-
657- /* Reset the list if the cpusets match and no other provider was
658- * found on the same cpuset as the process.
659- */
660- if (cpusets_match && !provider_found ) {
769+ /* We could have multiple near providers */
770+ if (near && !provider_found ) {
661771 provider_found = true;
662772 num_provider = 0 ;
663773 }
664774
665775 /* Add the provider to the provider list if the cpusets match or if
666776 * no other provider was found on the same cpuset as the process.
667777 */
668- if (cpusets_match || !provider_found ) {
778+ if (near || !provider_found ) {
669779 provider_table [num_provider ] = current_provider ;
670780 num_provider ++ ;
671781 }
@@ -687,17 +797,22 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
687797 && NULL != provider -> nic -> bus_attr
688798 && provider -> nic -> bus_attr -> bus_type == FI_BUS_PCI ) {
689799 pci = provider -> nic -> bus_attr -> attr .pci ;
690- cpusets_match = compare_cpusets (opal_hwloc_topology , pci );
800+ near = is_near (distances , num_distances ,
801+ opal_hwloc_topology , pci );
691802 }
692803#endif
693804
694805#if OPAL_ENABLE_DEBUG
695806 opal_output_verbose (1 , opal_common_ofi .output ,
696- "package rank: %d device: %s cpusets match : %s\n" , package_rank ,
697- provider -> domain_attr -> name , cpusets_match ? "true" : "false" );
807+ "package rank: %d device: %s near : %s\n" , package_rank ,
808+ provider -> domain_attr -> name , near ? "true" : "false" );
698809#endif
699810
700811 free (provider_table );
812+ #if OPAL_OFI_PCI_DATA_AVAILABLE
813+ if (distances )
814+ free (distances );
815+ #endif
701816 return provider ;
702817}
703818
0 commit comments