44 *                         reserved. 
55 * Copyright (c) 2020-2022 Triad National Security, LLC. All rights 
66 *                         reserved. 
7-  * Copyright (c) 2020-2021 Cisco Systems, Inc.  All rights reserved 
7+  * Copyright (c) 2020-2021 Cisco Systems, Inc.  All rights reserved.  
88 * Copyright (c) 2021      Nanook Consulting.  All rights reserved. 
99 * Copyright (c) 2021      Amazon.com, Inc. or its affiliates. All rights 
1010 *                         reserved. 
11+  * Copyright (c) 2023      UT-Battelle, LLC.  All rights reserved. 
1112 * $COPYRIGHT$ 
1213 * 
1314 * Additional copyrights may follow 
@@ -445,63 +446,216 @@ static int check_provider_attr(struct fi_info *provider_info, struct fi_info *pr
445446    }
446447}
447448
448- #if  OPAL_OFI_PCI_DATA_AVAILABLE 
449- /* Check if a process and a pci device share the same cpuset 
450-  *     @param (IN) pci              struct fi_pci_attr pci device attributes, 
451-  *                                  used to find hwloc object for device. 
449+ /** 
450+  * Calculate device distances 
451+  * 
452+  * Calculate the distances between the current thread and all devices of 
453+  * type OPENFABRICS or NETWORK. 
454+  * 
455+  * The shortest distances are the nearest and therefore most efficient 
456+  * devices to use. 
452457 * 
453-  *     @param (IN) topology         hwloc_topology_t topology to get the cpusets  
454-  *                                  from  
458+  * Return an array of all the distances. Each entry is of type  
459+  * pmix_device_distance_t  
455460 * 
456-  *     @param (OUT)                 returns true if cpusets match and false if 
457-  *                                  cpusets do not match or an error prevents comparison 
461+  * This function is used if there is no PMIx server running. 
462+  * 
463+  * @param distances (OUT)     distances array 
464+  * @param ndist (OUT)    number of entries in the distances array 
465+  * 
466+  * @return   0 on success. Error otherwise. 
458467 * 
459-  *     Uses a pci device to find an ancestor that contains a cpuset, and 
460-  *     determines if it intersects with the cpuset that the process is bound to. 
461-  *     if the process is not bound, or if a cpuset is unavailable for whatever 
462-  *     reason, returns false. Otherwise, returns the result of 
463-  *     hwloc_cpuset_intersects() 
464468 */ 
465- static  bool  compare_cpusets (hwloc_topology_t  topology , struct  fi_pci_attr  pci )
469+ static  int  compute_dev_distances (pmix_device_distance_t  * * distances ,
470+                                   size_t  * ndist )
466471{
467-     bool  result  =  false;
468-     int  ret ;
469-     hwloc_bitmap_t  proc_cpuset ;
470-     hwloc_obj_t  obj  =  NULL ;
472+     int  ret  =  0 ;
473+     size_t  ninfo ;
474+     pmix_info_t  * info ;
475+     pmix_cpuset_t  cpuset ;
476+     pmix_topology_t  * pmix_topo ;
477+     pmix_device_type_t  type  =  PMIX_DEVTYPE_OPENFABRICS  |
478+       PMIX_DEVTYPE_NETWORK ;
479+ 
480+     PMIX_CPUSET_CONSTRUCT (& cpuset );
481+     ret  =  PMIx_Get_cpuset (& cpuset , PMIX_CPUBIND_THREAD );
482+     if  (PMIX_SUCCESS  !=  ret ) {
483+         goto out ;
484+     }
485+ 
486+     /* load the PMIX topology */ 
487+     PMIx_Topology_free (pmix_topo , 1 );
488+     ret  =  PMIx_Load_topology (pmix_topo );
489+     if  (PMIX_SUCCESS  !=  ret ) {
490+         goto out ;
491+     }
492+ 
493+     ninfo  =  1 ;
494+     info  =  PMIx_Info_create (ninfo );
495+     PMIx_Info_load (& info [0 ], PMIX_DEVICE_TYPE , & type , PMIX_DEVTYPE );
496+     ret  =  PMIx_Compute_distances (pmix_topo , & cpuset , info , ninfo , distances ,
497+                                  ndist );
498+     PMIx_Info_free (info , ninfo );
499+ 
500+     PMIx_Topology_free (pmix_topo , 1 );
501+ out :
502+     return  ret ;
503+ }
471504
472-     /* Cannot find topology info if no topology is found */ 
473-     if  (NULL  ==  topology ) {
474-         return  false;
505+ /** 
506+  * Find the nearest devices to the current thread 
507+  * 
508+  * Use the PMIx server or calculate the device distances, then out of the set of 
509+  * returned distances find the subset of the nearest devices. This can be 
510+  * 1 or more. 
511+  * 
512+  * @param num_distances (OUT)     number of entries in the returned array 
513+  * 
514+  * @return   An array of device distances which are nearest this thread 
515+  *           or NULL if we fail to get the distances. In this case we will just 
516+  *           revert to round robin. 
517+  * 
518+  */ 
519+ static  pmix_device_distance_t  * 
520+ get_nearest_nics (int  * num_distances , pmix_value_t  * * valin )
521+ {
522+     size_t  ndist , i ;
523+     int  ret , idx  =  0 ;
524+     pmix_data_array_t  * dptr ;
525+     uint16_t  near  =  USHRT_MAX ;
526+     pmix_info_t  directive ;
527+     pmix_value_t  * val  =  NULL ;
528+     pmix_device_distance_t  * distances , * nearest  =  NULL ;
529+ 
530+     PMIx_Info_load (& directive , PMIX_OPTIONAL , NULL , PMIX_BOOL );
531+     ret  =  PMIx_Get (& opal_process_info .myprocid ,
532+              PMIX_DEVICE_DISTANCES , & directive , 1 , & val );
533+     PMIx_Info_destruct (& directive );
534+     if  (ret  !=  PMIX_SUCCESS  ||  !val ) {
535+         ret  =  compute_dev_distances (& distances , & ndist );
536+         if  (ret )
537+             goto out ;
538+         goto find_nearest ;
539+     }
540+ 
541+     if  (PMIX_DATA_ARRAY  !=  val -> type ) {
542+         goto out ;
543+     }
544+     dptr  =  val -> data .darray ;
545+     if  (NULL  ==  dptr ) {
546+         goto out ;
547+     }
548+     if  (PMIX_DEVICE_DIST  !=  dptr -> type ) {
549+         goto out ;
550+     }
551+ 
552+     distances  =  (pmix_device_distance_t * )dptr -> array ;
553+     ndist  =  dptr -> size ;
554+ 
555+ find_nearest :
556+     nearest  =  calloc (sizeof (* distances ), ndist );
557+     if  (!nearest )
558+         goto out ;
559+ 
560+     for  (i  =  0 ; i  <  ndist ; i ++ ) {
561+         if  (distances [i ].type  !=  PMIX_DEVTYPE_NETWORK  && 
562+             distances [i ].type  !=  PMIX_DEVTYPE_OPENFABRICS )
563+             continue ;
564+         if  (distances [i ].mindist  <  near ) {
565+             idx  =  0 ;
566+             near  =  distances [i ].mindist ;
567+             nearest [idx ] =  distances [i ];
568+             idx ++ ;
569+         } else  if  (distances [i ].mindist  ==  near ) {
570+             nearest [idx ] =  distances [i ];
571+             idx ++ ;
572+         }
475573    }
476574
477-     /* Allocate memory for proc_cpuset */ 
478-     proc_cpuset  =  hwloc_bitmap_alloc ();
479-     if  (NULL  ==  proc_cpuset ) {
575+     * num_distances  =  idx ;
576+ 
577+ out :
578+     * valin  =  val ;
579+     return  nearest ;
580+ }
581+ 
582+ #if  OPAL_OFI_PCI_DATA_AVAILABLE 
583+ /** 
584+  * Determine if a device is nearest 
585+  * 
586+  * Given a device distances array of the nearest pci devices, 
587+  * determine if one of these device distances refers to the pci 
588+  * device passed in 
589+  * 
590+  * @param distances (IN)     distances array 
591+  * @param num_distances (IN) number of entries in the distances array 
592+  * @param topology (IN)      topology of the node 
593+  * @param pci (IN)           PCI device being examined 
594+  * 
595+  * @return   true if the PCI device is in the distances array or if the 
596+  *           distances array is not provided. False otherwise. 
597+  * 
598+  */ 
599+ static  bool  is_near (pmix_device_distance_t  * distances ,
600+                     int  num_distances ,
601+                     hwloc_topology_t  topology ,
602+                     struct  fi_pci_attr  pci )
603+ {
604+     hwloc_obj_t  pcidev , osdev ;
605+ 
606+     /* if we failed to find any distances, then we consider all interfaces 
607+      * to be of equal distances and let the caller decide how to handle 
608+      * them 
609+      */ 
610+     if  (!distances )
611+         return  true;
612+ 
613+     pcidev  =  hwloc_get_pcidev_by_busid (topology , pci .domain_id ,
614+                         pci .bus_id , pci .device_id ,
615+                         pci .function_id );
616+     if  (!pcidev )
480617        return  false;
481-     }
482618
483-     /* Fill cpuset with the collection of cpu cores that the process runs on */ 
484-     ret  =  hwloc_get_cpubind (topology , proc_cpuset , HWLOC_CPUBIND_PROCESS );
485-     if  (0  >  ret ) {
486-         goto error ;
487-     }
619+     for (osdev  =  pcidev -> io_first_child ; osdev  !=  NULL ; osdev  =  osdev -> next_sibling ) {
620+         int  i ;
488621
489-     /* Get the pci device from bdf */ 
490-     obj   =   hwloc_get_pcidev_by_busid ( topology ,  pci . domain_id ,  pci . bus_id ,  pci . device_id , 
491-                                      pci . function_id );
492-      if  ( NULL   ==   obj ) { 
493-         goto  error ; 
494-     } 
622+          if  ( osdev -> attr -> osdev . type   ==   HWLOC_OBJ_OSDEV_OPENFABRICS ) { 
623+              const   char   * nguid   =   hwloc_obj_get_info_by_name ( osdev , "NodeGUID" ); 
624+             const   char   * sguid   =   hwloc_obj_get_info_by_name ( osdev ,  "SysImageGUID" );
625+ 
626+              if  (! nguid   &&  ! sguid ) 
627+                  continue ; 
495628
496-     /* pcidev objects don't have cpusets so find the first non-io object above */ 
497-     obj  =  hwloc_get_non_io_ancestor_obj (topology , obj );
498-     if  (NULL  !=  obj ) {
499-         result  =  hwloc_bitmap_intersects (proc_cpuset , obj -> cpuset );
629+             for  (i  =  0 ; i  <  num_distances ; i ++ ) {
630+                 char  lsguid [256 ], lnguid [256 ];
631+                 int  ret ;
632+ 
633+                 ret  =  sscanf (distances [i ].uuid , "fab://%256s::%256s" , lnguid , lsguid );
634+                 if  (ret  !=  2 )
635+                     continue ;
636+                 if  (0  ==  strcasecmp (lnguid , nguid )) {
637+                     return  true;
638+                 } else  if  (0  ==  strcasecmp (lsguid , sguid )) {
639+                     return  true;
640+                 }
641+             }
642+         } else  if  (osdev -> attr -> osdev .type  ==  HWLOC_OBJ_OSDEV_NETWORK ) {
643+             const  char  * address  =  hwloc_obj_get_info_by_name (osdev , "Address" );
644+             if  (!address )
645+                 continue ;
646+             for  (i  =  0 ; i  <  num_distances ; i ++ ) {
647+                 char  * addr  =  strstr (distances [i ].uuid , "://" );
648+                 if  (!addr  ||  addr  +  3  >  distances [i ].uuid 
649+                     +  strlen (distances [i ].uuid ))
650+                     continue ;
651+                 if  (!strcmp (addr + 3 , address )) {
652+                     return  true;
653+                 }
654+             }
655+         }
500656    }
501657
502- error :
503-     hwloc_bitmap_free (proc_cpuset );
504-     return  result ;
658+     return  false;
505659}
506660#endif 
507661
@@ -614,11 +768,15 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
614768    struct  fi_info  * provider  =  provider_list , * current_provider  =  provider_list ;
615769    struct  fi_info  * * provider_table ;
616770#if  OPAL_OFI_PCI_DATA_AVAILABLE 
771+     pmix_device_distance_t  * distances  =  NULL ;
772+     pmix_value_t  * pmix_val ;
617773    struct  fi_pci_attr  pci ;
774+     int  num_distances  =  0 ;
775+     bool  near ;
618776#endif 
619777    int  ret ;
620778    unsigned int   num_provider  =  0 , provider_limit  =  0 ;
621-     bool  provider_found  =  false,  cpusets_match   =  false ;
779+     bool  provider_found  =  false;
622780
623781    /* Initialize opal_hwloc_topology if it is not already */ 
624782    ret  =  opal_hwloc_base_get_topology ();
@@ -639,33 +797,38 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
639797        return  provider_list ;
640798    }
641799
800+ #if  OPAL_OFI_PCI_DATA_AVAILABLE 
801+     /* find all the nearest devices to this thread, then out of these 
802+      * determine which device we should bind to. 
803+      */ 
804+     distances  =  get_nearest_nics (& num_distances , & pmix_val );
805+ #endif 
806+ 
642807    current_provider  =  provider ;
643808
644809    /* Cycle through remaining fi_info objects, looking for alike providers */ 
645810    while  (NULL  !=  current_provider ) {
646811        if  (!check_provider_attr (provider , current_provider )) {
647-             cpusets_match  =  false;
812+             near  =  false;
648813#if  OPAL_OFI_PCI_DATA_AVAILABLE 
649814            if  (NULL  !=  current_provider -> nic 
650815                &&  NULL  !=  current_provider -> nic -> bus_attr 
651816                &&  current_provider -> nic -> bus_attr -> bus_type  ==  FI_BUS_PCI ) {
652817                pci  =  current_provider -> nic -> bus_attr -> attr .pci ;
653-                 cpusets_match  =  compare_cpusets (opal_hwloc_topology , pci );
818+                 near  =  is_near (distances , num_distances ,
819+                                opal_hwloc_topology , pci );
654820            }
655821#endif 
656- 
657-             /* Reset the list if the cpusets match and no other provider was 
658-              * found on the same cpuset as the process. 
659-              */ 
660-             if  (cpusets_match  &&  !provider_found ) {
822+             /* We could have multiple near providers */ 
823+             if  (near  &&  !provider_found ) {
661824                provider_found  =  true;
662825                num_provider  =  0 ;
663826            }
664827
665828            /* Add the provider to the provider list if it is near, or if
666829             * no near provider has been found so far.
667830             */
668-             if  (cpusets_match  ||  !provider_found ) {
831+             if  (near  ||  !provider_found ) {
669832                provider_table [num_provider ] =  current_provider ;
670833                num_provider ++ ;
671834            }
@@ -687,17 +850,22 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
687850        &&  NULL  !=  provider -> nic -> bus_attr 
688851        &&  provider -> nic -> bus_attr -> bus_type  ==  FI_BUS_PCI ) {
689852        pci  =  provider -> nic -> bus_attr -> attr .pci ;
690-         cpusets_match  =  compare_cpusets (opal_hwloc_topology , pci );
853+         near  =  is_near (distances , num_distances ,
854+                        opal_hwloc_topology , pci );
691855    }
692856#endif 
693857
694858#if  OPAL_ENABLE_DEBUG 
695859    opal_output_verbose (1 , opal_common_ofi .output ,
696-                         "package rank: %d device: %s cpusets match : %s\n" , package_rank ,
697-                         provider -> domain_attr -> name , cpusets_match  ? "true"  : "false" );
860+                         "package rank: %d device: %s near : %s\n" , package_rank ,
861+                         provider -> domain_attr -> name , near  ? "true"  : "false" );
698862#endif 
699863
700864    free (provider_table );
865+ #if  OPAL_OFI_PCI_DATA_AVAILABLE 
866+     if  (pmix_val )
867+         PMIx_Value_free (pmix_val , 1 );
868+ #endif 
701869    return  provider ;
702870}
703871
0 commit comments