@@ -445,214 +445,63 @@ static int check_provider_attr(struct fi_info *provider_info, struct fi_info *pr
445445    }
446446}
447447
448- /** 
449-  * Calculate device distances 
450-  * 
451-  * Calculate the distances between the current thread and all devices of 
452-  * type OPENFABRICS or NETWORK. 
453-  * 
454-  * The shortest distances are the nearest and therefore most efficient 
455-  * devices to use. 
456-  * 
457-  * Return an array of all the distances. Each entry is of type 
458-  * pmix_device_distance_t 
459-  * 
460-  * This function is used if there is no PMIx server running. 
461-  * 
462-  * @param distances (OUT)     distances array 
463-  * @param ndist (OUT)    number of entries in the distances array 
464-  * 
465-  * @return   0 on success. Error otherwise. 
466-  * 
467-  */ 
468- static  int  compute_dev_distances (pmix_device_distance_t  * * distances ,
469-                                   size_t  * ndist )
470- {
471-     int  ret  =  0 ;
472-     size_t  ninfo ;
473-     pmix_info_t  * info ;
474-     pmix_cpuset_t  cpuset ;
475-     pmix_topology_t  * pmix_topo ;
476-     pmix_device_type_t  type  =  PMIX_DEVTYPE_OPENFABRICS  |
477-       PMIX_DEVTYPE_NETWORK ;
478- 
479-     PMIX_CPUSET_CONSTRUCT (& cpuset );
480-     ret  =  PMIx_Get_cpuset (& cpuset , PMIX_CPUBIND_THREAD );
481-     if  (PMIX_SUCCESS  !=  ret ) {
482-         goto out ;
483-     }
484- 
485-     /* load the PMIX topology */ 
486-     PMIX_TOPOLOGY_CREATE (pmix_topo , 1 );
487-     ret  =  PMIx_Load_topology (pmix_topo );
488-     if  (PMIX_SUCCESS  !=  ret ) {
489-         goto out ;
490-     }
491- 
492-     ninfo  =  1 ;
493-     PMIX_INFO_CREATE (info , ninfo );
494-     PMIX_INFO_LOAD (& info [0 ], PMIX_DEVICE_TYPE , & type , PMIX_DEVTYPE );
495-     ret  =  PMIx_Compute_distances (pmix_topo , & cpuset , info , ninfo , distances ,
496-                                  ndist );
497-     PMIX_INFO_FREE (info , ninfo );
498- 
499-     PMIX_TOPOLOGY_FREE (pmix_topo , 1 );
500- out :
501-     return  ret ;
502- }
503- 
504- /** 
505-  * Find the nearest devices to the current thread 
506-  * 
507-  * Use the PMIx server or calculate the device distances, then out of the set of 
508-  * returned distances find the subset of the nearest devices. This can be 
509-  * 1 or more. 
510-  * 
511-  * @param num_distances (OUT)     number of entries in the returned array 
512-  * 
513-  * @return   An array of device distances which are nearest this thread 
514-  *           or NULL if we fail to get the distances. In this case we will just 
515-  *           revert to round robin. 
516-  * 
517-  */ 
518- static  pmix_device_distance_t  * get_nearest_nics (int  * num_distances )
519- {
520-     size_t  ndist ;
521-     pmix_topology_t  * topo ;
522-     int  ret , i , idx  =  0 ;
523-     pmix_data_array_t  * dptr ;
524-     uint16_t  near  =  USHRT_MAX ;
525-     pmix_info_t  directive ;
526-     pmix_value_t  * val  =  NULL ;
527-     pmix_device_distance_t  * distances , * nearest  =  NULL ;
528- 
529-     PMIX_INFO_LOAD (& directive , PMIX_OPTIONAL , NULL , PMIX_BOOL );
530-     ret  =  PMIx_Get (& opal_process_info .myprocid ,
531-              PMIX_DEVICE_DISTANCES , & directive , 1 , & val );
532-     PMIX_INFO_DESTRUCT (& directive );
533-     if  (ret  !=  PMIX_SUCCESS  ||  !val ) {
534-         ret  =  compute_dev_distances (& distances , & ndist );
535-         if  (ret )
536-             goto out ;
537-         goto find_nearest ;
538-     }
539- 
540-     if  (PMIX_DATA_ARRAY  !=  val -> type ) {
541-         goto out ;
542-     }
543-     dptr  =  val -> data .darray ;
544-     if  (NULL  ==  dptr ) {
545-         goto out ;
546-     }
547-     if  (PMIX_DEVICE_DIST  !=  dptr -> type ) {
548-         goto out ;
549-     }
550- 
551-     distances  =  (pmix_device_distance_t * )dptr -> array ;
552-     ndist  =  dptr -> size ;
553- 
554- find_nearest :
555-     nearest  =  calloc (sizeof (* distances ), ndist );
556-     if  (!nearest )
557-         goto out ;
558- 
559-     for  (i  =  0 ; i  <  ndist ; i ++ ) {
560-         if  (distances [i ].mindist  <  near ) {
561-             idx  =  0 ;
562-             near  =  distances [i ].mindist ;
563-             nearest [idx ] =  distances [i ];
564-             idx ++ ;
565-         } else  if  (distances [i ].mindist  ==  near ) {
566-             nearest [idx ] =  distances [i ];
567-             idx ++ ;
568-         }
569-     }
570- 
571-     * num_distances  =  idx ;
572- 
573- out :
574-     if  (val )
575-         PMIX_VALUE_RELEASE (val );
576-     return  nearest ;
577- }
578- 
579448#if  OPAL_OFI_PCI_DATA_AVAILABLE 
580- /** 
581-  * Determine if a device is nearest 
582-  * 
583-  * Given a device distances array of the nearest pci devices, 
584-  * determine if one of these device distances refers to the pci 
585-  * device passed in 
449+ /* Check if a process and a pci device share the same cpuset 
450+  *     @param (IN) pci              struct fi_pci_attr pci device attributes, 
451+  *                                  used to find hwloc object for device. 
586452 * 
587-  * @param distances (IN)     distances array 
588-  * @param num_distances (IN) number of entries in the distances array 
589-  * @param topology (IN)      topology of the node 
590-  * @param pci (IN)           PCI device being examined 
453+  *     @param (IN) topology         hwloc_topology_t topology to get the cpusets 
454+  *                                  from 
591455 * 
592-  * @return   true if the PCI device is in the distances array or  if the  
593-  *           distances array is  not provided. False otherwise.  
456+  *     @return                      true if the cpusets intersect; false if they
457+  *                                  do not or if an error prevents the comparison
594458 * 
459+  *     Uses a pci device to find an ancestor that contains a cpuset, and 
460+  *     determines if it intersects with the cpuset that the process is bound to. 
461+  *     If the process is not bound, or if a cpuset is unavailable for whatever 
462+  *     reason, returns false. Otherwise, returns the result of 
463+  *     hwloc_bitmap_intersects() 
595464 */ 
596- static  bool  is_near (pmix_device_distance_t  * distances ,
597-                     int  num_distances ,
598-                     hwloc_topology_t  topology ,
599-                     struct  fi_pci_attr  pci )
465+ static  bool  compare_cpusets (hwloc_topology_t  topology , struct  fi_pci_attr  pci )
600466{
601-     hwloc_obj_t  pcidev , osdev ;
602- 
603-     /* if we failed to find any distances, then we consider all interfaces 
604-      * to be of equal distances and let the caller decide how to handle 
605-      * them 
606-      */ 
607-     if  (!distances )
608-         return  true;
467+     bool  result  =  false;
468+     int  ret ;
469+     hwloc_bitmap_t  proc_cpuset ;
470+     hwloc_obj_t  obj  =  NULL ;
609471
610-     pcidev  =  hwloc_get_pcidev_by_busid (topology , pci .domain_id ,
611-                         pci .bus_id , pci .device_id ,
612-                         pci .function_id );
613-     if  (!pcidev )
472+     /* Cannot find topology info if no topology is found */ 
473+     if  (NULL  ==  topology ) {
614474        return  false;
475+     }
615476
616-     for (osdev  =  pcidev -> io_first_child ; osdev  !=  NULL ; osdev  =  osdev -> next_sibling ) {
617-         int  i ;
618- 
619-         if  (osdev -> attr -> osdev .type  ==  HWLOC_OBJ_OSDEV_OPENFABRICS ) {
620-             const  char  * nguid  =  hwloc_obj_get_info_by_name (osdev ,"NodeGUID" );
621-             const  char  * sguid  =  hwloc_obj_get_info_by_name (osdev , "SysImageGUID" );
477+     /* Allocate memory for proc_cpuset */ 
478+     proc_cpuset  =  hwloc_bitmap_alloc ();
479+     if  (NULL  ==  proc_cpuset ) {
480+         return  false;
481+     }
622482
623-             if  (!nguid  &&  !sguid )
624-                 continue ;
483+     /* Fill cpuset with the collection of cpu cores that the process runs on */ 
484+     ret  =  hwloc_get_cpubind (topology , proc_cpuset , HWLOC_CPUBIND_PROCESS );
485+     if  (0  >  ret ) {
486+         goto error ;
487+     }
625488
626-             for  (i  =  0 ; i  <  num_distances ; i ++ ) {
627-                 char  lsguid [256 ], lnguid [256 ];
628-                 int  ret ;
489+     /* Get the pci device from bdf */ 
490+     obj  =  hwloc_get_pcidev_by_busid (topology , pci .domain_id , pci .bus_id , pci .device_id ,
491+                                     pci .function_id );
492+     if  (NULL  ==  obj ) {
493+         goto error ;
494+     }
629495
630-                 ret  =  sscanf (distances [i ].uuid , "fab://%256s::%256s" , lnguid , lsguid );
631-                 if  (ret  !=  2 )
632-                     continue ;
633-                 if  (0  ==  strcasecmp (lnguid , nguid )) {
634-                     return  true;
635-                 } else  if  (0  ==  strcasecmp (lsguid , sguid )) {
636-                     return  true;
637-                 }
638-             }
639-         } else  if  (osdev -> attr -> osdev .type  ==  HWLOC_OBJ_OSDEV_NETWORK ) {
640-             const  char  * address  =  hwloc_obj_get_info_by_name (osdev , "Address" );
641-             if  (!address )
642-                 continue ;
643-             for  (i  =  0 ; i  <  num_distances ; i ++ ) {
644-                 char  * addr  =  strstr (distances [i ].uuid , "://" );
645-                 if  (!addr  ||  addr  +  3  >  distances [i ].uuid 
646-                     +  strlen (distances [i ].uuid ))
647-                     continue ;
648-                 if  (!strcmp (addr + 3 , address )) {
649-                     return  true;
650-                 }
651-             }
652-         }
496+     /* pcidev objects don't have cpusets so find the first non-io object above */ 
497+     obj  =  hwloc_get_non_io_ancestor_obj (topology , obj );
498+     if  (NULL  !=  obj ) {
499+         result  =  hwloc_bitmap_intersects (proc_cpuset , obj -> cpuset );
653500    }
654501
655-     return  false;
502+ error :
503+     hwloc_bitmap_free (proc_cpuset );
504+     return  result ;
656505}
657506#endif 
658507
@@ -765,10 +614,7 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
765614    struct  fi_info  * provider  =  provider_list , * current_provider  =  provider_list ;
766615    struct  fi_info  * * provider_table ;
767616#if  OPAL_OFI_PCI_DATA_AVAILABLE 
768-     pmix_device_distance_t  * distances  =  NULL ;
769617    struct  fi_pci_attr  pci ;
770-     int  num_distances  =  0 ;
771-     bool  near ;
772618#endif 
773619    int  ret ;
774620    unsigned int   num_provider  =  0 , provider_limit  =  0 ;
@@ -793,38 +639,33 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
793639        return  provider_list ;
794640    }
795641
796- #if  OPAL_OFI_PCI_DATA_AVAILABLE 
797-     /* find all the nearest devices to this thread, then out of these 
798-      * determine which device we should bind to. 
799-      */ 
800-     distances  =  get_nearest_nics (& num_distances );
801- #endif 
802- 
803642    current_provider  =  provider ;
804643
805644    /* Cycle through remaining fi_info objects, looking for alike providers */ 
806645    while  (NULL  !=  current_provider ) {
807646        if  (!check_provider_attr (provider , current_provider )) {
808-             near  =  false;
647+             cpusets_match  =  false;
809648#if  OPAL_OFI_PCI_DATA_AVAILABLE 
810649            if  (NULL  !=  current_provider -> nic 
811650                &&  NULL  !=  current_provider -> nic -> bus_attr 
812651                &&  current_provider -> nic -> bus_attr -> bus_type  ==  FI_BUS_PCI ) {
813652                pci  =  current_provider -> nic -> bus_attr -> attr .pci ;
814-                 near  =  is_near (distances , num_distances ,
815-                                opal_hwloc_topology , pci );
653+                 cpusets_match  =  compare_cpusets (opal_hwloc_topology , pci );
816654            }
817655#endif 
818-             /* We could have multiple near providers */ 
819-             if  (near  &&  !provider_found ) {
656+ 
657+             /* Reset the list if the cpusets match and no other provider was 
658+              * found on the same cpuset as the process. 
659+              */ 
660+             if  (cpusets_match  &&  !provider_found ) {
820661                provider_found  =  true;
821662                num_provider  =  0 ;
822663            }
823664
824665            /* Add the provider to the provider list if the cpusets match or if 
825666             * no other provider was found on the same cpuset as the process. 
826667             */ 
827-             if  (near  ||  !provider_found ) {
668+             if  (cpusets_match  ||  !provider_found ) {
828669                provider_table [num_provider ] =  current_provider ;
829670                num_provider ++ ;
830671            }
@@ -846,22 +687,17 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
846687        &&  NULL  !=  provider -> nic -> bus_attr 
847688        &&  provider -> nic -> bus_attr -> bus_type  ==  FI_BUS_PCI ) {
848689        pci  =  provider -> nic -> bus_attr -> attr .pci ;
849-         near  =  is_near (distances , num_distances ,
850-                        opal_hwloc_topology , pci );
690+         cpusets_match  =  compare_cpusets (opal_hwloc_topology , pci );
851691    }
852692#endif 
853693
854694#if  OPAL_ENABLE_DEBUG 
855695    opal_output_verbose (1 , opal_common_ofi .output ,
856-                         "package rank: %d device: %s near : %s\n" , package_rank ,
857-                         provider -> domain_attr -> name , near  ? "true"  : "false" );
696+                         "package rank: %d device: %s cpusets match : %s\n" , package_rank ,
697+                         provider -> domain_attr -> name , cpusets_match  ? "true"  : "false" );
858698#endif 
859699
860700    free (provider_table );
861- #if  OPAL_OFI_PCI_DATA_AVAILABLE 
862-     if  (distances )
863-         free (distances );
864- #endif 
865701    return  provider ;
866702}
867703
0 commit comments