@@ -445,63 +445,216 @@ static int check_provider_attr(struct fi_info *provider_info, struct fi_info *pr
445445 }
446446}
447447
448- #if OPAL_OFI_PCI_DATA_AVAILABLE
449- /* Check if a process and a pci device share the same cpuset
450- * @param (IN) pci struct fi_pci_attr pci device attributes,
451- * used to find hwloc object for device.
448+ /**
449+ * Calculate device distances
450+ *
451+ * Calculate the distances between the current thread and all devices of
452+ * type OPENFABRICS or NETWORK.
453+ *
454+ * The shortest distances are the nearest and therefore most efficient
455+ * devices to use.
452456 *
453- * @param (IN) topology hwloc_topology_t topology to get the cpusets
454- * from
457+ * Return an array of all the distances. Each entry is of type
458+ * pmix_device_distance_t
455459 *
456- * @param (OUT) returns true if cpusets match and false if
457- * cpusets do not match or an error prevents comparison
460+ * This function is used if there is no PMIx server running.
461+ *
462+ * @param distances (OUT) distances array
463+ * @param ndist (OUT) number of entries in the distances array
464+ *
465+ * @return 0 on success. Error otherwise.
458466 *
459- * Uses a pci device to find an ancestor that contains a cpuset, and
460- * determines if it intersects with the cpuset that the process is bound to.
461- * if the process is not bound, or if a cpuset is unavailable for whatever
462- * reason, returns false. Otherwise, returns the result of
463- * hwloc_cpuset_intersects()
464467 */
465- static bool compare_cpusets (hwloc_topology_t topology , struct fi_pci_attr pci )
468+ static int compute_dev_distances (pmix_device_distance_t * * distances ,
469+ size_t * ndist )
466470{
467- bool result = false;
468- int ret ;
469- hwloc_bitmap_t proc_cpuset ;
470- hwloc_obj_t obj = NULL ;
471+ int ret = 0 ;
472+ size_t ninfo ;
473+ pmix_info_t * info ;
474+ pmix_cpuset_t cpuset ;
475+ pmix_topology_t * pmix_topo ;
476+ pmix_device_type_t type = PMIX_DEVTYPE_OPENFABRICS |
477+ PMIX_DEVTYPE_NETWORK ;
478+
479+ PMIX_CPUSET_CONSTRUCT (& cpuset );
480+ ret = PMIx_Get_cpuset (& cpuset , PMIX_CPUBIND_THREAD );
481+ if (PMIX_SUCCESS != ret ) {
482+ goto out ;
483+ }
484+
485+ /* load the PMIX topology */
486+ PMIx_Topology_free (pmix_topo , 1 );
487+ ret = PMIx_Load_topology (pmix_topo );
488+ if (PMIX_SUCCESS != ret ) {
489+ goto out ;
490+ }
491+
492+ ninfo = 1 ;
493+ info = PMIx_Info_create (ninfo );
494+ PMIx_Info_load (& info [0 ], PMIX_DEVICE_TYPE , & type , PMIX_DEVTYPE );
495+ ret = PMIx_Compute_distances (pmix_topo , & cpuset , info , ninfo , distances ,
496+ ndist );
497+ PMIx_Info_free (info , ninfo );
498+
499+ PMIx_Topology_free (pmix_topo , 1 );
500+ out :
501+ return ret ;
502+ }
471503
472- /* Cannot find topology info if no topology is found */
473- if (NULL == topology ) {
474- return false;
504+ /**
505+ * Find the nearest devices to the current thread
506+ *
507+ * Use the PMIx server or calculate the device distances, then out of the set of
508+ * returned distances find the subset of the nearest devices. This can be
509+ * 1 or more.
510+ *
511+ * @param num_distances (OUT) number of entries in the returned array
512+ *
513+ * @return An array of device distances which are nearest this thread
514+ * or NULL if we fail to get the distances. In this case we will just
515+ * revert to round robin.
516+ *
517+ */
518+ static pmix_device_distance_t *
519+ get_nearest_nics (int * num_distances , pmix_value_t * * valin )
520+ {
521+ size_t ndist , i ;
522+ int ret , idx = 0 ;
523+ pmix_data_array_t * dptr ;
524+ uint16_t near = USHRT_MAX ;
525+ pmix_info_t directive ;
526+ pmix_value_t * val = NULL ;
527+ pmix_device_distance_t * distances , * nearest = NULL ;
528+
529+ PMIx_Info_load (& directive , PMIX_OPTIONAL , NULL , PMIX_BOOL );
530+ ret = PMIx_Get (& opal_process_info .myprocid ,
531+ PMIX_DEVICE_DISTANCES , & directive , 1 , & val );
532+ PMIx_Info_destruct (& directive );
533+ if (ret != PMIX_SUCCESS || !val ) {
534+ ret = compute_dev_distances (& distances , & ndist );
535+ if (ret )
536+ goto out ;
537+ goto find_nearest ;
538+ }
539+
540+ if (PMIX_DATA_ARRAY != val -> type ) {
541+ goto out ;
542+ }
543+ dptr = val -> data .darray ;
544+ if (NULL == dptr ) {
545+ goto out ;
546+ }
547+ if (PMIX_DEVICE_DIST != dptr -> type ) {
548+ goto out ;
549+ }
550+
551+ distances = (pmix_device_distance_t * )dptr -> array ;
552+ ndist = dptr -> size ;
553+
554+ find_nearest :
555+ nearest = calloc (sizeof (* distances ), ndist );
556+ if (!nearest )
557+ goto out ;
558+
559+ for (i = 0 ; i < ndist ; i ++ ) {
560+ if (distances [i ].type != PMIX_DEVTYPE_NETWORK &&
561+ distances [i ].type != PMIX_DEVTYPE_OPENFABRICS )
562+ continue ;
563+ if (distances [i ].mindist < near ) {
564+ idx = 0 ;
565+ near = distances [i ].mindist ;
566+ nearest [idx ] = distances [i ];
567+ idx ++ ;
568+ } else if (distances [i ].mindist == near ) {
569+ nearest [idx ] = distances [i ];
570+ idx ++ ;
571+ }
475572 }
476573
477- /* Allocate memory for proc_cpuset */
478- proc_cpuset = hwloc_bitmap_alloc ();
479- if (NULL == proc_cpuset ) {
574+ * num_distances = idx ;
575+
576+ out :
577+ * valin = val ;
578+ return nearest ;
579+ }
580+
581+ #if OPAL_OFI_PCI_DATA_AVAILABLE
582+ /**
583+ * Determine if a device is nearest
584+ *
585+ * Given a device distances array of the nearest pci devices,
586+ * determine if one of these device distances refers to the pci
587+ * device passed in
588+ *
589+ * @param distances (IN) distances array
590+ * @param num_distances (IN) number of entries in the distances array
591+ * @param topology (IN) topology of the node
592+ * @param pci (IN) PCI device being examined
593+ *
594+ * @return true if the PCI device is in the distances array or if the
595+ * distances array is not provided. False otherwise.
596+ *
597+ */
598+ static bool is_near (pmix_device_distance_t * distances ,
599+ int num_distances ,
600+ hwloc_topology_t topology ,
601+ struct fi_pci_attr pci )
602+ {
603+ hwloc_obj_t pcidev , osdev ;
604+
605+ /* if we failed to find any distances, then we consider all interfaces
606+ * to be of equal distances and let the caller decide how to handle
607+ * them
608+ */
609+ if (!distances )
610+ return true;
611+
612+ pcidev = hwloc_get_pcidev_by_busid (topology , pci .domain_id ,
613+ pci .bus_id , pci .device_id ,
614+ pci .function_id );
615+ if (!pcidev )
480616 return false;
481- }
482617
483- /* Fill cpuset with the collection of cpu cores that the process runs on */
484- ret = hwloc_get_cpubind (topology , proc_cpuset , HWLOC_CPUBIND_PROCESS );
485- if (0 > ret ) {
486- goto error ;
487- }
618+ for (osdev = pcidev -> io_first_child ; osdev != NULL ; osdev = osdev -> next_sibling ) {
619+ int i ;
488620
489- /* Get the pci device from bdf */
490- obj = hwloc_get_pcidev_by_busid ( topology , pci . domain_id , pci . bus_id , pci . device_id ,
491- pci . function_id );
492- if ( NULL == obj ) {
493- goto error ;
494- }
621+ if ( osdev -> attr -> osdev . type == HWLOC_OBJ_OSDEV_OPENFABRICS ) {
622+ const char * nguid = hwloc_obj_get_info_by_name ( osdev , "NodeGUID" );
623+ const char * sguid = hwloc_obj_get_info_by_name ( osdev , "SysImageGUID" );
624+
625+ if (! nguid && ! sguid )
626+ continue ;
495627
496- /* pcidev objects don't have cpusets so find the first non-io object above */
497- obj = hwloc_get_non_io_ancestor_obj (topology , obj );
498- if (NULL != obj ) {
499- result = hwloc_bitmap_intersects (proc_cpuset , obj -> cpuset );
628+ for (i = 0 ; i < num_distances ; i ++ ) {
629+ char lsguid [256 ], lnguid [256 ];
630+ int ret ;
631+
632+ ret = sscanf (distances [i ].uuid , "fab://%256s::%256s" , lnguid , lsguid );
633+ if (ret != 2 )
634+ continue ;
635+ if (0 == strcasecmp (lnguid , nguid )) {
636+ return true;
637+ } else if (0 == strcasecmp (lsguid , sguid )) {
638+ return true;
639+ }
640+ }
641+ } else if (osdev -> attr -> osdev .type == HWLOC_OBJ_OSDEV_NETWORK ) {
642+ const char * address = hwloc_obj_get_info_by_name (osdev , "Address" );
643+ if (!address )
644+ continue ;
645+ for (i = 0 ; i < num_distances ; i ++ ) {
646+ char * addr = strstr (distances [i ].uuid , "://" );
647+ if (!addr || addr + 3 > distances [i ].uuid
648+ + strlen (distances [i ].uuid ))
649+ continue ;
650+ if (!strcmp (addr + 3 , address )) {
651+ return true;
652+ }
653+ }
654+ }
500655 }
501656
502- error :
503- hwloc_bitmap_free (proc_cpuset );
504- return result ;
657+ return false;
505658}
506659#endif
507660
@@ -614,11 +767,15 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
614767 struct fi_info * provider = provider_list , * current_provider = provider_list ;
615768 struct fi_info * * provider_table ;
616769#if OPAL_OFI_PCI_DATA_AVAILABLE
770+ pmix_device_distance_t * distances = NULL ;
771+ pmix_value_t * pmix_val ;
617772 struct fi_pci_attr pci ;
773+ int num_distances = 0 ;
774+ bool near ;
618775#endif
619776 int ret ;
620777 unsigned int num_provider = 0 , provider_limit = 0 ;
621- bool provider_found = false, cpusets_match = false ;
778+ bool provider_found = false;
622779
623780 /* Initialize opal_hwloc_topology if it is not already */
624781 ret = opal_hwloc_base_get_topology ();
@@ -639,33 +796,38 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
639796 return provider_list ;
640797 }
641798
799+ #if OPAL_OFI_PCI_DATA_AVAILABLE
800+ /* find all the nearest devices to this thread, then out of these
801+ * determine which device we should bind to.
802+ */
803+ distances = get_nearest_nics (& num_distances , & pmix_val );
804+ #endif
805+
642806 current_provider = provider ;
643807
644808 /* Cycle through remaining fi_info objects, looking for alike providers */
645809 while (NULL != current_provider ) {
646810 if (!check_provider_attr (provider , current_provider )) {
647- cpusets_match = false;
811+ near = false;
648812#if OPAL_OFI_PCI_DATA_AVAILABLE
649813 if (NULL != current_provider -> nic
650814 && NULL != current_provider -> nic -> bus_attr
651815 && current_provider -> nic -> bus_attr -> bus_type == FI_BUS_PCI ) {
652816 pci = current_provider -> nic -> bus_attr -> attr .pci ;
653- cpusets_match = compare_cpusets (opal_hwloc_topology , pci );
817+ near = is_near (distances , num_distances ,
818+ opal_hwloc_topology , pci );
654819 }
655820#endif
656-
657- /* Reset the list if the cpusets match and no other provider was
658- * found on the same cpuset as the process.
659- */
660- if (cpusets_match && !provider_found ) {
821+ /* We could have multiple near providers */
822+ if (near && !provider_found ) {
661823 provider_found = true;
662824 num_provider = 0 ;
663825 }
664826
665828 /* Add the provider to the provider list if it is near, or if
666829 * no near provider has been found so far.
667829 */
668- if (cpusets_match || !provider_found ) {
830+ if (near || !provider_found ) {
669831 provider_table [num_provider ] = current_provider ;
670832 num_provider ++ ;
671833 }
@@ -687,17 +849,22 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
687849 && NULL != provider -> nic -> bus_attr
688850 && provider -> nic -> bus_attr -> bus_type == FI_BUS_PCI ) {
689851 pci = provider -> nic -> bus_attr -> attr .pci ;
690- cpusets_match = compare_cpusets (opal_hwloc_topology , pci );
852+ near = is_near (distances , num_distances ,
853+ opal_hwloc_topology , pci );
691854 }
692855#endif
693856
694857#if OPAL_ENABLE_DEBUG
695858 opal_output_verbose (1 , opal_common_ofi .output ,
696- "package rank: %d device: %s cpusets match : %s\n" , package_rank ,
697- provider -> domain_attr -> name , cpusets_match ? "true" : "false" );
859+ "package rank: %d device: %s near : %s\n" , package_rank ,
860+ provider -> domain_attr -> name , near ? "true" : "false" );
698861#endif
699862
700863 free (provider_table );
864+ #if OPAL_OFI_PCI_DATA_AVAILABLE
865+ if (pmix_val )
866+ PMIx_Value_free (pmix_val , 1 );
867+ #endif
701868 return provider ;
702869}
703870
0 commit comments