44 * reserved.
55 * Copyright (c) 2020-2022 Triad National Security, LLC. All rights
66 * reserved.
7- * Copyright (c) 2020-2021 Cisco Systems, Inc. All rights reserved
7+ * Copyright (c) 2020-2021 Cisco Systems, Inc. All rights reserved.
88 * Copyright (c) 2021 Nanook Consulting. All rights reserved.
99 * Copyright (c) 2021 Amazon.com, Inc. or its affiliates. All rights
1010 * reserved.
11+ * Copyright (c) 2023 UT-Battelle, LLC. All rights reserved.
1112 * $COPYRIGHT$
1213 *
1314 * Additional copyrights may follow
@@ -445,63 +446,216 @@ static int check_provider_attr(struct fi_info *provider_info, struct fi_info *pr
445446 }
446447}
447448
448- #if OPAL_OFI_PCI_DATA_AVAILABLE
449- /* Check if a process and a pci device share the same cpuset
450- * @param (IN) pci struct fi_pci_attr pci device attributes,
451- * used to find hwloc object for device.
449+ /**
450+ * Calculate device distances
451+ *
452+ * Calculate the distances between the current thread and all devices of
453+ * type OPENFABRICS or NETWORK.
454+ *
455+ * The shortest distances are the nearest and therefore most efficient
456+ * devices to use.
452457 *
453- * @param (IN) topology hwloc_topology_t topology to get the cpusets
454- * from
458+ * Return an array of all the distances. Each entry is of type
459+ * pmix_device_distance_t
455460 *
456- * @param (OUT) returns true if cpusets match and false if
457- * cpusets do not match or an error prevents comparison
461+ * This function is used if there is no PMIx server running.
462+ *
463+ * @param distances (OUT) distances array
464+ * @param ndist (OUT) number of entries in the distances array
465+ *
466+ * @return 0 on success. Error otherwise.
458467 *
459- * Uses a pci device to find an ancestor that contains a cpuset, and
460- * determines if it intersects with the cpuset that the process is bound to.
461- * if the process is not bound, or if a cpuset is unavailable for whatever
462- * reason, returns false. Otherwise, returns the result of
463- * hwloc_cpuset_intersects()
464468 */
465- static bool compare_cpusets (hwloc_topology_t topology , struct fi_pci_attr pci )
469+ static int compute_dev_distances (pmix_device_distance_t * * distances ,
470+ size_t * ndist )
466471{
467- bool result = false;
468- int ret ;
469- hwloc_bitmap_t proc_cpuset ;
470- hwloc_obj_t obj = NULL ;
472+ int ret = 0 ;
473+ size_t ninfo ;
474+ pmix_info_t * info ;
475+ pmix_cpuset_t cpuset ;
476+ pmix_topology_t * pmix_topo ;
477+ pmix_device_type_t type = PMIX_DEVTYPE_OPENFABRICS |
478+ PMIX_DEVTYPE_NETWORK ;
479+
480+ PMIX_CPUSET_CONSTRUCT (& cpuset );
481+ ret = PMIx_Get_cpuset (& cpuset , PMIX_CPUBIND_THREAD );
482+ if (PMIX_SUCCESS != ret ) {
483+ goto out ;
484+ }
485+
486+ /* load the PMIX topology */
487+ PMIx_Topology_free (pmix_topo , 1 );
488+ ret = PMIx_Load_topology (pmix_topo );
489+ if (PMIX_SUCCESS != ret ) {
490+ goto out ;
491+ }
492+
493+ ninfo = 1 ;
494+ info = PMIx_Info_create (ninfo );
495+ PMIx_Info_load (& info [0 ], PMIX_DEVICE_TYPE , & type , PMIX_DEVTYPE );
496+ ret = PMIx_Compute_distances (pmix_topo , & cpuset , info , ninfo , distances ,
497+ ndist );
498+ PMIx_Info_free (info , ninfo );
499+
500+ PMIx_Topology_free (pmix_topo , 1 );
501+ out :
502+ return ret ;
503+ }
471504
472- /* Cannot find topology info if no topology is found */
473- if (NULL == topology ) {
474- return false;
505+ /**
506+ * Find the nearest devices to the current thread
507+ *
508+ * Use the PMIx server or calculate the device distances, then out of the set of
509+ * returned distances find the subset of the nearest devices. This can be
510+ * 1 or more.
511+ *
512+ * @param num_distances (OUT) number of entries in the returned array
513+ *
514+ * @return An array of device distances which are nearest this thread
515+ * or NULL if we fail to get the distances. In this case we will just
516+ * revert to round robin.
517+ *
518+ */
519+ static pmix_device_distance_t *
520+ get_nearest_nics (int * num_distances , pmix_value_t * * valin )
521+ {
522+ size_t ndist , i ;
523+ int ret , idx = 0 ;
524+ pmix_data_array_t * dptr ;
525+ uint16_t near = USHRT_MAX ;
526+ pmix_info_t directive ;
527+ pmix_value_t * val = NULL ;
528+ pmix_device_distance_t * distances , * nearest = NULL ;
529+
530+ PMIx_Info_load (& directive , PMIX_OPTIONAL , NULL , PMIX_BOOL );
531+ ret = PMIx_Get (& opal_process_info .myprocid ,
532+ PMIX_DEVICE_DISTANCES , & directive , 1 , & val );
533+ PMIx_Info_destruct (& directive );
534+ if (ret != PMIX_SUCCESS || !val ) {
535+ ret = compute_dev_distances (& distances , & ndist );
536+ if (ret )
537+ goto out ;
538+ goto find_nearest ;
539+ }
540+
541+ if (PMIX_DATA_ARRAY != val -> type ) {
542+ goto out ;
543+ }
544+ dptr = val -> data .darray ;
545+ if (NULL == dptr ) {
546+ goto out ;
547+ }
548+ if (PMIX_DEVICE_DIST != dptr -> type ) {
549+ goto out ;
550+ }
551+
552+ distances = (pmix_device_distance_t * )dptr -> array ;
553+ ndist = dptr -> size ;
554+
555+ find_nearest :
556+ nearest = calloc (sizeof (* distances ), ndist );
557+ if (!nearest )
558+ goto out ;
559+
560+ for (i = 0 ; i < ndist ; i ++ ) {
561+ if (distances [i ].type != PMIX_DEVTYPE_NETWORK &&
562+ distances [i ].type != PMIX_DEVTYPE_OPENFABRICS )
563+ continue ;
564+ if (distances [i ].mindist < near ) {
565+ idx = 0 ;
566+ near = distances [i ].mindist ;
567+ nearest [idx ] = distances [i ];
568+ idx ++ ;
569+ } else if (distances [i ].mindist == near ) {
570+ nearest [idx ] = distances [i ];
571+ idx ++ ;
572+ }
475573 }
476574
477- /* Allocate memory for proc_cpuset */
478- proc_cpuset = hwloc_bitmap_alloc ();
479- if (NULL == proc_cpuset ) {
575+ * num_distances = idx ;
576+
577+ out :
578+ * valin = val ;
579+ return nearest ;
580+ }
581+
582+ #if OPAL_OFI_PCI_DATA_AVAILABLE
583+ /**
584+ * Determine if a device is nearest
585+ *
586+ * Given a device distances array of the nearest pci devices,
587+ * determine if one of these device distances refers to the pci
588+ * device passed in
589+ *
590+ * @param distances (IN) distances array
591+ * @param num_distances (IN) number of entries in the distances array
592+ * @param topology (IN) topology of the node
593+ * @param pci (IN) PCI device being examined
594+ *
595+ * @return true if the PCI device is in the distances array or if the
596+ * distances array is not provided. False otherwise.
597+ *
598+ */
599+ static bool is_near (pmix_device_distance_t * distances ,
600+ int num_distances ,
601+ hwloc_topology_t topology ,
602+ struct fi_pci_attr pci )
603+ {
604+ hwloc_obj_t pcidev , osdev ;
605+
606+ /* if we failed to find any distances, then we consider all interfaces
607+ * to be of equal distances and let the caller decide how to handle
608+ * them
609+ */
610+ if (!distances )
611+ return true;
612+
613+ pcidev = hwloc_get_pcidev_by_busid (topology , pci .domain_id ,
614+ pci .bus_id , pci .device_id ,
615+ pci .function_id );
616+ if (!pcidev )
480617 return false;
481- }
482618
483- /* Fill cpuset with the collection of cpu cores that the process runs on */
484- ret = hwloc_get_cpubind (topology , proc_cpuset , HWLOC_CPUBIND_PROCESS );
485- if (0 > ret ) {
486- goto error ;
487- }
619+ for (osdev = pcidev -> io_first_child ; osdev != NULL ; osdev = osdev -> next_sibling ) {
620+ int i ;
488621
489- /* Get the pci device from bdf */
490- obj = hwloc_get_pcidev_by_busid ( topology , pci . domain_id , pci . bus_id , pci . device_id ,
491- pci . function_id );
492- if ( NULL == obj ) {
493- goto error ;
494- }
622+ if ( osdev -> attr -> osdev . type == HWLOC_OBJ_OSDEV_OPENFABRICS ) {
623+ const char * nguid = hwloc_obj_get_info_by_name ( osdev , "NodeGUID" );
624+ const char * sguid = hwloc_obj_get_info_by_name ( osdev , "SysImageGUID" );
625+
626+ if (! nguid && ! sguid )
627+ continue ;
495628
496- /* pcidev objects don't have cpusets so find the first non-io object above */
497- obj = hwloc_get_non_io_ancestor_obj (topology , obj );
498- if (NULL != obj ) {
499- result = hwloc_bitmap_intersects (proc_cpuset , obj -> cpuset );
629+ for (i = 0 ; i < num_distances ; i ++ ) {
630+ char lsguid [256 ], lnguid [256 ];
631+ int ret ;
632+
633+ ret = sscanf (distances [i ].uuid , "fab://%256s::%256s" , lnguid , lsguid );
634+ if (ret != 2 )
635+ continue ;
636+ if (0 == strcasecmp (lnguid , nguid )) {
637+ return true;
638+ } else if (0 == strcasecmp (lsguid , sguid )) {
639+ return true;
640+ }
641+ }
642+ } else if (osdev -> attr -> osdev .type == HWLOC_OBJ_OSDEV_NETWORK ) {
643+ const char * address = hwloc_obj_get_info_by_name (osdev , "Address" );
644+ if (!address )
645+ continue ;
646+ for (i = 0 ; i < num_distances ; i ++ ) {
647+ char * addr = strstr (distances [i ].uuid , "://" );
648+ if (!addr || addr + 3 > distances [i ].uuid
649+ + strlen (distances [i ].uuid ))
650+ continue ;
651+ if (!strcmp (addr + 3 , address )) {
652+ return true;
653+ }
654+ }
655+ }
500656 }
501657
502- error :
503- hwloc_bitmap_free (proc_cpuset );
504- return result ;
658+ return false;
505659}
506660#endif
507661
@@ -614,11 +768,15 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
614768 struct fi_info * provider = provider_list , * current_provider = provider_list ;
615769 struct fi_info * * provider_table ;
616770#if OPAL_OFI_PCI_DATA_AVAILABLE
771+ pmix_device_distance_t * distances = NULL ;
772+ pmix_value_t * pmix_val ;
617773 struct fi_pci_attr pci ;
774+ int num_distances = 0 ;
775+ bool near ;
618776#endif
619777 int ret ;
620778 unsigned int num_provider = 0 , provider_limit = 0 ;
621- bool provider_found = false, cpusets_match = false ;
779+ bool provider_found = false;
622780
623781 /* Initialize opal_hwloc_topology if it is not already */
624782 ret = opal_hwloc_base_get_topology ();
@@ -639,33 +797,38 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
639797 return provider_list ;
640798 }
641799
800+ #if OPAL_OFI_PCI_DATA_AVAILABLE
801+ /* find all the nearest devices to this thread, then out of these
802+ * determine which device we should bind to.
803+ */
804+ distances = get_nearest_nics (& num_distances , & pmix_val );
805+ #endif
806+
642807 current_provider = provider ;
643808
644809 /* Cycle through remaining fi_info objects, looking for alike providers */
645810 while (NULL != current_provider ) {
646811 if (!check_provider_attr (provider , current_provider )) {
647- cpusets_match = false;
812+ near = false;
648813#if OPAL_OFI_PCI_DATA_AVAILABLE
649814 if (NULL != current_provider -> nic
650815 && NULL != current_provider -> nic -> bus_attr
651816 && current_provider -> nic -> bus_attr -> bus_type == FI_BUS_PCI ) {
652817 pci = current_provider -> nic -> bus_attr -> attr .pci ;
653- cpusets_match = compare_cpusets (opal_hwloc_topology , pci );
818+ near = is_near (distances , num_distances ,
819+ opal_hwloc_topology , pci );
654820 }
655821#endif
656-
657- /* Reset the list if the cpusets match and no other provider was
658- * found on the same cpuset as the process.
659- */
660- if (cpusets_match && !provider_found ) {
822+ /* We could have multiple near providers */
823+ if (near && !provider_found ) {
661824 provider_found = true;
662825 num_provider = 0 ;
663826 }
664827
665828 /* Add the provider to the provider list if the cpusets match or if
666829 * no other provider was found on the same cpuset as the process.
667830 */
668- if (cpusets_match || !provider_found ) {
831+ if (near || !provider_found ) {
669832 provider_table [num_provider ] = current_provider ;
670833 num_provider ++ ;
671834 }
@@ -687,17 +850,22 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
687850 && NULL != provider -> nic -> bus_attr
688851 && provider -> nic -> bus_attr -> bus_type == FI_BUS_PCI ) {
689852 pci = provider -> nic -> bus_attr -> attr .pci ;
690- cpusets_match = compare_cpusets (opal_hwloc_topology , pci );
853+ near = is_near (distances , num_distances ,
854+ opal_hwloc_topology , pci );
691855 }
692856#endif
693857
694858#if OPAL_ENABLE_DEBUG
695859 opal_output_verbose (1 , opal_common_ofi .output ,
696- "package rank: %d device: %s cpusets match : %s\n" , package_rank ,
697- provider -> domain_attr -> name , cpusets_match ? "true" : "false" );
860+ "package rank: %d device: %s near : %s\n" , package_rank ,
861+ provider -> domain_attr -> name , near ? "true" : "false" );
698862#endif
699863
700864 free (provider_table );
865+ #if OPAL_OFI_PCI_DATA_AVAILABLE
866+ if (pmix_val )
867+ PMIx_Value_free (pmix_val , 1 );
868+ #endif
701869 return provider ;
702870}
703871
0 commit comments