@@ -623,10 +623,10 @@ static int get_provider_distance(struct fi_info *provider, hwloc_topology_t topo
623623/**
624624 * @brief Get the nearest device to the current thread
625625 *
626- * Use the PMIx server or calculate the device distances, then out of the set of
627- * returned distances find the subset of the nearest devices. This can be
628- * 0 or more.
629- * If there are multiple equidistant devices, break the tie using the rank .
626+ * Compute the distances from the current thread to each NIC in provider_list,
627+ * and select the NIC with the shortest distance.
628+ * If there are multiple equidistant devices, break the tie using local rank
629+ * to balance NIC utilization .
630630 *
631631 * @param[in] topology hwloc topology
632632 * @param[in] provider_list List of providers to select from
@@ -724,16 +724,46 @@ static int get_nearest_nic(hwloc_topology_t topology, struct fi_info *provider_l
724724 return ret ;
725725}
726726
727- static struct fi_info * select_provider_round_robin (struct fi_info * provider_list , uint32_t rank ,
728- size_t num_providers )
727+ /**
728+ * @brief Selects a provider from the list in a round-robin fashion
729+ *
730+ * This function implements a round-robin algorithm to select a provider from
731+ * the provided list based on a rank. Only providers of the same type as the
732+ * first provider are eligible for selection.
733+ *
734+ * @param[in] provider_list A list of providers to select from.
735+ * @param[in] rank A rank metric for the current process, such as
736+ * the rank on the same node or CPU package.
737+ * @return Pointer to the selected provider
738+ */
739+ static struct fi_info * select_provider_round_robin (struct fi_info * provider_list , uint32_t rank )
729740{
730- uint32_t provider_rank = rank % num_providers ;
731- struct fi_info * current_provider = provider_list ;
741+ uint32_t provider_rank = 0 , current_rank = 0 ;
742+ size_t num_providers = 0 ;
743+ struct fi_info * current_provider = NULL ;
732744
733- for (uint32_t i = 0 ; i < provider_rank ; ++ i ) {
745+ for (current_provider = provider_list ; NULL != current_provider ;) {
746+ if (OPAL_SUCCESS == check_provider_attr (provider_list , current_provider )) {
747+ ++ num_providers ;
748+ }
734749 current_provider = current_provider -> next ;
735750 }
736751
752+ current_provider = provider_list ;
753+ if (2 > num_providers ) {
754+ goto out ;
755+ }
756+
757+ provider_rank = rank % num_providers ;
758+
759+ while (NULL != current_provider ) {
760+ if (OPAL_SUCCESS == check_provider_attr (provider_list , current_provider )
761+ && provider_rank == current_rank ++ ) {
762+ break ;
763+ }
764+ current_provider = current_provider -> next ;
765+ }
766+ out :
737767 return current_provider ;
738768}
739769
@@ -850,7 +880,7 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
850880{
851881 int ret , num_providers = 0 ;
852882 struct fi_info * provider = NULL ;
853- uint32_t package_rank = 0 ;
883+ uint32_t package_rank = process_info -> my_local_rank ;
854884
855885 num_providers = count_providers (provider_list );
856886 if (!process_info -> proc_is_bound || 2 > num_providers ) {
@@ -868,6 +898,10 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
868898 package_rank = get_package_rank (process_info );
869899
870900#if OPAL_OFI_PCI_DATA_AVAILABLE
901+ /**
902+ * If provider PCI BDF information is available, we calculate its physical distance
903+ * to the current process, and select the provider with the shortest distance.
904+ */
871905 ret = get_nearest_nic (opal_hwloc_topology , provider_list , num_providers , package_rank ,
872906 & provider );
873907 if (OPAL_SUCCESS == ret ) {
@@ -876,7 +910,12 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
876910#endif /* OPAL_OFI_PCI_DATA_AVAILABLE */
877911
878912round_robin :
879- provider = select_provider_round_robin (provider_list , package_rank , num_providers );
913+ if (!process_info -> proc_is_bound && 1 < num_providers
914+ && opal_output_get_verbosity (opal_common_ofi .output ) >= 1 ) {
915+ opal_show_help ("help-common-ofi.txt" , "unbound_process" , true, 1 );
916+ }
917+
918+ provider = select_provider_round_robin (provider_list , package_rank );
880919out :
881920#if OPAL_ENABLE_DEBUG
882921 opal_output_verbose (1 , opal_common_ofi .output , "package rank: %d device: %s" , package_rank ,
@@ -950,5 +989,3 @@ OPAL_DECLSPEC int opal_common_ofi_fi_getname(fid_t fid, void **addr, size_t *add
950989 }
951990 return ret ;
952991}
953-
954-
0 commit comments