From 3deda3dc82f3d7fa50227233d8e7af74db313336 Mon Sep 17 00:00:00 2001
From: Nick Papior Andersen
Date: Wed, 24 Dec 2014 11:21:35 +0000
Subject: [PATCH] Added several new COMM_TYPE_<> splits

Splitting communicators by locality, based on the underlying hardware
identification, has been enabled through the MPI_Comm_split_type function.
Currently implemented splits are: HWTHREAD, CORE, L1CACHE, L2CACHE, L3CACHE,
SOCKET, NUMA, NODE, BOARD, HOST, CU, CLUSTER.

However, only NODE is defined in the standard, which is why the remaining
splits are referred to using the OMPI_ prefix instead of the standard MPI_
prefix.

I have tested this using --without-hwloc and --with-hwloc=, which both give
the same output.

NOTE: I think something fishy is going on in the locality operators. In my
test program I could not get the correct split for these requests: NUMA,
SOCKET, L3CACHE, where I expected a full communicator but only got one.
---
 ompi/communicator/comm.c                  | 265 ++++++++++++++++++++--
 ompi/include/mpi.h.in                     |  14 +-
 ompi/include/mpif-values.pl               |  14 +-
 ompi/mpi/c/comm_split_type.c              |  14 +-
 ompi/mpi/man/man3/MPI_Comm_split_type.3in |  49 +++-
 5 files changed, 338 insertions(+), 18 deletions(-)

diff --git a/ompi/communicator/comm.c b/ompi/communicator/comm.c
index 072d5a49fb3..f1e78e078bd 100644
--- a/ompi/communicator/comm.c
+++ b/ompi/communicator/comm.c
@@ -692,7 +692,7 @@ ompi_comm_split_type(ompi_communicator_t *comm,
     int my_rsize;
     int mode;
     int rsize;
-    int i, loc;
+    int i, loc, found;
     int inter;
     int *results=NULL, *sorted=NULL;
     int *rresults=NULL, *rsorted=NULL;
@@ -711,7 +711,51 @@ ompi_comm_split_type(ompi_communicator_t *comm,
 
     /* --------------------------------------------------------- */
     /* sort according to participation and rank. Gather information from everyone */
-    myinfo[0] = (split_type == MPI_COMM_TYPE_SHARED) ? 1 : 0;
+    /* allowed splitting types:
+           CLUSTER
+           CU
+           HOST
+           BOARD
+           NODE
+           NUMA
+           SOCKET
+           L3CACHE
+           L2CACHE
+           L1CACHE
+           CORE
+           HWTHREAD
+       Even though HWTHREAD/CORE etc. are overkill, they are here for consistency.
+       They will most likely return a communicator which is equal to MPI_COMM_SELF,
+       unless oversubscribing.
+    */
+    myinfo[0] = 0; // default to no type splitting (also for an unrecognized split type)
+    switch ( split_type ) {
+    case OMPI_COMM_TYPE_HWTHREAD:
+        myinfo[0] = 1; break;
+    case OMPI_COMM_TYPE_CORE:
+        myinfo[0] = 2; break;
+    case OMPI_COMM_TYPE_L1CACHE:
+        myinfo[0] = 3; break;
+    case OMPI_COMM_TYPE_L2CACHE:
+        myinfo[0] = 4; break;
+    case OMPI_COMM_TYPE_L3CACHE:
+        myinfo[0] = 5; break;
+    case OMPI_COMM_TYPE_SOCKET:
+        myinfo[0] = 6; break;
+    case OMPI_COMM_TYPE_NUMA:
+        myinfo[0] = 7; break;
+    //case MPI_COMM_TYPE_SHARED: // the standard implemented type
+    case OMPI_COMM_TYPE_NODE:
+        myinfo[0] = 8; break;
+    case OMPI_COMM_TYPE_BOARD:
+        myinfo[0] = 9; break;
+    case OMPI_COMM_TYPE_HOST:
+        myinfo[0] = 10; break;
+    case OMPI_COMM_TYPE_CU:
+        myinfo[0] = 11; break;
+    case OMPI_COMM_TYPE_CLUSTER:
+        myinfo[0] = 12; break;
+    }
     myinfo[1] = key;
 
     size = ompi_comm_size ( comm );
@@ -731,13 +775,65 @@ ompi_comm_split_type(ompi_communicator_t *comm,
     if ( OMPI_SUCCESS != rc ) {
         goto exit;
     }
-
+
+    /* check that all processes have been called with the same value */
+    for ( i=0; i < size; i++) {
+        if ( results[2*i] != myinfo[0] ) {
+            rc = OMPI_ERR_BAD_PARAM;
+            goto exit;
+        }
+    }
+
     /* how many are participating and on my node?
*/ for ( my_size = 0, i=0; i < size; i++) { - if ( results[(2*i)+0] == 1) { + if ( results[2*i] == 1 ) { + if (OPAL_PROC_ON_LOCAL_HWTHREAD(ompi_group_peer_lookup(comm->c_local_group, i)->super.proc_flags)) { + my_size++; + } + } else if ( results[2*i] == 2 ) { + if (OPAL_PROC_ON_LOCAL_CORE(ompi_group_peer_lookup(comm->c_local_group, i)->super.proc_flags)) { + my_size++; + } + } else if ( results[2*i] == 3 ) { + if (OPAL_PROC_ON_LOCAL_L1CACHE(ompi_group_peer_lookup(comm->c_local_group, i)->super.proc_flags)) { + my_size++; + } + } else if ( results[2*i] == 4 ) { + if (OPAL_PROC_ON_LOCAL_L2CACHE(ompi_group_peer_lookup(comm->c_local_group, i)->super.proc_flags)) { + my_size++; + } + } else if ( results[2*i] == 5 ) { + if (OPAL_PROC_ON_LOCAL_L3CACHE(ompi_group_peer_lookup(comm->c_local_group, i)->super.proc_flags)) { + my_size++; + } + } else if ( results[2*i] == 6 ) { + if (OPAL_PROC_ON_LOCAL_SOCKET(ompi_group_peer_lookup(comm->c_local_group, i)->super.proc_flags)) { + my_size++; + } + } else if ( results[2*i] == 7 ) { + if (OPAL_PROC_ON_LOCAL_NUMA(ompi_group_peer_lookup(comm->c_local_group, i)->super.proc_flags)) { + my_size++; + } + } else if ( results[2*i] == 8 ) { if (OPAL_PROC_ON_LOCAL_NODE(ompi_group_peer_lookup(comm->c_local_group, i)->super.proc_flags)) { my_size++; } + } else if ( results[2*i] == 9 ) { + if (OPAL_PROC_ON_LOCAL_BOARD(ompi_group_peer_lookup(comm->c_local_group, i)->super.proc_flags)) { + my_size++; + } + } else if ( results[2*i] == 10 ) { + if (OPAL_PROC_ON_LOCAL_HOST(ompi_group_peer_lookup(comm->c_local_group, i)->super.proc_flags)) { + my_size++; + } + } else if ( results[2*i] == 11 ) { + if (OPAL_PROC_ON_LOCAL_CU(ompi_group_peer_lookup(comm->c_local_group, i)->super.proc_flags)) { + my_size++; + } + } else if ( results[2*i] == 12 ) { + if (OPAL_PROC_ON_LOCAL_CLUSTER(ompi_group_peer_lookup(comm->c_local_group, i)->super.proc_flags)) { + my_size++; + } } } @@ -755,13 +851,63 @@ ompi_comm_split_type(ompi_communicator_t *comm, /* ok we can now fill this info */ for( loc = 0, i = 0; i < size; i++ ) { - if ( results[(2*i)+0] == 1) { + found = 0; + if ( results[2*i] == 1 ) { + if (OPAL_PROC_ON_LOCAL_HWTHREAD(ompi_group_peer_lookup(comm->c_local_group, i)->super.proc_flags)) { + found = 1; + } + } else if ( results[2*i] == 2 ) { + if (OPAL_PROC_ON_LOCAL_CORE(ompi_group_peer_lookup(comm->c_local_group, i)->super.proc_flags)) { + found = 1; + } + } else if ( results[2*i] == 3 ) { + if (OPAL_PROC_ON_LOCAL_L1CACHE(ompi_group_peer_lookup(comm->c_local_group, i)->super.proc_flags)) { + found = 1; + } + } else if ( results[2*i] == 4 ) { + if (OPAL_PROC_ON_LOCAL_L2CACHE(ompi_group_peer_lookup(comm->c_local_group, i)->super.proc_flags)) { + found = 1; + } + } else if ( results[2*i] == 5 ) { + if (OPAL_PROC_ON_LOCAL_L3CACHE(ompi_group_peer_lookup(comm->c_local_group, i)->super.proc_flags)) { + found = 1; + } + } else if ( results[2*i] == 6 ) { + if (OPAL_PROC_ON_LOCAL_SOCKET(ompi_group_peer_lookup(comm->c_local_group, i)->super.proc_flags)) { + found = 1; + } + } else if ( results[2*i] == 7 ) { + if (OPAL_PROC_ON_LOCAL_NUMA(ompi_group_peer_lookup(comm->c_local_group, i)->super.proc_flags)) { + found = 1; + } + } else if ( results[2*i] == 8 ) { if (OPAL_PROC_ON_LOCAL_NODE(ompi_group_peer_lookup(comm->c_local_group, i)->super.proc_flags)) { - sorted[(2*loc)+0] = i; /* copy org rank */ - sorted[(2*loc)+1] = results[(2*i)+1]; /* copy key */ - loc++; + found = 1; + } + } else if ( results[2*i] == 9 ) { + if (OPAL_PROC_ON_LOCAL_BOARD(ompi_group_peer_lookup(comm->c_local_group, 
i)->super.proc_flags)) { + found = 1; + } + } else if ( results[2*i] == 10 ) { + if (OPAL_PROC_ON_LOCAL_HOST(ompi_group_peer_lookup(comm->c_local_group, i)->super.proc_flags)) { + found = 1; + } + } else if ( results[2*i] == 11 ) { + if (OPAL_PROC_ON_LOCAL_CU(ompi_group_peer_lookup(comm->c_local_group, i)->super.proc_flags)) { + found = 1; + } + } else if ( results[2*i] == 12 ) { + if (OPAL_PROC_ON_LOCAL_CLUSTER(ompi_group_peer_lookup(comm->c_local_group, i)->super.proc_flags)) { + found = 1; } } + + /* we have found and occupied the index (i) */ + if ( found == 1 ) { + sorted[2*loc ] = i; /* copy org rank */ + sorted[2*loc+1] = results[2*i+1]; /* copy key */ + loc++; + } } /* the new array needs to be sorted so that it is in 'key' order */ @@ -800,10 +946,54 @@ ompi_comm_split_type(ompi_communicator_t *comm, /* how many are participating and on my node? */ for ( my_rsize = 0, i=0; i < rsize; i++) { - if ( rresults[(2*i)+0] == 1) { + if ( rresults[2*i] == 1 ) { + if (OPAL_PROC_ON_LOCAL_HWTHREAD(ompi_group_peer_lookup(comm->c_remote_group, i)->super.proc_flags)) { + my_rsize++; + } + } else if ( rresults[2*i] == 2 ) { + if (OPAL_PROC_ON_LOCAL_CORE(ompi_group_peer_lookup(comm->c_remote_group, i)->super.proc_flags)) { + my_rsize++; + } + } else if ( rresults[2*i] == 3 ) { + if (OPAL_PROC_ON_LOCAL_L1CACHE(ompi_group_peer_lookup(comm->c_remote_group, i)->super.proc_flags)) { + my_rsize++; + } + } else if ( rresults[2*i] == 4 ) { + if (OPAL_PROC_ON_LOCAL_L2CACHE(ompi_group_peer_lookup(comm->c_remote_group, i)->super.proc_flags)) { + my_rsize++; + } + } else if ( rresults[2*i] == 5 ) { + if (OPAL_PROC_ON_LOCAL_L3CACHE(ompi_group_peer_lookup(comm->c_remote_group, i)->super.proc_flags)) { + my_rsize++; + } + } else if ( rresults[2*i] == 6 ) { + if (OPAL_PROC_ON_LOCAL_SOCKET(ompi_group_peer_lookup(comm->c_remote_group, i)->super.proc_flags)) { + my_rsize++; + } + } else if ( rresults[2*i] == 7 ) { + if (OPAL_PROC_ON_LOCAL_NUMA(ompi_group_peer_lookup(comm->c_remote_group, i)->super.proc_flags)) { + my_rsize++; + } + } else if ( rresults[2*i] == 8 ) { if (OPAL_PROC_ON_LOCAL_NODE(ompi_group_peer_lookup(comm->c_remote_group, i)->super.proc_flags)) { my_rsize++; } + } else if ( rresults[2*i] == 9 ) { + if (OPAL_PROC_ON_LOCAL_BOARD(ompi_group_peer_lookup(comm->c_remote_group, i)->super.proc_flags)) { + my_rsize++; + } + } else if ( rresults[2*i] == 10 ) { + if (OPAL_PROC_ON_LOCAL_HOST(ompi_group_peer_lookup(comm->c_remote_group, i)->super.proc_flags)) { + my_rsize++; + } + } else if ( rresults[2*i] == 11 ) { + if (OPAL_PROC_ON_LOCAL_CU(ompi_group_peer_lookup(comm->c_remote_group, i)->super.proc_flags)) { + my_rsize++; + } + } else if ( rresults[2*i] == 12 ) { + if (OPAL_PROC_ON_LOCAL_CLUSTER(ompi_group_peer_lookup(comm->c_remote_group, i)->super.proc_flags)) { + my_rsize++; + } } } @@ -816,12 +1006,61 @@ ompi_comm_split_type(ompi_communicator_t *comm, /* ok we can now fill this info */ for( loc = 0, i = 0; i < rsize; i++ ) { - if ( rresults[(2*i)+0] == 1) { + found = 0; + if ( rresults[2*i] == 1 ) { + if (OPAL_PROC_ON_LOCAL_HWTHREAD(ompi_group_peer_lookup(comm->c_remote_group, i)->super.proc_flags)) { + found = 1; + } + } else if ( rresults[2*i] == 2 ) { + if (OPAL_PROC_ON_LOCAL_CORE(ompi_group_peer_lookup(comm->c_remote_group, i)->super.proc_flags)) { + found = 1; + } + } else if ( rresults[2*i] == 3 ) { + if (OPAL_PROC_ON_LOCAL_L1CACHE(ompi_group_peer_lookup(comm->c_remote_group, i)->super.proc_flags)) { + found = 1; + } + } else if ( rresults[2*i] == 4 ) { + if 
(OPAL_PROC_ON_LOCAL_L2CACHE(ompi_group_peer_lookup(comm->c_remote_group, i)->super.proc_flags)) { + found = 1; + } + } else if ( rresults[2*i] == 5 ) { + if (OPAL_PROC_ON_LOCAL_L3CACHE(ompi_group_peer_lookup(comm->c_remote_group, i)->super.proc_flags)) { + found = 1; + } + } else if ( rresults[2*i] == 6 ) { + if (OPAL_PROC_ON_LOCAL_SOCKET(ompi_group_peer_lookup(comm->c_remote_group, i)->super.proc_flags)) { + found = 1; + } + } else if ( rresults[2*i] == 7 ) { + if (OPAL_PROC_ON_LOCAL_NUMA(ompi_group_peer_lookup(comm->c_remote_group, i)->super.proc_flags)) { + found = 1; + } + } else if ( rresults[2*i] == 8 ) { if (OPAL_PROC_ON_LOCAL_NODE(ompi_group_peer_lookup(comm->c_remote_group, i)->super.proc_flags)) { - rsorted[(2*loc)+0] = i; /* org rank */ - rsorted[(2*loc)+1] = rresults[(2*i)+1]; /* key */ - loc++; + found = 1; + } + } else if ( rresults[2*i] == 9 ) { + if (OPAL_PROC_ON_LOCAL_BOARD(ompi_group_peer_lookup(comm->c_remote_group, i)->super.proc_flags)) { + found = 1; + } + } else if ( rresults[2*i] == 10 ) { + if (OPAL_PROC_ON_LOCAL_HOST(ompi_group_peer_lookup(comm->c_remote_group, i)->super.proc_flags)) { + found = 1; } + } else if ( rresults[2*i] == 11 ) { + if (OPAL_PROC_ON_LOCAL_CU(ompi_group_peer_lookup(comm->c_remote_group, i)->super.proc_flags)) { + found = 1; + } + } else if ( rresults[2*i] == 12 ) { + if (OPAL_PROC_ON_LOCAL_CLUSTER(ompi_group_peer_lookup(comm->c_remote_group, i)->super.proc_flags)) { + found = 1; + } + } + + if ( found == 1 ) { + rsorted[2*loc ] = i; /* org rank */ + rsorted[2*loc+1] = rresults[2*i+1]; /* key */ + loc++; } } diff --git a/ompi/include/mpi.h.in b/ompi/include/mpi.h.in index a629ce0216f..be899a593e4 100644 --- a/ompi/include/mpi.h.in +++ b/ompi/include/mpi.h.in @@ -667,8 +667,20 @@ enum { * (see also mpif-common.h.fin). 
*/ enum { - MPI_COMM_TYPE_SHARED + OMPI_COMM_TYPE_HWTHREAD, + OMPI_COMM_TYPE_CORE, + OMPI_COMM_TYPE_L1CACHE, + OMPI_COMM_TYPE_L2CACHE, + OMPI_COMM_TYPE_L3CACHE, + OMPI_COMM_TYPE_SOCKET, + OMPI_COMM_TYPE_NUMA, + OMPI_COMM_TYPE_NODE, + OMPI_COMM_TYPE_BOARD, + OMPI_COMM_TYPE_HOST, + OMPI_COMM_TYPE_CU, + OMPI_COMM_TYPE_CLUSTER }; +#define MPI_COMM_TYPE_SHARED OMPI_COMM_TYPE_NODE /* * MPIT Verbosity Levels diff --git a/ompi/include/mpif-values.pl b/ompi/include/mpif-values.pl index 32a8a07e9df..975aa3fc2cc 100755 --- a/ompi/include/mpif-values.pl +++ b/ompi/include/mpif-values.pl @@ -354,7 +354,19 @@ sub write_file { $constants->{MPI_COMBINER_RESIZED} = 17; $constants->{MPI_COMBINER_HINDEXED_BLOCK} = 18; -$constants->{MPI_COMM_TYPE_SHARED} = 0; +$constants->{OMPI_COMM_TYPE_HWTHREAD} = 0; +$constants->{OMPI_COMM_TYPE_CORE} = 1; +$constants->{OMPI_COMM_TYPE_L1CACHE} = 2; +$constants->{OMPI_COMM_TYPE_L2CACHE} = 3; +$constants->{OMPI_COMM_TYPE_L3CACHE} = 4; +$constants->{OMPI_COMM_TYPE_SOCKET} = 5; +$constants->{OMPI_COMM_TYPE_NUMA} = 6; +$constants->{OMPI_COMM_TYPE_NODE} = 7; +$constants->{MPI_COMM_TYPE_SHARED} = 7; +$constants->{OMPI_COMM_TYPE_BOARD} = 8; +$constants->{OMPI_COMM_TYPE_HOST} = 9; +$constants->{OMPI_COMM_TYPE_CU} = 10; +$constants->{OMPI_COMM_TYPE_CLUSTER} = 11; #---------------------------------------------------------------------------- diff --git a/ompi/mpi/c/comm_split_type.c b/ompi/mpi/c/comm_split_type.c index e9776f24c27..1fdece79a0f 100644 --- a/ompi/mpi/c/comm_split_type.c +++ b/ompi/mpi/c/comm_split_type.c @@ -61,7 +61,19 @@ int MPI_Comm_split_type(MPI_Comm comm, int split_type, int key, FUNC_NAME); } - if ( MPI_COMM_TYPE_SHARED != split_type && + if ( MPI_COMM_TYPE_SHARED != split_type && // Same as OMPI_COMM_TYPE_NODE + OMPI_COMM_TYPE_CLUSTER != split_type && + OMPI_COMM_TYPE_CU != split_type && + OMPI_COMM_TYPE_HOST != split_type && + OMPI_COMM_TYPE_BOARD != split_type && + OMPI_COMM_TYPE_NODE != split_type && // Same as MPI_COMM_TYPE_SHARED + OMPI_COMM_TYPE_NUMA != split_type && + OMPI_COMM_TYPE_SOCKET != split_type && + OMPI_COMM_TYPE_L3CACHE != split_type && + OMPI_COMM_TYPE_L2CACHE != split_type && + OMPI_COMM_TYPE_L1CACHE != split_type && + OMPI_COMM_TYPE_CORE != split_type && + OMPI_COMM_TYPE_HWTHREAD != split_type && MPI_UNDEFINED != split_type ) { return OMPI_ERRHANDLER_INVOKE(comm, MPI_ERR_ARG, FUNC_NAME); diff --git a/ompi/mpi/man/man3/MPI_Comm_split_type.3in b/ompi/mpi/man/man3/MPI_Comm_split_type.3in index fe0ed84996d..acac9c7309d 100644 --- a/ompi/mpi/man/man3/MPI_Comm_split_type.3in +++ b/ompi/mpi/man/man3/MPI_Comm_split_type.3in @@ -62,10 +62,53 @@ value MPI_UNDEFINED, in which case newcomm returns MPI_COMM_NULL. .SH SPLIT TYPES .ft R - .TP 1i -MPI_COMM_TYPE_SHARED +MPI_COMM_TYPE_SHARED|OMPI_COMM_TYPE_NODE This type splits the communicator into subcommunicators, each of which can create a shared memory region. +.ft R +.TP 1i +OMPI_COMM_TYPE_HWTHREAD +This type splits the communicator into subcommunicators, each of which belongs to the same hardware thread. +.ft R +.TP 1i +OMPI_COMM_TYPE_CORE +This type splits the communicator into subcommunicators, each of which belongs to the same core/processing unit. +.ft R +.TP 1i +OMPI_COMM_TYPE_L1CACHE +This type splits the communicator into subcommunicators, each of which belongs to the same L1 cache. +.ft R +.TP 1i +OMPI_COMM_TYPE_L2CACHE +This type splits the communicator into subcommunicators, each of which belongs to the same L2 cache. 
+.ft R
+.TP 1i
+OMPI_COMM_TYPE_L3CACHE
+This type splits the communicator into subcommunicators, each of which belongs to the same L3 cache.
+.ft R
+.TP 1i
+OMPI_COMM_TYPE_SOCKET
+This type splits the communicator into subcommunicators, each of which belongs to the same socket.
+.ft R
+.TP 1i
+OMPI_COMM_TYPE_NUMA
+This type splits the communicator into subcommunicators, each of which belongs to the same NUMA node.
+.ft R
+.TP 1i
+OMPI_COMM_TYPE_BOARD
+This type splits the communicator into subcommunicators, each of which belongs to the same board.
+.ft R
+.TP 1i
+OMPI_COMM_TYPE_HOST
+This type splits the communicator into subcommunicators, each of which belongs to the same host.
+.ft R
+.TP 1i
+OMPI_COMM_TYPE_CU
+This type splits the communicator into subcommunicators, each of which belongs to the same computational unit.
+.ft R
+.TP 1i
+OMPI_COMM_TYPE_CLUSTER
+This type splits the communicator into subcommunicators, each of which belongs to the same cluster.
 
 .SH NOTES
 .ft R
@@ -79,6 +122,8 @@ Multiple calls to MPI_Comm_split_type can be used to overcome the requirement th
 Note that keys need not be unique. It is MPI_Comm_split_type's responsibility to sort processes in ascending order according to this key, and to break ties in a consistent way. If all the keys are specified in the same way, then all the processes in a given color will have the relative rank order as they did in their parent group. (In general, they will have different ranks.)
 .sp
 Essentially, making the key value zero for all processes of a given split_type means that one needn't really pay attention to the rank-order of the processes in the new communicator.
+.sp
+The split types prefixed with OMPI_ instead of MPI_ are specific to Open MPI and are not part of the MPI standard.
 .SH ERRORS
 Almost all MPI routines return an error value; C routines as the value of the function and Fortran routines in the last argument.
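For reference, the following minimal usage sketch (not part of the patch) shows how an application might call MPI_Comm_split_type with one of the new split types. It assumes an Open MPI build that includes this change; OMPI_COMM_TYPE_NUMA is an Open MPI-specific value, and only MPI_COMM_TYPE_SHARED is portable to other MPI implementations.

/* Minimal usage sketch (illustrative only, not part of the patch):
 * split MPI_COMM_WORLD into one subcommunicator per NUMA domain using the
 * Open MPI-specific OMPI_COMM_TYPE_NUMA value added by this patch. */
#include <stdio.h>
#include <mpi.h>

int main(int argc, char **argv)
{
    MPI_Comm numa_comm;
    int world_rank, numa_rank, numa_size;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    /* key = 0: ranks keep their relative MPI_COMM_WORLD order in numa_comm */
    MPI_Comm_split_type(MPI_COMM_WORLD, OMPI_COMM_TYPE_NUMA, 0,
                        MPI_INFO_NULL, &numa_comm);

    MPI_Comm_rank(numa_comm, &numa_rank);
    MPI_Comm_size(numa_comm, &numa_size);
    printf("world rank %d is rank %d of %d on its NUMA domain\n",
           world_rank, numa_rank, numa_size);

    MPI_Comm_free(&numa_comm);
    MPI_Finalize();
    return 0;
}

Replacing OMPI_COMM_TYPE_NUMA with the standard MPI_COMM_TYPE_SHARED gives the portable per-node split described above.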