Skip to content

Commit b0e2a59

Browse files
committed
communicator bugfix: disjoint function does not have the correct max_local_peers value
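local_peers is passed to the non-blocking function iallreduce_fn as a stack variable, so it may no longer be valid by the time the reduction actually reads it after ompi_comm_activate_nb has returned. Make it part of the context struct, which outlives the request, so the correct value is passed. Signed-off-by: Jessie Yang <[email protected]>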
1 parent 74fbf8f commit b0e2a59

File tree

1 file changed

+9
-4
lines changed

1 file changed

+9
-4
lines changed

ompi/communicator/comm_cid.c

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ struct ompi_comm_cid_context_t {
9999
int iter;
100100
/** storage for activate barrier */
101101
int max_local_peers;
102+
int local_peers;
102103
char *port_string;
103104
bool send_first;
104105
int pml_tag;
@@ -267,6 +268,7 @@ static ompi_comm_cid_context_t *mca_comm_cid_context_alloc (ompi_communicator_t
267268
context->send_first = send_first;
268269
context->iter = 0;
269270
context->max_local_peers = ompi_group_count_local_peers(newcomm->c_local_group);
271+
context->local_peers = context->max_local_peers;
270272

271273
return context;
272274
}
@@ -774,6 +776,10 @@ static int ompi_comm_activate_nb_complete (ompi_comm_request_t *request);
774776
/* Callback function to set communicator disjointness flags */
775777
static inline void ompi_comm_set_disjointness_nb_complete(ompi_comm_cid_context_t *context)
776778
{
779+
/* Only set the disjoint flags when it is intra-communicator */
780+
if (OMPI_COMM_IS_INTER(*context->newcommp)) {
781+
return;
782+
}
777783
if (OMPI_COMM_IS_DISJOINT_SET(*context->newcommp)) {
778784
opal_show_help("help-comm.txt", "disjointness-set-again", true);
779785
return;
@@ -870,7 +876,7 @@ int ompi_comm_activate_nb (ompi_communicator_t **newcomm, ompi_communicator_t *c
870876
ompi_comm_cid_context_t *context;
871877
ompi_comm_request_t *request;
872878
ompi_request_t *subreq;
873-
int ret = 0, local_peers = -1;
879+
int ret = 0;
874880

875881
/* the caller should not pass NULL for comm (it may be the same as *newcomm) */
876882
assert (NULL != comm);
@@ -907,9 +913,8 @@ int ompi_comm_activate_nb (ompi_communicator_t **newcomm, ompi_communicator_t *c
907913
* 1. The communicator's disjointness is inferred from max_local_peers.
908914
* 2. After the operation it is allowed to send messages over the new communicator.
909915
*/
910-
local_peers = context->max_local_peers;
911-
ret = context->iallreduce_fn (&local_peers, &context->max_local_peers, 1, MPI_MAX, context,
912-
&subreq);
916+
ret = context->iallreduce_fn (&context->local_peers, &context->max_local_peers, 1, MPI_MAX, context,
917+
&subreq);
913918
if (OMPI_SUCCESS != ret) {
914919
ompi_comm_request_return (request);
915920
return ret;

0 commit comments

Comments
 (0)