Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 27 additions & 18 deletions orte/mca/rmaps/base/rmaps_base_map_job.c
Original file line number Diff line number Diff line change
Expand Up @@ -378,26 +378,18 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
*/
if (ORTE_ERR_TAKE_NEXT_OPTION != rc) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(caddy);
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED);
return;
goto cleanup;
}
}
/* reset any node map flags we used so the next job will start clean */
for (i=0; i < jdata->map->nodes->size; i++) {
if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, i))) {
ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED);
}
}

if (did_map && ORTE_ERR_RESOURCE_BUSY == rc) {
/* the map was done but nothing could be mapped
* for launch as all the resources were busy
*/
orte_show_help("help-orte-rmaps-base.txt", "cannot-launch", true);
OBJ_RELEASE(caddy);
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED);
return;
goto cleanup;
}

/* if we get here without doing the map, or with zero procs in
Expand All @@ -407,9 +399,8 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
orte_show_help("help-orte-rmaps-base.txt", "failed-map", true,
did_map ? "mapped" : "unmapped",
jdata->num_procs, jdata->map->num_nodes);
OBJ_RELEASE(caddy);
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED);
return;
goto cleanup;
}

/* if any node is oversubscribed, then check to see if a binding
Expand All @@ -423,28 +414,38 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
}

if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
/* we didn't add the nodes to the node map as it would cause them to
* be in a different order than on the backend if this is a dynamic
* spawn (which means we may have started somewhere other than at
* the beginning of the allocation) */
for (i=0; i < orte_node_pool->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
continue;
}
if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
OBJ_RETAIN(node);
opal_pointer_array_add(jdata->map->nodes, node);
}
}
/* compute and save location assignments */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_assign_locations(jdata))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(caddy);
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED);
return;
goto cleanup;
}
} else {
/* compute and save local ranks */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(caddy);
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED);
return;
goto cleanup;
}

/* compute and save bindings */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_bindings(jdata))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(caddy);
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED);
return;
goto cleanup;
}
}

Expand All @@ -465,6 +466,14 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
/* set the job state to the next position */
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_COMPLETE);

cleanup:
/* reset any node map flags we used so the next job will start clean */
for (i=0; i < jdata->map->nodes->size; i++) {
if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, i))) {
ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED);
}
}

/* cleanup */
OBJ_RELEASE(caddy);
}
Expand Down
7 changes: 1 addition & 6 deletions orte/mca/rmaps/ppr/rmaps_ppr.c
Original file line number Diff line number Diff line change
Expand Up @@ -275,12 +275,7 @@ static int ppr_mapper(orte_job_t *jdata)
}
/* add the node to the map, if needed */
if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
ORTE_ERROR_LOG(rc);
goto error;
}
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED);
OBJ_RETAIN(node); /* maintain accounting on object */
jdata->map->num_nodes++;
}
/* if we are mapping solely at the node level, just put
Expand Down Expand Up @@ -407,7 +402,7 @@ static int ppr_mapper(orte_job_t *jdata)
}
return ORTE_SUCCESS;

error:
error:
while (NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item);
}
Expand Down
31 changes: 2 additions & 29 deletions orte/mca/rmaps/round_robin/rmaps_rr_mappers.c
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
orte_std_cntr_t num_slots,
orte_vpid_t num_procs)
{
int rc, i, nprocs_mapped;
int i, nprocs_mapped;
orte_node_t *node;
orte_proc_t *proc;
int num_procs_to_assign, extra_procs_to_assign=0, nxtra_nodes=0;
Expand Down Expand Up @@ -94,12 +94,7 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
for (i=0; i < num_procs_to_assign && nprocs_mapped < app->num_procs; i++) {
/* add this node to the map - do it only once */
if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
ORTE_ERROR_LOG(rc);
return rc;
}
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED);
OBJ_RETAIN(node); /* maintain accounting on object */
++(jdata->map->num_nodes);
}
if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) {
Expand Down Expand Up @@ -149,12 +144,7 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,

/* add this node to the map - do it only once */
if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
ORTE_ERROR_LOG(rc);
return rc;
}
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED);
OBJ_RETAIN(node); /* maintain accounting on object */
++(jdata->map->num_nodes);
}
if (add_one) {
Expand Down Expand Up @@ -221,7 +211,7 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
int j, nprocs_mapped, nnodes;
orte_node_t *node;
orte_proc_t *proc;
int num_procs_to_assign, navg, idx;
int num_procs_to_assign, navg;
int extra_procs_to_assign=0, nxtra_nodes=0;
hwloc_obj_t obj=NULL;
float balance;
Expand Down Expand Up @@ -293,12 +283,7 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
}
/* add this node to the map, but only do so once */
if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
if (ORTE_SUCCESS > (idx = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
ORTE_ERROR_LOG(idx);
return idx;
}
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED);
OBJ_RETAIN(node); /* maintain accounting on object */
++(jdata->map->num_nodes);
}
if (oversubscribed) {
Expand Down Expand Up @@ -456,7 +441,6 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
orte_node_t *node;
orte_proc_t *proc;
int nprocs, start;
int idx;
hwloc_obj_t obj=NULL;
unsigned int nobjs;
bool add_one;
Expand Down Expand Up @@ -547,12 +531,7 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
}
/* add this node to the map, if reqd */
if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
if (ORTE_SUCCESS > (idx = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
ORTE_ERROR_LOG(idx);
return idx;
}
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED);
OBJ_RETAIN(node); /* maintain accounting on object */
++(jdata->map->num_nodes);
}
nmapped = 0;
Expand Down Expand Up @@ -638,7 +617,6 @@ static int byobj_span(orte_job_t *jdata,
orte_node_t *node;
orte_proc_t *proc;
int nprocs, nxtra_objs;
int idx;
hwloc_obj_t obj=NULL;
unsigned int nobjs;

Expand Down Expand Up @@ -699,12 +677,7 @@ static int byobj_span(orte_job_t *jdata,
OPAL_LIST_FOREACH(node, node_list, orte_node_t) {
/* add this node to the map, if reqd */
if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
if (ORTE_SUCCESS > (idx = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
ORTE_ERROR_LOG(idx);
return idx;
}
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED);
OBJ_RETAIN(node); /* maintain accounting on object */
++(jdata->map->num_nodes);
}
/* get the number of objects of this type on this node */
Expand Down
13 changes: 11 additions & 2 deletions orte/runtime/data_type_support/orte_dt_packing_fns.c
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ int orte_dt_pack_job(opal_buffer_t *buffer, const void *src,
int32_t num_vals, opal_data_type_t type)
{
int rc;
int32_t i, j, count;
int32_t i, j, count, bookmark;
orte_job_t **jobs;
orte_app_context_t *app;
orte_proc_t *proc;
Expand Down Expand Up @@ -241,7 +241,16 @@ int orte_dt_pack_job(opal_buffer_t *buffer, const void *src,
}
}

/* do not pack the bookmark or oversubscribe_override flags */
/* pack the bookmark */
if (NULL == jobs[i]->bookmark) {
bookmark = -1;
} else {
bookmark = jobs[i]->bookmark->index;
}
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &bookmark, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}

/* pack the job state */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
Expand Down
14 changes: 12 additions & 2 deletions orte/runtime/data_type_support/orte_dt_unpacking_fns.c
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest,
int32_t *num_vals, opal_data_type_t type)
{
int rc;
int32_t i, k, n, count;
int32_t i, k, n, count, bookmark;
orte_job_t **jobs;
orte_app_idx_t j;
orte_attribute_t *kv;
Expand Down Expand Up @@ -237,7 +237,17 @@ int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest,
}
}

/* no bookmark or oversubscribe_override flags to unpack */
/* unpack the bookmark */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
&bookmark, &n, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (0 <= bookmark) {
/* retrieve it */
jobs[i]->bookmark = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, bookmark);
}

/* unpack the job state */
n = 1;
Expand Down
10 changes: 8 additions & 2 deletions orte/test/mpi/simple_spawn.c
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>
#include <sys/param.h>
Expand All @@ -12,9 +13,15 @@ int main(int argc, char* argv[])
int rank, size;
char hostname[MAXHOSTNAMELEN];
pid_t pid;
char *env_rank,*env_nspace;

env_rank = getenv("PMIX_RANK");
env_nspace = getenv("PMIX_NAMESPACE");
pid = getpid();
printf("[pid %ld] starting up!\n", (long)pid);
gethostname(hostname, sizeof(hostname));

printf("[%s:%s pid %ld] starting up on node %s!\n", env_nspace, env_rank, (long)pid, hostname);

MPI_Init(NULL, NULL);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
printf("%d completed MPI_Init\n", rank);
Expand Down Expand Up @@ -42,7 +49,6 @@ int main(int argc, char* argv[])
else {
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
gethostname(hostname, sizeof(hostname));
pid = getpid();
printf("Hello from the child %d of %d on host %s pid %ld\n", rank, 3, hostname, (long)pid);
if (0 == rank) {
Expand Down