From c4cd12bc436afa5607910e464b80cb50032fe5cc Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Thu, 21 Dec 2017 17:20:27 +0900 Subject: [PATCH 1/5] plm/rsh: fix parameter handling in rsh_wait_daemon() since open-mpi/ompi@8f496b01b729a86da3e43824d4fb609d21057fbb rsh_wait_daemon is invoked with an orte_wait_tracker_t *, that must be used to reach the orte_plm_rsh_caddy_t *. Signed-off-by: Gilles Gouaillardet --- orte/mca/plm/rsh/plm_rsh_module.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/orte/mca/plm/rsh/plm_rsh_module.c b/orte/mca/plm/rsh/plm_rsh_module.c index 94fe84f5c47..2554298b3b2 100644 --- a/orte/mca/plm/rsh/plm_rsh_module.c +++ b/orte/mca/plm/rsh/plm_rsh_module.c @@ -263,7 +263,8 @@ static int rsh_init(void) static void rsh_wait_daemon(int sd, short flags, void *cbdata) { orte_job_t *jdata; - orte_plm_rsh_caddy_t *caddy=(orte_plm_rsh_caddy_t*)cbdata; + orte_wait_tracker_t *t2 = (orte_wait_tracker_t*)cbdata; + orte_plm_rsh_caddy_t *caddy=(orte_plm_rsh_caddy_t*)t2->cbdata; orte_proc_t *daemon = caddy->daemon; char *rtmod; @@ -272,6 +273,7 @@ static void rsh_wait_daemon(int sd, short flags, void *cbdata) * session attached, e.g., while debugging */ OBJ_RELEASE(caddy); + OBJ_RELEASE(t2); return; } @@ -325,7 +327,7 @@ static void rsh_wait_daemon(int sd, short flags, void *cbdata) opal_event_active(&launch_event, EV_WRITE, 1); } /* cleanup */ - OBJ_RELEASE(caddy); + OBJ_RELEASE(t2); } static int setup_launch(int *argcptr, char ***argvptr, From f7e29127bcc753f1068ae2b9ce319499acb8912b Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Fri, 22 Dec 2017 11:17:34 +0900 Subject: [PATCH 2/5] sstore/stage: fix parameter handling in sstore_stage_local_compress_waitpid_cb() since open-mpi/ompi@8f496b01b729a86da3e43824d4fb609d21057fbb sstore_stage_local_compress_waitpid_cb is invoked with an orte_wait_tracker_t *, that must be used to reach the orte_sstore_stage_local_app_snapshot_info_t *. 
Signed-off-by: Gilles Gouaillardet --- orte/mca/sstore/stage/sstore_stage_local.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/orte/mca/sstore/stage/sstore_stage_local.c b/orte/mca/sstore/stage/sstore_stage_local.c index 46ad2774e81..56dd0373fbd 100644 --- a/orte/mca/sstore/stage/sstore_stage_local.c +++ b/orte/mca/sstore/stage/sstore_stage_local.c @@ -1,9 +1,11 @@ /* - * Copyright (c) 2010 The Trustees of Indiana University. + * Copyright (c) 2010 The Trustees of Indiana University. * All rights reserved. * Copyright (c) 2004-2011 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2017 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -1550,8 +1552,9 @@ static int start_compression(orte_sstore_stage_local_snapshot_info_t *handle_inf static void sstore_stage_local_compress_waitpid_cb(orte_proc_t *proc, void* cbdata) { orte_sstore_stage_local_app_snapshot_info_t *app_info = NULL; + orte_wait_tracker_t *t2 = (orte_wait_tracker_t *)cbdata; - app_info = (orte_sstore_stage_local_app_snapshot_info_t*)cbdata; + app_info = (orte_sstore_stage_local_app_snapshot_info_t*)t2->cbdata; OPAL_OUTPUT_VERBOSE((10, mca_sstore_stage_component.super.output_handle, "sstore:stage:(local): waitpid(%6d) Compression finished for Process %s", @@ -1560,6 +1563,7 @@ static void sstore_stage_local_compress_waitpid_cb(orte_proc_t *proc, void* cbda app_info->compress_pid = 0; OBJ_RELEASE(proc); + OBJ_RELEASE(t2); } static int wait_all_compressed(orte_sstore_stage_local_snapshot_info_t *handle_info) From 799152e7fb96b932d734a7558dadf2a8e8e167d1 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Thu, 21 Dec 2017 17:27:35 +0900 Subject: [PATCH 3/5] plm/base: add the orte_plm_base_node_regex_threshold MCA parameter This parameter can be used to set the node regex max length that can be passed to 
the orted command line. For testing purposes, it can be set to zero in order to force the node regex to be retrieved by orted from its parent. Signed-off-by: Gilles Gouaillardet --- orte/mca/plm/base/plm_base_frame.c | 17 +++++++++++++++-- orte/mca/plm/base/plm_base_launch_support.c | 2 +- orte/mca/plm/base/plm_private.h | 3 +++ orte/util/nidmap.h | 4 ++-- 4 files changed, 21 insertions(+), 5 deletions(-) diff --git a/orte/mca/plm/base/plm_base_frame.c b/orte/mca/plm/base/plm_base_frame.c index 9fc9752b41b..b494b0b1565 100644 --- a/orte/mca/plm/base/plm_base_frame.c +++ b/orte/mca/plm/base/plm_base_frame.c @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2011-2013 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2015 Research Organization for Information Science + * Copyright (c) 2015-2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * @@ -52,6 +52,19 @@ orte_plm_globals_t orte_plm_globals = {0}; orte_plm_base_module_t orte_plm = {0}; +static int mca_plm_base_register(mca_base_register_flag_t flags) +{ + orte_plm_globals.node_regex_threshold = 1024; + (void) mca_base_var_register("orte", "plm", "base", "node_regex_threshold", + "Only pass the node regex on the orted command line if smaller than this threshold", + MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, + MCA_BASE_VAR_FLAG_INTERNAL, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &orte_plm_globals.node_regex_threshold); + return ORTE_SUCCESS; +} + static int orte_plm_base_close(void) { int rc; @@ -88,5 +101,5 @@ static int orte_plm_base_open(mca_base_open_flag_t flags) return mca_base_framework_components_open(&orte_plm_base_framework, flags); } -MCA_BASE_FRAMEWORK_DECLARE(orte, plm, NULL, NULL, orte_plm_base_open, orte_plm_base_close, +MCA_BASE_FRAMEWORK_DECLARE(orte, plm, NULL, mca_plm_base_register, orte_plm_base_open, orte_plm_base_close, mca_plm_base_static_components, 0); diff --git
a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index 9932706a64d..4aa45fd3233 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -1567,7 +1567,7 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv, } /* if this is too long, then we'll have to do it with * a phone home operation instead */ - if (strlen(param) < ORTE_MAX_REGEX_CMD_LENGTH) { + if (strlen(param) < orte_plm_globals.node_regex_threshold) { opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID); opal_argv_append(argc, argv, "orte_node_regex"); opal_argv_append(argc, argv, param); diff --git a/orte/mca/plm/base/plm_private.h b/orte/mca/plm/base/plm_private.h index 047a508394c..3a58c351b34 100644 --- a/orte/mca/plm/base/plm_private.h +++ b/orte/mca/plm/base/plm_private.h @@ -12,6 +12,8 @@ * Copyright (c) 2011-2013 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2017 Intel, Inc. All rights reserved. + * Copyright (c) 2017 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -59,6 +61,7 @@ typedef struct { opal_buffer_t tree_spawn_cmd; /* daemon nodes assigned at launch */ bool daemon_nodes_assigned_at_launch; + size_t node_regex_threshold; } orte_plm_globals_t; /** * Global instance of PLM framework data diff --git a/orte/util/nidmap.h b/orte/util/nidmap.h index e8c6f59bc21..3be3f71dbf9 100644 --- a/orte/util/nidmap.h +++ b/orte/util/nidmap.h @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. + * Copyright (c) 2017 Research Organization for Information Science + * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -37,8 +39,6 @@ BEGIN_C_DECLS -#define ORTE_MAX_REGEX_CMD_LENGTH 1024 - #define ORTE_MAX_NODE_PREFIX 50 #define ORTE_CONTIG_NODE_CMD 0x01 #define ORTE_NON_CONTIG_NODE_CMD 0x02 From 45275848407f8774b7746dbd027568a57a4ffeea Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Thu, 21 Dec 2017 17:27:59 +0900 Subject: [PATCH 4/5] orted: fix tree-spawn when the node regex is too long When the node regex is too long to be sent on the command line, retrieve it first from the parent, and then spawn the remote orted Signed-off-by: Gilles Gouaillardet --- orte/mca/plm/base/plm_base_launch_support.c | 7 ++- orte/mca/plm/rsh/plm_rsh_module.c | 1 - orte/mca/rml/base/rml_base_msg_handlers.c | 30 +++++++++- orte/mca/rml/rml_types.h | 5 ++ orte/orted/orted_main.c | 61 +++++++++++++++++++-- 5 files changed, 94 insertions(+), 10 deletions(-) diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index 4aa45fd3233..7eaaca6d4c9 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -1565,16 +1565,19 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv, ORTE_ERROR_LOG(rc); return rc; } + if (NULL != orte_node_regex) { + free(orte_node_regex); + } + orte_node_regex = param; /* if this is too long, then we'll have to do it with * a phone home operation instead */ if (strlen(param) < orte_plm_globals.node_regex_threshold) { opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID); opal_argv_append(argc, argv, "orte_node_regex"); - opal_argv_append(argc, argv, param); + opal_argv_append(argc, argv, orte_node_regex); /* mark that the nidmap has been communicated */ orte_nidmap_communicated = true; } - free(param); if (!orte_static_ports && !orte_fwd_mpirun_port) { /* if we are using static ports, or we are forwarding diff --git a/orte/mca/plm/rsh/plm_rsh_module.c b/orte/mca/plm/rsh/plm_rsh_module.c index 
2554298b3b2..b14c8fa2ee4 100644 --- a/orte/mca/plm/rsh/plm_rsh_module.c +++ b/orte/mca/plm/rsh/plm_rsh_module.c @@ -825,7 +825,6 @@ static int remote_spawn(opal_buffer_t *launch) prefix = NULL; } } - /* get the updated routing list */ rtmod = orte_rml.get_routed(orte_coll_conduit); OBJ_CONSTRUCT(&coll, opal_list_t); diff --git a/orte/mca/rml/base/rml_base_msg_handlers.c b/orte/mca/rml/base/rml_base_msg_handlers.c index 69c2ade7ae1..72a37cdae9f 100644 --- a/orte/mca/rml/base/rml_base_msg_handlers.c +++ b/orte/mca/rml/base/rml_base_msg_handlers.c @@ -13,6 +13,8 @@ * Copyright (c) 2007-2013 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2015-2017 Intel, Inc. All rights reserved. + * Copyright (c) 2017 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -172,8 +174,32 @@ void orte_rml_base_process_msg(int fd, short flags, void *cbdata) /* if this message is just to warmup the connection, then drop it */ if (ORTE_RML_TAG_WARMUP_CONNECTION == msg->tag) { - OBJ_RELEASE(msg); - return; + if (!orte_nidmap_communicated) { + opal_buffer_t * buffer = OBJ_NEW(opal_buffer_t); + int rc; + if (NULL == buffer) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return; + } + assert (NULL != orte_node_regex); + + if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &orte_node_regex, 1, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(buffer); + return; + } + + if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit, + &msg->sender, buffer, + ORTE_RML_TAG_NODE_REGEX_REPORT, + orte_rml_send_callback, NULL))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(buffer); + return; + } + OBJ_RELEASE(msg); + return; + } } /* see if we have a waiting recv for this message */ diff --git a/orte/mca/rml/rml_types.h b/orte/mca/rml/rml_types.h index 5cfbb07072c..2acb03c1bb0 100644 --- a/orte/mca/rml/rml_types.h +++ b/orte/mca/rml/rml_types.h @@ -13,6 +13,8 @@ * reserved. 
* Copyright (c) 2009-2016 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. + * Copyright (c) 2017 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -175,6 +177,9 @@ BEGIN_C_DECLS /* warmup connection - simply establishes the connection */ #define ORTE_RML_TAG_WARMUP_CONNECTION 63 +/* node regex report */ +#define ORTE_RML_TAG_NODE_REGEX_REPORT 64 + #define ORTE_RML_TAG_MAX 100 diff --git a/orte/orted/orted_main.c b/orte/orted/orted_main.c index bb3bbec90a6..80788305b2f 100644 --- a/orte/orted/orted_main.c +++ b/orte/orted/orted_main.c @@ -117,8 +117,14 @@ static void pipe_closed(int fd, short flags, void *arg); static void rollup(int status, orte_process_name_t* sender, opal_buffer_t *buffer, orte_rml_tag_t tag, void *cbdata); +static void node_regex_report(int status, orte_process_name_t* sender, + opal_buffer_t *buffer, + orte_rml_tag_t tag, void *cbdata); +static void report_orted(void); + static opal_buffer_t *bucket, *mybucket = NULL; static int ncollected = 0; +static bool node_regex_waiting = false; static char *orte_parent_uri = NULL; @@ -734,6 +740,11 @@ int orte_daemon(int argc, char *argv[]) * a little time in the launch phase by "warming up" the * connection to our parent while we wait for our children */ buffer = OBJ_NEW(opal_buffer_t); // zero-byte message + if (NULL == orte_node_regex) { + orte_rml.recv_buffer_nb(ORTE_PROC_MY_PARENT, ORTE_RML_TAG_NODE_REGEX_REPORT, + ORTE_RML_PERSISTENT, node_regex_report, &node_regex_waiting); + node_regex_waiting = true; + } if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit, ORTE_PROC_MY_PARENT, buffer, ORTE_RML_TAG_WARMUP_CONNECTION, @@ -969,8 +980,10 @@ int orte_daemon(int argc, char *argv[]) i += 2; } } - /* now launch any child daemons of ours */ - orte_plm.remote_spawn(orte_tree_launch_cmd); + if (NULL != orte_node_regex) { + /* now launch any child daemons 
of ours */ + orte_plm.remote_spawn(orte_tree_launch_cmd); + } } if (orte_debug_daemons_flag) { @@ -1052,8 +1065,6 @@ static void rollup(int status, orte_process_name_t* sender, opal_buffer_t *buffer, orte_rml_tag_t tag, void *cbdata) { - int nreqd; - char *rtmod; int ret; orte_process_name_t child; int32_t i, flag, cnt; @@ -1095,10 +1106,17 @@ static void rollup(int status, orte_process_name_t* sender, } report: + report_orted(); +} + +static void report_orted() { + char *rtmod; + int nreqd, ret; + /* get the number of children */ rtmod = orte_rml.get_routed(orte_mgmt_conduit); nreqd = orte_routed.num_routes(rtmod) + 1; - if (nreqd == ncollected && NULL != mybucket) { + if (nreqd == ncollected && NULL != mybucket && !node_regex_waiting) { /* add the collection of our children's buckets to ours */ opal_dss.copy_payload(mybucket, bucket); OBJ_RELEASE(bucket); @@ -1112,3 +1130,36 @@ static void rollup(int status, orte_process_name_t* sender, } } } + +static void node_regex_report(int status, orte_process_name_t* sender, + opal_buffer_t *buffer, + orte_rml_tag_t tag, void *cbdata) { + int rc, n=1; + char * regex; + assert(NULL == orte_node_regex); + bool * active = (bool *)cbdata; + + /* extract the node regex if needed, and update the routing tree */ + n = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &regex, &n, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + return; + } + orte_node_regex = regex; + + if (ORTE_SUCCESS != (rc = orte_util_nidmap_parse(orte_node_regex))) { + ORTE_ERROR_LOG(rc); + return; + } + + /* update the routing tree so any tree spawn operation + * properly gets the number of children underneath us */ + orte_routed.update_routing_plan(NULL); + + *active = false; + + /* now launch any child daemons of ours */ + orte_plm.remote_spawn(orte_tree_launch_cmd); + + report_orted(); +} From 03da5218eadbafb534a3fac10d74d84a5209c0f7 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Thu, 21 Dec 2017 17:30:45 +0900 Subject: [PATCH 5/5] orte: remove some
dead code related to the new tree_spawn method Now that the daemon calls remote_spawn itself, there is no longer a need for the "tree_spawn" command nor the associated command processing code since the HNP is no longer sending a tree-spawn message to the orted. Thanks Ralph for the guidance ! Signed-off-by: Gilles Gouaillardet --- orte/mca/odls/odls_types.h | 3 +- orte/mca/plm/isolated/plm_isolated.c | 6 ++-- orte/mca/plm/plm.h | 4 ++- orte/mca/plm/rsh/plm_rsh_module.c | 43 ++++++---------------------- orte/orted/orted_comm.c | 18 ------------ orte/orted/orted_main.c | 4 +-- orte/runtime/orte_globals.c | 4 +-- orte/runtime/orte_globals.h | 3 +- 8 files changed, 23 insertions(+), 62 deletions(-) diff --git a/orte/mca/odls/odls_types.h b/orte/mca/odls/odls_types.h index 1362b1b6332..539f9a6ef5e 100644 --- a/orte/mca/odls/odls_types.h +++ b/orte/mca/odls/odls_types.h @@ -13,6 +13,8 @@ * Copyright (c) 2011-2012 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. + * Copyright (c) 2018 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -44,7 +46,6 @@ typedef uint8_t orte_daemon_cmd_flag_t; #define ORTE_DAEMON_KILL_LOCAL_PROCS (orte_daemon_cmd_flag_t) 2 #define ORTE_DAEMON_SIGNAL_LOCAL_PROCS (orte_daemon_cmd_flag_t) 3 #define ORTE_DAEMON_ADD_LOCAL_PROCS (orte_daemon_cmd_flag_t) 4 -#define ORTE_DAEMON_TREE_SPAWN (orte_daemon_cmd_flag_t) 5 #define ORTE_DAEMON_HEARTBEAT_CMD (orte_daemon_cmd_flag_t) 6 #define ORTE_DAEMON_EXIT_CMD (orte_daemon_cmd_flag_t) 7 #define ORTE_DAEMON_PROCESS_AND_RELAY_CMD (orte_daemon_cmd_flag_t) 9 diff --git a/orte/mca/plm/isolated/plm_isolated.c b/orte/mca/plm/isolated/plm_isolated.c index f237a503b09..211fa11ee75 100644 --- a/orte/mca/plm/isolated/plm_isolated.c +++ b/orte/mca/plm/isolated/plm_isolated.c @@ -15,6 +15,8 @@ * Copyright (c) 2008-2009 Sun Microsystems, Inc. 
All rights reserved. * Copyright (c) 2011 IBM Corporation. All rights reserved. * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. + * Copyright (c) 2017 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -50,7 +52,7 @@ static int isolated_init(void); static int isolated_launch(orte_job_t *jdata); -static int remote_spawn(opal_buffer_t *launch); +static int remote_spawn(); static int isolated_terminate_orteds(void); static int isolated_finalize(void); @@ -93,7 +95,7 @@ static int isolated_init(void) /* * launch a set of daemons from a remote daemon */ -static int remote_spawn(opal_buffer_t *launch) +static int remote_spawn() { /* unused function in this mode */ return ORTE_SUCCESS; diff --git a/orte/mca/plm/plm.h b/orte/mca/plm/plm.h index 96dd78b248a..3d1d115c73d 100644 --- a/orte/mca/plm/plm.h +++ b/orte/mca/plm/plm.h @@ -12,6 +12,8 @@ * All rights reserved. * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2017 Research Organization for Information Science + * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -63,7 +65,7 @@ typedef int (*orte_plm_base_module_spawn_fn_t)(orte_job_t *jdata); /* * Remote spawn - spawn called by a daemon to launch a process on its own */ -typedef int (*orte_plm_base_module_remote_spawn_fn_t)(opal_buffer_t *launch); +typedef int (*orte_plm_base_module_remote_spawn_fn_t)(void); /* * Entry point to set the HNP name diff --git a/orte/mca/plm/rsh/plm_rsh_module.c b/orte/mca/plm/rsh/plm_rsh_module.c index b14c8fa2ee4..9f56be962c8 100644 --- a/orte/mca/plm/rsh/plm_rsh_module.c +++ b/orte/mca/plm/rsh/plm_rsh_module.c @@ -101,7 +101,7 @@ static int rsh_init(void); static int rsh_launch(orte_job_t *jdata); -static int remote_spawn(opal_buffer_t *launch); +static int remote_spawn(void); static int rsh_terminate_orteds(void); static int rsh_finalize(void); @@ -784,7 +784,7 @@ static void ssh_child(int argc, char **argv) /* * launch a set of daemons from a remote daemon */ -static int remote_spawn(opal_buffer_t *launch) +static int remote_spawn() { int node_name_index1; int proc_vpid_index; @@ -793,7 +793,6 @@ static int remote_spawn(opal_buffer_t *launch) int argc; int rc=ORTE_SUCCESS; bool failed_launch = true; - orte_std_cntr_t n; orte_process_name_t target; orte_plm_rsh_caddy_t *caddy; orte_job_t *daemons; @@ -808,23 +807,15 @@ static int remote_spawn(opal_buffer_t *launch) /* if we hit any errors, tell the HNP it was us */ target.vpid = ORTE_PROC_MY_NAME->vpid; - if (NULL != launch) { - /* extract the prefix from the launch buffer */ - n = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(launch, &prefix, &n, OPAL_STRING))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } + /* check to see if enable-orterun-prefix-by-default was given - if + * this is being done by a singleton, then orterun will not be there + * to put the prefix in the app. 
So make sure we check to find it */ + if ((bool)ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT) { + prefix = strdup(opal_install_dirs.prefix); } else { - /* check to see if enable-orterun-prefix-by-default was given - if - * this is being done by a singleton, then orterun will not be there - * to put the prefix in the app. So make sure we check to find it */ - if ((bool)ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT) { - prefix = strdup(opal_install_dirs.prefix); - } else { - prefix = NULL; - } + prefix = NULL; } + /* get the updated routing list */ rtmod = orte_rml.get_routed(orte_coll_conduit); OBJ_CONSTRUCT(&coll, opal_list_t); @@ -1180,24 +1171,8 @@ static void launch_daemons(int fd, short args, void *cbdata) /* if we are tree launching, find our children and create the launch cmd */ if (!mca_plm_rsh_component.no_tree_spawn) { - orte_daemon_cmd_flag_t command = ORTE_DAEMON_TREE_SPAWN; orte_job_t *jdatorted; - /* get the tree spawn buffer */ - orte_tree_launch_cmd = OBJ_NEW(opal_buffer_t); - /* insert the tree_spawn cmd */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(orte_tree_launch_cmd, &command, 1, ORTE_DAEMON_CMD))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(orte_tree_launch_cmd); - goto cleanup; - } - /* pack the prefix since this will be needed by the next wave */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(orte_tree_launch_cmd, &prefix_dir, 1, OPAL_STRING))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(orte_tree_launch_cmd); - goto cleanup; - } - /* get the orted job data object */ if (NULL == (jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); diff --git a/orte/orted/orted_comm.c b/orte/orted/orted_comm.c index c99e9845a4b..7d18e1ad922 100644 --- a/orte/orted/orted_comm.c +++ b/orte/orted/orted_comm.c @@ -341,22 +341,6 @@ void orte_daemon_recv(int status, orte_process_name_t* sender, break; - /**** TREE_SPAWN ****/ - case ORTE_DAEMON_TREE_SPAWN: - if (orte_debug_daemons_flag) { - opal_output(0, "%s orted_cmd: received tree_spawn", - 
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - } - /* if the PLM supports remote spawn, pass it all along */ - if (NULL != orte_plm.remote_spawn) { - if (ORTE_SUCCESS != (ret = orte_plm.remote_spawn(buffer))) { - ORTE_ERROR_LOG(ret); - } - } else { - opal_output(0, "%s remote spawn is NULL!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - } - break; - /**** EXIT COMMAND ****/ case ORTE_DAEMON_EXIT_CMD: if (orte_debug_daemons_flag) { @@ -1400,8 +1384,6 @@ static char *get_orted_comm_cmd_str(int command) return strdup("ORTE_DAEMON_SIGNAL_LOCAL_PROCS"); case ORTE_DAEMON_ADD_LOCAL_PROCS: return strdup("ORTE_DAEMON_ADD_LOCAL_PROCS"); - case ORTE_DAEMON_TREE_SPAWN: - return strdup("ORTE_DAEMON_TREE_SPAWN"); case ORTE_DAEMON_HEARTBEAT_CMD: return strdup("ORTE_DAEMON_HEARTBEAT_CMD"); diff --git a/orte/orted/orted_main.c b/orte/orted/orted_main.c index 80788305b2f..c465f327cc8 100644 --- a/orte/orted/orted_main.c +++ b/orte/orted/orted_main.c @@ -982,7 +982,7 @@ int orte_daemon(int argc, char *argv[]) } if (NULL != orte_node_regex) { /* now launch any child daemons of ours */ - orte_plm.remote_spawn(orte_tree_launch_cmd); + orte_plm.remote_spawn(); } } @@ -1159,7 +1159,7 @@ static void node_regex_report(int status, orte_process_name_t* sender, *active = false; /* now launch any child daemons of ours */ - orte_plm.remote_spawn(orte_tree_launch_cmd); + orte_plm.remote_spawn(); report_orted(); } diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index d07b9c17401..ec89f1d4034 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -14,7 +14,7 @@ * Copyright (c) 2011-2013 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. - * Copyright (c) 2014-2015 Research Organization for Information Science + * Copyright (c) 2014-2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. 
* $COPYRIGHT$ @@ -136,8 +136,6 @@ int orte_timeout_usec_per_proc = -1; float orte_max_timeout = -1.0; orte_timer_t *orte_mpiexec_timeout = NULL; -opal_buffer_t *orte_tree_launch_cmd = NULL; - int orte_stack_trace_wait_timeout = 30; /* global arrays for data storage */ diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h index eb1039edaa3..001f7302254 100644 --- a/orte/runtime/orte_globals.h +++ b/orte/runtime/orte_globals.h @@ -15,6 +15,8 @@ * All rights reserved. * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. + * Copyright (c) 2017 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -517,7 +519,6 @@ ORTE_DECLSPEC extern int orte_startup_timeout; ORTE_DECLSPEC extern int orte_timeout_usec_per_proc; ORTE_DECLSPEC extern float orte_max_timeout; ORTE_DECLSPEC extern orte_timer_t *orte_mpiexec_timeout; -ORTE_DECLSPEC extern opal_buffer_t *orte_tree_launch_cmd; /* global arrays for data storage */ ORTE_DECLSPEC extern opal_hash_table_t *orte_job_data;