Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion opal/mca/pmix/base/base.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -46,6 +46,15 @@ OPAL_DECLSPEC void opal_pmix_base_errhandler(int status,
OPAL_DECLSPEC int opal_pmix_base_exchange(opal_value_t *info,
opal_pmix_pdata_t *pdat,
int timeout);

/* Store the given event base pointer in opal_pmix_base.evbase. */
OPAL_DECLSPEC void opal_pmix_base_set_evbase(opal_event_base_t *evbase);

/* State shared across the pmix base framework. */
typedef struct {
/* event base pointer recorded by opal_pmix_base_set_evbase() */
opal_event_base_t *evbase;
} opal_pmix_base_t;

/* The single shared instance of the base state; declared here only. */
extern opal_pmix_base_t opal_pmix_base;

END_C_DECLS

#endif
5 changes: 5 additions & 0 deletions opal/mca/pmix/base/pmix_base_fns.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,11 @@

#define OPAL_PMI_PAD 10

/* Record the event base in the shared pmix base state.
 *
 * The pointer is stored as-is in opal_pmix_base.evbase; it is neither
 * validated nor copied here.
 */
void opal_pmix_base_set_evbase(opal_event_base_t *evbase)
{
opal_pmix_base.evbase = evbase;
}

/******** ERRHANDLER SUPPORT FOR COMPONENTS THAT
******** DO NOT NATIVELY SUPPORT IT
********/
Expand Down
2 changes: 1 addition & 1 deletion opal/mca/pmix/base/pmix_base_frame.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,9 @@
https://github.com/open-mpi/ompi/issues/375 for details. */
opal_pmix_base_module_t opal_pmix = { 0 };
bool opal_pmix_collect_all_data = true;
bool opal_pmix_base_allow_delayed_server = false;
int opal_pmix_verbose_output = -1;
bool opal_pmix_base_async_modex = false;
opal_pmix_base_t opal_pmix_base = {0};

static int opal_pmix_base_frame_register(mca_base_register_flag_t flags)
{
Expand Down
7 changes: 4 additions & 3 deletions opal/mca/pmix/pmix120/pmix_pmix120.c
Original file line number Diff line number Diff line change
Expand Up @@ -212,9 +212,9 @@ static void reg_thread(int sd, short args, void *cbdata)
opal_pmix120_etracker_t *trk;

opal_output_verbose(2, opal_pmix_base_framework.framework_output,
"%s register complete with status %d",
"%s register complete with status %d ref %d",
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
cd->status);
cd->status, cd->errhandler_ref);

/* convert the status */
rc = pmix120_convert_rc(cd->status);
Expand All @@ -240,9 +240,10 @@ static void reg_cbfunc(pmix_status_t status,
void *cbdata)
{
pmix120_opcaddy_t *cd = (pmix120_opcaddy_t*)cbdata;

cd->status = status;
cd->errhandler_ref = errhandler_ref;
opal_event_set(opal_sync_event_base, &cd->ev,
opal_event_set(opal_pmix_base.evbase, &cd->ev,
-1, OPAL_EV_WRITE, reg_thread, cd);
opal_event_set_priority(&cd->ev, OPAL_EV_MSG_HI_PRI);
opal_event_active(&cd->ev, OPAL_EV_WRITE, 1);
Expand Down
33 changes: 32 additions & 1 deletion orte/mca/errmgr/default_app/errmgr_default_app.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include "opal/util/output.h"
#include "opal/dss/dss.h"
#include "opal/errhandler/opal_errhandler.h"
#include "opal/mca/pmix/pmix.h"

#include "orte/util/error_strings.h"
#include "orte/util/name_fns.h"
Expand Down Expand Up @@ -71,6 +72,33 @@
static void proc_errors(int fd, short args, void *cbdata);
/*
 * Errhandler callback: report (at verbosity 1) that it was invoked and
 * flag this process as having suffered a communication failure.
 *
 * None of the arguments are examined.
 */
static void pmix_error(int error, opal_proc_t *proc, void *cbdata)
{
    (void)error;
    (void)proc;
    (void)cbdata;

    OPAL_OUTPUT_VERBOSE((1,
                         orte_errmgr_base_framework.framework_output,
                         "%s errmgr:default_app: errhandler called",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* hand the failure to the state machine via our event base */
    ORTE_ACTIVATE_PROC_STATE(ORTE_PROC_MY_NAME,
                             ORTE_PROC_STATE_COMM_FAILED);
}

/* Reference to our PMIx errhandler; remains -1 until the registration
 * callback below delivers one. */
static int myerrhandle = -1;

/*
 * Registration-complete callback: remember the errhandler reference.
 *
 * NOTE(review): status is not inspected, so the reference is stored
 * whatever the registration outcome was - confirm this is intended.
 */
static void register_cbfunc(int status, int errhndler, void *cbdata)
{
    (void)status;
    (void)cbdata;

    myerrhandle = errhndler;
}

/*
 * PMIx notification callback.
 *
 * Invokes the release callback first (when one is supplied), then logs
 * at verbosity 1 and flags this process as having suffered a
 * communication failure. status, procs and info are not examined.
 */
static void notify_cbfunc(int status, opal_list_t *procs, opal_list_t *info,
                          opal_pmix_release_cbfunc_t cbfunc, void *cbdata)
{
    (void)status;
    (void)procs;
    (void)info;

    /* nothing from the notification is kept, so release it up front */
    if (cbfunc) {
        cbfunc(cbdata);
    }

    OPAL_OUTPUT_VERBOSE((1,
                         orte_errmgr_base_framework.framework_output,
                         "%s errmgr:default_app: pmix errhandler called",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* hand the failure to the state machine via our event base */
    ORTE_ACTIVATE_PROC_STATE(ORTE_PROC_MY_NAME,
                             ORTE_PROC_STATE_COMM_FAILED);
}
Expand All @@ -86,13 +114,16 @@ static void pmix_error(int error, opal_proc_t *proc, void *cbdata)
/* register an errhandler */
opal_register_errhandler(pmix_error, NULL);

/* tie the default PMIx errhandler back to us */
opal_pmix.register_errhandler(NULL, notify_cbfunc, register_cbfunc, NULL);

return ORTE_SUCCESS;
}

/*
 * Tear down the error-handling hooks of this component.
 *
 * Calls opal_deregister_errhandler(), then deregisters the PMIx
 * errhandler whose reference is held in myerrhandle - but only if a
 * reference was actually recorded.
 *
 * Returns ORTE_SUCCESS in all cases.
 */
static int finalize(void)
{
    opal_deregister_errhandler();

    /* myerrhandle keeps its -1 sentinel until the registration callback
     * has stored a reference; in that case there is nothing to
     * deregister, so do not hand the sentinel to the PMIx component */
    if (-1 != myerrhandle) {
        opal_pmix.deregister_errhandler(myerrhandle, NULL, NULL);
        /* drop the reference so a repeated finalize cannot
         * deregister the same handler twice */
        myerrhandle = -1;
    }

    return ORTE_SUCCESS;
}

Expand Down
21 changes: 1 addition & 20 deletions orte/mca/ess/base/ess_base_std_app.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
* Copyright (c) 2010-2012 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
Expand Down Expand Up @@ -46,7 +46,6 @@
#include "opal/util/proc.h"
#include "opal/runtime/opal.h"
#include "opal/runtime/opal_cr.h"
#include "opal/runtime/opal_progress_threads.h"

#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/dfs/base/base.h"
Expand All @@ -70,8 +69,6 @@

#include "orte/mca/ess/base/base.h"

static bool progress_thread_running = false;

int orte_ess_base_app_setup(bool db_restrict_local)
{
int ret;
Expand Down Expand Up @@ -109,10 +106,6 @@ int orte_ess_base_app_setup(bool db_restrict_local)
opal_proc_local_set(&orte_process_info.super);
}

/* get an async event base - we use the opal_async one so
* we don't startup extra threads if not needed */
orte_event_base = opal_progress_thread_init(NULL);
progress_thread_running = true;
/* open and setup the state machine */
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
Expand Down Expand Up @@ -235,12 +228,6 @@ int orte_ess_base_app_setup(bool db_restrict_local)
}
return ORTE_SUCCESS;
error:
if (!progress_thread_running) {
/* can't send the help message, so ensure it
* comes out locally
*/
orte_show_help_finalize();
}
orte_show_help("help-orte-runtime.txt",
"orte_init:startup:internal-failure",
true, error, ORTE_ERROR_NAME(ret), ret);
Expand All @@ -265,12 +252,6 @@ int orte_ess_base_app_finalize(void)

orte_session_dir_finalize(ORTE_PROC_MY_NAME);

/* release the event base */
if (progress_thread_running) {
opal_progress_thread_finalize(NULL);
progress_thread_running = false;
}

return ORTE_SUCCESS;
}

Expand Down
2 changes: 2 additions & 0 deletions orte/mca/ess/base/ess_base_std_orted.c
Original file line number Diff line number Diff line change
Expand Up @@ -522,6 +522,8 @@ int orte_ess_base_orted_setup(char **hosts)
error = "opal_pmix_base_select";
goto error;
}
/* set the event base */
opal_pmix_base_set_evbase(orte_event_base);
/* setup the PMIx server */
if (ORTE_SUCCESS != (ret = pmix_server_init())) {
ORTE_ERROR_LOG(ret);
Expand Down
2 changes: 2 additions & 0 deletions orte/mca/ess/hnp/ess_hnp_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -630,6 +630,8 @@ static int rte_init(void)
error = "opal_pmix_base_select";
goto error;
}
/* set the event base */
opal_pmix_base_set_evbase(orte_event_base);

/* setup the routed info - the selected routed component
* will know what to do.
Expand Down
32 changes: 26 additions & 6 deletions orte/mca/ess/pmi/ess_pmi_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
* Copyright (c) 2008-2012 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand All @@ -39,6 +39,7 @@
#include "opal/util/opal_environ.h"
#include "opal/util/output.h"
#include "opal/util/argv.h"
#include "opal/runtime/opal_progress_threads.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/util/printf.h"
Expand Down Expand Up @@ -73,6 +74,7 @@ orte_ess_base_module_t orte_ess_pmi_module = {
static bool added_transport_keys=false;
static bool added_num_procs = false;
static bool added_app_ctx = false;
static bool progress_thread_running = false;

/**** MODULE FUNCTIONS ****/

Expand All @@ -97,6 +99,11 @@ static int rte_init(void)
goto error;
}

/* get an async event base - we use the opal_async one so
* we don't start up extra threads if not needed */
orte_event_base = opal_progress_thread_init(NULL);
progress_thread_running = true;

/* open and setup pmix */
if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_pmix_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
Expand All @@ -109,6 +116,8 @@ static int rte_init(void)
error = "pmix init";
goto error;
}
/* set the event base */
opal_pmix_base_set_evbase(orte_event_base);
/* initialize the selected module */
if (!opal_pmix.initialized() && (OPAL_SUCCESS != (ret = opal_pmix.init()))) {
/* we cannot run */
Expand Down Expand Up @@ -394,6 +403,12 @@ static int rte_init(void)
return ORTE_SUCCESS;

error:
if (!progress_thread_running) {
/* can't send the help message, so ensure it
* comes out locally
*/
orte_show_help_finalize();
}
if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) {
orte_show_help("help-orte-runtime.txt",
"orte_init:startup:internal-failure",
Expand All @@ -419,18 +434,23 @@ static int rte_finalize(void)
unsetenv("OMPI_APP_CTX_NUM_PROCS");
}

/* use the default app procedure to finish */
if (ORTE_SUCCESS != (ret = orte_ess_base_app_finalize())) {
ORTE_ERROR_LOG(ret);
return ret;
}

/* mark us as finalized */
if (NULL != opal_pmix.finalize) {
opal_pmix.finalize();
(void) mca_base_framework_close(&opal_pmix_base_framework);
}

/* use the default app procedure to finish */
if (ORTE_SUCCESS != (ret = orte_ess_base_app_finalize())) {
ORTE_ERROR_LOG(ret);
return ret;
/* release the event base */
if (progress_thread_running) {
opal_progress_thread_finalize(NULL);
progress_thread_running = false;
}

return ORTE_SUCCESS;
}

Expand Down
18 changes: 10 additions & 8 deletions orte/mca/odls/default/odls_default_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -415,14 +415,16 @@ static int do_child(orte_app_context_t* context,
always outputs a nice, single message indicating what
happened
*/
if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&opts,
&environ_copy))) {
ORTE_ERROR_LOG(i);
send_error_show_help(write_fd, 1,
"help-orte-odls-default.txt",
"iof setup failed",
orte_process_info.nodename, context->app);
/* Does not return */
if (ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&opts,
&environ_copy))) {
ORTE_ERROR_LOG(i);
send_error_show_help(write_fd, 1,
"help-orte-odls-default.txt",
"iof setup failed",
orte_process_info.nodename, context->app);
/* Does not return */
}
}

/* now set any child-level controls such as binding */
Expand Down
15 changes: 3 additions & 12 deletions orte/mca/plm/base/plm_base_launch_support.c
Original file line number Diff line number Diff line change
Expand Up @@ -1282,18 +1282,9 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
opal_argv_append(argc, argv, "1");
}

/* the following two are not mca params */
if ((int)ORTE_VPID_INVALID != orted_debug_failure) {
opal_argv_append(argc, argv, "--debug-failure");
asprintf(&param, "%d", orted_debug_failure);
opal_argv_append(argc, argv, param);
free(param);
}
if (0 < orted_debug_failure_delay) {
opal_argv_append(argc, argv, "--debug-failure-delay");
asprintf(&param, "%d", orted_debug_failure_delay);
opal_argv_append(argc, argv, param);
free(param);
/* the following is not an mca param */
if (NULL != getenv("ORTE_TEST_ORTED_SUICIDE")) {
opal_argv_append(argc, argv, "--test-suicide");
}

/* tell the orted what ESS component to use */
Expand Down
Loading