Skip to content

Commit d38e2e6

Browse files
author
rhc54
committed
Merge pull request #1423 from rhc54/topic/suicide
Fix registration of error handlers through the pmix120 component.
2 parents 5a85a03 + 4a55fba commit d38e2e6

File tree

13 files changed

+124
-70
lines changed

13 files changed

+124
-70
lines changed

opal/mca/pmix/base/base.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
2+
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
33
* $COPYRIGHT$
44
*
55
* Additional copyrights may follow
@@ -46,6 +46,15 @@ OPAL_DECLSPEC void opal_pmix_base_errhandler(int status,
4646
OPAL_DECLSPEC int opal_pmix_base_exchange(opal_value_t *info,
4747
opal_pmix_pdata_t *pdat,
4848
int timeout);
49+
50+
OPAL_DECLSPEC void opal_pmix_base_set_evbase(opal_event_base_t *evbase);
51+
52+
typedef struct {
53+
opal_event_base_t *evbase;
54+
} opal_pmix_base_t;
55+
56+
extern opal_pmix_base_t opal_pmix_base;
57+
4958
END_C_DECLS
5059

5160
#endif

opal/mca/pmix/base/pmix_base_fns.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,11 @@
3838

3939
#define OPAL_PMI_PAD 10
4040

41+
void opal_pmix_base_set_evbase(opal_event_base_t *evbase)
42+
{
43+
opal_pmix_base.evbase = evbase;
44+
}
45+
4146
/******** ERRHANDLER SUPPORT FOR COMPONENTS THAT
4247
******** DO NOT NATIVELY SUPPORT IT
4348
********/

opal/mca/pmix/base/pmix_base_frame.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,9 @@
3333
https://github.com/open-mpi/ompi/issues/375 for details. */
3434
opal_pmix_base_module_t opal_pmix = { 0 };
3535
bool opal_pmix_collect_all_data = true;
36-
bool opal_pmix_base_allow_delayed_server = false;
3736
int opal_pmix_verbose_output = -1;
3837
bool opal_pmix_base_async_modex = false;
38+
opal_pmix_base_t opal_pmix_base = {0};
3939

4040
static int opal_pmix_base_frame_register(mca_base_register_flag_t flags)
4141
{

opal/mca/pmix/pmix120/pmix_pmix120.c

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -212,9 +212,9 @@ static void reg_thread(int sd, short args, void *cbdata)
212212
opal_pmix120_etracker_t *trk;
213213

214214
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
215-
"%s register complete with status %d",
215+
"%s register complete with status %d ref %d",
216216
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
217-
cd->status);
217+
cd->status, cd->errhandler_ref);
218218

219219
/* convert the status */
220220
rc = pmix120_convert_rc(cd->status);
@@ -240,9 +240,10 @@ static void reg_cbfunc(pmix_status_t status,
240240
void *cbdata)
241241
{
242242
pmix120_opcaddy_t *cd = (pmix120_opcaddy_t*)cbdata;
243+
243244
cd->status = status;
244245
cd->errhandler_ref = errhandler_ref;
245-
opal_event_set(opal_sync_event_base, &cd->ev,
246+
opal_event_set(opal_pmix_base.evbase, &cd->ev,
246247
-1, OPAL_EV_WRITE, reg_thread, cd);
247248
opal_event_set_priority(&cd->ev, OPAL_EV_MSG_HI_PRI);
248249
opal_event_active(&cd->ev, OPAL_EV_WRITE, 1);

orte/mca/errmgr/default_app/errmgr_default_app.c

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include "opal/util/output.h"
2929
#include "opal/dss/dss.h"
3030
#include "opal/errhandler/opal_errhandler.h"
31+
#include "opal/mca/pmix/pmix.h"
3132

3233
#include "orte/util/error_strings.h"
3334
#include "orte/util/name_fns.h"
@@ -71,6 +72,33 @@
7172
static void proc_errors(int fd, short args, void *cbdata);
7273
static void pmix_error(int error, opal_proc_t *proc, void *cbdata)
7374
{
75+
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
76+
"%s errmgr:default_app: errhandler called",
77+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
78+
79+
/* push it into our event base */
80+
ORTE_ACTIVATE_PROC_STATE(ORTE_PROC_MY_NAME, ORTE_PROC_STATE_COMM_FAILED);
81+
}
82+
83+
static int myerrhandle = -1;
84+
85+
static void register_cbfunc(int status, int errhndler, void *cbdata)
86+
{
87+
myerrhandle = errhndler;
88+
}
89+
90+
static void notify_cbfunc(int status,
91+
opal_list_t *procs,
92+
opal_list_t *info,
93+
opal_pmix_release_cbfunc_t cbfunc,
94+
void *cbdata)
95+
{
96+
if (NULL != cbfunc) {
97+
cbfunc(cbdata);
98+
}
99+
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
100+
"%s errmgr:default_app: pmix errhandler called",
101+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
74102
/* push it into our event base */
75103
ORTE_ACTIVATE_PROC_STATE(ORTE_PROC_MY_NAME, ORTE_PROC_STATE_COMM_FAILED);
76104
}
@@ -86,13 +114,16 @@ static void pmix_error(int error, opal_proc_t *proc, void *cbdata)
86114
/* register an errhandler */
87115
opal_register_errhandler(pmix_error, NULL);
88116

117+
/* tie the default PMIx errhandler back to us */
118+
opal_pmix.register_errhandler(NULL, notify_cbfunc, register_cbfunc, NULL);
119+
89120
return ORTE_SUCCESS;
90121
}
91122

92123
static int finalize(void)
93124
{
94125
opal_deregister_errhandler();
95-
126+
opal_pmix.deregister_errhandler(myerrhandle, NULL, NULL);
96127
return ORTE_SUCCESS;
97128
}
98129

orte/mca/ess/base/ess_base_std_app.c

Lines changed: 1 addition & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
* Copyright (c) 2010-2012 Oak Ridge National Labs. All rights reserved.
1313
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
1414
* reserved.
15-
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
15+
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
1616
* Copyright (c) 2014 Research Organization for Information Science
1717
* and Technology (RIST). All rights reserved.
1818
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
@@ -46,7 +46,6 @@
4646
#include "opal/util/proc.h"
4747
#include "opal/runtime/opal.h"
4848
#include "opal/runtime/opal_cr.h"
49-
#include "opal/runtime/opal_progress_threads.h"
5049

5150
#include "orte/mca/errmgr/errmgr.h"
5251
#include "orte/mca/dfs/base/base.h"
@@ -70,8 +69,6 @@
7069

7170
#include "orte/mca/ess/base/base.h"
7271

73-
static bool progress_thread_running = false;
74-
7572
int orte_ess_base_app_setup(bool db_restrict_local)
7673
{
7774
int ret;
@@ -109,10 +106,6 @@ int orte_ess_base_app_setup(bool db_restrict_local)
109106
opal_proc_local_set(&orte_process_info.super);
110107
}
111108

112-
/* get an async event base - we use the opal_async one so
113-
* we don't startup extra threads if not needed */
114-
orte_event_base = opal_progress_thread_init(NULL);
115-
progress_thread_running = true;
116109
/* open and setup the state machine */
117110
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
118111
ORTE_ERROR_LOG(ret);
@@ -235,12 +228,6 @@ int orte_ess_base_app_setup(bool db_restrict_local)
235228
}
236229
return ORTE_SUCCESS;
237230
error:
238-
if (!progress_thread_running) {
239-
/* can't send the help message, so ensure it
240-
* comes out locally
241-
*/
242-
orte_show_help_finalize();
243-
}
244231
orte_show_help("help-orte-runtime.txt",
245232
"orte_init:startup:internal-failure",
246233
true, error, ORTE_ERROR_NAME(ret), ret);
@@ -265,12 +252,6 @@ int orte_ess_base_app_finalize(void)
265252

266253
orte_session_dir_finalize(ORTE_PROC_MY_NAME);
267254

268-
/* release the event base */
269-
if (progress_thread_running) {
270-
opal_progress_thread_finalize(NULL);
271-
progress_thread_running = false;
272-
}
273-
274255
return ORTE_SUCCESS;
275256
}
276257

orte/mca/ess/base/ess_base_std_orted.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -522,6 +522,8 @@ int orte_ess_base_orted_setup(char **hosts)
522522
error = "opal_pmix_base_select";
523523
goto error;
524524
}
525+
/* set the event base */
526+
opal_pmix_base_set_evbase(orte_event_base);
525527
/* setup the PMIx server */
526528
if (ORTE_SUCCESS != (ret = pmix_server_init())) {
527529
ORTE_ERROR_LOG(ret);

orte/mca/ess/hnp/ess_hnp_module.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -630,6 +630,8 @@ static int rte_init(void)
630630
error = "opal_pmix_base_select";
631631
goto error;
632632
}
633+
/* set the event base */
634+
opal_pmix_base_set_evbase(orte_event_base);
633635

634636
/* setup the routed info - the selected routed component
635637
* will know what to do.

orte/mca/ess/pmi/ess_pmi_module.c

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
* Copyright (c) 2008-2012 Cisco Systems, Inc. All rights reserved.
1313
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
1414
* All rights reserved.
15-
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
15+
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
1616
* $COPYRIGHT$
1717
*
1818
* Additional copyrights may follow
@@ -39,6 +39,7 @@
3939
#include "opal/util/opal_environ.h"
4040
#include "opal/util/output.h"
4141
#include "opal/util/argv.h"
42+
#include "opal/runtime/opal_progress_threads.h"
4243
#include "opal/class/opal_pointer_array.h"
4344
#include "opal/mca/hwloc/base/base.h"
4445
#include "opal/util/printf.h"
@@ -73,6 +74,7 @@ orte_ess_base_module_t orte_ess_pmi_module = {
7374
static bool added_transport_keys=false;
7475
static bool added_num_procs = false;
7576
static bool added_app_ctx = false;
77+
static bool progress_thread_running = false;
7678

7779
/**** MODULE FUNCTIONS ****/
7880

@@ -97,6 +99,11 @@ static int rte_init(void)
9799
goto error;
98100
}
99101

102+
/* get an async event base - we use the opal_async one so
103+
* we don't startup extra threads if not needed */
104+
orte_event_base = opal_progress_thread_init(NULL);
105+
progress_thread_running = true;
106+
100107
/* open and setup pmix */
101108
if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_pmix_base_framework, 0))) {
102109
ORTE_ERROR_LOG(ret);
@@ -109,6 +116,8 @@ static int rte_init(void)
109116
error = "pmix init";
110117
goto error;
111118
}
119+
/* set the event base */
120+
opal_pmix_base_set_evbase(orte_event_base);
112121
/* initialize the selected module */
113122
if (!opal_pmix.initialized() && (OPAL_SUCCESS != (ret = opal_pmix.init()))) {
114123
/* we cannot run */
@@ -394,6 +403,12 @@ static int rte_init(void)
394403
return ORTE_SUCCESS;
395404

396405
error:
406+
if (!progress_thread_running) {
407+
/* can't send the help message, so ensure it
408+
* comes out locally
409+
*/
410+
orte_show_help_finalize();
411+
}
397412
if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) {
398413
orte_show_help("help-orte-runtime.txt",
399414
"orte_init:startup:internal-failure",
@@ -419,18 +434,23 @@ static int rte_finalize(void)
419434
unsetenv("OMPI_APP_CTX_NUM_PROCS");
420435
}
421436

437+
/* use the default app procedure to finish */
438+
if (ORTE_SUCCESS != (ret = orte_ess_base_app_finalize())) {
439+
ORTE_ERROR_LOG(ret);
440+
return ret;
441+
}
442+
422443
/* mark us as finalized */
423444
if (NULL != opal_pmix.finalize) {
424445
opal_pmix.finalize();
425446
(void) mca_base_framework_close(&opal_pmix_base_framework);
426447
}
427448

428-
/* use the default app procedure to finish */
429-
if (ORTE_SUCCESS != (ret = orte_ess_base_app_finalize())) {
430-
ORTE_ERROR_LOG(ret);
431-
return ret;
449+
/* release the event base */
450+
if (progress_thread_running) {
451+
opal_progress_thread_finalize(NULL);
452+
progress_thread_running = false;
432453
}
433-
434454
return ORTE_SUCCESS;
435455
}
436456

orte/mca/odls/default/odls_default_module.c

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -415,14 +415,16 @@ static int do_child(orte_app_context_t* context,
415415
always outputs a nice, single message indicating what
416416
happened
417417
*/
418-
if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&opts,
419-
&environ_copy))) {
420-
ORTE_ERROR_LOG(i);
421-
send_error_show_help(write_fd, 1,
422-
"help-orte-odls-default.txt",
423-
"iof setup failed",
424-
orte_process_info.nodename, context->app);
425-
/* Does not return */
418+
if (ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
419+
if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&opts,
420+
&environ_copy))) {
421+
ORTE_ERROR_LOG(i);
422+
send_error_show_help(write_fd, 1,
423+
"help-orte-odls-default.txt",
424+
"iof setup failed",
425+
orte_process_info.nodename, context->app);
426+
/* Does not return */
427+
}
426428
}
427429

428430
/* now set any child-level controls such as binding */

0 commit comments

Comments
 (0)