From 703b8c356f8513024568e5867599f6ce86ebfab1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Bouteiller?= Date: Tue, 9 Jun 2020 08:22:02 -0400 Subject: [PATCH 1/6] Make error_class and error_string callable before/after MPI_INIT/FINALIZE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Aurélien Bouteiller make lazy initialization opal unlikely Signed-off-by: Aurelien Bouteiller --- ompi/errhandler/errcode.c | 13 ++++ ompi/errhandler/errcode.h | 152 +++++++++++++++++++++----------------- ompi/mpi/c/error_class.c | 19 ++++- ompi/mpi/c/error_string.c | 19 ++++- 4 files changed, 129 insertions(+), 74 deletions(-) diff --git a/ompi/errhandler/errcode.c b/ompi/errhandler/errcode.c index 91c430d91f4..18101d46f30 100644 --- a/ompi/errhandler/errcode.c +++ b/ompi/errhandler/errcode.c @@ -131,8 +131,17 @@ do { \ opal_pointer_array_set_item(&ompi_mpi_errcodes, (ERRCODE), &(VAR)); \ } while (0) +static opal_mutex_t errcode_init_lock = OPAL_MUTEX_STATIC_INIT; + int ompi_mpi_errcode_init (void) { + opal_mutex_lock(&errcode_init_lock); + if ( 0 != ompi_mpi_errcode_lastpredefined ) { + /* Already initialized (presumably by an API call before MPI_init */ + opal_mutex_unlock(&errcode_init_lock); + return OMPI_SUCCESS; + } + /* Initialize the pointer array, which will hold the references to the error objects */ OBJ_CONSTRUCT(&ompi_mpi_errcodes, opal_pointer_array_t); @@ -223,6 +232,7 @@ int ompi_mpi_errcode_init (void) MPI_ERR_LASTCODE. So just start it as == MPI_ERR_LASTCODE. 
*/ ompi_mpi_errcode_lastused = MPI_ERR_LASTCODE; ompi_mpi_errcode_lastpredefined = MPI_ERR_LASTCODE; + opal_mutex_unlock(&errcode_init_lock); return OMPI_SUCCESS; } @@ -231,6 +241,7 @@ int ompi_mpi_errcode_finalize(void) int i; ompi_mpi_errcode_t *errc; + opal_mutex_lock(&errcode_init_lock); for (i=ompi_mpi_errcode_lastpredefined+1; i<=ompi_mpi_errcode_lastused; i++) { /* * there are some user defined error-codes, which @@ -317,6 +328,8 @@ int ompi_mpi_errcode_finalize(void) OBJ_DESTRUCT(&ompi_t_err_invalid_name); OBJ_DESTRUCT(&ompi_mpi_errcodes); + ompi_mpi_errcode_lastpredefined = 0; + opal_mutex_unlock(&errcode_init_lock); return OMPI_SUCCESS; } diff --git a/ompi/errhandler/errcode.h b/ompi/errhandler/errcode.h index 656ddc5576d..033abd24167 100644 --- a/ompi/errhandler/errcode.h +++ b/ompi/errhandler/errcode.h @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2007 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -55,11 +55,68 @@ OMPI_DECLSPEC extern int ompi_mpi_errcode_lastpredefined; OMPI_DECLSPEC extern ompi_mpi_errcode_t ompi_err_unknown; +/** + * Initialize the error codes + * + * @returns OMPI_SUCCESS Upon success + * @returns OMPI_ERROR Otherwise + * + * Invoked from ompi_mpi_init(); sets up all static MPI error codes, + */ +int ompi_mpi_errcode_init(void); + +/** + * Finalize the error codes. + * + * @returns OMPI_SUCCESS Always + * + * Invokes from ompi_mpi_finalize(); tears down the error code array. 
+ */ +int ompi_mpi_errcode_finalize(void); + +/** + * Add an error code + * + * @param: error class to which this new error code belongs to + * + * @returns the new error code on SUCCESS (>0) + * @returns OMPI_ERROR otherwise + * + */ +int ompi_mpi_errcode_add (int errclass); + +/** + * Add an error class + * + * @param: none + * + * @returns the new error class on SUCCESS (>0) + * @returns OMPI_ERROR otherwise + * + */ +int ompi_mpi_errclass_add (void); + +/** + * Add an error string to an error code + * + * @param: error code for which the string is defined + * @param: error string to add + * @param: length of the string + * + * @returns OMPI_SUCCESS on success + * @returns OMPI_ERROR on error + */ +int ompi_mpi_errnum_add_string (int errnum, const char* string, int len); + /** * Check for a valid error code */ static inline bool ompi_mpi_errcode_is_invalid(int errcode) { + if (OPAL_UNLIKELY( 0 == ompi_mpi_errcode_lastpredefined )) { + ompi_mpi_errcode_init(); + } + if ( errcode >= 0 && errcode <= ompi_mpi_errcode_lastused ) return 0; else @@ -73,23 +130,31 @@ static inline int ompi_mpi_errcode_get_class (int errcode) { ompi_mpi_errcode_t *err = NULL; + if (OPAL_UNLIKELY( 0 == ompi_mpi_errcode_lastpredefined )) { + ompi_mpi_errcode_init(); + } + if (errcode >= 0) { err = (ompi_mpi_errcode_t *)opal_pointer_array_get_item(&ompi_mpi_errcodes, errcode); /* If we get a bogus errcode, return MPI_ERR_UNKNOWN */ } if (NULL != err) { - if ( err->code != MPI_UNDEFINED ) { - return err->cls; - } + if ( err->code != MPI_UNDEFINED ) { + return err->cls; + } } return ompi_err_unknown.cls; } static inline int ompi_mpi_errcode_is_predefined ( int errcode ) { + if (OPAL_UNLIKELY( 0 == ompi_mpi_errcode_lastpredefined )) { + ompi_mpi_errcode_init(); + } + if ( errcode >= 0 && errcode <= ompi_mpi_errcode_lastpredefined ) - return true; + return true; return false; } @@ -98,23 +163,27 @@ static inline int ompi_mpi_errnum_is_class ( int errnum ) { ompi_mpi_errcode_t *err; + if 
(OPAL_UNLIKELY( 0 == ompi_mpi_errcode_lastpredefined )) { + ompi_mpi_errcode_init(); + } + if (errnum < 0) { return false; } if ( errnum <= ompi_mpi_errcode_lastpredefined ) { - /* Predefined error values represent an error code and - an error class at the same time */ - return true; + /* Predefined error values represent an error code and + an error class at the same time */ + return true; } err = (ompi_mpi_errcode_t *)opal_pointer_array_get_item(&ompi_mpi_errcodes, errnum); if (NULL != err) { - if ( MPI_UNDEFINED == err->code) { - /* Distinction between error class and error code is that for the - first one the code section is set to MPI_UNDEFINED */ - return true; - } + if ( MPI_UNDEFINED == err->code) { + /* Distinction between error class and error code is that for the + first one the code section is set to MPI_UNDEFINED */ + return true; + } } return false; @@ -128,6 +197,10 @@ static inline char* ompi_mpi_errnum_get_string (int errnum) { ompi_mpi_errcode_t *err = NULL; + if (OPAL_UNLIKELY( 0 == ompi_mpi_errcode_lastpredefined )) { + ompi_mpi_errcode_init(); + } + if (errnum >= 0) { err = (ompi_mpi_errcode_t *)opal_pointer_array_get_item(&ompi_mpi_errcodes, errnum); /* If we get a bogus errcode, return a string indicating that this @@ -142,59 +215,6 @@ static inline char* ompi_mpi_errnum_get_string (int errnum) } -/** - * Initialize the error codes - * - * @returns OMPI_SUCCESS Upon success - * @returns OMPI_ERROR Otherwise - * - * Invoked from ompi_mpi_init(); sets up all static MPI error codes, - */ -int ompi_mpi_errcode_init(void); - -/** - * Finalize the error codes. - * - * @returns OMPI_SUCCESS Always - * - * Invokes from ompi_mpi_finalize(); tears down the error code array. 
- */ -int ompi_mpi_errcode_finalize(void); - -/** - * Add an error code - * - * @param: error class to which this new error code belongs to - * - * @returns the new error code on SUCCESS (>0) - * @returns OMPI_ERROR otherwise - * - */ -int ompi_mpi_errcode_add (int errclass); - -/** - * Add an error class - * - * @param: none - * - * @returns the new error class on SUCCESS (>0) - * @returns OMPI_ERROR otherwise - * - */ -int ompi_mpi_errclass_add (void); - -/** - * Add an error string to an error code - * - * @param: error code for which the string is defined - * @param: error string to add - * @param: length of the string - * - * @returns OMPI_SUCCESS on success - * @returns OMPI_ERROR on error - */ -int ompi_mpi_errnum_add_string (int errnum, const char* string, int len); - END_C_DECLS #endif /* OMPI_MPI_ERRCODE_H */ diff --git a/ompi/mpi/c/error_class.c b/ompi/mpi/c/error_class.c index 74d151300fe..51411227c99 100644 --- a/ompi/mpi/c/error_class.c +++ b/ompi/mpi/c/error_class.c @@ -42,11 +42,22 @@ int MPI_Error_class(int errorcode, int *errorclass) OPAL_CR_NOOP_PROGRESS(); if ( MPI_PARAM_CHECK ) { - OMPI_ERR_INIT_FINALIZE(FUNC_NAME); - if ( ompi_mpi_errcode_is_invalid(errorcode)) { - return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_ARG, - FUNC_NAME); + /* If we have an error, the action that we take depends on + whether we're currently initialized (after MPI_Init and before + MPI_Finalize) or not */ + int32_t state = ompi_mpi_state; + if (state >= OMPI_MPI_STATE_INIT_COMPLETED && + state < OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) { + return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_ARG, + FUNC_NAME); + } else { + /* We have no MPI object here so call ompi_errhandler_invoke + * directly */ + return ompi_errhandler_invoke(NULL, NULL, -1, + ompi_errcode_get_mpi_code(MPI_ERR_ARG), + FUNC_NAME); + } } } diff --git a/ompi/mpi/c/error_string.c b/ompi/mpi/c/error_string.c index e2589e4adba..9499db7f18b 100644 --- a/ompi/mpi/c/error_string.c +++
b/ompi/mpi/c/error_string.c @@ -47,11 +47,22 @@ int MPI_Error_string(int errorcode, char *string, int *resultlen) OPAL_CR_NOOP_PROGRESS(); if ( MPI_PARAM_CHECK ) { - OMPI_ERR_INIT_FINALIZE(FUNC_NAME); - if ( ompi_mpi_errcode_is_invalid(errorcode)) { - return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_ARG, - FUNC_NAME); + /* If we have an error, the action that we take depends on + whether we're currently initialized (after MPI_Init and before + MPI_Finalize) or not */ + int32_t state = ompi_mpi_state; + if (state >= OMPI_MPI_STATE_INIT_COMPLETED && + state < OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) { + return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_ARG, + FUNC_NAME); + } else { + /* We have no MPI object here so call ompi_errhandler_invoke + * directly */ + return ompi_errhandler_invoke(NULL, NULL, -1, + ompi_errcode_get_mpi_code(MPI_ERR_ARG), + FUNC_NAME); + } } } From 3cd85a9ec5b68ca5d3000ed6bf94ace4a90c36b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Bouteiller?= Date: Tue, 9 Jun 2020 08:25:28 -0400 Subject: [PATCH 2/6] Add the initial_errhandler info key to MPI_INFO_ENV and populate the value from prun populated parameters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Aurélien Bouteiller Allow errhandlers to invoke the initial error handler before MPI_INIT Signed-off-by: Aurelien Bouteiller Indentation Signed-off-by: Aurelien Bouteiller --- ompi/errhandler/errhandler.c | 58 ++++++++++++++++++++++++++++++++++-- ompi/errhandler/errhandler.h | 20 +++++++++++++ ompi/info/info.c | 6 ++++ ompi/runtime/ompi_mpi_init.c | 12 ++++---- ompi/runtime/ompi_rte.c | 5 ++++ opal/util/proc.c | 3 +- opal/util/proc.h | 1 + 7 files changed, 95 insertions(+), 10 deletions(-) diff --git a/ompi/errhandler/errhandler.c b/ompi/errhandler/errhandler.c index 50d4c60fd50..6af34c5a42a 100644 --- a/ompi/errhandler/errhandler.c +++ b/ompi/errhandler/errhandler.c @@ -78,6 +78,55 @@ ompi_predefined_errhandler_t
ompi_mpi_errors_throw_exceptions = {{{0}}}; ompi_predefined_errhandler_t *ompi_mpi_errors_throw_exceptions_addr = &ompi_mpi_errors_throw_exceptions; +static opal_mutex_t errhandler_init_lock = OPAL_MUTEX_STATIC_INIT; +ompi_errhandler_t* ompi_initial_error_handler_eh = NULL; +void (*ompi_initial_error_handler)(struct ompi_communicator_t **comm, int *error_code, ...) = NULL; + +/* + * Initialize the initial errhandler infrastructure only. + * This does not allocate any memory and does not require a corresponding fini. + */ +int ompi_initial_errhandler_init(void) { + opal_mutex_lock(&errhandler_init_lock); + if ( NULL != ompi_initial_error_handler ) { + /* Already initialized (presumably by an API call before MPI_init) */ + opal_mutex_unlock(&errhandler_init_lock); + return OMPI_SUCCESS; + } + + /* If it has been requested from the launch keys, set the initial + * error handler that will be attached by default with predefined + * communicators. We use an env because that can be obtained before + * OPAL and PMIx initialization. 
+ */ + char *env = getenv("OMPI_MCA_mpi_initial_errhandler"); + if( NULL != env ) { + if( 0 == strcasecmp(env, "mpi_errors_are_fatal") ) { + ompi_initial_error_handler = &ompi_mpi_errors_are_fatal_comm_handler; + ompi_initial_error_handler_eh = &ompi_mpi_errors_are_fatal.eh; + } + else if( 0 == strcasecmp(env, "mpi_errors_abort") ) { + ompi_initial_error_handler = &ompi_mpi_errors_abort_comm_handler; + ompi_initial_error_handler_eh = &ompi_mpi_errors_abort.eh; + } + else if( 0 == strcasecmp(env, "mpi_errors_return") ) { + ompi_initial_error_handler = &ompi_mpi_errors_return_comm_handler; + ompi_initial_error_handler_eh = &ompi_mpi_errors_return.eh; + } + else { + /* invalid entry detected, ignore it, set fatal by default */ + opal_output(0, "WARNING: invalid value for launch key 'mpi_initial_errhandler'; defaulting to 'mpi_errors_are_fatal'."); + ompi_initial_error_handler = &ompi_mpi_errors_are_fatal_comm_handler; + ompi_initial_error_handler_eh = &ompi_mpi_errors_are_fatal.eh; + } + } + else { + ompi_initial_error_handler = &ompi_mpi_errors_are_fatal_comm_handler; + ompi_initial_error_handler_eh = &ompi_mpi_errors_are_fatal.eh; + } + opal_mutex_unlock(&errhandler_init_lock); + return OMPI_SUCCESS; +} /* * Initialize OMPI errhandler infrastructure @@ -163,9 +212,12 @@ int ompi_errhandler_init(void) "MPI_ERRORS_THROW_EXCEPTIONS", sizeof(ompi_mpi_errors_throw_exceptions.eh.eh_name)); - /* All done */ - - return OMPI_SUCCESS; + /* Lets initialize the initial error handler if not already done */ + char *env = getenv("OMPI_MCA_mpi_initial_errhandler"); + if( NULL != env ) { + ompi_process_info.initial_errhandler = strndup(env, MPI_MAX_INFO_VAL); + } + return ompi_initial_errhandler_init(); } diff --git a/ompi/errhandler/errhandler.h b/ompi/errhandler/errhandler.h index 1df48c32a40..139740089f2 100644 --- a/ompi/errhandler/errhandler.h +++ b/ompi/errhandler/errhandler.h @@ -185,6 +185,26 @@ OMPI_DECLSPEC extern ompi_predefined_errhandler_t ompi_mpi_errors_throw_exceptio 
*/ OMPI_DECLSPEC extern opal_pointer_array_t ompi_errhandler_f_to_c_table; +/** + * This function selects the initial error handler. + * It may be called during MPI_INIT, or during the first MPI call + * that raises an error. This function does not allocate memory, + * and will only populate the ompi_initial_error_handler_eh and + * ompi_initial_error_handler pointers with predefined error handler + * and error handler functions aliases. + */ +OMPI_DECLSPEC int ompi_initial_errhandler_init(void); +/** + * The initial error handler pointer. Will be set to alias one of the + * predefined error handlers through launch keys during the first MPI call, + * and will then be attached to predefined communicators. + */ +OMPI_DECLSPEC extern ompi_errhandler_t* ompi_initial_error_handler_eh; +/** + * The initial error handler function pointer. Will be called when an error + * is raised before MPI_INIT or after MPI_FINALIZE. + */ +OMPI_DECLSPEC extern void (*ompi_initial_error_handler)(struct ompi_communicator_t **comm, int *error_code, ...); /** * Forward declaration so that we don't have to include diff --git a/ompi/info/info.c b/ompi/info/info.c index c5bc171f7a0..ba51bdc2d39 100644 --- a/ompi/info/info.c +++ b/ompi/info/info.c @@ -131,6 +131,12 @@ int ompi_mpiinfo_init(void) opal_info_set(&ompi_mpi_info_env.info.super, "soft", cptr); free(cptr); + /* the initial error handler, set it as requested (nothing if not + * requested) */ + if (NULL != ompi_process_info.initial_errhandler) { + opal_info_set(&ompi_mpi_info_env.info.super, "mpi_initial_errhandler", ompi_process_info.initial_errhandler); + } + /* local host name */ opal_info_set(&ompi_mpi_info_env.info.super, "host", ompi_process_info.nodename); diff --git a/ompi/runtime/ompi_mpi_init.c b/ompi/runtime/ompi_mpi_init.c index 17d7186400d..62f689df763 100644 --- a/ompi/runtime/ompi_mpi_init.c +++ b/ompi/runtime/ompi_mpi_init.c @@ -751,12 +751,6 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided, goto 
error; } - /* initialize info */ - if (OMPI_SUCCESS != (ret = ompi_mpiinfo_init())) { - error = "ompi_info_init() failed"; - goto error; - } - /* initialize error handlers */ if (OMPI_SUCCESS != (ret = ompi_errhandler_init())) { error = "ompi_errhandler_init() failed"; @@ -775,6 +769,12 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided, goto error; } + /* initialize info */ + if (OMPI_SUCCESS != (ret = ompi_mpiinfo_init())) { + error = "ompi_info_init() failed"; + goto error; + } + /* initialize groups */ if (OMPI_SUCCESS != (ret = ompi_group_init())) { error = "ompi_group_init() failed"; diff --git a/ompi/runtime/ompi_rte.c b/ompi/runtime/ompi_rte.c index bad581cf25a..b8f59326516 100644 --- a/ompi/runtime/ompi_rte.c +++ b/ompi/runtime/ompi_rte.c @@ -935,6 +935,11 @@ int ompi_rte_finalize(void) opal_process_info.initial_wdir = NULL; } + if (NULL != opal_process_info.initial_errhandler) { + free(opal_process_info.initial_errhandler); + opal_process_info.initial_errhandler = NULL; + } + /* cleanup our internal nspace hack */ opal_pmix_finalize_nspace_tracker(); diff --git a/opal/util/proc.c b/opal/util/proc.c index 05b2bbea7a4..26973fdd619 100644 --- a/opal/util/proc.c +++ b/opal/util/proc.c @@ -51,7 +51,8 @@ opal_process_info_t opal_process_info = { .num_apps = 0, .initial_wdir = NULL, .reincarnation = 0, - .proc_is_bound = false + .proc_is_bound = false, + .initial_errhandler = NULL, }; static opal_proc_t opal_local_proc = { diff --git a/opal/util/proc.h b/opal/util/proc.h index c7b5928794f..785a6f7ec95 100644 --- a/opal/util/proc.h +++ b/opal/util/proc.h @@ -126,6 +126,7 @@ typedef struct opal_process_info_t { char *initial_wdir; uint32_t reincarnation; bool proc_is_bound; + char *initial_errhandler; } opal_process_info_t; OPAL_DECLSPEC extern opal_process_info_t opal_process_info; From 83d0f9215235eed4266a76b606f3550c2569e2b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Bouteiller?= Date: Thu, 11 Jun 2020 12:02:39 -0400 Subject: 
[PATCH 3/6] Set the initial error handler onto predefined communicators MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Aurélien Bouteiller update to the predefined initial error handler selection Signed-off-by: Aurelien Bouteiller --- ompi/communicator/comm_init.c | 16 ++++++++++------ ompi/errhandler/errhandler.h | 4 +++- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/ompi/communicator/comm_init.c b/ompi/communicator/comm_init.c index 3c11f2186f5..524394cf529 100644 --- a/ompi/communicator/comm_init.c +++ b/ompi/communicator/comm_init.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -135,8 +135,8 @@ int ompi_comm_init(void) ompi_mpi_comm_world.comm.c_remote_group = group; OBJ_RETAIN(ompi_mpi_comm_world.comm.c_remote_group); ompi_mpi_comm_world.comm.c_cube_dim = opal_cube_dim((int)size); - ompi_mpi_comm_world.comm.error_handler = &ompi_mpi_errors_are_fatal.eh; - OBJ_RETAIN( &ompi_mpi_errors_are_fatal.eh ); + ompi_mpi_comm_world.comm.error_handler = ompi_initial_error_handler_eh; + OBJ_RETAIN( ompi_mpi_comm_world.comm.error_handler ); OMPI_COMM_SET_PML_ADDED(&ompi_mpi_comm_world.comm); opal_pointer_array_set_item (&ompi_mpi_communicators, 0, &ompi_mpi_comm_world); @@ -188,8 +188,8 @@ int ompi_comm_init(void) ompi_mpi_comm_self.comm.c_local_group = group; ompi_mpi_comm_self.comm.c_remote_group = group; OBJ_RETAIN(ompi_mpi_comm_self.comm.c_remote_group); - ompi_mpi_comm_self.comm.error_handler = &ompi_mpi_errors_are_fatal.eh; - OBJ_RETAIN( &ompi_mpi_errors_are_fatal.eh ); + 
ompi_mpi_comm_self.comm.error_handler = ompi_initial_error_handler_eh; + OBJ_RETAIN( ompi_mpi_comm_self.comm.error_handler ); OMPI_COMM_SET_PML_ADDED(&ompi_mpi_comm_self.comm); opal_pointer_array_set_item (&ompi_mpi_communicators, 1, &ompi_mpi_comm_self); @@ -214,8 +214,10 @@ int ompi_comm_init(void) ompi_mpi_comm_null.comm.c_contextid = 2; ompi_mpi_comm_null.comm.c_my_rank = MPI_PROC_NULL; + /* unlike world, self, and parent, comm_null does not inherit the initial error + * handler */ ompi_mpi_comm_null.comm.error_handler = &ompi_mpi_errors_are_fatal.eh; - OBJ_RETAIN( &ompi_mpi_errors_are_fatal.eh ); + OBJ_RETAIN( ompi_mpi_comm_null.comm.error_handler ); opal_pointer_array_set_item (&ompi_mpi_communicators, 2, &ompi_mpi_comm_null); opal_string_copy(ompi_mpi_comm_null.comm.c_name, "MPI_COMM_NULL", @@ -228,6 +230,8 @@ int ompi_comm_init(void) OBJ_RETAIN(&ompi_mpi_comm_null); OBJ_RETAIN(&ompi_mpi_group_null.group); OBJ_RETAIN(&ompi_mpi_errors_are_fatal.eh); + /* During dyn_init, the comm_parent error handler will be set to the same + * as comm_world (thus, the initial error handler). 
*/ /* initialize communicator requests (for ompi_comm_idup) */ ompi_comm_request_init (); diff --git a/ompi/errhandler/errhandler.h b/ompi/errhandler/errhandler.h index 139740089f2..ce14ae6c097 100644 --- a/ompi/errhandler/errhandler.h +++ b/ompi/errhandler/errhandler.h @@ -233,7 +233,9 @@ struct ompi_request_t; int32_t state = ompi_mpi_state; \ if (OPAL_UNLIKELY(state < OMPI_MPI_STATE_INIT_COMPLETED || \ state > OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT)) { \ - ompi_mpi_errors_are_fatal_comm_handler(NULL, NULL, name); \ + ompi_errhandler_invoke(NULL, NULL, -1, \ + ompi_errcode_get_mpi_code(MPI_ERR_ARG), \ + name); \ } \ } From bed909c3baa853a23dc6da2ed0fc31d9d03e3521 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Bouteiller?= Date: Thu, 11 Jun 2020 12:03:19 -0400 Subject: [PATCH 4/6] Read the info key mpi_initial_errhandler from spawn/spawn_multiple MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Aurélien Bouteiller Use the same env to transmit the initial error handler to spawnees Signed-off-by: Aurelien Bouteiller --- ompi/dpm/dpm.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/ompi/dpm/dpm.c b/ompi/dpm/dpm.c index 18c5962f8cf..af205f8a01e 100644 --- a/ompi/dpm/dpm.c +++ b/ompi/dpm/dpm.c @@ -782,6 +782,7 @@ int ompi_dpm_spawn(int count, const char *array_of_commands[], int flag=0; char cwd[OPAL_PATH_MAX]; char host[OPAL_MAX_INFO_VAL]; /*** should define OMPI_HOST_MAX ***/ + char init_errh[OPAL_MAX_INFO_VAL]; char prefix[OPAL_MAX_INFO_VAL]; char stdin_target[OPAL_MAX_INFO_VAL]; char params[OPAL_MAX_INFO_VAL]; @@ -814,6 +815,7 @@ int ompi_dpm_spawn(int count, const char *array_of_commands[], - "file": filename, where additional information is provided. - "soft": see page 92 of MPI-2. - "host": desired host where to spawn the processes + - "mpi_initial_errhandler": the error handler attached to predefined communicators. 
Non-standard keys: - "hostfile": hostfile containing hosts where procs are to be spawned @@ -968,6 +970,15 @@ int ompi_dpm_spawn(int count, const char *array_of_commands[], } #endif + /* check for 'mpi_initial_errhandler' */ + ompi_info_get (array_of_info[i], "mpi_initial_errhandler", sizeof(init_errh) - 1, init_errh, &flag); + if ( flag ) { + /* this is set as an environment because it must be available + * before pmix_init */ + opal_setenv("OMPI_MCA_mpi_initial_errhandler", init_errh, true, &app->env); + continue; + } + /* 'path', 'arch', 'file', 'soft' -- to be implemented */ /* non-standard keys From 5f1f7fe3136730ffcc82ce52b61031ab73046edc Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Tue, 16 Jun 2020 04:20:24 -0400 Subject: [PATCH 5/6] route errors to self/initial error handler depending upon the state of MPI initialization Signed-off-by: Aurelien Bouteiller --- ompi/errhandler/errhandler_invoke.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/ompi/errhandler/errhandler_invoke.c b/ompi/errhandler/errhandler_invoke.c index ad966967156..d913fe20cbf 100644 --- a/ompi/errhandler/errhandler_invoke.c +++ b/ompi/errhandler/errhandler_invoke.c @@ -41,9 +41,25 @@ int ompi_errhandler_invoke(ompi_errhandler_t *errhandler, void *mpi_object, ompi_win_t *win; ompi_file_t *file; - /* If we got no errorhandler, then just invoke errors_are_fatal */ + /* If we got no errorhandler, then route the error to the appropriate + * predefined error handler */ if (NULL == errhandler) { - ompi_mpi_errors_are_fatal_comm_handler(NULL, NULL, message); + int32_t state = ompi_mpi_state; + if (state >= OMPI_MPI_STATE_INIT_COMPLETED && + state < OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) { + comm = &ompi_mpi_comm_self.comm; + comm->error_handler->eh_comm_fn(&comm, &err_code, message, NULL); + } + else { + if(NULL == ompi_initial_error_handler) { + int rc = ompi_initial_errhandler_init(); + if(OMPI_SUCCESS != rc) { + /* don't know what 
else to do... */ + ompi_mpi_errors_are_fatal_comm_handler(NULL, NULL, message); + } + } + ompi_initial_error_handler(NULL, NULL, message); + } return err_code; } From 7118755ae8c72db00e95790e8f35d0c6e00760bd Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Tue, 16 Jun 2020 04:21:25 -0400 Subject: [PATCH 6/6] Add a tester for the initial error handler Signed-off-by: Aurelien Bouteiller --- test/simple/Makefile | 2 +- test/simple/initial_errh.c | 202 +++++++++++++++++++++++++++++++++++++ 2 files changed, 203 insertions(+), 1 deletion(-) create mode 100644 test/simple/initial_errh.c diff --git a/test/simple/Makefile b/test/simple/Makefile index 00e76123e8f..3595945596e 100644 --- a/test/simple/Makefile +++ b/test/simple/Makefile @@ -1,4 +1,4 @@ -PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort comm_abort simple_spawn \ +PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort comm_abort initial_errh simple_spawn \ concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child \ bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help \ crisscross read_write ziatest slave reduce-hang ziaprobe ziatest bcast_loop \ diff --git a/test/simple/initial_errh.c b/test/simple/initial_errh.c new file mode 100644 index 00000000000..27adbd6ce08 --- /dev/null +++ b/test/simple/initial_errh.c @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include "mpi.h" + +#define print1(format...)
if(0 == rank) printf(format) + +int main_child(int argc, char *argv[]); + +int main(int argc, char *argv[]) +{ + int rank=MPI_PROC_NULL, rc; + /* info_env and error handlers */ + char init_errh_info[MPI_MAX_INFO_VAL+1]; int flag; + MPI_Errhandler errh; + /* error ops */ + int eclass=MPI_SUCCESS; + char estr[MPI_MAX_ERROR_STRING]="NOT UPDATED"; int slen; + /* spawn params */ + char* spawn_argv[3]; + MPI_Info spawn_info; + int spawn_err[2] = {MPI_SUCCESS}; + MPI_Comm icomm = MPI_COMM_NULL; + + /* We will verify pre-init behavior in a spawnee to avoid aborting early in + * implementations with only partial support. + */ + if(argc > 1 && 0 == strcmp(argv[1], "preinit-error")) { + return main_child(argc, argv); + } + + /* Lets assume everything goes fine until we inject our own errors, no + * error checking */ + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + print1( +"# This test checks compliance with MPI-4 initial error handler.\n" +"# This test assumes that the command line parameter '-initial-errhandler mpi_errors_return'\n" +"# is passed to 'mpiexec', in which case, a compliant implementation will:\n" +"# * Set the MPI_INFO_ENV key to the requested error handler.\n" +"# * The requested handler is set on the predefined communicators MPI_COMM_SELF, MPI_COMM_WORLD,\n" +"# and the communicator returned from MPI_COMM_GET_PARENT.\n" +"# In a high quality implementation:\n" +"# * Errors reported from calls during, before, and after MPI_INIT and MPI_FINALIZE also invoke the\n" +"# initial error handler.\n" +"# * MPI_ERROR_STRING and MPI_ERROR_CLASS provide useful information before/after MPI_INIT and\n" +"# MPI_FINALIZE respectively.\n\n"); + + print1("MPI_INFO_ENV for key 'mpi_initial_errhandler'\n"); + MPI_Info_get(MPI_INFO_ENV, "mpi_initial_errhandler", MPI_MAX_INFO_VAL, init_errh_info, &flag); + if(flag) { + print1(" MPI-4 COMPLIANT:\tMPI_INFO_ENV value set for key 'mpi_initial_errhandler' = %s\n\n", init_errh_info); + } + else { + print1(" NOT 
MPI-4 COMPLIANT:\tMPI_INFO_ENV has no value set for key 'mpi_initial_errhandler'\n\n"); + } + + print1("MPI_COMM_GET_ERRHANDLER:\n"); + MPI_Comm_get_errhandler(MPI_COMM_SELF, &errh); + if(MPI_ERRORS_RETURN == errh) { + print1(" MPI-4 COMPLIANT:\tMPI_COMM_SELF error handler set to MPI_ERRORS_RETURN.\n\n"); + } + else + if(MPI_ERRORS_ABORT == errh) { + print1(" UNEXPECTED:\tMPI_COMM_SELF error handler set to MPI_ERRORS_ABORT.\n\n"); + } + else + if(MPI_ERRORS_ARE_FATAL == errh) { + print1(" NOT MPI-4 COMPLIANT:\tMPI_COMM_SELF error handler set to MPI_ERRORS_ARE_FATAL.\n\n"); + } + else { + print1(" UNEXPECTED:\tMPI_COMM_SELF error handler is not one of the predefined ones.\n\n"); + } + + sleep(1); + + MPI_Info_create(&spawn_info); + MPI_Info_set(spawn_info, "mpi_initial_errhandler", "mpi_errors_return"); + spawn_argv[0] = argv[0]; + spawn_argv[1] = "preinit-error"; + spawn_argv[2] = NULL; + MPI_Comm_spawn(argv[0], &spawn_argv[1], 1, spawn_info, 0, MPI_COMM_WORLD, &icomm, spawn_err); + + /* wait for the spawnee completion before testing post-finalize error + * handling */ + MPI_Barrier(icomm); + MPI_Comm_disconnect(&icomm); + sleep(2); + + /* set error handler to fatal before FINALIZE */ + rc = MPI_Comm_set_errhandler(MPI_COMM_SELF, MPI_ERRORS_ARE_FATAL); + if(MPI_SUCCESS != rc) { + MPI_Error_string(rc, estr, &slen); + fprintf(stderr, " UNEXPECTED: An error occured during MPI_COMM_SETERRHANDLER(SELF) rc=%d: %s\n", rc, estr); + return rc; + } + /* FINALIZE should force reversion to the initial errhandler, so we need to + * check again (though we did not insert errors so all should go smooth). 
*/ + rc = MPI_Finalize(); + if(MPI_SUCCESS != rc) { + MPI_Error_string(rc, estr, &slen); + fprintf(stderr, " UNEXPECTED: An error occured during MPI_FINALIZE rc=%d: %s\n", rc, estr); + return rc; + } + + printf("Post-finalize MPI_ERROR_STRING call:\n"); + rc = MPI_Error_string(MPI_ERR_WIN, estr, &slen); + if(MPI_SUCCESS != rc) { + fprintf(stderr, " NOT MPI-4 COMPLIANT:\tpost-finalize MPI_ERROR_STRING returned %d (expected MPI_SUCCESS)\n", rc); + } + else if(0 == strcmp(estr, "NOT UPDATED")) { + fprintf(stderr, " NOT MPI-4 COMPLIANT:\tpost-finalize MPI_ERROR_STRING did not set a valid string.\n"); + } + else { + /* We can't further check if the error string makes sense; In any + * case, any string is compliant, even low-quality non-informative + * generic strings. So we just print it. */ + printf(" MPI-4 COMPLIANT:\tpost-finalize MPI_ERROR_STRING for MPI_ERR_WIN: %s\n", estr); + } + return 0; +} + +int main_child(int argc, char *argv[]) { + int rank=0, rc; + MPI_Comm icomm=MPI_COMM_NULL; + int eclass=MPI_SUCCESS; + char estr[MPI_MAX_ERROR_STRING]="NOT UPDATED"; int slen; + + /* ERROR_CLASS and ERROR_STRING are callable before MPI_INIT */ + + printf("Pre-init MPI_ERROR_CLASS call:\n"); + rc = MPI_Error_class(MPI_ERR_WIN, &eclass); + if(MPI_SUCCESS != rc) { + fprintf(stderr, " NOT MPI-4 COMPLIANT:\tpre-init MPI_ERROR_CLASS returned %d (expected MPI_SUCCESS)\n", rc); + } + else if(MPI_ERR_WIN != eclass) { + fprintf(stderr, " NOT MPI-4 COMPLIANT:\tpre-init MPI_ERROR_CLASS set eclass=%d (expected %d)\n", eclass, MPI_ERR_WIN); + } + else { + printf(" MPI-4 COMPLIANT:\tPre-init MPI_ERROR_CLASS\n"); + } + + print1("Pre-init MPI_ERROR_STRING call:\n"); + rc = MPI_Error_string(MPI_ERR_WIN, estr, &slen); + if(MPI_SUCCESS != rc) { + fprintf(stderr, " NOT MPI-4 COMPLIANT:\tpre-init MPI_ERROR_STRING returned %d (expected MPI_SUCCESS)\n", rc); + } + else if(0 == strcmp(estr, "NOT UPDATED")) { + fprintf(stderr, " NOT MPI-4 COMPLIANT:\tpre-init MPI_ERROR_STRING did not set a valid 
string.\n"); + } + else { + /* We can't further check if the error string makes sense; In any + * case, any string is compliant, even low-quality non-informative + * generic strings. So we just print it. */ + printf(" MPI-4 COMPLIANT:\tPre-init MPI_ERROR_STRING for MPI_ERR_WIN: %s\n", estr); + } + + printf("Pre-init error in a call: compliant if it does not abort\n"); + rc = MPI_Error_class(MPI_ERR_LASTCODE+1, &eclass); + eclass = rc; + rc = MPI_Error_string(eclass, estr, &slen); + if(MPI_SUCCESS != rc) { + printf(" MPI-4 COMPLIANT:\tPre-init MPI_ERROR_CLASS with erroneous arguments returned (LOW QUALITY: error code=%d caused error %d in MPI_ERROR_STRING).\n", eclass, rc); + } + else { + printf(" MPI-4 COMPLIANT:\tPre-init MPI_ERROR_STRING for non-existing code returned %d: %s\n", eclass, estr); + } + + printf("Initializing MPI and setting error handlers on predefined communicators.\n"); + rc = MPI_Init(&argc, &argv); + if(MPI_SUCCESS != rc) { + MPI_Error_string(rc, estr, &slen); + fprintf(stderr, " UNEXPECTED: An error occured during MPI_INIT rc=%d: %s\n", rc, estr); + return rc; + } + + /* sync-up with parent */ + MPI_Comm_get_parent(&icomm); + rc = MPI_Comm_set_errhandler(icomm, MPI_ERRORS_ARE_FATAL); + if(MPI_SUCCESS != rc) { + MPI_Error_string(rc, estr, &slen); + fprintf(stderr, " UNEXPECTED: An error occured during MPI_COMM_SETERRHANDLER(PARENT) rc=%d: %s\n", rc, estr); + return rc; + } + MPI_Barrier(icomm); + MPI_Comm_disconnect(&icomm); + + MPI_Finalize(); + return 0; +}