From e14e84acebaddedab396691c9bab70457c46901e Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Fri, 21 Feb 2020 12:13:39 -0600 Subject: [PATCH] sharedfp/individual: defer error when not being able to open datafile This commit changes the behavior of the individual sharedfp component. If the component cannot create either the datafile or the metadatafile during File_open, it no longer raises an error at that point. This allows applications that do not use shared file pointer operations to continue execution without any issue. If the user subsequently calls MPI_File_write_shared or a similar operation, however, an error is raised. Fixes issue #7429 Signed-off-by: Edgar Gabriel (cherry picked from commit df6e3e503aee6954807a6bdc3a73dfa9a7d030af) --- .../sharedfp_individual_file_open.c | 39 +++++++++++++++---- .../individual/sharedfp_individual_write.c | 32 ++++++++------- 2 files changed, 49 insertions(+), 22 deletions(-) diff --git a/ompi/mca/sharedfp/individual/sharedfp_individual_file_open.c b/ompi/mca/sharedfp/individual/sharedfp_individual_file_open.c index ecdb8c723ea..5bff7247362 100644 --- a/ompi/mca/sharedfp/individual/sharedfp_individual_file_open.c +++ b/ompi/mca/sharedfp/individual/sharedfp_individual_file_open.c @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2013-2018 University of Houston. All rights reserved. + * Copyright (c) 2013-2019 University of Houston. All rights reserved. * Copyright (c) 2015-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2016-2017 IBM Corporation. All rights reserved. 
@@ -92,11 +92,18 @@ int mca_sharedfp_individual_file_open (struct ompi_communicator_t *comm, MPI_MODE_RDWR | MPI_MODE_CREATE | MPI_MODE_DELETE_ON_CLOSE, &(MPI_INFO_NULL->super), datafilehandle, false); if ( OMPI_SUCCESS != err) { - opal_output(0, "mca_sharedfp_individual_file_open: Error during datafile file open\n"); + opal_output(ompi_sharedfp_base_framework.framework_output, + "mca_sharedfp_individual_file_open: Error during datafile file open. Continuing anyway. \n"); free (sh); free (datafilename); free (datafilehandle); - return err; + + // We reset the error code here to OMPI_SUCCESS since the individual component can act as + // a dummy component, in case no sharedfp operations are used by the code. Invoking any write/read + // operations will however lead to an error, since the sharedfp_data pointer will be NULL. + sh = NULL; + err = OMPI_SUCCESS; + goto exit; } /*----------------------------------------------------------*/ @@ -113,9 +120,13 @@ int mca_sharedfp_individual_file_open (struct ompi_communicator_t *comm, if ( NULL == metadatafilename ) { free (sh); free (datafilename); + mca_common_ompio_file_close ( datafilehandle); free (datafilehandle); opal_output(0, "mca_sharedfp_individual_file_open: Error during memory allocation\n"); - return OMPI_ERR_OUT_OF_RESOURCE; + + sh=NULL; + err = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; } snprintf ( metadatafilename, len, "%s%s%d", filename, ".metadata.",fh->f_rank); @@ -123,22 +134,34 @@ int mca_sharedfp_individual_file_open (struct ompi_communicator_t *comm, if ( NULL == metadatafilehandle ) { free (sh); free (datafilename); + mca_common_ompio_file_close ( datafilehandle); free (datafilehandle); free (metadatafilename); opal_output(0, "mca_sharedfp_individual_file_open: Error during memory allocation\n"); - return OMPI_ERR_OUT_OF_RESOURCE; + + sh = NULL; + err = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; } err = mca_common_ompio_file_open ( MPI_COMM_SELF,metadatafilename, MPI_MODE_RDWR | MPI_MODE_CREATE | 
MPI_MODE_DELETE_ON_CLOSE, &(MPI_INFO_NULL->super), metadatafilehandle, false); if ( OMPI_SUCCESS != err) { - opal_output(0, "mca_sharedfp_individual_file_open: Error during metadatafile file open\n"); + opal_output(ompi_sharedfp_base_framework.framework_output, + "mca_sharedfp_individual_file_open: Error during metadatafile file open. Continuing anyway. \n"); free (sh); free (datafilename); + mca_common_ompio_file_close ( datafilehandle); free (datafilehandle); free (metadatafilename); free (metadatafilehandle); - return err; + + // We reset the error code here to OMPI_SUCCESS since the individual component can act as + // a dummy component, in case no sharedfp operations are used by the code. Invoking any write/read + // operations will however lead to an error, since the sharedfp_data pointer will be NULL. + sh = NULL; + err = OMPI_SUCCESS; + goto exit; } /*save the datafilehandle and metadatahandle in the sharedfp individual module data structure*/ @@ -150,6 +173,8 @@ int mca_sharedfp_individual_file_open (struct ompi_communicator_t *comm, headnode->metadatafilename = metadatafilename; } + +exit: /*save the sharedfp individual module data structure in the ompio filehandle structure*/ fh->f_sharedfp_data = sh; diff --git a/ompi/mca/sharedfp/individual/sharedfp_individual_write.c b/ompi/mca/sharedfp/individual/sharedfp_individual_write.c index 2b016001913..a1d21fb96c9 100644 --- a/ompi/mca/sharedfp/individual/sharedfp_individual_write.c +++ b/ompi/mca/sharedfp/individual/sharedfp_individual_write.c @@ -54,24 +54,26 @@ int mca_sharedfp_individual_write (ompio_file_t *fh, /*Retrieve data structure for shared file pointer operations*/ sh = fh->f_sharedfp_data; headnode = (mca_sharedfp_individual_header_record*)sh->selected_module_data; + if ( NULL == headnode) { + opal_output (0, "sharedfp_individual_write_ordered: headnode is NULL but file is open\n"); + return OMPI_ERROR; + } - if (headnode) { - /*Insert metadata record into a queue*/ - 
mca_sharedfp_individual_insert_metadata(OMPI_FILE_WRITE_SHARED, totalbytes, sh); - - /*Write the data into individual file*/ - ret = mca_common_ompio_file_write_at ( headnode->datafilehandle, - headnode->datafile_offset, - buf, count, datatype, status); - if ( OMPI_SUCCESS != ret ) { - opal_output(0,"mca_sharedfp_individual_write: Error while writing the datafile \n"); - return -1; - } - - /* Update the datafileoffset*/ - headnode->datafile_offset = headnode->datafile_offset + totalbytes; + /*Insert metadata record into a queue*/ + mca_sharedfp_individual_insert_metadata(OMPI_FILE_WRITE_SHARED, totalbytes, sh); + + /*Write the data into individual file*/ + ret = mca_common_ompio_file_write_at ( headnode->datafilehandle, + headnode->datafile_offset, + buf, count, datatype, status); + if ( OMPI_SUCCESS != ret ) { + opal_output(0,"mca_sharedfp_individual_write: Error while writing the datafile \n"); + return -1; } + /* Update the datafileoffset*/ + headnode->datafile_offset = headnode->datafile_offset + totalbytes; + return ret; }