diff --git a/opal/mca/pmix/pmix3x/pmix/Makefile.am b/opal/mca/pmix/pmix3x/pmix/Makefile.am index 47e21332730..88eda877676 100644 --- a/opal/mca/pmix/pmix3x/pmix/Makefile.am +++ b/opal/mca/pmix/pmix3x/pmix/Makefile.am @@ -11,7 +11,7 @@ # All rights reserved. # Copyright (c) 2006-2016 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved. -# Copyright (c) 2013-2019 Intel, Inc. All rights reserved. +# Copyright (c) 2013-2018 Intel, Inc. All rights reserved. # Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All Rights # reserved. # $COPYRIGHT$ diff --git a/opal/mca/pmix/pmix3x/pmix/NEWS b/opal/mca/pmix/pmix3x/pmix/NEWS index f18016dd7f2..8b102555cba 100644 --- a/opal/mca/pmix/pmix3x/pmix/NEWS +++ b/opal/mca/pmix/pmix3x/pmix/NEWS @@ -1,5 +1,5 @@ Copyright (c) 2015-2019 Intel, Inc. All rights reserved. -Copyright (c) 2017-2019 IBM Corporation. All rights reserved. +Copyright (c) 2017-2020 IBM Corporation. All rights reserved. $COPYRIGHT$ Additional copyrights may follow @@ -21,6 +21,45 @@ example, a bug might be fixed in the master, and then moved to multiple release branches. +3.1.6 -- TBD +---------------------- + + +3.1.5 -- 14 Feb 2020 +---------------------- +NOTE: The signature of the PMIx_Allocation_request has changed +in accordance with an Errata update of the PMIx v3 Standard +- PR #1413/#1465: Remove unnecessary error log +- PR #1433: Return the correct status from PMIx_Publish +- PR #1445: Sync. with master to fix 'get' of data for unknown namespace + Includes the following PRs from master + - PR #1382: dstore: fixed truncate key-names while restoring + - PR #1405: Fix xnspace dmodex and add verbose debug + - PR #1406: Resolve request for job-level data + - PR #1407/#1409/#1411: Fix dmodex across nspaces + - PR #1434/#1436: Cleanup handling of info arrays + - PR #1435: Cleanup example and remove debug + - PR #1437: Update blocking Allocation_request signature + - PR #1440: Fix 'get' of data for unknown namespace + - PR #1442: Fix fences with namespaces where no local processes are running +- PR #1472: Initialize nlocal and local in the tracker +- PR #1487: Sync. with master to fix info array and cross-version issues +- PR #1493/#1497/#1501/#1505/#1589: Info array and Cross-version fixes +- PR #1511/#1517/#1520/#1523/#1534/#1565: Fix pmix tests +- PR #1530: Improve handling of servers piror to v3.1.5 +- PR #1531: Update transfer from hash to dstore +- PR #1538: Fix singleton initialization +- PR #1547: Add missing PMIx_IOF_deregister function +- PR #1554/#1591: Fix memory leak on namespace deregister +- PR #1561: Configury fix for reproducible builds +- PR #1579: Protect pthread setpshared calls +- PR #1587: Fix to gds/dstore configure logic +- PR #1610: Adjust hotel timeout to be in whole seconds +- PR #1613: dstore: Fix cache size calculation +- PR #1622: Fix multiple occurrences of unaligned access in pmix tests +- PR #1620: Re-address the collective tracker problem + + 3.1.4 -- 9 Aug 2019 ---------------------- - PR #1342: Fix if_linux_ipv6_open interface filter diff --git a/opal/mca/pmix/pmix3x/pmix/VERSION b/opal/mca/pmix/pmix3x/pmix/VERSION index 8d2b40af3ea..f3fb512ecf3 100644 --- a/opal/mca/pmix/pmix3x/pmix/VERSION +++ b/opal/mca/pmix/pmix3x/pmix/VERSION @@ -4,6 +4,8 @@ # Copyright (c) 2013 Mellanox Technologies, Inc. # All rights reserved. # Copyright (c) 2014-2016 Intel, Inc. All rights reserved. +# Copyright (c) 2020 IBM Corporation. All rights reserved. + # This is the VERSION file for PMIx, describing the precise # version of PMIx in this distribution. The various components of @@ -15,7 +17,7 @@ major=3 minor=1 -release=4 +release=5 # greek is used for alpha or beta release tags. If it is non-empty, # it will be appended to the version number. It does not have to be @@ -23,14 +25,14 @@ release=4 # The only requirement is that it must be entirely printable ASCII # characters and have no white space. -greek= +greek=rc2 # If repo_rev is empty, then the repository version number will be # obtained during "make dist" via the "git describe --tags --always" # command, or with the date (if "git describe" fails) in the form of # "date". -repo_rev=gite6837057 +repo_rev=git1fca232 # If tarball_version is not empty, it is used as the version string in # the tarball filename, regardless of all other versions listed in @@ -44,7 +46,7 @@ tarball_version= # The date when this release was created -date="Aug 09, 2019" +date="Feb 10, 2020" # The shared library version of each of PMIx's public libraries. # These versions are maintained in accordance with the "Library @@ -75,7 +77,7 @@ date="Aug 09, 2019" # Version numbers are described in the Libtool current:revision:age # format. -libpmix_so_version=4:24:2 +libpmix_so_version=4:25:2 libpmi_so_version=1:1:0 libpmi2_so_version=1:0:0 @@ -84,4 +86,4 @@ libpmi2_so_version=1:0:0 # # well. Yuck; this somewhat breaks the # # components-don't-affect-the-build-system abstraction. # -libmca_common_dstore_so_version=1:1:0 +libmca_common_dstore_so_version=1:2:0 diff --git a/opal/mca/pmix/pmix3x/pmix/config/Makefile.am b/opal/mca/pmix/pmix3x/pmix/config/Makefile.am index ebc3af9d96a..ffe29370600 100644 --- a/opal/mca/pmix/pmix3x/pmix/config/Makefile.am +++ b/opal/mca/pmix/pmix3x/pmix/config/Makefile.am @@ -1,4 +1,4 @@ -# Copyright (c) 2013-2019 Intel, Inc. All rights reserved. +# Copyright (c) 2013-2016 Intel, Inc. All rights reserved # Copyright (c) 2016 Research Organization for Information Science # and Technology (RIST). All rights reserved. # Copyright (c) 2006-2016 Cisco Systems, Inc. All rights reserved. diff --git a/opal/mca/pmix/pmix3x/pmix/config/c_get_alignment.m4 b/opal/mca/pmix/pmix3x/pmix/config/c_get_alignment.m4 index 6596c0ae88d..4ef0ba25e32 100644 --- a/opal/mca/pmix/pmix3x/pmix/config/c_get_alignment.m4 +++ b/opal/mca/pmix/pmix3x/pmix/config/c_get_alignment.m4 @@ -11,7 +11,7 @@ dnl University of Stuttgart. All rights reserved. dnl Copyright (c) 2004-2005 The Regents of the University of California. dnl All rights reserved. dnl Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. -dnl Copyright (c) 2014-2019 Intel, Inc. All rights reserved. +dnl Copyright (c) 2014-2015 Intel, Inc. All rights reserved. dnl Copyright (c) 2015-2019 Research Organization for Information Science dnl and Technology (RIST). All rights reserved. dnl $COPYRIGHT$ diff --git a/opal/mca/pmix/pmix3x/pmix/config/distscript.sh b/opal/mca/pmix/pmix3x/pmix/config/distscript.sh index e5c948f15f1..d4a9dce6ab3 100755 --- a/opal/mca/pmix/pmix3x/pmix/config/distscript.sh +++ b/opal/mca/pmix/pmix3x/pmix/config/distscript.sh @@ -15,7 +15,7 @@ # and Technology (RIST). All rights reserved. # Copyright (c) 2015 Los Alamos National Security, LLC. All rights # reserved. -# Copyright (c) 2017-2019 Intel, Inc. All rights reserved. +# Copyright (c) 2017 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow diff --git a/opal/mca/pmix/pmix3x/pmix/config/pmix.m4 b/opal/mca/pmix/pmix3x/pmix/config/pmix.m4 index 1d37089f8ae..b057aa91a67 100644 --- a/opal/mca/pmix/pmix3x/pmix/config/pmix.m4 +++ b/opal/mca/pmix/pmix3x/pmix/config/pmix.m4 @@ -17,7 +17,7 @@ dnl Copyright (c) 2009 Los Alamos National Security, LLC. All rights dnl reserved. dnl Copyright (c) 2009-2011 Oak Ridge National Labs. All rights reserved. dnl Copyright (c) 2011-2013 NVIDIA Corporation. All rights reserved. -dnl Copyright (c) 2013-2019 Intel, Inc. All rights reserved. +dnl Copyright (c) 2013-2020 Intel, Inc. All rights reserved. dnl Copyright (c) 2015-2019 Research Organization for Information Science dnl and Technology (RIST). All rights reserved. dnl Copyright (c) 2016 Mellanox Technologies, Inc. @@ -785,6 +785,15 @@ AC_DEFUN([PMIX_SETUP_CORE],[ PMIX_ZLIB_CONFIG + ################################## + # Dstore Locking + ################################## + + pmix_show_title "Dstore Locking" + + PMIX_CHECK_DSTOR_LOCK + + ################################## # MCA ################################## @@ -823,13 +832,6 @@ AC_DEFUN([PMIX_SETUP_CORE],[ PMIX_MCA - ################################## - # Dstore Locking - ################################## - - pmix_show_title "Dstore Locking" - - PMIX_CHECK_DSTOR_LOCK ############################################################################ # final compiler config diff --git a/opal/mca/pmix/pmix3x/pmix/config/pmix_check_compiler_version.m4 b/opal/mca/pmix/pmix3x/pmix/config/pmix_check_compiler_version.m4 index da822b04810..4704b8f1c6c 100644 --- a/opal/mca/pmix/pmix3x/pmix/config/pmix_check_compiler_version.m4 +++ b/opal/mca/pmix/pmix3x/pmix/config/pmix_check_compiler_version.m4 @@ -1,7 +1,7 @@ dnl -*- shell-script -*- dnl dnl Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. -dnl Copyright (c) 2013-2019 Intel, Inc. All rights reserved. +dnl Copyright (c) 2013-2017 Intel, Inc. All rights reserved. dnl Copyright (c) 2019 Research Organization for Information Science dnl and Technology (RIST). All rights reserved. dnl diff --git a/opal/mca/pmix/pmix3x/pmix/config/pmix_check_icc.m4 b/opal/mca/pmix/pmix3x/pmix/config/pmix_check_icc.m4 index 05ce9431bd3..de92a5d66cf 100644 --- a/opal/mca/pmix/pmix3x/pmix/config/pmix_check_icc.m4 +++ b/opal/mca/pmix/pmix3x/pmix/config/pmix_check_icc.m4 @@ -10,7 +10,7 @@ dnl Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, dnl University of Stuttgart. All rights reserved. dnl Copyright (c) 2004-2005 The Regents of the University of California. dnl All rights reserved. -dnl Copyright (c) 2014-2019 Intel, Inc. All rights reserved. +dnl Copyright (c) 2014 Intel, Inc. All rights reserved. dnl Copyright (c) 2016-2019 Research Organization for Information Science dnl and Technology (RIST). All rights reserved. dnl $COPYRIGHT$ diff --git a/opal/mca/pmix/pmix3x/pmix/config/pmix_check_lock.m4 b/opal/mca/pmix/pmix3x/pmix/config/pmix_check_lock.m4 index 0590dcf56cd..182b8329014 100644 --- a/opal/mca/pmix/pmix3x/pmix/config/pmix_check_lock.m4 +++ b/opal/mca/pmix/pmix3x/pmix/config/pmix_check_lock.m4 @@ -5,7 +5,7 @@ dnl All rights reserved. dnl Copyright (c) 2017 IBM Corporation. All rights reserved. dnl Copyright (c) 2017 Research Organization for Information Science dnl and Technology (RIST). All rights reserved. -dnl Copyright (c) 2017 Intel, Inc. All rights reserved. +dnl Copyright (c) 2017-2020 Intel, Inc. All rights reserved. dnl $COPYRIGHT$ dnl dnl Additional copyrights may follow @@ -14,35 +14,61 @@ dnl $HEADER$ dnl AC_DEFUN([PMIX_CHECK_DSTOR_LOCK],[ + + PMIX_VAR_SCOPE_PUSH(orig_libs pmix_prefer_write_nonrecursive) + orig_libs=$LIBS LIBS="-lpthread $LIBS" - _x_ac_pthread_lock_found="0" - _x_ac_fcntl_lock_found="0" + _x_ac_pthread_lock_found=0 + _x_ac_fcntl_lock_found=0 + pmix_prefer_write_nonrecursive=0 - AC_CHECK_MEMBERS([struct flock.l_type], - [ - AC_DEFINE([HAVE_FCNTL_FLOCK], [1], - [Define to 1 if you have the locking by fcntl.]) - _x_ac_fcntl_lock_found="1" - ], [], [#include ]) + AC_CHECK_MEMBER([struct flock.l_type], + [pmix_fcntl_flock_happy=yes + _x_ac_fcntl_lock_found=1], + [pmix_fcntl_flock_happy=no], + [#include ]) if test "$DSTORE_PTHREAD_LOCK" = "1"; then + + AC_MSG_CHECKING([pthread_process_shared]) + AC_EGREP_CPP([yes], + [#include + #ifdef PTHREAD_PROCESS_SHARED + yes + #endif + ], + [AC_MSG_RESULT(yes) + pmix_pthread_process_shared=yes], + [AC_MSG_RESULT(no) + pmix_pthread_process_shared=no]) + AC_CHECK_FUNC([pthread_rwlockattr_setkind_np], - [AC_EGREP_HEADER([PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP], - [pthread.h],[ - AC_DEFINE([HAVE_PTHREAD_SETKIND], [1], - [Define to 1 if you have the `pthread_rwlockattr_setkind_np` function.])])]) + [pmix_pthread_rwlockattr_setkind_np=yes + AC_EGREP_CPP([yes], + [#include + #ifdef PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP + yes + #endif + ], + [pmix_pthread_rwlock_prefer_writer_nonrecursive_np=yes], + [pmix_pthread_rwlock_prefer_writer_nonrecursive_np=no])], + [pmix_pthread_rwlockattr_setkind_np=no]) AC_CHECK_FUNC([pthread_rwlockattr_setpshared], - [AC_EGREP_HEADER([PTHREAD_PROCESS_SHARED], - [pthread.h],[ - AC_DEFINE([HAVE_PTHREAD_SHARED], [1], - [Define to 1 if you have the `PTHREAD_PROCESS_SHARED` definition. - ]) - _x_ac_pthread_lock_found="1" - ]) - ]) + [pmix_pthread_rwlockattr_setpshared=yes + AS_IF([test "$pmix_pthread_process_shared" = "yes"], + [_x_ac_pthread_lock_found=1]]), + [pmix_pthread_rwlockattr_setpshared=no]) + + AC_CHECK_FUNC([pthread_mutexattr_setpshared], + [pmix_pthread_mutexattr_setpshared=yes], + [pmix_pthread_mutexattr_setpshared=no]) + + AS_IF([test "$pmix_pthread_rwlockattr_setkind_np" = "yes" && test "$pmix_pthread_rwlock_prefer_writer_nonrecursive_np" = "yes"], + [pmix_prefer_write_nonrecursive=1], + [pmix_prefer_write_nonrecursive=0]) if test "$_x_ac_pthread_lock_found" = "0"; then if test "$_x_ac_fcntl_lock_found" = "1"; then @@ -57,6 +83,12 @@ AC_DEFUN([PMIX_CHECK_DSTOR_LOCK],[ fi fi LIBS="$orig_libs" + + AC_DEFINE_UNQUOTED([PMIX_PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP], + [$pmix_prefer_write_nonrecursive], + [Whether or not we found the optional write_nonrecursive_np flag]) AM_CONDITIONAL([HAVE_DSTORE_PTHREAD_LOCK], [test "$_x_ac_pthread_lock_found" = "1"]) AM_CONDITIONAL([HAVE_DSTORE_FCNTL_LOCK], [test "$_x_ac_fcntl_lock_found" = "1"]) + + PMIX_VAR_SCOPE_POP ]) diff --git a/opal/mca/pmix/pmix3x/pmix/config/pmix_config_pthreads.m4 b/opal/mca/pmix/pmix3x/pmix/config/pmix_config_pthreads.m4 index b23f66ebb01..6a93035e2b4 100644 --- a/opal/mca/pmix/pmix3x/pmix/config/pmix_config_pthreads.m4 +++ b/opal/mca/pmix/pmix3x/pmix/config/pmix_config_pthreads.m4 @@ -10,7 +10,7 @@ dnl University of Stuttgart. All rights reserved. dnl Copyright (c) 2004-2005 The Regents of the University of California. dnl All rights reserved. dnl Copyright (c) 2012 Cisco Systems, Inc. All rights reserved. -dnl Copyright (c) 2014-2019 Intel, Inc. All rights reserved. +dnl Copyright (c) 2014-2020 Intel, Inc. All rights reserved. dnl Copyright (c) 2014-2016 Research Organization for Information Science dnl and Technology (RIST). All rights reserved. dnl $COPYRIGHT$ @@ -273,11 +273,6 @@ PMIX_INTL_POSIX_THREADS_SPECIAL_FLAGS # Try the normal linking methods (that's no fun) PMIX_INTL_POSIX_THREADS_LIBS -# -# check to see if we can create shared memory mutexes and conditions -# -AC_CHECK_FUNCS([pthread_mutexattr_setpshared pthread_condattr_setpshared]) - # # check to see if we can set error checking mutexes # diff --git a/opal/mca/pmix/pmix3x/pmix/config/pmix_functions.m4 b/opal/mca/pmix/pmix3x/pmix/config/pmix_functions.m4 index e9b237cd53f..a75d339b2bd 100644 --- a/opal/mca/pmix/pmix3x/pmix/config/pmix_functions.m4 +++ b/opal/mca/pmix/pmix3x/pmix/config/pmix_functions.m4 @@ -94,9 +94,19 @@ EOF # Save some stats about this build # -PMIX_CONFIGURE_USER="`whoami`" -PMIX_CONFIGURE_HOST="`(hostname || uname -n) 2> /dev/null | sed 1q`" -PMIX_CONFIGURE_DATE="`date`" +DATE_FMT="+%Y-%m-%dT%H:%M:%S" +if test -n "$SOURCE_DATE_EPOCH" ; then + PMIX_CONFIGURE_USER="reproduciblebuild" + PMIX_CONFIGURE_HOST="reproduciblebuild" + PMIX_CONFIGURE_DATE=$(date -u -d "@$SOURCE_DATE_EPOCH" "$DATE_FMT" 2>/dev/null || date -u -r "$SOURCE_DATE_EPOCH" "$DATE_FMT" 2>/dev/null || date -u "$DATE_FMT") +else + PMIX_CONFIGURE_USER="`whoami`" + PMIX_CONFIGURE_HOST="`(hostname || uname -n) 2> /dev/null | sed 1q`" + PMIX_CONFIGURE_DATE="`date $DATE_FMT`" +fi + +AC_SUBST([SOURCE_DATE_EPOCH]) +AM_CONDITIONAL([SOURCE_DATE_EPOCH_SET], [test -n "$SOURCE_DATE_EPOCH"]) # # Save these details so that they can be used in pmix_info later diff --git a/opal/mca/pmix/pmix3x/pmix/contrib/pmix.spec b/opal/mca/pmix/pmix3x/pmix/contrib/pmix.spec index ae488781f7e..ad3e79c2972 100644 --- a/opal/mca/pmix/pmix3x/pmix/contrib/pmix.spec +++ b/opal/mca/pmix/pmix3x/pmix/contrib/pmix.spec @@ -12,7 +12,7 @@ # Copyright (c) 2006-2016 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2013 Mellanox Technologies, Inc. # All rights reserved. -# Copyright (c) 2015-2019 Intel, Inc. All rights reserved. +# Copyright (c) 2015-2018 Intel, Inc. All rights reserved. # Copyright (c) 2015 Research Organization for Information Science # and Technology (RIST). All rights reserved. # $COPYRIGHT$ @@ -192,7 +192,7 @@ Summary: An extended/exascale implementation of PMI Name: %{?_name:%{_name}}%{!?_name:pmix} -Version: 3.1.4 +Version: 3.1.5rc2 Release: 1%{?dist} License: BSD Group: Development/Libraries diff --git a/opal/mca/pmix/pmix3x/pmix/examples/dmodex.c b/opal/mca/pmix/pmix3x/pmix/examples/dmodex.c index 76a1ac8ca0c..8a95a405b78 100644 --- a/opal/mca/pmix/pmix3x/pmix/examples/dmodex.c +++ b/opal/mca/pmix/pmix3x/pmix/examples/dmodex.c @@ -23,67 +23,16 @@ * */ -#include +#include +#include -#define _GNU_SOURCE #include - #include #include #include -#include -#include "examples.h" - static uint32_t nprocs; static pmix_proc_t myproc; -static uint32_t getcount = 0; - -static void opcbfunc(pmix_status_t status, void *cbdata) -{ - mylock_t *lock = (mylock_t*)cbdata; - - fprintf(stderr, "%s:%d completed fence_nb\n", myproc.nspace, myproc.rank); - lock->status = status; - DEBUG_WAKEUP_THREAD(lock); -} - -static void valcbfunc(pmix_status_t status, - pmix_value_t *val, void *cbdata) -{ - char *key = (char*)cbdata; - - if (PMIX_SUCCESS == status) { - if (NULL != strstr(key, "local")) { - if (PMIX_UINT64 != val->type) { - fprintf(stderr, "%s:%d: PMIx_Get_nb Key %s returned wrong type: %d\n", myproc.nspace, myproc.rank, key, val->type); - goto done; - } - if (1234 != val->data.uint64) { - fprintf(stderr, "%s:%d: PMIx_Get_nb Key %s returned wrong value: %d\n", myproc.nspace, myproc.rank, key, (int)val->data.uint64); - goto done; - } - } else if (NULL != strstr(key, "remote")) { - if (PMIX_STRING != val->type) { - fprintf(stderr, "%s:%d: PMIx_Get_nb Key %s returned wrong type: %d\n", myproc.nspace, myproc.rank, key, val->type); - goto done; - } - if (0 != strcmp(val->data.string, "1234")) { - fprintf(stderr, "%s:%d: PMIx_Get_nb Key %s returned wrong value: %s\n", myproc.nspace, myproc.rank, key, val->data.string); - goto done; - } - } else { - fprintf(stderr, "%s:%d PMIx_Get_nb returned wrong key: %s\n", myproc.nspace, myproc.rank, key); - goto done; - } - fprintf(stderr, "%s:%d PMIx_Get_nb Key %s returned correctly\n", myproc.nspace, myproc.rank, key); - } else { - fprintf(stderr, "%s:%d PMIx_Get_nb Key %s failed\n", myproc.nspace, myproc.rank, key); - } - done: - free(key); - getcount++; -} int main(int argc, char **argv) { @@ -92,8 +41,11 @@ int main(int argc, char **argv) pmix_value_t *val = &value; char *tmp; pmix_proc_t proc; - uint32_t n, num_gets; - mylock_t mylock; + uint32_t n, k, nlocal; + bool local, all_local; + char **peers; + pmix_rank_t *locals; + uint8_t j; /* init us */ if (PMIX_SUCCESS != (rc = PMIx_Init(&myproc, NULL, 0))) { @@ -102,58 +54,55 @@ int main(int argc, char **argv) } fprintf(stderr, "Client ns %s rank %d: Running\n", myproc.nspace, myproc.rank); - /* get our universe size */ - PMIX_PROC_CONSTRUCT(&proc); + /* get our job size */ (void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_NSLEN); proc.rank = PMIX_RANK_WILDCARD; - if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_UNIV_SIZE, NULL, 0, &val))) { - fprintf(stderr, "Client ns %s rank %d: PMIx_Get universe size failed: %d\n", myproc.nspace, myproc.rank, rc); + if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_JOB_SIZE, NULL, 0, &val))) { + fprintf(stderr, "Client ns %s rank %d: PMIx_Get job size failed: %s\n", + myproc.nspace, myproc.rank, PMIx_Error_string(rc)); goto done; } nprocs = val->data.uint32; PMIX_VALUE_RELEASE(val); - fprintf(stderr, "Client %s:%d universe size %d\n", myproc.nspace, myproc.rank, nprocs); + fprintf(stderr, "Client %s:%d job size %d\n", myproc.nspace, myproc.rank, nprocs); /* put a few values */ - if (0 > asprintf(&tmp, "%s-%d-internal", myproc.nspace, myproc.rank)) { - exit(1); - } + (void)asprintf(&tmp, "%s-%d-internal", myproc.nspace, myproc.rank); value.type = PMIX_UINT32; value.data.uint32 = 1234; if (PMIX_SUCCESS != (rc = PMIx_Store_internal(&myproc, tmp, &value))) { fprintf(stderr, "Client ns %s rank %d: PMIx_Store_internal failed: %d\n", myproc.nspace, myproc.rank, rc); goto done; } - free(tmp); - if (0 > asprintf(&tmp, "%s-%d-local", myproc.nspace, myproc.rank)) { - exit(1); - } + (void)asprintf(&tmp, "%s-%d-local", myproc.nspace, myproc.rank); value.type = PMIX_UINT64; value.data.uint64 = 1234; if (PMIX_SUCCESS != (rc = PMIx_Put(PMIX_LOCAL, tmp, &value))) { fprintf(stderr, "Client ns %s rank %d: PMIx_Put internal failed: %d\n", myproc.nspace, myproc.rank, rc); goto done; } - free(tmp); - if (0 > asprintf(&tmp, "%s-%d-remote", myproc.nspace, myproc.rank)) { - exit(1); - } + (void)asprintf(&tmp, "%s-%d-remote", myproc.nspace, myproc.rank); value.type = PMIX_STRING; value.data.string = "1234"; - if (PMIX_SUCCESS != (rc = PMIx_Put(PMIX_REMOTE, tmp, &value))) { + if (PMIX_SUCCESS != (rc = PMIx_Put(PMIX_GLOBAL, tmp, &value))) { fprintf(stderr, "Client ns %s rank %d: PMIx_Put internal failed: %d\n", myproc.nspace, myproc.rank, rc); goto done; } - free(tmp); - /* introduce a delay by one rank so we can check what happens - * if a "get" is received prior to data being provided */ - - if (0 == myproc.rank) { - sleep(2); + value.type = PMIX_BYTE_OBJECT; + value.data.bo.bytes = (char*)malloc(128); + for (j=0; j < 128; j++) { + value.data.bo.bytes[j] = j; + } + value.data.bo.size = 128; + if (PMIX_SUCCESS != (rc = PMIx_Put(PMIX_GLOBAL, "ghex", &value))) { + fprintf(stderr, "Client ns %s rank %d: PMIx_Put ghex failed: %d\n", myproc.nspace, myproc.rank, rc); + PMIX_VALUE_DESTRUCT(&value); + goto done; } + PMIX_VALUE_DESTRUCT(&value); /* commit the data to the server */ if (PMIX_SUCCESS != (rc = PMIx_Commit())) { @@ -161,56 +110,92 @@ int main(int argc, char **argv) goto done; } - /* call fence_nb, but don't return any data */ - PMIX_PROC_CONSTRUCT(&proc); - (void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_NSLEN); - proc.rank = PMIX_RANK_WILDCARD; - DEBUG_CONSTRUCT_LOCK(&mylock); - if (PMIX_SUCCESS != (rc = PMIx_Fence_nb(&proc, 1, NULL, 0, opcbfunc, &mylock))) { - fprintf(stderr, "Client ns %s rank %d: PMIx_Fence failed: %d\n", myproc.nspace, myproc.rank, rc); - DEBUG_DESTRUCT_LOCK(&mylock); + /* get a list of our local peers */ + if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_LOCAL_PEERS, NULL, 0, &val))) { + fprintf(stderr, "Client ns %s rank %d: PMIx_Get local peers failed: %s\n", + myproc.nspace, myproc.rank, PMIx_Error_string(rc)); goto done; } + /* split the returned string to get the rank of each local peer */ + peers = pmix_argv_split(val->data.string, ','); + PMIX_VALUE_RELEASE(val); + nlocal = pmix_argv_count(peers); + if (nprocs == nlocal) { + all_local = true; + } else { + all_local = false; + locals = (pmix_rank_t*)malloc(pmix_argv_count(peers) * sizeof(pmix_rank_t)); + for (n=0; NULL != peers[n]; n++) { + locals[n] = strtoul(peers[n], NULL, 10); + } + } + pmix_argv_free(peers); /* get the committed data - ask for someone who doesn't exist as well */ - num_gets = 0; - for (n=0; n <= nprocs; n++) { - if (0 > asprintf(&tmp, "%s-%d-local", myproc.nspace, n)) { - exit(1); - } - (void)strncpy(proc.nspace, tmp, PMIX_MAX_NSLEN); - proc.rank = n; - if (PMIX_SUCCESS != (rc = PMIx_Get_nb(&proc, tmp, - NULL, 0, valcbfunc, tmp))) { - fprintf(stderr, "Client ns %s rank %d: PMIx_Get %s failed: %d\n", myproc.nspace, n, tmp, rc); - goto done; + for (n=0; n < nprocs; n++) { + if (all_local) { + local = true; + } else { + local = false; + /* see if this proc is local to us */ + for (k=0; k < nlocal; k++) { + if (proc.rank == locals[k]) { + local = true; + break; + } + } } - ++num_gets; - if (0 > asprintf(&tmp, "%s-%d-remote", myproc.nspace, n)) { - exit(1); + if (local) { + (void)asprintf(&tmp, "%s-%d-local", myproc.nspace, n); + proc.rank = n; + if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, tmp, NULL, 0, &val))) { + fprintf(stderr, "Client ns %s rank %d: PMIx_Get %s failed: %d\n", myproc.nspace, n, tmp, rc); + goto done; + } + if (PMIX_UINT64 != val->type) { + fprintf(stderr, "%s:%d: PMIx_Get Key %s returned wrong type: %d\n", myproc.nspace, myproc.rank, tmp, val->type); + goto done; + } + if (1234 != val->data.uint64) { + fprintf(stderr, "%s:%d: PMIx_Get Key %s returned wrong value: %d\n", myproc.nspace, myproc.rank, tmp, (int)val->data.uint64); + goto done; + } + fprintf(stderr, "%s:%d Local value for %s:%d successfully retrieved\n", myproc.nspace, myproc.rank, proc.nspace, proc.rank); + } else { + (void)asprintf(&tmp, "%s-%d-remote", myproc.nspace, n); + if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, tmp, NULL, 0, &val))) { + fprintf(stderr, "Client ns %s rank %d: PMIx_Get %s failed: %d\n", myproc.nspace, n, tmp, rc); + goto done; + } + if (PMIX_STRING != val->type) { + fprintf(stderr, "%s:%d: PMIx_Get Key %s returned wrong type: %d\n", myproc.nspace, myproc.rank, tmp, val->type); + goto done; + } + if (0 != strcmp(val->data.string, "1234")) { + fprintf(stderr, "%s:%d: PMIx_Get Key %s returned wrong value: %s\n", myproc.nspace, myproc.rank, tmp, val->data.string); + goto done; + } + fprintf(stderr, "%s:%d Remote value for %s:%d successfully retrieved\n", myproc.nspace, myproc.rank, proc.nspace, proc.rank); } - (void)strncpy(proc.nspace, tmp, PMIX_MAX_NSLEN); - if (PMIX_SUCCESS != (rc = PMIx_Get_nb(&proc, tmp, - NULL, 0, valcbfunc, tmp))) { - fprintf(stderr, "Client ns %s rank %d: PMIx_Get %s failed: %d\n", myproc.nspace, n, tmp, rc); - goto done; + /* if this isn't us, then get the ghex key */ + if (n != myproc.rank) { + if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, "ghex", NULL, 0, &val))) { + fprintf(stderr, "Client ns %s rank %d: PMIx_Get ghex failed: %d\n", myproc.nspace, n, rc); + goto done; + } + if (PMIX_BYTE_OBJECT != val->type) { + fprintf(stderr, "%s:%d: PMIx_Get ghex returned wrong type: %d\n", myproc.nspace, myproc.rank, val->type); + goto done; + } + if (128 != val->data.bo.size) { + fprintf(stderr, "%s:%d: PMIx_Get ghex returned wrong size: %d\n", myproc.nspace, myproc.rank, (int)val->data.bo.size); + goto done; + } + fprintf(stderr, "%s:%d Ghex for %s:%d successfully retrieved\n", myproc.nspace, myproc.rank, proc.nspace, proc.rank); } - ++num_gets; } - /* wait for the first fence to finish */ - DEBUG_WAIT_THREAD(&mylock); - - /* wait for all my "get" calls to complete */ - while (getcount < num_gets) { - struct timespec ts; - ts.tv_sec = 0; - ts.tv_nsec = 100000; - nanosleep(&ts, NULL); - } - - /* call fence again so everyone waits before leaving */ - (void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_NSLEN); + /* call fence so everyone waits before leaving */ proc.rank = PMIX_RANK_WILDCARD; if (PMIX_SUCCESS != (rc = PMIx_Fence(&proc, 1, NULL, 0))) { fprintf(stderr, "Client ns %s rank %d: PMIx_Fence failed: %d\n", myproc.nspace, myproc.rank, rc); @@ -219,7 +204,7 @@ int main(int argc, char **argv) done: /* finalize us */ - fprintf(stderr, "Client ns %s rank %d: Finalizing", myproc.nspace, myproc.rank); + fprintf(stderr, "Client ns %s rank %d: Finalizing\n", myproc.nspace, myproc.rank); if (PMIX_SUCCESS != (rc = PMIx_Finalize(NULL, 0))) { fprintf(stderr, "Client ns %s rank %d:PMIx_Finalize failed: %d\n", myproc.nspace, myproc.rank, rc); } else { diff --git a/opal/mca/pmix/pmix3x/pmix/include/pmix.h b/opal/mca/pmix/pmix3x/pmix/include/pmix.h index ba29692c99d..61ce017ca6a 100644 --- a/opal/mca/pmix/pmix3x/pmix/include/pmix.h +++ b/opal/mca/pmix/pmix3x/pmix/include/pmix.h @@ -473,7 +473,8 @@ PMIX_EXPORT pmix_status_t PMIx_Log_nb(const pmix_info_t data[], size_t ndata, * previously released is included. */ PMIX_EXPORT pmix_status_t PMIx_Allocation_request(pmix_alloc_directive_t directive, - pmix_info_t *info, size_t ninfo); + pmix_info_t *info, size_t ninfo, + pmix_info_t **results, size_t *nresults); PMIX_EXPORT pmix_status_t PMIx_Allocation_request_nb(pmix_alloc_directive_t directive, pmix_info_t *info, size_t ninfo, @@ -620,8 +621,8 @@ PMIX_EXPORT pmix_status_t PMIx_Validate_credential(const pmix_byte_object_t *cre * * source - the nspace/rank of the process that generated the data * - * payload - pointer to character array containing the data. Note that - * multiple strings may be included, and that the array may + * payload - pointer to a PMIx byte object containing the data. Note that + * multiple strings may be included, and that the data may * _not_ be NULL terminated * * info - an optional array of info provided by the source containing @@ -630,7 +631,7 @@ PMIX_EXPORT pmix_status_t PMIx_Validate_credential(const pmix_byte_object_t *cre * ninfo - number of elements in the optional info array */ typedef void (*pmix_iof_cbfunc_t)(size_t iofhdlr, pmix_iof_channel_t channel, - pmix_proc_t *source, char *payload, + pmix_proc_t *source, pmix_byte_object_t *payload, pmix_info_t info[], size_t ninfo); @@ -654,7 +655,9 @@ PMIX_EXPORT pmix_status_t PMIx_Validate_credential(const pmix_byte_object_t *cre * NOTE: STDIN is not supported as it will always * be delivered to the stdin file descriptor * - * cbfunc - function to be called when relevant IO is received + * cbfunc - function to be called when relevant IO is received. A + * NULL indicates that the IO is to be written to stdout + * or stderr as per the originating channel * * regcbfunc - since registration is async, this is the * function to be called when registration is diff --git a/opal/mca/pmix/pmix3x/pmix/include/pmix_common.h.in b/opal/mca/pmix/pmix3x/pmix/include/pmix_common.h.in index a3039ff6748..62c4e3e01eb 100644 --- a/opal/mca/pmix/pmix3x/pmix/include/pmix_common.h.in +++ b/opal/mca/pmix/pmix3x/pmix/include/pmix_common.h.in @@ -676,7 +676,7 @@ typedef uint32_t pmix_rank_t; #define PMIX_IOF_TAG_OUTPUT "pmix.iof.tag" // (bool) Tag output with the channel it comes from #define PMIX_IOF_TIMESTAMP_OUTPUT "pmix.iof.ts" // (bool) Timestamp output #define PMIX_IOF_XML_OUTPUT "pmix.iof.xml" // (bool) Format output in XML - +#define PMIX_IOF_STOP "pmix.iof.stop" // (bool) Stop forwarding the specified channel(s) /* Attributes for controlling contents of application setup data */ #define PMIX_SETUP_APP_ENVARS "pmix.setup.env" // (bool) harvest and include relevant envars @@ -735,7 +735,7 @@ typedef int pmix_status_t; * at least defined to ensure older codes will compile */ #define PMIX_SUCCESS 0 #define PMIX_ERROR -1 // general error -#define PMIX_ERR_SILENT -2 // internal-only +#define PMIX_ERR_SILENT -2 /* debugger release flag */ #define PMIX_ERR_DEBUGGER_RELEASE -3 /* fault tolerance */ @@ -749,45 +749,45 @@ typedef int pmix_status_t; /* communication failures */ #define PMIX_ERR_SERVER_FAILED_REQUEST -10 #define PMIX_EXISTS -11 -#define PMIX_ERR_INVALID_CRED -12 // internal-only -#define PMIX_ERR_HANDSHAKE_FAILED -13 // internal-only -#define PMIX_ERR_READY_FOR_HANDSHAKE -14 // internal-only +#define PMIX_ERR_INVALID_CRED -12 +#define PMIX_ERR_HANDSHAKE_FAILED -13 +#define PMIX_ERR_READY_FOR_HANDSHAKE -14 #define PMIX_ERR_WOULD_BLOCK -15 -#define PMIX_ERR_UNKNOWN_DATA_TYPE -16 // internal-only -#define PMIX_ERR_PROC_ENTRY_NOT_FOUND -17 // internal-only -#define PMIX_ERR_TYPE_MISMATCH -18 // internal-only -#define PMIX_ERR_UNPACK_INADEQUATE_SPACE -19 // internal-only -#define PMIX_ERR_UNPACK_FAILURE -20 // internal-only -#define PMIX_ERR_PACK_FAILURE -21 // internal-only -#define PMIX_ERR_PACK_MISMATCH -22 // internal-only +#define PMIX_ERR_UNKNOWN_DATA_TYPE -16 +#define PMIX_ERR_PROC_ENTRY_NOT_FOUND -17 +#define PMIX_ERR_TYPE_MISMATCH -18 +#define PMIX_ERR_UNPACK_INADEQUATE_SPACE -19 +#define PMIX_ERR_UNPACK_FAILURE -20 +#define PMIX_ERR_PACK_FAILURE -21 +#define PMIX_ERR_PACK_MISMATCH -22 #define PMIX_ERR_NO_PERMISSIONS -23 #define PMIX_ERR_TIMEOUT -24 #define PMIX_ERR_UNREACH -25 -#define PMIX_ERR_IN_ERRNO -26 // internal-only +#define PMIX_ERR_IN_ERRNO -26 #define PMIX_ERR_BAD_PARAM -27 -#define PMIX_ERR_RESOURCE_BUSY -28 // internal-only +#define PMIX_ERR_RESOURCE_BUSY -28 #define PMIX_ERR_OUT_OF_RESOURCE -29 #define PMIX_ERR_DATA_VALUE_NOT_FOUND -30 #define PMIX_ERR_INIT -31 -#define PMIX_ERR_NOMEM -32 // internal-only -#define PMIX_ERR_INVALID_ARG -33 // internal-only -#define PMIX_ERR_INVALID_KEY -34 // internal-only -#define PMIX_ERR_INVALID_KEY_LENGTH -35 // internal-only -#define PMIX_ERR_INVALID_VAL -36 // internal-only -#define PMIX_ERR_INVALID_VAL_LENGTH -37 // internal-only -#define PMIX_ERR_INVALID_LENGTH -38 // internal-only -#define PMIX_ERR_INVALID_NUM_ARGS -39 // internal-only -#define PMIX_ERR_INVALID_ARGS -40 // internal-only -#define PMIX_ERR_INVALID_NUM_PARSED -41 // internal-only -#define PMIX_ERR_INVALID_KEYVALP -42 // internal-only +#define PMIX_ERR_NOMEM -32 +#define PMIX_ERR_INVALID_ARG -33 +#define PMIX_ERR_INVALID_KEY -34 +#define PMIX_ERR_INVALID_KEY_LENGTH -35 +#define PMIX_ERR_INVALID_VAL -36 +#define PMIX_ERR_INVALID_VAL_LENGTH -37 +#define PMIX_ERR_INVALID_LENGTH -38 +#define PMIX_ERR_INVALID_NUM_ARGS -39 +#define PMIX_ERR_INVALID_ARGS -40 +#define PMIX_ERR_INVALID_NUM_PARSED -41 +#define PMIX_ERR_INVALID_KEYVALP -42 #define PMIX_ERR_INVALID_SIZE -43 #define PMIX_ERR_INVALID_NAMESPACE -44 -#define PMIX_ERR_SERVER_NOT_AVAIL -45 // internal-only +#define PMIX_ERR_SERVER_NOT_AVAIL -45 #define PMIX_ERR_NOT_FOUND -46 #define PMIX_ERR_NOT_SUPPORTED -47 #define PMIX_ERR_NOT_IMPLEMENTED -48 #define PMIX_ERR_COMM_FAILURE -49 -#define PMIX_ERR_UNPACK_READ_PAST_END_OF_BUFFER -50 // internal-only +#define PMIX_ERR_UNPACK_READ_PAST_END_OF_BUFFER -50 #define PMIX_ERR_CONFLICTING_CLEANUP_DIRECTIVES -51 /* define a starting point for v2.x error values */ @@ -1018,10 +1018,12 @@ static inline void* pmix_calloc(size_t n, size_t m) }while(0) /* define a convenience macro for loading nspaces */ -#define PMIX_LOAD_NSPACE(a, b) \ - do { \ - memset((a), 0, PMIX_MAX_NSLEN+1); \ - pmix_strncpy((a), (b), PMIX_MAX_NSLEN); \ +#define PMIX_LOAD_NSPACE(a, b) \ + do { \ + memset((a), 0, PMIX_MAX_NSLEN+1); \ + if (NULL != (b)) { \ + pmix_strncpy((char*)(a), (b), PMIX_MAX_NSLEN); \ + } \ }while(0) /* define a convenience macro for checking nspaces */ @@ -1239,7 +1241,7 @@ typedef struct pmix_proc { #define PMIX_PROC_LOAD(m, n, r) \ do { \ PMIX_PROC_CONSTRUCT((m)); \ - pmix_strncpy((m)->nspace, (n), PMIX_MAX_NSLEN); \ + pmix_strncpy((char*)(m)->nspace, (n), PMIX_MAX_NSLEN); \ (m)->rank = (r); \ } while(0) @@ -1249,9 +1251,9 @@ typedef struct pmix_proc { memset((t), 0, PMIX_MAX_NSLEN+1); \ _len = strlen((c)); \ if ((_len + strlen((n))) < PMIX_MAX_NSLEN) { \ - pmix_strncpy((t), (c), PMIX_MAX_NSLEN); \ + pmix_strncpy((char*)(t), (c), PMIX_MAX_NSLEN); \ (t)[_len] = ':'; \ - pmix_strncpy(&(t)[_len+1], (n), PMIX_MAX_NSLEN - _len); \ + pmix_strncpy((char*)&(t)[_len+1], (n), PMIX_MAX_NSLEN - _len); \ } \ } while(0) @@ -1539,7 +1541,7 @@ typedef struct pmix_info { #define PMIX_INFO_LOAD(m, k, v, t) \ do { \ if (NULL != (k)) { \ - pmix_strncpy((m)->key, (k), PMIX_MAX_KEYLEN); \ + pmix_strncpy((char*)(m)->key, (k), PMIX_MAX_KEYLEN); \ } \ (m)->flags = 0; \ pmix_value_load(&((m)->value), (v), (t)); \ @@ -1547,7 +1549,7 @@ typedef struct pmix_info { #define PMIX_INFO_XFER(d, s) \ do { \ if (NULL != (s)->key) { \ - pmix_strncpy((d)->key, (s)->key, PMIX_MAX_KEYLEN); \ + pmix_strncpy((char*)(d)->key, (s)->key, PMIX_MAX_KEYLEN); \ } \ (d)->flags = (s)->flags; \ pmix_value_xfer(&(d)->value, (pmix_value_t*)&(s)->value); \ @@ -1628,9 +1630,9 @@ typedef struct pmix_pdata { do { \ if (NULL != (m)) { \ memset((m), 0, sizeof(pmix_pdata_t)); \ - pmix_strncpy((m)->proc.nspace, (p)->nspace, PMIX_MAX_NSLEN); \ + pmix_strncpy((char*)(m)->proc.nspace, (p)->nspace, PMIX_MAX_NSLEN); \ (m)->proc.rank = (p)->rank; \ - pmix_strncpy((m)->key, (k), PMIX_MAX_KEYLEN); \ + pmix_strncpy((char*)(m)->key, (k), PMIX_MAX_KEYLEN); \ pmix_value_load(&((m)->value), (v), (t)); \ } \ } while (0) @@ -1639,9 +1641,9 @@ typedef struct pmix_pdata { do { \ if (NULL != (d)) { \ memset((d), 0, sizeof(pmix_pdata_t)); \ - pmix_strncpy((d)->proc.nspace, (s)->proc.nspace, PMIX_MAX_NSLEN); \ + pmix_strncpy((char*)(d)->proc.nspace, (s)->proc.nspace, PMIX_MAX_NSLEN); \ (d)->proc.rank = (s)->proc.rank; \ - pmix_strncpy((d)->key, (s)->key, PMIX_MAX_KEYLEN); \ + pmix_strncpy((char*)(d)->key, (s)->key, PMIX_MAX_KEYLEN); \ pmix_value_xfer(&((d)->value), &((s)->value)); \ } \ } while (0) @@ -2657,10 +2659,12 @@ static inline void pmix_darray_destruct(pmix_data_array_t *m) (m)->array = NULL; \ } \ } while(0) -#define PMIX_DATA_ARRAY_CREATE(m, n, t) \ - do { \ - (m) = (pmix_data_array_t*)pmix_calloc(1, sizeof(pmix_data_array_t)); \ - PMIX_DATA_ARRAY_CONSTRUCT((m), (n), (t)); \ +#define PMIX_DATA_ARRAY_CREATE(m, n, t) \ + do { \ + (m) = (pmix_data_array_t*)pmix_calloc(1, sizeof(pmix_data_array_t)); \ + if (NULL != (m)) { \ + PMIX_DATA_ARRAY_CONSTRUCT((m), (n), (t)); \ + } \ } while(0) #define PMIX_DATA_ARRAY_DESTRUCT(m) pmix_darray_destruct(m) diff --git a/opal/mca/pmix/pmix3x/pmix/include/pmix_server.h b/opal/mca/pmix/pmix3x/pmix/include/pmix_server.h index 4d3f36bbd43..9a5ac16ff2e 100644 --- a/opal/mca/pmix/pmix3x/pmix/include/pmix_server.h +++ b/opal/mca/pmix/pmix3x/pmix/include/pmix_server.h @@ -449,7 +449,10 @@ typedef pmix_status_t (*pmix_server_validate_cred_fn_t)(const pmix_proc_t *proc, * * This call serves as a registration with the host RM for the given IO channels from * the specified procs - the host RM is expected to ensure that this local PMIx server - * is on the distribution list for the channel/proc combination + * is on the distribution list for the channel/proc combination. IF the PMIX_IOF_STOP + * is included in the directives, then the local PMIx server is requesting that the + * host RM remove the server from the distribution list for the specified channel/proc + * combination. */ typedef pmix_status_t (*pmix_server_iof_fn_t)(const pmix_proc_t procs[], size_t nprocs, const pmix_info_t directives[], size_t ndirs, diff --git a/opal/mca/pmix/pmix3x/pmix/src/class/pmix_hotel.c b/opal/mca/pmix/pmix3x/pmix/src/class/pmix_hotel.c index fd114a77aa3..69126edba37 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/class/pmix_hotel.c +++ b/opal/mca/pmix/pmix3x/pmix/src/class/pmix_hotel.c @@ -3,6 +3,7 @@ * Copyright (c) 2012-2016 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2012 Los Alamos National Security, LLC. All rights reserved * Copyright (c) 2015-2018 Intel, Inc. All rights reserved. + * Copyright (c) 2020 IBM Corporation. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -59,8 +60,8 @@ pmix_status_t pmix_hotel_init(pmix_hotel_t *h, int num_rooms, h->num_rooms = num_rooms; h->evbase = evbase; - h->eviction_timeout.tv_usec = eviction_timeout % 1000000; - h->eviction_timeout.tv_sec = eviction_timeout / 1000000; + h->eviction_timeout.tv_usec = 0; + h->eviction_timeout.tv_sec = eviction_timeout; h->evict_callback_fn = evict_callback_fn; h->rooms = (pmix_hotel_room_t*)malloc(num_rooms * sizeof(pmix_hotel_room_t)); if (NULL != evict_callback_fn) { diff --git a/opal/mca/pmix/pmix3x/pmix/src/class/pmix_hotel.h b/opal/mca/pmix/pmix3x/pmix/src/class/pmix_hotel.h index 883a2c5c6ce..f1e331dbb8e 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/class/pmix_hotel.h +++ b/opal/mca/pmix/pmix3x/pmix/src/class/pmix_hotel.h @@ -3,6 +3,7 @@ * Copyright (c) 2012-2016 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2012 Los Alamos National Security, LLC. All rights reserved * Copyright (c) 2015-2019 Intel, Inc. All rights reserved. + * Copyright (c) 2020 IBM Corporation. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -141,7 +142,7 @@ PMIX_CLASS_DECLARATION(pmix_hotel_t); * @param num_rooms The total number of rooms in the hotel (IN) * @param evbase Pointer to event base used for eviction timeout * @param eviction_timeout Max length of a stay at the hotel before - * the eviction callback is invoked (in microseconds) + * the eviction callback is invoked (in seconds) * @param evict_callback_fn Callback function invoked if an occupant * does not check out before the eviction_timeout. * diff --git a/opal/mca/pmix/pmix3x/pmix/src/client/pmix_client.c b/opal/mca/pmix/pmix3x/pmix/src/client/pmix_client.c index 8d522e7a4fa..23110da10c4 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/client/pmix_client.c +++ b/opal/mca/pmix/pmix3x/pmix/src/client/pmix_client.c @@ -223,7 +223,11 @@ static void job_data(struct pmix_peer_t *pr, /* unpack the nspace - should be same as our own */ PMIX_BFROPS_UNPACK(rc, pmix_client_globals.myserver, buf, &nspace, &cnt, PMIX_STRING); - if (PMIX_SUCCESS != rc) { + if (PMIX_SUCCESS != rc || + !PMIX_CHECK_NSPACE(nspace, pmix_globals.myid.nspace)) { + if (PMIX_SUCCESS == rc) { + rc = PMIX_ERR_INVALID_VAL; + } PMIX_ERROR_LOG(rc); cb->status = PMIX_ERROR; PMIX_POST_OBJECT(cb); @@ -235,6 +239,7 @@ static void job_data(struct pmix_peer_t *pr, PMIX_GDS_STORE_JOB_INFO(cb->status, pmix_client_globals.myserver, nspace, buf); + free(nspace); cb->status = PMIX_SUCCESS; PMIX_POST_OBJECT(cb); @@ -386,14 +391,18 @@ static void client_iof_handler(struct pmix_peer_t *pr, pmix_byte_object_t bo; int32_t cnt; pmix_status_t rc; + size_t refid, ninfo=0; + pmix_iof_req_t *req; + pmix_info_t *info=NULL; pmix_output_verbose(2, pmix_client_globals.iof_output, - "recvd IOF"); + "recvd IOF with %d bytes", (int)buf->bytes_used); - /* if the buffer is empty, they are simply closing the channel */ + /* if the buffer is empty, they are simply closing the socket */ if (0 == buf->bytes_used) { return; } + PMIX_BYTE_OBJECT_CONSTRUCT(&bo); cnt = 1; PMIX_BFROPS_UNPACK(rc, peer, buf, &source, &cnt, PMIX_PROC); @@ -408,13 +417,52 @@ static void client_iof_handler(struct pmix_peer_t *pr, return; } cnt = 1; - PMIX_BFROPS_UNPACK(rc, peer, buf, &bo, &cnt, PMIX_BYTE_OBJECT); + PMIX_BFROPS_UNPACK(rc, peer, buf, &refid, &cnt, PMIX_SIZE); if (PMIX_SUCCESS != rc) { PMIX_ERROR_LOG(rc); return; } - if (NULL != bo.bytes && 0 < bo.size) { - pmix_iof_write_output(&source, channel, &bo, NULL); + cnt = 1; + PMIX_BFROPS_UNPACK(rc, peer, buf, &ninfo, &cnt, PMIX_SIZE); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + return; + } + if (0 < ninfo) { + PMIX_INFO_CREATE(info, ninfo); + cnt = ninfo; + PMIX_BFROPS_UNPACK(rc, peer, buf, info, &cnt, PMIX_INFO); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + goto cleanup; + } + } + cnt = 1; + PMIX_BFROPS_UNPACK(rc, peer, buf, &bo, &cnt, PMIX_BYTE_OBJECT); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + goto cleanup; + } + /* lookup the handler for this IOF package */ + if (NULL == (req = (pmix_iof_req_t*)pmix_pointer_array_get_item(&pmix_globals.iof_requests, refid))) { + /* something wrong here - should not happen */ + PMIX_ERROR_LOG(PMIX_ERR_NOT_FOUND); + goto cleanup; + } + /* if the handler invokes a callback function, do so */ + if (NULL != req->cbfunc) { + req->cbfunc(refid, channel, &source, &bo, info, ninfo); + } else { + /* otherwise, simply write it out to the specified std IO channel */ + if (NULL != bo.bytes && 0 < bo.size) { + pmix_iof_write_output(&source, channel, &bo, NULL); + } + } + + cleanup: + /* cleanup the memory */ + if (0 < ninfo) { + PMIX_INFO_FREE(info, ninfo); } PMIX_BYTE_OBJECT_DESTRUCT(&bo); } @@ -435,11 +483,12 @@ PMIX_EXPORT pmix_status_t PMIx_Init(pmix_proc_t *proc, size_t n; bool found; pmix_ptl_posted_recv_t *rcv; + pid_t pid; PMIX_ACQUIRE_THREAD(&pmix_global_lock); if (0 < pmix_globals.init_cntr || - (NULL != pmix_globals.mypeer && PMIX_PROC_IS_SERVER(pmix_globals.mypeer))) { + (NULL != pmix_globals.mypeer && PMIX_PEER_IS_SERVER(pmix_globals.mypeer))) { /* since we have been called before, the nspace and * rank should be known. So return them here if * requested */ @@ -459,13 +508,6 @@ PMIX_EXPORT pmix_status_t PMIx_Init(pmix_proc_t *proc, } ++pmix_globals.init_cntr; - /* if we don't see the required info, then we cannot init */ - if (NULL == (evar = getenv("PMIX_NAMESPACE"))) { - pmix_init_result = PMIX_ERR_INVALID_NAMESPACE; - PMIX_RELEASE_THREAD(&pmix_global_lock); - return PMIX_ERR_INVALID_NAMESPACE; - } - /* setup the runtime - this init's the globals, * opens and initializes the required frameworks */ if (PMIX_SUCCESS != (rc = pmix_rte_init(PMIX_PROC_CLIENT, info, ninfo, @@ -519,24 +561,39 @@ PMIX_EXPORT pmix_status_t PMIx_Init(pmix_proc_t *proc, pmix_output_verbose(2, pmix_client_globals.base_output, "pmix: init called"); - /* we require our nspace */ - if (NULL != proc) { - pmix_strncpy(proc->nspace, evar, PMIX_MAX_NSLEN); - } - PMIX_LOAD_NSPACE(pmix_globals.myid.nspace, evar); - /* set the global pmix_namespace_t object for our peer */ - pmix_globals.mypeer->nptr->nspace = strdup(evar); - - /* we also require our rank */ - if (NULL == (evar = getenv("PMIX_RANK"))) { - /* let the caller know that the server isn't available yet */ - pmix_init_result = PMIX_ERR_DATA_VALUE_NOT_FOUND; - PMIX_RELEASE_THREAD(&pmix_global_lock); - return PMIX_ERR_DATA_VALUE_NOT_FOUND; - } - pmix_globals.myid.rank = strtol(evar, NULL, 10); - if (NULL != proc) { - proc->rank = pmix_globals.myid.rank; + /* see if the required info is present */ + if (NULL == (evar = getenv("PMIX_NAMESPACE"))) { + /* if we didn't see a PMIx server (e.g., missing envar), + * then allow us to run as a singleton */ + pid = getpid(); + snprintf(pmix_globals.myid.nspace, PMIX_MAX_NSLEN, "singleton.%lu", (unsigned long)pid); + pmix_globals.myid.rank = 0; + /* mark that we shouldn't connect to a server */ + pmix_client_globals.singleton = true; + if (NULL != proc) { + PMIX_LOAD_PROCID(proc, pmix_globals.myid.nspace, pmix_globals.myid.rank); + } + pmix_globals.mypeer->nptr->nspace = strdup(pmix_globals.myid.nspace); + } else { + if (NULL != proc) { + pmix_strncpy(proc->nspace, evar, PMIX_MAX_NSLEN); + } + PMIX_LOAD_NSPACE(pmix_globals.myid.nspace, evar); + /* set the global pmix_namespace_t object for our peer */ + pmix_globals.mypeer->nptr->nspace = strdup(evar); + + /* we also require our rank */ + if (NULL == (evar = getenv("PMIX_RANK"))) { + /* let the caller know that the server isn't available yet */ + pmix_init_result = PMIX_ERR_DATA_VALUE_NOT_FOUND; + PMIX_RELEASE_THREAD(&pmix_global_lock); + return PMIX_ERR_DATA_VALUE_NOT_FOUND; + } else { + pmix_globals.myid.rank = strtol(evar, NULL, 10); + } + if (NULL != proc) { + proc->rank = pmix_globals.myid.rank; + } } pmix_globals.pindex = -1; /* setup a rank_info object for us */ @@ -619,42 +676,55 @@ PMIX_EXPORT pmix_status_t PMIx_Init(pmix_proc_t *proc, } PMIX_INFO_DESTRUCT(&ginfo); - /* connect to the server */ - rc = pmix_ptl_base_connect_to_peer((struct pmix_peer_t*)pmix_client_globals.myserver, info, ninfo); - if (PMIX_SUCCESS != rc) { - pmix_init_result = rc; - PMIX_RELEASE_THREAD(&pmix_global_lock); - return rc; - } - /* mark that we are using the same module as used for the server */ - pmix_globals.mypeer->nptr->compat.ptl = pmix_client_globals.myserver->nptr->compat.ptl; - - /* send a request for our job info - we do this as a non-blocking - * transaction because some systems cannot handle very large - * blocking operations and error out if we try them. */ - req = PMIX_NEW(pmix_buffer_t); - PMIX_BFROPS_PACK(rc, pmix_client_globals.myserver, - req, &cmd, 1, PMIX_COMMAND); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_RELEASE(req); - pmix_init_result = rc; - PMIX_RELEASE_THREAD(&pmix_global_lock); - return rc; - } - /* send to the server */ - PMIX_CONSTRUCT(&cb, pmix_cb_t); - PMIX_PTL_SEND_RECV(rc, pmix_client_globals.myserver, - req, job_data, (void*)&cb); - if (PMIX_SUCCESS != rc) { - pmix_init_result = rc; - PMIX_RELEASE_THREAD(&pmix_global_lock); - return rc; + if (pmix_client_globals.singleton) { + pmix_globals.mypeer->nptr->compat.ptl = pmix_ptl_base_assign_module(); + pmix_globals.mypeer->nptr->compat.bfrops = pmix_bfrops_base_assign_module(NULL); + pmix_client_globals.myserver->nptr->compat.bfrops = pmix_bfrops_base_assign_module(NULL); + /* initialize our data values */ + rc = pmix_tool_init_info(); + if (PMIX_SUCCESS != rc) { + pmix_init_result = rc; + PMIX_RELEASE_THREAD(&pmix_global_lock); + return rc; + } + } else { + /* connect to the server */ + rc = pmix_ptl_base_connect_to_peer((struct pmix_peer_t*)pmix_client_globals.myserver, info, ninfo); + if (PMIX_SUCCESS != rc) { + pmix_init_result = rc; + PMIX_RELEASE_THREAD(&pmix_global_lock); + return rc; + } + /* mark that we are using the same module as used for the server */ + pmix_globals.mypeer->nptr->compat.ptl = pmix_client_globals.myserver->nptr->compat.ptl; + + /* send a request for our job info - we do this as a non-blocking + * transaction because some systems cannot handle very large + * blocking operations and error out if we try them. */ + req = PMIX_NEW(pmix_buffer_t); + PMIX_BFROPS_PACK(rc, pmix_client_globals.myserver, + req, &cmd, 1, PMIX_COMMAND); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(req); + pmix_init_result = rc; + PMIX_RELEASE_THREAD(&pmix_global_lock); + return rc; + } + /* send to the server */ + PMIX_CONSTRUCT(&cb, pmix_cb_t); + PMIX_PTL_SEND_RECV(rc, pmix_client_globals.myserver, + req, job_data, (void*)&cb); + if (PMIX_SUCCESS != rc) { + pmix_init_result = rc; + PMIX_RELEASE_THREAD(&pmix_global_lock); + return rc; + } + /* wait for the data to return */ + PMIX_WAIT_THREAD(&cb.lock); + rc = cb.status; + PMIX_DESTRUCT(&cb); } - /* wait for the data to return */ - PMIX_WAIT_THREAD(&cb.lock); - rc = cb.status; - PMIX_DESTRUCT(&cb); if (PMIX_SUCCESS == rc) { pmix_init_result = PMIX_SUCCESS; @@ -1201,8 +1271,14 @@ static void _commitfn(int sd, short args, void *cbdata) return PMIX_ERR_INIT; } + /* if we are a singleton, there is nothing to do */ + if (pmix_client_globals.singleton) { + PMIX_RELEASE_THREAD(&pmix_global_lock); + return PMIX_SUCCESS; + } + /* if we are a server, or we aren't connected, don't attempt to send */ - if (PMIX_PROC_IS_SERVER(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_SERVER(pmix_globals.mypeer)) { PMIX_RELEASE_THREAD(&pmix_global_lock); return PMIX_SUCCESS; // not an error } @@ -1225,25 +1301,23 @@ static void _commitfn(int sd, short args, void *cbdata) return rc; } -static void _resolve_peers(int sd, short args, void *cbdata) -{ - pmix_cb_t *cb = (pmix_cb_t*)cbdata; - - cb->status = pmix_preg.resolve_peers(cb->key, cb->pname.nspace, - &cb->procs, &cb->nprocs); - /* post the data so the receiving thread can acquire it */ - PMIX_POST_OBJECT(cb); - PMIX_WAKEUP_THREAD(&cb->lock); -} - /* need to thread-shift this request */ PMIX_EXPORT pmix_status_t PMIx_Resolve_peers(const char *nodename, const pmix_nspace_t nspace, pmix_proc_t **procs, size_t *nprocs) { - pmix_cb_t *cb; + pmix_info_t info[2], *iptr; pmix_status_t rc; pmix_proc_t proc; + pmix_value_t *val; + char **p, **tmp=NULL, *prs; + pmix_proc_t *pa; + size_t m, n, np, ninfo; + pmix_namespace_t *ns; + + /* set default response */ + *procs = NULL; + *nprocs = 0; PMIX_ACQUIRE_THREAD(&pmix_global_lock); if (pmix_globals.init_cntr <= 0) { @@ -1252,70 +1326,153 @@ PMIX_EXPORT pmix_status_t PMIx_Resolve_peers(const char *nodename, } PMIX_RELEASE_THREAD(&pmix_global_lock); + /* if I am a client and my server is earlier than v3.1.5, then + * I need to look for this data under rank=PMIX_RANK_WILDCARD + * with a key equal to the nodename */ + if (PMIX_PEER_IS_CLIENT(pmix_globals.mypeer) && + PMIX_PEER_IS_EARLIER(pmix_client_globals.myserver, 3, 1, 5)) { + proc.rank = PMIX_RANK_WILDCARD; + iptr = NULL; + ninfo = 0; + } else { + proc.rank = PMIX_RANK_UNDEF; + PMIX_INFO_LOAD(&info[0], PMIX_NODE_INFO, NULL, PMIX_BOOL); + PMIX_INFO_LOAD(&info[1], PMIX_HOSTNAME, nodename, PMIX_STRING); + iptr = info; + ninfo = 2; + } + + if (NULL == nspace || 0 == strlen(nspace)) { + rc = PMIX_ERR_NOT_FOUND; + np = 0; + /* cycle across all known nspaces and aggregate the results */ + PMIX_LIST_FOREACH(ns, &pmix_globals.nspaces, pmix_namespace_t) { + PMIX_LOAD_NSPACE(proc.nspace, ns->nspace); + rc = PMIx_Get(&proc, PMIX_LOCAL_PEERS, iptr, ninfo, &val); + if (PMIX_SUCCESS != rc) { + continue; + } - cb = PMIX_NEW(pmix_cb_t); - cb->key = (char*)nodename; - cb->pname.nspace = strdup(nspace); + /* sanity check */ + if (NULL == val) { + rc = PMIX_ERR_NOT_FOUND; + continue; + } + if (PMIX_STRING != val->type) { + rc = PMIX_ERR_INVALID_VAL; + PMIX_VALUE_FREE(val, 1); + continue; + } + if (NULL == val->data.string) { + /* no local peers on this node */ + PMIX_VALUE_FREE(val, 1); + continue; + } + /* prepend the nspace */ + if (0 > asprintf(&prs, "%s:%s", ns->nspace, val->data.string)) { + PMIX_VALUE_FREE(val, 1); + continue; + } + /* add to our list of results */ + pmix_argv_append_nosize(&tmp, prs); + /* split to count the npeers */ + p = pmix_argv_split(val->data.string, ','); + np += pmix_argv_count(p); + /* done with this entry */ + pmix_argv_free(p); + free(prs); + PMIX_VALUE_FREE(val, 1); + } + if (0 < np) { + /* allocate the proc array */ + PMIX_PROC_CREATE(pa, np); + if (NULL == pa) { + rc = PMIX_ERR_NOMEM; + pmix_argv_free(tmp); + goto done; + } + *procs = pa; + *nprocs = np; + /* transfer the results */ + np = 0; + for (n=0; NULL != tmp[n]; n++) { + /* find the nspace delimiter */ + prs = strchr(tmp[n], ':'); + *prs = '\0'; + ++prs; + p = pmix_argv_split(prs, ','); + for (m=0; NULL != p[m]; m++) { + PMIX_LOAD_NSPACE(&pa[np].nspace, tmp[n]); + pa[n].rank = strtoul(p[m], NULL, 10); + } + pmix_argv_free(p); + } + pmix_argv_free(tmp); + rc = PMIX_SUCCESS; + } + goto done; + } - PMIX_THREADSHIFT(cb, _resolve_peers); + /* get the list of local peers for this nspace and node */ + PMIX_LOAD_NSPACE(proc.nspace, nspace); - /* wait for the result */ - PMIX_WAIT_THREAD(&cb->lock); + rc = PMIx_Get(&proc, PMIX_LOCAL_PEERS, iptr, ninfo, &val); + if (PMIX_SUCCESS != rc) { + goto done; + } - /* if the nspace wasn't found, then we need to - * ask the server for that info */ - if (PMIX_ERR_INVALID_NAMESPACE == cb->status) { - pmix_strncpy(proc.nspace, nspace, PMIX_MAX_NSLEN); - proc.rank = PMIX_RANK_WILDCARD; - /* any key will suffice as it will bring down - * the entire data blob */ - rc = PMIx_Get(&proc, PMIX_UNIV_SIZE, NULL, 0, NULL); - if (PMIX_SUCCESS != rc) { - PMIX_RELEASE(cb); - return rc; - } - /* retry the fetch */ - cb->lock.active = true; - PMIX_THREADSHIFT(cb, _resolve_peers); - PMIX_WAIT_THREAD(&cb->lock); + /* sanity check */ + if (NULL == val) { + rc = PMIX_ERR_NOT_FOUND; + goto done; + } + if (PMIX_STRING != val->type || + NULL == val->data.string) { + rc = PMIX_ERR_INVALID_VAL; + PMIX_VALUE_FREE(val, 1); + goto done; } - *procs = cb->procs; - *nprocs = cb->nprocs; - rc = cb->status; - PMIX_RELEASE(cb); - return rc; -} + /* split the procs to get a list */ + p = pmix_argv_split(val->data.string, ','); + np = pmix_argv_count(p); + PMIX_VALUE_FREE(val, 1); -static void _resolve_nodes(int fd, short args, void *cbdata) -{ - pmix_cb_t *cb = (pmix_cb_t*)cbdata; - char *regex, **names; - - /* get a regular expression describing the PMIX_NODE_MAP */ - cb->status = pmix_preg.resolve_nodes(cb->pname.nspace, ®ex); - if (PMIX_SUCCESS == cb->status) { - /* parse it into an argv array of names */ - cb->status = pmix_preg.parse_nodes(regex, &names); - if (PMIX_SUCCESS == cb->status) { - /* assemble it into a comma-delimited list */ - cb->key = pmix_argv_join(names, ','); - pmix_argv_free(names); - } else { - free(regex); - } + /* allocate the proc array */ + PMIX_PROC_CREATE(pa, np); + if (NULL == pa) { + rc = PMIX_ERR_NOMEM; + pmix_argv_free(p); + goto done; } - /* post the data so the receiving thread can acquire it */ - PMIX_POST_OBJECT(cb); - PMIX_WAKEUP_THREAD(&cb->lock); + /* transfer the results */ + for (n=0; n < np; n++) { + PMIX_LOAD_NSPACE(&pa[n].nspace, nspace); + pa[n].rank = strtoul(p[n], NULL, 10); + } + pmix_argv_free(p); + *procs = pa; + *nprocs = np; + + done: + if (NULL != iptr) { + PMIX_INFO_DESTRUCT(&info[0]); + PMIX_INFO_DESTRUCT(&info[1]); + } + return rc; } -/* need to thread-shift this request */ PMIX_EXPORT pmix_status_t PMIx_Resolve_nodes(const pmix_nspace_t nspace, char **nodelist) { - pmix_cb_t *cb; pmix_status_t rc; pmix_proc_t proc; + pmix_value_t *val; + char **tmp = NULL, **p; + size_t n; + pmix_namespace_t *ns; + + /* set default response */ + *nodelist = NULL; PMIX_ACQUIRE_THREAD(&pmix_global_lock); if (pmix_globals.init_cntr <= 0) { @@ -1324,35 +1481,69 @@ PMIX_EXPORT pmix_status_t PMIx_Resolve_nodes(const pmix_nspace_t nspace, char ** } PMIX_RELEASE_THREAD(&pmix_global_lock); - cb = PMIX_NEW(pmix_cb_t); - cb->pname.nspace = strdup(nspace); - - PMIX_THREADSHIFT(cb, _resolve_nodes); + /* get the list of nodes for this nspace */ + proc.rank = PMIX_RANK_WILDCARD; - /* wait for the result */ - PMIX_WAIT_THREAD(&cb->lock); + if (NULL == nspace || 0 == strlen(nspace)) { + rc = PMIX_ERR_NOT_FOUND; + /* cycle across all known nspaces and aggregate the results */ + PMIX_LIST_FOREACH(ns, &pmix_globals.nspaces, pmix_namespace_t) { + PMIX_LOAD_NSPACE(proc.nspace, ns->nspace); + rc = PMIx_Get(&proc, PMIX_NODE_LIST, NULL, 0, &val); + if (PMIX_SUCCESS != rc) { + continue; + } - /* if the nspace wasn't found, then we need to - * ask the server for that info */ - if (PMIX_ERR_INVALID_NAMESPACE == cb->status) { - pmix_strncpy(proc.nspace, nspace, PMIX_MAX_NSLEN); - proc.rank = PMIX_RANK_WILDCARD; - /* any key will suffice as it will bring down - * the entire data blob */ - rc = PMIx_Get(&proc, PMIX_UNIV_SIZE, NULL, 0, NULL); - if (PMIX_SUCCESS != rc) { - PMIX_RELEASE(cb); - return rc; + /* sanity check */ + if (NULL == val) { + rc = PMIX_ERR_NOT_FOUND; + continue; + } + if (PMIX_STRING != val->type) { + rc = PMIX_ERR_INVALID_VAL; + PMIX_VALUE_FREE(val, 1); + continue; + } + if (NULL == val->data.string) { + /* no nodes found */ + PMIX_VALUE_FREE(val, 1); + continue; + } + /* add to our list of results, ensuring uniqueness */ + p = pmix_argv_split(val->data.string, ','); + for (n=0; NULL != p[n]; n++) { + pmix_argv_append_unique_nosize(&tmp, p[n], true); + } + pmix_argv_free(p); + PMIX_VALUE_FREE(val, 1); + } + if (0 < pmix_argv_count(tmp)) { + *nodelist = pmix_argv_join(tmp, ','); + pmix_argv_free(tmp); + rc = PMIX_SUCCESS; } - /* retry the fetch */ - cb->lock.active = true; - PMIX_THREADSHIFT(cb, _resolve_nodes); - PMIX_WAIT_THREAD(&cb->lock); + return rc; } - /* the string we want is in the key field */ - *nodelist = cb->key; - rc = cb->status; - PMIX_RELEASE(cb); - return rc; + PMIX_LOAD_NSPACE(proc.nspace, nspace); + rc = PMIx_Get(&proc, PMIX_NODE_LIST, NULL, 0, &val); + if (PMIX_SUCCESS != rc) { + return rc; + } + + /* sanity check */ + if (NULL == val) { + return PMIX_ERR_NOT_FOUND; + } + if (PMIX_STRING != val->type || + NULL == val->data.string) { + PMIX_VALUE_FREE(val, 1); + return PMIX_ERR_INVALID_VAL; + } + + /* pass back the result */ + *nodelist = strdup(val->data.string); + PMIX_VALUE_FREE(val, 1); + + return PMIX_SUCCESS; } diff --git a/opal/mca/pmix/pmix3x/pmix/src/client/pmix_client_fence.c b/opal/mca/pmix/pmix3x/pmix/src/client/pmix_client_fence.c index adac9bbf4d3..76c8a936c6e 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/client/pmix_client_fence.c +++ b/opal/mca/pmix/pmix3x/pmix/src/client/pmix_client_fence.c @@ -81,6 +81,12 @@ PMIX_EXPORT pmix_status_t PMIx_Fence(const pmix_proc_t procs[], size_t nprocs, return PMIX_ERR_INIT; } + /* if we are a singleton, there is nothing to do */ + if (pmix_client_globals.singleton) { + PMIX_RELEASE_THREAD(&pmix_global_lock); + return PMIX_SUCCESS; + } + /* if we aren't connected, don't attempt to send */ if (!pmix_globals.connected) { PMIX_RELEASE_THREAD(&pmix_global_lock); diff --git a/opal/mca/pmix/pmix3x/pmix/src/client/pmix_client_get.c b/opal/mca/pmix/pmix3x/pmix/src/client/pmix_client_get.c index b620c05cf29..a2a92ae87d9 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/client/pmix_client_get.c +++ b/opal/mca/pmix/pmix3x/pmix/src/client/pmix_client_get.c @@ -52,6 +52,7 @@ #include "src/class/pmix_list.h" #include "src/mca/bfrops/bfrops.h" +#include "src/mca/ptl/base/base.h" #include "src/threads/threads.h" #include "src/util/argv.h" #include "src/util/compress.h" @@ -60,7 +61,7 @@ #include "src/util/name_fns.h" #include "src/util/output.h" #include "src/mca/gds/gds.h" -#include "src/mca/ptl/ptl.h" +#include "src/mca/ptl/base/base.h" #include "pmix_client_ops.h" @@ -90,6 +91,9 @@ PMIX_EXPORT pmix_status_t PMIx_Get(const pmix_proc_t *proc, { pmix_cb_t *cb; pmix_status_t rc; + size_t n, nfo; + pmix_proc_t p; + pmix_info_t nodeinfo, *iptr; PMIX_ACQUIRE_THREAD(&pmix_global_lock); @@ -104,18 +108,62 @@ PMIX_EXPORT pmix_status_t PMIx_Get(const pmix_proc_t *proc, (NULL == proc) ? "NULL" : PMIX_NAME_PRINT(proc), (NULL == key) ? "NULL" : key); - /* try to get data directly, without threadshift */ - if (PMIX_RANK_UNDEF != proc->rank && NULL != key) { - if (PMIX_SUCCESS == (rc = _getfn_fastpath(proc, key, info, ninfo, val))) { - goto done; + memcpy(&p, proc, sizeof(pmix_proc_t)); + iptr = (pmix_info_t*)info; + nfo = ninfo; + + if (!PMIX_PEER_IS_EARLIER(pmix_client_globals.myserver, 3, 1, 5)) { + if (PMIX_RANK_UNDEF == proc->rank || NULL == key) { + goto doget; + } + /* if they are asking about a node-level piece of info, + * then the rank must be UNDEF */ + if (pmix_check_node_info(key)) { + p.rank = PMIX_RANK_UNDEF; + /* see if they told us to get node info */ + if (NULL == info) { + /* guess not - better do it */ + PMIX_INFO_LOAD(&nodeinfo, PMIX_NODE_INFO, NULL, PMIX_BOOL); + iptr = &nodeinfo; + nfo = 1; + } + goto doget; + } + /* if they are asking about an app-level piece of info, + * then the rank must be UNDEF */ + if (pmix_check_app_info(key)) { + p.rank = PMIX_RANK_UNDEF; + /* see if they told us to get app info */ + if (NULL == info) { + /* guess not - better do it */ + PMIX_INFO_LOAD(&nodeinfo, PMIX_APP_INFO, NULL, PMIX_BOOL); + iptr = &nodeinfo; + nfo = 1; + } + goto doget; + } + + /* see if they are requesting session, node, or app-level info */ + for (n=0; n < ninfo; n++) { + if (PMIX_CHECK_KEY(info, PMIX_NODE_INFO) || + PMIX_CHECK_KEY(info, PMIX_APP_INFO) || + PMIX_CHECK_KEY(info, PMIX_SESSION_INFO)) { + goto doget; + } } } + /* try to get data directly, without threadshift */ + if (PMIX_SUCCESS == (rc = _getfn_fastpath(&p, key, iptr, nfo, val))) { + goto done; + } + + doget: /* create a callback object as we need to pass it to the * recv routine so we know which callback to use when * the return message is recvd */ cb = PMIX_NEW(pmix_cb_t); - if (PMIX_SUCCESS != (rc = PMIx_Get_nb(proc, key, info, ninfo, _value_cbfunc, cb))) { + if (PMIX_SUCCESS != (rc = PMIx_Get_nb(&p, key, iptr, nfo, _value_cbfunc, cb))) { PMIX_RELEASE(cb); return rc; } @@ -298,6 +346,7 @@ static void _getnb_cbfunc(struct pmix_peer_t *pr, int32_t cnt; pmix_proc_t proc; pmix_kval_t *kv; + bool diffnspace; pmix_output_verbose(2, pmix_client_globals.get_output, "pmix: get_nb callback recvd"); @@ -312,6 +361,9 @@ static void _getnb_cbfunc(struct pmix_peer_t *pr, pmix_strncpy(proc.nspace, cb->pname.nspace, PMIX_MAX_NSLEN); proc.rank = cb->pname.rank; + /* check for a different nspace */ + diffnspace = !PMIX_CHECK_NSPACE(pmix_globals.myid.nspace, proc.nspace); + /* a zero-byte buffer indicates that this recv is being * completed due to a lost connection */ if (PMIX_BUFFER_IS_EMPTY(buf)) { @@ -333,7 +385,7 @@ static void _getnb_cbfunc(struct pmix_peer_t *pr, if (PMIX_SUCCESS != ret) { goto done; } - if (PMIX_RANK_UNDEF == proc.rank) { + if (PMIX_RANK_UNDEF == proc.rank || diffnspace) { PMIX_GDS_ACCEPT_KVS_RESP(rc, pmix_globals.mypeer, buf); } else { PMIX_GDS_ACCEPT_KVS_RESP(rc, pmix_client_globals.myserver, buf); @@ -356,9 +408,18 @@ static void _getnb_cbfunc(struct pmix_peer_t *pr, /* fetch the data from server peer module - since it is passing * it back to the user, we need a copy of it */ cb->copy = true; - if (PMIX_RANK_UNDEF == proc.rank) { + if (PMIX_RANK_UNDEF == proc.rank || diffnspace) { + if (PMIX_PEER_IS_EARLIER(pmix_client_globals.myserver, 3, 1, 5)) { + /* everything is under rank=wildcard */ + proc.rank = PMIX_RANK_WILDCARD; + } PMIX_GDS_FETCH_KV(rc, pmix_globals.mypeer, cb); } else { + if (PMIX_RANK_UNDEF == proc.rank && + PMIX_PEER_IS_EARLIER(pmix_client_globals.myserver, 3, 1, 5)) { + /* everything is under rank=wildcard */ + proc.rank = PMIX_RANK_WILDCARD; + } PMIX_GDS_FETCH_KV(rc, pmix_client_globals.myserver, cb); } if (PMIX_SUCCESS == rc) { @@ -561,6 +622,7 @@ static void _getnbfn(int fd, short flags, void *cbdata) pmix_proc_t proc; bool optional = false; bool immediate = false; + bool internal_only = false; struct timeval tv; pmix_query_caddy_t *cd; @@ -596,14 +658,10 @@ static void _getnbfn(int fd, short flags, void *cbdata) } } else if (PMIX_CHECK_KEY(&cb->info[n], PMIX_DATA_SCOPE)) { cb->scope = cb->info[n].value.data.scope; - } else if (PMIX_CHECK_KEY(&cb->info[n], PMIX_SESSION_INFO)) { - cb->level = PMIX_LEVEL_SESSION; - } else if (PMIX_CHECK_KEY(&cb->info[n], PMIX_JOB_INFO)) { - cb->level = PMIX_LEVEL_JOB; - } else if (PMIX_CHECK_KEY(&cb->info[n], PMIX_APP_INFO)) { - cb->level = PMIX_LEVEL_APP; - } else if (PMIX_CHECK_KEY(&cb->info[n], PMIX_NODE_INFO)) { - cb->level = PMIX_LEVEL_NODE; + } else if (PMIX_CHECK_KEY(&cb->info[n], PMIX_NODE_INFO) || + PMIX_CHECK_KEY(&cb->info[n], PMIX_APP_INFO) || + PMIX_CHECK_KEY(&cb->info[n], PMIX_SESSION_INFO)) { + internal_only = true; } } } @@ -623,7 +681,7 @@ static void _getnbfn(int fd, short flags, void *cbdata) /* if the key is NULL or starts with "pmix", then they are looking * for data that was provided by the server at startup */ - if (NULL == cb->key || 0 == strncmp(cb->key, "pmix", 4)) { + if (!internal_only && (NULL == cb->key || 0 == strncmp(cb->key, "pmix", 4))) { cb->proc = &proc; /* fetch the data from my server's module - since we are passing * it back to the user, we need a copy of it */ @@ -638,11 +696,13 @@ static void _getnbfn(int fd, short flags, void *cbdata) if (PMIX_SUCCESS != rc) { pmix_output_verbose(5, pmix_client_globals.get_output, "pmix:client job-level data NOT found"); - if (0 != strncmp(cb->pname.nspace, pmix_globals.myid.nspace, PMIX_MAX_NSLEN)) { + if (!PMIX_CHECK_NSPACE(cb->pname.nspace, pmix_globals.myid.nspace)) { /* we are asking about the job-level info from another * namespace. It seems that we don't have it - go and - * ask server + * ask server and indicate we only need job-level info + * by setting the rank to WILDCARD */ + proc.rank = PMIX_RANK_WILDCARD; goto request; } else if (NULL != cb->key) { /* if immediate was given, then we are being directed to @@ -723,8 +783,8 @@ static void _getnbfn(int fd, short flags, void *cbdata) /* if we got here, then we don't have the data for this proc. If we * are a server, or we are a client and not connected, then there is * nothing more we can do */ - if (PMIX_PROC_IS_SERVER(pmix_globals.mypeer) || - (!PMIX_PROC_IS_SERVER(pmix_globals.mypeer) && !pmix_globals.connected)) { + if (PMIX_PEER_IS_SERVER(pmix_globals.mypeer) || + (!PMIX_PEER_IS_SERVER(pmix_globals.mypeer) && !pmix_globals.connected)) { rc = PMIX_ERR_NOT_FOUND; goto respond; } @@ -744,8 +804,7 @@ static void _getnbfn(int fd, short flags, void *cbdata) * this nspace:rank. If we do, then no need to ask again as the * request will return _all_ data from that proc */ PMIX_LIST_FOREACH(cbret, &pmix_client_globals.pending_requests, pmix_cb_t) { - if (0 == strncmp(cbret->pname.nspace, cb->pname.nspace, PMIX_MAX_NSLEN) && - cbret->pname.rank == cb->pname.rank) { + if (PMIX_CHECK_PROCID(&cbret->pname, &cb->pname)) { /* we do have a pending request, but we still need to track this * outstanding request so we can satisfy it once the data is returned */ pmix_list_append(&pmix_client_globals.pending_requests, &cb->super); @@ -755,16 +814,16 @@ static void _getnbfn(int fd, short flags, void *cbdata) /* we don't have a pending request, so let's create one - don't worry * about packing the key as we return everything from that proc */ - msg = _pack_get(cb->pname.nspace, cb->pname.rank, cb->info, cb->ninfo, PMIX_GETNB_CMD); + msg = _pack_get(cb->pname.nspace, proc.rank, cb->info, cb->ninfo, PMIX_GETNB_CMD); if (NULL == msg) { rc = PMIX_ERROR; goto respond; } pmix_output_verbose(2, pmix_client_globals.get_output, - "%s REQUESTING DATA FROM SERVER FOR %s KEY %s", + "%s REQUESTING DATA FROM SERVER FOR %s:%s KEY %s", PMIX_NAME_PRINT(&pmix_globals.myid), - PMIX_NAME_PRINT(cb->proc), cb->key); + cb->proc->nspace, PMIX_RANK_PRINT(proc.rank), cb->key); /* track the callback object */ pmix_list_append(&pmix_client_globals.pending_requests, &cb->super); diff --git a/opal/mca/pmix/pmix3x/pmix/src/client/pmix_client_ops.h b/opal/mca/pmix/pmix3x/pmix/src/client/pmix_client_ops.h index c84dffd6d67..11fc42ca3b8 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/client/pmix_client_ops.h +++ b/opal/mca/pmix/pmix3x/pmix/src/client/pmix_client_ops.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018 Intel, Inc. All rights reserved. + * Copyright (c) 2015-2019 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -23,6 +23,7 @@ BEGIN_C_DECLS typedef struct { pmix_peer_t *myserver; // messaging support to/from my server + bool singleton; // no server pmix_list_t pending_requests; // list of pmix_cb_t pending data requests pmix_pointer_array_t peers; // array of pmix_peer_t cached for data ops // verbosity for client get operations diff --git a/opal/mca/pmix/pmix3x/pmix/src/client/pmix_client_pub.c b/opal/mca/pmix/pmix3x/pmix/src/client/pmix_client_pub.c index bd6795eac5e..df1799de916 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/client/pmix_client_pub.c +++ b/opal/mca/pmix/pmix3x/pmix/src/client/pmix_client_pub.c @@ -1,8 +1,8 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2014-2018 Intel, Inc. All rights reserved. - * Copyright (c) 2014-2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. + * Copyright (c) 2014-2019 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2019 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * Copyright (c) 2014 Artem Y. Polyakov . * All rights reserved. * Copyright (c) 2016 Mellanox Technologies, Inc. @@ -523,13 +523,13 @@ static void wait_cbfunc(struct pmix_peer_t *pr, (NULL == buf) ? -1 : (int)buf->bytes_used); if (NULL == buf) { - rc = PMIX_ERR_BAD_PARAM; + ret = PMIX_ERR_BAD_PARAM; goto report; } /* a zero-byte buffer indicates that this recv is being * completed due to a lost connection */ if (PMIX_BUFFER_IS_EMPTY(buf)) { - rc = PMIX_ERR_UNREACH; + ret = PMIX_ERR_UNREACH; goto report; } @@ -539,11 +539,12 @@ static void wait_cbfunc(struct pmix_peer_t *pr, buf, &ret, &cnt, PMIX_STATUS); if (PMIX_SUCCESS != rc) { PMIX_ERROR_LOG(rc); + ret = rc; } report: if (NULL != cb->cbfunc.opfn) { - cb->cbfunc.opfn(rc, cb->cbdata); + cb->cbfunc.opfn(ret, cb->cbdata); } PMIX_RELEASE(cb); } diff --git a/opal/mca/pmix/pmix3x/pmix/src/common/pmix_control.c b/opal/mca/pmix/pmix3x/pmix/src/common/pmix_control.c index 1c2f74308a0..cce767474ed 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/common/pmix_control.c +++ b/opal/mca/pmix/pmix3x/pmix/src/common/pmix_control.c @@ -187,8 +187,8 @@ PMIX_EXPORT pmix_status_t PMIx_Job_control_nb(const pmix_proc_t targets[], size_ /* if we are the server, then we just issue the request and * return the response */ - if (PMIX_PROC_IS_SERVER(pmix_globals.mypeer) && - !PMIX_PROC_IS_LAUNCHER(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_SERVER(pmix_globals.mypeer) && + !PMIX_PEER_IS_LAUNCHER(pmix_globals.mypeer)) { PMIX_RELEASE_THREAD(&pmix_global_lock); if (NULL == pmix_host_server.job_control) { /* nothing we can do */ @@ -344,8 +344,8 @@ PMIX_EXPORT pmix_status_t PMIx_Process_monitor_nb(const pmix_info_t *monitor, pm /* if we are the server, then we just issue the request and * return the response */ - if (PMIX_PROC_IS_SERVER(pmix_globals.mypeer) && - !PMIX_PROC_IS_LAUNCHER(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_SERVER(pmix_globals.mypeer) && + !PMIX_PEER_IS_LAUNCHER(pmix_globals.mypeer)) { PMIX_RELEASE_THREAD(&pmix_global_lock); if (NULL == pmix_host_server.monitor) { /* nothing we can do */ diff --git a/opal/mca/pmix/pmix3x/pmix/src/common/pmix_data.c b/opal/mca/pmix/pmix3x/pmix/src/common/pmix_data.c index a9e32c661f6..b2b26f60c67 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/common/pmix_data.c +++ b/opal/mca/pmix/pmix3x/pmix/src/common/pmix_data.c @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2007-2012 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2014-2018 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2019 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -85,7 +85,7 @@ static pmix_peer_t* find_peer(const pmix_proc_t *proc) return pmix_globals.mypeer; } - if (PMIX_PROC_IS_SERVER(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_SERVER(pmix_globals.mypeer)) { /* see if we know this proc */ for (i=0; i < pmix_server_globals.clients.size; i++) { if (NULL != (peer = (pmix_peer_t*)pmix_pointer_array_get_item(&pmix_server_globals.clients, i))) { diff --git a/opal/mca/pmix/pmix3x/pmix/src/common/pmix_iof.c b/opal/mca/pmix/pmix3x/pmix/src/common/pmix_iof.c index 03f38eddc60..893d19544b8 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/common/pmix_iof.c +++ b/opal/mca/pmix/pmix3x/pmix/src/common/pmix_iof.c @@ -40,24 +40,49 @@ static void msgcbfunc(struct pmix_peer_t *peer, pmix_shift_caddy_t *cd = (pmix_shift_caddy_t*)cbdata; int32_t m; pmix_status_t rc, status; + size_t refid = 0; + + PMIX_ACQUIRE_OBJECT(cd); /* unpack the return status */ m=1; PMIX_BFROPS_UNPACK(rc, peer, buf, &status, &m, PMIX_STATUS); - if (PMIX_SUCCESS == rc && PMIX_SUCCESS == status) { - /* store the request on our list - we are in an event, and + if (NULL != cd->iofreq && PMIX_SUCCESS == rc && PMIX_SUCCESS == status) { + /* get the reference ID */ + m=1; + PMIX_BFROPS_UNPACK(rc, peer, buf, &refid, &m, PMIX_SIZE); + /* store the request - we are in an event, and * so this is safe */ - pmix_list_append(&pmix_globals.iof_requests, &cd->iofreq->super); + if (NULL == pmix_pointer_array_get_item(&pmix_globals.iof_requests, refid)) { + pmix_pointer_array_set_item(&pmix_globals.iof_requests, refid, cd->iofreq); + } + if (NULL != cd->cbfunc.hdlrregcbfn) { + cd->cbfunc.hdlrregcbfn(PMIX_SUCCESS, refid, cd->cbdata); + } } else if (PMIX_SUCCESS != rc) { status = rc; - PMIX_RELEASE(cd->iofreq); } pmix_output_verbose(2, pmix_client_globals.iof_output, - "pmix:iof_register returned status %s", PMIx_Error_string(status)); + "pmix:iof_register/deregister returned status %s", PMIx_Error_string(status)); - if (NULL != cd->cbfunc.opcbfn) { - cd->cbfunc.opcbfn(status, cd->cbdata); + if (NULL == cd->iofreq) { + /* this was a deregistration request */ + if (NULL == cd->cbfunc.opcbfn) { + cd->status = status; + PMIX_WAKEUP_THREAD(&cd->lock); + } else { + cd->cbfunc.opcbfn(status, cd->cbdata); + } + } else if (NULL == cd->cbfunc.hdlrregcbfn) { + cd->status = status; + cd->ncodes = refid; + PMIX_WAKEUP_THREAD(&cd->lock); + } else { + cd->cbfunc.hdlrregcbfn(PMIX_SUCCESS, refid, cd->cbdata); + } + if (PMIX_SUCCESS != rc && NULL != cd->iofreq) { + PMIX_RELEASE(cd->iofreq); } PMIX_RELEASE(cd); } @@ -83,8 +108,8 @@ PMIX_EXPORT pmix_status_t PMIx_IOF_pull(const pmix_proc_t procs[], size_t nprocs } /* if we are a server, we cannot do this */ - if (PMIX_PROC_IS_SERVER(pmix_globals.mypeer) && - !PMIX_PROC_IS_LAUNCHER(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_SERVER(pmix_globals.mypeer) && + !PMIX_PEER_IS_LAUNCHER(pmix_globals.mypeer)) { PMIX_RELEASE_THREAD(&pmix_global_lock); return PMIX_ERR_NOT_SUPPORTED; } @@ -182,6 +207,97 @@ PMIX_EXPORT pmix_status_t PMIx_IOF_pull(const pmix_proc_t procs[], size_t nprocs return rc; } +PMIX_EXPORT pmix_status_t PMIx_IOF_deregister(size_t iofhdlr, + const pmix_info_t directives[], size_t ndirs, + pmix_op_cbfunc_t cbfunc, void *cbdata) +{ + pmix_shift_caddy_t *cd; + pmix_cmd_t cmd = PMIX_IOF_DEREG_CMD; + pmix_buffer_t *msg; + pmix_status_t rc; + + PMIX_ACQUIRE_THREAD(&pmix_global_lock); + + pmix_output_verbose(2, pmix_client_globals.iof_output, + "pmix:iof_deregister"); + + if (pmix_globals.init_cntr <= 0) { + PMIX_RELEASE_THREAD(&pmix_global_lock); + return PMIX_ERR_INIT; + } + + /* if we are a server, we cannot do this */ + if (PMIX_PEER_IS_SERVER(pmix_globals.mypeer) && + !PMIX_PEER_IS_LAUNCHER(pmix_globals.mypeer)) { + PMIX_RELEASE_THREAD(&pmix_global_lock); + return PMIX_ERR_NOT_SUPPORTED; + } + + /* if we aren't connected, don't attempt to send */ + if (!pmix_globals.connected) { + PMIX_RELEASE_THREAD(&pmix_global_lock); + return PMIX_ERR_UNREACH; + } + PMIX_RELEASE_THREAD(&pmix_global_lock); + + /* send this request to the server */ + cd = PMIX_NEW(pmix_shift_caddy_t); + if (NULL == cd) { + return PMIX_ERR_NOMEM; + } + cd->cbfunc.opcbfn = cbfunc; + cd->cbdata = cbdata; + + /* setup the registration cmd */ + msg = PMIX_NEW(pmix_buffer_t); + if (NULL == msg) { + PMIX_RELEASE(cd->iofreq); + PMIX_RELEASE(cd); + return PMIX_ERR_NOMEM; + } + PMIX_BFROPS_PACK(rc, pmix_client_globals.myserver, + msg, &cmd, 1, PMIX_COMMAND); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + goto cleanup; + } + PMIX_BFROPS_PACK(rc, pmix_client_globals.myserver, + msg, &ndirs, 1, PMIX_SIZE); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + goto cleanup; + } + if (0 < ndirs) { + PMIX_BFROPS_PACK(rc, pmix_client_globals.myserver, + msg, directives, ndirs, PMIX_INFO); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + goto cleanup; + } + } + + /* pack the handler ID */ + PMIX_BFROPS_PACK(rc, pmix_client_globals.myserver, + msg, &iofhdlr, 1, PMIX_SIZE); + + pmix_output_verbose(2, pmix_client_globals.iof_output, + "pmix:iof_dereg sending to server"); + PMIX_PTL_SEND_RECV(rc, pmix_client_globals.myserver, + msg, msgcbfunc, (void*)cd); + + cleanup: + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(msg); + PMIX_RELEASE(cd); + } else if (NULL == cbfunc) { + PMIX_WAIT_THREAD(&cd->lock); + rc = cd->status; + PMIX_RELEASE(cd); + } + return rc; +} + typedef struct { pmix_op_cbfunc_t cbfunc; void *cbdata; @@ -237,8 +353,8 @@ pmix_status_t PMIx_IOF_push(const pmix_proc_t targets[], size_t ntargets, /* if we are not a server, then we send the provided * data to our server for processing */ - if (!PMIX_PROC_IS_SERVER(pmix_globals.mypeer) || - PMIX_PROC_IS_LAUNCHER(pmix_globals.mypeer)) { + if (!PMIX_PEER_IS_SERVER(pmix_globals.mypeer) || + PMIX_PEER_IS_LAUNCHER(pmix_globals.mypeer)) { msg = PMIX_NEW(pmix_buffer_t); if (NULL == msg) { return PMIX_ERR_NOMEM; @@ -317,6 +433,99 @@ pmix_status_t PMIx_IOF_push(const pmix_proc_t targets[], size_t ntargets, return PMIX_SUCCESS; } +pmix_status_t pmix_iof_process_iof(pmix_iof_channel_t channels, + const pmix_proc_t *source, + const pmix_byte_object_t *bo, + const pmix_info_t *info, size_t ninfo, + const pmix_iof_req_t *req) +{ + bool match; + size_t m; + pmix_buffer_t *msg; + pmix_status_t rc; + + /* if the channel wasn't included, then ignore it */ + if (!(channels & req->channels)) { + return PMIX_SUCCESS; + } + /* see if the source matches the request */ + match = false; + for (m=0; m < req->nprocs; m++) { + if (PMIX_CHECK_PROCID(source, &req->procs[m])) { + match = true; + break; + } + } + if (!match) { + return PMIX_SUCCESS; + } + /* never forward back to the source! This can happen if the source + * is a launcher - also, never forward to a peer that is no + * longer with us */ + if (NULL == req->requestor->info || req->requestor->finalized) { + return PMIX_SUCCESS; + } + if (PMIX_CHECK_PROCID(source, &req->requestor->info->pname)) { + return PMIX_SUCCESS; + } + /* setup the msg */ + if (NULL == (msg = PMIX_NEW(pmix_buffer_t))) { + PMIX_ERROR_LOG(PMIX_ERR_OUT_OF_RESOURCE); + return PMIX_ERR_OUT_OF_RESOURCE; + } + /* provide the source */ + PMIX_BFROPS_PACK(rc, req->requestor, msg, source, 1, PMIX_PROC); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(msg); + return rc; + } + /* provide the channel */ + PMIX_BFROPS_PACK(rc, req->requestor, msg, &channels, 1, PMIX_IOF_CHANNEL); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(msg); + return rc; + } + /* provide the handler ID so they know which cbfunc to use */ + PMIX_BFROPS_PACK(rc, req->requestor, msg, &req->refid, 1, PMIX_SIZE); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(msg); + return rc; + } + /* pack the number of info's provided */ + PMIX_BFROPS_PACK(rc, req->requestor, msg, &ninfo, 1, PMIX_SIZE); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(msg); + return rc; + } + /* if some were provided, then pack them too */ + if (0 < ninfo) { + PMIX_BFROPS_PACK(rc, req->requestor, msg, info, ninfo, PMIX_INFO); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(msg); + return rc; + } + } + /* pack the data */ + PMIX_BFROPS_PACK(rc, req->requestor, msg, bo, 1, PMIX_BYTE_OBJECT); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(msg); + return rc; + } + /* send it to the requestor */ + PMIX_PTL_SEND_ONEWAY(rc, req->requestor, msg, PMIX_PTL_TAG_IOF); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(msg); + } + return PMIX_OPERATION_SUCCEEDED; +} + pmix_status_t pmix_iof_write_output(const pmix_proc_t *name, pmix_iof_channel_t stream, const pmix_byte_object_t *bo, diff --git a/opal/mca/pmix/pmix3x/pmix/src/common/pmix_iof.h b/opal/mca/pmix/pmix3x/pmix/src/common/pmix_iof.h index 3525c5fb471..66f05a0ade2 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/common/pmix_iof.h +++ b/opal/mca/pmix/pmix3x/pmix/src/common/pmix_iof.h @@ -200,6 +200,11 @@ PMIX_EXPORT void pmix_iof_stdin_write_handler(int fd, short event, void *cbdata) PMIX_EXPORT bool pmix_iof_stdin_check(int fd); PMIX_EXPORT void pmix_iof_stdin_cb(int fd, short event, void *cbdata); PMIX_EXPORT void pmix_iof_read_local_handler(int fd, short event, void *cbdata); +PMIX_EXPORT pmix_status_t pmix_iof_process_iof(pmix_iof_channel_t channels, + const pmix_proc_t *source, + const pmix_byte_object_t *bo, + const pmix_info_t *info, size_t ninfo, + const pmix_iof_req_t *req); END_C_DECLS diff --git a/opal/mca/pmix/pmix3x/pmix/src/common/pmix_log.c b/opal/mca/pmix/pmix3x/pmix/src/common/pmix_log.c index 0c5aa760fc7..e8c97667e43 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/common/pmix_log.c +++ b/opal/mca/pmix/pmix3x/pmix/src/common/pmix_log.c @@ -28,6 +28,7 @@ #include "src/util/output.h" #include "src/mca/bfrops/bfrops.h" #include "src/mca/plog/base/base.h" +#include "src/mca/ptl/base/base.h" #include "src/client/pmix_client_ops.h" #include "src/server/pmix_server_ops.h" @@ -159,8 +160,8 @@ PMIX_EXPORT pmix_status_t PMIx_Log_nb(const pmix_info_t data[], size_t ndata, /* if we are a client or tool, we never do this ourselves - we * always pass this request to our server for execution */ - if (!PMIX_PROC_IS_SERVER(pmix_globals.mypeer) && - !PMIX_PROC_IS_LAUNCHER(pmix_globals.mypeer)) { + if (!PMIX_PEER_IS_SERVER(pmix_globals.mypeer) && + !PMIX_PEER_IS_LAUNCHER(pmix_globals.mypeer)) { /* if we aren't connected, don't attempt to send */ if (!pmix_globals.connected) { PMIX_RELEASE_THREAD(&pmix_global_lock); @@ -181,15 +182,17 @@ PMIX_EXPORT pmix_status_t PMIx_Log_nb(const pmix_info_t data[], size_t ndata, PMIX_RELEASE(cd); return rc; } - /* provide the timestamp - zero will indicate - * that it wasn't taken */ - PMIX_BFROPS_PACK(rc, pmix_client_globals.myserver, - msg, ×tamp, 1, PMIX_TIME); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_RELEASE(msg); - PMIX_RELEASE(cd); - return rc; + if (!PMIX_PEER_IS_EARLIER(pmix_client_globals.myserver, 3, PMIX_MINOR_WILDCARD, PMIX_RELEASE_WILDCARD)) { + /* provide the timestamp - zero will indicate + * that it wasn't taken */ + PMIX_BFROPS_PACK(rc, pmix_client_globals.myserver, + msg, ×tamp, 1, PMIX_TIME); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(msg); + PMIX_RELEASE(cd); + return rc; + } } /* pack the number of data entries */ PMIX_BFROPS_PACK(rc, pmix_client_globals.myserver, diff --git a/opal/mca/pmix/pmix3x/pmix/src/common/pmix_query.c b/opal/mca/pmix/pmix3x/pmix/src/common/pmix_query.c index 1f217d18a5d..d4a944cb046 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/common/pmix_query.c +++ b/opal/mca/pmix/pmix3x/pmix/src/common/pmix_query.c @@ -290,8 +290,8 @@ PMIX_EXPORT pmix_status_t PMIx_Query_info_nb(pmix_query_t queries[], size_t nque query: /* if we are the server, then we just issue the query and * return the response */ - if (PMIX_PROC_IS_SERVER(pmix_globals.mypeer) && - !PMIX_PROC_IS_LAUNCHER(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_SERVER(pmix_globals.mypeer) && + !PMIX_PEER_IS_LAUNCHER(pmix_globals.mypeer)) { PMIX_RELEASE_THREAD(&pmix_global_lock); if (NULL == pmix_host_server.query) { /* nothing we can do */ @@ -359,7 +359,22 @@ static void acb(pmix_status_t status, void *release_cbdata) { pmix_cb_t *cb = (pmix_cb_t*)cbdata; + size_t n; + cb->status = status; + if (NULL != info) { + PMIX_INFO_CREATE(cb->info, ninfo); + if (NULL == cb->info) { + cb->status = PMIX_ERR_NOMEM; + goto done; + } + cb->ninfo = ninfo; + for (n=0; n < ninfo; n++) { + PMIX_INFO_XFER(&cb->info[n], &info[n]); + } + } + + done: if (NULL != release_fn) { release_fn(release_cbdata); } @@ -367,7 +382,8 @@ static void acb(pmix_status_t status, } PMIX_EXPORT pmix_status_t PMIx_Allocation_request(pmix_alloc_directive_t directive, - pmix_info_t *info, size_t ninfo) + pmix_info_t *info, size_t ninfo, + pmix_info_t **results, size_t *nresults) { pmix_cb_t cb; pmix_status_t rc; @@ -383,6 +399,10 @@ PMIX_EXPORT pmix_status_t PMIx_Allocation_request(pmix_alloc_directive_t directi pmix_output_verbose(2, pmix_globals.debug_output, "%s pmix:allocate", PMIX_NAME_PRINT(&pmix_globals.myid)); + /* set the default response */ + *results = NULL; + *nresults = 0; + /* create a callback object as we need to pass it to the * recv routine so we know which callback to use when * the return message is recvd */ @@ -396,6 +416,13 @@ PMIX_EXPORT pmix_status_t PMIx_Allocation_request(pmix_alloc_directive_t directi /* wait for the operation to complete */ PMIX_WAIT_THREAD(&cb.lock); rc = cb.status; + if (NULL != cb.info) { + *results = cb.info; + *nresults = cb.ninfo; + /* protect the data */ + cb.info = NULL; + cb.ninfo = 0; + } PMIX_DESTRUCT(&cb); pmix_output_verbose(2, pmix_globals.debug_output, @@ -425,8 +452,8 @@ PMIX_EXPORT pmix_status_t PMIx_Allocation_request_nb(pmix_alloc_directive_t dire /* if we are the server, then we just issue the request and * return the response */ - if (PMIX_PROC_IS_SERVER(pmix_globals.mypeer) && - !PMIX_PROC_IS_LAUNCHER(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_SERVER(pmix_globals.mypeer) && + !PMIX_PEER_IS_LAUNCHER(pmix_globals.mypeer)) { PMIX_RELEASE_THREAD(&pmix_global_lock); if (NULL == pmix_host_server.allocate) { /* nothing we can do */ diff --git a/opal/mca/pmix/pmix3x/pmix/src/common/pmix_security.c b/opal/mca/pmix/pmix3x/pmix/src/common/pmix_security.c index c4797c1cd05..995f45bdddd 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/common/pmix_security.c +++ b/opal/mca/pmix/pmix3x/pmix/src/common/pmix_security.c @@ -131,8 +131,8 @@ PMIX_EXPORT pmix_status_t PMIx_Get_credential(const pmix_info_t info[], size_t n } /* if we are the server */ - if (PMIX_PROC_IS_SERVER(pmix_globals.mypeer) && - !PMIX_PROC_IS_LAUNCHER(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_SERVER(pmix_globals.mypeer) && + !PMIX_PEER_IS_LAUNCHER(pmix_globals.mypeer)) { PMIX_RELEASE_THREAD(&pmix_global_lock); /* if the host doesn't support this operation, * see if we can generate it ourselves */ @@ -317,8 +317,8 @@ PMIX_EXPORT pmix_status_t PMIx_Validate_credential(const pmix_byte_object_t *cre } /* if we are the server */ - if (PMIX_PROC_IS_SERVER(pmix_globals.mypeer) && - !PMIX_PROC_IS_LAUNCHER(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_SERVER(pmix_globals.mypeer) && + !PMIX_PEER_IS_LAUNCHER(pmix_globals.mypeer)) { PMIX_RELEASE_THREAD(&pmix_global_lock); /* if the host doesn't support this operation, * see if we can validate it ourselves */ diff --git a/opal/mca/pmix/pmix3x/pmix/src/event/pmix_event.h b/opal/mca/pmix/pmix3x/pmix/src/event/pmix_event.h index 6ba6b774932..abfd05f96f4 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/event/pmix_event.h +++ b/opal/mca/pmix/pmix3x/pmix/src/event/pmix_event.h @@ -10,7 +10,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2015-2018 Intel, Inc. All rights reserved. + * Copyright (c) 2015-2019 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -200,7 +200,7 @@ void pmix_event_timeout_cb(int fd, short flags, void *arg); (p)->info->pname.rank); \ /* if I'm a client or tool and this is my server, then we don't */ \ /* set the targets - otherwise, we do */ \ - if (!PMIX_PROC_IS_SERVER(pmix_globals.mypeer) && \ + if (!PMIX_PEER_IS_SERVER(pmix_globals.mypeer) && \ !PMIX_CHECK_PROCID(&pmix_client_globals.myserver->info->pname, \ &(p)->info->pname)) { \ PMIX_PROC_CREATE(ch->targets, 1); \ diff --git a/opal/mca/pmix/pmix3x/pmix/src/event/pmix_event_notification.c b/opal/mca/pmix/pmix3x/pmix/src/event/pmix_event_notification.c index c667489394c..3f59e6a8815 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/event/pmix_event_notification.c +++ b/opal/mca/pmix/pmix3x/pmix/src/event/pmix_event_notification.c @@ -51,8 +51,8 @@ PMIX_EXPORT pmix_status_t PMIx_Notify_event(pmix_status_t status, return PMIX_ERR_INIT; } - if (PMIX_PROC_IS_SERVER(pmix_globals.mypeer) && - !PMIX_PROC_IS_LAUNCHER(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_SERVER(pmix_globals.mypeer) && + !PMIX_PEER_IS_LAUNCHER(pmix_globals.mypeer)) { PMIX_RELEASE_THREAD(&pmix_global_lock); pmix_output_verbose(2, pmix_server_globals.event_output, @@ -1223,8 +1223,8 @@ void pmix_event_timeout_cb(int fd, short flags, void *arg) pmix_list_remove_item(&pmix_globals.cached_events, &ch->super); /* process this event thru the regular channels */ - if (PMIX_PROC_IS_SERVER(pmix_globals.mypeer) && - !PMIX_PROC_IS_LAUNCHER(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_SERVER(pmix_globals.mypeer) && + !PMIX_PEER_IS_LAUNCHER(pmix_globals.mypeer)) { pmix_server_notify_client_of_event(ch->status, &ch->source, ch->range, ch->info, ch->ninfo, ch->final_cbfunc, ch->final_cbdata); diff --git a/opal/mca/pmix/pmix3x/pmix/src/event/pmix_event_registration.c b/opal/mca/pmix/pmix3x/pmix/src/event/pmix_event_registration.c index be2346048d8..4ba4d72f1cd 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/event/pmix_event_registration.c +++ b/opal/mca/pmix/pmix3x/pmix/src/event/pmix_event_registration.c @@ -97,11 +97,6 @@ static void regevents_cbfunc(struct pmix_peer_t *peer, pmix_ptl_hdr_t *hdr, PMIX_BFROPS_UNPACK(rc, peer, buf, &ret, &cnt, PMIX_STATUS); if ((PMIX_SUCCESS != rc) || (PMIX_SUCCESS != ret)) { - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - } else { - PMIX_ERROR_LOG(ret); - } /* remove the err handler and call the error handler * reg completion callback fn so the requestor * doesn't hang */ @@ -310,9 +305,9 @@ static pmix_status_t _add_hdlr(pmix_rshift_caddy_t *cd, pmix_list_t *xfer) * type with our server, or if we have directives, then we need to notify * the server - however, don't do this for a v1 server as the event * notification system there doesn't work */ - if ((!PMIX_PROC_IS_SERVER(pmix_globals.mypeer) || PMIX_PROC_IS_LAUNCHER(pmix_globals.mypeer)) && + if ((!PMIX_PEER_IS_SERVER(pmix_globals.mypeer) || PMIX_PEER_IS_LAUNCHER(pmix_globals.mypeer)) && pmix_globals.connected && - !PMIX_PROC_IS_V1(pmix_client_globals.myserver) && + !PMIX_PEER_IS_V1(pmix_client_globals.myserver) && (need_register || 0 < pmix_list_get_size(xfer))) { pmix_output_verbose(2, pmix_client_globals.event_output, "pmix: _add_hdlr sending to server"); @@ -332,8 +327,8 @@ static pmix_status_t _add_hdlr(pmix_rshift_caddy_t *cd, pmix_list_t *xfer) /* if we are a server and are registering for events, then we only contact * our host if we want environmental events */ - if (PMIX_PROC_IS_SERVER(pmix_globals.mypeer) && - !PMIX_PROC_IS_LAUNCHER(pmix_globals.mypeer) && cd->enviro && + if (PMIX_PEER_IS_SERVER(pmix_globals.mypeer) && + !PMIX_PEER_IS_LAUNCHER(pmix_globals.mypeer) && cd->enviro && NULL != pmix_host_server.register_events) { pmix_output_verbose(2, pmix_client_globals.event_output, "pmix: _add_hdlr registering with server"); @@ -918,7 +913,7 @@ static void dereg_event_hdlr(int sd, short args, void *cbdata) /* if I am not the server, and I am connected, then I need * to notify the server to remove my registration */ - if ((!PMIX_PROC_IS_SERVER(pmix_globals.mypeer) || PMIX_PROC_IS_LAUNCHER(pmix_globals.mypeer)) && + if ((!PMIX_PEER_IS_SERVER(pmix_globals.mypeer) || PMIX_PEER_IS_LAUNCHER(pmix_globals.mypeer)) && pmix_globals.connected) { msg = PMIX_NEW(pmix_buffer_t); PMIX_BFROPS_PACK(rc, pmix_client_globals.myserver, diff --git a/opal/mca/pmix/pmix3x/pmix/src/include/pmix_globals.c b/opal/mca/pmix/pmix3x/pmix/src/include/pmix_globals.c index df8a6b6e5b4..ee8f83f1461 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/include/pmix_globals.c +++ b/opal/mca/pmix/pmix3x/pmix/src/include/pmix_globals.c @@ -1,8 +1,7 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2014-2019 Intel, Inc. All rights reserved. - * Copyright (c) 2014-2017 Research Organization for Information Science - * Copyright (c) 2014-2019 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2020 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2014-2015 Artem Y. Polyakov . * All rights reserved. @@ -100,7 +99,7 @@ static void nscon(pmix_namespace_t *p) { p->nspace = NULL; p->nprocs = 0; - p->nlocalprocs = 0; + p->nlocalprocs = SIZE_MAX; p->all_registered = false; p->version_stored = false; p->jobbkt = NULL; @@ -170,7 +169,11 @@ PMIX_EXPORT PMIX_CLASS_INSTANCE(pmix_rank_info_t, static void pcon(pmix_peer_t *p) { - p->proc_type = PMIX_PROC_UNDEF; + p->proc_type.type = PMIX_PROC_UNDEF; + p->proc_type.major = PMIX_MAJOR_WILDCARD; + p->proc_type.minor = PMIX_MINOR_WILDCARD; + p->proc_type.release = PMIX_RELEASE_WILDCARD; + p->proc_type.padding = 0; p->protocol = PMIX_PROTOCOL_UNDEF; p->finalized = false; p->info = NULL; @@ -228,22 +231,24 @@ PMIX_EXPORT PMIX_CLASS_INSTANCE(pmix_peer_t, static void iofreqcon(pmix_iof_req_t *p) { - p->peer = NULL; - memset(&p->pname, 0, sizeof(pmix_name_t)); + p->requestor = NULL; + p->refid = 0; + p->procs = NULL; + p->nprocs = 0; p->channels = PMIX_FWD_NO_CHANNELS; p->cbfunc = NULL; } static void iofreqdes(pmix_iof_req_t *p) { - if (NULL != p->peer) { - PMIX_RELEASE(p->peer); + if (NULL != p->requestor) { + PMIX_RELEASE(p->requestor); } - if (NULL != p->pname.nspace) { - free(p->pname.nspace); + if (0 < p->nprocs) { + PMIX_PROC_FREE(p->procs, p->nprocs); } } PMIX_EXPORT PMIX_CLASS_INSTANCE(pmix_iof_req_t, - pmix_list_item_t, + pmix_object_t, iofreqcon, iofreqdes); @@ -306,7 +311,6 @@ static void cbcon(pmix_cb_t *p) PMIX_CONSTRUCT(&p->kvs, pmix_list_t); p->copy = false; p->timer_running = false; - p->level = PMIX_LEVEL_UNDEF; } static void cbdes(pmix_cb_t *p) { diff --git a/opal/mca/pmix/pmix3x/pmix/src/include/pmix_globals.h b/opal/mca/pmix/pmix3x/pmix/src/include/pmix_globals.h index 113cd48faab..8da9c9ccd2d 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/include/pmix_globals.h +++ b/opal/mca/pmix/pmix3x/pmix/src/include/pmix_globals.h @@ -10,7 +10,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2014-2019 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2020 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -106,6 +106,7 @@ typedef uint8_t pmix_cmd_t; #define PMIX_VALIDATE_CRED_CMD 21 #define PMIX_IOF_PULL_CMD 22 #define PMIX_IOF_PUSH_CMD 23 +#define PMIX_IOF_DEREG_CMD 29 /* provide a "pretty-print" function for cmds */ const char* pmix_command_string(pmix_cmd_t cmd); @@ -122,16 +123,6 @@ typedef enum { PMIX_COLLECT_MAX } pmix_collect_t; -/* define a set of flags indicating the level - * of information being stored/requested */ -typedef enum { - PMIX_LEVEL_UNDEF, - PMIX_LEVEL_SESSION, - PMIX_LEVEL_JOB, - PMIX_LEVEL_APP, - PMIX_LEVEL_NODE -} pmix_level_t; - /**** PEER STRUCTURES ****/ /* clients can only talk to their server, and servers are @@ -256,9 +247,11 @@ PMIX_CLASS_DECLARATION(pmix_peer_t); /* tracker for IOF requests */ typedef struct { - pmix_list_item_t super; - pmix_peer_t *peer; - pmix_name_t pname; + pmix_object_t super; + pmix_peer_t *requestor; + size_t refid; + pmix_proc_t *procs; + size_t nprocs; pmix_iof_channel_t channels; pmix_iof_cbfunc_t cbfunc; } pmix_iof_req_t; @@ -302,6 +295,7 @@ typedef struct { bool hybrid; // true if participating procs are from more than one nspace pmix_proc_t *pcs; // copy of the original array of participants size_t npcs; // number of procs in the array + pmix_list_t nslist; // unique nspace list of participants pmix_lock_t lock; // flag for waiting for completion bool def_complete; // all local procs have been registered and the trk definition is complete pmix_list_t local_cbs; // list of pmix_server_caddy_t for sending result to the local participants @@ -329,6 +323,8 @@ typedef struct { pmix_server_trkr_t *trk; pmix_ptl_hdr_t hdr; pmix_peer_t *peer; + pmix_info_t *info; + size_t ninfo; } pmix_server_caddy_t; PMIX_CLASS_DECLARATION(pmix_server_caddy_t); @@ -399,7 +395,6 @@ typedef struct { pmix_list_t kvs; bool copy; bool timer_running; - pmix_level_t level; } pmix_cb_t; PMIX_CLASS_DECLARATION(pmix_cb_t); @@ -477,7 +472,7 @@ typedef struct { bool commits_pending; struct timeval event_window; pmix_list_t cached_events; // events waiting in the window prior to processing - pmix_list_t iof_requests; // list of pmix_iof_req_t IOF requests + pmix_pointer_array_t iof_requests; // array of pmix_iof_req_t IOF requests int max_events; // size of the notifications hotel int event_eviction_time; // max time to cache notifications pmix_hotel_t notifications; // hotel of pending notifications @@ -490,6 +485,7 @@ typedef struct { pmix_gds_base_module_t *mygds; /* IOF controls */ bool tag_output; + pmix_list_t stdin_targets; // list of pmix_namelist_t bool xml_output; bool timestamp_output; size_t output_limit; @@ -502,6 +498,40 @@ PMIX_EXPORT void pmix_execute_epilog(pmix_epilog_t *ep); PMIX_EXPORT extern pmix_globals_t pmix_globals; PMIX_EXPORT extern pmix_lock_t pmix_global_lock; +static inline bool pmix_check_node_info(const char* key) +{ + char *keys[] = { + PMIX_LOCAL_PEERS, + PMIX_LOCAL_SIZE, + NULL + }; + size_t n; + + for (n=0; NULL != keys[n]; n++) { + if (0 == strncmp(key, keys[n], PMIX_MAX_KEYLEN)) { + return true; + } + } + return false; +} + +static inline bool pmix_check_app_info(const char* key) +{ + char *keys[] = { + PMIX_APP_SIZE, + NULL + }; + size_t n; + + for (n=0; NULL != keys[n]; n++) { + if (0 == strncmp(key, keys[n], PMIX_MAX_KEYLEN)) { + return true; + } + } + return false; +} + + END_C_DECLS #endif /* PMIX_GLOBALS_H */ diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/base/help-pmix-mca-base.txt b/opal/mca/pmix/pmix3x/pmix/src/mca/base/help-pmix-mca-base.txt index 3c8a67f1990..16b8b86ada7 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/base/help-pmix-mca-base.txt +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/base/help-pmix-mca-base.txt @@ -11,7 +11,7 @@ # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. # Copyright (c) 2008-2019 Cisco Systems, Inc. All rights reserved -# Copyright (c) 2018-2019 Intel, Inc. All rights reserved. +# Copyright (c) 2018 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/bfrops/v12/unpack.c b/opal/mca/pmix/pmix3x/pmix/src/mca/bfrops/v12/unpack.c index a001728ef02..667f9c64841 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/bfrops/v12/unpack.c +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/bfrops/v12/unpack.c @@ -119,7 +119,6 @@ pmix_status_t pmix12_bfrop_unpack(pmix_buffer_t *buffer, *num_vals = 0; /* don't error log here as the user may be unpacking past * the end of the buffer, which isn't necessarily an error */ - PMIX_ERROR_LOG(rc); return rc; } diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/common/dstore/dstore_base.c b/opal/mca/pmix/pmix3x/pmix/src/mca/common/dstore/dstore_base.c index c0fc676e6c9..81abef989df 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/common/dstore/dstore_base.c +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/common/dstore/dstore_base.c @@ -1,7 +1,7 @@ /* - * Copyright (c) 2015-2018 Intel, Inc. All rights reserved. + * Copyright (c) 2015-2019 Intel, Inc. All rights reserved. * Copyright (c) 2016-2018 IBM Corporation. All rights reserved. - * Copyright (c) 2016-2018 Mellanox Technologies, Inc. + * Copyright (c) 2016-2019 Mellanox Technologies, Inc. * All rights reserved. * Copyright (c) 2018-2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. @@ -48,7 +48,7 @@ #include "src/util/pmix_environ.h" #include "src/util/hash.h" #include "src/mca/preg/preg.h" - +#include "src/mca/ptl/base/base.h" #include "src/mca/gds/base/base.h" #include "src/mca/pshmem/base/base.h" #include "dstore_common.h" @@ -514,7 +514,7 @@ static int _esh_session_init(pmix_common_dstore_ctx_t *ds_ctx, size_t idx, ns_ma s->jobuid = jobuid; s->nspace_path = strdup(ds_ctx->base_path); - if (PMIX_PROC_IS_SERVER(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_SERVER(pmix_globals.mypeer)) { if (0 != mkdir(s->nspace_path, 0770)) { if (EEXIST != errno) { pmix_output(0, "session init: can not create session directory \"%s\": %s", @@ -566,7 +566,7 @@ static void _esh_session_release(pmix_common_dstore_ctx_t *ds_ctx, size_t idx) ds_ctx->lock_cbs->finalize(&_ESH_SESSION_lock(ds_ctx->session_array, idx)); if (NULL != s->nspace_path) { - if(PMIX_PROC_IS_SERVER(pmix_globals.mypeer)) { + if(PMIX_PEER_IS_SERVER(pmix_globals.mypeer)) { _esh_dir_del(s->nspace_path); } free(s->nspace_path); @@ -649,7 +649,7 @@ static int _update_ns_elem(pmix_common_dstore_ctx_t *ds_ctx, ns_track_elem_t *ns /* synchronize number of meta segments for the target namespace. */ for (i = ns_elem->num_meta_seg; i < info->num_meta_seg; i++) { - if (PMIX_PROC_IS_SERVER(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_SERVER(pmix_globals.mypeer)) { seg = pmix_common_dstor_create_new_segment(PMIX_DSTORE_NS_META_SEGMENT, ds_ctx->base_path, info->ns_map.name, i, ds_ctx->jobuid, ds_ctx->setjobuid); @@ -684,7 +684,7 @@ static int _update_ns_elem(pmix_common_dstore_ctx_t *ds_ctx, ns_track_elem_t *ns } /* synchronize number of data segments for the target namespace. */ for (i = ns_elem->num_data_seg; i < info->num_data_seg; i++) { - if (PMIX_PROC_IS_SERVER(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_SERVER(pmix_globals.mypeer)) { seg = pmix_common_dstor_create_new_segment(PMIX_DSTORE_NS_DATA_SEGMENT, ds_ctx->base_path, info->ns_map.name, i, ds_ctx->jobuid, ds_ctx->setjobuid); @@ -821,6 +821,8 @@ static ns_track_elem_t *_get_track_elem_for_namespace(pmix_common_dstore_ctx_t * { ns_track_elem_t *new_elem = NULL; size_t size = pmix_value_array_get_size(ds_ctx->ns_track_array); + ns_track_elem_t *ns_trk; + size_t i, idx = -1; PMIX_OUTPUT_VERBOSE((10, pmix_gds_base_framework.framework_output, "%s:%d:%s: nspace %s", @@ -836,16 +838,30 @@ static ns_track_elem_t *_get_track_elem_for_namespace(pmix_common_dstore_ctx_t * return pmix_value_array_get_item(ds_ctx->ns_track_array, ns_map->track_idx); } + /* Try to find an empty tracker structure */ + ns_trk = PMIX_VALUE_ARRAY_GET_BASE(ds_ctx->ns_track_array, ns_track_elem_t); + for (i = 0; i < size; i++) { + ns_track_elem_t *trk = ns_trk + i; + if (!trk->in_use) { + idx = i; + new_elem = trk; + break; + } + } + /* If we failed - allocate a new tracker */ + if (NULL == new_elem) { + idx = size; + if (NULL == (new_elem = pmix_value_array_get_item(ds_ctx->ns_track_array, idx))) { + return NULL; + } + } + /* create shared memory regions for this namespace and store its info locally * to operate with address and detach/unlink afterwards. */ - if (NULL == (new_elem = pmix_value_array_get_item(ds_ctx->ns_track_array, size))) { - return NULL; - } PMIX_CONSTRUCT(new_elem, ns_track_elem_t); pmix_strncpy(new_elem->ns_map.name, ns_map->name, sizeof(new_elem->ns_map.name)-1); /* save latest track idx to info of nspace */ - ns_map->track_idx = size; - + ns_map->track_idx = idx; return new_elem; } @@ -1591,7 +1607,7 @@ pmix_common_dstore_ctx_t *pmix_common_dstor_init(const char *ds_name, pmix_info_ ds_ctx->ds_name = strdup(ds_name); /* find the temp dir */ - if (PMIX_PROC_IS_SERVER(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_SERVER(pmix_globals.mypeer)) { ds_ctx->session_map_search = (session_map_search_fn_t)_esh_session_map_search_server; /* scan incoming info for directives */ @@ -1762,7 +1778,8 @@ PMIX_EXPORT void pmix_common_dstor_finalize(pmix_common_dstore_ctx_t *ds_ctx) pmix_pshmem.finalize(); if (NULL != ds_ctx->base_path){ - if(PMIX_PROC_IS_SERVER(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_SERVER(pmix_globals.mypeer)) { + /* coverity[toctou] */ if (lstat(ds_ctx->base_path, &st) >= 0){ if (PMIX_SUCCESS != (rc = _esh_dir_del(ds_ctx->base_path))) { PMIX_ERROR_LOG(rc); @@ -1878,7 +1895,7 @@ PMIX_EXPORT pmix_status_t pmix_common_dstor_store(pmix_common_dstore_ctx_t *ds_c "[%s:%d] gds: dstore store for key '%s' scope %d", proc->nspace, proc->rank, kv->key, scope); - if (PMIX_PROC_IS_CLIENT(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_CLIENT(pmix_globals.mypeer)) { rc = PMIX_ERR_NOT_SUPPORTED; PMIX_ERROR_LOG(rc); return rc; @@ -1965,7 +1982,7 @@ static pmix_status_t _dstore_fetch(pmix_common_dstore_ctx_t *ds_ctx, __FILE__, __LINE__, __func__, nspace, rank, key)); /* protect info of dstore segments before it will be updated */ - if (!PMIX_PROC_IS_SERVER(pmix_globals.mypeer)) { + if (!PMIX_PEER_IS_SERVER(pmix_globals.mypeer)) { if (0 != (rc = pthread_mutex_lock(&ds_ctx->lock))) { goto error; } @@ -2156,9 +2173,10 @@ static pmix_status_t _dstore_fetch(pmix_common_dstore_ctx_t *ds_ctx, break; } } else if (NULL == key) { + char *kname_ptr = PMIX_DS_KNAME_PTR(ds_ctx, addr); PMIX_OUTPUT_VERBOSE((10, pmix_gds_base_framework.framework_output, "%s:%d:%s: for rank %s:%u, found target key %s", - __FILE__, __LINE__, __func__, nspace, cur_rank, PMIX_DS_KNAME_PTR(ds_ctx, addr))); + __FILE__, __LINE__, __func__, nspace, cur_rank, kname_ptr)); uint8_t *data_ptr = PMIX_DS_DATA_PTR(ds_ctx, addr); size_t data_size = PMIX_DS_DATA_SIZE(ds_ctx, addr, data_ptr); @@ -2172,8 +2190,8 @@ static pmix_status_t _dstore_fetch(pmix_common_dstore_ctx_t *ds_ctx, PMIX_ERROR_LOG(rc); goto done; } - pmix_strncpy(info[kval_cnt - 1].key, PMIX_DS_KNAME_PTR(ds_ctx, addr), - PMIX_DS_KNAME_LEN(ds_ctx, addr)); + pmix_strncpy(info[kval_cnt - 1].key, kname_ptr, + PMIX_DS_KNAME_LEN(ds_ctx, kname_ptr)); pmix_value_xfer(&info[kval_cnt - 1].value, &val); PMIX_VALUE_DESTRUCT(&val); buffer.base_ptr = NULL; @@ -2464,10 +2482,29 @@ PMIX_EXPORT pmix_status_t pmix_common_dstor_del_nspace(pmix_common_dstore_ctx_t if (ns_map[map_idx].in_use && (ns_map[map_idx].data.tbl_idx == ns_map_data->tbl_idx)) { if (0 == strcmp(ns_map[map_idx].data.name, nspace)) { + /* Unmap corresponding memory regions and stop tracking this namespace */ + size_t nst_size = pmix_value_array_get_size(ds_ctx->ns_track_array); + if (nst_size && (dstor_track_idx >= 0)) { + if((dstor_track_idx + 1) > (int)nst_size) { + rc = PMIX_ERR_VALUE_OUT_OF_BOUNDS; + PMIX_ERROR_LOG(rc); + goto exit; + } + trk = pmix_value_array_get_item(ds_ctx->ns_track_array, dstor_track_idx); + if (true == trk->in_use) { + PMIX_DESTRUCT(trk); + } + } + /* Cleanup the mapping structure */ _esh_session_map_clean(ds_ctx, &ns_map[map_idx]); continue; + } else { + /* Count other namespaces belonging to this session. + * This is required to identify the moment where all + * namespaces are deleted and session can be removed as well + */ + in_use++; } - in_use++; } } @@ -2478,19 +2515,6 @@ PMIX_EXPORT pmix_status_t pmix_common_dstor_del_nspace(pmix_common_dstore_ctx_t PMIX_OUTPUT_VERBOSE((10, pmix_gds_base_framework.framework_output, "%s:%d:%s delete session for jobuid: %d", __FILE__, __LINE__, __func__, session_tbl[session_tbl_idx].jobuid)); - size = pmix_value_array_get_size(ds_ctx->ns_track_array); - if (size && (dstor_track_idx >= 0)) { - if((dstor_track_idx + 1) > (int)size) { - rc = PMIX_ERR_VALUE_OUT_OF_BOUNDS; - PMIX_ERROR_LOG(rc); - goto exit; - } - trk = pmix_value_array_get_item(ds_ctx->ns_track_array, dstor_track_idx); - if (true == trk->in_use) { - PMIX_DESTRUCT(trk); - pmix_value_array_remove_item(ds_ctx->ns_track_array, dstor_track_idx); - } - } _esh_session_release(ds_ctx, session_tbl_idx); } exit: @@ -2697,8 +2721,9 @@ static pmix_status_t _store_job_info(pmix_common_dstore_ctx_t *ds_ctx, ns_map_da pmix_cb_t cb; pmix_kval_t *kv; pmix_buffer_t buf; - pmix_kval_t *kv2 = NULL, *kvp; + pmix_kval_t kv2, *kvp; pmix_status_t rc = PMIX_SUCCESS; + pmix_info_t *ihost; PMIX_CONSTRUCT(&cb, pmix_cb_t); PMIX_CONSTRUCT(&buf, pmix_buffer_t); @@ -2720,33 +2745,55 @@ static pmix_status_t _store_job_info(pmix_common_dstore_ctx_t *ds_ctx, ns_map_da } PMIX_LIST_FOREACH(kv, &cb.kvs, pmix_kval_t) { - if ((PMIX_PROC_IS_V1(_client_peer(ds_ctx)) || PMIX_PROC_IS_V20(_client_peer(ds_ctx))) && - 0 != strncmp("pmix.", kv->key, 4) && - kv->value->type == PMIX_DATA_ARRAY) { - pmix_info_t *info; - size_t size, i; - info = kv->value->data.darray->array; - size = kv->value->data.darray->size; - - for (i = 0; i < size; i++) { - if (0 == strcmp(PMIX_LOCAL_PEERS, info[i].key)) { - kv2 = PMIX_NEW(pmix_kval_t); - kv2->key = strdup(kv->key); - PMIX_VALUE_XFER(rc, kv2->value, &info[i].value); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_RELEASE(kv2); - goto exit; + if (PMIX_CHECK_KEY(kv, PMIX_NODE_INFO_ARRAY)) { + /* earlier PMIx versions don't know how to handle + * the info arrays - what they need is a key-value + * pair where the key is the name of the node and + * the value is the local peers. So if the peer + * is earlier than 3.1.5, construct the necessary + * translation. Otherwise, ignore it as the hash + * component will handle it for them */ + if (PMIX_PEER_IS_EARLIER(ds_ctx->clients_peer, 3, 1, 5)) { + pmix_info_t *info; + size_t size, i; + /* if it is our local node, then we are going to pass + * all info */ + info = kv->value->data.darray->array; + size = kv->value->data.darray->size; + ihost = NULL; + for (i = 0; i < size; i++) { + if (PMIX_CHECK_KEY(&info[i], PMIX_HOSTNAME)) { + ihost = &info[i]; + break; } - PMIX_BFROPS_PACK(rc, pmix_globals.mypeer, &buf, kv2, 1, PMIX_KVAL); + } + if (NULL != ihost) { + PMIX_CONSTRUCT(&kv2, pmix_kval_t); + kv2.key = ihost->value.data.string; + kv2.value = kv->value; + PMIX_BFROPS_PACK(rc, pmix_globals.mypeer, &buf, &kv2, 1, PMIX_KVAL); if (PMIX_SUCCESS != rc) { PMIX_ERROR_LOG(rc); - PMIX_RELEASE(kv2); goto exit; } - PMIX_RELEASE(kv2); + /* if this host is us, then store each value as its own key */ + if (0 == strcmp(kv2.key, pmix_globals.hostname)) { + for (i = 0; i < size; i++) { + kv2.key = info[i].key; + kv2.value = &info[i].value; + PMIX_BFROPS_PACK(rc, pmix_globals.mypeer, &buf, &kv2, 1, PMIX_KVAL); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + goto exit; + } + } + } } } + } else if (PMIX_CHECK_KEY(kv, PMIX_APP_INFO_ARRAY) || + PMIX_CHECK_KEY(kv, PMIX_JOB_INFO_ARRAY) || + PMIX_CHECK_KEY(kv, PMIX_SESSION_INFO_ARRAY)) { + continue; } else { PMIX_BFROPS_PACK(rc, pmix_globals.mypeer, &buf, kv, 1, PMIX_KVAL); if (PMIX_SUCCESS != rc) { @@ -2804,12 +2851,14 @@ PMIX_EXPORT pmix_status_t pmix_common_dstor_register_job_info(pmix_common_dstore return rc; } + /* pickup all the job-level info by using rank=wildcard */ rc = _store_job_info(ds_ctx, ns_map, &proc); if (PMIX_SUCCESS != rc) { PMIX_ERROR_LOG(rc); return rc; } + /* get the rank-level info for each rank in the job */ for (rank=0; rank < ns->nprocs; rank++) { proc.rank = rank; rc = _store_job_info(ds_ctx, ns_map, &proc); diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/common/dstore/dstore_segment.c b/opal/mca/pmix/pmix3x/pmix/src/mca/common/dstore/dstore_segment.c index 69ec1ba577f..362eeb1d56f 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/common/dstore/dstore_segment.c +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/common/dstore/dstore_segment.c @@ -65,16 +65,22 @@ PMIX_EXPORT int pmix_common_dstor_getpagesize(void) PMIX_EXPORT size_t pmix_common_dstor_getcacheblocksize(void) { - size_t cache_line = 0; + long cache_line = 0; #if defined(_SC_LEVEL1_DCACHE_LINESIZE) cache_line = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); #endif #if (defined(HAVE_SYS_AUXV_H)) && (defined(AT_DCACHEBSIZE)) - if (0 == cache_line) { - cache_line = getauxval(AT_DCACHEBSIZE); + if (0 >= cache_line) { + unsigned long auxval; + if( (auxval = getauxval(AT_DCACHEBSIZE)) ){ + cache_line = auxval; + } } #endif + if (0 >= cache_line) { + cache_line = 64; + } return cache_line; } diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/gds/base/gds_base_fns.c b/opal/mca/pmix/pmix3x/pmix/src/mca/gds/base/gds_base_fns.c index abec7a744da..505536a31d6 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/gds/base/gds_base_fns.c +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/gds/base/gds_base_fns.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2015-2019 Intel, Inc. All rights reserved. + * Copyright (c) 2015-2017 Intel, Inc. All rights reserved. * Copyright (c) 2016 Mellanox Technologies, Inc. * All rights reserved. * Copyright (c) 2018 IBM Corporation. All rights reserved. diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/gds/ds12/configure.m4 b/opal/mca/pmix/pmix3x/pmix/src/mca/gds/ds12/configure.m4 new file mode 100644 index 00000000000..2902ffad943 --- /dev/null +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/gds/ds12/configure.m4 @@ -0,0 +1,34 @@ +# -*- shell-script -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2013 Sandia National Laboratories. All rights reserved. +# Copyright (c) 2014-2020 Intel, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# MCA_gds_ds12_CONFIG([action-if-can-compile], +# [action-if-cant-compile]) +# ------------------------------------------------ +AC_DEFUN([MCA_pmix_gds_ds12_CONFIG],[ + AC_CONFIG_FILES([src/mca/gds/ds12/Makefile]) + + AS_IF([test "$pmix_fcntl_flock_happy" = "yes"], + [$1], + [AS_IF([test "$pmix_pthread_rwlockattr_setpshared" = "yes" && test "$pmix_pthread_process_shared" = "yes"], + [$1], [$2])]) + +])dnl diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/gds/ds12/gds_ds12_base.c b/opal/mca/pmix/pmix3x/pmix/src/mca/gds/ds12/gds_ds12_base.c index cdfcb252709..43c1008d561 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/gds/ds12/gds_ds12_base.c +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/gds/ds12/gds_ds12_base.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018 Intel, Inc. All rights reserved. + * Copyright (c) 2015-2019 Intel, Inc. All rights reserved. * Copyright (c) 2016-2018 IBM Corporation. All rights reserved. * Copyright (c) 2016-2018 Mellanox Technologies, Inc. * All rights reserved. @@ -33,7 +33,7 @@ static pmix_status_t ds12_init(pmix_info_t info[], size_t ninfo) pmix_status_t rc = PMIX_SUCCESS; pmix_common_dstore_file_cbs_t *dstore_file_cbs = NULL; - if (!PMIX_PROC_IS_SERVER(pmix_globals.mypeer)) { + if (!PMIX_PEER_IS_SERVER(pmix_globals.mypeer)) { dstore_file_cbs = &pmix_ds20_file_module; } ds12_ctx = pmix_common_dstor_init("ds12", info, ninfo, @@ -94,7 +94,7 @@ static pmix_status_t ds12_cache_job_info(struct pmix_namespace_t *ns, static pmix_status_t ds12_register_job_info(struct pmix_peer_t *pr, pmix_buffer_t *reply) { - if (PMIX_PROC_IS_V1(pr)) { + if (PMIX_PEER_IS_V1(pr)) { ds12_ctx->file_cbs = &pmix_ds12_file_module; } else { ds12_ctx->file_cbs = &pmix_ds20_file_module; diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/gds/ds12/gds_ds12_component.c b/opal/mca/pmix/pmix3x/pmix/src/mca/gds/ds12/gds_ds12_component.c index 9f52d4fe996..54d29c69ef2 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/gds/ds12/gds_ds12_component.c +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/gds/ds12/gds_ds12_component.c @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2016-2018 Intel, Inc. All rights reserved. + * Copyright (c) 2016-2019 Intel, Inc. All rights reserved. * Copyright (c) 2017 Mellanox Technologies, Inc. * All rights reserved. * $COPYRIGHT$ @@ -75,7 +75,7 @@ static int component_open(void) static int component_query(pmix_mca_base_module_t **module, int *priority) { /* launchers cannot use the dstore */ - if (PMIX_PROC_IS_LAUNCHER(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_LAUNCHER(pmix_globals.mypeer)) { *priority = 0; *module = NULL; return PMIX_ERROR; diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/gds/ds12/gds_ds12_lock_fcntl.c b/opal/mca/pmix/pmix3x/pmix/src/mca/gds/ds12/gds_ds12_lock_fcntl.c index 477e91465fb..53c805580fc 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/gds/ds12/gds_ds12_lock_fcntl.c +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/gds/ds12/gds_ds12_lock_fcntl.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018 Intel, Inc. All rights reserved. + * Copyright (c) 2015-2019 Intel, Inc. All rights reserved. * Copyright (c) 2016 IBM Corporation. All rights reserved. * Copyright (c) 2016-2018 Mellanox Technologies, Inc. * All rights reserved. @@ -112,7 +112,7 @@ pmix_status_t pmix_gds_ds12_lock_init(pmix_common_dstor_lock_ctx_t *ctx, const c PMIX_OUTPUT_VERBOSE((10, pmix_gds_base_framework.framework_output, "%s:%d:%s _lockfile_name: %s", __FILE__, __LINE__, __func__, lock_ctx->lockfile)); - if (PMIX_PROC_IS_SERVER(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_SERVER(pmix_globals.mypeer)) { lock_ctx->lockfd = open(lock_ctx->lockfile, O_CREAT | O_RDWR | O_EXCL, 0600); /* if previous launch was crashed, the lockfile might not be deleted and unlocked, @@ -157,7 +157,7 @@ pmix_status_t pmix_gds_ds12_lock_init(pmix_common_dstor_lock_ctx_t *ctx, const c } if (0 > lock_ctx->lockfd) { close(lock_ctx->lockfd); - if (PMIX_PROC_IS_SERVER(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_SERVER(pmix_globals.mypeer)) { unlink(lock_ctx->lockfile); } } @@ -180,7 +180,7 @@ void pmix_ds12_lock_finalize(pmix_common_dstor_lock_ctx_t *lock_ctx) close(fcntl_lock->lockfd); - if (PMIX_PROC_IS_SERVER(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_SERVER(pmix_globals.mypeer)) { unlink(fcntl_lock->lockfile); } free(fcntl_lock); diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/gds/ds12/gds_ds12_lock_pthread.c b/opal/mca/pmix/pmix3x/pmix/src/mca/gds/ds12/gds_ds12_lock_pthread.c index 163015856eb..57177ef5003 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/gds/ds12/gds_ds12_lock_pthread.c +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/gds/ds12/gds_ds12_lock_pthread.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018 Intel, Inc. All rights reserved. + * Copyright (c) 2015-2020 Intel, Inc. All rights reserved. * Copyright (c) 2016 IBM Corporation. All rights reserved. * Copyright (c) 2016-2018 Mellanox Technologies, Inc. * All rights reserved. @@ -105,7 +105,7 @@ pmix_status_t pmix_gds_ds12_lock_init(pmix_common_dstor_lock_ctx_t *ctx, const c PMIX_OUTPUT_VERBOSE((10, pmix_gds_base_framework.framework_output, "%s:%d:%s _lockfile_name: %s", __FILE__, __LINE__, __func__, lock_ctx->lockfile)); - if (PMIX_PROC_IS_SERVER(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_SERVER(pmix_globals.mypeer)) { if (PMIX_SUCCESS != (rc = pmix_pshmem.segment_create(lock_ctx->segment, lock_ctx->lockfile, size))) { PMIX_ERROR_LOG(rc); @@ -138,21 +138,24 @@ pmix_status_t pmix_gds_ds12_lock_init(pmix_common_dstor_lock_ctx_t *ctx, const c PMIX_ERROR_LOG(rc); goto error; } -#ifdef HAVE_PTHREAD_SETKIND +#if PMIX_PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP if (0 != pthread_rwlockattr_setkind_np(&attr, PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP)) { pthread_rwlockattr_destroy(&attr); - PMIX_ERROR_LOG(PMIX_ERR_INIT); + rc = PMIX_ERR_INIT; + PMIX_ERROR_LOG(rc); goto error; } #endif if (0 != pthread_rwlock_init(lock_ctx->rwlock, &attr)) { pthread_rwlockattr_destroy(&attr); - PMIX_ERROR_LOG(PMIX_ERR_INIT); + rc = PMIX_ERR_INIT; + PMIX_ERROR_LOG(rc); goto error; } if (0 != pthread_rwlockattr_destroy(&attr)) { - PMIX_ERROR_LOG(PMIX_ERR_INIT); + rc = PMIX_ERR_INIT; + PMIX_ERROR_LOG(rc); goto error; } diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/gds/ds21/configure.m4 b/opal/mca/pmix/pmix3x/pmix/src/mca/gds/ds21/configure.m4 new file mode 100644 index 00000000000..673ffce5f9e --- /dev/null +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/gds/ds21/configure.m4 @@ -0,0 +1,32 @@ +# -*- shell-script -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2013 Sandia National Laboratories. All rights reserved. +# Copyright (c) 2014-2020 Intel, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# MCA_gds_ds21_CONFIG([action-if-can-compile], +# [action-if-cant-compile]) +# ------------------------------------------------ +AC_DEFUN([MCA_pmix_gds_ds21_CONFIG],[ + AC_CONFIG_FILES([src/mca/gds/ds21/Makefile]) + + AS_IF([test "$pmix_pthread_mutexattr_setpshared" = "yes" && test "$pmix_pthread_process_shared" = "yes"], + [$1], [$2]) + +])dnl diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/gds/ds21/gds_ds21_component.c b/opal/mca/pmix/pmix3x/pmix/src/mca/gds/ds21/gds_ds21_component.c index c1f42944df8..14e4e49b752 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/gds/ds21/gds_ds21_component.c +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/gds/ds21/gds_ds21_component.c @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2016-2018 Intel, Inc. All rights reserved. + * Copyright (c) 2016-2019 Intel, Inc. All rights reserved. * Copyright (c) 2017-2018 Mellanox Technologies, Inc. * All rights reserved. * $COPYRIGHT$ @@ -75,7 +75,7 @@ static int component_open(void) static int component_query(pmix_mca_base_module_t **module, int *priority) { /* launchers cannot use the dstore */ - if (PMIX_PROC_IS_LAUNCHER(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_LAUNCHER(pmix_globals.mypeer)) { *priority = 0; *module = NULL; return PMIX_ERROR; diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/gds/ds21/gds_ds21_lock_pthread.c b/opal/mca/pmix/pmix3x/pmix/src/mca/gds/ds21/gds_ds21_lock_pthread.c index 99713f5651e..a9194a8a68b 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/gds/ds21/gds_ds21_lock_pthread.c +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/gds/ds21/gds_ds21_lock_pthread.c @@ -2,7 +2,7 @@ * Copyright (c) 2018 Mellanox Technologies, Inc. * All rights reserved. * - * Copyright (c) 2018-2019 Intel, Inc. All rights reserved. + * Copyright (c) 2018-2020 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -88,7 +88,7 @@ static void ncon(lock_item_t *p) { static void ldes(lock_item_t *p) { uint32_t i; - if(PMIX_PROC_IS_SERVER(pmix_globals.mypeer)) { + if(PMIX_PEER_IS_SERVER(pmix_globals.mypeer)) { segment_hdr_t *seg_hdr = (segment_hdr_t *)p->seg_desc->seg_info.seg_base_addr; if (p->lockfile) { unlink(p->lockfile); @@ -150,7 +150,7 @@ pmix_status_t pmix_gds_ds21_lock_init(pmix_common_dstor_lock_ctx_t *ctx, const c PMIX_OUTPUT_VERBOSE((10, pmix_gds_base_framework.framework_output, "%s:%d:%s local_size %d", __FILE__, __LINE__, __func__, local_size)); - if (PMIX_PROC_IS_SERVER(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_SERVER(pmix_globals.mypeer)) { size_t seg_align_size; size_t seg_hdr_size; diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/gds/hash/gds_hash.c b/opal/mca/pmix/pmix3x/pmix/src/mca/gds/hash/gds_hash.c index 5e6a5341bd2..de12f51c148 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/gds/hash/gds_hash.c +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/gds/hash/gds_hash.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2019 Intel, Inc. All rights reserved. + * Copyright (c) 2015-2020 Intel, Inc. All rights reserved. * Copyright (c) 2016-2018 IBM Corporation. All rights reserved. * Copyright (c) 2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. @@ -36,6 +36,7 @@ #include "src/class/pmix_list.h" #include "src/client/pmix_client_ops.h" #include "src/server/pmix_server_ops.h" +#include "src/mca/ptl/base/base.h" #include "src/util/argv.h" #include "src/util/compress.h" #include "src/mca/preg/preg.h" @@ -125,6 +126,8 @@ pmix_gds_base_module_t pmix_hash_module = { #define PMIX_HASH_PROC_MAP 0x00000010 #define PMIX_HASH_NODE_MAP 0x00000020 +static pmix_list_t mysessions, myjobs; + /**********************************************/ /* struct definitions */ typedef struct { @@ -242,7 +245,7 @@ static PMIX_CLASS_INSTANCE(pmix_apptrkr_t, static void ndinfocon(pmix_nodeinfo_t *p) { - p->nodeid = 0; + p->nodeid = UINT32_MAX; p->hostname = NULL; PMIX_CONSTRUCT(&p->info, pmix_list_t); } @@ -257,19 +260,70 @@ static PMIX_CLASS_INSTANCE(pmix_nodeinfo_t, pmix_list_item_t, ndinfocon, ndinfodes); +static pmix_job_t* get_tracker(const pmix_nspace_t nspace, bool create) +{ + pmix_job_t *trk, *t; + pmix_namespace_t *ns, *nptr; + + /* find the hash table for this nspace */ + trk = NULL; + PMIX_LIST_FOREACH(t, &myjobs, pmix_job_t) { + if (0 == strcmp(nspace, t->ns)) { + trk = t; + break; + } + } + if (NULL == trk && create) { + /* create one */ + trk = PMIX_NEW(pmix_job_t); + trk->ns = strdup(nspace); + /* see if we already have this nspace */ + nptr = NULL; + PMIX_LIST_FOREACH(ns, &pmix_globals.nspaces, pmix_namespace_t) { + if (0 == strcmp(ns->nspace, nspace)) { + nptr = ns; + break; + } + } + if (NULL == nptr) { + nptr = PMIX_NEW(pmix_namespace_t); + if (NULL == nptr) { + PMIX_RELEASE(trk); + return NULL; + } + nptr->nspace = strdup(nspace); + pmix_list_append(&pmix_globals.nspaces, &nptr->super); + } + PMIX_RETAIN(nptr); + trk->nptr = nptr; + pmix_list_append(&myjobs, &trk->super); + } + return trk; +} + +/********************************************** + * Forward Declarations + **********************************************/ +static pmix_status_t fetch_nodeinfo(const char *key, pmix_list_t *tgt, + pmix_info_t *info, size_t ninfo, + pmix_list_t *kvs); +static pmix_status_t fetch_appinfo(const char *key, pmix_list_t *tgt, + pmix_info_t *info, size_t ninfo, + pmix_list_t *kvs); + /**********************************************/ /* process a node array - contains an array of * node-level info for a single node. Either the * nodeid, hostname, or both must be included * in the array to identify the node */ -static pmix_status_t process_node_array(pmix_info_t *info, +static pmix_status_t process_node_array(pmix_value_t *val, pmix_list_t *tgt) { size_t size, j; pmix_info_t *iptr; pmix_status_t rc = PMIX_SUCCESS; - pmix_kval_t *kp2, *k1, *knext; + pmix_kval_t *kp2, *k1; pmix_list_t cache; pmix_nodeinfo_t *nd = NULL, *ndptr; bool update; @@ -278,14 +332,14 @@ static pmix_status_t process_node_array(pmix_info_t *info, "PROCESSING NODE ARRAY"); /* array of node-level info for a specific node */ - if (PMIX_DATA_ARRAY != info->value.type) { + if (PMIX_DATA_ARRAY != val->type) { PMIX_ERROR_LOG(PMIX_ERR_TYPE_MISMATCH); return PMIX_ERR_TYPE_MISMATCH; } /* setup arrays */ - size = info->value.data.darray->size; - iptr = (pmix_info_t*)info->value.data.darray->array; + size = val->data.darray->size; + iptr = (pmix_info_t*)val->data.darray->array; PMIX_CONSTRUCT(&cache, pmix_list_t); /* cache the values while searching for the nodeid @@ -335,7 +389,7 @@ static pmix_status_t process_node_array(pmix_info_t *info, * provided list */ update = false; PMIX_LIST_FOREACH(ndptr, tgt, pmix_nodeinfo_t) { - if (ndptr->nodeid == nd->nodeid || + if ((ndptr->nodeid != UINT32_MAX && (ndptr->nodeid == nd->nodeid)) || (NULL != ndptr->hostname && NULL != nd->hostname && 0 == strcmp(ndptr->hostname, nd->hostname))) { /* we assume that the data is updating the current * values */ @@ -355,7 +409,7 @@ static pmix_status_t process_node_array(pmix_info_t *info, /* if this is an update, we have to ensure each data * item only appears once on the list */ if (update) { - PMIX_LIST_FOREACH_SAFE(k1, knext, &nd->info, pmix_kval_t) { + PMIX_LIST_FOREACH(k1, &nd->info, pmix_kval_t) { if (PMIX_CHECK_KEY(k1, kp2->key)) { pmix_list_remove_item(&nd->info, &k1->super); PMIX_RELEASE(k1); @@ -368,7 +422,9 @@ static pmix_status_t process_node_array(pmix_info_t *info, } PMIX_LIST_DESTRUCT(&cache); - pmix_list_append(tgt, &nd->super); + if (!update) { + pmix_list_append(tgt, &nd->super); + } return PMIX_SUCCESS; } @@ -378,7 +434,7 @@ static pmix_status_t process_node_array(pmix_info_t *info, * it is assumed that only app is in the job. * This assumption is checked and generates * an error if violated */ -static pmix_status_t process_app_array(pmix_info_t *info, +static pmix_status_t process_app_array(pmix_value_t *val, pmix_job_t *trk) { pmix_list_t cache, ncache; @@ -400,7 +456,7 @@ static pmix_status_t process_app_array(pmix_info_t *info, } /* array of app-level info */ - if (PMIX_DATA_ARRAY != info->value.type) { + if (PMIX_DATA_ARRAY != val->type) { PMIX_ERROR_LOG(PMIX_ERR_TYPE_MISMATCH); return PMIX_ERR_TYPE_MISMATCH; } @@ -408,8 +464,8 @@ static pmix_status_t process_app_array(pmix_info_t *info, /* setup arrays and lists */ PMIX_CONSTRUCT(&cache, pmix_list_t); PMIX_CONSTRUCT(&ncache, pmix_list_t); - size = info->value.data.darray->size; - iptr = (pmix_info_t*)info->value.data.darray->array; + size = val->data.darray->size; + iptr = (pmix_info_t*)val->data.darray->array; for (j=0; j < size; j++) { if (PMIX_CHECK_KEY(&iptr[j], PMIX_APPNUM)) { @@ -429,7 +485,7 @@ static pmix_status_t process_app_array(pmix_info_t *info, app = PMIX_NEW(pmix_apptrkr_t); app->appnum = appnum; } else if (PMIX_CHECK_KEY(&iptr[j], PMIX_NODE_INFO_ARRAY)) { - if (PMIX_SUCCESS != (rc = process_node_array(&iptr[j], &ncache))) { + if (PMIX_SUCCESS != (rc = process_node_array(&iptr[j].value, &ncache))) { PMIX_ERROR_LOG(rc); goto release; } @@ -536,11 +592,11 @@ static pmix_status_t process_job_array(pmix_info_t *info, PMIX_CONSTRUCT(&cache, pmix_list_t); for (j=0; j < size; j++) { if (PMIX_CHECK_KEY(&iptr[j], PMIX_APP_INFO_ARRAY)) { - if (PMIX_SUCCESS != (rc = process_app_array(&iptr[j], trk))) { + if (PMIX_SUCCESS != (rc = process_app_array(&iptr[j].value, trk))) { return rc; } } else if (PMIX_CHECK_KEY(&iptr[j], PMIX_NODE_INFO_ARRAY)) { - if (PMIX_SUCCESS != (rc = process_node_array(&iptr[j], &trk->nodeinfo))) { + if (PMIX_SUCCESS != (rc = process_node_array(&iptr[j].value, &trk->nodeinfo))) { PMIX_ERROR_LOG(rc); return rc; } @@ -595,12 +651,114 @@ static pmix_status_t process_job_array(pmix_info_t *info, return rc; } pmix_list_append(&trk->jobinfo, &kp2->super); + /* check for job size */ + if (PMIX_CHECK_KEY(&iptr[j], PMIX_JOB_SIZE) && + !(PMIX_HASH_JOB_SIZE & *flags)) { + trk->nptr->nprocs = iptr[j].value.data.uint32; + *flags |= PMIX_HASH_JOB_SIZE; + } } } return PMIX_SUCCESS; } -static pmix_list_t mysessions, myjobs; +static pmix_status_t process_session_array(pmix_value_t *val, + pmix_job_t *trk) +{ + pmix_session_t *s = NULL, *sptr; + size_t j, size; + pmix_info_t *iptr; + pmix_list_t cache, ncache; + pmix_status_t rc; + pmix_kval_t *kp2; + pmix_nodeinfo_t *nd; + uint32_t sid; + + /* array of session-level info */ + if (PMIX_DATA_ARRAY != val->type) { + PMIX_ERROR_LOG(PMIX_ERR_BAD_PARAM); + return PMIX_ERR_TYPE_MISMATCH; + } + size = val->data.darray->size; + iptr = (pmix_info_t*)val->data.darray->array; + + PMIX_CONSTRUCT(&cache, pmix_list_t); + PMIX_CONSTRUCT(&ncache, pmix_list_t); + for (j=0; j < size; j++) { + if (PMIX_CHECK_KEY(&iptr[j], PMIX_SESSION_ID)) { + PMIX_VALUE_GET_NUMBER(rc, &iptr[j].value, sid, uint32_t); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_LIST_DESTRUCT(&cache); + PMIX_LIST_DESTRUCT(&ncache); + return rc; + } + /* see if we already have this session - it could have + * been defined by a separate PMIX_SESSION_ID key */ + PMIX_LIST_FOREACH(sptr, &mysessions, pmix_session_t) { + if (sptr->session == sid) { + s = sptr; + break; + } + } + if (NULL == s) { + /* wasn't found, so create one */ + s = PMIX_NEW(pmix_session_t); + s->session = sid; + pmix_list_append(&mysessions, &s->super); + } + } else if (PMIX_CHECK_KEY(&iptr[j], PMIX_NODE_INFO_ARRAY)) { + if (PMIX_SUCCESS != (rc = process_node_array(&iptr[j].value, &ncache))) { + PMIX_ERROR_LOG(rc); + PMIX_LIST_DESTRUCT(&cache); + PMIX_LIST_DESTRUCT(&ncache); + return rc; + } + } else { + kp2 = PMIX_NEW(pmix_kval_t); + kp2->key = strdup(iptr[j].key); + kp2->value = (pmix_value_t*)malloc(sizeof(pmix_value_t)); + PMIX_VALUE_XFER(rc, kp2->value, &iptr[j].value); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(kp2); + PMIX_LIST_DESTRUCT(&cache); + PMIX_LIST_DESTRUCT(&ncache); + return rc; + } + pmix_list_append(&cache, &kp2->super); + } + } + if (NULL == s) { + /* this is not allowed to happen - they are required + * to provide us with a session ID per the standard */ + PMIX_LIST_DESTRUCT(&cache); + PMIX_LIST_DESTRUCT(&ncache); + rc = PMIX_ERR_BAD_PARAM; + PMIX_ERROR_LOG(rc); + return rc; + } + /* point the job at it */ + if (NULL != trk->session) { + PMIX_RELEASE(trk->session); + } + PMIX_RETAIN(s); + trk->session = s; + /* transfer the data across */ + kp2 = (pmix_kval_t*)pmix_list_remove_first(&cache); + while (NULL != kp2) { + pmix_list_append(&s->sessioninfo, &kp2->super); + kp2 = (pmix_kval_t*)pmix_list_remove_first(&cache); + } + PMIX_LIST_DESTRUCT(&cache); + nd = (pmix_nodeinfo_t*)pmix_list_remove_first(&ncache); + while (NULL != nd) { + pmix_list_append(&s->nodeinfo, &nd->super); + nd = (pmix_nodeinfo_t*)pmix_list_remove_first(&ncache); + } + PMIX_LIST_DESTRUCT(&ncache); + return PMIX_SUCCESS; +} static pmix_status_t hash_init(pmix_info_t info[], size_t ninfo) { @@ -647,20 +805,18 @@ static pmix_status_t hash_assign_module(pmix_info_t *info, size_t ninfo, return PMIX_SUCCESS; } -static pmix_status_t store_map(pmix_hash_table_t *ht, +static pmix_status_t store_map(pmix_job_t *trk, char **nodes, char **ppn, uint32_t flags) { pmix_status_t rc; - pmix_value_t *val; size_t m, n; - pmix_info_t *iptr, *info; pmix_rank_t rank; - bool updated; - pmix_kval_t *kp2; + pmix_kval_t *kp1, *kp2; char **procs; uint32_t totalprocs=0; - bool localldr; + pmix_hash_table_t *ht = &trk->internal; + pmix_nodeinfo_t *nd, *ndptr; pmix_output_verbose(2, pmix_gds_base_framework.framework_output, "[%s:%d] gds:hash:store_map", @@ -689,133 +845,99 @@ static pmix_status_t store_map(pmix_hash_table_t *ht, } for (n=0; NULL != nodes[n]; n++) { - /* check and see if we already have data for this node */ - val = NULL; - rc = pmix_hash_fetch(ht, PMIX_RANK_WILDCARD, nodes[n], &val); - if (PMIX_SUCCESS == rc && NULL != val) { - /* already have some data. See if we have the list of local peers */ - if (PMIX_DATA_ARRAY != val->type || - NULL == val->data.darray || - PMIX_INFO != val->data.darray->type || - 0 == val->data.darray->size) { - /* something is wrong */ - PMIX_VALUE_RELEASE(val); - PMIX_ERROR_LOG(PMIX_ERR_INVALID_VAL); - return PMIX_ERR_INVALID_VAL; - } - iptr = (pmix_info_t*)val->data.darray->array; - updated = false; - localldr = false; - for (m=0; m < val->data.darray->size; m++) { - if (PMIX_CHECK_KEY(&iptr[m], PMIX_LOCAL_PEERS)) { - /* we will update this entry */ - if (NULL != iptr[m].value.data.string) { - free(iptr[m].value.data.string); - } - iptr[m].value.data.string = strdup(ppn[n]); - updated = true; // no need to add the local_peers to the array - } else if (PMIX_CHECK_KEY(&iptr[m], PMIX_LOCALLDR)) { - rank = strtoul(ppn[n], NULL, 10); - iptr[m].value.data.rank = rank; - localldr = true; // no need to add localldr to the array - } - } - if (!updated || !localldr) { - /* append this entry to the current data */ - kp2 = PMIX_NEW(pmix_kval_t); - if (NULL == kp2) { - return PMIX_ERR_NOMEM; - } - kp2->key = strdup(nodes[n]); - kp2->value = (pmix_value_t*)malloc(sizeof(pmix_value_t)); - if (NULL == kp2->value) { - PMIX_RELEASE(kp2); - return PMIX_ERR_NOMEM; - } - kp2->value->type = PMIX_DATA_ARRAY; - kp2->value->data.darray = (pmix_data_array_t*)malloc(sizeof(pmix_data_array_t)); - if (NULL == kp2->value->data.darray) { - PMIX_RELEASE(kp2); - return PMIX_ERR_NOMEM; - } - kp2->value->data.darray->type = PMIX_INFO; - /* if we didn't update the local leader, then we will - * add it here */ - m = 0; - if (!localldr) { - kp2->value->data.darray->size = val->data.darray->size + 1; - ++m; - } - /* if they didn't update the local peers, then we add it here */ - if (!updated) { - kp2->value->data.darray->size = val->data.darray->size + 1; - ++m; + /* check and see if we already have this node */ + nd = NULL; + PMIX_LIST_FOREACH(ndptr, &trk->nodeinfo, pmix_nodeinfo_t) { + if (NULL != ndptr->hostname && 0 == strcmp(ndptr->hostname, nodes[n])) { + /* we assume that the data is updating the current + * values */ + if (NULL == ndptr->hostname) { + ndptr->hostname = strdup(nodes[n]); } - PMIX_INFO_CREATE(info, kp2->value->data.darray->size); - if (NULL == info) { - PMIX_RELEASE(kp2); - return PMIX_ERR_NOMEM; - } - /* copy the pre-existing data across */ - for (m=0; m < val->data.darray->size; m++) { - PMIX_INFO_XFER(&info[m], &iptr[m]); - } - if (!updated) { - PMIX_INFO_LOAD(&info[kp2->value->data.darray->size-m], PMIX_LOCAL_PEERS, ppn[n], PMIX_STRING); - --m; - } - if (!localldr) { - rank = strtoul(ppn[n], NULL, 10); - PMIX_INFO_LOAD(&info[kp2->value->data.darray->size-m], PMIX_LOCALLDR, &rank, PMIX_PROC_RANK); - --m; - } - kp2->value->data.darray->array = info; - if (PMIX_SUCCESS != (rc = pmix_hash_store(ht, PMIX_RANK_WILDCARD, kp2))) { - PMIX_ERROR_LOG(rc); - PMIX_RELEASE(kp2); - return rc; - } - PMIX_RELEASE(kp2); - } - } else { - /* store the list as-is */ - kp2 = PMIX_NEW(pmix_kval_t); - if (NULL == kp2) { - return PMIX_ERR_NOMEM; - } - kp2->key = strdup(nodes[n]); - kp2->value = (pmix_value_t*)malloc(sizeof(pmix_value_t)); - if (NULL == kp2->value) { - PMIX_RELEASE(kp2); - return PMIX_ERR_NOMEM; - } - kp2->value->type = PMIX_DATA_ARRAY; - kp2->value->data.darray = (pmix_data_array_t*)malloc(sizeof(pmix_data_array_t)); - if (NULL == kp2->value->data.darray) { - PMIX_RELEASE(kp2); - return PMIX_ERR_NOMEM; - } - kp2->value->data.darray->type = PMIX_INFO; - PMIX_INFO_CREATE(info, 2); - if (NULL == info) { - PMIX_RELEASE(kp2); - return PMIX_ERR_NOMEM; + nd = ndptr; + break; } - PMIX_INFO_LOAD(&info[0], PMIX_LOCAL_PEERS, ppn[n], PMIX_STRING); - rank = strtoul(ppn[n], NULL, 10); - PMIX_INFO_LOAD(&info[1], PMIX_LOCALLDR, &rank, PMIX_PROC_RANK); - kp2->value->data.darray->array = info; - kp2->value->data.darray->size = 2; - if (PMIX_SUCCESS != (rc = pmix_hash_store(ht, PMIX_RANK_WILDCARD, kp2))) { - PMIX_ERROR_LOG(rc); - PMIX_RELEASE(kp2); - return rc; + } + if (NULL == nd) { + nd = PMIX_NEW(pmix_nodeinfo_t); + nd->hostname = strdup(nodes[n]); + pmix_list_append(&trk->nodeinfo, &nd->super); + } + /* store the proc list as-is */ + kp2 = PMIX_NEW(pmix_kval_t); + if (NULL == kp2) { + return PMIX_ERR_NOMEM; + } + kp2->key = strdup(PMIX_LOCAL_PEERS); + kp2->value = (pmix_value_t*)malloc(sizeof(pmix_value_t)); + if (NULL == kp2->value) { + PMIX_RELEASE(kp2); + return PMIX_ERR_NOMEM; + } + kp2->value->type = PMIX_STRING; + kp2->value->data.string = strdup(ppn[n]); + /* ensure this item only appears once on the list */ + PMIX_LIST_FOREACH(kp1, &nd->info, pmix_kval_t) { + if (PMIX_CHECK_KEY(kp1, kp2->key)) { + pmix_list_remove_item(&nd->info, &kp1->super); + PMIX_RELEASE(kp1); + break; } + } + pmix_list_append(&nd->info, &kp2->super); + + /* save the local leader */ + rank = strtoul(ppn[n], NULL, 10); + kp2 = PMIX_NEW(pmix_kval_t); + if (NULL == kp2) { + return PMIX_ERR_NOMEM; + } + kp2->key = strdup(PMIX_LOCALLDR); + kp2->value = (pmix_value_t*)malloc(sizeof(pmix_value_t)); + if (NULL == kp2->value) { PMIX_RELEASE(kp2); + return PMIX_ERR_NOMEM; + } + kp2->value->type = PMIX_PROC_RANK; + kp2->value->data.rank = rank; + /* ensure this item only appears once on the list */ + PMIX_LIST_FOREACH(kp1, &nd->info, pmix_kval_t) { + if (PMIX_CHECK_KEY(kp1, kp2->key)) { + pmix_list_remove_item(&nd->info, &kp1->super); + PMIX_RELEASE(kp1); + break; + } } + pmix_list_append(&nd->info, &kp2->super); + /* split the list of procs so we can store their * individual location data */ procs = pmix_argv_split(ppn[n], ','); + /* save the local size in case they don't + * give it to us */ + kp2 = PMIX_NEW(pmix_kval_t); + if (NULL == kp2) { + return PMIX_ERR_NOMEM; + } + kp2->key = strdup(PMIX_LOCAL_SIZE); + kp2->value = (pmix_value_t*)malloc(sizeof(pmix_value_t)); + if (NULL == kp2->value) { + PMIX_RELEASE(kp2); + return PMIX_ERR_NOMEM; + } + kp2->value->type = PMIX_UINT32; + kp2->value->data.uint32 = pmix_argv_count(procs); + /* ensure this item only appears once on the list */ + PMIX_LIST_FOREACH(kp1, &nd->info, pmix_kval_t) { + if (PMIX_CHECK_KEY(kp1, kp2->key)) { + pmix_list_remove_item(&nd->info, &kp1->super); + PMIX_RELEASE(kp1); + break; + } + } + pmix_list_append(&nd->info, &kp2->super); + /* track total procs in job in case they + * didn't give it to us */ totalprocs += pmix_argv_count(procs); for (m=0; NULL != procs[m]; m++) { /* store the hostname for each proc */ @@ -825,6 +947,10 @@ static pmix_status_t store_map(pmix_hash_table_t *ht, kp2->value->type = PMIX_STRING; kp2->value->data.string = strdup(nodes[n]); rank = strtol(procs[m], NULL, 10); + pmix_output_verbose(2, pmix_gds_base_framework.framework_output, + "[%s:%d] gds:hash:store_map for [%s:%u]: key %s", + pmix_globals.myid.nspace, pmix_globals.myid.rank, + trk->ns, rank, kp2->key); if (PMIX_SUCCESS != (rc = pmix_hash_store(ht, rank, kp2))) { PMIX_ERROR_LOG(rc); PMIX_RELEASE(kp2); @@ -908,6 +1034,8 @@ static pmix_status_t store_map(pmix_hash_table_t *ht, return rc; } PMIX_RELEASE(kp2); // maintain acctg + flags |= PMIX_HASH_JOB_SIZE; + trk->nptr->nprocs = totalprocs; } /* if they didn't provide a value for max procs, just @@ -925,6 +1053,7 @@ static pmix_status_t store_map(pmix_hash_table_t *ht, return rc; } PMIX_RELEASE(kp2); // maintain acctg + flags |= PMIX_HASH_MAX_PROCS; } @@ -935,7 +1064,7 @@ pmix_status_t hash_cache_job_info(struct pmix_namespace_t *ns, pmix_info_t info[], size_t ninfo) { pmix_namespace_t *nptr = (pmix_namespace_t*)ns; - pmix_job_t *trk, *t; + pmix_job_t *trk; pmix_session_t *s = NULL, *sptr; pmix_hash_table_t *ht; pmix_kval_t *kp2, *kvptr; @@ -947,32 +1076,17 @@ pmix_status_t hash_cache_job_info(struct pmix_namespace_t *ns, pmix_status_t rc=PMIX_SUCCESS; size_t n, j, size, len; uint32_t flags = 0; - pmix_list_t cache, ncache; - pmix_nodeinfo_t *nd; + pmix_nodeinfo_t *nd, *ndptr; + pmix_apptrkr_t *apptr; pmix_output_verbose(2, pmix_gds_base_framework.framework_output, - "[%s:%d] gds:hash:cache_job_info for nspace %s", + "[%s:%d] gds:hash:cache_job_info for nspace %s with %lu info", pmix_globals.myid.nspace, pmix_globals.myid.rank, - nptr->nspace); + nptr->nspace, ninfo); - /* find the hash table for this nspace */ - trk = NULL; - PMIX_LIST_FOREACH(t, &myjobs, pmix_job_t) { - if (0 == strcmp(nptr->nspace, t->ns)) { - trk = t; - break; - } - } + trk = get_tracker(nptr->nspace, true); if (NULL == trk) { - /* create a tracker as we will likely need it */ - trk = PMIX_NEW(pmix_job_t); - if (NULL == trk) { - return PMIX_ERR_NOMEM; - } - PMIX_RETAIN(nptr); - trk->nptr = nptr; - trk->ns = strdup(nptr->nspace); - pmix_list_append(&myjobs, &trk->super); + return PMIX_ERR_NOMEM; } /* if there isn't any data, then be content with just @@ -1009,107 +1123,22 @@ pmix_status_t hash_cache_job_info(struct pmix_namespace_t *ns, trk->session = s; } } else if (PMIX_CHECK_KEY(&info[n], PMIX_SESSION_INFO_ARRAY)) { - /* array of session-level info */ - if (PMIX_DATA_ARRAY != info[n].value.type) { - PMIX_ERROR_LOG(PMIX_ERR_BAD_PARAM); - rc = PMIX_ERR_TYPE_MISMATCH; - goto release; - } - size = info[n].value.data.darray->size; - iptr = (pmix_info_t*)info[n].value.data.darray->array; - PMIX_CONSTRUCT(&cache, pmix_list_t); - PMIX_CONSTRUCT(&ncache, pmix_list_t); - for (j=0; j < size; j++) { - if (PMIX_CHECK_KEY(&iptr[j], PMIX_SESSION_ID)) { - PMIX_VALUE_GET_NUMBER(rc, &iptr[j].value, sid, uint32_t); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_LIST_DESTRUCT(&cache); - return rc; - } - /* setup a session object */ - if (NULL != s) { - /* does this match the one we were previously given? */ - if (sid != s->session) { - /* no - see if we already have this session */ - PMIX_LIST_FOREACH(sptr, &mysessions, pmix_session_t) { - if (sptr->session == sid) { - s = sptr; - break; - } - } - if (sid != s->session) { - /* wasn't found, so create one */ - s = PMIX_NEW(pmix_session_t); - s->session = sid; - pmix_list_append(&mysessions, &s->super); - } - } - } else { - s = PMIX_NEW(pmix_session_t); - s->session = sid; - pmix_list_append(&mysessions, &s->super); - } - } else if (PMIX_CHECK_KEY(&iptr[j], PMIX_NODE_INFO_ARRAY)) { - if (PMIX_SUCCESS != (rc = process_node_array(&iptr[j], &ncache))) { - PMIX_ERROR_LOG(rc); - PMIX_LIST_DESTRUCT(&cache); - PMIX_LIST_DESTRUCT(&ncache); - goto release; - } - } else { - kp2 = PMIX_NEW(pmix_kval_t); - kp2->key = strdup(iptr[j].key); - kp2->value = (pmix_value_t*)malloc(sizeof(pmix_value_t)); - PMIX_VALUE_XFER(rc, kp2->value, &iptr[j].value); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_RELEASE(kp2); - PMIX_LIST_DESTRUCT(&cache); - PMIX_LIST_DESTRUCT(&ncache); - goto release; - } - pmix_list_append(&cache, &kp2->super); - } - } - if (NULL == s) { - /* this is not allowed to happen - they are required - * to provide us with a session ID per the standard */ - PMIX_LIST_DESTRUCT(&cache); - rc = PMIX_ERR_BAD_PARAM; + if (PMIX_SUCCESS != (rc = process_session_array(&info[n].value, trk))) { PMIX_ERROR_LOG(rc); goto release; } - /* point the job at it */ - if (NULL == trk->session) { - PMIX_RETAIN(s); - trk->session = s; - } - /* transfer the data across */ - kp2 = (pmix_kval_t*)pmix_list_remove_first(&cache); - while (NULL != kp2) { - pmix_list_append(&s->sessioninfo, &kp2->super); - kp2 = (pmix_kval_t*)pmix_list_remove_first(&cache); - } - PMIX_LIST_DESTRUCT(&cache); - nd = (pmix_nodeinfo_t*)pmix_list_remove_first(&ncache); - while (NULL != nd) { - pmix_list_append(&s->nodeinfo, &nd->super); - nd = (pmix_nodeinfo_t*)pmix_list_remove_first(&ncache); - } - PMIX_LIST_DESTRUCT(&ncache); } else if (PMIX_CHECK_KEY(&info[n], PMIX_JOB_INFO_ARRAY)) { if (PMIX_SUCCESS != (rc = process_job_array(&info[n], trk, &flags, &procs, &nodes))) { PMIX_ERROR_LOG(rc); goto release; } } else if (PMIX_CHECK_KEY(&info[n], PMIX_APP_INFO_ARRAY)) { - if (PMIX_SUCCESS != (rc = process_app_array(&info[n], trk))) { + if (PMIX_SUCCESS != (rc = process_app_array(&info[n].value, trk))) { PMIX_ERROR_LOG(rc); goto release; } } else if (PMIX_CHECK_KEY(&info[n], PMIX_NODE_INFO_ARRAY)) { - if (PMIX_SUCCESS != (rc = process_node_array(&info[n], &trk->nodeinfo))) { + if (PMIX_SUCCESS != (rc = process_node_array(&info[n].value, &trk->nodeinfo))) { PMIX_ERROR_LOG(rc); goto release; } @@ -1153,7 +1182,7 @@ pmix_status_t hash_cache_job_info(struct pmix_namespace_t *ns, } /* mark that we got the map */ flags |= PMIX_HASH_PROC_MAP; - } else if (0 == strcmp(info[n].key, PMIX_PROC_DATA)) { + } else if (PMIX_CHECK_KEY(&info[n], PMIX_PROC_DATA)) { flags |= PMIX_HASH_PROC_DATA; /* an array of data pertaining to a specific proc */ if (PMIX_DATA_ARRAY != info[n].value.type) { @@ -1200,6 +1229,10 @@ pmix_status_t hash_cache_job_info(struct pmix_namespace_t *ns, kp2->value->data.bo.size = len; } } + pmix_output_verbose(2, pmix_gds_base_framework.framework_output, + "[%s:%d] gds:hash:cache_job_info proc data for [%s:%u]: key %s", + pmix_globals.myid.nspace, pmix_globals.myid.rank, + trk->ns, rank, kp2->key); /* store it in the hash_table */ if (PMIX_SUCCESS != (rc = pmix_hash_store(ht, rank, kp2))) { PMIX_ERROR_LOG(rc); @@ -1208,6 +1241,60 @@ pmix_status_t hash_cache_job_info(struct pmix_namespace_t *ns, } PMIX_RELEASE(kp2); // maintain acctg } + } else if (pmix_check_node_info(info[n].key)) { + /* they are passing us the node-level info for just this + * node - start by seeing if our node is on the list */ + nd = NULL; + PMIX_LIST_FOREACH(ndptr, &trk->nodeinfo, pmix_nodeinfo_t) { + if (0 == strcmp(pmix_globals.hostname, ndptr->hostname)) { + nd = ndptr; + break; + } + } + /* if not, then add it */ + if (NULL == nd) { + nd = PMIX_NEW(pmix_nodeinfo_t); + nd->hostname = strdup(pmix_globals.hostname); + pmix_list_append(&trk->nodeinfo, &nd->super); + } + /* ensure the value isn't already on the node info */ + PMIX_LIST_FOREACH(kp2, &nd->info, pmix_kval_t) { + if (PMIX_CHECK_KEY(kp2, info[n].key)) { + pmix_list_remove_item(&nd->info, &kp2->super); + PMIX_RELEASE(kp2); + break; + } + } + /* add the provided value */ + kp2 = PMIX_NEW(pmix_kval_t); + kp2->key = strdup(info[n].key); + PMIX_VALUE_XFER(rc, kp2->value, &info[n].value); + pmix_list_append(&nd->info, &kp2->super); + } else if (pmix_check_app_info(info[n].key)) { + /* they are passing us app-level info for a default + * app number - have to assume it is app=0 */ + if (0 == pmix_list_get_size(&trk->apps)) { + apptr = PMIX_NEW(pmix_apptrkr_t); + pmix_list_append(&trk->apps, &apptr->super); + } else if (1 < pmix_list_get_size(&trk->apps)) { + rc = PMIX_ERR_BAD_PARAM; + goto release; + } else { + apptr = (pmix_apptrkr_t*)pmix_list_get_first(&trk->apps); + } + /* ensure the value isn't already on the app info */ + PMIX_LIST_FOREACH(kp2, &apptr->appinfo, pmix_kval_t) { + if (PMIX_CHECK_KEY(kp2, info[n].key)) { + pmix_list_remove_item(&apptr->appinfo, &kp2->super); + PMIX_RELEASE(kp2); + break; + } + } + /* add the provided value */ + kp2 = PMIX_NEW(pmix_kval_t); + kp2->key = strdup(info[n].key); + PMIX_VALUE_XFER(rc, kp2->value, &info[n].value); + pmix_list_append(&apptr->appinfo, &kp2->super); } else { /* just a value relating to the entire job */ kp2 = PMIX_NEW(pmix_kval_t); @@ -1285,14 +1372,10 @@ pmix_status_t hash_cache_job_info(struct pmix_namespace_t *ns, } /* we must have the proc AND node maps */ - if (NULL == procs || NULL == nodes) { - rc = PMIX_ERR_NOT_FOUND; - goto release; - } - - if (PMIX_SUCCESS != (rc = store_map(ht, nodes, procs, flags))) { - PMIX_ERROR_LOG(rc); - goto release; + if (NULL != procs && NULL != nodes) { + if (PMIX_SUCCESS != (rc = store_map(trk, nodes, procs, flags))) { + PMIX_ERROR_LOG(rc); + } } release: @@ -1309,7 +1392,7 @@ static pmix_status_t register_info(pmix_peer_t *peer, pmix_namespace_t *ns, pmix_buffer_t *reply) { - pmix_job_t *trk, *t; + pmix_job_t *trk; pmix_hash_table_t *ht; pmix_value_t *val, blob; pmix_status_t rc = PMIX_SUCCESS; @@ -1318,16 +1401,16 @@ static pmix_status_t register_info(pmix_peer_t *peer, pmix_kval_t kv, *kvptr; pmix_buffer_t buf; pmix_rank_t rank; + pmix_list_t results; + char *hname; - trk = NULL; - PMIX_LIST_FOREACH(t, &myjobs, pmix_job_t) { - if (0 == strcmp(ns->nspace, t->ns)) { - trk = t; - break; - } - } + pmix_output_verbose(2, pmix_gds_base_framework.framework_output, + "REGISTERING FOR PEER %s type %d.%d.%d", PMIX_PNAME_PRINT(&peer->info->pname), + peer->proc_type.major, peer->proc_type.minor, peer->proc_type.release); + + trk = get_tracker(ns->nspace, true); if (NULL == trk) { - return PMIX_ERR_INVALID_NAMESPACE; + return PMIX_ERR_NOMEM; } /* the job data is stored on the internal hash table */ ht = &trk->internal; @@ -1364,8 +1447,63 @@ static pmix_status_t register_info(pmix_peer_t *peer, PMIX_BFROPS_PACK(rc, peer, reply, kvptr, 1, PMIX_KVAL); } + /* get any node-level info for this job */ + PMIX_CONSTRUCT(&results, pmix_list_t); + rc = fetch_nodeinfo(NULL, &trk->nodeinfo, NULL, 0, &results); + if (PMIX_SUCCESS == rc) { + PMIX_LIST_FOREACH(kvptr, &results, pmix_kval_t) { + /* if the peer is earlier than v3.1.5, it is expecting + * node info to be in the form of an array, but with the + * hostname as the key. Detect and convert that here */ + if (PMIX_PEER_IS_EARLIER(peer, 3, 1, 5)) { + info = (pmix_info_t*)kvptr->value->data.darray->array; + ninfo = kvptr->value->data.darray->size; + hname = NULL; + /* find the hostname */ + for (n=0; n < ninfo; n++) { + if (PMIX_CHECK_KEY(&info[n], PMIX_HOSTNAME)) { + free(kvptr->key); + kvptr->key = strdup(info[n].value.data.string); + PMIX_BFROPS_PACK(rc, peer, reply, kvptr, 1, PMIX_KVAL); + hname = kvptr->key; + break; + } + } + if (NULL != hname && 0 == strcmp(pmix_globals.hostname, hname)) { + /* older versions are looking for node-level keys for + * only their own node as standalone keys */ + for (n=0; n < ninfo; n++) { + if (pmix_check_node_info(info[n].key)) { + kv.key = strdup(info[n].key); + kv.value = &info[n].value; + PMIX_BFROPS_PACK(rc, peer, reply, &kv, 1, PMIX_KVAL); + } + } + } + } else { + PMIX_BFROPS_PACK(rc, peer, reply, kvptr, 1, PMIX_KVAL); + } + } + } + PMIX_LIST_DESTRUCT(&results); + + /* get any app-level info for this job */ + PMIX_CONSTRUCT(&results, pmix_list_t); + rc = fetch_appinfo(NULL, &trk->apps, NULL, 0, &results); + if (PMIX_SUCCESS == rc) { + PMIX_LIST_FOREACH(kvptr, &results, pmix_kval_t) { + PMIX_BFROPS_PACK(rc, peer, reply, kvptr, 1, PMIX_KVAL); + } + } + PMIX_LIST_DESTRUCT(&results); + /* get the proc-level data for each proc in the job */ + pmix_output_verbose(2, pmix_gds_base_framework.framework_output, + "FETCHING PROC INFO FOR NSPACE %s NPROCS %u", + ns->nspace, ns->nprocs); for (rank=0; rank < ns->nprocs; rank++) { + pmix_output_verbose(2, pmix_gds_base_framework.framework_output, + "FETCHING PROC INFO FOR RANK %s", PMIX_RANK_PRINT(rank)); val = NULL; rc = pmix_hash_fetch(ht, rank, NULL, &val); if (PMIX_SUCCESS != rc && PMIX_ERR_PROC_ENTRY_NOT_FOUND != rc) { @@ -1412,10 +1550,10 @@ static pmix_status_t hash_register_job_info(struct pmix_peer_t *pr, pmix_namespace_t *ns = peer->nptr; char *msg; pmix_status_t rc; - pmix_job_t *trk, *t2; + pmix_job_t *trk; - if (!PMIX_PROC_IS_SERVER(pmix_globals.mypeer) && - !PMIX_PROC_IS_LAUNCHER(pmix_globals.mypeer)) { + if (!PMIX_PEER_IS_SERVER(pmix_globals.mypeer) && + !PMIX_PEER_IS_LAUNCHER(pmix_globals.mypeer)) { /* this function is only available on servers */ PMIX_ERROR_LOG(PMIX_ERR_NOT_SUPPORTED); return PMIX_ERR_NOT_SUPPORTED; @@ -1437,7 +1575,7 @@ static pmix_status_t hash_register_job_info(struct pmix_peer_t *pr, } /* now see if we have delivered it to all our local * clients for this nspace */ - if (ns->ndelivered == ns->nlocalprocs) { + if (!PMIX_PEER_IS_LAUNCHER(pmix_globals.mypeer) && ns->ndelivered == ns->nlocalprocs) { /* we have, so let's get rid of the packed * copy of the data */ PMIX_RELEASE(ns->jobbkt); @@ -1448,22 +1586,9 @@ static pmix_status_t hash_register_job_info(struct pmix_peer_t *pr, /* setup a tracker for this nspace as we will likely * need it again */ - trk = NULL; - PMIX_LIST_FOREACH(t2, &myjobs, pmix_job_t) { - if (ns == t2->nptr) { - trk = t2; - if (NULL == trk->ns) { - trk->ns = strdup(ns->nspace); - } - break; - } - } + trk = get_tracker(ns->nspace, true); if (NULL == trk) { - trk = PMIX_NEW(pmix_job_t); - trk->ns = strdup(ns->nspace); - PMIX_RETAIN(ns); - trk->nptr = ns; - pmix_list_append(&myjobs, &trk->super); + return PMIX_ERR_NOMEM; } /* the job info for the specified nspace has @@ -1481,7 +1606,7 @@ static pmix_status_t hash_register_job_info(struct pmix_peer_t *pr, if (PMIX_SUCCESS == rc) { /* if we have more than one local client for this nspace, * save this packed object so we don't do this again */ - if (1 < ns->nlocalprocs) { + if (PMIX_PEER_IS_LAUNCHER(pmix_globals.mypeer) || 1 < ns->nlocalprocs) { PMIX_RETAIN(reply); ns->jobbkt = reply; } @@ -1496,28 +1621,27 @@ static pmix_status_t hash_store_job_info(const char *nspace, pmix_buffer_t *buf) { pmix_status_t rc = PMIX_SUCCESS; - pmix_kval_t *kptr, *kp2, kv; - pmix_value_t *val; + pmix_kval_t *kptr, *kp2, *kp3, kv; int32_t cnt; - size_t nnodes, len, n; + size_t nnodes, len; uint32_t i, j; char **procs = NULL; uint8_t *tmp; pmix_byte_object_t *bo; pmix_buffer_t buf2; int rank; - pmix_job_t *htptr; + pmix_job_t *trk; pmix_hash_table_t *ht; char **nodelist = NULL; - pmix_info_t *info, *iptr; + pmix_nodeinfo_t *nd, *ndptr; pmix_namespace_t *ns, *nptr; pmix_output_verbose(2, pmix_gds_base_framework.framework_output, "[%s:%u] pmix:gds:hash store job info for nspace %s", pmix_globals.myid.nspace, pmix_globals.myid.rank, nspace); - if (PMIX_PROC_IS_SERVER(pmix_globals.mypeer) && - !PMIX_PROC_IS_LAUNCHER(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_SERVER(pmix_globals.mypeer) && + !PMIX_PEER_IS_LAUNCHER(pmix_globals.mypeer)) { /* this function is NOT available on servers */ PMIX_ERROR_LOG(PMIX_ERR_NOT_SUPPORTED); return PMIX_ERR_NOT_SUPPORTED; @@ -1530,7 +1654,13 @@ static pmix_status_t hash_store_job_info(const char *nspace, return rc; } - /* see if we already have this nspace */ + trk = get_tracker(nspace, true); + if (NULL == trk) { + return PMIX_ERR_NOMEM; + } + ht = &trk->internal; + + /* retrieve the nspace pointer */ nptr = NULL; PMIX_LIST_FOREACH(ns, &pmix_globals.nspaces, pmix_namespace_t) { if (0 == strcmp(ns->nspace, nspace)) { @@ -1539,31 +1669,8 @@ static pmix_status_t hash_store_job_info(const char *nspace, } } if (NULL == nptr) { - nptr = PMIX_NEW(pmix_namespace_t); - if (NULL == nptr) { - rc = PMIX_ERR_NOMEM; - return rc; - } - nptr->nspace = strdup(nspace); - pmix_list_append(&pmix_globals.nspaces, &nptr->super); - } - - /* see if we already have a hash table for this nspace */ - ht = NULL; - PMIX_LIST_FOREACH(htptr, &myjobs, pmix_job_t) { - if (0 == strcmp(htptr->ns, nspace)) { - ht = &htptr->internal; - break; - } - } - if (NULL == ht) { - /* nope - create one */ - htptr = PMIX_NEW(pmix_job_t); - htptr->ns = strdup(nspace); - PMIX_RETAIN(nptr); - htptr->nptr = nptr; - pmix_list_append(&myjobs, &htptr->super); - ht = &htptr->internal; + /* only can happen if we are out of mem */ + return PMIX_ERR_NOMEM; } cnt = 1; @@ -1639,8 +1746,8 @@ static pmix_status_t hash_store_job_info(const char *nspace, PMIX_DESTRUCT(&buf2); return rc; } - /* unpack the list of procs on each node */ for (i=0; i < nnodes; i++) { + /* unpack the list of procs on each node */ cnt = 1; PMIX_CONSTRUCT(&kv, pmix_kval_t); PMIX_BFROPS_UNPACK(rc, pmix_client_globals.myserver, @@ -1653,79 +1760,46 @@ static pmix_status_t hash_store_job_info(const char *nspace, } /* track the nodes in this nspace */ pmix_argv_append_nosize(&nodelist, kv.key); - /* save the list of peers for this node - but first - * check to see if we already have some data for this node */ - rc = pmix_hash_fetch(ht, PMIX_RANK_WILDCARD, kv.key, &val); - if (PMIX_SUCCESS == rc) { - /* already have some data, so we need to add to it */ - kp2 = PMIX_NEW(pmix_kval_t); - kp2->key = strdup(kv.key); - kp2->value = (pmix_value_t*)malloc(sizeof(pmix_value_t)); - kp2->value->type = PMIX_DATA_ARRAY; - kp2->value->data.darray = (pmix_data_array_t*)malloc(sizeof(pmix_data_array_t)); - if (NULL == kp2->value->data.darray) { - PMIX_DESTRUCT(&buf2); - PMIX_DESTRUCT(&kv); - PMIX_RELEASE(kp2); - return PMIX_ERR_NOMEM; - } - kp2->value->data.darray->type = PMIX_INFO; - kp2->value->data.darray->size = val->data.darray->size + 1; - PMIX_INFO_CREATE(info, kp2->value->data.darray->size); - if (NULL == info) { - PMIX_DESTRUCT(&buf2); - PMIX_DESTRUCT(&kv); - PMIX_RELEASE(kp2); - return PMIX_ERR_NOMEM; - } - iptr = (pmix_info_t*)val->data.darray->array; - /* copy the pre-existing data across */ - for (n=0; n < val->data.darray->size; n++) { - PMIX_INFO_XFER(&info[n], &iptr[n]); - } - PMIX_INFO_LOAD(&info[kp2->value->data.darray->size-1], PMIX_LOCAL_PEERS, kv.value->data.string, PMIX_STRING); - kp2->value->data.darray->array = info; - if (PMIX_SUCCESS != (rc = pmix_hash_store(ht, PMIX_RANK_WILDCARD, kp2))) { - PMIX_ERROR_LOG(rc); - PMIX_RELEASE(kp2); - PMIX_DESTRUCT(&kv); - PMIX_DESTRUCT(&buf2); - return rc; - } - PMIX_RELEASE(kp2); // maintain acctg - } else { - /* nope - so add this by itself */ - kp2 = PMIX_NEW(pmix_kval_t); - kp2->key = strdup(kv.key); - kp2->value = (pmix_value_t*)malloc(sizeof(pmix_value_t)); - kp2->value->type = PMIX_DATA_ARRAY; - kp2->value->data.darray = (pmix_data_array_t*)malloc(sizeof(pmix_data_array_t)); - if (NULL == kp2->value->data.darray) { - PMIX_DESTRUCT(&buf2); - PMIX_DESTRUCT(&kv); - PMIX_RELEASE(kp2); - return PMIX_ERR_NOMEM; - } - kp2->value->data.darray->type = PMIX_INFO; - PMIX_INFO_CREATE(info, 1); - if (NULL == info) { - PMIX_DESTRUCT(&buf2); - PMIX_DESTRUCT(&kv); - PMIX_RELEASE(kp2); - return PMIX_ERR_NOMEM; + /* check and see if we already have this node */ + nd = NULL; + PMIX_LIST_FOREACH(ndptr, &trk->nodeinfo, pmix_nodeinfo_t) { + if (NULL != ndptr->hostname && 0 == strcmp(ndptr->hostname, kv.key)) { + /* we assume that the data is updating the current + * values */ + if (NULL == ndptr->hostname) { + ndptr->hostname = strdup(kv.key); + } + nd = ndptr; + break; } - PMIX_INFO_LOAD(&info[0], PMIX_LOCAL_PEERS, kv.value->data.string, PMIX_STRING); - kp2->value->data.darray->array = info; - kp2->value->data.darray->size = 1; - if (PMIX_SUCCESS != (rc = pmix_hash_store(ht, PMIX_RANK_WILDCARD, kp2))) { - PMIX_ERROR_LOG(rc); - PMIX_RELEASE(kp2); - PMIX_DESTRUCT(&kv); - PMIX_DESTRUCT(&buf2); - return rc; + } + if (NULL == nd) { + nd = PMIX_NEW(pmix_nodeinfo_t); + nd->hostname = strdup(kv.key); + pmix_list_append(&trk->nodeinfo, &nd->super); + } + /* save the list of peers for this node */ + kp2 = PMIX_NEW(pmix_kval_t); + if (NULL == kp2) { + return PMIX_ERR_NOMEM; + } + kp2->key = strdup(PMIX_LOCAL_PEERS); + kp2->value = (pmix_value_t*)malloc(sizeof(pmix_value_t)); + if (NULL == kp2->value) { + PMIX_RELEASE(kp2); + return PMIX_ERR_NOMEM; + } + kp2->value->type = PMIX_STRING; + kp2->value->data.string = strdup(kv.value->data.string); + /* ensure this item only appears once on the list */ + PMIX_LIST_FOREACH(kp3, &nd->info, pmix_kval_t) { + if (PMIX_CHECK_KEY(kp3, kp2->key)) { + pmix_list_remove_item(&nd->info, &kp3->super); + PMIX_RELEASE(kp3); + break; } - PMIX_RELEASE(kp2); // maintain acctg } + pmix_list_append(&nd->info, &kp2->super); /* split the list of procs so we can store their * individual location data */ procs = pmix_argv_split(kv.value->data.string, ','); @@ -1772,6 +1846,18 @@ static pmix_status_t hash_store_job_info(const char *nspace, } /* cleanup */ PMIX_DESTRUCT(&buf2); + } else if (PMIX_CHECK_KEY(kptr, PMIX_APP_INFO_ARRAY)) { + if (PMIX_SUCCESS != (rc = process_app_array(kptr->value, trk))) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(kp2); + return rc; + } + } else if (PMIX_CHECK_KEY(kptr, PMIX_NODE_INFO_ARRAY)) { + if (PMIX_SUCCESS != (rc = process_node_array(kptr->value, &trk->nodeinfo))) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(kp2); + return rc; + } } else { /* if the value contains a string that is longer than the * limit, then compress it */ @@ -1823,10 +1909,13 @@ static pmix_status_t hash_store(const pmix_proc_t *proc, pmix_scope_t scope, pmix_kval_t *kv) { - pmix_job_t *trk, *t; + pmix_job_t *trk; pmix_status_t rc; pmix_kval_t *kp; - pmix_namespace_t *ns, *nptr; + pmix_rank_t rank; + size_t j, size, len; + pmix_info_t *iptr; + uint8_t *tmp; pmix_output_verbose(2, pmix_gds_base_framework.framework_output, "%s gds:hash:hash_store for proc %s key %s type %s scope %s", @@ -1839,43 +1928,15 @@ static pmix_status_t hash_store(const pmix_proc_t *proc, } /* find the hash table for this nspace */ - trk = NULL; - PMIX_LIST_FOREACH(t, &myjobs, pmix_job_t) { - if (0 == strcmp(proc->nspace, t->ns)) { - trk = t; - break; - } - } + trk = get_tracker(proc->nspace, true); if (NULL == trk) { - /* create one */ - trk = PMIX_NEW(pmix_job_t); - trk->ns = strdup(proc->nspace); - /* see if we already have this nspace */ - nptr = NULL; - PMIX_LIST_FOREACH(ns, &pmix_globals.nspaces, pmix_namespace_t) { - if (0 == strcmp(ns->nspace, proc->nspace)) { - nptr = ns; - break; - } - } - if (NULL == nptr) { - nptr = PMIX_NEW(pmix_namespace_t); - if (NULL == nptr) { - rc = PMIX_ERR_NOMEM; - PMIX_RELEASE(trk); - return rc; - } - nptr->nspace = strdup(proc->nspace); - pmix_list_append(&pmix_globals.nspaces, &nptr->super); - } - PMIX_RETAIN(nptr); - trk->nptr = nptr; - pmix_list_append(&myjobs, &trk->super); + return PMIX_ERR_NOMEM; } - /* see if the proc is me */ + /* see if the proc is me - cannot use CHECK_PROCID as + * we don't want rank=wildcard to match */ if (proc->rank == pmix_globals.myid.rank && - 0 == strncmp(proc->nspace, pmix_globals.myid.nspace, PMIX_MAX_NSLEN)) { + PMIX_CHECK_NSPACE(proc->nspace, pmix_globals.myid.nspace)) { if (PMIX_INTERNAL != scope) { /* always maintain a copy of my own info here to simplify * later retrieval */ @@ -1910,6 +1971,67 @@ static pmix_status_t hash_store(const pmix_proc_t *proc, /* store it in the corresponding hash table */ if (PMIX_INTERNAL == scope) { + /* if this is proc data, then we have to expand it and + * store the values on that rank */ + if (PMIX_CHECK_KEY(kv, PMIX_PROC_DATA)) { + /* an array of data pertaining to a specific proc */ + if (PMIX_DATA_ARRAY != kv->value->type) { + PMIX_ERROR_LOG(PMIX_ERR_TYPE_MISMATCH); + return PMIX_ERR_TYPE_MISMATCH; + } + size = kv->value->data.darray->size; + iptr = (pmix_info_t*)kv->value->data.darray->array; + /* first element of the array must be the rank */ + if (0 != strcmp(iptr[0].key, PMIX_RANK) || + PMIX_PROC_RANK != iptr[0].value.type) { + rc = PMIX_ERR_TYPE_MISMATCH; + PMIX_ERROR_LOG(rc); + return rc; + } + rank = iptr[0].value.data.rank; + /* cycle thru the values for this rank and store them */ + for (j=1; j < size; j++) { + kp = PMIX_NEW(pmix_kval_t); + if (NULL == kp) { + rc = PMIX_ERR_NOMEM; + return rc; + } + kp->key = strdup(iptr[j].key); + PMIX_VALUE_XFER(rc, kp->value, &iptr[j].value); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(kp); + return rc; + } + /* if the value contains a string that is longer than the + * limit, then compress it */ + if (PMIX_STRING_SIZE_CHECK(kp->value)) { + if (pmix_util_compress_string(kp->value->data.string, &tmp, &len)) { + if (NULL == tmp) { + PMIX_ERROR_LOG(PMIX_ERR_NOMEM); + rc = PMIX_ERR_NOMEM; + return rc; + } + kp->value->type = PMIX_COMPRESSED_STRING; + free(kp->value->data.string); + kp->value->data.bo.bytes = (char*)tmp; + kp->value->data.bo.size = len; + } + } + pmix_output_verbose(2, pmix_gds_base_framework.framework_output, + "%s gds:hash:STORE data for nspace %s rank %u: key %s", + PMIX_NAME_PRINT(&pmix_globals.myid), + trk->ns, rank, kp->key); + /* store it in the hash_table */ + if (PMIX_SUCCESS != (rc = pmix_hash_store(&trk->internal, rank, kp))) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(kp); + return rc; + } + PMIX_RELEASE(kp); // maintain acctg + } + return PMIX_SUCCESS; + } if (PMIX_SUCCESS != (rc = pmix_hash_store(&trk->internal, proc->rank, kv))) { PMIX_ERROR_LOG(rc); return rc; @@ -1976,13 +2098,12 @@ static pmix_status_t _hash_store_modex(void * cbdata, pmix_byte_object_t *bo) { pmix_namespace_t *ns = (pmix_namespace_t*)nspace; - pmix_job_t *trk, *t; + pmix_job_t *trk; pmix_status_t rc = PMIX_SUCCESS; int32_t cnt; pmix_buffer_t pbkt; pmix_proc_t proc; pmix_kval_t *kv; - pmix_namespace_t *ns2, *nptr; pmix_output_verbose(2, pmix_gds_base_framework.framework_output, "[%s:%d] gds:hash:store_modex for nspace %s", @@ -1990,38 +2111,9 @@ static pmix_status_t _hash_store_modex(void * cbdata, ns->nspace); /* find the hash table for this nspace */ - trk = NULL; - PMIX_LIST_FOREACH(t, &myjobs, pmix_job_t) { - if (0 == strcmp(ns->nspace, t->ns)) { - trk = t; - break; - } - } + trk = get_tracker(ns->nspace, true); if (NULL == trk) { - /* create one */ - trk = PMIX_NEW(pmix_job_t); - trk->ns = strdup(ns->nspace); - /* see if we already have this nspace */ - nptr = NULL; - PMIX_LIST_FOREACH(ns2, &pmix_globals.nspaces, pmix_namespace_t) { - if (0 == strcmp(ns->nspace, ns2->nspace)) { - nptr = ns2; - break; - } - } - if (NULL == nptr) { - nptr = PMIX_NEW(pmix_namespace_t); - if (NULL == nptr) { - rc = PMIX_ERR_NOMEM; - PMIX_RELEASE(trk); - return rc; - } - nptr->nspace = strdup(ns->nspace); - pmix_list_append(&pmix_globals.nspaces, &nptr->super); - } - PMIX_RETAIN(nptr); - trk->nptr = nptr; - pmix_list_append(&myjobs, &trk->super); + return PMIX_ERR_NOMEM; } /* this is data returned via the PMIx_Fence call when @@ -2089,7 +2181,7 @@ static pmix_status_t _hash_store_modex(void * cbdata, static pmix_status_t dohash(pmix_hash_table_t *ht, const char *key, pmix_rank_t rank, - bool skip_genvals, + int skip_genvals, pmix_list_t *kvs) { pmix_status_t rc; @@ -2111,12 +2203,24 @@ static pmix_status_t dohash(pmix_hash_table_t *ht, PMIX_RELEASE(val); return PMIX_ERR_NOT_FOUND; } + /* if they want the value returned in its array form, + * then we are done */ + if (2 == skip_genvals) { + kv = PMIX_NEW(pmix_kval_t); + if (NULL == kv) { + PMIX_VALUE_RELEASE(val); + return PMIX_ERR_NOMEM; + } + kv->value = val; + pmix_list_append(kvs, &kv->super); + return PMIX_SUCCESS; + } info = (pmix_info_t*)val->data.darray->array; ninfo = val->data.darray->size; for (n=0; n < ninfo; n++) { /* if the rank is UNDEF, then we don't want * anything that starts with "pmix" */ - if (skip_genvals && + if (1 == skip_genvals && 0 == strncmp(info[n].key, "pmix", 4)) { continue; } @@ -2172,13 +2276,15 @@ static pmix_status_t fetch_nodeinfo(const char *key, pmix_list_t *tgt, pmix_info_t *info, size_t ninfo, pmix_list_t *kvs) { - size_t n; + size_t n, nds; pmix_status_t rc; uint32_t nid=0; char *hostname = NULL; bool found = false; pmix_nodeinfo_t *nd, *ndptr; pmix_kval_t *kv, *kp2; + pmix_data_array_t *darray; + pmix_info_t *iptr; pmix_output_verbose(2, pmix_gds_base_framework.framework_output, "FETCHING NODE INFO"); @@ -2200,17 +2306,75 @@ static pmix_status_t fetch_nodeinfo(const char *key, pmix_list_t *tgt, } } if (!found) { + /* if the key is NULL, then they want all the info from + * all nodes */ + if (NULL == key) { + PMIX_LIST_FOREACH(nd, tgt, pmix_nodeinfo_t) { + kv = PMIX_NEW(pmix_kval_t); + kv->key = strdup(PMIX_NODE_INFO_ARRAY); + kv->value = (pmix_value_t*)malloc(sizeof(pmix_value_t)); + if (NULL == kv->value) { + PMIX_RELEASE(kv); + return PMIX_ERR_NOMEM; + } + nds = pmix_list_get_size(&nd->info); + if (NULL != nd->hostname) { + ++nds; + } + if (UINT32_MAX != nd->nodeid) { + ++nds; + } + PMIX_DATA_ARRAY_CREATE(darray, nds, PMIX_INFO); + if (NULL == darray) { + PMIX_RELEASE(kv); + return PMIX_ERR_NOMEM; + } + iptr = (pmix_info_t*)darray->array; + n = 0; + if (NULL != nd->hostname) { + PMIX_INFO_LOAD(&iptr[n], PMIX_HOSTNAME, nd->hostname, PMIX_STRING); + ++n; + } + if (UINT32_MAX != nd->nodeid) { + PMIX_INFO_LOAD(&iptr[n], PMIX_NODEID, &nd->nodeid, PMIX_UINT32); + ++n; + } + PMIX_LIST_FOREACH(kp2, &nd->info, pmix_kval_t) { + PMIX_LOAD_KEY(iptr[n].key, kp2->key); + rc = pmix_value_xfer(&iptr[n].value, kp2->value); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_DATA_ARRAY_FREE(darray); + PMIX_RELEASE(kv); + return rc; + } + ++n; + } + kv->value->data.darray = darray; + kv->value->type = PMIX_DATA_ARRAY; + pmix_list_append(kvs, &kv->super); + } + return PMIX_SUCCESS; + + } else { + /* assume they want it from this node */ + hostname = pmix_globals.hostname; + goto scan; + } return PMIX_ERR_DATA_VALUE_NOT_FOUND; } + scan: /* scan the list of nodes to find the matching entry */ nd = NULL; PMIX_LIST_FOREACH(ndptr, tgt, pmix_nodeinfo_t) { - if (NULL != hostname && 0 == strcmp(ndptr->hostname, hostname)) { - nd = ndptr; - break; - } - if (NULL == hostname && nid == ndptr->nodeid) { + if (NULL != hostname) { + nds = strlen(hostname) < strlen(ndptr->hostname) ? strlen(hostname) : strlen(ndptr->hostname); + if (0 == strncmp(ndptr->hostname, hostname, nds)) { + nd = ndptr; + break; + } + } else if (nid == ndptr->nodeid) { nd = ndptr; break; } @@ -2218,27 +2382,77 @@ static pmix_status_t fetch_nodeinfo(const char *key, pmix_list_t *tgt, if (NULL == nd) { return PMIX_ERR_NOT_FOUND; } - /* scan the info list of this node to generate the results */ - rc = PMIX_ERR_NOT_FOUND; - PMIX_LIST_FOREACH(kv, &nd->info, pmix_kval_t) { - if (NULL == key || PMIX_CHECK_KEY(kv, key)) { - kp2 = PMIX_NEW(pmix_kval_t); - kp2->key = strdup(kv->key); - kp2->value = (pmix_value_t*)malloc(sizeof(pmix_value_t)); - PMIX_VALUE_XFER(rc, kp2->value, kv->value); + + /* if they want it all, give it to them */ + if (NULL == key) { + kv = PMIX_NEW(pmix_kval_t); + kv->key = strdup(PMIX_NODE_INFO_ARRAY); + kv->value = (pmix_value_t*)malloc(sizeof(pmix_value_t)); + if (NULL == kv->value) { + PMIX_RELEASE(kv); + return PMIX_ERR_NOMEM; + } + nds = pmix_list_get_size(&nd->info); + if (NULL != nd->hostname) { + ++nds; + } + if (UINT32_MAX != nd->nodeid) { + ++nds; + } + PMIX_DATA_ARRAY_CREATE(darray, nds, PMIX_INFO); + if (NULL == darray) { + PMIX_RELEASE(kv); + return PMIX_ERR_NOMEM; + } + iptr = (pmix_info_t*)darray->array; + n = 0; + if (NULL != nd->hostname) { + PMIX_INFO_LOAD(&iptr[n], PMIX_HOSTNAME, nd->hostname, PMIX_STRING); + ++n; + } + if (UINT32_MAX != nd->nodeid) { + PMIX_INFO_LOAD(&iptr[n], PMIX_NODEID, &nd->nodeid, PMIX_UINT32); + ++n; + } + PMIX_LIST_FOREACH(kp2, &nd->info, pmix_kval_t) { + PMIX_LOAD_KEY(iptr[n].key, kp2->key); + rc = pmix_value_xfer(&iptr[n].value, kp2->value); if (PMIX_SUCCESS != rc) { PMIX_ERROR_LOG(rc); - PMIX_RELEASE(kp2); + PMIX_DATA_ARRAY_FREE(darray); + PMIX_RELEASE(kv); return rc; } - pmix_list_append(kvs, &kp2->super); - rc = PMIX_SUCCESS; - if (NULL != key) { - break; - } + ++n; } + kv->value->data.darray = darray; + kv->value->type = PMIX_DATA_ARRAY; + pmix_list_append(kvs, &kv->super); + return PMIX_SUCCESS; } + /* scan the info list of this node to find the key they want */ + rc = PMIX_ERR_NOT_FOUND; + PMIX_LIST_FOREACH(kp2, &nd->info, pmix_kval_t) { + if (PMIX_CHECK_KEY(kp2, key)) { + /* since they only asked for one key, return just that value */ + kv = PMIX_NEW(pmix_kval_t); + kv->key = strdup(kp2->key); + kv->value = (pmix_value_t*)malloc(sizeof(pmix_value_t)); + if (NULL == kv->value) { + PMIX_RELEASE(kv); + return PMIX_ERR_NOMEM; + } + rc = pmix_value_xfer(kv->value, kp2->value); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(kv); + return rc; + } + pmix_list_append(kvs, &kv->super); + break; + } + } return rc; } @@ -2246,12 +2460,13 @@ static pmix_status_t fetch_appinfo(const char *key, pmix_list_t *tgt, pmix_info_t *info, size_t ninfo, pmix_list_t *kvs) { - size_t n; + size_t n, nds; pmix_status_t rc; uint32_t appnum; bool found = false; pmix_apptrkr_t *app, *apptr; pmix_kval_t *kv, *kp2; + pmix_data_array_t *darray; pmix_output_verbose(2, pmix_gds_base_framework.framework_output, "FETCHING APP INFO"); @@ -2269,6 +2484,46 @@ static pmix_status_t fetch_appinfo(const char *key, pmix_list_t *tgt, } } if (!found) { + /* if the key is NULL, then they want all the info from + * all apps */ + if (NULL == key) { + PMIX_LIST_FOREACH(apptr, tgt, pmix_apptrkr_t) { + kv = PMIX_NEW(pmix_kval_t); + kv->key = strdup(PMIX_APP_INFO_ARRAY); + kv->value = (pmix_value_t*)malloc(sizeof(pmix_value_t)); + if (NULL == kv->value) { + PMIX_RELEASE(kv); + return PMIX_ERR_NOMEM; + } + nds = pmix_list_get_size(&apptr->appinfo) + 1; + PMIX_DATA_ARRAY_CREATE(darray, nds, PMIX_INFO); + if (NULL == darray) { + PMIX_RELEASE(kv); + return PMIX_ERR_NOMEM; + } + info = (pmix_info_t*)darray->array; + n = 0; + /* put in the appnum */ + PMIX_INFO_LOAD(&info[n], PMIX_APPNUM, &apptr->appnum, PMIX_UINT32); + ++n; + PMIX_LIST_FOREACH(kp2, &apptr->appinfo, pmix_kval_t) { + PMIX_LOAD_KEY(info[n].key, kp2->key); + rc = pmix_value_xfer(&info[n].value, kp2->value); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_DATA_ARRAY_FREE(darray); + PMIX_RELEASE(kv); + return rc; + } + ++n; + } + kv->value->data.darray = darray; + kv->value->type = PMIX_DATA_ARRAY; + pmix_list_append(kvs, &kv->super); + } + return PMIX_SUCCESS; + + } return PMIX_ERR_DATA_VALUE_NOT_FOUND; } @@ -2298,7 +2553,7 @@ static pmix_status_t fetch_appinfo(const char *key, pmix_list_t *tgt, kp2 = PMIX_NEW(pmix_kval_t); kp2->key = strdup(kv->key); kp2->value = (pmix_value_t*)malloc(sizeof(pmix_value_t)); - PMIX_VALUE_XFER(rc, kp2->value, kv->value); + rc = pmix_value_xfer(kp2->value, kv->value); if (PMIX_SUCCESS != rc) { PMIX_ERROR_LOG(rc); PMIX_RELEASE(kp2); @@ -2321,16 +2576,18 @@ static pmix_status_t hash_fetch(const pmix_proc_t *proc, pmix_info_t qualifiers[], size_t nqual, pmix_list_t *kvs) { - pmix_job_t *trk, *t; + pmix_job_t *trk; pmix_status_t rc; - pmix_value_t *val; pmix_kval_t *kv, *kvptr; - pmix_info_t *info; - size_t n, ninfo; + pmix_info_t *info, *iptr; + size_t m, n, ninfo, niptr; pmix_hash_table_t *ht; pmix_session_t *sptr; uint32_t sid; pmix_rank_t rnk; + pmix_list_t rkvs; + bool nodeinfo = false; + bool appinfo = false; pmix_output_verbose(2, pmix_gds_base_framework.framework_output, "%s pmix:gds:hash fetch %s for proc %s on scope %s", @@ -2344,139 +2601,148 @@ static pmix_status_t hash_fetch(const pmix_proc_t *proc, if (NULL == key && PMIX_RANK_WILDCARD == proc->rank) { /* see if we have a tracker for this nspace - we will * if we already cached the job info for it */ - trk = NULL; - PMIX_LIST_FOREACH(t, &myjobs, pmix_job_t) { - if (0 == strcmp(proc->nspace, t->ns)) { - trk = t; - break; - } - } + trk = get_tracker(proc->nspace, false); if (NULL == trk) { /* let the caller know */ return PMIX_ERR_INVALID_NAMESPACE; } - /* the job data is stored on the internal hash table */ - ht = &trk->internal; /* fetch all values from the hash table tied to rank=wildcard */ - val = NULL; - rc = pmix_hash_fetch(ht, PMIX_RANK_WILDCARD, NULL, &val); - if (PMIX_SUCCESS != rc) { - if (NULL != val) { - PMIX_VALUE_RELEASE(val); + dohash(&trk->internal, NULL, PMIX_RANK_WILDCARD, 0, kvs); + /* also need to add any job-level info */ + PMIX_LIST_FOREACH(kvptr, &trk->jobinfo, pmix_kval_t) { + kv = PMIX_NEW(pmix_kval_t); + kv->key = strdup(kvptr->key); + kv->value = (pmix_value_t*)malloc(sizeof(pmix_value_t)); + PMIX_VALUE_XFER(rc, kv->value, kvptr->value); + if (PMIX_SUCCESS != rc) { + PMIX_RELEASE(kv); + return rc; } - return rc; + pmix_list_append(kvs, &kv->super); } - if (NULL == val) { - return PMIX_ERR_NOT_FOUND; + /* collect the relevant node-level info */ + rc = fetch_nodeinfo(NULL, &trk->nodeinfo, qualifiers, nqual, kvs); + if (PMIX_SUCCESS != rc) { + return rc; } - /* the data is returned in a pmix_data_array_t of pmix_info_t - * structs. cycle thru and transfer them to the list */ - if (PMIX_DATA_ARRAY != val->type || - NULL == val->data.darray || - PMIX_INFO != val->data.darray->type) { - PMIX_VALUE_RELEASE(val); - return PMIX_ERR_INVALID_VAL; + /* collect the relevant app-level info */ + rc = fetch_appinfo(NULL, &trk->apps, qualifiers, nqual, kvs); + if (PMIX_SUCCESS != rc) { + return rc; } - info = (pmix_info_t*)val->data.darray->array; - ninfo = val->data.darray->size; - for (n=0; n < ninfo; n++) { - kv = PMIX_NEW(pmix_kval_t); - if (NULL == kv) { - rc = PMIX_ERR_NOMEM; - PMIX_VALUE_RELEASE(val); + /* finally, we need the job-level info for each rank in the job */ + for (rnk=0; rnk < trk->nptr->nprocs; rnk++) { + PMIX_CONSTRUCT(&rkvs, pmix_list_t); + rc = dohash(&trk->internal, NULL, rnk, 2, &rkvs); + if (PMIX_ERR_NOMEM == rc) { return rc; } - kv->key = strdup(info[n].key); - PMIX_VALUE_XFER(rc, kv->value, &info[n].value); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_RELEASE(kv); - PMIX_VALUE_RELEASE(val); - return rc; + if (0 == pmix_list_get_size(&rkvs)) { + PMIX_DESTRUCT(&rkvs); + continue; + } + /* should only have one entry on list */ + kvptr = (pmix_kval_t*)pmix_list_get_first(&rkvs); + /* we have to assemble the results into a proc blob + * so the remote end will know what to do with it */ + info = (pmix_info_t*)kvptr->value->data.darray->array; + ninfo = kvptr->value->data.darray->size; + /* setup to return the result */ + kv = PMIX_NEW(pmix_kval_t); + kv->key = strdup(PMIX_PROC_DATA); + kv->value = (pmix_value_t*)malloc(sizeof(pmix_value_t)); + kv->value->type = PMIX_DATA_ARRAY; + niptr = ninfo + 1; // need space for the rank + PMIX_DATA_ARRAY_CREATE(kv->value->data.darray, niptr, PMIX_INFO); + iptr = (pmix_info_t*)kv->value->data.darray->array; + /* start with the rank */ + PMIX_INFO_LOAD(&iptr[0], PMIX_RANK, &rnk, PMIX_PROC_RANK); + /* now transfer rest of data across */ + for (n=0; n < ninfo; n++) { + PMIX_INFO_XFER(&iptr[n+1], &info[n]); } + /* add to the results */ pmix_list_append(kvs, &kv->super); + /* release the search result */ + PMIX_LIST_DESTRUCT(&rkvs); } - PMIX_VALUE_RELEASE(val); return PMIX_SUCCESS; } - /* if the nspace and rank are undefined, then they are asking - * for session-level information. */ - if (0 == strlen(proc->nspace) && PMIX_RANK_UNDEF == proc->rank) { - /* they must have included something identifying the info - * class they are querying */ - for (n=0; n < nqual; n++) { - if (PMIX_CHECK_KEY(&qualifiers[n], PMIX_SESSION_ID)) { - /* they want session-level info - see if we have - * that session */ - PMIX_VALUE_GET_NUMBER(rc, &qualifiers[n].value, sid, uint32_t); - if (PMIX_SUCCESS != rc) { - /* didn't provide a correct value */ - PMIX_ERROR_LOG(rc); - return rc; - } - PMIX_LIST_FOREACH(sptr, &mysessions, pmix_session_t) { - if (sptr->session == sid) { - /* see if they want info for a specific node */ - rc = fetch_nodeinfo(key, &sptr->nodeinfo, qualifiers, nqual, kvs); - /* if they did, then we are done */ - if (PMIX_ERR_DATA_VALUE_NOT_FOUND != rc) { - return rc; - } - /* check the session info */ - PMIX_LIST_FOREACH(kvptr, &sptr->sessioninfo, pmix_kval_t) { - if (NULL == key || PMIX_CHECK_KEY(kvptr, key)) { - kv = PMIX_NEW(pmix_kval_t); - kv->key = strdup(kvptr->key); - kv->value = (pmix_value_t*)malloc(sizeof(pmix_value_t)); - PMIX_VALUE_XFER(rc, kv->value, kvptr->value); - if (PMIX_SUCCESS != rc) { - PMIX_RELEASE(kv); - return rc; - } - pmix_list_append(kvs, &kv->super); - if (NULL != key) { - /* we are done */ - return PMIX_SUCCESS; + /* see if they are asking for session, node, or app-level info */ + for (n=0; n < nqual; n++) { + if (PMIX_CHECK_KEY(&qualifiers[n], PMIX_SESSION_INFO)) { + /* they must have provided a session ID */ + for (m=0; m < nqual; m++) { + if (PMIX_CHECK_KEY(&qualifiers[m], PMIX_SESSION_ID)) { + /* see if we have this session */ + PMIX_VALUE_GET_NUMBER(rc, &qualifiers[m].value, sid, uint32_t); + if (PMIX_SUCCESS != rc) { + /* didn't provide a correct value */ + PMIX_ERROR_LOG(rc); + return rc; + } + PMIX_LIST_FOREACH(sptr, &mysessions, pmix_session_t) { + if (sptr->session == sid) { + /* see if they want info for a specific node */ + rc = fetch_nodeinfo(key, &sptr->nodeinfo, qualifiers, nqual, kvs); + /* if they did, then we are done */ + if (PMIX_ERR_DATA_VALUE_NOT_FOUND != rc) { + return rc; + } + /* check the session info */ + PMIX_LIST_FOREACH(kvptr, &sptr->sessioninfo, pmix_kval_t) { + if (NULL == key || PMIX_CHECK_KEY(kvptr, key)) { + kv = PMIX_NEW(pmix_kval_t); + kv->key = strdup(kvptr->key); + kv->value = (pmix_value_t*)malloc(sizeof(pmix_value_t)); + PMIX_VALUE_XFER(rc, kv->value, kvptr->value); + if (PMIX_SUCCESS != rc) { + PMIX_RELEASE(kv); + return rc; + } + pmix_list_append(kvs, &kv->super); + if (NULL != key) { + /* we are done */ + return PMIX_SUCCESS; + } } } } } } - /* if we get here, then the session wasn't found */ - return PMIX_ERR_NOT_FOUND; } + /* if we get here, then the session wasn't found */ + return PMIX_ERR_NOT_FOUND; + } else if (PMIX_CHECK_KEY(&qualifiers[n], PMIX_NODE_INFO)) { + nodeinfo = PMIX_INFO_TRUE(&qualifiers[n]); + } else if (PMIX_CHECK_KEY(&qualifiers[n], PMIX_APP_INFO)) { + appinfo = PMIX_INFO_TRUE(&qualifiers[n]); } } /* find the hash table for this nspace */ - trk = NULL; - PMIX_LIST_FOREACH(t, &myjobs, pmix_job_t) { - if (0 == strcmp(proc->nspace, t->ns)) { - trk = t; - break; - } - } + trk = get_tracker(proc->nspace, false); if (NULL == trk) { return PMIX_ERR_INVALID_NAMESPACE; } - /* if the rank isn't specified, check to see if they - * are looking for app-level or node-level info for - * this job */ - if (PMIX_RANK_UNDEF == proc->rank) { - /* see if they want info for a specific node */ + if (nodeinfo) { rc = fetch_nodeinfo(key, &trk->nodeinfo, qualifiers, nqual, kvs); - /* if they did, then we are done */ - if (PMIX_ERR_DATA_VALUE_NOT_FOUND != rc) { - return rc; + if (PMIX_SUCCESS != rc && PMIX_RANK_WILDCARD == proc->rank) { + /* need to check internal as we might have an older peer */ + ht = &trk->internal; + goto doover; } - /* see if they want info for a specific app */ + return rc; + } else if (appinfo) { rc = fetch_appinfo(key, &trk->apps, qualifiers, nqual, kvs); - /* if they did, then we are done */ - if (PMIX_ERR_DATA_VALUE_NOT_FOUND != rc) { - return rc; + if (PMIX_SUCCESS != rc && PMIX_RANK_WILDCARD == proc->rank) { + /* need to check internal as we might have an older peer */ + ht = &trk->internal; + goto doover; } + return rc; } /* fetch from the corresponding hash table - note that @@ -2608,7 +2874,7 @@ static pmix_status_t assemb_kvs_req(const pmix_proc_t *proc, pmix_server_caddy_t *cd = (pmix_server_caddy_t*)cbdata; pmix_kval_t *kv; - if (!PMIX_PROC_IS_V1(cd->peer)) { + if (!PMIX_PEER_IS_V1(cd->peer)) { PMIX_BFROPS_PACK(rc, cd->peer, buf, proc, 1, PMIX_PROC); if (PMIX_SUCCESS != rc) { return rc; @@ -2623,6 +2889,51 @@ static pmix_status_t assemb_kvs_req(const pmix_proc_t *proc, return rc; } +static pmix_status_t store_session_info(pmix_nspace_t nspace, + pmix_kval_t *kv) +{ + pmix_job_t *trk; + pmix_status_t rc; + + /* find the hash table for this nspace */ + trk = get_tracker(nspace, true); + if (NULL == trk) { + return PMIX_ERR_NOMEM; + } + rc = process_session_array(kv->value, trk); + return rc; +} + +static pmix_status_t store_node_info(pmix_nspace_t nspace, + pmix_kval_t *kv) +{ + pmix_job_t *trk; + pmix_status_t rc; + + /* find the hash table for this nspace */ + trk = get_tracker(nspace, true); + if (NULL == trk) { + return PMIX_ERR_NOMEM; + } + rc = process_node_array(kv->value, &trk->nodeinfo); + return rc; +} + +static pmix_status_t store_app_info(pmix_nspace_t nspace, + pmix_kval_t *kv) +{ + pmix_job_t *trk; + pmix_status_t rc; + + /* find the hash table for this nspace */ + trk = get_tracker(nspace, true); + if (NULL == trk) { + return PMIX_ERR_NOMEM; + } + rc = process_app_array(kv->value, trk); + return rc; +} + static pmix_status_t accept_kvs_resp(pmix_buffer_t *buf) { pmix_status_t rc = PMIX_SUCCESS; @@ -2666,11 +2977,17 @@ static pmix_status_t accept_kvs_resp(pmix_buffer_t *buf) PMIX_BFROPS_UNPACK(rc, pmix_client_globals.myserver, &pbkt, kv, &cnt, PMIX_KVAL); while (PMIX_SUCCESS == rc) { - /* let the GDS component for this peer store it - if - * the kval contains shmem connection info, then the - * component will know what to do about it (or else - * we selected the wrong component for this peer!) */ - PMIX_GDS_STORE_KV(rc, pmix_globals.mypeer, &proct, PMIX_INTERNAL, kv); + /* if this is an info array, then store it here as dstore + * doesn't know how to handle it */ + if (PMIX_CHECK_KEY(kv, PMIX_SESSION_INFO_ARRAY)) { + rc = store_session_info(proct.nspace, kv); + } else if (PMIX_CHECK_KEY(kv, PMIX_NODE_INFO_ARRAY)) { + rc = store_node_info(proct.nspace, kv); + } else if (PMIX_CHECK_KEY(kv, PMIX_APP_INFO_ARRAY)) { + rc = store_app_info(proct.nspace, kv); + } else { + rc = hash_store(&proct, PMIX_INTERNAL, kv); + } if (PMIX_SUCCESS != rc) { PMIX_ERROR_LOG(rc); PMIX_RELEASE(kv); diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/pdl/configure.m4 b/opal/mca/pmix/pmix3x/pmix/src/mca/pdl/configure.m4 index 1e749df5b2d..c84738f189b 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/pdl/configure.m4 +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/pdl/configure.m4 @@ -1,7 +1,7 @@ dnl -*- shell-script -*- dnl dnl Copyright (c) 2010-2015 Cisco Systems, Inc. All rights reserved. -dnl Copyright (c) 2016-2019 Intel, Inc. All rights reserved. +dnl Copyright (c) 2016-2017 Intel, Inc. All rights reserved. dnl Copyright (c) 2016-2019 Research Organization for Information Science dnl and Technology (RIST). All rights reserved. dnl $COPYRIGHT$ diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/plog/base/plog_base_stubs.c b/opal/mca/pmix/pmix3x/pmix/src/mca/plog/base/plog_base_stubs.c index 221ec775f87..5d3419a616d 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/plog/base/plog_base_stubs.c +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/plog/base/plog_base_stubs.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2018-2019 Intel, Inc. All rights reserved. + * Copyright (c) 2018 Intel, Inc. All rights reserved. * * $COPYRIGHT$ * diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/plog/stdfd/plog_stdfd.c b/opal/mca/pmix/pmix3x/pmix/src/mca/plog/stdfd/plog_stdfd.c index 2aceac179ad..f7dfc65c7fa 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/plog/stdfd/plog_stdfd.c +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/plog/stdfd/plog_stdfd.c @@ -83,7 +83,7 @@ static pmix_status_t mylog(const pmix_proc_t *source, } /* if we are not a gateway, then we don't handle this */ - if (!PMIX_PROC_IS_GATEWAY(pmix_globals.mypeer)) { + if (!PMIX_PEER_IS_GATEWAY(pmix_globals.mypeer)) { return PMIX_ERR_TAKE_NEXT_OPTION; } diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/plog/syslog/plog_syslog.c b/opal/mca/pmix/pmix3x/pmix/src/mca/plog/syslog/plog_syslog.c index 64edceed18c..3da5dea649c 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/plog/syslog/plog_syslog.c +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/plog/syslog/plog_syslog.c @@ -10,7 +10,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. - * Copyright (c) 2014-2018 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2019 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -129,7 +129,7 @@ static pmix_status_t mylog(const pmix_proc_t *source, } } else if (0 == strncmp(data[n].key, PMIX_LOG_GLOBAL_SYSLOG, PMIX_MAX_KEYLEN)) { /* only do this if we are a gateway server */ - if (PMIX_PROC_IS_GATEWAY(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_GATEWAY(pmix_globals.mypeer)) { rc = write_local(source, timestamp, pri, data[n].value.data.string, data, ndata); if (PMIX_SUCCESS == rc) { /* flag that we did this one */ diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/pnet/base/pnet_base_fns.c b/opal/mca/pmix/pmix3x/pmix/src/mca/pnet/base/pnet_base_fns.c index 447a8e1ca14..adc11e9e4a2 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/pnet/base/pnet_base_fns.c +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/pnet/base/pnet_base_fns.c @@ -56,7 +56,7 @@ pmix_status_t pmix_pnet_base_allocate(char *nspace, if (NULL == nspace || NULL == ilist) { return PMIX_ERR_BAD_PARAM; } - if (PMIX_PROC_IS_GATEWAY(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_GATEWAY(pmix_globals.mypeer)) { nptr = NULL; /* find this nspace - note that it may not have * been registered yet */ diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/pnet/opa/pnet_opa.c b/opal/mca/pmix/pmix3x/pmix/src/mca/pnet/opa/pnet_opa.c index 712b1644219..c7ae9276257 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/pnet/opa/pnet_opa.c +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/pnet/opa/pnet_opa.c @@ -524,7 +524,7 @@ static pmix_status_t collect_inventory(pmix_info_t directives[], size_t ndirs, query: #if 0 #if PMIX_WANT_OPAMGT - if (PMIX_PROC_IS_GATEWAY(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_GATEWAY(pmix_globals.mypeer)) { /* collect the switch information from the FM */ OMGT_STATUS_T status = OMGT_STATUS_SUCCESS; struct omgt_port * port = NULL; @@ -562,7 +562,7 @@ static pmix_status_t collect_inventory(pmix_info_t directives[], size_t ndirs, #else // have_hwloc #if 0 #if PMIX_WANT_OPAMGT - if (PMIX_PROC_IS_GATEWAY(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_GATEWAY(pmix_globals.mypeer)) { /* query the FM for the inventory */ } diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/pnet/tcp/pnet_tcp.c b/opal/mca/pmix/pmix3x/pmix/src/mca/pnet/tcp/pnet_tcp.c index 81e823ad245..617a1c0455a 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/pnet/tcp/pnet_tcp.c +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/pnet/tcp/pnet_tcp.c @@ -200,7 +200,7 @@ static pmix_status_t tcp_init(void) /* if we are not the "gateway", then there is nothing * for us to do */ - if (!PMIX_PROC_IS_GATEWAY(pmix_globals.mypeer)) { + if (!PMIX_PEER_IS_GATEWAY(pmix_globals.mypeer)) { return PMIX_SUCCESS; } @@ -258,7 +258,7 @@ static void tcp_finalize(void) { pmix_output_verbose(2, pmix_pnet_base_framework.framework_output, "pnet: tcp finalize"); - if (PMIX_PROC_IS_GATEWAY(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_GATEWAY(pmix_globals.mypeer)) { PMIX_LIST_DESTRUCT(&allocations); PMIX_LIST_DESTRUCT(&available); } @@ -320,7 +320,7 @@ static pmix_status_t allocate(pmix_namespace_t *nptr, /* if I am not the gateway, then ignore this call - should never * happen, but check to be safe */ - if (!PMIX_PROC_IS_GATEWAY(pmix_globals.mypeer)) { + if (!PMIX_PEER_IS_GATEWAY(pmix_globals.mypeer)) { return PMIX_SUCCESS; } @@ -847,7 +847,7 @@ static void deregister_nspace(pmix_namespace_t *nptr) /* if we are not the "gateway", then there is nothing * for us to do */ - if (!PMIX_PROC_IS_GATEWAY(pmix_globals.mypeer)) { + if (!PMIX_PEER_IS_GATEWAY(pmix_globals.mypeer)) { return; } diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/pnet/test/pnet_test.c b/opal/mca/pmix/pmix3x/pmix/src/mca/pnet/test/pnet_test.c index 830e0c02e59..119110f8762 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/pnet/test/pnet_test.c +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/pnet/test/pnet_test.c @@ -117,7 +117,7 @@ static pmix_status_t allocate(pmix_namespace_t *nptr, /* if I am not the gateway, then ignore this call - should never * happen, but check to be safe */ - if (!PMIX_PROC_IS_GATEWAY(pmix_globals.mypeer)) { + if (!PMIX_PEER_IS_GATEWAY(pmix_globals.mypeer)) { return PMIX_SUCCESS; } diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/preg/native/preg_native.c b/opal/mca/pmix/pmix3x/pmix/src/mca/preg/native/preg_native.c index 0c9d6188a0d..8704191ecd6 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/preg/native/preg_native.c +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/preg/native/preg_native.c @@ -88,6 +88,7 @@ static pmix_status_t generate_node_regex(const char *input, pmix_list_t vids; char **regexargs = NULL, *tmp, *tmp2; char *cptr; + pmix_status_t rc; /* define the default */ *regexp = NULL; @@ -302,17 +303,22 @@ static pmix_status_t generate_node_regex(const char *input, } /* assemble final result */ - tmp = pmix_argv_join(regexargs, ','); - if (0 > asprintf(regexp, "pmix[%s]", tmp)) { - return PMIX_ERR_NOMEM; - } - free(tmp); + if (NULL != regexargs) { + tmp = pmix_argv_join(regexargs, ','); + if (0 > asprintf(regexp, "pmix[%s]", tmp)) { + return PMIX_ERR_NOMEM; + } + free(tmp); - /* cleanup */ - pmix_argv_free(regexargs); + /* cleanup */ + pmix_argv_free(regexargs); + rc = PMIX_SUCCESS; + } else { + rc = PMIX_ERR_TAKE_NEXT_OPTION; + } PMIX_DESTRUCT(&vids); - return PMIX_SUCCESS; + return rc; } static pmix_status_t generate_ppn(const char *input, diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/psec/dummy_handshake/psec_dummy_handshake.c b/opal/mca/pmix/pmix3x/pmix/src/mca/psec/dummy_handshake/psec_dummy_handshake.c index ae1f9b62e59..0e707049f6c 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/psec/dummy_handshake/psec_dummy_handshake.c +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/psec/dummy_handshake/psec_dummy_handshake.c @@ -2,7 +2,6 @@ /* * Copyright (c) 2019 Mellanox Technologies, Inc. * All rights reserved. - * Copyright (c) 2019 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/psec/dummy_handshake/psec_dummy_handshake.h b/opal/mca/pmix/pmix3x/pmix/src/mca/psec/dummy_handshake/psec_dummy_handshake.h index 74cc3632213..5a8ea519108 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/psec/dummy_handshake/psec_dummy_handshake.h +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/psec/dummy_handshake/psec_dummy_handshake.h @@ -2,7 +2,6 @@ /* * Copyright (c) 2019 Mellanox Technologies, Inc. * All rights reserved. - * Copyright (c) 2019 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/psec/dummy_handshake/psec_dummy_handshake_component.c b/opal/mca/pmix/pmix3x/pmix/src/mca/psec/dummy_handshake/psec_dummy_handshake_component.c index 53fb13b6fed..fd826b817cb 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/psec/dummy_handshake/psec_dummy_handshake_component.c +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/psec/dummy_handshake/psec_dummy_handshake_component.c @@ -2,7 +2,6 @@ /* * Copyright (c) 2019 Mellanox Technologies, Inc. * All rights reserved. - * Copyright (c) 2019 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/psec/psec.h b/opal/mca/pmix/pmix3x/pmix/src/mca/psec/psec.h index 10c31e9bfa3..e088cd5ff4a 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/psec/psec.h +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/psec/psec.h @@ -1,7 +1,7 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2007-2008 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2015-2019 Intel, Inc. All rights reserved. + * Copyright (c) 2015-2017 Intel, Inc. All rights reserved. * * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/base/base.h b/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/base/base.h index 718a377fc5c..197658f7f37 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/base/base.h +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/base/base.h @@ -11,7 +11,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved. - * Copyright (c) 2014-2018 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2019 Intel, Inc. All rights reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ @@ -118,7 +118,7 @@ PMIX_EXPORT void pmix_ptl_base_connection_handler(int sd, short args, void *cbda PMIX_EXPORT pmix_status_t pmix_ptl_base_send_connect_ack(int sd); PMIX_EXPORT pmix_status_t pmix_ptl_base_recv_connect_ack(int sd); PMIX_EXPORT void pmix_ptl_base_lost_connection(pmix_peer_t *peer, pmix_status_t err); - +PMIX_EXPORT bool pmix_ptl_base_peer_is_earlier(pmix_peer_t *peer, uint8_t major, uint8_t minor, uint8_t release); END_C_DECLS diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/base/ptl_base_frame.c b/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/base/ptl_base_frame.c index 2e6a101752e..0723e49b1ff 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/base/ptl_base_frame.c +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/base/ptl_base_frame.c @@ -11,7 +11,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved. - * Copyright (c) 2014-2018 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2019 Intel, Inc. All rights reserved. * Copyright (c) 2015-2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ @@ -43,6 +43,7 @@ #include "src/mca/base/pmix_mca_base_framework.h" #include "src/class/pmix_list.h" #include "src/client/pmix_client_ops.h" +#include "src/mca/ptl/ptl_types.h" #include "src/mca/ptl/base/base.h" /* @@ -208,7 +209,11 @@ static void pccon(pmix_pending_connection_t *p) p->gds = NULL; p->ptl = NULL; p->cred = NULL; - p->proc_type = PMIX_PROC_UNDEF; + p->proc_type.type = PMIX_PROC_UNDEF; + p->proc_type.major = PMIX_MAJOR_WILDCARD; + p->proc_type.minor = PMIX_MINOR_WILDCARD; + p->proc_type.release = PMIX_RELEASE_WILDCARD; + p->proc_type.padding = 0; } static void pcdes(pmix_pending_connection_t *p) { diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/base/ptl_base_sendrecv.c b/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/base/ptl_base_sendrecv.c index 043a68e1388..93e16ba53e8 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/base/ptl_base_sendrecv.c +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/base/ptl_base_sendrecv.c @@ -87,8 +87,8 @@ void pmix_ptl_base_lost_connection(pmix_peer_t *peer, pmix_status_t err) } CLOSE_THE_SOCKET(peer->sd); - if (PMIX_PROC_IS_SERVER(pmix_globals.mypeer) && - !PMIX_PROC_IS_TOOL(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_SERVER(pmix_globals.mypeer) && + !PMIX_PEER_IS_TOOL(pmix_globals.mypeer)) { /* if I am a server, then we need to ensure that * we properly account for the loss of this client * from any local collectives in which it was @@ -185,7 +185,7 @@ void pmix_ptl_base_lost_connection(pmix_peer_t *peer, pmix_status_t err) /* purge any notifications cached for this client */ pmix_server_purge_events(peer, NULL); - if (PMIX_PROC_IS_LAUNCHER(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_LAUNCHER(pmix_globals.mypeer)) { /* only connection I can lose is to my server, so mark it */ pmix_globals.connected = false; } else { @@ -193,7 +193,7 @@ void pmix_ptl_base_lost_connection(pmix_peer_t *peer, pmix_status_t err) pmix_psensor.stop(peer, NULL); } - if (!peer->finalized && !PMIX_PROC_IS_TOOL(peer) && !pmix_globals.mypeer->finalized) { + if (!peer->finalized && !PMIX_PEER_IS_TOOL(peer) && !pmix_globals.mypeer->finalized) { /* if this peer already called finalize, then * we are just seeing their connection go away * when they terminate - so do not generate diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/base/ptl_base_stubs.c b/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/base/ptl_base_stubs.c index f2334e21a11..a653d4c8524 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/base/ptl_base_stubs.c +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/base/ptl_base_stubs.c @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2015-2018 Intel, Inc. All rights reserved. + * Copyright (c) 2015-2019 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -30,6 +30,52 @@ #include "src/mca/ptl/base/base.h" +bool pmix_ptl_base_peer_is_earlier(pmix_peer_t *peer, uint8_t major, + uint8_t minor, uint8_t release) +{ + /* if they don't care, then don't check */ + if (PMIX_MAJOR_WILDCARD != major) { + if (PMIX_PEER_MAJOR_VERSION(peer) == PMIX_MAJOR_WILDCARD) { + /* we don't know what it is - assume earlier */ + return true; + } + if (PMIX_PEER_MAJOR_VERSION(peer) > major) { + return false; + } + if (PMIX_PEER_MAJOR_VERSION(peer) < major) { + return true; + } + } + /* major value must be equal, so check minor */ + if (PMIX_MINOR_WILDCARD != minor) { + if (PMIX_PEER_MINOR_VERSION(peer) == PMIX_MINOR_WILDCARD) { + /* we don't know what it is - assume earlier */ + return true; + } + if (PMIX_PEER_MINOR_VERSION(peer) > minor) { + return false; + } + if (PMIX_PEER_MINOR_VERSION(peer) < minor) { + return true; + } + } + /* major and minor must be equal - check release */ + if (PMIX_RELEASE_WILDCARD != release) { + if (PMIX_PEER_REL_VERSION(peer) == PMIX_RELEASE_WILDCARD) { + /* we don't know what it is - assume earlier */ + return true; + } + if (PMIX_PEER_REL_VERSION(peer) > release) { + return false; + } + if (PMIX_PEER_REL_VERSION(peer) < release) { + return true; + } + } + /* must be equal */ + return false; +} + pmix_status_t pmix_ptl_base_setup_fork(const pmix_proc_t *proc, char ***env) { pmix_ptl_base_active_t *active; diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/ptl_types.h b/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/ptl_types.h index 0017c5b8134..a0e500655ce 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/ptl_types.h +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/ptl_types.h @@ -62,34 +62,104 @@ struct pmix_peer_t; struct pmix_ptl_module_t; /* define a process type */ -typedef uint16_t pmix_proc_type_t; - -#define PMIX_PROC_UNDEF 0x0000 -#define PMIX_PROC_CLIENT 0x0001 // simple client process -#define PMIX_PROC_SERVER 0x0002 // simple server process -#define PMIX_PROC_TOOL 0x0004 // simple tool -#define PMIX_PROC_V1 0x0008 // process is using PMIx v1 protocols -#define PMIX_PROC_V20 0x0010 // process is using PMIx v2.0 protocols -#define PMIX_PROC_V21 0x0020 // process is using PMIx v2.1 protocols -#define PMIX_PROC_V3 0x0040 // process is using PMIx v3 protocols -#define PMIX_PROC_LAUNCHER_ACT 0x1000 // process acting as launcher +typedef struct { + uint32_t type; + uint8_t major; + uint8_t minor; + uint8_t release; + uint8_t padding; // make the struct be 64-bits for addressing +} pmix_proc_type_t; + +#define PMIX_MAJOR_WILDCARD 255 +#define PMIX_MINOR_WILDCARD 255 +#define PMIX_RELEASE_WILDCARD 255 + +/* use 255 as WILDCARD for the release triplet values */ +#define PMIX_PROC_TYPE_STATIC_INIT \ + { \ + .type = PMIX_PROC_UNDEF, \ + .major = PMIX_MAJOR_WILDCARD, \ + .minor = PMIX_MINOR_WILDCARD, \ + .release = PMIX_RELEASE_WILDCARD, \ + .padding = 0 \ + } + +/* Define process types - we use a bit-mask as procs can + * span multiple types */ +#define PMIX_PROC_UNDEF 0x00000000 +#define PMIX_PROC_CLIENT 0x00000001 // simple client process +#define PMIX_PROC_SERVER 0x00000002 // simple server process +#define PMIX_PROC_TOOL 0x00000004 // simple tool +#define PMIX_PROC_LAUNCHER_ACT 0x10000000 // process acting as launcher #define PMIX_PROC_LAUNCHER (PMIX_PROC_TOOL | PMIX_PROC_SERVER | PMIX_PROC_LAUNCHER_ACT) -#define PMIX_PROC_CLIENT_TOOL_ACT 0x2000 +#define PMIX_PROC_CLIENT_LAUNCHER (PMIX_PROC_LAUNCHER | PMIX_PROC_CLIENT) +#define PMIX_PROC_CLIENT_TOOL_ACT 0x20000000 #define PMIX_PROC_CLIENT_TOOL (PMIX_PROC_TOOL | PMIX_PROC_CLIENT | PMIX_PROC_CLIENT_TOOL_ACT) -#define PMIX_PROC_GATEWAY_ACT 0x4000 +#define PMIX_PROC_GATEWAY_ACT 0x40000000 #define PMIX_PROC_GATEWAY (PMIX_PROC_SERVER | PMIX_PROC_GATEWAY_ACT) -/* defins some convenience macros for testing proc type */ -#define PMIX_PROC_IS_CLIENT(p) (PMIX_PROC_CLIENT & (p)->proc_type) -#define PMIX_PROC_IS_SERVER(p) (PMIX_PROC_SERVER & (p)->proc_type) -#define PMIX_PROC_IS_TOOL(p) (PMIX_PROC_TOOL & (p)->proc_type) -#define PMIX_PROC_IS_V1(p) (PMIX_PROC_V1 & (p)->proc_type) -#define PMIX_PROC_IS_V20(p) (PMIX_PROC_V20 & (p)->proc_type) -#define PMIX_PROC_IS_V21(p) (PMIX_PROC_V21 & (p)->proc_type) -#define PMIX_PROC_IS_V3(p) (PMIX_PROC_V3 & (p)->proc_type) -#define PMIX_PROC_IS_LAUNCHER(p) (PMIX_PROC_LAUNCHER_ACT & (p)->proc_type) -#define PMIX_PROC_IS_CLIENT_TOOL(p) (PMIX_PROC_CLIENT_TOOL_ACT & (p)->proc_type) -#define PMIX_PROC_IS_GATEWAY(p) (PMIX_PROC_GATEWAY_ACT & (p)->proc_type) +#define PMIX_SET_PEER_TYPE(a, b) \ + (a)->proc_type.type |= (b) +#define PMIX_SET_PROC_TYPE(a, b) \ + (a)->type |= (b) + +/* define some convenience macros for testing proc type */ +#define PMIX_PEER_IS_CLIENT(p) (PMIX_PROC_CLIENT & (p)->proc_type.type) +#define PMIX_PEER_IS_SERVER(p) (PMIX_PROC_SERVER & (p)->proc_type.type) +#define PMIX_PEER_IS_TOOL(p) (PMIX_PROC_TOOL & (p)->proc_type.type) +#define PMIX_PEER_IS_LAUNCHER(p) (PMIX_PROC_LAUNCHER_ACT & (p)->proc_type.type) +#define PMIX_PEER_IS_CLIENT_LAUNCHER(p) ((PMIX_PROC_LAUNCHER_ACT & (p)->proc_type.type) && (PMIX_PROC_CLIENT & (p)->proc_type.type)) +#define PMIX_PEER_IS_CLIENT_TOOL(p) ((PMIX_PROC_CLIENT_TOOL_ACT & (p)->proc_type.type) && (PMIX_PROC_CLIENT & (p)->proc_type.type)) +#define PMIX_PEER_IS_GATEWAY(p) (PMIX_PROC_GATEWAY_ACT & (p)->proc_type.type) + +#define PMIX_PROC_IS_CLIENT(p) (PMIX_PROC_CLIENT & (p)->type) +#define PMIX_PROC_IS_SERVER(p) (PMIX_PROC_SERVER & (p)->type) +#define PMIX_PROC_IS_TOOL(p) (PMIX_PROC_TOOL & (p)->type) +#define PMIX_PROC_IS_LAUNCHER(p) (PMIX_PROC_LAUNCHER_ACT & (p)->type) +#define PMIX_PROC_IS_CLIENT_LAUNCHER(p) ((PMIX_PROC_LAUNCHER_ACT & (p)->type) && (PMIX_PROC_CLIENT & (p)->type)) +#define PMIX_PROC_IS_CLIENT_TOOL(p) ((PMIX_PROC_CLIENT_TOOL_ACT & (p)->type) && (PMIX_PROC_CLIENT & (p)->type)) +#define PMIX_PROC_IS_GATEWAY(p) (PMIX_PROC_GATEWAY_ACT & (p)->type) + +/* provide macros for setting the major, minor, and release values + * just so people don't have to deal with the details of the struct */ +#define PMIX_SET_PEER_MAJOR(p, a) \ + (p)->proc_type.major = (a) +#define PMIX_SET_PEER_MINOR(p, a) \ + (p)->proc_type.minor = (a) +#define PMIX_SET_PEER_RELEASE(p, a) \ + (p)->proc_type.release = (a) +#define PMIX_SET_PROC_MAJOR(p, a) \ + (p)->major = (a) +#define PMIX_SET_PROC_MINOR(p, a) \ + (p)->minor = (a) +#define PMIX_SET_PROC_RELEASE(p, a) \ + (p)->release = (a) + +/* define some convenience macros for testing version */ +#define PMIX_PEER_MAJOR_VERSION(p) (p)->proc_type.major +#define PMIX_PEER_MINOR_VERSION(p) (p)->proc_type.minor +#define PMIX_PEER_REL_VERSION(p) (p)->proc_type.release +#define PMIX_PROC_MAJOR_VERSION(p) (p)->major +#define PMIX_PROC_MINOR_VERSION(p) (p)->minor +#define PMIX_PROC_REL_VERSION(p) (p)->release +#define PMIX_PEER_IS_V1(p) ((p)->proc_type.major == 1) +#define PMIX_PEER_IS_V20(p) ((p)->proc_type.major == 2 && (p)->proc_type.minor == 0) +#define PMIX_PEER_IS_V21(p) ((p)->proc_type.major == 2 && (p)->proc_type.minor == 1) +#define PMIX_PEER_IS_V3(p) ((p)->proc_type.major == 3) + + +#define PMIX_PEER_TRIPLET(p, a, b, c) \ + ((PMIX_PEER_MAJOR_VERSION(p) == PMIX_MAJOR_WILDCARD || (a) == PMIX_MAJOR_WILDCARD || PMIX_PEER_MAJOR_VERSION(p) == (a)) && \ + (PMIX_PEER_MINOR_VERSION(p) == PMIX_MINOR_WILDCARD || (b) == PMIX_MINOR_WILDCARD || PMIX_PEER_MINOR_VERSION(p) == (b)) && \ + (PMIX_PEER_REL_VERSION(p) == PMIX_RELEASE_WILDCARD || (c) == PMIX_RELEASE_WILDCARD || PMIX_PEER_REL_VERSION(p) == (c))) + +#define PMIX_PROC_TRIPLET(p, a, b, c) \ + ((PMIX_PROC_MAJOR_VERSION(p) == PMIX_MAJOR_WILDCARD || PMIX_PROC_MAJOR_VERSION(p) == (a)) && \ + (PMIX_PROC_MINOR_VERSION(p) == PMIX_MINOR_WILDCARD || PMIX_PROC_MINOR_VERSION(p) == (b)) && \ + (PMIX_PROC_REL_VERSION(p) == PMIX_RELEASE_WILDCARD || PMIX_PROC_REL_VERSION(p) == (c))) + +#define PMIX_PEER_IS_EARLIER(p, a, b, c) \ + pmix_ptl_base_peer_is_earlier(p, a, b, c) /**** MESSAGING STRUCTURES ****/ diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/tcp/ptl_tcp.c b/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/tcp/ptl_tcp.c index e86a4126405..af6ccc69f1f 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/tcp/ptl_tcp.c +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/tcp/ptl_tcp.c @@ -141,6 +141,7 @@ static pmix_status_t connect_to_peer(struct pmix_peer_t *peer, pmix_info_t *iptr = NULL, mypidinfo, mycmdlineinfo, launcher; size_t niptr = 0; pmix_kval_t *urikv = NULL; + int major, minor, release; pmix_output_verbose(2, pmix_ptl_base_framework.framework_output, "ptl:tcp: connecting to server"); @@ -151,10 +152,11 @@ static pmix_status_t connect_to_peer(struct pmix_peer_t *peer, /* if I am a client, then we need to look for the appropriate * connection info in the environment */ - if (PMIX_PROC_IS_CLIENT(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_CLIENT(pmix_globals.mypeer)) { if (NULL != (evar = getenv("PMIX_SERVER_URI3"))) { /* we are talking to a v3 server */ - pmix_client_globals.myserver->proc_type = PMIX_PROC_SERVER | PMIX_PROC_V3; + PMIX_SET_PEER_TYPE(pmix_client_globals.myserver, PMIX_PROC_SERVER); + PMIX_SET_PEER_MAJOR(pmix_client_globals.myserver, 3); pmix_output_verbose(2, pmix_ptl_base_framework.framework_output, "V3 SERVER DETECTED"); /* must use the v3 bfrops module */ @@ -164,7 +166,9 @@ static pmix_status_t connect_to_peer(struct pmix_peer_t *peer, } } else if (NULL != (evar = getenv("PMIX_SERVER_URI21"))) { /* we are talking to a v2.1 server */ - pmix_client_globals.myserver->proc_type = PMIX_PROC_SERVER | PMIX_PROC_V21; + PMIX_SET_PEER_TYPE(pmix_client_globals.myserver, PMIX_PROC_SERVER); + PMIX_SET_PEER_MAJOR(pmix_client_globals.myserver, 2); + PMIX_SET_PEER_MINOR(pmix_client_globals.myserver, 1); pmix_output_verbose(2, pmix_ptl_base_framework.framework_output, "V21 SERVER DETECTED"); /* must use the v21 bfrops module */ @@ -174,7 +178,9 @@ static pmix_status_t connect_to_peer(struct pmix_peer_t *peer, } } else if (NULL != (evar = getenv("PMIX_SERVER_URI2"))) { /* we are talking to a v2.0 server */ - pmix_client_globals.myserver->proc_type = PMIX_PROC_SERVER | PMIX_PROC_V20; + PMIX_SET_PEER_TYPE(pmix_client_globals.myserver, PMIX_PROC_SERVER); + PMIX_SET_PEER_MAJOR(pmix_client_globals.myserver, 2); + PMIX_SET_PEER_MINOR(pmix_client_globals.myserver, 0); pmix_output_verbose(2, pmix_ptl_base_framework.framework_output, "V20 SERVER DETECTED"); /* must use the v20 bfrops module */ @@ -191,6 +197,18 @@ static pmix_status_t connect_to_peer(struct pmix_peer_t *peer, /* mark that we are using the V2 (i.e., tcp) protocol */ pmix_globals.mypeer->protocol = PMIX_PROTOCOL_V2; + /* see if they set their version in the env */ + if (NULL != (p2 = getenv("PMIX_VERSION"))) { + major = strtoul(p2, &p, 10); + ++p; + minor = strtoul(p, &p, 10); + ++p; + release = strtoul(p, NULL, 10); + PMIX_SET_PEER_MAJOR(pmix_client_globals.myserver, major); + PMIX_SET_PEER_MINOR(pmix_client_globals.myserver, minor); + PMIX_SET_PEER_RELEASE(pmix_client_globals.myserver, release); + } + /* the URI consists of the following elements: * - server nspace.rank * - ptl rendezvous URI @@ -306,7 +324,7 @@ static pmix_status_t connect_to_peer(struct pmix_peer_t *peer, pmix_list_append(&ilist, &kv->super); /* if I am a launcher, tell them so */ - if (PMIX_PROC_IS_LAUNCHER(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_LAUNCHER(pmix_globals.mypeer)) { kv = PMIX_NEW(pmix_info_caddy_t); PMIX_INFO_LOAD(&launcher, PMIX_LAUNCHER, NULL, PMIX_BOOL); kv->info = &launcher; @@ -716,7 +734,7 @@ static pmix_status_t connect_to_peer(struct pmix_peer_t *peer, /* tools setup their server info in try_connect because they * utilize a broader handshake */ - if (PMIX_PROC_IS_CLIENT(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_CLIENT(pmix_globals.mypeer)) { /* setup the server info */ if (NULL == pmix_client_globals.myserver->info) { pmix_client_globals.myserver->info = PMIX_NEW(pmix_rank_info_t); @@ -827,12 +845,12 @@ static pmix_status_t parse_uri_file(char *filename, pmix_rank_t *rank) { FILE *fp; - char *srvr, *p, *p2; + char *srvr, *p, *p2, *p3; pmix_lock_t lock; pmix_event_t ev; struct timeval tv; int retries; - int major; + int major, minor, release; fp = fopen(filename, "r"); if (NULL == fp) { @@ -881,27 +899,29 @@ static pmix_status_t parse_uri_file(char *filename, /* see if this file contains the server's version */ p2 = pmix_getline(fp); if (NULL == p2) { - pmix_client_globals.myserver->proc_type = PMIX_PROC_SERVER | PMIX_PROC_V20; + PMIX_SET_PEER_TYPE(pmix_client_globals.myserver, PMIX_PROC_SERVER); + PMIX_SET_PEER_MAJOR(pmix_client_globals.myserver, 2); + PMIX_SET_PEER_MINOR(pmix_client_globals.myserver, 0); pmix_client_globals.myserver->protocol = PMIX_PROTOCOL_V2; pmix_output_verbose(2, pmix_ptl_base_framework.framework_output, "V20 SERVER DETECTED"); } else { /* convert the version to a number */ if ('v' == p2[0]) { - major = strtoul(&p2[1], NULL, 10); + major = strtoul(&p2[1], &p3, 10); } else { - major = strtoul(p2, NULL, 10); - } - if (2 == major) { - pmix_client_globals.myserver->proc_type = PMIX_PROC_SERVER | PMIX_PROC_V21; + major = strtoul(p2, &p3, 10); + } + minor = strtoul(p3, &p3, 10); + release = strtoul(p3, NULL, 10); + PMIX_SET_PEER_TYPE(pmix_client_globals.myserver, PMIX_PROC_SERVER); + PMIX_SET_PEER_MAJOR(pmix_client_globals.myserver, major); + PMIX_SET_PEER_MINOR(pmix_client_globals.myserver, minor); + PMIX_SET_PEER_RELEASE(pmix_client_globals.myserver, release); + if (2 <= major) { pmix_client_globals.myserver->protocol = PMIX_PROTOCOL_V2; pmix_output_verbose(2, pmix_ptl_base_framework.framework_output, - "V21 SERVER DETECTED"); - } else if (3 <= major) { - pmix_client_globals.myserver->proc_type = PMIX_PROC_SERVER | PMIX_PROC_V3; - pmix_client_globals.myserver->protocol = PMIX_PROTOCOL_V2; - pmix_output_verbose(2, pmix_ptl_base_framework.framework_output, - "V3 SERVER DETECTED"); + "V2 PROTOCOL SERVER DETECTED"); } } if (NULL != p2) { @@ -1068,8 +1088,8 @@ static pmix_status_t send_connect_ack(int sd, uint8_t *myflag, "pmix:tcp SEND CONNECT ACK"); /* if we are a server, then we shouldn't be here */ - if (PMIX_PROC_IS_SERVER(pmix_globals.mypeer) && - !PMIX_PROC_IS_LAUNCHER(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_SERVER(pmix_globals.mypeer) && + !PMIX_PEER_IS_LAUNCHER(pmix_globals.mypeer)) { PMIX_ERROR_LOG(PMIX_ERR_NOT_SUPPORTED); return PMIX_ERR_NOT_SUPPORTED; } @@ -1107,8 +1127,8 @@ static pmix_status_t send_connect_ack(int sd, uint8_t *myflag, * 7 => self-started launcher that was given an identifier by caller * 8 => launcher that was started by a PMIx server - identifier specified by server */ - if (PMIX_PROC_IS_LAUNCHER(pmix_globals.mypeer)) { - if (PMIX_PROC_IS_CLIENT(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_LAUNCHER(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_CLIENT(pmix_globals.mypeer)) { /* if we are both launcher and client, then we need * to tell the server we are both */ flag = 8; @@ -1129,8 +1149,8 @@ static pmix_status_t send_connect_ack(int sd, uint8_t *myflag, } } - } else if (PMIX_PROC_IS_CLIENT(pmix_globals.mypeer) && - !PMIX_PROC_IS_TOOL(pmix_globals.mypeer)) { + } else if (PMIX_PEER_IS_CLIENT(pmix_globals.mypeer) && + !PMIX_PEER_IS_TOOL(pmix_globals.mypeer)) { /* we are a simple client */ flag = 0; /* reserve space for our nspace and rank info */ @@ -1139,7 +1159,7 @@ static pmix_status_t send_connect_ack(int sd, uint8_t *myflag, } else { // must be a tool of some sort /* add space for our uid/gid for ACL purposes */ sdsize += 2*sizeof(uint32_t); - if (PMIX_PROC_IS_CLIENT(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_CLIENT(pmix_globals.mypeer)) { /* if we are both tool and client, then we need * to tell the server we are both */ flag = 5; diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/tcp/ptl_tcp_component.c b/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/tcp/ptl_tcp_component.c index cb800a6fdf9..8032c272f30 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/tcp/ptl_tcp_component.c +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/tcp/ptl_tcp_component.c @@ -254,8 +254,8 @@ static pmix_status_t component_open(void) /* check for environ-based directives * on system tmpdir to use */ - if (PMIX_PROC_IS_SERVER(pmix_globals.mypeer) || - PMIX_PROC_IS_LAUNCHER(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_SERVER(pmix_globals.mypeer) || + PMIX_PEER_IS_LAUNCHER(pmix_globals.mypeer)) { mca_ptl_tcp_component.session_tmpdir = strdup(pmix_server_globals.tmpdir); } else { if (NULL != (tdir = getenv("PMIX_SERVER_TMPDIR"))) { @@ -265,8 +265,8 @@ static pmix_status_t component_open(void) } } - if (PMIX_PROC_IS_SERVER(pmix_globals.mypeer) || - PMIX_PROC_IS_LAUNCHER(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_SERVER(pmix_globals.mypeer) || + PMIX_PEER_IS_LAUNCHER(pmix_globals.mypeer)) { mca_ptl_tcp_component.system_tmpdir = strdup(pmix_server_globals.system_tmpdir); } else { if (NULL != (tdir = getenv("PMIX_SYSTEM_TMPDIR"))) { @@ -281,6 +281,14 @@ static pmix_status_t component_open(void) 0 != strcmp(mca_ptl_tcp_component.report_uri, "+")) { urifile = strdup(mca_ptl_tcp_component.report_uri); } + + if (PMIX_PEER_IS_SERVER(pmix_globals.mypeer) || + PMIX_PEER_IS_LAUNCHER(pmix_globals.mypeer)) { + if (NULL != (tdir = getenv("PMIX_LAUNCHER_RENDEZVOUS_FILE"))) { + mca_ptl_tcp_component.rendezvous_filename = strdup(tdir); + } + } + return PMIX_SUCCESS; } @@ -369,7 +377,7 @@ static pmix_status_t setup_listener(pmix_info_t info[], size_t ninfo, "ptl:tcp setup_listener"); /* if we are not a server, then we shouldn't be doing this */ - if (!PMIX_PROC_IS_SERVER(pmix_globals.mypeer)) { + if (!PMIX_PEER_IS_SERVER(pmix_globals.mypeer)) { return PMIX_ERR_NOT_SUPPORTED; } @@ -420,7 +428,7 @@ static pmix_status_t setup_listener(pmix_info_t info[], size_t ninfo, session_tool = PMIX_INFO_TRUE(&info[n]); } else if (PMIX_CHECK_KEY(&info[n], PMIX_SERVER_SYSTEM_SUPPORT)) { system_tool = PMIX_INFO_TRUE(&info[n]); - } else if (PMIX_PROC_IS_LAUNCHER(pmix_globals.mypeer) && + } else if (PMIX_PEER_IS_LAUNCHER(pmix_globals.mypeer) && PMIX_CHECK_KEY(&info[n], PMIX_LAUNCHER_RENDEZVOUS_FILE)) { mca_ptl_tcp_component.rendezvous_filename = strdup(info[n].value.data.string); } @@ -687,7 +695,16 @@ static pmix_status_t setup_listener(pmix_info_t info[], size_t ninfo, /* if we were given a rendezvous file, then drop it */ if (NULL != mca_ptl_tcp_component.rendezvous_filename) { FILE *fp; - + /* if we are a tool and the file already exists, then we + * just use it as providing the rendezvous info for our + * server */ + if (PMIX_PEER_IS_TOOL(pmix_globals.mypeer)) { + struct stat buf; + /* coverity[toctou] */ + if (0 == stat(mca_ptl_tcp_component.rendezvous_filename, &buf)) { + goto nextstep; + } + } pmix_output_verbose(2, pmix_ptl_base_framework.framework_output, "WRITING RENDEZVOUS FILE %s", mca_ptl_tcp_component.rendezvous_filename); @@ -716,6 +733,7 @@ static pmix_status_t setup_listener(pmix_info_t info[], size_t ninfo, } } + nextstep: /* if we are going to support tools, then drop contact file(s) */ if (system_tool) { FILE *fp; @@ -859,7 +877,7 @@ static pmix_status_t setup_listener(pmix_info_t info[], size_t ninfo, } } /* if we are a tool and connected, then register any rendezvous files for cleanup */ - if (PMIX_PROC_IS_TOOL(pmix_globals.mypeer) && pmix_globals.connected) { + if (PMIX_PEER_IS_TOOL(pmix_globals.mypeer) && pmix_globals.connected) { char **clnup = NULL, *cptr = NULL; pmix_info_t dir; if (NULL != mca_ptl_tcp_component.nspace_filename) { @@ -1009,9 +1027,10 @@ static void connection_handler(int sd, short args, void *cbdata) pmix_rank_info_t *info; pmix_proc_t proc; pmix_info_t ginfo; - pmix_proc_type_t proc_type; + pmix_proc_type_t proc_type = PMIX_PROC_TYPE_STATIC_INIT; pmix_byte_object_t cred; pmix_buffer_t buf; + uint8_t major, minor, release; /* acquire the object */ PMIX_ACQUIRE_OBJECT(pnd); @@ -1118,7 +1137,7 @@ static void connection_handler(int sd, short args, void *cbdata) if (0 == pnd->flag) { /* they must be a client, so get their nspace/rank */ - proc_type = PMIX_PROC_CLIENT; + PMIX_SET_PROC_TYPE(&proc_type, PMIX_PROC_CLIENT); PMIX_STRNLEN(msglen, mg, cnt); if (msglen < cnt) { nspace = mg; @@ -1145,7 +1164,7 @@ static void connection_handler(int sd, short args, void *cbdata) } } else if (1 == pnd->flag) { /* they are a tool */ - proc_type = PMIX_PROC_TOOL; + PMIX_SET_PROC_TYPE(&proc_type, PMIX_PROC_TOOL); /* extract the uid/gid */ if (sizeof(uint32_t) <= cnt) { memcpy(&u32, mg, sizeof(uint32_t)); @@ -1171,7 +1190,7 @@ static void connection_handler(int sd, short args, void *cbdata) } } else if (2 == pnd->flag) { /* they are a launcher */ - proc_type = PMIX_PROC_LAUNCHER; + PMIX_SET_PROC_TYPE(&proc_type, PMIX_PROC_LAUNCHER); /* extract the uid/gid */ if (sizeof(uint32_t) <= cnt) { memcpy(&u32, mg, sizeof(uint32_t)); @@ -1198,9 +1217,9 @@ static void connection_handler(int sd, short args, void *cbdata) } else if (3 == pnd->flag || 6 == pnd->flag) { /* they are a tool or launcher that needs an identifier */ if (3 == pnd->flag) { - proc_type = PMIX_PROC_TOOL; + PMIX_SET_PROC_TYPE(&proc_type, PMIX_PROC_TOOL); } else { - proc_type = PMIX_PROC_LAUNCHER; + PMIX_SET_PROC_TYPE(&proc_type, PMIX_PROC_LAUNCHER); } /* extract the uid/gid */ if (sizeof(uint32_t) <= cnt) { @@ -1230,9 +1249,9 @@ static void connection_handler(int sd, short args, void *cbdata) } else if (4 == pnd->flag || 5 == pnd->flag || 7 == pnd->flag || 8 == pnd->flag) { /* they are a tool or launcher that has an identifier - start with our ACLs */ if (4 == pnd->flag || 5 == pnd->flag) { - proc_type = PMIX_PROC_TOOL; + PMIX_SET_PROC_TYPE(&proc_type, PMIX_PROC_TOOL); } else { - proc_type = PMIX_PROC_LAUNCHER; + PMIX_SET_PROC_TYPE(&proc_type, PMIX_PROC_LAUNCHER); } /* extract the uid/gid */ if (sizeof(uint32_t) <= cnt) { @@ -1302,26 +1321,21 @@ static void connection_handler(int sd, short args, void *cbdata) rc = PMIX_ERR_BAD_PARAM; goto error; } + major = strtoul(version, &version, 10); + ++version; + minor = strtoul(version, &version, 10); + ++version; + release = strtoul(version, NULL, 10); + PMIX_SET_PROC_MAJOR(&proc_type, major); + PMIX_SET_PROC_MINOR(&proc_type, minor); + PMIX_SET_PROC_RELEASE(&proc_type, release); - if (0 == strncmp(version, "2.0", 3)) { + if (2 == major && 0 == minor) { /* the 2.0 release handshake ends with the version string */ - proc_type = proc_type | PMIX_PROC_V20; bfrops = "v20"; bftype = pmix_bfrops_globals.default_type; // we can't know any better gds = "ds12,hash"; } else { - int major; - major = strtoul(version, NULL, 10); - if (2 == major) { - proc_type = proc_type | PMIX_PROC_V21; - } else if (3 <= major) { - proc_type = proc_type | PMIX_PROC_V3; - } else { - free(msg); - PMIX_ERROR_LOG(PMIX_ERR_NOT_SUPPORTED); - rc = PMIX_ERR_NOT_SUPPORTED; - goto error; - } /* extract the name of the bfrops module they used */ PMIX_STRNLEN(msglen, mg, cnt); if (msglen < cnt) { @@ -1477,7 +1491,7 @@ static void connection_handler(int sd, short args, void *cbdata) } /* pass along the proc_type */ - pnd->proc_type = proc_type; + memcpy(&pnd->proc_type, &proc_type, sizeof(pmix_proc_type_t)); /* pass along the bfrop, buffer_type, and sec fields so * we can assign them once we create a peer object */ pnd->psec = strdup(sec); @@ -1577,7 +1591,7 @@ static void connection_handler(int sd, short args, void *cbdata) return; } /* mark that this peer is a client of the given type */ - peer->proc_type = proc_type; + memcpy(&peer->proc_type, &proc_type, sizeof(pmix_proc_type_t)); /* save the protocol */ peer->protocol = pnd->protocol; /* add in the nspace pointer */ @@ -1850,7 +1864,7 @@ static void process_cbfunc(int sd, short args, void *cbdata) } /* mark the peer proc type */ - peer->proc_type = pnd->proc_type; + memcpy(&peer->proc_type, &pnd->proc_type, sizeof(pmix_proc_type_t)); /* save the protocol */ peer->protocol = pnd->protocol; /* save the uid/gid */ @@ -1906,11 +1920,12 @@ static void process_cbfunc(int sd, short args, void *cbdata) goto done; } PMIX_RETAIN(peer); - req->peer = peer; - req->pname.nspace = strdup(pmix_globals.myid.nspace); - req->pname.rank = pmix_globals.myid.rank; + req->requestor = peer; + req->nprocs = 1; + PMIX_PROC_CREATE(req->procs, req->nprocs); + PMIX_LOAD_PROCID(&req->procs[0], pmix_globals.myid.nspace, pmix_globals.myid.rank); req->channels = PMIX_FWD_STDOUT_CHANNEL | PMIX_FWD_STDERR_CHANNEL | PMIX_FWD_STDDIAG_CHANNEL; - pmix_list_append(&pmix_globals.iof_requests, &req->super); + req->refid = pmix_pointer_array_add(&pmix_globals.iof_requests, req); /* validate the connection */ cred.bytes = pnd->cred; diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/usock/ptl_usock.c b/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/usock/ptl_usock.c index 51417f3e032..9e748e426ee 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/usock/ptl_usock.c +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/usock/ptl_usock.c @@ -14,6 +14,8 @@ * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2013-2019 Intel, Inc. All rights reserved. + * Copyright (c) 2019 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -108,21 +110,25 @@ static pmix_status_t connect_to_peer(struct pmix_peer_t *peer, __FILE__, __LINE__); /* if we are not a client, there is nothing we can do */ - if (!PMIX_PROC_IS_CLIENT(pmix_globals.mypeer)) { + if (!PMIX_PEER_IS_CLIENT(pmix_globals.mypeer)) { return PMIX_ERR_NOT_SUPPORTED; } + PMIX_SET_PEER_TYPE(pmix_client_globals.myserver, PMIX_PROC_SERVER); /* if we don't have a path to the daemon rendezvous point, * then we need to return an error */ if (NULL != (evar = getenv("PMIX_SERVER_URI2USOCK"))) { /* this is a v2.1+ server */ pmix_globals.mypeer->nptr->compat.bfrops = pmix_bfrops_base_assign_module("v21"); + PMIX_SET_PEER_MAJOR(pmix_client_globals.myserver, 2); if (NULL == pmix_globals.mypeer->nptr->compat.bfrops) { return PMIX_ERR_INIT; } } else if (NULL != (evar = getenv("PMIX_SERVER_URI"))) { /* this is a pre-v2.1 server - must use the v12 bfrops module */ pmix_globals.mypeer->nptr->compat.bfrops = pmix_bfrops_base_assign_module("v12"); + PMIX_SET_PEER_MAJOR(pmix_client_globals.myserver, 1); + PMIX_SET_PEER_MINOR(pmix_client_globals.myserver, 2); if (NULL == pmix_globals.mypeer->nptr->compat.bfrops) { return PMIX_ERR_INIT; } @@ -566,7 +572,7 @@ void pmix_usock_send_handler(int sd, short flags, void *cbdata) if (NULL != msg) { if (!msg->hdr_sent) { - if (PMIX_PROC_IS_SERVER(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_SERVER(pmix_globals.mypeer)) { /* we have to convert the header back to host-byte order */ msg->hdr.pindex = ntohl(msg->hdr.pindex); msg->hdr.tag = ntohl(msg->hdr.tag); @@ -598,7 +604,7 @@ void pmix_usock_send_handler(int sd, short flags, void *cbdata) /* exit this event and let the event lib progress */ pmix_output_verbose(2, pmix_ptl_base_framework.framework_output, "usock:send_handler RES BUSY OR WOULD BLOCK"); - if (PMIX_PROC_IS_SERVER(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_SERVER(pmix_globals.mypeer)) { /* have to convert back again so we are correct when we re-enter */ msg->hdr.pindex = htonl(msg->hdr.pindex); msg->hdr.tag = htonl(msg->hdr.tag); diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/usock/ptl_usock_component.c b/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/usock/ptl_usock_component.c index 36637cc9882..1a0716a8962 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/usock/ptl_usock_component.c +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/usock/ptl_usock_component.c @@ -151,7 +151,7 @@ pmix_status_t component_close(void) static int component_query(pmix_mca_base_module_t **module, int *priority) { - if (PMIX_PROC_IS_TOOL(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_TOOL(pmix_globals.mypeer)) { return PMIX_ERR_NOT_SUPPORTED; } @@ -177,7 +177,7 @@ static pmix_status_t setup_listener(pmix_info_t info[], size_t ninfo, pmix_status_t rc; socklen_t addrlen; struct sockaddr_un *address; - bool disabled = false; + bool disabled = true; char *pmix_pid; pid_t mypid; @@ -185,7 +185,7 @@ static pmix_status_t setup_listener(pmix_info_t info[], size_t ninfo, "ptl:usock setup_listener"); /* if we are not a server, then we shouldn't be doing this */ - if (!PMIX_PROC_IS_SERVER(pmix_globals.mypeer)) { + if (!PMIX_PEER_IS_SERVER(pmix_globals.mypeer)) { return PMIX_ERR_NOT_SUPPORTED; } @@ -592,24 +592,11 @@ static void connection_handler(int sd, short args, void *cbdata) goto error; } /* mark it as being a client of the correct type */ - if (1 == major) { - psave->proc_type = PMIX_PROC_CLIENT | PMIX_PROC_V1; - } else if (2 == major && 0 == minor) { - psave->proc_type = PMIX_PROC_CLIENT | PMIX_PROC_V20; - } else if (2 == major && 1 == minor) { - psave->proc_type = PMIX_PROC_CLIENT | PMIX_PROC_V21; - } else if (3 == major) { - psave->proc_type = PMIX_PROC_CLIENT | PMIX_PROC_V3; - } else { - /* we don't recognize this version */ - pmix_output_verbose(2, pmix_ptl_base_framework.framework_output, - "connection request from client of unrecognized version %s", version); - free(msg); - PMIX_RELEASE(psave); - CLOSE_THE_SOCKET(pnd->sd); - PMIX_RELEASE(pnd); - return; - } + PMIX_SET_PROC_TYPE(&psave->proc_type, PMIX_PROC_CLIENT); + PMIX_SET_PROC_MAJOR(&psave->proc_type, major); + PMIX_SET_PROC_MINOR(&psave->proc_type, minor); + PMIX_SET_PROC_RELEASE(&psave->proc_type, rel); + /* save the protocol */ psave->protocol = pnd->protocol; /* add the nspace tracker */ diff --git a/opal/mca/pmix/pmix3x/pmix/src/runtime/pmix_finalize.c b/opal/mca/pmix/pmix3x/pmix/src/runtime/pmix_finalize.c index c083ad645f3..828987884ba 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/runtime/pmix_finalize.c +++ b/opal/mca/pmix/pmix3x/pmix/src/runtime/pmix_finalize.c @@ -54,6 +54,7 @@ void pmix_rte_finalize(void) { int i; pmix_notify_caddy_t *cd; + pmix_iof_req_t *req; if( --pmix_initialized != 0 ) { if( pmix_initialized < 0 ) { @@ -115,7 +116,13 @@ void pmix_rte_finalize(void) } } PMIX_DESTRUCT(&pmix_globals.notifications); - PMIX_LIST_DESTRUCT(&pmix_globals.iof_requests); + for (i=0; i < pmix_globals.iof_requests.size; i++) { + if (NULL != (req = (pmix_iof_req_t*)pmix_pointer_array_get_item(&pmix_globals.iof_requests, i))) { + PMIX_RELEASE(req); + } + } + PMIX_DESTRUCT(&pmix_globals.iof_requests); + PMIX_LIST_DESTRUCT(&pmix_globals.stdin_targets); free(pmix_globals.hostname); PMIX_LIST_DESTRUCT(&pmix_globals.nspaces); diff --git a/opal/mca/pmix/pmix3x/pmix/src/runtime/pmix_init.c b/opal/mca/pmix/pmix3x/pmix/src/runtime/pmix_init.c index b3255e4e5da..368256cfc21 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/runtime/pmix_init.c +++ b/opal/mca/pmix/pmix3x/pmix/src/runtime/pmix_init.c @@ -93,7 +93,7 @@ static void _notification_eviction_cbfunc(struct pmix_hotel_t *hotel, } -int pmix_rte_init(pmix_proc_type_t type, +int pmix_rte_init(uint32_t type, pmix_info_t info[], size_t ninfo, pmix_ptl_cbfunc_t cbfunc) { @@ -161,8 +161,6 @@ int pmix_rte_init(pmix_proc_type_t type, } /* setup the globals structure */ - gethostname(hostname, PMIX_MAXHOSTNAMELEN-1); - pmix_globals.hostname = strdup(hostname); memset(&pmix_globals.myid.nspace, 0, PMIX_MAX_NSLEN+1); pmix_globals.myid.rank = PMIX_RANK_INVALID; PMIX_CONSTRUCT(&pmix_globals.events, pmix_events_t); @@ -175,6 +173,13 @@ int pmix_rte_init(pmix_proc_type_t type, pmix_globals.evbase, pmix_globals.event_eviction_time, _notification_eviction_cbfunc); PMIX_CONSTRUCT(&pmix_globals.nspaces, pmix_list_t); + /* if we were given a hostname in our environment, use it */ + if (NULL != (evar = getenv("PMIX_HOSTNAME"))) { + pmix_globals.hostname = strdup(evar); + } else { + gethostname(hostname, PMIX_MAXHOSTNAMELEN-1); + pmix_globals.hostname = strdup(hostname); + } if (PMIX_SUCCESS != ret) { error = "notification hotel init"; @@ -182,7 +187,10 @@ int pmix_rte_init(pmix_proc_type_t type, } /* and setup the iof request tracking list */ - PMIX_CONSTRUCT(&pmix_globals.iof_requests, pmix_list_t); + PMIX_CONSTRUCT(&pmix_globals.iof_requests, pmix_pointer_array_t); + pmix_pointer_array_init(&pmix_globals.iof_requests, 128, INT_MAX, 128); + /* setup the stdin forwarding target list */ + PMIX_CONSTRUCT(&pmix_globals.stdin_targets, pmix_list_t); /* Setup client verbosities as all procs are allowed to * access client APIs */ @@ -245,7 +253,10 @@ int pmix_rte_init(pmix_proc_type_t type, goto return_error; } /* whatever our declared proc type, we are definitely v3.0 */ - pmix_globals.mypeer->proc_type = type | PMIX_PROC_V3; + PMIX_SET_PEER_TYPE(pmix_globals.mypeer, type); + PMIX_SET_PEER_MAJOR(pmix_globals.mypeer, PMIX_VERSION_MAJOR); + PMIX_SET_PEER_MINOR(pmix_globals.mypeer, PMIX_VERSION_MINOR); + PMIX_SET_PEER_RELEASE(pmix_globals.mypeer, PMIX_VERSION_RELEASE); /* create an nspace object for ourselves - we will * fill in the nspace name later */ pmix_globals.mypeer->nptr = PMIX_NEW(pmix_namespace_t); diff --git a/opal/mca/pmix/pmix3x/pmix/src/runtime/pmix_rte.h b/opal/mca/pmix/pmix3x/pmix/src/runtime/pmix_rte.h index d1744f05fbe..f7768e39426 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/runtime/pmix_rte.h +++ b/opal/mca/pmix/pmix3x/pmix/src/runtime/pmix_rte.h @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2010-2012 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2019 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -59,7 +59,7 @@ extern const char pmix_version_string[]; * @retval PMIX_ERROR Upon failure. * */ -PMIX_EXPORT pmix_status_t pmix_rte_init(pmix_proc_type_t type, +PMIX_EXPORT pmix_status_t pmix_rte_init(uint32_t type, pmix_info_t info[], size_t ninfo, pmix_ptl_cbfunc_t cbfunc); diff --git a/opal/mca/pmix/pmix3x/pmix/src/server/pmix_server.c b/opal/mca/pmix/pmix3x/pmix/src/server/pmix_server.c index 2ea33a056c1..fe1cfc593ba 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/server/pmix_server.c +++ b/opal/mca/pmix/pmix3x/pmix/src/server/pmix_server.c @@ -180,13 +180,14 @@ PMIX_EXPORT pmix_status_t PMIx_server_init(pmix_server_module_t *module, }; char *evar; pmix_rank_info_t *rinfo; - pmix_proc_type_t ptype = PMIX_PROC_SERVER; + pmix_proc_type_t ptype = PMIX_PROC_TYPE_STATIC_INIT; PMIX_ACQUIRE_THREAD(&pmix_global_lock); pmix_output_verbose(2, pmix_server_globals.base_output, "pmix:server init called"); + PMIX_SET_PROC_TYPE(&ptype, PMIX_PROC_SERVER); /* setup the function pointers */ if (NULL == module) { pmix_host_server = myhostserver; @@ -198,9 +199,9 @@ PMIX_EXPORT pmix_status_t PMIx_server_init(pmix_server_module_t *module, for (n=0; n < ninfo; n++) { if (0 == strncmp(info[n].key, PMIX_SERVER_GATEWAY, PMIX_MAX_KEYLEN)) { if (PMIX_INFO_TRUE(&info[n])) { - ptype |= PMIX_PROC_GATEWAY; + PMIX_SET_PROC_TYPE(&ptype, PMIX_PROC_GATEWAY); } - } else if (0 == strncmp(info[n].key, PMIX_SERVER_TMPDIR, PMIX_MAX_KEYLEN)) { + } else if (PMIX_CHECK_KEY(&info[n], PMIX_SERVER_TMPDIR)) { pmix_server_globals.tmpdir = strdup(info[n].value.data.string); } else if (0 == strncmp(info[n].key, PMIX_SYSTEM_TMPDIR, PMIX_MAX_KEYLEN)) { pmix_server_globals.system_tmpdir = strdup(info[n].value.data.string); @@ -224,7 +225,7 @@ PMIX_EXPORT pmix_status_t PMIx_server_init(pmix_server_module_t *module, /* setup the runtime - this init's the globals, * opens and initializes the required frameworks */ - if (PMIX_SUCCESS != (rc = pmix_rte_init(ptype, info, ninfo, NULL))) { + if (PMIX_SUCCESS != (rc = pmix_rte_init(ptype.type, info, ninfo, NULL))) { PMIX_ERROR_LOG(rc); PMIX_RELEASE_THREAD(&pmix_global_lock); return rc; @@ -406,7 +407,7 @@ PMIX_EXPORT pmix_status_t PMIx_server_init(pmix_server_module_t *module, pmix_list_append(&pmix_ptl_globals.posted_recvs, &req->super); /* if we are a gateway, setup our IOF events */ - if (PMIX_PROC_IS_GATEWAY(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_GATEWAY(pmix_globals.mypeer)) { /* setup IOF */ PMIX_IOF_SINK_DEFINE(&pmix_client_globals.iof_stdout, &pmix_globals.myid, 1, PMIX_FWD_STDOUT_CHANNEL, pmix_iof_write_handler); @@ -539,6 +540,10 @@ static void _register_nspace(int sd, short args, void *cbdata) pmix_namespace_t *nptr, *tmp; pmix_status_t rc; size_t i; + bool all_def; + pmix_server_trkr_t *trk; + pmix_namespace_t *ns; + pmix_trkr_caddy_t *tcd; PMIX_ACQUIRE_OBJECT(caddy); @@ -591,6 +596,77 @@ static void _register_nspace(int sd, short args, void *cbdata) PMIX_GDS_CACHE_JOB_INFO(rc, pmix_globals.mypeer, nptr, cd->info, cd->ninfo); + /* check any pending trackers to see if they are + * waiting for us. There is a slight race condition whereby + * the host server could have spawned the local client and + * it called back into the collective -before- our local event + * would fire the register_client callback. Deal with that here. */ + all_def = true; + PMIX_LIST_FOREACH(trk, &pmix_server_globals.collectives, pmix_server_trkr_t) { + /* if this tracker is already complete, then we + * don't need to update it */ + if (trk->def_complete) { + continue; + } + /* the fact that the tracker is here means that the tracker was + * created in response to at least one collective call being received + * from a participant. However, not all local participants may have + * already called the collective. While the collective created the + * tracker, it would not have updated the number of local participants + * from this nspace if they specified PMIX_RANK_WILDCARD in the list of + * participants since the host hadn't yet called "register_nspace". + * Take care of that here */ + for (i=0; i < trk->npcs; i++) { + /* since we have to do this search, let's see + * if the nspaces are all completely registered */ + if (all_def) { + /* so far, they have all been defined - check this one */ + PMIX_LIST_FOREACH(ns, &pmix_globals.nspaces, pmix_namespace_t) { + if (0 == strcmp(trk->pcs[i].nspace, ns->nspace)) { + if (SIZE_MAX == ns->nlocalprocs || + !ns->all_registered) { + all_def = false; + } + break; + } + } + } + /* now see if this nspace is the one we just registered */ + if (0 != strncmp(trk->pcs[i].nspace, nptr->nspace, PMIX_MAX_NSLEN)) { + /* if not, then we really can't say anything more about it as + * we have no new information about this nspace */ + continue; + } + /* if this request was for all participants from this nspace, then + * we handle this case here */ + if (PMIX_RANK_WILDCARD == trk->pcs[i].rank) { + trk->nlocal = nptr->nlocalprocs; + /* the total number of procs in this nspace was provided + * in the data blob delivered to register_nspace, so check + * to see if all the procs are local */ + if (nptr->nprocs != nptr->nlocalprocs) { + trk->local = false; + } + continue; + } + } + /* update this tracker's status */ + trk->def_complete = all_def; + /* is this now locally completed? */ + if (trk->def_complete && pmix_list_get_size(&trk->local_cbs) == trk->nlocal) { + /* it did, so now we need to process it + * we don't want to block someone + * here, so kick any completed trackers into a + * new event for processing */ + PMIX_EXECUTE_COLLECTIVE(tcd, trk, pmix_server_execute_collective); + } + } + /* also check any pending local modex requests to see if + * someone has been waiting for a request on a remote proc + * in one of our nspaces, but we didn't know all the local procs + * and so couldn't determine the proc was remote */ + pmix_pending_nspace_requests(nptr); + release: cd->opcbfunc(rc, cd->cbdata); PMIX_RELEASE(cd); @@ -650,7 +726,7 @@ void pmix_server_purge_events(pmix_peer_t *peer, { pmix_regevents_info_t *reginfo, *regnext; pmix_peer_events_info_t *prev, *pnext; - pmix_iof_req_t *req, *nxt; + pmix_iof_req_t *req; int i; pmix_notify_caddy_t *ncd; size_t n, m, p, ntgs; @@ -676,10 +752,13 @@ void pmix_server_purge_events(pmix_peer_t *peer, /* since the client is finalizing, remove them from any IOF * registrations they may still have on our list */ - PMIX_LIST_FOREACH_SAFE(req, nxt, &pmix_globals.iof_requests, pmix_iof_req_t) { - if ((NULL != peer && PMIX_CHECK_PROCID(&req->peer->info->pname, &peer->info->pname)) || - (NULL != proc && PMIX_CHECK_PROCID(&req->peer->info->pname, proc))) { - pmix_list_remove_item(&pmix_globals.iof_requests, &req->super); + for (i=0; i < pmix_globals.iof_requests.size; i++) { + if (NULL == (req = (pmix_iof_req_t*)pmix_pointer_array_get_item(&pmix_globals.iof_requests, i))) { + continue; + } + if ((NULL != peer && PMIX_CHECK_PROCID(&req->requestor->info->pname, &peer->info->pname)) || + (NULL != proc && PMIX_CHECK_PROCID(&req->requestor->info->pname, proc))) { + pmix_pointer_array_set_item(&pmix_globals.iof_requests, i, NULL); PMIX_RELEASE(req); } } @@ -980,7 +1059,7 @@ void pmix_server_execute_collective(int sd, short args, void *cbdata) static void _register_client(int sd, short args, void *cbdata) { pmix_setup_caddy_t *cd = (pmix_setup_caddy_t*)cbdata; - pmix_rank_info_t *info, *iptr; + pmix_rank_info_t *info; pmix_namespace_t *nptr, *ns; pmix_server_trkr_t *trk; pmix_trkr_caddy_t *tcd; @@ -1004,6 +1083,10 @@ static void _register_client(int sd, short args, void *cbdata) } } if (NULL == nptr) { + /* there is no requirement in the Standard that hosts register + * an nspace prior to registering clients for that nspace. So + * if we didn't find it, just add it to our collection now in + * anticipation of eventually getting a "register_nspace" call */ nptr = PMIX_NEW(pmix_namespace_t); if (NULL == nptr) { rc = PMIX_ERR_NOMEM; @@ -1026,8 +1109,11 @@ static void _register_client(int sd, short args, void *cbdata) info->gid = cd->gid; info->server_object = cd->server_object; pmix_list_append(&nptr->ranks, &info->super); - /* see if we have everyone */ - if (nptr->nlocalprocs == pmix_list_get_size(&nptr->ranks)) { + /* see if we have everyone - not that nlocalprocs is set to + * a default value to ensure we don't execute this + * test until the host calls "register_nspace" */ + if (SIZE_MAX != nptr->nlocalprocs && + nptr->nlocalprocs == pmix_list_get_size(&nptr->ranks)) { nptr->all_registered = true; /* check any pending trackers to see if they are * waiting for us. There is a slight race condition whereby @@ -1041,36 +1127,47 @@ static void _register_client(int sd, short args, void *cbdata) if (trk->def_complete) { continue; } - /* see if any of our procs from this nspace are involved - the tracker will - * have been created because a callback was received, but - * we may or may not have received _all_ callbacks by this - * time. So check and see if any procs from this nspace are - * involved, and add them to the count of local participants */ + /* the fact that the tracker is here means that the tracker was + * created in response to at least one collective call being received + * from a participant. However, not all local participants may have + * already called the collective. While the collective created the + * tracker, it would not have updated the number of local participants + * from this nspace UNLESS the collective involves all procs in the + * nspace (i.e., they specified PMIX_RANK_WILDCARD in the list of + * participants) AND the host already provided the number of local + * procs for this nspace by calling "register_nspace". So avoid that + * scenario here to avoid double-counting */ for (i=0; i < trk->npcs; i++) { /* since we have to do this search, let's see - * if the nspaces are all defined */ + * if the nspaces are all completely registered */ if (all_def) { /* so far, they have all been defined - check this one */ PMIX_LIST_FOREACH(ns, &pmix_globals.nspaces, pmix_namespace_t) { - if (0 < ns->nlocalprocs && - 0 == strcmp(trk->pcs[i].nspace, ns->nspace)) { - all_def = ns->all_registered; + if (0 == strcmp(trk->pcs[i].nspace, ns->nspace)) { + if (SIZE_MAX == ns->nlocalprocs || + !ns->all_registered) { + all_def = false; + } break; } } } - /* now see if this proc is local to us */ + /* now see if this nspace is the one to which the client we just + * registered belongs */ if (0 != strncmp(trk->pcs[i].nspace, nptr->nspace, PMIX_MAX_NSLEN)) { + /* if not, then we really can't say anything more about it as + * we have no new information about this nspace */ continue; } - /* need to check if this rank is one of mine */ - PMIX_LIST_FOREACH(iptr, &nptr->ranks, pmix_rank_info_t) { - if (PMIX_RANK_WILDCARD == trk->pcs[i].rank || - iptr->pname.rank == trk->pcs[i].rank) { - /* this is one of mine - track the count */ - ++trk->nlocal; - break; - } + /* if this request was for all participants from this nspace, then + * we handle this case elsewhere */ + if (PMIX_RANK_WILDCARD == trk->pcs[i].rank) { + continue; + } + /* see if the rank we just registered is a participant */ + if (cd->proc.rank == trk->pcs[i].rank) { + /* yes, we are included */ + ++trk->nlocal; } } /* update this tracker's status */ @@ -1204,7 +1301,7 @@ static void _deregister_client(int sd, short args, void *cbdata) /* resources may have been allocated to them, so * ensure they get cleaned up - this isn't true * for tools, so don't clean them up */ - if (!PMIX_PROC_IS_TOOL(peer)) { + if (!PMIX_PEER_IS_TOOL(peer)) { pmix_pnet.child_finalized(&cd->proc); pmix_psensor.stop(peer, NULL); } @@ -1344,6 +1441,13 @@ PMIX_EXPORT pmix_status_t PMIx_server_setup_fork(const pmix_proc_t *proc, char * return rc; } + /* ensure we agree on our hostname - typically only important in + * test scenarios where we are faking multiple nodes */ + pmix_setenv("PMIX_HOSTNAME", pmix_globals.hostname, true, env); + + /* communicate our version */ + pmix_setenv("PMIX_VERSION", PMIX_VERSION, true, env); + return PMIX_SUCCESS; } @@ -1758,69 +1862,26 @@ static void _iofdeliver(int sd, short args, void *cbdata) { pmix_setup_caddy_t *cd = (pmix_setup_caddy_t*)cbdata; pmix_iof_req_t *req; - pmix_status_t rc; - pmix_buffer_t *msg; bool found = false; - bool cached = false; pmix_iof_cache_t *iof; + int i; + size_t n; pmix_output_verbose(2, pmix_server_globals.iof_output, "PMIX:SERVER delivering IOF from %s on channel %0x", PMIX_NAME_PRINT(cd->procs), cd->channels); - /* cycle across our list of IOF requestors and see who wants + /* cycle across our list of IOF requests and see who wants * this channel from this source */ - PMIX_LIST_FOREACH(req, &pmix_globals.iof_requests, pmix_iof_req_t) { - /* if the channel wasn't included, then ignore it */ - if (!(cd->channels & req->channels)) { + for (i=0; i < pmix_globals.iof_requests.size; i++) { + if (NULL == (req = (pmix_iof_req_t*)pmix_pointer_array_get_item(&pmix_globals.iof_requests, i))) { continue; } - /* see if the source matches the request */ - if (!PMIX_CHECK_PROCID(cd->procs, &req->pname)) { - continue; - } - /* never forward back to the source! This can happen if the source - * is a launcher - also, never forward to a peer that is no - * longer with us */ - if (NULL == req->peer->info || req->peer->finalized) { - continue; - } - if (PMIX_CHECK_PROCID(cd->procs, &req->peer->info->pname)) { - continue; - } - found = true; - /* setup the msg */ - if (NULL == (msg = PMIX_NEW(pmix_buffer_t))) { - PMIX_ERROR_LOG(PMIX_ERR_OUT_OF_RESOURCE); - rc = PMIX_ERR_OUT_OF_RESOURCE; - break; - } - /* provide the source */ - PMIX_BFROPS_PACK(rc, req->peer, msg, cd->procs, 1, PMIX_PROC); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_RELEASE(msg); - break; - } - /* provide the channel */ - PMIX_BFROPS_PACK(rc, req->peer, msg, &cd->channels, 1, PMIX_IOF_CHANNEL); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_RELEASE(msg); - break; - } - /* pack the data */ - PMIX_BFROPS_PACK(rc, req->peer, msg, cd->bo, 1, PMIX_BYTE_OBJECT); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_RELEASE(msg); - break; - } - /* send it to the requestor */ - PMIX_PTL_SEND_ONEWAY(rc, req->peer, msg, PMIX_PTL_TAG_IOF); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_RELEASE(msg); + if (PMIX_OPERATION_SUCCEEDED == pmix_iof_process_iof(cd->channels, cd->procs, cd->bo, + cd->info, cd->ninfo, req)) { + /* flag that we do have at least one registrant for this info, + * so there is no need to cache it */ + found = true; } } @@ -1838,18 +1899,33 @@ static void _iofdeliver(int sd, short args, void *cbdata) iof = PMIX_NEW(pmix_iof_cache_t); memcpy(&iof->source, cd->procs, sizeof(pmix_proc_t)); iof->channel = cd->channels; - iof->bo = cd->bo; - cd->bo = NULL; // protect the data + /* copy the data */ + PMIX_BYTE_OBJECT_CREATE(iof->bo, 1); + iof->bo->bytes = (char*)malloc(cd->bo->size); + memcpy(iof->bo->bytes, cd->bo->bytes, cd->bo->size); + iof->bo->size = cd->bo->size; + if (0 < cd->ninfo) { + PMIX_INFO_CREATE(iof->info, cd->ninfo); + iof->ninfo = cd->ninfo; + for (n=0; n < iof->ninfo; n++) { + PMIX_INFO_XFER(&iof->info[n], &cd->info[n]); + } + } pmix_list_append(&pmix_server_globals.iof, &iof->super); } if (NULL != cd->opcbfunc) { - cd->opcbfunc(rc, cd->cbdata); - } - if (!cached) { - PMIX_RELEASE(cd); + cd->opcbfunc(PMIX_SUCCESS, cd->cbdata); } + + /* release the caddy */ + cd->procs = NULL; + cd->nprocs = 0; + cd->info = NULL; + cd->ninfo = 0; + cd->bo = NULL; + PMIX_RELEASE(cd); } pmix_status_t PMIx_server_IOF_deliver(const pmix_proc_t *source, @@ -1859,48 +1935,18 @@ pmix_status_t PMIx_server_IOF_deliver(const pmix_proc_t *source, pmix_op_cbfunc_t cbfunc, void *cbdata) { pmix_setup_caddy_t *cd; - size_t n; /* need to threadshift this request */ cd = PMIX_NEW(pmix_setup_caddy_t); if (NULL == cd) { return PMIX_ERR_NOMEM; } - /* unfortunately, we need to copy the input because we - * might have to cache it for later delivery */ - PMIX_PROC_CREATE(cd->procs, 1); - if (NULL == cd->procs) { - PMIX_RELEASE(cd); - return PMIX_ERR_NOMEM; - } + cd->procs = (pmix_proc_t*)source; cd->nprocs = 1; - pmix_strncpy(cd->procs[0].nspace, source->nspace, PMIX_MAX_NSLEN); - cd->procs[0].rank = source->rank; cd->channels = channel; - PMIX_BYTE_OBJECT_CREATE(cd->bo, 1); - if (NULL == cd->bo) { - PMIX_RELEASE(cd); - return PMIX_ERR_NOMEM; - } - cd->nbo = 1; - cd->bo[0].bytes = (char*)malloc(bo->size); - if (NULL == cd->bo[0].bytes) { - PMIX_RELEASE(cd); - return PMIX_ERR_NOMEM; - } - memcpy(cd->bo[0].bytes, bo->bytes, bo->size); - cd->bo[0].size = bo->size; - if (0 < ninfo) { - PMIX_INFO_CREATE(cd->info, ninfo); - if (NULL == cd->info) { - PMIX_RELEASE(cd); - return PMIX_ERR_NOMEM; - } - cd->ninfo = ninfo; - for (n=0; n < ninfo; n++) { - PMIX_INFO_XFER(&cd->info[n], (pmix_info_t*)&info[n]); - } - } + cd->bo = (pmix_byte_object_t*)bo; + cd->info = (pmix_info_t*)info; + cd->ninfo = ninfo; cd->opcbfunc = cbfunc; cd->cbdata = cbdata; PMIX_THREADSHIFT(cd, _iofdeliver); @@ -2670,7 +2716,7 @@ static void _cnct(int sd, short args, void *cbdata) } PMIX_DESTRUCT(&cb); - if (PMIX_PROC_IS_V1(cd->peer) || PMIX_PROC_IS_V20(cd->peer)) { + if (PMIX_PEER_IS_V1(cd->peer) || PMIX_PEER_IS_V20(cd->peer)) { PMIX_BFROPS_PACK(rc, cd->peer, reply, &pbkt, 1, PMIX_BUFFER); if (PMIX_SUCCESS != rc) { PMIX_ERROR_LOG(rc); @@ -3225,6 +3271,8 @@ static void _iofreg(int sd, short args, void *cbdata) pmix_server_caddy_t *scd = (pmix_server_caddy_t*)cd->cbdata; pmix_buffer_t *reply; pmix_status_t rc; + pmix_iof_req_t *req; + pmix_iof_cache_t *iof, *inxt; PMIX_ACQUIRE_OBJECT(cd); @@ -3245,7 +3293,18 @@ static void _iofreg(int sd, short args, void *cbdata) /* was the request a success? */ if (PMIX_SUCCESS != cd->status) { - /* find and remove the tracker(s) */ + /* find and remove the tracker */ + req = (pmix_iof_req_t*)pmix_pointer_array_get_item(&pmix_globals.iof_requests, cd->ncodes); + PMIX_RELEASE(req); + pmix_pointer_array_set_item(&pmix_globals.iof_requests, cd->ncodes, NULL); + } else { + /* return the reference ID for this handler */ + PMIX_BFROPS_PACK(rc, scd->peer, reply, &cd->ncodes, 1, PMIX_SIZE); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(reply); + goto cleanup; + } } pmix_output_verbose(2, pmix_server_globals.iof_output, @@ -3256,6 +3315,21 @@ static void _iofreg(int sd, short args, void *cbdata) PMIX_RELEASE(reply); } + /* if the request succeeded, then process any cached IO - doing it here + * guarantees that the IO will be received AFTER the client gets the + * refid response */ + if (PMIX_SUCCESS == cd->status) { + /* get the request */ + req = (pmix_iof_req_t*)pmix_pointer_array_get_item(&pmix_globals.iof_requests, cd->ncodes); + PMIX_LIST_FOREACH_SAFE(iof, inxt, &pmix_server_globals.iof, pmix_iof_cache_t) { + if (PMIX_OPERATION_SUCCEEDED == pmix_iof_process_iof(iof->channel, &iof->source, iof->bo, + iof->info, iof->ninfo, req)) { + pmix_list_remove_item(&pmix_server_globals.iof, &iof->super); + PMIX_RELEASE(iof); + } + } + } + cleanup: /* release the cached info */ if (NULL != cd->procs) { @@ -3352,7 +3426,7 @@ static pmix_status_t server_switchyard(pmix_peer_t *peer, uint32_t tag, if (PMIX_COMMIT_CMD == cmd) { rc = pmix_server_commit(peer, buf); - if (!PMIX_PROC_IS_V1(peer)) { + if (!PMIX_PEER_IS_V1(peer)) { reply = PMIX_NEW(pmix_buffer_t); if (NULL == reply) { PMIX_ERROR_LOG(PMIX_ERR_NOMEM); @@ -3585,6 +3659,14 @@ static pmix_status_t server_switchyard(pmix_peer_t *peer, uint32_t tag, return rc; } + if (PMIX_IOF_DEREG_CMD == cmd) { + PMIX_GDS_CADDY(cd, peer, tag); + if (PMIX_SUCCESS != (rc = pmix_server_iofdereg(peer, buf, op_cbfunc, cd))) { + PMIX_RELEASE(cd); + } + return rc; + } + return PMIX_ERR_NOT_SUPPORTED; } diff --git a/opal/mca/pmix/pmix3x/pmix/src/server/pmix_server_get.c b/opal/mca/pmix/pmix3x/pmix/src/server/pmix_server_get.c index c8fe13cdd6e..a968dddf206 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/server/pmix_server_get.c +++ b/opal/mca/pmix/pmix3x/pmix/src/server/pmix_server_get.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2014-2019 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2020 Intel, Inc. All rights reserved. * Copyright (c) 2014-2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2014-2015 Artem Y. Polyakov . @@ -121,8 +121,6 @@ pmix_status_t pmix_server_get(pmix_buffer_t *buf, char *cptr; char nspace[PMIX_MAX_NSLEN+1]; pmix_namespace_t *ns, *nptr; - pmix_info_t *info=NULL; - size_t ninfo=0; pmix_dmdx_local_t *lcd; pmix_dmdx_request_t *req; bool local; @@ -160,34 +158,33 @@ pmix_status_t pmix_server_get(pmix_buffer_t *buf, } /* retrieve any provided info structs */ cnt = 1; - PMIX_BFROPS_UNPACK(rc, cd->peer, buf, &ninfo, &cnt, PMIX_SIZE); + PMIX_BFROPS_UNPACK(rc, cd->peer, buf, &cd->ninfo, &cnt, PMIX_SIZE); if (PMIX_SUCCESS != rc) { PMIX_ERROR_LOG(rc); return rc; } - if (0 < ninfo) { - PMIX_INFO_CREATE(info, ninfo); - if (NULL == info) { + if (0 < cd->ninfo) { + PMIX_INFO_CREATE(cd->info, cd->ninfo); + if (NULL == cd->info) { PMIX_ERROR_LOG(PMIX_ERR_NOMEM); return PMIX_ERR_NOMEM; } - cnt = ninfo; - PMIX_BFROPS_UNPACK(rc, cd->peer, buf, info, &cnt, PMIX_INFO); + cnt = cd->ninfo; + PMIX_BFROPS_UNPACK(rc, cd->peer, buf, cd->info, &cnt, PMIX_INFO); if (PMIX_SUCCESS != rc) { PMIX_ERROR_LOG(rc); - PMIX_INFO_FREE(info, ninfo); return rc; } } /* search for directives we can deal with here */ - for (n=0; n < ninfo; n++) { - if (0 == strncmp(info[n].key, PMIX_IMMEDIATE, PMIX_MAX_KEYLEN)) { + for (n=0; n < cd->ninfo; n++) { + if (PMIX_CHECK_KEY(&cd->info[n], PMIX_IMMEDIATE)) { /* just check our own data - don't wait * or request it from someone else */ - localonly = PMIX_INFO_TRUE(&info[n]); - } else if (0 == strncmp(info[n].key, PMIX_TIMEOUT, PMIX_MAX_KEYLEN)) { - tv.tv_sec = info[n].value.data.uint32; + localonly = PMIX_INFO_TRUE(&cd->info[n]); + } else if (PMIX_CHECK_KEY(&cd->info[n], PMIX_TIMEOUT)) { + tv.tv_sec = cd->info[n].value.data.uint32; } } @@ -202,16 +199,13 @@ pmix_status_t pmix_server_get(pmix_buffer_t *buf, /* check if the nspace of the requestor is different from * the nspace of the target process */ - if (!PMIX_CHECK_NSPACE(nspace, cd->peer->info->pname.nspace)) { - diffnspace = true; - } + diffnspace = !PMIX_CHECK_NSPACE(nspace, cd->peer->info->pname.nspace); pmix_output_verbose(2, pmix_server_globals.get_output, - "%s:%d EXECUTE GET FOR %s:%d ON BEHALF OF %s:%d", - pmix_globals.myid.nspace, - pmix_globals.myid.rank, nspace, rank, - cd->peer->info->pname.nspace, - cd->peer->info->pname.rank); + "%s EXECUTE GET FOR %s:%d ON BEHALF OF %s", + PMIX_NAME_PRINT(&pmix_globals.myid), + nspace, rank, + PMIX_PNAME_PRINT(&cd->peer->info->pname)); /* This call flows upward from a local client If we don't * know about this nspace, then it cannot refer to the @@ -238,6 +232,9 @@ pmix_status_t pmix_server_get(pmix_buffer_t *buf, if (localonly) { /* the user doesn't want us to look for the info, * so we simply return at this point */ + pmix_output_verbose(5, pmix_server_globals.get_output, + "%s UNKNOWN NSPACE: LOCAL ONLY - NOT FOUND", + PMIX_NAME_PRINT(&pmix_globals.myid)); return PMIX_ERR_NOT_FOUND; } /* this is for an nspace we don't know about yet, so @@ -248,13 +245,15 @@ pmix_status_t pmix_server_get(pmix_buffer_t *buf, * the original requestor so they will get the data * back when we receive it */ rc = create_local_tracker(nspace, rank, - info, ninfo, + cd->info, cd->ninfo, cbfunc, cbdata, &lcd, &req); if (PMIX_ERR_NOMEM == rc) { - PMIX_INFO_FREE(info, ninfo); return rc; } if (PMIX_SUCCESS == rc) { + pmix_output_verbose(5, pmix_server_globals.get_output, + "%s UNKNOWN NSPACE: DUPLICATE REQUEST - WAITING", + PMIX_NAME_PRINT(&pmix_globals.myid)); /* if they specified a timeout for this specific * request, set it up now */ if (0 < tv.tv_sec) { @@ -276,9 +275,11 @@ pmix_status_t pmix_server_get(pmix_buffer_t *buf, * up on its own, but at worst the direct modex * will simply overwrite the info later */ if (NULL != pmix_host_server.direct_modex) { - rc = pmix_host_server.direct_modex(&lcd->proc, info, ninfo, dmdx_cbfunc, lcd); + pmix_output_verbose(5, pmix_server_globals.get_output, + "%s UNKNOWN NSPACE: REQUEST PASSED TO HOST", + PMIX_NAME_PRINT(&pmix_globals.myid)); + rc = pmix_host_server.direct_modex(&lcd->proc, cd->info, cd->ninfo, dmdx_cbfunc, lcd); if (PMIX_SUCCESS != rc) { - PMIX_INFO_FREE(info, ninfo); pmix_list_remove_item(&pmix_server_globals.local_reqs, &lcd->super); PMIX_RELEASE(lcd); return rc; @@ -292,8 +293,10 @@ pmix_status_t pmix_server_get(pmix_buffer_t *buf, req->event_active = true; } } else { - /* if we don't have direct modex feature, just respond with "not found" */ - PMIX_INFO_FREE(info, ninfo); + /* if we don't have direct modex feature, just respond with "not found" */ + pmix_output_verbose(5, pmix_server_globals.get_output, + "%s UNKNOWN NSPACE: NO DMODEX AVAILABLE - NOT FOUND", + PMIX_NAME_PRINT(&pmix_globals.myid)); pmix_list_remove_item(&pmix_server_globals.local_reqs, &lcd->super); PMIX_RELEASE(lcd); return PMIX_ERR_NOT_FOUND; @@ -306,6 +309,10 @@ pmix_status_t pmix_server_get(pmix_buffer_t *buf, * if the rank is wildcard, or the nspace is different, then * they are asking for the job-level info for this nspace - provide it */ if (PMIX_RANK_WILDCARD == rank || diffnspace) { + pmix_output_verbose(5, pmix_server_globals.get_output, + "%s LOOKING FOR %s", + PMIX_NAME_PRINT(&pmix_globals.myid), + diffnspace ? "WILDCARD RANK" : "DIFF NSPACE"); /* see if we have the job-level info - we won't have it * if we have no local procs and haven't already asked * for it, so there is no guarantee we have it */ @@ -317,29 +324,30 @@ pmix_status_t pmix_server_get(pmix_buffer_t *buf, * can retrieve the info from that GDS. Otherwise, * we need to retrieve it from our own */ PMIX_CONSTRUCT(&cb, pmix_cb_t); + pmix_output_verbose(5, pmix_server_globals.get_output, + "%s GETTING JOB-DATA FOR %s", + PMIX_NAME_PRINT(&pmix_globals.myid), + PMIX_NAME_PRINT(&proc)); /* this data is for a local client, so give the gds the * option of returning a complete copy of the data, * or returning a pointer to local storage */ cb.proc = &proc; cb.scope = PMIX_SCOPE_UNDEF; cb.copy = false; + cb.info = cd->info; + cb.ninfo = cd->ninfo; PMIX_GDS_FETCH_KV(rc, pmix_globals.mypeer, &cb); if (PMIX_SUCCESS != rc) { + cb.info = NULL; + cb.ninfo = 0; PMIX_DESTRUCT(&cb); return rc; } - /* if the requested rank is not WILDCARD, then retrieve the - * job-specific data for that rank - a scope of UNDEF - * will direct the GDS to provide it. Anything found will - * simply be added to the cb.kvs list */ - if (PMIX_RANK_WILDCARD != rank) { - proc.rank = rank; - PMIX_GDS_FETCH_KV(rc, pmix_globals.mypeer, &cb); - if (PMIX_SUCCESS != rc) { - PMIX_DESTRUCT(&cb); - return rc; - } - } + /* store this as a byte object in the eventual data to + * be returned */ + PMIX_CONSTRUCT(&pbkt, pmix_buffer_t); + cb.info = NULL; + cb.ninfo = 0; PMIX_CONSTRUCT(&pkt, pmix_buffer_t); /* assemble the provided data into a byte object */ PMIX_GDS_ASSEMB_KVS_REQ(rc, pmix_globals.mypeer, &proc, &cb.kvs, &pkt, cd); @@ -351,8 +359,7 @@ pmix_status_t pmix_server_get(pmix_buffer_t *buf, PMIX_UNLOAD_BUFFER(&pkt, bo.bytes, bo.size); PMIX_DESTRUCT(&pkt); /* pack it into the payload */ - PMIX_CONSTRUCT(&pbkt, pmix_buffer_t); - PMIX_BFROPS_PACK(rc, pmix_globals.mypeer, &pbkt, &bo, 1, PMIX_BYTE_OBJECT); + PMIX_BFROPS_PACK(rc, cd->peer, &pbkt, &bo, 1, PMIX_BYTE_OBJECT); free(bo.bytes); if (PMIX_SUCCESS != rc) { PMIX_ERROR_LOG(rc); @@ -360,6 +367,52 @@ pmix_status_t pmix_server_get(pmix_buffer_t *buf, PMIX_DESTRUCT(&cb); return rc; } + PMIX_DESTRUCT(&cb); + /* if the requested rank is not WILDCARD, then retrieve any + * posted data for that rank. Anything found will + * be added to the cb.kvs list */ + if (PMIX_RANK_WILDCARD != rank) { + PMIX_CONSTRUCT(&cb, pmix_cb_t); + proc.rank = rank; + cb.proc = &proc; + cb.scope = PMIX_LOCAL; + cb.copy = false; + cb.info = cd->info; + cb.ninfo = cd->ninfo; + pmix_output_verbose(5, pmix_server_globals.get_output, + "%s GETTING DATA FOR %s", + PMIX_NAME_PRINT(&pmix_globals.myid), + PMIX_NAME_PRINT(&proc)); + PMIX_GDS_FETCH_KV(rc, pmix_globals.mypeer, &cb); + if (PMIX_SUCCESS != rc) { + cb.info = NULL; + cb.ninfo = 0; + PMIX_DESTRUCT(&cb); + return rc; + } + cb.info = NULL; + cb.ninfo = 0; + PMIX_CONSTRUCT(&pkt, pmix_buffer_t); + /* assemble the provided data into a byte object */ + PMIX_GDS_ASSEMB_KVS_REQ(rc, pmix_globals.mypeer, &proc, &cb.kvs, &pkt, cd); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_DESTRUCT(&cb); + return rc; + } + PMIX_UNLOAD_BUFFER(&pkt, bo.bytes, bo.size); + PMIX_DESTRUCT(&pkt); + /* pack it into the payload */ + PMIX_BFROPS_PACK(rc, cd->peer, &pbkt, &bo, 1, PMIX_BYTE_OBJECT); + free(bo.bytes); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_DESTRUCT(&pbkt); + PMIX_DESTRUCT(&cb); + return rc; + } + PMIX_DESTRUCT(&cb); + } /* unload the resulting payload */ PMIX_UNLOAD_BUFFER(&pbkt, data, sz); PMIX_DESTRUCT(&pbkt); @@ -393,10 +446,9 @@ pmix_status_t pmix_server_get(pmix_buffer_t *buf, } /* we cannot do anything further, so just track this request * for now */ - rc = create_local_tracker(nspace, rank, info, ninfo, + rc = create_local_tracker(nspace, rank, cd->info, cd->ninfo, cbfunc, cbdata, &lcd, &req); if (PMIX_ERR_NOMEM == rc) { - PMIX_INFO_FREE(info, ninfo); return rc; } pmix_output_verbose(2, pmix_server_globals.get_output, @@ -418,8 +470,6 @@ pmix_status_t pmix_server_get(pmix_buffer_t *buf, /* if everyone has registered, see if we already have this data */ rc = _satisfy_request(nptr, rank, cd, cbfunc, cbdata, &local); if( PMIX_SUCCESS == rc ){ - /* request was successfully satisfied */ - PMIX_INFO_FREE(info, ninfo); /* return success as the satisfy_request function * calls the cbfunc for us, and it will have * released the cbdata object */ @@ -443,11 +493,10 @@ pmix_status_t pmix_server_get(pmix_buffer_t *buf, /* Check to see if we already have a pending request for the data - if * we do, then we can just wait for it to arrive */ - rc = create_local_tracker(nspace, rank, info, ninfo, + rc = create_local_tracker(nspace, rank, cd->info, cd->ninfo, cbfunc, cbdata, &lcd, &req); if (PMIX_ERR_NOMEM == rc || NULL == lcd) { /* we have a problem */ - PMIX_INFO_FREE(info, ninfo); return PMIX_ERR_NOMEM; } /* if they specified a timeout, set it up now */ @@ -477,10 +526,9 @@ pmix_status_t pmix_server_get(pmix_buffer_t *buf, * resource manager server to please get the info for us from * whomever is hosting the target process */ if (NULL != pmix_host_server.direct_modex) { - rc = pmix_host_server.direct_modex(&lcd->proc, info, ninfo, dmdx_cbfunc, lcd); + rc = pmix_host_server.direct_modex(&lcd->proc, cd->info, cd->ninfo, dmdx_cbfunc, lcd); if (PMIX_SUCCESS != rc) { /* may have a function entry but not support the request */ - PMIX_INFO_FREE(info, ninfo); pmix_list_remove_item(&pmix_server_globals.local_reqs, &lcd->super); PMIX_RELEASE(lcd); } @@ -490,7 +538,6 @@ pmix_status_t pmix_server_get(pmix_buffer_t *buf, pmix_globals.myid.nspace, pmix_globals.myid.rank); /* if we don't have direct modex feature, just respond with "not found" */ - PMIX_INFO_FREE(info, ninfo); pmix_list_remove_item(&pmix_server_globals.local_reqs, &lcd->super); PMIX_RELEASE(lcd); rc = PMIX_ERR_NOT_FOUND; @@ -635,21 +682,22 @@ static pmix_status_t _satisfy_request(pmix_namespace_t *nptr, pmix_rank_t rank, * a remote peer, or due to data from a local client * having been committed */ PMIX_CONSTRUCT(&pbkt, pmix_buffer_t); - pmix_strncpy(proc.nspace, nptr->nspace, PMIX_MAX_NSLEN); + PMIX_LOAD_NSPACE(proc.nspace, nptr->nspace); if (!PMIX_CHECK_NSPACE(nptr->nspace, cd->peer->info->pname.nspace)) { diffnspace = true; } - /* if rank is PMIX_RANK_UNDEF, then it was stored in our GDS */ - if (PMIX_RANK_UNDEF == rank) { + /* if rank is PMIX_RANK_UNDEF or is from a different nspace, + * then it was stored in our GDS */ + if (PMIX_RANK_UNDEF == rank || diffnspace) { scope = PMIX_GLOBAL; // we have to search everywhere peer = pmix_globals.mypeer; } else if (0 < nptr->nlocalprocs) { /* if we have local clients of this nspace, then we use * the corresponding GDS to retrieve the data. Otherwise, * the data will have been stored under our GDS */ - if (local) { + if (NULL != local) { *local = true; } if (PMIX_RANK_WILDCARD != rank) { @@ -668,9 +716,9 @@ static pmix_status_t _satisfy_request(pmix_namespace_t *nptr, pmix_rank_t rank, break; } } - if (PMIX_LOCAL != scope) { + if (NULL == peer) { /* this must be a remote rank */ - if (local) { + if (NULL != local) { *local = false; } scope = PMIX_REMOTE; @@ -678,7 +726,7 @@ static pmix_status_t _satisfy_request(pmix_namespace_t *nptr, pmix_rank_t rank, } } } else { - if (local) { + if (NULL != local) { *local = false; } peer = pmix_globals.mypeer; @@ -697,7 +745,11 @@ static pmix_status_t _satisfy_request(pmix_namespace_t *nptr, pmix_rank_t rank, cb.proc = &proc; cb.scope = PMIX_INTERNAL; cb.copy = false; + cb.info = cd->info; + cb.ninfo = cd->ninfo; PMIX_GDS_FETCH_KV(rc, pmix_globals.mypeer, &cb); + cb.info = NULL; + cb.ninfo = 0; if (PMIX_SUCCESS == rc) { PMIX_CONSTRUCT(&pkt, pmix_buffer_t); /* assemble the provided data into a byte object */ @@ -709,7 +761,7 @@ static pmix_status_t _satisfy_request(pmix_namespace_t *nptr, pmix_rank_t rank, PMIX_DESTRUCT(&cb); return rc; } - if (PMIX_PROC_IS_V1(cd->peer)) { + if (PMIX_PEER_IS_V1(cd->peer)) { /* if the client is using v1, then it expects the * data returned to it as the rank followed by abyte object containing * a buffer - so we have to do a little gyration */ @@ -747,8 +799,7 @@ static pmix_status_t _satisfy_request(pmix_namespace_t *nptr, pmix_rank_t rank, /* retrieve the data for the specific rank they are asking about */ if (PMIX_RANK_WILDCARD != rank) { - if (!PMIX_PROC_IS_SERVER(peer) && 0 == peer->commit_cnt) { - PMIX_ERROR_LOG(PMIX_ERR_NOT_FOUND); + if (!PMIX_PEER_IS_SERVER(peer) && 0 == peer->commit_cnt) { /* this condition works only for local requests, server does * count commits for local ranks, and check this count when * local request. @@ -766,7 +817,11 @@ static pmix_status_t _satisfy_request(pmix_namespace_t *nptr, pmix_rank_t rank, cb.proc = &proc; cb.scope = scope; cb.copy = false; + cb.info = cd->info; + cb.ninfo = cd->ninfo; PMIX_GDS_FETCH_KV(rc, peer, &cb); + cb.info = NULL; + cb.ninfo = 0; if (PMIX_SUCCESS == rc) { found = true; PMIX_CONSTRUCT(&pkt, pmix_buffer_t); @@ -783,7 +838,7 @@ static pmix_status_t _satisfy_request(pmix_namespace_t *nptr, pmix_rank_t rank, PMIX_DESTRUCT(&cb); return rc; } - if (PMIX_PROC_IS_V1(cd->peer)) { + if (PMIX_PEER_IS_V1(cd->peer)) { /* if the client is using v1, then it expects the * data returned to it in a different order than v2 * - so we have to do a little gyration */ @@ -930,7 +985,7 @@ static void _process_dmdx_reply(int fd, short args, void *cbdata) /* find the nspace object for the proc whose data is being received */ nptr = NULL; PMIX_LIST_FOREACH(ns, &pmix_globals.nspaces, pmix_namespace_t) { - if (0 == strcmp(caddy->lcd->proc.nspace, ns->nspace)) { + if (PMIX_CHECK_NSPACE(caddy->lcd->proc.nspace, ns->nspace)) { nptr = ns; break; } @@ -968,7 +1023,7 @@ static void _process_dmdx_reply(int fd, short args, void *cbdata) cd = (pmix_server_caddy_t*)dm->cbdata; found = false; PMIX_LIST_FOREACH(nm, &nspaces, pmix_nspace_caddy_t) { - if (0 == strcmp(nm->ns->nspace, cd->peer->nptr->nspace)) { + if (PMIX_CHECK_NSPACE(nm->ns->nspace, cd->peer->nptr->nspace)) { found = true; break; } @@ -982,9 +1037,12 @@ static void _process_dmdx_reply(int fd, short args, void *cbdata) } } /* now go thru each unique nspace and store the data using its - * assigned GDS component */ + * assigned GDS component - note that if the nspace of the requesting + * proc is different from the nspace of the proc whose data is being + * returned, then we have to store it into our hash tables */ PMIX_LIST_FOREACH(nm, &nspaces, pmix_nspace_caddy_t) { - if (NULL == nm->ns->compat.gds || 0 == nm->ns->nlocalprocs) { + if (NULL == nm->ns->compat.gds || 0 == nm->ns->nlocalprocs || + !PMIX_CHECK_NSPACE(nptr->nspace, nm->ns->nspace)) { peer = pmix_globals.mypeer; } else { /* there must be at least one local proc */ @@ -993,35 +1051,36 @@ static void _process_dmdx_reply(int fd, short args, void *cbdata) } PMIX_CONSTRUCT(&pbkt, pmix_buffer_t); if (NULL == caddy->data) { - /* we assume that the data was provided via a call to - * register_nspace, so what we need to do now is simply - * transfer it across to the individual nspace storage - * components */ - PMIX_CONSTRUCT(&cb, pmix_cb_t); - PMIX_PROC_CREATE(cb.proc, 1); - if (NULL == cb.proc) { - PMIX_ERROR_LOG(PMIX_ERR_NOMEM); - PMIX_DESTRUCT(&cb); - goto complete; - } - pmix_strncpy(cb.proc->nspace, nm->ns->nspace, PMIX_MAX_NSLEN); - cb.proc->rank = PMIX_RANK_WILDCARD; - cb.scope = PMIX_INTERNAL; - cb.copy = false; - PMIX_GDS_FETCH_KV(rc, pmix_globals.mypeer, &cb); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_DESTRUCT(&cb); - goto complete; - } - PMIX_LIST_FOREACH(kv, &cb.kvs, pmix_kval_t) { - PMIX_GDS_STORE_KV(rc, peer, &caddy->lcd->proc, PMIX_INTERNAL, kv); + if (peer != pmix_globals.mypeer) { + /* we assume that the data was provided via a call to + * register_nspace, so what we need to do now is simply + * transfer it across to the individual nspace storage + * components */ + PMIX_CONSTRUCT(&cb, pmix_cb_t); + PMIX_PROC_CREATE(cb.proc, 1); + if (NULL == cb.proc) { + PMIX_ERROR_LOG(PMIX_ERR_NOMEM); + PMIX_DESTRUCT(&cb); + goto complete; + } + PMIX_LOAD_PROCID(cb.proc, nm->ns->nspace, PMIX_RANK_WILDCARD); + cb.scope = PMIX_INTERNAL; + cb.copy = false; + PMIX_GDS_FETCH_KV(rc, pmix_globals.mypeer, &cb); if (PMIX_SUCCESS != rc) { PMIX_ERROR_LOG(rc); - break; + PMIX_DESTRUCT(&cb); + goto complete; + } + PMIX_LIST_FOREACH(kv, &cb.kvs, pmix_kval_t) { + PMIX_GDS_STORE_KV(rc, peer, &caddy->lcd->proc, PMIX_INTERNAL, kv); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + break; + } } + PMIX_DESTRUCT(&cb); } - PMIX_DESTRUCT(&cb); } else { PMIX_LOAD_BUFFER(pmix_globals.mypeer, &pbkt, caddy->data, caddy->ndata); /* unpack and store it*/ diff --git a/opal/mca/pmix/pmix3x/pmix/src/server/pmix_server_ops.c b/opal/mca/pmix/pmix3x/pmix/src/server/pmix_server_ops.c index 37ec6c5b412..31de2c365d0 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/server/pmix_server_ops.c +++ b/opal/mca/pmix/pmix3x/pmix/src/server/pmix_server_ops.c @@ -1,13 +1,13 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2014-2019 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2020 Intel, Inc. All rights reserved. * Copyright (c) 2014-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2014-2015 Artem Y. Polyakov . * All rights reserved. * Copyright (c) 2016-2017 Mellanox Technologies, Inc. * All rights reserved. - * Copyright (c) 2016 IBM Corporation. All rights reserved. + * Copyright (c) 2016-2019 IBM Corporation. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -56,6 +56,7 @@ #include "src/mca/bfrops/bfrops.h" #include "src/mca/plog/plog.h" #include "src/mca/psensor/psensor.h" +#include "src/mca/ptl/base/base.h" #include "src/util/argv.h" #include "src/util/error.h" #include "src/util/output.h" @@ -372,10 +373,11 @@ static pmix_server_trkr_t* new_tracker(char *id, pmix_proc_t *procs, { pmix_server_trkr_t *trk; size_t i; - bool all_def; + bool all_def, found; pmix_namespace_t *nptr, *ns; pmix_rank_info_t *info; - pmix_rank_t ns_local = 0; + pmix_nspace_caddy_t *nm; + pmix_nspace_t first; pmix_output_verbose(5, pmix_server_globals.base_output, "new_tracker called with %d procs", (int)nprocs); @@ -413,16 +415,12 @@ static pmix_server_trkr_t* new_tracker(char *id, pmix_proc_t *procs, trk->npcs = nprocs; } trk->type = type; + trk->local = false; + trk->nlocal = 0; all_def = true; + PMIX_LOAD_NSPACE(first, NULL); for (i=0; i < nprocs; i++) { - if (NULL == id) { - pmix_strncpy(trk->pcs[i].nspace, procs[i].nspace, PMIX_MAX_NSLEN); - trk->pcs[i].rank = procs[i].rank; - } - if (!all_def) { - continue; - } /* is this nspace known to us? */ nptr = NULL; PMIX_LIST_FOREACH(ns, &pmix_globals.nspaces, pmix_namespace_t) { @@ -431,14 +429,96 @@ static pmix_server_trkr_t* new_tracker(char *id, pmix_proc_t *procs, break; } } + /* check if multiple nspaces are involved in this operation */ + if (0 == strlen(first)) { + PMIX_LOAD_NSPACE(first, procs[i].nspace); + } else if (!PMIX_CHECK_NSPACE(first, procs[i].nspace)) { + trk->hybrid = true; + } if (NULL == nptr) { - /* cannot be a local proc */ + /* we don't know about this nspace. If there is going to + * be at least one local process participating in a fence, + * they we require that either at least one process must already + * have been registered (via "register client") or that the + * nspace itself have been regisered. So either the nspace + * wasn't registered because it doesn't include any local + * procs, or our host has not been told about this nspace + * because it won't host any local procs. We therefore mark + * this tracker as including non-local participants. + * + * NOTE: It is conceivable that someone might want to review + * this constraint at a future date. I believe it has to be + * required (at least for now) as otherwise we wouldn't have + * a way of knowing when all local procs have participated. + * It is possible that a new nspace could come along at some + * later time and add more local participants - but we don't + * know how long to wait. + * + * The only immediately obvious alternative solutions would + * be to either require that RMs always inform all daemons + * about the launch of nspaces, regardless of whether or + * not they will host local procs; or to drop the aggregation + * of local participants and just pass every fence call + * directly to the host. Neither of these seems palatable + * at this time. */ + trk->local = false; + /* we don't know any more info about this nspace, so + * there isn't anything more we can do */ + continue; + } + /* it is possible we know about this nspace because the host + * has registered one or more clients via "register_client", + * but the host has not yet called "register_nspace". There is + * a very tiny race condition whereby this can happen due + * to event-driven processing, but account for it here */ + if (SIZE_MAX == nptr->nlocalprocs) { + /* delay processing until this nspace is registered */ + all_def = false; + continue; + } + if (0 == nptr->nlocalprocs) { + /* the host has informed us that this nspace has no local procs */ pmix_output_verbose(5, pmix_server_globals.base_output, "new_tracker: unknown nspace %s", procs[i].nspace); continue; } - /* have all the clients for this nspace been defined? */ + /* check and add uniq ns into trk nslist */ + found = false; + PMIX_LIST_FOREACH(nm, &trk->nslist, pmix_nspace_caddy_t) { + if (0 == strcmp(nptr->nspace, nm->ns->nspace)) { + found = true; + break; + } + } + if (!found) { + nm = PMIX_NEW(pmix_nspace_caddy_t); + PMIX_RETAIN(nptr); + nm->ns = nptr; + pmix_list_append(&trk->nslist, &nm->super); + } + + /* if they want all the local members of this nspace, then + * add them in here. They told us how many procs will be + * local to us from this nspace, but we don't know their + * ranks. So as long as they want _all_ of them, we can + * handle that case regardless of whether the individual + * clients have been "registered" */ + if (PMIX_RANK_WILDCARD == procs[i].rank) { + trk->nlocal += nptr->nlocalprocs; + /* the total number of procs in this nspace was provided + * in the data blob delivered to register_nspace, so check + * to see if all the procs are local */ + if (nptr->nprocs != nptr->nlocalprocs) { + trk->local = false; + } + continue; + } + + /* They don't want all the local clients, or they are at + * least listing them individually. Check if all the clients + * for this nspace have been registered via "register_client" + * so we know the specific ranks on this node */ if (!nptr->all_registered) { /* nope, so no point in going further on this one - we'll * process it once all the procs are known */ @@ -446,40 +526,26 @@ static pmix_server_trkr_t* new_tracker(char *id, pmix_proc_t *procs, pmix_output_verbose(5, pmix_server_globals.base_output, "new_tracker: all clients not registered nspace %s", procs[i].nspace); - /* we have to continue processing the list of procs - * to setup the trk->pcs array, so don't break out - * of the loop */ + continue; } /* is this one of my local ranks? */ - ns_local = 0; + found = false; PMIX_LIST_FOREACH(info, &nptr->ranks, pmix_rank_info_t) { - if (procs[i].rank == info->pname.rank || - PMIX_RANK_WILDCARD == procs[i].rank) { - pmix_output_verbose(5, pmix_server_globals.base_output, - "adding local proc %s.%d to tracker", - info->pname.nspace, info->pname.rank); + if (procs[i].rank == info->pname.rank) { + pmix_output_verbose(5, pmix_server_globals.base_output, + "adding local proc %s.%d to tracker", + info->pname.nspace, info->pname.rank); + found = true; /* track the count */ - ns_local++; - if (PMIX_RANK_WILDCARD != procs[i].rank) { - break; - } + trk->nlocal++; + break; } } - - trk->nlocal += ns_local; - if (!ns_local) { + if (!found) { trk->local = false; - } else if (PMIX_RANK_WILDCARD == procs[i].rank) { - /* If proc is a wildcard we need to additionally check - * that all of the processes in the namespace were - * locally found. - * Otherwise this tracker is not local - */ - if (ns_local != nptr->nprocs) { - trk->local = false; - } } } + if (all_def) { trk->def_complete = true; } @@ -664,8 +730,8 @@ pmix_status_t pmix_server_fence(pmix_server_caddy_t *cd, * across all participants has been completed */ if (trk->def_complete && pmix_list_get_size(&trk->local_cbs) == trk->nlocal) { - pmix_output_verbose(2, pmix_server_globals.base_output, - "fence complete"); + pmix_output_verbose(2, pmix_server_globals.fence_output, + "fence LOCALLY complete"); /* if this is a purely local fence (i.e., all participants are local), * then it is done and we notify accordingly */ if (trk->local) { @@ -1148,11 +1214,12 @@ static void spcbfunc(pmix_status_t status, goto cleanup; } PMIX_RETAIN(cd->peer); - req->peer = cd->peer; - req->pname.nspace = strdup(nspace); - req->pname.rank = PMIX_RANK_WILDCARD; + req->requestor = cd->peer; + req->nprocs = 1; + PMIX_PROC_CREATE(req->procs, req->nprocs); + PMIX_LOAD_PROCID(&req->procs[0], nspace, PMIX_RANK_WILDCARD); req->channels = cd->channels; - pmix_list_append(&pmix_globals.iof_requests, &req->super); + req->refid = pmix_pointer_array_add(&pmix_globals.iof_requests, req); /* process any cached IO */ PMIX_LIST_FOREACH_SAFE(iof, ionext, &pmix_server_globals.iof, pmix_iof_cache_t) { /* if the channels don't match, then ignore it */ @@ -1160,18 +1227,19 @@ static void spcbfunc(pmix_status_t status, continue; } /* if the source does not match the request, then ignore it */ - if (!PMIX_CHECK_PROCID(&iof->source, &req->pname)) { + if (!PMIX_CHECK_PROCID(&iof->source, &req->procs[0])) { continue; } /* never forward back to the source! This can happen if the source * is a launcher */ - if (PMIX_CHECK_PROCID(&iof->source, &req->peer->info->pname)) { + if (PMIX_CHECK_PROCID(&iof->source, &req->requestor->info->pname)) { continue; } pmix_output_verbose(2, pmix_server_globals.iof_output, "PMIX:SERVER:SPAWN delivering cached IOF from %s:%d to %s:%d", iof->source.nspace, iof->source.rank, - req->pname.nspace, req->pname.rank); + req->requestor->info->pname.nspace, + req->requestor->info->pname.rank); /* setup the msg */ if (NULL == (msg = PMIX_NEW(pmix_buffer_t))) { PMIX_ERROR_LOG(PMIX_ERR_OUT_OF_RESOURCE); @@ -1179,28 +1247,28 @@ static void spcbfunc(pmix_status_t status, break; } /* provide the source */ - PMIX_BFROPS_PACK(rc, req->peer, msg, &iof->source, 1, PMIX_PROC); + PMIX_BFROPS_PACK(rc, req->requestor, msg, &iof->source, 1, PMIX_PROC); if (PMIX_SUCCESS != rc) { PMIX_ERROR_LOG(rc); PMIX_RELEASE(msg); break; } /* provide the channel */ - PMIX_BFROPS_PACK(rc, req->peer, msg, &iof->channel, 1, PMIX_IOF_CHANNEL); + PMIX_BFROPS_PACK(rc, req->requestor, msg, &iof->channel, 1, PMIX_IOF_CHANNEL); if (PMIX_SUCCESS != rc) { PMIX_ERROR_LOG(rc); PMIX_RELEASE(msg); break; } /* pack the data */ - PMIX_BFROPS_PACK(rc, req->peer, msg, iof->bo, 1, PMIX_BYTE_OBJECT); + PMIX_BFROPS_PACK(rc, req->requestor, msg, iof->bo, 1, PMIX_BYTE_OBJECT); if (PMIX_SUCCESS != rc) { PMIX_ERROR_LOG(rc); PMIX_RELEASE(msg); break; } /* send it to the requestor */ - PMIX_PTL_SEND_ONEWAY(rc, req->peer, msg, PMIX_PTL_TAG_IOF); + PMIX_PTL_SEND_ONEWAY(rc, req->requestor, msg, PMIX_PTL_TAG_IOF); if (PMIX_SUCCESS != rc) { PMIX_ERROR_LOG(rc); PMIX_RELEASE(msg); @@ -1310,7 +1378,7 @@ pmix_status_t pmix_server_spawn(pmix_peer_t *peer, * as we need the nspace of the spawned application! */ } /* add the directive to the end */ - if (PMIX_PROC_IS_TOOL(peer)) { + if (PMIX_PEER_IS_TOOL(peer)) { PMIX_INFO_LOAD(&cd->info[ninfo], PMIX_REQUESTOR_IS_TOOL, NULL, PMIX_BOOL); /* if the requestor is a tool, we default to forwarding all * output IO channels */ @@ -2541,12 +2609,16 @@ pmix_status_t pmix_server_log(pmix_peer_t *peer, } cd->cbfunc.opcbfn = cbfunc; cd->cbdata = cbdata; - /* unpack the timestamp */ - cnt = 1; - PMIX_BFROPS_UNPACK(rc, peer, buf, ×tamp, &cnt, PMIX_TIME); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - goto exit; + if (PMIX_PEER_IS_EARLIER(peer, 3, 0, 0)) { + timestamp = -1; + } else { + /* unpack the timestamp */ + cnt = 1; + PMIX_BFROPS_UNPACK(rc, peer, buf, ×tamp, &cnt, PMIX_TIME); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + goto exit; + } } /* unpack the number of data */ @@ -3207,10 +3279,6 @@ pmix_status_t pmix_server_iofreg(pmix_peer_t *peer, pmix_status_t rc; pmix_setup_caddy_t *cd; pmix_iof_req_t *req; - bool notify, match; - size_t n; - pmix_buffer_t *msg; - pmix_iof_cache_t *iof, *ionext; pmix_output_verbose(2, pmix_server_globals.iof_output, "recvd IOF PULL request from client"); @@ -3269,108 +3337,105 @@ pmix_status_t pmix_server_iofreg(pmix_peer_t *peer, goto exit; } - /* check to see if we have already registered this source/channel combination */ - notify = false; - for (n=0; n < cd->nprocs; n++) { - match = false; - PMIX_LIST_FOREACH(req, &pmix_globals.iof_requests, pmix_iof_req_t) { - /* is this request from the same peer? */ - if (peer != req->peer) { - continue; - } - /* do we already have this source for this peer? */ - if (PMIX_CHECK_PROCID(&cd->procs[n], &req->pname)) { - match = true; - if ((req->channels & cd->channels) != cd->channels) { - /* this is a channel update */ - req->channels |= cd->channels; - /* we need to notify the host */ - notify = true; - } - break; - } - } - /* if we didn't find the matching entry, then add it */ - if (!match) { - /* record the request */ - req = PMIX_NEW(pmix_iof_req_t); - if (NULL == req) { - rc = PMIX_ERR_NOMEM; - goto exit; - } - PMIX_RETAIN(peer); - req->peer = peer; - req->pname.nspace = strdup(cd->procs[n].nspace); - req->pname.rank = cd->procs[n].rank; - req->channels = cd->channels; - pmix_list_append(&pmix_globals.iof_requests, &req->super); - } - /* process any cached IO */ - PMIX_LIST_FOREACH_SAFE(iof, ionext, &pmix_server_globals.iof, pmix_iof_cache_t) { - /* if the channels don't match, then ignore it */ - if (!(iof->channel & req->channels)) { - continue; - } - /* if the source does not match the request, then ignore it */ - if (!PMIX_CHECK_PROCID(&iof->source, &req->pname)) { - continue; - } - /* never forward back to the source! This can happen if the source - * is a launcher */ - if (PMIX_CHECK_PROCID(&iof->source, &req->peer->info->pname)) { - continue; - } - pmix_output_verbose(2, pmix_server_globals.iof_output, - "PMIX:SERVER:IOFREQ delivering cached IOF from %s:%d to %s:%d", - iof->source.nspace, iof->source.rank, - req->peer->info->pname.nspace, req->peer->info->pname.rank); - /* setup the msg */ - if (NULL == (msg = PMIX_NEW(pmix_buffer_t))) { - PMIX_ERROR_LOG(PMIX_ERR_OUT_OF_RESOURCE); - rc = PMIX_ERR_OUT_OF_RESOURCE; - break; - } - /* provide the source */ - PMIX_BFROPS_PACK(rc, req->peer, msg, &iof->source, 1, PMIX_PROC); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_RELEASE(msg); - break; - } - /* provide the channel */ - PMIX_BFROPS_PACK(rc, req->peer, msg, &iof->channel, 1, PMIX_IOF_CHANNEL); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_RELEASE(msg); - break; - } - /* pack the data */ - PMIX_BFROPS_PACK(rc, req->peer, msg, iof->bo, 1, PMIX_BYTE_OBJECT); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_RELEASE(msg); - break; - } - /* send it to the requestor */ - PMIX_PTL_SEND_ONEWAY(rc, req->peer, msg, PMIX_PTL_TAG_IOF); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_RELEASE(msg); - } - /* remove it from the list since it has now been forwarded */ - pmix_list_remove_item(&pmix_server_globals.iof, &iof->super); - PMIX_RELEASE(iof); - } + /* add this peer/source/channel combination */ + req = PMIX_NEW(pmix_iof_req_t); + if (NULL == req) { + rc = PMIX_ERR_NOMEM; + goto exit; + } + PMIX_RETAIN(peer); + req->requestor = peer; + req->nprocs = cd->nprocs; + if (0 < req->nprocs) { + PMIX_PROC_CREATE(req->procs, cd->nprocs); + memcpy(req->procs, cd->procs, req->nprocs * sizeof(pmix_proc_t)); + } + req->channels = cd->channels; + req->refid = pmix_pointer_array_add(&pmix_globals.iof_requests, req); + cd->ncodes = req->refid; + + /* ask the host to execute the request */ + if (PMIX_SUCCESS != (rc = pmix_host_server.iof_pull(cd->procs, cd->nprocs, + cd->info, cd->ninfo, + cd->channels, + cbfunc, cd))) { + goto exit; } - if (notify) { - /* ask the host to execute the request */ - if (PMIX_SUCCESS != (rc = pmix_host_server.iof_pull(cd->procs, cd->nprocs, - cd->info, cd->ninfo, - cd->channels, - cbfunc, cd))) { + return PMIX_SUCCESS; + + exit: + PMIX_RELEASE(cd); + return rc; +} + +pmix_status_t pmix_server_iofdereg(pmix_peer_t *peer, + pmix_buffer_t *buf, + pmix_op_cbfunc_t cbfunc, + void *cbdata) +{ + int32_t cnt; + pmix_status_t rc; + pmix_setup_caddy_t *cd; + pmix_iof_req_t *req; + size_t ninfo, refid; + + pmix_output_verbose(2, pmix_server_globals.iof_output, + "recvd IOF DEREGISTER from client"); + + if (NULL == pmix_host_server.iof_pull) { + return PMIX_ERR_NOT_SUPPORTED; + } + + cd = PMIX_NEW(pmix_setup_caddy_t); + if (NULL == cd) { + return PMIX_ERR_NOMEM; + } + cd->cbdata = cbdata; // this is the pmix_server_caddy_t + + /* unpack the number of directives */ + cnt = 1; + PMIX_BFROPS_UNPACK(rc, peer, buf, &ninfo, &cnt, PMIX_SIZE); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + goto exit; + } + /* unpack the directives - note that we have to add one + * to tell the server to stop forwarding to this channel */ + cd->ninfo = ninfo + 1; + PMIX_INFO_CREATE(cd->info, cd->ninfo); + if (0 < ninfo) { + cnt = ninfo; + PMIX_BFROPS_UNPACK(rc, peer, buf, cd->info, &cnt, PMIX_INFO); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); goto exit; } } + /* add the directive to stop forwarding */ + PMIX_INFO_LOAD(&cd->info[ninfo], PMIX_IOF_STOP, NULL, PMIX_BOOL); + + /* unpack the handler ID */ + cnt = 1; + PMIX_BFROPS_UNPACK(rc, peer, buf, &refid, &cnt, PMIX_SIZE); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + goto exit; + } + + /* get the referenced handler */ + req = (pmix_iof_req_t*)pmix_pointer_array_get_item(&pmix_globals.iof_requests, refid); + if (NULL == req) { + /* already gone? */ + rc = PMIX_ERR_NOT_FOUND; + goto exit; + } + /* tell the server to stop */ + if (PMIX_SUCCESS != (rc = pmix_host_server.iof_pull(cd->procs, cd->nprocs, + cd->info, cd->ninfo, + cd->channels, + cbfunc, cd))) { + goto exit; + } return PMIX_SUCCESS; exit: @@ -3502,6 +3567,7 @@ static void tcon(pmix_server_trkr_t *t) t->pname.rank = PMIX_RANK_UNDEF; t->pcs = NULL; t->npcs = 0; + PMIX_CONSTRUCT(&t->nslist, pmix_list_t); PMIX_CONSTRUCT_LOCK(&t->lock); t->def_complete = false; PMIX_CONSTRUCT(&t->local_cbs, pmix_list_t); @@ -3520,6 +3586,7 @@ static void tdes(pmix_server_trkr_t *t) if (NULL != t->id) { free(t->id); } + PMIX_LIST_DESTRUCT(&t->nslist); PMIX_DESTRUCT_LOCK(&t->lock); if (NULL != t->pcs) { free(t->pcs); @@ -3539,6 +3606,8 @@ static void cdcon(pmix_server_caddy_t *cd) cd->event_active = false; cd->trk = NULL; cd->peer = NULL; + cd->info = NULL; + cd->ninfo = 0; } static void cddes(pmix_server_caddy_t *cd) { @@ -3556,7 +3625,6 @@ PMIX_CLASS_INSTANCE(pmix_server_caddy_t, pmix_list_item_t, cdcon, cddes); - static void scadcon(pmix_setup_caddy_t *p) { p->peer = NULL; @@ -3757,10 +3825,15 @@ PMIX_CLASS_INSTANCE(pmix_inventory_rollup_t, static void iocon(pmix_iof_cache_t *p) { p->bo = NULL; + p->info = NULL; + p->ninfo = 0; } static void iodes(pmix_iof_cache_t *p) { PMIX_BYTE_OBJECT_FREE(p->bo, 1); // macro protects against NULL + if (0 < p->ninfo) { + PMIX_INFO_FREE(p->info, p->ninfo); + } } PMIX_CLASS_INSTANCE(pmix_iof_cache_t, pmix_list_item_t, diff --git a/opal/mca/pmix/pmix3x/pmix/src/server/pmix_server_ops.h b/opal/mca/pmix/pmix3x/pmix/src/server/pmix_server_ops.h index 06fddc1fe96..a7f0be96ec6 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/server/pmix_server_ops.h +++ b/opal/mca/pmix/pmix3x/pmix/src/server/pmix_server_ops.h @@ -148,6 +148,8 @@ typedef struct { pmix_proc_t source; pmix_iof_channel_t channel; pmix_byte_object_t *bo; + pmix_info_t *info; + size_t ninfo; } pmix_iof_cache_t; PMIX_CLASS_DECLARATION(pmix_iof_cache_t); @@ -323,6 +325,11 @@ pmix_status_t pmix_server_iofstdin(pmix_peer_t *peer, pmix_op_cbfunc_t cbfunc, void *cbdata); +pmix_status_t pmix_server_iofdereg(pmix_peer_t *peer, + pmix_buffer_t *buf, + pmix_op_cbfunc_t cbfunc, + void *cbdata); + pmix_status_t pmix_server_event_recvd_from_client(pmix_peer_t *peer, pmix_buffer_t *buf, pmix_op_cbfunc_t cbfunc, diff --git a/opal/mca/pmix/pmix3x/pmix/src/tool/pmix_tool.c b/opal/mca/pmix/pmix3x/pmix/src/tool/pmix_tool.c index 585ea08fe49..956b16113a1 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/tool/pmix_tool.c +++ b/opal/mca/pmix/pmix3x/pmix/src/tool/pmix_tool.c @@ -196,14 +196,18 @@ static void tool_iof_handler(struct pmix_peer_t *pr, pmix_byte_object_t bo; int32_t cnt; pmix_status_t rc; + size_t refid, ninfo=0; + pmix_iof_req_t *req; + pmix_info_t *info; pmix_output_verbose(2, pmix_client_globals.iof_output, "recvd IOF with %d bytes", (int)buf->bytes_used); - /* if the buffer is empty, they are simply closing the channel */ + /* if the buffer is empty, they are simply closing the socket */ if (0 == buf->bytes_used) { return; } + PMIX_BYTE_OBJECT_CONSTRUCT(&bo); cnt = 1; PMIX_BFROPS_UNPACK(rc, peer, buf, &source, &cnt, PMIX_PROC); @@ -218,13 +222,52 @@ static void tool_iof_handler(struct pmix_peer_t *pr, return; } cnt = 1; - PMIX_BFROPS_UNPACK(rc, peer, buf, &bo, &cnt, PMIX_BYTE_OBJECT); + PMIX_BFROPS_UNPACK(rc, peer, buf, &refid, &cnt, PMIX_SIZE); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + return; + } + cnt = 1; + PMIX_BFROPS_UNPACK(rc, peer, buf, &ninfo, &cnt, PMIX_SIZE); if (PMIX_SUCCESS != rc) { PMIX_ERROR_LOG(rc); return; } - if (NULL != bo.bytes && 0 < bo.size) { - pmix_iof_write_output(&source, channel, &bo, NULL); + if (0 < ninfo) { + PMIX_INFO_CREATE(info, ninfo); + cnt = ninfo; + PMIX_BFROPS_UNPACK(rc, peer, buf, info, &cnt, PMIX_INFO); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + goto cleanup; + } + } + cnt = 1; + PMIX_BFROPS_UNPACK(rc, peer, buf, &bo, &cnt, PMIX_BYTE_OBJECT); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + goto cleanup; + } + /* lookup the handler for this IOF package */ + if (NULL == (req = (pmix_iof_req_t*)pmix_pointer_array_get_item(&pmix_globals.iof_requests, refid))) { + /* something wrong here - should not happen */ + PMIX_ERROR_LOG(PMIX_ERR_NOT_FOUND); + goto cleanup; + } + /* if the handler invokes a callback function, do so */ + if (NULL != req->cbfunc) { + req->cbfunc(refid, channel, &source, &bo, info, ninfo); + } else { + /* otherwise, simply write it out to the specified std IO channel */ + if (NULL != bo.bytes && 0 < bo.size) { + pmix_iof_write_output(&source, channel, &bo, NULL); + } + } + + cleanup: + /* cleanup the memory */ + if (0 < ninfo) { + PMIX_INFO_FREE(info, ninfo); } PMIX_BYTE_OBJECT_DESTRUCT(&bo); } @@ -275,7 +318,7 @@ PMIX_EXPORT int PMIx_tool_init(pmix_proc_t *proc, pmix_ptl_posted_recv_t *rcv; pmix_proc_t wildcard; int fd; - pmix_proc_type_t ptype; + pmix_proc_type_t ptype = PMIX_PROC_TYPE_STATIC_INIT; pmix_cb_t cb; pmix_buffer_t *req; pmix_cmd_t cmd = PMIX_REQ_CMD; @@ -301,7 +344,7 @@ PMIX_EXPORT int PMIx_tool_init(pmix_proc_t *proc, /* parse the input directives */ gdsfound = false; - ptype = PMIX_PROC_TOOL; + PMIX_SET_PROC_TYPE(&ptype, PMIX_PROC_TOOL); if (NULL != info) { for (n=0; n < ninfo; n++) { if (0 == strncmp(info[n].key, PMIX_GDS_MODULE, PMIX_MAX_KEYLEN)) { @@ -328,7 +371,9 @@ PMIX_EXPORT int PMIx_tool_init(pmix_proc_t *proc, /* they want us to forward our stdin to someone */ fwd_stdin = true; } else if (0 == strncmp(info[n].key, PMIX_LAUNCHER, PMIX_MAX_KEYLEN)) { - ptype |= PMIX_PROC_LAUNCHER; + if (PMIX_INFO_TRUE(&info[n])) { + PMIX_SET_PROC_TYPE(&ptype, PMIX_PROC_LAUNCHER); + } } else if (0 == strncmp(info[n].key, PMIX_SERVER_TMPDIR, PMIX_MAX_KEYLEN)) { pmix_server_globals.tmpdir = strdup(info[n].value.data.string); } else if (0 == strncmp(info[n].key, PMIX_SYSTEM_TMPDIR, PMIX_MAX_KEYLEN)) { @@ -389,7 +434,11 @@ PMIX_EXPORT int PMIx_tool_init(pmix_proc_t *proc, return PMIX_ERR_BAD_PARAM; } /* flag that this tool is also a client */ - ptype |= PMIX_PROC_CLIENT_TOOL; + if (PMIX_PROC_IS_LAUNCHER(&ptype)) { + PMIX_SET_PROC_TYPE(&ptype, PMIX_PROC_CLIENT_LAUNCHER); + } else { + PMIX_SET_PROC_TYPE(&ptype, PMIX_PROC_CLIENT_TOOL); + } } else if (nspace_in_enviro) { /* this is an error - we can't have one and not * the other */ @@ -407,7 +456,7 @@ PMIX_EXPORT int PMIx_tool_init(pmix_proc_t *proc, /* if we are a launcher, then we also need to act as a server, * so setup the server-related structures here */ - if (PMIX_PROC_LAUNCHER_ACT & ptype) { + if (PMIX_PROC_IS_LAUNCHER(&ptype)) { if (PMIX_SUCCESS != (rc = pmix_server_initialize())) { PMIX_ERROR_LOG(rc); if (NULL != nspace) { @@ -425,7 +474,7 @@ PMIX_EXPORT int PMIx_tool_init(pmix_proc_t *proc, /* setup the runtime - this init's the globals, * opens and initializes the required frameworks */ - if (PMIX_SUCCESS != (rc = pmix_rte_init(ptype, info, ninfo, + if (PMIX_SUCCESS != (rc = pmix_rte_init(ptype.type, info, ninfo, pmix_tool_notify_recv))) { PMIX_ERROR_LOG(rc); if (NULL != nspace) { @@ -486,7 +535,7 @@ PMIX_EXPORT int PMIx_tool_init(pmix_proc_t *proc, pmix_output_verbose(2, pmix_globals.debug_output, "pmix: init called"); - if (PMIX_PROC_IS_CLIENT(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_CLIENT(pmix_globals.mypeer)) { /* if we are a client, then we need to pickup the * rest of the envar-based server assignments */ pmix_globals.pindex = -1; @@ -576,6 +625,25 @@ PMIX_EXPORT int PMIx_tool_init(pmix_proc_t *proc, return PMIX_ERR_INIT; } + /* if we are a launcher, then we also need to act as a server, + * so setup the server-related structures here */ + if (PMIX_PROC_IS_LAUNCHER(&ptype) || + PMIX_PROC_IS_CLIENT_LAUNCHER(&ptype)) { + if (PMIX_SUCCESS != (rc = pmix_server_initialize())) { + PMIX_ERROR_LOG(rc); + if (NULL != nspace) { + free(nspace); + } + if (gdsfound) { + PMIX_INFO_DESTRUCT(&ginfo); + } + PMIX_RELEASE_THREAD(&pmix_global_lock); + return rc; + } + /* setup the function pointers */ + memset(&pmix_host_server, 0, sizeof(pmix_server_module_t)); + } + if (do_not_connect) { /* ensure we mark that we are not connected */ pmix_globals.connected = false; @@ -613,7 +681,7 @@ PMIX_EXPORT int PMIx_tool_init(pmix_proc_t *proc, pmix_globals.mypeer->info->pname.rank = pmix_globals.myid.rank; /* if we are acting as a server, then start listening */ - if (PMIX_PROC_IS_LAUNCHER(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_LAUNCHER(pmix_globals.mypeer)) { /* setup the wildcard recv for inbound messages from clients */ rcv = PMIX_NEW(pmix_ptl_posted_recv_t); rcv->tag = UINT32_MAX; @@ -712,7 +780,7 @@ PMIX_EXPORT int PMIx_tool_init(pmix_proc_t *proc, * job info - we do this as a non-blocking * transaction because some systems cannot handle very large * blocking operations and error out if we try them. */ - if (PMIX_PROC_IS_CLIENT(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_CLIENT(pmix_globals.mypeer)) { req = PMIX_NEW(pmix_buffer_t); PMIX_BFROPS_PACK(rc, pmix_client_globals.myserver, req, &cmd, 1, PMIX_COMMAND); @@ -775,7 +843,7 @@ PMIX_EXPORT int PMIx_tool_init(pmix_proc_t *proc, PMIX_RELEASE_THREAD(&pmix_global_lock); /* if we are acting as a server, then start listening */ - if (PMIX_PROC_IS_LAUNCHER(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_LAUNCHER(pmix_globals.mypeer)) { /* start listening for connections */ if (PMIX_SUCCESS != pmix_ptl_base_start_listening(info, ninfo)) { pmix_show_help("help-pmix-server.txt", "listener-thread-start", true); @@ -1213,7 +1281,7 @@ PMIX_EXPORT pmix_status_t PMIx_tool_finalize(void) } } - if (PMIX_PROC_IS_LAUNCHER(pmix_globals.mypeer)) { + if (PMIX_PEER_IS_LAUNCHER(pmix_globals.mypeer)) { pmix_ptl_base_stop_listening(); for (n=0; n < pmix_server_globals.clients.size; n++) { diff --git a/opal/mca/pmix/pmix3x/pmix/src/tools/pmix_info/Makefile.am b/opal/mca/pmix/pmix3x/pmix/src/tools/pmix_info/Makefile.am index 343cfaa3b84..71846055983 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/tools/pmix_info/Makefile.am +++ b/opal/mca/pmix/pmix3x/pmix/src/tools/pmix_info/Makefile.am @@ -19,13 +19,22 @@ # $HEADER$ # +if SOURCE_DATE_EPOCH_SET + USER = @PMIX_CONFIGURE_USER@ + PMIX_BUILD_HOST = @PMIX_CONFIGURE_HOST@ + PMIX_BUILD_DATE = @PMIX_CONFIGURE_DATE@ +else + PMIX_BUILD_HOST = `(hostname || uname -n) 2> /dev/null | sed 1q` + PMIX_BUILD_DATE = `date +%Y-%m-%dT%H:%M:%S` +endif + AM_CFLAGS = \ -DPMIX_CONFIGURE_USER="\"@PMIX_CONFIGURE_USER@\"" \ -DPMIX_CONFIGURE_HOST="\"@PMIX_CONFIGURE_HOST@\"" \ -DPMIX_CONFIGURE_DATE="\"@PMIX_CONFIGURE_DATE@\"" \ - -DPMIX_BUILD_USER="\"$$USER\"" \ - -DPMIX_BUILD_HOST="\"`(hostname || uname -n) 2> /dev/null | sed 1q`\"" \ - -DPMIX_BUILD_DATE="\"`date`\"" \ + -DPMIX_BUILD_USER="\"$(USER)\"" \ + -DPMIX_BUILD_HOST="\"$(PMIX_BUILD_HOST)\"" \ + -DPMIX_BUILD_DATE="\"$(PMIX_BUILD_DATE)\"" \ -DPMIX_BUILD_CFLAGS="\"@CFLAGS@\"" \ -DPMIX_BUILD_CPPFLAGS="\"@CPPFLAGS@\"" \ -DPMIX_BUILD_LDFLAGS="\"@LDFLAGS@\"" \ diff --git a/opal/mca/pmix/pmix3x/pmix/src/util/compress.c b/opal/mca/pmix/pmix3x/pmix/src/util/compress.c index d71cdf37c63..7089fb77c32 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/util/compress.c +++ b/opal/mca/pmix/pmix3x/pmix/src/util/compress.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Intel, Inc. All rights reserved. + * Copyright (c) 2016 Intel, Inc. All rights reserved. * Copyright (c) 2017 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ * @@ -43,7 +43,15 @@ bool pmix_util_compress_string(char *instring, /* get an upper bound on the required output storage */ len = deflateBound(&strm, inlen); + /* if this isn't going to result in a smaller footprint, + * then don't do it */ + if (len >= inlen) { + (void)deflateEnd(&strm); + return false; + } + if (NULL == (tmp = (uint8_t*)malloc(len))) { + (void)deflateEnd(&strm); return false; } strm.next_in = (uint8_t*)instring; @@ -55,8 +63,8 @@ bool pmix_util_compress_string(char *instring, strm.next_out = tmp; rc = deflate (&strm, Z_FINISH); - deflateEnd (&strm); - if (Z_OK != rc) { + (void)deflateEnd (&strm); + if (Z_OK != rc && Z_STREAM_END != rc) { free(tmp); return false; } diff --git a/opal/mca/pmix/pmix3x/pmix/src/util/name_fns.c b/opal/mca/pmix/pmix3x/pmix/src/util/name_fns.c index 96b46ea9d58..7e51f7461ed 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/util/name_fns.c +++ b/opal/mca/pmix/pmix3x/pmix/src/util/name_fns.c @@ -27,6 +27,7 @@ #include "pmix_common.h" #include "src/threads/tsd.h" +#include "src/include/pmix_globals.h" #include "src/util/error.h" #include "src/util/name_fns.h" #include "src/util/printf.h" @@ -94,7 +95,7 @@ get_print_name_buffer(void) return (pmix_print_args_buffers_t*) ptr; } -char* pmix_util_print_name_args(const pmix_proc_t *name) +static char* print_args(char *ns, pmix_rank_t rnk) { pmix_print_args_buffers_t *ptr; char *rank; @@ -107,8 +108,8 @@ char* pmix_util_print_name_args(const pmix_proc_t *name) return pmix_print_args_null; } - /* protect against NULL names */ - if (NULL == name) { + /* protect against NULL nspace */ + if (NULL == ns) { index = ptr->cntr; snprintf(ptr->buffers[index], PMIX_PRINT_NAME_ARGS_MAX_SIZE, "[NO-NAME]"); ptr->cntr++; @@ -118,12 +119,12 @@ char* pmix_util_print_name_args(const pmix_proc_t *name) return ptr->buffers[index]; } - rank = pmix_util_print_rank(name->rank); + rank = pmix_util_print_rank(rnk); index = ptr->cntr; snprintf(ptr->buffers[index], PMIX_PRINT_NAME_ARGS_MAX_SIZE, - "[%s:%s]", name->nspace, rank); + "[%s:%s]", ns, rank); ptr->cntr++; if (PMIX_PRINT_NAME_ARG_NUM_BUFS == ptr->cntr) { ptr->cntr = 0; @@ -132,6 +133,24 @@ char* pmix_util_print_name_args(const pmix_proc_t *name) return ptr->buffers[index]; } +char* pmix_util_print_name_args(const pmix_proc_t *name) +{ + if (NULL == name) { + return print_args(NULL, PMIX_RANK_UNDEF); + } + + return print_args((char*)name->nspace, name->rank); +} + +char *pmix_util_print_pname_args(const pmix_name_t *name) +{ + if (NULL == name) { + return print_args(NULL, PMIX_RANK_UNDEF); + } + + return print_args((char*)name->nspace, name->rank); +} + char* pmix_util_print_rank(const pmix_rank_t vpid) { pmix_print_args_buffers_t *ptr; diff --git a/opal/mca/pmix/pmix3x/pmix/src/util/name_fns.h b/opal/mca/pmix/pmix3x/pmix/src/util/name_fns.h index 4fa01e0c08e..e03e4da3b6c 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/util/name_fns.h +++ b/opal/mca/pmix/pmix3x/pmix/src/util/name_fns.h @@ -12,7 +12,7 @@ * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2014-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2018 Intel, Inc. All rights reserved. + * Copyright (c) 2018-2019 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -30,6 +30,7 @@ #endif #include "pmix_common.h" +#include "src/include/pmix_globals.h" BEGIN_C_DECLS @@ -38,6 +39,10 @@ PMIX_EXPORT char* pmix_util_print_name_args(const pmix_proc_t *name); #define PMIX_NAME_PRINT(n) \ pmix_util_print_name_args(n) +PMIX_EXPORT char *pmix_util_print_pname_args(const pmix_name_t *name); +#define PMIX_PNAME_PRINT(n) \ + pmix_util_print_pname_args(n) + PMIX_EXPORT char* pmix_util_print_rank(const pmix_rank_t vpid); #define PMIX_RANK_PRINT(n) \ pmix_util_print_rank(n) diff --git a/opal/mca/pmix/pmix3x/pmix/test/cli_stages.c b/opal/mca/pmix/pmix3x/pmix/test/cli_stages.c index 5fbfec419dc..52885b3b4b5 100644 --- a/opal/mca/pmix/pmix3x/pmix/test/cli_stages.c +++ b/opal/mca/pmix/pmix3x/pmix/test/cli_stages.c @@ -17,6 +17,7 @@ cli_info_t *cli_info = NULL; int cli_info_cnt = 0; bool test_abort = false; +bool test_complete = false; int cli_rank(cli_info_t *cli) { @@ -177,70 +178,6 @@ void cli_cleanup(cli_info_t *cli) } -bool test_terminated(void) -{ - bool ret = true; - int i; - - // All clients should disconnect - for(i=0; i < cli_info_cnt; i++){ - ret = ret && (CLI_TERM <= cli_info[i].state); - } - return (ret || test_abort); -} - -void cli_wait_all(double timeout) -{ - struct timeval tv; - double start_time, cur_time; - - gettimeofday(&tv, NULL); - start_time = tv.tv_sec + 1E-6*tv.tv_usec; - cur_time = start_time; - - //TEST_VERBOSE(("Wait for all children to terminate")) - - // Wait for all children to cleanup after the test. - while( !test_terminated() && ( timeout >= (cur_time - start_time) ) ){ - struct timespec ts; - int status, i; - pid_t pid; - while( 0 < (pid = waitpid(-1, &status, WNOHANG) ) ){ - TEST_VERBOSE(("waitpid = %d", pid)); - for(i=0; i < cli_info_cnt; i++){ - if( cli_info[i].pid == pid ){ - TEST_VERBOSE(("the child with pid = %d has rank = %d, ns = %s\n" - "\t\texited = %d, signalled = %d", pid, - cli_info[i].rank, cli_info[i].ns, - WIFEXITED(status), WIFSIGNALED(status) )); - if( WIFEXITED(status) || WIFSIGNALED(status) ){ - cli_cleanup(&cli_info[i]); - } - } - } - } - if( pid < 0 ){ - if( errno == ECHILD ){ - TEST_VERBOSE(("No more children to wait. Happens on the last cli_wait_all call " - "which is used to ensure that all children terminated.\n")); - if (pmix_test_verbose) { - sleep(1); - } - break; - } else { - TEST_ERROR(("waitpid(): %d : %s", errno, strerror(errno))); - exit(0); - } - } - ts.tv_sec = 0; - ts.tv_nsec = 100000; - nanosleep(&ts, NULL); - // calculate current timestamp - gettimeofday(&tv, NULL); - cur_time = tv.tv_sec + 1E-6*tv.tv_usec; - } -} - void cli_kill_all(void) { int i; diff --git a/opal/mca/pmix/pmix3x/pmix/test/cli_stages.h b/opal/mca/pmix/pmix3x/pmix/test/cli_stages.h index 011023d7a79..c66fc2ca21f 100644 --- a/opal/mca/pmix/pmix3x/pmix/test/cli_stages.h +++ b/opal/mca/pmix/pmix3x/pmix/test/cli_stages.h @@ -48,11 +48,14 @@ typedef struct { cli_state_t next_state[CLI_TERM+1]; pmix_rank_t rank; char *ns; + int exit_code; + bool alive; } cli_info_t; extern cli_info_t *cli_info; extern int cli_info_cnt; extern bool test_abort; +extern bool test_complete; int cli_rank(cli_info_t *cli); void cli_init(int nprocs); @@ -61,11 +64,8 @@ void cli_finalize(cli_info_t *cli); void cli_disconnect(cli_info_t *cli); void cli_terminate(cli_info_t *cli); void cli_cleanup(cli_info_t *cli); -void cli_wait_all(double timeout); void cli_kill_all(void); -bool test_terminated(void); - void errhandler(size_t evhdlr_registration_id, pmix_status_t status, const pmix_proc_t *source, diff --git a/opal/mca/pmix/pmix3x/pmix/test/pmix_client.c b/opal/mca/pmix/pmix3x/pmix/test/pmix_client.c index 3d1b46fcfac..dc7f54c90c1 100644 --- a/opal/mca/pmix/pmix3x/pmix/test/pmix_client.c +++ b/opal/mca/pmix/pmix3x/pmix/test/pmix_client.c @@ -13,7 +13,7 @@ * All rights reserved. * Copyright (c) 2009-2012 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2013-2018 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2019 Intel, Inc. All rights reserved. * Copyright (c) 2015-2018 Mellanox Technologies, Inc. All rights reserved. * $COPYRIGHT$ * @@ -41,31 +41,6 @@ #include "test_replace.h" #include "test_internal.h" -static void errhandler(size_t evhdlr_registration_id, - pmix_status_t status, - const pmix_proc_t *source, - pmix_info_t info[], size_t ninfo, - pmix_info_t results[], size_t nresults, - pmix_event_notification_cbfunc_fn_t cbfunc, - void *cbdata) -{ - TEST_ERROR(("PMIX client: Error handler with status = %d", status)) -} - -static void op_callbk(pmix_status_t status, - void *cbdata) -{ - TEST_VERBOSE(( "OP CALLBACK CALLED WITH STATUS %d", status)); -} - -static void errhandler_reg_callbk (pmix_status_t status, - size_t errhandler_ref, - void *cbdata) -{ - TEST_VERBOSE(("PMIX client ERRHANDLER REGISTRATION CALLBACK CALLED WITH STATUS %d, ref=%lu", - status, (unsigned long)errhandler_ref)); -} - int main(int argc, char **argv) { int rc; @@ -97,13 +72,7 @@ int main(int argc, char **argv) if (PMIX_SUCCESS != (rc = PMIx_Init(&myproc, info, ninfo))) { TEST_ERROR(("Client ns %s rank %d: PMIx_Init failed: %d", params.nspace, params.rank, rc)); FREE_TEST_PARAMS(params); - exit(0); - } - PMIx_Register_event_handler(NULL, 0, NULL, 0, errhandler, errhandler_reg_callbk, NULL); - if (myproc.rank != params.rank) { - TEST_ERROR(("Client ns %s Rank returned in PMIx_Init %d does not match to rank from command line %d.", myproc.nspace, myproc.rank, params.rank)); - FREE_TEST_PARAMS(params); - exit(0); + exit(rc); } if ( NULL != params.prefix && -1 != params.ns_id) { TEST_SET_FILE(params.prefix, params.ns_id, params.rank); @@ -115,12 +84,12 @@ int main(int argc, char **argv) if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_UNIV_SIZE, NULL, 0, &val))) { TEST_ERROR(("rank %d: PMIx_Get universe size failed: %d", myproc.rank, rc)); FREE_TEST_PARAMS(params); - exit(0); + exit(rc); } if (NULL == val) { TEST_ERROR(("rank %d: PMIx_Get universe size returned NULL value", myproc.rank)); FREE_TEST_PARAMS(params); - exit(0); + exit(1); } if (val->type != PMIX_UINT32 || val->data.uint32 != (uint32_t)params.ns_size ) { TEST_ERROR(("rank %d: Universe size value or type mismatch," @@ -128,7 +97,7 @@ int main(int argc, char **argv) myproc.rank, params.ns_size, PMIX_UINT32, val->data.integer, val->type)); FREE_TEST_PARAMS(params); - exit(0); + exit(1); } TEST_VERBOSE(("rank %d: Universe size check: PASSED", myproc.rank)); @@ -136,7 +105,7 @@ int main(int argc, char **argv) if( NULL != params.nspace && 0 != strcmp(myproc.nspace, params.nspace) ) { TEST_ERROR(("rank %d: Bad nspace!", myproc.rank)); FREE_TEST_PARAMS(params); - exit(0); + exit(1); } if (NULL != params.fences) { @@ -144,7 +113,7 @@ int main(int argc, char **argv) if (PMIX_SUCCESS != rc) { FREE_TEST_PARAMS(params); TEST_ERROR(("%s:%d Fence test failed: %d", myproc.nspace, myproc.rank, rc)); - exit(0); + exit(rc); } } @@ -153,7 +122,7 @@ int main(int argc, char **argv) if (PMIX_SUCCESS != rc) { FREE_TEST_PARAMS(params); TEST_ERROR(("%s:%d Job fence test failed: %d", myproc.nspace, myproc.rank, rc)); - exit(0); + exit(rc); } } @@ -162,7 +131,7 @@ int main(int argc, char **argv) if (PMIX_SUCCESS != rc) { FREE_TEST_PARAMS(params); TEST_ERROR(("%s:%d Publish/Lookup test failed: %d", myproc.nspace, myproc.rank, rc)); - exit(0); + exit(rc); } } @@ -171,7 +140,7 @@ int main(int argc, char **argv) if (PMIX_SUCCESS != rc) { FREE_TEST_PARAMS(params); TEST_ERROR(("%s:%d Spawn test failed: %d", myproc.nspace, myproc.rank, rc)); - exit(0); + exit(rc); } } @@ -180,7 +149,7 @@ int main(int argc, char **argv) if (PMIX_SUCCESS != rc) { FREE_TEST_PARAMS(params); TEST_ERROR(("%s:%d Connect/Disconnect test failed: %d", myproc.nspace, myproc.rank, rc)); - exit(0); + exit(rc); } } @@ -189,7 +158,7 @@ int main(int argc, char **argv) if (PMIX_SUCCESS != rc) { FREE_TEST_PARAMS(params); TEST_ERROR(("%s:%d Resolve peers test failed: %d", myproc.nspace, myproc.rank, rc)); - exit(0); + exit(rc); } } @@ -198,7 +167,7 @@ int main(int argc, char **argv) if (PMIX_SUCCESS != rc) { FREE_TEST_PARAMS(params); TEST_ERROR(("%s:%d error registration and event handling test failed: %d", myproc.nspace, myproc.rank, rc)); - exit(0); + exit(rc); } } @@ -207,7 +176,7 @@ int main(int argc, char **argv) if (PMIX_SUCCESS != rc) { FREE_TEST_PARAMS(params); TEST_ERROR(("%s:%d error key replace test failed: %d", myproc.nspace, myproc.rank, rc)); - exit(0); + exit(rc); } } @@ -216,12 +185,11 @@ int main(int argc, char **argv) if (PMIX_SUCCESS != rc) { FREE_TEST_PARAMS(params); TEST_ERROR(("%s:%d error key store internal test failed: %d", myproc.nspace, myproc.rank, rc)); - exit(0); + exit(rc); } } TEST_VERBOSE(("Client ns %s rank %d: PASSED", myproc.nspace, myproc.rank)); - PMIx_Deregister_event_handler(1, op_callbk, NULL); /* In case of direct modex we want to delay Finalize until everybody has finished. Otherwise some processes @@ -240,5 +208,5 @@ int main(int argc, char **argv) TEST_OUTPUT_CLEAR(("OK\n")); TEST_CLOSE_FILE(); FREE_TEST_PARAMS(params); - exit(0); + exit(rc); } diff --git a/opal/mca/pmix/pmix3x/pmix/test/pmix_regex.c b/opal/mca/pmix/pmix3x/pmix/test/pmix_regex.c index 35a671305d2..b117140465e 100644 --- a/opal/mca/pmix/pmix3x/pmix/test/pmix_regex.c +++ b/opal/mca/pmix/pmix3x/pmix/test/pmix_regex.c @@ -13,7 +13,7 @@ * All rights reserved. * Copyright (c) 2009-2012 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2019 Intel, Inc. All rights reserved. * Copyright (c) 2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ @@ -75,6 +75,7 @@ int main(int argc, char **argv) free(regex); } else { fprintf(stderr, "Node reverse failed: %d\n\n\n", rc); + exit(rc); } fprintf(stderr, "PROCS: %s\n", TEST_PROCS); @@ -91,6 +92,7 @@ int main(int argc, char **argv) free(regex); } else { fprintf(stderr, "PPN reverse failed: %d\n", rc); + exit(rc); } fprintf(stderr, "NODES: %s\n", TEST_NODES2); @@ -107,6 +109,7 @@ int main(int argc, char **argv) free(regex); } else { fprintf(stderr, "Node reverse failed: %d\n\n\n", rc); + exit(rc); } return 0; } diff --git a/opal/mca/pmix/pmix3x/pmix/test/pmix_test.c b/opal/mca/pmix/pmix3x/pmix/test/pmix_test.c index 9ceeb72d539..b8d0825f872 100644 --- a/opal/mca/pmix/pmix3x/pmix/test/pmix_test.c +++ b/opal/mca/pmix/pmix3x/pmix/test/pmix_test.c @@ -13,7 +13,7 @@ * All rights reserved. * Copyright (c) 2009-2012 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2013-2018 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2019 Intel, Inc. All rights reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2015-2018 Mellanox Technologies, Inc. @@ -30,6 +30,7 @@ #include #include #include +#include #include "src/util/pmix_environ.h" #include "src/util/output.h" @@ -45,18 +46,13 @@ int main(int argc, char **argv) { char **client_env=NULL; char **client_argv=NULL; - int rc; + int rc, i; struct stat stat_buf; - struct timeval tv; - double test_start; - test_params params; - INIT_TEST_PARAMS(params); - int test_fail = 0; char *tmp; int ns_nprocs; + sigset_t unblock; - gettimeofday(&tv, NULL); - test_start = tv.tv_sec + 1E-6*tv.tv_usec; + INIT_TEST_PARAMS(params); /* smoke test */ if (PMIX_SUCCESS != 0) { @@ -92,6 +88,20 @@ int main(int argc, char **argv) return 0; } + /* ensure that SIGCHLD is unblocked as we need to capture it */ + if (0 != sigemptyset(&unblock)) { + fprintf(stderr, "SIGEMPTYSET FAILED\n"); + exit(1); + } + if (0 != sigaddset(&unblock, SIGCHLD)) { + fprintf(stderr, "SIGADDSET FAILED\n"); + exit(1); + } + if (0 != sigprocmask(SIG_UNBLOCK, &unblock, NULL)) { + fprintf(stderr, "SIG_UNBLOCK FAILED\n"); + exit(1); + } + if (PMIX_SUCCESS != (rc = server_init(¶ms))) { FREE_TEST_PARAMS(params); return rc; @@ -136,26 +146,15 @@ int main(int argc, char **argv) TEST_ERROR(("Total number of processes doesn't correspond number specified by ns_dist parameter.")); cli_kill_all(); test_fail = 1; + goto done; } /* hang around until the client(s) finalize */ - while (!test_terminated()) { - // To avoid test hang we want to interrupt the loop each 0.1s - double test_current; - - // check if we exceed the max time - gettimeofday(&tv, NULL); - test_current = tv.tv_sec + 1E-6*tv.tv_usec; - if( (test_current - test_start) > params.timeout ){ - break; - } - cli_wait_all(0); - } - - if( !test_terminated() ){ - TEST_ERROR(("Test exited by a timeout!")); - cli_kill_all(); - test_fail = 1; + while (!test_complete) { + struct timespec ts; + ts.tv_sec = 0; + ts.tv_nsec = 100000; + nanosleep(&ts, NULL); } if( test_abort ){ @@ -170,17 +169,25 @@ int main(int argc, char **argv) if (0 != params.test_spawn) { PMIX_WAIT_FOR_COMPLETION(spawn_wait); } + for(i=0; i < cli_info_cnt; i++){ + if (cli_info[i].exit_code != 0) { + ++test_fail; + } + } /* deregister the errhandler */ - PMIx_Deregister_event_handler(0, op_callbk, NULL); - - cli_wait_all(1.0); +// PMIx_Deregister_event_handler(0, op_callbk, NULL); + done: + TEST_VERBOSE(("srv #%d: call server_finalize!", my_server_id)); test_fail += server_finalize(¶ms); FREE_TEST_PARAMS(params); pmix_argv_free(client_argv); pmix_argv_free(client_env); + if (0 == test_fail) { + TEST_OUTPUT(("Test SUCCEEDED!")); + } return test_fail; } diff --git a/opal/mca/pmix/pmix3x/pmix/test/run_tests00.pl.in b/opal/mca/pmix/pmix3x/pmix/test/run_tests00.pl.in index fb139c9ce7d..d472b21b1fc 100755 --- a/opal/mca/pmix/pmix3x/pmix/test/run_tests00.pl.in +++ b/opal/mca/pmix/pmix3x/pmix/test/run_tests00.pl.in @@ -23,14 +23,15 @@ my @tests = ("-n 4 --ns-dist 3:1 --fence \"[db | 0:0-2;1:0]\"", "-n 5 --test-resolve-peers --ns-dist \"1:2:2\"", "-n 5 --test-replace 100:0,1,10,50,99", "-n 5 --test-internal 10", - "-s 2 -n 2 --job-fence", - "-s 2 -n 2 --job-fence -c"); + "-s 1 -n 2 --job-fence", + "-s 1 -n 2 --job-fence -c"); my $test; my $cmd; my $output; my $status = 0; my $testnum; +my $timeout_cmd = ""; # We are running tests against the build tree (vs. the installation # tree). Autogen gives us a full list of all possible component @@ -63,7 +64,24 @@ $testnum =~ s/.pl//; $testnum = substr($testnum, -2); $test = @tests[$testnum]; -$cmd = "./pmix_test " . $test . " 2>&1"; +# find the timeout or gtimeout cmd so we can timeout the +# test if it hangs +my @paths = split(/:/, $ENV{PATH}); +foreach my $p (@paths) { + my $fullpath = $p . "/" . "gtimeout"; + if ((-e $fullpath) && (-f $fullpath)) { + $timeout_cmd = $fullpath . " --preserve-status -k 35 30 "; + last; + } else { + my $fullpath = $p . "/" . "timeout"; + if ((-e $fullpath) && (-f $fullpath)) { + $timeout_cmd = $fullpath . " --preserve-status -k 35 30 "; + last; + } + } +} + +$cmd = $timeout_cmd . " ./pmix_test " . $test . " 2>&1"; print $cmd . "\n"; $output = `$cmd`; print $output . "\n"; diff --git a/opal/mca/pmix/pmix3x/pmix/test/run_tests01.pl.in b/opal/mca/pmix/pmix3x/pmix/test/run_tests01.pl.in index fb139c9ce7d..d472b21b1fc 100755 --- a/opal/mca/pmix/pmix3x/pmix/test/run_tests01.pl.in +++ b/opal/mca/pmix/pmix3x/pmix/test/run_tests01.pl.in @@ -23,14 +23,15 @@ my @tests = ("-n 4 --ns-dist 3:1 --fence \"[db | 0:0-2;1:0]\"", "-n 5 --test-resolve-peers --ns-dist \"1:2:2\"", "-n 5 --test-replace 100:0,1,10,50,99", "-n 5 --test-internal 10", - "-s 2 -n 2 --job-fence", - "-s 2 -n 2 --job-fence -c"); + "-s 1 -n 2 --job-fence", + "-s 1 -n 2 --job-fence -c"); my $test; my $cmd; my $output; my $status = 0; my $testnum; +my $timeout_cmd = ""; # We are running tests against the build tree (vs. the installation # tree). Autogen gives us a full list of all possible component @@ -63,7 +64,24 @@ $testnum =~ s/.pl//; $testnum = substr($testnum, -2); $test = @tests[$testnum]; -$cmd = "./pmix_test " . $test . " 2>&1"; +# find the timeout or gtimeout cmd so we can timeout the +# test if it hangs +my @paths = split(/:/, $ENV{PATH}); +foreach my $p (@paths) { + my $fullpath = $p . "/" . "gtimeout"; + if ((-e $fullpath) && (-f $fullpath)) { + $timeout_cmd = $fullpath . " --preserve-status -k 35 30 "; + last; + } else { + my $fullpath = $p . "/" . "timeout"; + if ((-e $fullpath) && (-f $fullpath)) { + $timeout_cmd = $fullpath . " --preserve-status -k 35 30 "; + last; + } + } +} + +$cmd = $timeout_cmd . " ./pmix_test " . $test . " 2>&1"; print $cmd . "\n"; $output = `$cmd`; print $output . "\n"; diff --git a/opal/mca/pmix/pmix3x/pmix/test/run_tests02.pl.in b/opal/mca/pmix/pmix3x/pmix/test/run_tests02.pl.in index fb139c9ce7d..d472b21b1fc 100755 --- a/opal/mca/pmix/pmix3x/pmix/test/run_tests02.pl.in +++ b/opal/mca/pmix/pmix3x/pmix/test/run_tests02.pl.in @@ -23,14 +23,15 @@ my @tests = ("-n 4 --ns-dist 3:1 --fence \"[db | 0:0-2;1:0]\"", "-n 5 --test-resolve-peers --ns-dist \"1:2:2\"", "-n 5 --test-replace 100:0,1,10,50,99", "-n 5 --test-internal 10", - "-s 2 -n 2 --job-fence", - "-s 2 -n 2 --job-fence -c"); + "-s 1 -n 2 --job-fence", + "-s 1 -n 2 --job-fence -c"); my $test; my $cmd; my $output; my $status = 0; my $testnum; +my $timeout_cmd = ""; # We are running tests against the build tree (vs. the installation # tree). Autogen gives us a full list of all possible component @@ -63,7 +64,24 @@ $testnum =~ s/.pl//; $testnum = substr($testnum, -2); $test = @tests[$testnum]; -$cmd = "./pmix_test " . $test . " 2>&1"; +# find the timeout or gtimeout cmd so we can timeout the +# test if it hangs +my @paths = split(/:/, $ENV{PATH}); +foreach my $p (@paths) { + my $fullpath = $p . "/" . "gtimeout"; + if ((-e $fullpath) && (-f $fullpath)) { + $timeout_cmd = $fullpath . " --preserve-status -k 35 30 "; + last; + } else { + my $fullpath = $p . "/" . "timeout"; + if ((-e $fullpath) && (-f $fullpath)) { + $timeout_cmd = $fullpath . " --preserve-status -k 35 30 "; + last; + } + } +} + +$cmd = $timeout_cmd . " ./pmix_test " . $test . " 2>&1"; print $cmd . "\n"; $output = `$cmd`; print $output . "\n"; diff --git a/opal/mca/pmix/pmix3x/pmix/test/run_tests03.pl.in b/opal/mca/pmix/pmix3x/pmix/test/run_tests03.pl.in index fb139c9ce7d..d472b21b1fc 100755 --- a/opal/mca/pmix/pmix3x/pmix/test/run_tests03.pl.in +++ b/opal/mca/pmix/pmix3x/pmix/test/run_tests03.pl.in @@ -23,14 +23,15 @@ my @tests = ("-n 4 --ns-dist 3:1 --fence \"[db | 0:0-2;1:0]\"", "-n 5 --test-resolve-peers --ns-dist \"1:2:2\"", "-n 5 --test-replace 100:0,1,10,50,99", "-n 5 --test-internal 10", - "-s 2 -n 2 --job-fence", - "-s 2 -n 2 --job-fence -c"); + "-s 1 -n 2 --job-fence", + "-s 1 -n 2 --job-fence -c"); my $test; my $cmd; my $output; my $status = 0; my $testnum; +my $timeout_cmd = ""; # We are running tests against the build tree (vs. the installation # tree). Autogen gives us a full list of all possible component @@ -63,7 +64,24 @@ $testnum =~ s/.pl//; $testnum = substr($testnum, -2); $test = @tests[$testnum]; -$cmd = "./pmix_test " . $test . " 2>&1"; +# find the timeout or gtimeout cmd so we can timeout the +# test if it hangs +my @paths = split(/:/, $ENV{PATH}); +foreach my $p (@paths) { + my $fullpath = $p . "/" . "gtimeout"; + if ((-e $fullpath) && (-f $fullpath)) { + $timeout_cmd = $fullpath . " --preserve-status -k 35 30 "; + last; + } else { + my $fullpath = $p . "/" . "timeout"; + if ((-e $fullpath) && (-f $fullpath)) { + $timeout_cmd = $fullpath . " --preserve-status -k 35 30 "; + last; + } + } +} + +$cmd = $timeout_cmd . " ./pmix_test " . $test . " 2>&1"; print $cmd . "\n"; $output = `$cmd`; print $output . "\n"; diff --git a/opal/mca/pmix/pmix3x/pmix/test/run_tests04.pl.in b/opal/mca/pmix/pmix3x/pmix/test/run_tests04.pl.in index fb139c9ce7d..d472b21b1fc 100755 --- a/opal/mca/pmix/pmix3x/pmix/test/run_tests04.pl.in +++ b/opal/mca/pmix/pmix3x/pmix/test/run_tests04.pl.in @@ -23,14 +23,15 @@ my @tests = ("-n 4 --ns-dist 3:1 --fence \"[db | 0:0-2;1:0]\"", "-n 5 --test-resolve-peers --ns-dist \"1:2:2\"", "-n 5 --test-replace 100:0,1,10,50,99", "-n 5 --test-internal 10", - "-s 2 -n 2 --job-fence", - "-s 2 -n 2 --job-fence -c"); + "-s 1 -n 2 --job-fence", + "-s 1 -n 2 --job-fence -c"); my $test; my $cmd; my $output; my $status = 0; my $testnum; +my $timeout_cmd = ""; # We are running tests against the build tree (vs. the installation # tree). Autogen gives us a full list of all possible component @@ -63,7 +64,24 @@ $testnum =~ s/.pl//; $testnum = substr($testnum, -2); $test = @tests[$testnum]; -$cmd = "./pmix_test " . $test . " 2>&1"; +# find the timeout or gtimeout cmd so we can timeout the +# test if it hangs +my @paths = split(/:/, $ENV{PATH}); +foreach my $p (@paths) { + my $fullpath = $p . "/" . "gtimeout"; + if ((-e $fullpath) && (-f $fullpath)) { + $timeout_cmd = $fullpath . " --preserve-status -k 35 30 "; + last; + } else { + my $fullpath = $p . "/" . "timeout"; + if ((-e $fullpath) && (-f $fullpath)) { + $timeout_cmd = $fullpath . " --preserve-status -k 35 30 "; + last; + } + } +} + +$cmd = $timeout_cmd . " ./pmix_test " . $test . " 2>&1"; print $cmd . "\n"; $output = `$cmd`; print $output . "\n"; diff --git a/opal/mca/pmix/pmix3x/pmix/test/run_tests05.pl.in b/opal/mca/pmix/pmix3x/pmix/test/run_tests05.pl.in index fb139c9ce7d..d472b21b1fc 100755 --- a/opal/mca/pmix/pmix3x/pmix/test/run_tests05.pl.in +++ b/opal/mca/pmix/pmix3x/pmix/test/run_tests05.pl.in @@ -23,14 +23,15 @@ my @tests = ("-n 4 --ns-dist 3:1 --fence \"[db | 0:0-2;1:0]\"", "-n 5 --test-resolve-peers --ns-dist \"1:2:2\"", "-n 5 --test-replace 100:0,1,10,50,99", "-n 5 --test-internal 10", - "-s 2 -n 2 --job-fence", - "-s 2 -n 2 --job-fence -c"); + "-s 1 -n 2 --job-fence", + "-s 1 -n 2 --job-fence -c"); my $test; my $cmd; my $output; my $status = 0; my $testnum; +my $timeout_cmd = ""; # We are running tests against the build tree (vs. the installation # tree). Autogen gives us a full list of all possible component @@ -63,7 +64,24 @@ $testnum =~ s/.pl//; $testnum = substr($testnum, -2); $test = @tests[$testnum]; -$cmd = "./pmix_test " . $test . " 2>&1"; +# find the timeout or gtimeout cmd so we can timeout the +# test if it hangs +my @paths = split(/:/, $ENV{PATH}); +foreach my $p (@paths) { + my $fullpath = $p . "/" . "gtimeout"; + if ((-e $fullpath) && (-f $fullpath)) { + $timeout_cmd = $fullpath . " --preserve-status -k 35 30 "; + last; + } else { + my $fullpath = $p . "/" . "timeout"; + if ((-e $fullpath) && (-f $fullpath)) { + $timeout_cmd = $fullpath . " --preserve-status -k 35 30 "; + last; + } + } +} + +$cmd = $timeout_cmd . " ./pmix_test " . $test . " 2>&1"; print $cmd . "\n"; $output = `$cmd`; print $output . "\n"; diff --git a/opal/mca/pmix/pmix3x/pmix/test/run_tests06.pl.in b/opal/mca/pmix/pmix3x/pmix/test/run_tests06.pl.in index fb139c9ce7d..d472b21b1fc 100755 --- a/opal/mca/pmix/pmix3x/pmix/test/run_tests06.pl.in +++ b/opal/mca/pmix/pmix3x/pmix/test/run_tests06.pl.in @@ -23,14 +23,15 @@ my @tests = ("-n 4 --ns-dist 3:1 --fence \"[db | 0:0-2;1:0]\"", "-n 5 --test-resolve-peers --ns-dist \"1:2:2\"", "-n 5 --test-replace 100:0,1,10,50,99", "-n 5 --test-internal 10", - "-s 2 -n 2 --job-fence", - "-s 2 -n 2 --job-fence -c"); + "-s 1 -n 2 --job-fence", + "-s 1 -n 2 --job-fence -c"); my $test; my $cmd; my $output; my $status = 0; my $testnum; +my $timeout_cmd = ""; # We are running tests against the build tree (vs. the installation # tree). Autogen gives us a full list of all possible component @@ -63,7 +64,24 @@ $testnum =~ s/.pl//; $testnum = substr($testnum, -2); $test = @tests[$testnum]; -$cmd = "./pmix_test " . $test . " 2>&1"; +# find the timeout or gtimeout cmd so we can timeout the +# test if it hangs +my @paths = split(/:/, $ENV{PATH}); +foreach my $p (@paths) { + my $fullpath = $p . "/" . "gtimeout"; + if ((-e $fullpath) && (-f $fullpath)) { + $timeout_cmd = $fullpath . " --preserve-status -k 35 30 "; + last; + } else { + my $fullpath = $p . "/" . "timeout"; + if ((-e $fullpath) && (-f $fullpath)) { + $timeout_cmd = $fullpath . " --preserve-status -k 35 30 "; + last; + } + } +} + +$cmd = $timeout_cmd . " ./pmix_test " . $test . " 2>&1"; print $cmd . "\n"; $output = `$cmd`; print $output . "\n"; diff --git a/opal/mca/pmix/pmix3x/pmix/test/run_tests07.pl.in b/opal/mca/pmix/pmix3x/pmix/test/run_tests07.pl.in index fb139c9ce7d..d472b21b1fc 100755 --- a/opal/mca/pmix/pmix3x/pmix/test/run_tests07.pl.in +++ b/opal/mca/pmix/pmix3x/pmix/test/run_tests07.pl.in @@ -23,14 +23,15 @@ my @tests = ("-n 4 --ns-dist 3:1 --fence \"[db | 0:0-2;1:0]\"", "-n 5 --test-resolve-peers --ns-dist \"1:2:2\"", "-n 5 --test-replace 100:0,1,10,50,99", "-n 5 --test-internal 10", - "-s 2 -n 2 --job-fence", - "-s 2 -n 2 --job-fence -c"); + "-s 1 -n 2 --job-fence", + "-s 1 -n 2 --job-fence -c"); my $test; my $cmd; my $output; my $status = 0; my $testnum; +my $timeout_cmd = ""; # We are running tests against the build tree (vs. the installation # tree). Autogen gives us a full list of all possible component @@ -63,7 +64,24 @@ $testnum =~ s/.pl//; $testnum = substr($testnum, -2); $test = @tests[$testnum]; -$cmd = "./pmix_test " . $test . " 2>&1"; +# find the timeout or gtimeout cmd so we can timeout the +# test if it hangs +my @paths = split(/:/, $ENV{PATH}); +foreach my $p (@paths) { + my $fullpath = $p . "/" . "gtimeout"; + if ((-e $fullpath) && (-f $fullpath)) { + $timeout_cmd = $fullpath . " --preserve-status -k 35 30 "; + last; + } else { + my $fullpath = $p . "/" . "timeout"; + if ((-e $fullpath) && (-f $fullpath)) { + $timeout_cmd = $fullpath . " --preserve-status -k 35 30 "; + last; + } + } +} + +$cmd = $timeout_cmd . " ./pmix_test " . $test . " 2>&1"; print $cmd . "\n"; $output = `$cmd`; print $output . "\n"; diff --git a/opal/mca/pmix/pmix3x/pmix/test/run_tests08.pl.in b/opal/mca/pmix/pmix3x/pmix/test/run_tests08.pl.in index fb139c9ce7d..d472b21b1fc 100755 --- a/opal/mca/pmix/pmix3x/pmix/test/run_tests08.pl.in +++ b/opal/mca/pmix/pmix3x/pmix/test/run_tests08.pl.in @@ -23,14 +23,15 @@ my @tests = ("-n 4 --ns-dist 3:1 --fence \"[db | 0:0-2;1:0]\"", "-n 5 --test-resolve-peers --ns-dist \"1:2:2\"", "-n 5 --test-replace 100:0,1,10,50,99", "-n 5 --test-internal 10", - "-s 2 -n 2 --job-fence", - "-s 2 -n 2 --job-fence -c"); + "-s 1 -n 2 --job-fence", + "-s 1 -n 2 --job-fence -c"); my $test; my $cmd; my $output; my $status = 0; my $testnum; +my $timeout_cmd = ""; # We are running tests against the build tree (vs. the installation # tree). Autogen gives us a full list of all possible component @@ -63,7 +64,24 @@ $testnum =~ s/.pl//; $testnum = substr($testnum, -2); $test = @tests[$testnum]; -$cmd = "./pmix_test " . $test . " 2>&1"; +# find the timeout or gtimeout cmd so we can timeout the +# test if it hangs +my @paths = split(/:/, $ENV{PATH}); +foreach my $p (@paths) { + my $fullpath = $p . "/" . "gtimeout"; + if ((-e $fullpath) && (-f $fullpath)) { + $timeout_cmd = $fullpath . " --preserve-status -k 35 30 "; + last; + } else { + my $fullpath = $p . "/" . "timeout"; + if ((-e $fullpath) && (-f $fullpath)) { + $timeout_cmd = $fullpath . " --preserve-status -k 35 30 "; + last; + } + } +} + +$cmd = $timeout_cmd . " ./pmix_test " . $test . " 2>&1"; print $cmd . "\n"; $output = `$cmd`; print $output . "\n"; diff --git a/opal/mca/pmix/pmix3x/pmix/test/run_tests09.pl.in b/opal/mca/pmix/pmix3x/pmix/test/run_tests09.pl.in index fb139c9ce7d..d472b21b1fc 100755 --- a/opal/mca/pmix/pmix3x/pmix/test/run_tests09.pl.in +++ b/opal/mca/pmix/pmix3x/pmix/test/run_tests09.pl.in @@ -23,14 +23,15 @@ my @tests = ("-n 4 --ns-dist 3:1 --fence \"[db | 0:0-2;1:0]\"", "-n 5 --test-resolve-peers --ns-dist \"1:2:2\"", "-n 5 --test-replace 100:0,1,10,50,99", "-n 5 --test-internal 10", - "-s 2 -n 2 --job-fence", - "-s 2 -n 2 --job-fence -c"); + "-s 1 -n 2 --job-fence", + "-s 1 -n 2 --job-fence -c"); my $test; my $cmd; my $output; my $status = 0; my $testnum; +my $timeout_cmd = ""; # We are running tests against the build tree (vs. the installation # tree). Autogen gives us a full list of all possible component @@ -63,7 +64,24 @@ $testnum =~ s/.pl//; $testnum = substr($testnum, -2); $test = @tests[$testnum]; -$cmd = "./pmix_test " . $test . " 2>&1"; +# find the timeout or gtimeout cmd so we can timeout the +# test if it hangs +my @paths = split(/:/, $ENV{PATH}); +foreach my $p (@paths) { + my $fullpath = $p . "/" . "gtimeout"; + if ((-e $fullpath) && (-f $fullpath)) { + $timeout_cmd = $fullpath . " --preserve-status -k 35 30 "; + last; + } else { + my $fullpath = $p . "/" . "timeout"; + if ((-e $fullpath) && (-f $fullpath)) { + $timeout_cmd = $fullpath . " --preserve-status -k 35 30 "; + last; + } + } +} + +$cmd = $timeout_cmd . " ./pmix_test " . $test . " 2>&1"; print $cmd . "\n"; $output = `$cmd`; print $output . "\n"; diff --git a/opal/mca/pmix/pmix3x/pmix/test/run_tests10.pl.in b/opal/mca/pmix/pmix3x/pmix/test/run_tests10.pl.in index fb139c9ce7d..d472b21b1fc 100755 --- a/opal/mca/pmix/pmix3x/pmix/test/run_tests10.pl.in +++ b/opal/mca/pmix/pmix3x/pmix/test/run_tests10.pl.in @@ -23,14 +23,15 @@ my @tests = ("-n 4 --ns-dist 3:1 --fence \"[db | 0:0-2;1:0]\"", "-n 5 --test-resolve-peers --ns-dist \"1:2:2\"", "-n 5 --test-replace 100:0,1,10,50,99", "-n 5 --test-internal 10", - "-s 2 -n 2 --job-fence", - "-s 2 -n 2 --job-fence -c"); + "-s 1 -n 2 --job-fence", + "-s 1 -n 2 --job-fence -c"); my $test; my $cmd; my $output; my $status = 0; my $testnum; +my $timeout_cmd = ""; # We are running tests against the build tree (vs. the installation # tree). Autogen gives us a full list of all possible component @@ -63,7 +64,24 @@ $testnum =~ s/.pl//; $testnum = substr($testnum, -2); $test = @tests[$testnum]; -$cmd = "./pmix_test " . $test . " 2>&1"; +# find the timeout or gtimeout cmd so we can timeout the +# test if it hangs +my @paths = split(/:/, $ENV{PATH}); +foreach my $p (@paths) { + my $fullpath = $p . "/" . "gtimeout"; + if ((-e $fullpath) && (-f $fullpath)) { + $timeout_cmd = $fullpath . " --preserve-status -k 35 30 "; + last; + } else { + my $fullpath = $p . "/" . "timeout"; + if ((-e $fullpath) && (-f $fullpath)) { + $timeout_cmd = $fullpath . " --preserve-status -k 35 30 "; + last; + } + } +} + +$cmd = $timeout_cmd . " ./pmix_test " . $test . " 2>&1"; print $cmd . "\n"; $output = `$cmd`; print $output . "\n"; diff --git a/opal/mca/pmix/pmix3x/pmix/test/run_tests11.pl.in b/opal/mca/pmix/pmix3x/pmix/test/run_tests11.pl.in index fb139c9ce7d..d472b21b1fc 100755 --- a/opal/mca/pmix/pmix3x/pmix/test/run_tests11.pl.in +++ b/opal/mca/pmix/pmix3x/pmix/test/run_tests11.pl.in @@ -23,14 +23,15 @@ my @tests = ("-n 4 --ns-dist 3:1 --fence \"[db | 0:0-2;1:0]\"", "-n 5 --test-resolve-peers --ns-dist \"1:2:2\"", "-n 5 --test-replace 100:0,1,10,50,99", "-n 5 --test-internal 10", - "-s 2 -n 2 --job-fence", - "-s 2 -n 2 --job-fence -c"); + "-s 1 -n 2 --job-fence", + "-s 1 -n 2 --job-fence -c"); my $test; my $cmd; my $output; my $status = 0; my $testnum; +my $timeout_cmd = ""; # We are running tests against the build tree (vs. the installation # tree). Autogen gives us a full list of all possible component @@ -63,7 +64,24 @@ $testnum =~ s/.pl//; $testnum = substr($testnum, -2); $test = @tests[$testnum]; -$cmd = "./pmix_test " . $test . " 2>&1"; +# find the timeout or gtimeout cmd so we can timeout the +# test if it hangs +my @paths = split(/:/, $ENV{PATH}); +foreach my $p (@paths) { + my $fullpath = $p . "/" . "gtimeout"; + if ((-e $fullpath) && (-f $fullpath)) { + $timeout_cmd = $fullpath . " --preserve-status -k 35 30 "; + last; + } else { + my $fullpath = $p . "/" . "timeout"; + if ((-e $fullpath) && (-f $fullpath)) { + $timeout_cmd = $fullpath . " --preserve-status -k 35 30 "; + last; + } + } +} + +$cmd = $timeout_cmd . " ./pmix_test " . $test . " 2>&1"; print $cmd . "\n"; $output = `$cmd`; print $output . "\n"; diff --git a/opal/mca/pmix/pmix3x/pmix/test/run_tests12.pl.in b/opal/mca/pmix/pmix3x/pmix/test/run_tests12.pl.in index fb139c9ce7d..d472b21b1fc 100755 --- a/opal/mca/pmix/pmix3x/pmix/test/run_tests12.pl.in +++ b/opal/mca/pmix/pmix3x/pmix/test/run_tests12.pl.in @@ -23,14 +23,15 @@ my @tests = ("-n 4 --ns-dist 3:1 --fence \"[db | 0:0-2;1:0]\"", "-n 5 --test-resolve-peers --ns-dist \"1:2:2\"", "-n 5 --test-replace 100:0,1,10,50,99", "-n 5 --test-internal 10", - "-s 2 -n 2 --job-fence", - "-s 2 -n 2 --job-fence -c"); + "-s 1 -n 2 --job-fence", + "-s 1 -n 2 --job-fence -c"); my $test; my $cmd; my $output; my $status = 0; my $testnum; +my $timeout_cmd = ""; # We are running tests against the build tree (vs. the installation # tree). Autogen gives us a full list of all possible component @@ -63,7 +64,24 @@ $testnum =~ s/.pl//; $testnum = substr($testnum, -2); $test = @tests[$testnum]; -$cmd = "./pmix_test " . $test . " 2>&1"; +# find the timeout or gtimeout cmd so we can timeout the +# test if it hangs +my @paths = split(/:/, $ENV{PATH}); +foreach my $p (@paths) { + my $fullpath = $p . "/" . "gtimeout"; + if ((-e $fullpath) && (-f $fullpath)) { + $timeout_cmd = $fullpath . " --preserve-status -k 35 30 "; + last; + } else { + my $fullpath = $p . "/" . "timeout"; + if ((-e $fullpath) && (-f $fullpath)) { + $timeout_cmd = $fullpath . " --preserve-status -k 35 30 "; + last; + } + } +} + +$cmd = $timeout_cmd . " ./pmix_test " . $test . " 2>&1"; print $cmd . "\n"; $output = `$cmd`; print $output . "\n"; diff --git a/opal/mca/pmix/pmix3x/pmix/test/run_tests13.pl.in b/opal/mca/pmix/pmix3x/pmix/test/run_tests13.pl.in index fb139c9ce7d..d472b21b1fc 100755 --- a/opal/mca/pmix/pmix3x/pmix/test/run_tests13.pl.in +++ b/opal/mca/pmix/pmix3x/pmix/test/run_tests13.pl.in @@ -23,14 +23,15 @@ my @tests = ("-n 4 --ns-dist 3:1 --fence \"[db | 0:0-2;1:0]\"", "-n 5 --test-resolve-peers --ns-dist \"1:2:2\"", "-n 5 --test-replace 100:0,1,10,50,99", "-n 5 --test-internal 10", - "-s 2 -n 2 --job-fence", - "-s 2 -n 2 --job-fence -c"); + "-s 1 -n 2 --job-fence", + "-s 1 -n 2 --job-fence -c"); my $test; my $cmd; my $output; my $status = 0; my $testnum; +my $timeout_cmd = ""; # We are running tests against the build tree (vs. the installation # tree). Autogen gives us a full list of all possible component @@ -63,7 +64,24 @@ $testnum =~ s/.pl//; $testnum = substr($testnum, -2); $test = @tests[$testnum]; -$cmd = "./pmix_test " . $test . " 2>&1"; +# find the timeout or gtimeout cmd so we can timeout the +# test if it hangs +my @paths = split(/:/, $ENV{PATH}); +foreach my $p (@paths) { + my $fullpath = $p . "/" . "gtimeout"; + if ((-e $fullpath) && (-f $fullpath)) { + $timeout_cmd = $fullpath . " --preserve-status -k 35 30 "; + last; + } else { + my $fullpath = $p . "/" . "timeout"; + if ((-e $fullpath) && (-f $fullpath)) { + $timeout_cmd = $fullpath . " --preserve-status -k 35 30 "; + last; + } + } +} + +$cmd = $timeout_cmd . " ./pmix_test " . $test . " 2>&1"; print $cmd . "\n"; $output = `$cmd`; print $output . "\n"; diff --git a/opal/mca/pmix/pmix3x/pmix/test/run_tests14.pl.in b/opal/mca/pmix/pmix3x/pmix/test/run_tests14.pl.in index fb139c9ce7d..d472b21b1fc 100755 --- a/opal/mca/pmix/pmix3x/pmix/test/run_tests14.pl.in +++ b/opal/mca/pmix/pmix3x/pmix/test/run_tests14.pl.in @@ -23,14 +23,15 @@ my @tests = ("-n 4 --ns-dist 3:1 --fence \"[db | 0:0-2;1:0]\"", "-n 5 --test-resolve-peers --ns-dist \"1:2:2\"", "-n 5 --test-replace 100:0,1,10,50,99", "-n 5 --test-internal 10", - "-s 2 -n 2 --job-fence", - "-s 2 -n 2 --job-fence -c"); + "-s 1 -n 2 --job-fence", + "-s 1 -n 2 --job-fence -c"); my $test; my $cmd; my $output; my $status = 0; my $testnum; +my $timeout_cmd = ""; # We are running tests against the build tree (vs. the installation # tree). Autogen gives us a full list of all possible component @@ -63,7 +64,24 @@ $testnum =~ s/.pl//; $testnum = substr($testnum, -2); $test = @tests[$testnum]; -$cmd = "./pmix_test " . $test . " 2>&1"; +# find the timeout or gtimeout cmd so we can timeout the +# test if it hangs +my @paths = split(/:/, $ENV{PATH}); +foreach my $p (@paths) { + my $fullpath = $p . "/" . "gtimeout"; + if ((-e $fullpath) && (-f $fullpath)) { + $timeout_cmd = $fullpath . " --preserve-status -k 35 30 "; + last; + } else { + my $fullpath = $p . "/" . "timeout"; + if ((-e $fullpath) && (-f $fullpath)) { + $timeout_cmd = $fullpath . " --preserve-status -k 35 30 "; + last; + } + } +} + +$cmd = $timeout_cmd . " ./pmix_test " . $test . " 2>&1"; print $cmd . "\n"; $output = `$cmd`; print $output . "\n"; diff --git a/opal/mca/pmix/pmix3x/pmix/test/run_tests15.pl.in b/opal/mca/pmix/pmix3x/pmix/test/run_tests15.pl.in index fb139c9ce7d..d472b21b1fc 100755 --- a/opal/mca/pmix/pmix3x/pmix/test/run_tests15.pl.in +++ b/opal/mca/pmix/pmix3x/pmix/test/run_tests15.pl.in @@ -23,14 +23,15 @@ my @tests = ("-n 4 --ns-dist 3:1 --fence \"[db | 0:0-2;1:0]\"", "-n 5 --test-resolve-peers --ns-dist \"1:2:2\"", "-n 5 --test-replace 100:0,1,10,50,99", "-n 5 --test-internal 10", - "-s 2 -n 2 --job-fence", - "-s 2 -n 2 --job-fence -c"); + "-s 1 -n 2 --job-fence", + "-s 1 -n 2 --job-fence -c"); my $test; my $cmd; my $output; my $status = 0; my $testnum; +my $timeout_cmd = ""; # We are running tests against the build tree (vs. the installation # tree). Autogen gives us a full list of all possible component @@ -63,7 +64,24 @@ $testnum =~ s/.pl//; $testnum = substr($testnum, -2); $test = @tests[$testnum]; -$cmd = "./pmix_test " . $test . " 2>&1"; +# find the timeout or gtimeout cmd so we can timeout the +# test if it hangs +my @paths = split(/:/, $ENV{PATH}); +foreach my $p (@paths) { + my $fullpath = $p . "/" . "gtimeout"; + if ((-e $fullpath) && (-f $fullpath)) { + $timeout_cmd = $fullpath . " --preserve-status -k 35 30 "; + last; + } else { + my $fullpath = $p . "/" . "timeout"; + if ((-e $fullpath) && (-f $fullpath)) { + $timeout_cmd = $fullpath . " --preserve-status -k 35 30 "; + last; + } + } +} + +$cmd = $timeout_cmd . " ./pmix_test " . $test . " 2>&1"; print $cmd . "\n"; $output = `$cmd`; print $output . "\n"; diff --git a/opal/mca/pmix/pmix3x/pmix/test/server_callbacks.c b/opal/mca/pmix/pmix3x/pmix/test/server_callbacks.c index ae16129ecf2..ee16bb240f0 100644 --- a/opal/mca/pmix/pmix3x/pmix/test/server_callbacks.c +++ b/opal/mca/pmix/pmix3x/pmix/test/server_callbacks.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018 Intel, Inc. All rights reserved. + * Copyright (c) 2015-2019 Intel, Inc. All rights reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2015-2018 Mellanox Technologies, Inc. @@ -32,9 +32,12 @@ pmix_server_module_t mymodule = { .unpublish = unpublish_fn, .spawn = spawn_fn, .connect = connect_fn, - .disconnect = disconnect_fn, + .disconnect = disconnect_fn +#if 0 +, .register_events = regevents_fn, .deregister_events = deregevents_fn +#endif }; typedef struct { diff --git a/opal/mca/pmix/pmix3x/pmix/test/simple/Makefile.am b/opal/mca/pmix/pmix3x/pmix/test/simple/Makefile.am index 5ab9f568bb0..ee029668031 100644 --- a/opal/mca/pmix/pmix3x/pmix/test/simple/Makefile.am +++ b/opal/mca/pmix/pmix3x/pmix/test/simple/Makefile.am @@ -25,7 +25,8 @@ headers = simptest.h noinst_PROGRAMS = simptest simpclient simppub simpdyn simpft simpdmodex \ test_pmix simptool simpdie simplegacy simptimeout \ - gwtest gwclient stability quietclient simpjctrl + gwtest gwclient stability quietclient simpjctrl \ + pmitest simptest_SOURCES = $(headers) \ simptest.c @@ -122,3 +123,9 @@ simpjctrl_SOURCES = \ simpjctrl_LDFLAGS = $(PMIX_PKG_CONFIG_LDFLAGS) simpjctrl_LDADD = \ $(top_builddir)/src/libpmix.la + +pmitest_SOURCES = \ + pmitest.c +pmitest_LDFLAGS = $(PMIX_PKG_CONFIG_LDFLAGS) +pmitest_LDADD = \ + $(top_builddir)/src/libpmi.la diff --git a/opal/mca/pmix/pmix3x/pmix/test/simple/pmitest.c b/opal/mca/pmix/pmix3x/pmix/test/simple/pmitest.c new file mode 100644 index 00000000000..2546d550e6c --- /dev/null +++ b/opal/mca/pmix/pmix3x/pmix/test/simple/pmitest.c @@ -0,0 +1,305 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */ +/* + * + * (C) 2001 by Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + */ +#include +#include +#include +#include "pmi.h" + +static const char * PMI_Err_str(int error) +{ + static char str[100]; + switch (error) + { + case PMI_SUCCESS: + return "PMI_SUCCESS"; + case PMI_FAIL: + return "PMI_FAIL"; + case PMI_ERR_INVALID_ARG: + return "PMI_ERR_INVALID_ARG"; + case PMI_ERR_INVALID_KEY: + return "PMI_ERR_INVALID_KEY"; + case PMI_ERR_INVALID_KEY_LENGTH: + return "PMI_ERR_INVALID_KEY_LENGTH"; + case PMI_ERR_INVALID_VAL: + return "PMI_ERR_INVALID_VAL"; + case PMI_ERR_INVALID_VAL_LENGTH: + return "PMI_ERR_INVALID_VAL_LENGTH"; + case PMI_ERR_INVALID_LENGTH: + return "PMI_ERR_INVALID_LENGTH"; + case PMI_ERR_INIT: + return "PMI_ERR_INIT"; + case PMI_ERR_NOMEM: + return "PMI_ERR_NOMEM"; + } + sprintf(str, "PMI_ERR_UNKNOWN: %d", error); + return str; +} + +#define PRINT_ERROR(error, fcname) if (error != PMI_SUCCESS) printf("%s failed: %s\n", fcname, PMI_Err_str(error)); else printf("%s unexpectedly succeeded\n", fcname); fflush(stdout); + +int main( int argc, char * argv[] ) +{ + int rc, spawned, size, rank, name_max, id_maxlen, key_maxlen, val_maxlen; + char *kvsname, *id, *domain_id, *key, *val; + + rc = PMI_Init( &spawned ); + if ( rc != PMI_SUCCESS ) + { + printf( "PMI_Init failed with rc = %s\n", PMI_Err_str(rc) ); + return -1 ; + } + else + { + printf( "PMI_Init returned spawned = %d\n", spawned ); + } + + rc = PMI_Get_size( &size ); + if ( rc == PMI_SUCCESS ) + { + rc = PMI_Get_rank( &rank ); + if ( rc == PMI_SUCCESS ) + printf( "size = %d, rank = %d\n", size, rank ); + else + printf( "PMI_Get_Rank failed with rc = %s\n", PMI_Err_str(rc) ); + } + else + printf( "PMI_Get_size failed with rc = %s\n", PMI_Err_str(rc) ); + + rc = PMI_KVS_Get_name_length_max( &name_max ); + if ( rc != PMI_SUCCESS ) + { + printf( "PMI_KVS_Get_name_length_max failed with rc = %s\n", PMI_Err_str(rc) ); + return -1; + } + else + printf( "PMI_KVS_Get_name_length_max got %d\n", name_max ); + + kvsname = (char *) malloc( name_max ); + rc = PMI_KVS_Get_my_name( kvsname, name_max ); + if ( rc != PMI_SUCCESS ) + { + printf( "PMI_KVS_Get_my_name failed with rc = %s\n", PMI_Err_str(rc) ); + return -1; + } + else + printf( "PMI_KVS_Get_my_name got %s\n", kvsname ); + + rc = PMI_Get_id_length_max( &id_maxlen ); + if ( rc != PMI_SUCCESS ) + { + printf("PMI_Get_id_length_max failed with rc = %s\n", PMI_Err_str(rc) ); + return -1; + } + else + printf("PMI_Get_id_length_max got %d\n", id_maxlen); + id = (char *) malloc( id_maxlen ); + rc = PMI_Get_id( id, id_maxlen ); + if ( rc != PMI_SUCCESS ) + { + printf("PMI_Get_id failed with rc = %s\n", PMI_Err_str(rc)); + } + else + printf( "PMI_Get_id got %s\n", id ); + domain_id = (char *) malloc( id_maxlen ); + rc = PMI_Get_kvs_domain_id( domain_id, id_maxlen ); + if ( rc != PMI_SUCCESS ) + { + printf("PMI_Get_kvs_domain_id failed with rc = %s\n", PMI_Err_str(rc)); + } + else + printf( "PMI_Get_kvs_domain_id got %s\n", domain_id ); + + rc = PMI_KVS_Get_key_length_max( &key_maxlen ); + if (rc != PMI_SUCCESS ) + { + printf("PMI_KVS_Get_key_length_max failed with rc = %s\n", PMI_Err_str(rc)); + return -1; + } + else + printf( "PMI_Get_key_maxlen got %d\n", key_maxlen ); + key = (char *) malloc( key_maxlen ); + rc = PMI_KVS_Get_value_length_max( &val_maxlen ); + if (rc != PMI_SUCCESS) + { + printf("PMI_KVS_Get_value_length_max failed with rc = %s\n", PMI_Err_str(rc)); + return -1; + } + else + printf( "PMI_Get_val_maxlen got %d\n", val_maxlen ); + val = (char *) malloc( val_maxlen ); + + sprintf(key, "test_key_%d", rank); + sprintf(val, "test_value_%d", rank); + + rc = PMI_KVS_Put( kvsname, key, val ); + if (rc != PMI_SUCCESS) + { + printf("PMI_KVS_Put failed with rc = %s\n", PMI_Err_str(rc)); + } + rc = PMI_KVS_Commit( kvsname ); + if (rc != PMI_SUCCESS) + { + printf("PMI_KVS_Commit failed with rc = %s\n", PMI_Err_str(rc)); + } + rc = PMI_Barrier(); + if (rc != PMI_SUCCESS) + { + printf("PMI_Barrier failed with rc = %s\n", PMI_Err_str(rc)); + } + + sprintf(key, "test_key_%d", (rank + 1) % size); + rc = PMI_KVS_Get( kvsname, key, val, val_maxlen ); + if (rc != PMI_SUCCESS) + { + printf("PMI_KVS_Get(%s) failed with rc = %s\n", key, PMI_Err_str(rc)); + } + else + printf("PMI_KVS_Get(%s) returned %s\n", key, val); + + /* Test awkward character string put and get */ + if (rank == 0) + { + sprintf(key, "foo"); + sprintf(val, "foo=bar baz=bif name=\"Buzz Bee\" clink=~!@#$\\;':<>,. clank=a b c"); + + rc = PMI_KVS_Put( kvsname, key, val ); + if (rc != PMI_SUCCESS) + { + printf("PMI_KVS_Put failed with rc = %s\n", PMI_Err_str(rc)); + } + rc = PMI_KVS_Commit( kvsname ); + if (rc != PMI_SUCCESS) + { + printf("PMI_KVS_Commit failed with rc = %s\n", PMI_Err_str(rc)); + } + } + + rc = PMI_Barrier(); + if (rc != PMI_SUCCESS) + { + printf("PMI_Barrier failed with rc = %s\n", PMI_Err_str(rc)); + } + + if (rank == size - 1) + { + sprintf(key, "foo"); + rc = PMI_KVS_Get( kvsname, key, val, val_maxlen ); + if (rc != PMI_SUCCESS) + { + printf("PMI_KVS_Get(%s) failed with rc = %s\n", key, PMI_Err_str(rc)); + } + else + printf("PMI_KVS_Get(%s) returned %s\n", key, val); + } + + if ( rank == (size - 1) ) + { + key[0] = '\0'; + val[0] = '\0'; + rc = PMI_KVS_Iter_first(kvsname, key, key_maxlen, val, val_maxlen); + if (rc == PMI_SUCCESS) + { + while (key[0] != '\0') + { + printf("PMI_KVS_Iter got key=%s val=%s\n",key,val); + rc = PMI_KVS_Iter_next(kvsname, key, key_maxlen, val, val_maxlen); + if (rc != PMI_SUCCESS) + { + printf("PMK_KVS_Iter_next failed with rc = %s\n", PMI_Err_str(rc)); + break; + } + } + } + else + { + printf("PMI_KVS_Iter_first failed with rc = %s\n", PMI_Err_str(rc)); + } +} + + /* error testing */ +if (rank != 0) +{ + printf("PMI error testing:\n"); + strcpy(key, "test_key"); + strcpy(val, "test_val"); + rc = PMI_KVS_Put("baloney", key, val); + PRINT_ERROR(rc, "PMI_KVS_Put(baloney, key, val)"); + rc = PMI_KVS_Put(NULL, key, val); + PRINT_ERROR(rc, "PMI_KVS_Put(NULL, key, val)"); + rc = PMI_KVS_Put(kvsname, NULL, val); + PRINT_ERROR(rc, "PMI_KVS_Put(kvsname, NULL, val)"); + rc = PMI_KVS_Put(kvsname, key, NULL); + PRINT_ERROR(rc, "PMI_KVS_Put(kvsname, key, NULL)"); + rc = PMI_KVS_Get("baloney", key, val, val_maxlen); + PRINT_ERROR(rc, "PMI_KVS_Get(baloney, key, val, val_maxlen)"); + rc = PMI_KVS_Get(NULL, key, val, val_maxlen); + PRINT_ERROR(rc, "PMI_KVS_Get(NULL, key, val, val_maxlen)"); + rc = PMI_KVS_Get(kvsname, NULL, val, val_maxlen); + PRINT_ERROR(rc, "PMI_KVS_Get(kvsname, NULL, val, val_maxlen)"); + rc = PMI_KVS_Get(kvsname, key, NULL, val_maxlen); + PRINT_ERROR(rc, "PMI_KVS_Get(kvsname, key, NULL, val_maxlen)"); + rc = PMI_KVS_Get(kvsname, key, val, -1); + PRINT_ERROR(rc, "PMI_KVS_Get(kvsname, key, val, -1)"); + rc = PMI_KVS_Commit(NULL); + PRINT_ERROR(rc, "PMI_KVS_Commit(NULL)"); + rc = PMI_KVS_Commit("baloney"); + PRINT_ERROR(rc, "PMI_KVS_Commit(baloney)"); + rc = PMI_KVS_Get_my_name(NULL, name_max); + PRINT_ERROR(rc, "PMI_KVS_Get_my_name(NULL, name_max)"); + rc = PMI_KVS_Get_my_name(kvsname, -1); + PRINT_ERROR(rc, "PMI_KVS_Get_my_name(kvsname, -1)"); + rc = PMI_Get_id(NULL, id_maxlen); + PRINT_ERROR(rc, "PMI_Get_id(NULL, id_maxlen)"); + rc = PMI_Get_id(id, -1); + PRINT_ERROR(rc, "PMI_Get_id(id, -1)"); + rc = PMI_Get_kvs_domain_id(NULL, id_maxlen); + PRINT_ERROR(rc, "PMI_Get_domain_id(NULL, id_maxlen)"); + rc = PMI_Get_kvs_domain_id(domain_id, -1); + PRINT_ERROR(rc, "PMI_Get_domain_id(domain_id, -1)"); + rc = PMI_Init(NULL); + PRINT_ERROR(rc, "PMI_Init(NULL)"); + rc = PMI_Get_rank(NULL); + PRINT_ERROR(rc, "PMI_Get_rank(NULL)"); + rc = PMI_Get_size(NULL); + PRINT_ERROR(rc, "PMI_Get_size(NULL)"); + rc = PMI_KVS_Get_name_length_max(NULL); + PRINT_ERROR(rc, "PMI_Get_name_length_max(NULL)"); + rc = PMI_Get_id_length_max(NULL); + PRINT_ERROR(rc, "PMI_Get_id_length_max(NULL)"); + rc = PMI_KVS_Get_key_length_max(NULL); + PRINT_ERROR(rc, "PMI_Get_key_length_max(NULL)"); + rc = PMI_KVS_Get_value_length_max(NULL); + PRINT_ERROR(rc, "PMI_Get_value_length_max(NULL)"); + rc = PMI_KVS_Iter_first("baloney", key, key_maxlen, val, val_maxlen); + PRINT_ERROR(rc, "PMI_KVS_Iter_first(baloney, key, key_maxlen, val, val_maxlen)"); + rc = PMI_KVS_Iter_first(NULL, key, key_maxlen, val, val_maxlen); + PRINT_ERROR(rc, "PMI_KVS_Iter_first(NULL, key, key_maxlen, val, val_maxlen)"); + rc = PMI_KVS_Iter_first(kvsname, NULL, key_maxlen, val, val_maxlen); + PRINT_ERROR(rc, "PMI_KVS_Iter_first(kvsname, NULL, key_maxlen, val, val_maxlen)"); + rc = PMI_KVS_Iter_first(kvsname, key, -1, val, val_maxlen); + PRINT_ERROR(rc, "PMI_KVS_Iter_first(kvsname, key, -1, val, val_maxlen)"); + rc = PMI_KVS_Iter_first(kvsname, key, key_maxlen, NULL, val_maxlen); + PRINT_ERROR(rc, "PMI_KVS_Iter_first(kvsname, key, key_maxlen, NULL, val_maxlen)"); + rc = PMI_KVS_Iter_first(kvsname, key, key_maxlen, val, -1); + PRINT_ERROR(rc, "PMI_KVS_Iter_first(kvsname, key, key_maxlen, val, -1)"); + rc = PMI_KVS_Iter_next("baloney", key, key_maxlen, val, val_maxlen); + PRINT_ERROR(rc, "PMI_KVS_Iter_next(baloney, key, key_maxlen, val, val_maxlen)"); + rc = PMI_KVS_Iter_next(NULL, key, key_maxlen, val, val_maxlen); + PRINT_ERROR(rc, "PMI_KVS_Iter_next(NULL, key, key_maxlen, val, val_maxlen)"); + rc = PMI_KVS_Iter_next(kvsname, NULL, key_maxlen, val, val_maxlen); + PRINT_ERROR(rc, "PMI_KVS_Iter_next(kvsname, NULL, key_maxlen, val, val_maxlen)"); + rc = PMI_KVS_Iter_next(kvsname, key, -1, val, val_maxlen); + PRINT_ERROR(rc, "PMI_KVS_Iter_next(kvsname, key, -1, val, val_maxlen)"); + rc = PMI_KVS_Iter_next(kvsname, key, key_maxlen, NULL, val_maxlen); + PRINT_ERROR(rc, "PMI_KVS_Iter_next(kvsname, key, key_maxlen, NULL, val_maxlen)"); + rc = PMI_KVS_Iter_next(kvsname, key, key_maxlen, val, -1); + PRINT_ERROR(rc, "PMI_KVS_Iter_next(kvsname, key, key_maxlen, val, -1)"); +} + +rc = PMI_Finalize( ); +return 0; +} diff --git a/opal/mca/pmix/pmix3x/pmix/test/simple/simpclient.c b/opal/mca/pmix/pmix3x/pmix/test/simple/simpclient.c index 80aea143083..45b74292857 100644 --- a/opal/mca/pmix/pmix3x/pmix/test/simple/simpclient.c +++ b/opal/mca/pmix/pmix3x/pmix/test/simple/simpclient.c @@ -13,7 +13,7 @@ * All rights reserved. * Copyright (c) 2009-2012 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2013-2019 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2020 Intel, Inc. All rights reserved. * Copyright (c) 2015 Mellanox Technologies, Inc. All rights reserved. * $COPYRIGHT$ * @@ -34,6 +34,7 @@ #include "src/class/pmix_object.h" #include "src/util/output.h" #include "src/util/printf.h" +#include "src/include/pmix_globals.h" #define MAXCNT 1 @@ -154,16 +155,43 @@ int main(int argc, char **argv) exit(rc); } PMIX_INFO_FREE(iptr, 2); - pmix_output(0, "Client ns %s rank %d: Running", myproc.nspace, myproc.rank); + pmix_output(0, "Client ns %s rank %d: Running on node %s", myproc.nspace, myproc.rank, pmix_globals.hostname); /* test something */ (void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_NSLEN); proc.rank = PMIX_RANK_WILDCARD; if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_JOB_SIZE, NULL, 0, &val))) { + pmix_output(0, "Client ns %s rank %d: PMIx_Get job size failed: %s", + myproc.nspace, myproc.rank, PMIx_Error_string(rc)); + exit(rc); + } + nprocs = val->data.uint32; + PMIX_VALUE_RELEASE(val); + pmix_output(0, "Client %s:%d job size %d", myproc.nspace, myproc.rank, nprocs); + + /* test something */ + if (PMIX_SUCCESS != (rc = PMIx_Get(&myproc, PMIX_SERVER_URI, NULL, 0, &val))) { pmix_output(0, "Client ns %s rank %d: PMIx_Get failed: %s", myproc.nspace, myproc.rank, PMIx_Error_string(rc)); exit(rc); } + pmix_output(0, "CLIENT SERVER URI: %s", val->data.string); + PMIX_VALUE_RELEASE(val); + + if (PMIX_SUCCESS != (rc = PMIx_Get(&myproc, PMIX_LOCAL_RANK, NULL, 0, &val))) { + pmix_output(0, "Client ns %s rank %d: PMIx_Get LOCAL RANK failed: %s", + myproc.nspace, myproc.rank, PMIx_Error_string(rc)); + exit(rc); + } + pmix_output(0, "CLIENT LOCAL RANK: %u", val->data.uint32); + PMIX_VALUE_RELEASE(val); + + if (PMIX_SUCCESS != (rc = PMIx_Get(&myproc, PMIX_HOSTNAME, NULL, 0, &val))) { + pmix_output(0, "Client ns %s rank %d: PMIx_Get HOSTNAME failed: %s", + myproc.nspace, myproc.rank, PMIx_Error_string(rc)); + exit(rc); + } + pmix_output(0, "CLIENT HOSTNAME: %s", val->data.string); PMIX_VALUE_RELEASE(val); /* register a handler specifically for when models declare */ @@ -188,18 +216,6 @@ int main(int argc, char **argv) } - /* get our job size */ - (void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_NSLEN); - proc.rank = PMIX_RANK_WILDCARD; - if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_JOB_SIZE, NULL, 0, &val))) { - pmix_output(0, "Client ns %s rank %d: PMIx_Get job size failed: %s", - myproc.nspace, myproc.rank, PMIx_Error_string(rc)); - goto done; - } - nprocs = val->data.uint32; - PMIX_VALUE_RELEASE(val); - pmix_output(0, "Client %s:%d job size %d", myproc.nspace, myproc.rank, nprocs); - /* put a few values */ (void)asprintf(&tmp, "%s-%d-internal", myproc.nspace, myproc.rank); value.type = PMIX_UINT32; @@ -207,14 +223,14 @@ int main(int argc, char **argv) if (PMIX_SUCCESS != (rc = PMIx_Store_internal(&myproc, tmp, &value))) { pmix_output(0, "Client ns %s rank %d: PMIx_Store_internal failed: %s", myproc.nspace, myproc.rank, PMIx_Error_string(rc)); - goto done; + exit(rc); } /* get a list of our local peers */ if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_LOCAL_PEERS, NULL, 0, &val))) { pmix_output(0, "Client ns %s rank %d: PMIx_Get local peers failed: %s", myproc.nspace, myproc.rank, PMIx_Error_string(rc)); - goto done; + exit(rc); } /* split the returned string to get the rank of each local peer */ peers = pmix_argv_split(val->data.string, ','); @@ -232,13 +248,14 @@ int main(int argc, char **argv) pmix_argv_free(peers); for (cnt=0; cnt < MAXCNT; cnt++) { + pmix_output(0, "Client %s:%d executing loop %d", myproc.nspace, myproc.rank, cnt); (void)asprintf(&tmp, "%s-%d-local-%d", myproc.nspace, myproc.rank, cnt); value.type = PMIX_UINT64; value.data.uint64 = 1234; if (PMIX_SUCCESS != (rc = PMIx_Put(PMIX_LOCAL, tmp, &value))) { pmix_output(0, "Client ns %s rank %d: PMIx_Put internal failed: %s", myproc.nspace, myproc.rank, PMIx_Error_string(rc)); - goto done; + exit(rc); } (void)asprintf(&tmp, "%s-%d-remote-%d", myproc.nspace, myproc.rank, cnt); @@ -247,13 +264,13 @@ int main(int argc, char **argv) if (PMIX_SUCCESS != (rc = PMIx_Put(PMIX_REMOTE, tmp, &value))) { pmix_output(0, "Client ns %s rank %d: PMIx_Put internal failed: %s", myproc.nspace, myproc.rank, PMIx_Error_string(rc)); - goto done; + exit(rc); } if (PMIX_SUCCESS != (rc = PMIx_Commit())) { pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Commit failed: %s", myproc.nspace, myproc.rank, cnt, PMIx_Error_string(rc)); - goto done; + exit(rc); } /* call fence to ensure the data is received */ @@ -263,7 +280,7 @@ int main(int argc, char **argv) if (PMIX_SUCCESS != (rc = PMIx_Fence(&proc, 1, NULL, 0))) { pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Fence failed: %s", myproc.nspace, myproc.rank, cnt, PMIx_Error_string(rc)); - goto done; + exit(rc); } /* check the returned data */ @@ -293,19 +310,19 @@ int main(int argc, char **argv) if (NULL == val) { pmix_output(0, "Client ns %s rank %d: NULL value returned", myproc.nspace, myproc.rank); - break; + exit(1); } if (PMIX_UINT64 != val->type) { pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Get %s returned wrong type: %d", myproc.nspace, myproc.rank, j, tmp, val->type); PMIX_VALUE_RELEASE(val); free(tmp); - continue; + exit(1); } if (1234 != val->data.uint64) { pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Get %s returned wrong value: %d", myproc.nspace, myproc.rank, j, tmp, (int)val->data.uint64); PMIX_VALUE_RELEASE(val); free(tmp); - continue; + exit(1); } pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Get %s returned correct", myproc.nspace, myproc.rank, j, tmp); PMIX_VALUE_RELEASE(val); @@ -322,6 +339,7 @@ int main(int argc, char **argv) } else { pmix_output(0, "ERROR: Client ns %s rank %d cnt %d: PMIx_Get %s returned remote data for a local proc", myproc.nspace, myproc.rank, j, tmp); + exit(1); } if (NULL != val) { PMIX_VALUE_RELEASE(val); @@ -329,14 +347,18 @@ int main(int argc, char **argv) free(tmp); } } else { + val = NULL; (void)asprintf(&tmp, "%s-%d-remote-%d", proc.nspace, n, j); - if (PMIX_SUCCESS == (rc = PMIx_Get(&proc, tmp, NULL, 0, &val))) { + if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, tmp, NULL, 0, &val))) { pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Get %s returned correct", myproc.nspace, myproc.rank, j, tmp); } else { pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Get %s failed for remote proc", myproc.nspace, myproc.rank, j, tmp); + exit(1); + } + if (NULL != val) { + PMIX_VALUE_RELEASE(val); } - PMIX_VALUE_RELEASE(val); free(tmp); } } @@ -351,14 +373,17 @@ int main(int argc, char **argv) pmix_output(0, "Client ns %s rank %d did not return an array for its internal modex blob", myproc.nspace, myproc.rank); PMIX_VALUE_RELEASE(val); + exit(1); } else if (PMIX_INFO != val->data.darray->type) { pmix_output(0, "Client ns %s rank %d returned an internal modex array of type %s instead of PMIX_INFO", myproc.nspace, myproc.rank, PMIx_Data_type_string(val->data.darray->type)); PMIX_VALUE_RELEASE(val); + exit(1); } else if (0 == val->data.darray->size) { pmix_output(0, "Client ns %s rank %d returned an internal modex array of zero length", myproc.nspace, myproc.rank); PMIX_VALUE_RELEASE(val); + exit(1); } else { pmix_info_t *iptr = (pmix_info_t*)val->data.darray->array; for (n=0; n < val->data.darray->size; n++) { @@ -369,6 +394,7 @@ int main(int argc, char **argv) } else { pmix_output(0, "Client ns %s rank %d internal modex blob FAILED with error %s(%d)", myproc.nspace, myproc.rank, PMIx_Error_string(rc), rc); + exit(rc); } /* log something */ @@ -379,6 +405,7 @@ int main(int argc, char **argv) if (PMIX_SUCCESS != rc) { pmix_output(0, "Client ns %s rank %d - log_nb returned %s", myproc.nspace, myproc.rank, PMIx_Error_string(rc)); + exit(rc); } else { while (active) { usleep(10); @@ -397,7 +424,6 @@ int main(int argc, char **argv) } } - done: /* finalize us */ pmix_output(0, "Client ns %s rank %d: Finalizing", myproc.nspace, myproc.rank); if (PMIX_SUCCESS != (rc = PMIx_Finalize(NULL, 0))) { diff --git a/opal/mca/pmix/pmix3x/pmix/test/simple/simptest.c b/opal/mca/pmix/pmix3x/pmix/test/simple/simptest.c index b78165f704a..1cbb4e78cea 100644 --- a/opal/mca/pmix/pmix3x/pmix/test/simple/simptest.c +++ b/opal/mca/pmix/pmix3x/pmix/test/simple/simptest.c @@ -198,8 +198,17 @@ static pmix_event_t handler; static pmix_list_t children; static bool istimeouttest = false; static mylock_t globallock; +static bool nettest = false; +static bool model = false; +static bool xversion = false; +static char *hostnames[] = { + "test000", + "test001", + "test002", + NULL +}; -static void set_namespace(int nprocs, char *ranks, char *nspace, +static void set_namespace(int nprocs, char *nspace, pmix_op_cbfunc_t cbfunc, myxfer_t *x); static void errhandler(size_t evhdlr_registration_id, pmix_status_t status, @@ -262,6 +271,32 @@ static void infocbfunc(pmix_status_t status, DEBUG_WAKEUP_THREAD(lock); } +static void setup_cbfunc(pmix_status_t status, + pmix_info_t info[], size_t ninfo, + void *provided_cbdata, + pmix_op_cbfunc_t cbfunc, void *cbdata) +{ + myxfer_t *x = (myxfer_t*)provided_cbdata; + size_t n; + + /* transfer it to the caddy for return to the main thread */ + if (0 < ninfo) { + PMIX_INFO_CREATE(x->info, ninfo); + x->ninfo = ninfo; + for (n=0; n < ninfo; n++) { + PMIX_INFO_XFER(&x->info[n], &info[n]); + } + } + + /* let the library release the data and cleanup from + * the operation */ + if (NULL != cbfunc) { + cbfunc(PMIX_SUCCESS, cbdata); + } + + DEBUG_WAKEUP_THREAD(&x->lock); +} + /* this is an event notification function that we explicitly request * be called when the PMIX_MODEL_DECLARED notification is issued. * We could catch it in the general event notification function and test @@ -297,21 +332,6 @@ static void model_callback(size_t evhdlr_registration_id, DEBUG_WAKEUP_THREAD(&globallock); } -/* event handler registration is done asynchronously */ -static void model_registration_callback(pmix_status_t status, - size_t evhandler_ref, - void *cbdata) -{ - mylock_t *lock = (mylock_t*)cbdata; - - if (PMIX_SUCCESS != status) { - fprintf(stderr, "simptest EVENT HANDLER REGISTRATION FAILED WITH STATUS %d, ref=%lu\n", - status, (unsigned long)evhandler_ref); - } - lock->status = status; - DEBUG_WAKEUP_THREAD(lock); -} - static void set_handler_default(int sig) { struct sigaction act; @@ -327,7 +347,7 @@ int main(int argc, char **argv) { char **client_env=NULL; char **client_argv=NULL; - char *tmp, **atmp, *executable=NULL; + char *executable=NULL; int rc, nprocs=1, n, k; uid_t myuid; gid_t mygid; @@ -337,8 +357,6 @@ int main(int argc, char **argv) wait_tracker_t *child; pmix_info_t *info; size_t ninfo; - bool cross_version = false; - bool usock = true; bool hwloc = false; #if PMIX_HAVE_HWLOC char *hwloc_file = NULL; @@ -371,14 +389,6 @@ int main(int argc, char **argv) pmix_argv_append_nosize(&client_argv, argv[k]); } n += k; - } else if (0 == strcmp("-x", argv[n])) { - /* cross-version test - we will set one child to - * run at a different version. Requires -n >= 2 */ - cross_version = true; - usock = false; - } else if (0 == strcmp("-u", argv[n])) { - /* enable usock */ - usock = false; #if PMIX_HAVE_HWLOC } else if (0 == strcmp("-hwloc", argv[n]) || 0 == strcmp("--hwloc", argv[n])) { @@ -399,15 +409,32 @@ int main(int argc, char **argv) fprintf(stderr, "usage: simptest \n"); fprintf(stderr, " -n N Number of clients to run\n"); fprintf(stderr, " -e foo Name of the client executable to run (default: simpclient\n"); - fprintf(stderr, " -x Test cross-version support\n"); fprintf(stderr, " -u Enable legacy usock support\n"); fprintf(stderr, " -hwloc Test hwloc support\n"); fprintf(stderr, " -hwloc-file FILE Use file to import topology\n"); + fprintf(stderr, " -net-test Test network endpt assignments\n"); + fprintf(stderr, " -xversion Cross-version test - simulate single node only\n"); exit(0); + } else if (0 == strcmp("-net-test", argv[n]) || + 0 == strcmp("--net-test", argv[n])) { + /* test network support */ + nettest = true; + } else if (0 == strcmp("-model", argv[n]) || + 0 == strcmp("--model", argv[n])) { + /* test network support */ + model = true; + } else if (0 == strcmp("-x", argv[n]) || + 0 == strcmp("-xversion", argv[n]) || + 0 == strcmp("--xversion", argv[n])) { + xversion = true; } } if (NULL == executable) { - executable = strdup("./simpclient"); + if (nettest) { + executable = strdup("./simpcoord"); + } else { + executable = strdup("./simpclient"); + } } /* check for executable existence and permissions */ if (0 != access(executable, X_OK)) { @@ -415,11 +442,6 @@ int main(int argc, char **argv) exit(1); } - if (cross_version && nprocs < 2) { - fprintf(stderr, "Cross-version testing requires at least two clients\n"); - exit(1); - } - #if !PMIX_HAVE_HWLOC if (hwloc) { fprintf(stderr, "PMIx was not configured with HWLOC support - cannot continue\n"); @@ -448,37 +470,45 @@ int main(int argc, char **argv) #if PMIX_HAVE_HWLOC if (hwloc) { #if HWLOC_API_VERSION < 0x20000 - ninfo = 3; + ninfo = 2; #else - ninfo = 4; + ninfo = 3; #endif } else { - ninfo = 2; + ninfo = 1; } #else - ninfo = 2; + ninfo = 1; #endif PMIX_INFO_CREATE(info, ninfo); PMIX_INFO_LOAD(&info[0], PMIX_SERVER_TOOL_SUPPORT, NULL, PMIX_BOOL); - PMIX_INFO_LOAD(&info[1], PMIX_SERVER_GATEWAY, NULL, PMIX_BOOL); #if PMIX_HAVE_HWLOC if (hwloc) { if (NULL != hwloc_file) { - PMIX_INFO_LOAD(&info[2], PMIX_TOPOLOGY_FILE, hwloc_file, PMIX_STRING); + PMIX_INFO_LOAD(&info[1], PMIX_TOPOLOGY_FILE, hwloc_file, PMIX_STRING); } else { - PMIX_INFO_LOAD(&info[2], PMIX_TOPOLOGY, NULL, PMIX_STRING); + PMIX_INFO_LOAD(&info[1], PMIX_TOPOLOGY, NULL, PMIX_STRING); } #if HWLOC_API_VERSION >= 0x20000 - PMIX_INFO_LOAD(&info[3], PMIX_HWLOC_SHARE_TOPO, NULL, PMIX_BOOL); + PMIX_INFO_LOAD(&info[2], PMIX_HWLOC_SHARE_TOPO, NULL, PMIX_BOOL); #endif } #endif + if (nettest) { + /* set a known network configuration for the pnet/test component */ + putenv("PMIX_MCA_pnet_test_planes=plane:d:3;plane:s:2;plane:d:5:2"); + putenv("PMIX_MCA_pnet=test"); + } if (PMIX_SUCCESS != (rc = PMIx_server_init(&mymodule, info, ninfo))) { - fprintf(stderr, "Init failed with error %d\n", rc); + fprintf(stderr, "Init failed with error %s\n", PMIx_Error_string(rc)); return rc; } PMIX_INFO_FREE(info, ninfo); + if (nettest) { + unsetenv("PMIX_MCA_pnet"); + unsetenv("PMIX_MCA_pnet_test_planes"); + } /* register the default errhandler */ DEBUG_CONSTRUCT_LOCK(&mylock); @@ -501,8 +531,7 @@ int main(int argc, char **argv) PMIX_INFO_LOAD(&info[0], PMIX_EVENT_HDLR_NAME, "SIMPTEST-MODEL", PMIX_STRING); code = PMIX_MODEL_DECLARED; PMIx_Register_event_handler(&code, 1, info, ninfo, - model_callback, model_registration_callback, (void*)&mylock); - DEBUG_WAIT_THREAD(&mylock); + model_callback, NULL, NULL); PMIX_INFO_FREE(info, ninfo); if (PMIX_SUCCESS != mylock.status) { exit(mylock.status); @@ -519,16 +548,12 @@ int main(int argc, char **argv) pmix_event_add(&handler, NULL); /* we have a single namespace for all clients */ - atmp = NULL; - for (n=0; n < nprocs; n++) { - asprintf(&tmp, "%d", n); - pmix_argv_append_nosize(&atmp, tmp); - free(tmp); - } - tmp = pmix_argv_join(atmp, ','); - pmix_argv_free(atmp); x = PMIX_NEW(myxfer_t); - set_namespace(nprocs, tmp, "foobar", opcbfunc, x); + set_namespace(nprocs, "foobar", opcbfunc, x); + /* if the nspace registration hasn't completed yet, + * wait for it here */ + DEBUG_WAIT_THREAD(&x->lock); + PMIX_RELEASE(x); /* set common argv and env */ client_env = pmix_argv_copy(environ); @@ -553,33 +578,21 @@ int main(int argc, char **argv) } DEBUG_DESTRUCT_LOCK(&mylock); - /* if the nspace registration hasn't completed yet, - * wait for it here */ - DEBUG_WAIT_THREAD(&x->lock); - free(tmp); - PMIX_RELEASE(x); /* fork/exec the test */ (void)strncpy(proc.nspace, "foobar", PMIX_MAX_NSLEN); for (n = 0; n < nprocs; n++) { proc.rank = n; - if (PMIX_SUCCESS != (rc = PMIx_server_setup_fork(&proc, &client_env))) {//n + if (PMIX_SUCCESS != (rc = PMIx_server_setup_fork(&proc, &client_env))) { fprintf(stderr, "Server fork setup failed with error %d\n", rc); PMIx_server_finalize(); return rc; } - /* if cross-version test is requested, then oscillate PTL support - * by rank */ - if (cross_version) { - if (0 == n % 2) { - pmix_setenv("PMIX_MCA_ptl", "tcp", true, &client_env); - } else { - pmix_setenv("PMIX_MCA_ptl", "usock", true, &client_env); - } - } else if (!usock) { - /* don't disable usock => enable it on client */ - pmix_setenv("PMIX_MCA_ptl", "usock", true, &client_env); + /* add the hostname we want them to use */ + if (!xversion) { + PMIX_SETENV(rc, "PMIX_HOSTNAME", hostnames[n % 3], &client_env); } + x = PMIX_NEW(myxfer_t); if (PMIX_SUCCESS != (rc = PMIx_server_register_client(&proc, myuid, mygid, NULL, opcbfunc, x))) { @@ -699,108 +712,153 @@ int main(int argc, char **argv) return exit_code; } -static void set_namespace(int nprocs, char *ranks, char *nspace, +static void set_namespace(int nprocs, char *nspace, pmix_op_cbfunc_t cbfunc, myxfer_t *x) { - char *regex, *ppn; - int n, m, k; - pmix_info_t *info; + char *regex, *ppn, *rks; + int n, m, k, nnodes; pmix_data_array_t *array; + pmix_info_t *info, *iptr, *ip; + pmix_info_t *isv1, *isv2; + myxfer_t cd, lock; + pmix_status_t rc; + char **map[3] = {NULL, NULL, NULL}; + char *peers[3] = {NULL, NULL, NULL}; + char tmp[50] , **agg = NULL; + + if (xversion) { + /* everything on one node */ + PMIx_generate_regex(pmix_globals.hostname, ®ex); + for (m=0; m < nprocs; m++) { + snprintf(tmp, 50, "%d", m); + pmix_argv_append_nosize(&agg, tmp); + memset(tmp, 0, 50); + } + rks = pmix_argv_join(agg, ','); + pmix_argv_free(agg); + PMIx_generate_ppn(rks, &ppn); + free(rks); + nnodes = 1; + } else { + if (nprocs < 3) { + /* take only the number of hostnames equal to + * the number of procs */ + for (m=0; m < nprocs; m++) { + pmix_argv_append_nosize(&agg, hostnames[m]); + } + ppn = pmix_argv_join(agg, ','); + pmix_argv_free(agg); + agg = NULL; + nnodes = nprocs; + } else { + nnodes = 3; + ppn = pmix_argv_join(hostnames, ','); + } + PMIx_generate_regex(ppn, ®ex); + free(ppn); + /* compute the placement of the procs */ + for (m=0; m < nprocs; m++) { + snprintf(tmp, 50, "%d", m); + pmix_argv_append_nosize(&map[m%3], tmp); + memset(tmp, 0, 50); + } + for (m=0; m < 3; m++) { + if (NULL != map[m]) { + peers[m] = pmix_argv_join(map[m], ','); + pmix_argv_append_nosize(&agg, peers[m]); + pmix_argv_free(map[m]); + } + } + rks = pmix_argv_join(agg, ';'); + pmix_argv_free(agg); + PMIx_generate_ppn(rks, &ppn); + free(rks); + } - x->ninfo = 16 + nprocs; - + x->ninfo = 1 + nprocs + nnodes; PMIX_INFO_CREATE(x->info, x->ninfo); - n = 0; - - PMIx_generate_regex("test000,test001,test002", ®ex); - PMIx_generate_ppn("0;1;2", &ppn); - - (void)strncpy(x->info[n].key, PMIX_NODE_MAP, PMIX_MAX_KEYLEN); - x->info[n].value.type = PMIX_STRING; - x->info[n].value.data.string = regex; - ++n; - - /* if we have some empty nodes, then fill their spots */ - (void)strncpy(x->info[n].key, PMIX_PROC_MAP, PMIX_MAX_KEYLEN); - x->info[n].value.type = PMIX_STRING; - x->info[n].value.data.string = ppn; - ++n; - - (void)strncpy(x->info[n].key, PMIX_UNIV_SIZE, PMIX_MAX_KEYLEN); - x->info[n].value.type = PMIX_UINT32; - x->info[n].value.data.uint32 = nprocs; - ++n; - - (void)strncpy(x->info[n].key, PMIX_SPAWNED, PMIX_MAX_KEYLEN); - x->info[n].value.type = PMIX_UINT32; - x->info[n].value.data.uint32 = 0; - ++n; - - (void)strncpy(x->info[n].key, PMIX_LOCAL_SIZE, PMIX_MAX_KEYLEN); - x->info[n].value.type = PMIX_UINT32; - x->info[n].value.data.uint32 = nprocs; - ++n; - - (void)strncpy(x->info[n].key, PMIX_LOCAL_PEERS, PMIX_MAX_KEYLEN); - x->info[n].value.type = PMIX_STRING; - x->info[n].value.data.string = strdup(ranks); - ++n; - - (void)strncpy(x->info[n].key, PMIX_JOB_SIZE, PMIX_MAX_KEYLEN); - x->info[n].value.type = PMIX_UINT32; - x->info[n].value.data.uint32 = nprocs; - ++n; - - (void)strncpy(x->info[n].key, PMIX_JOBID, PMIX_MAX_KEYLEN); - x->info[n].value.type = PMIX_STRING; - x->info[n].value.data.string = strdup("1234"); - ++n; - - (void)strncpy(x->info[n].key, PMIX_NPROC_OFFSET, PMIX_MAX_KEYLEN); - x->info[n].value.type = PMIX_UINT32; - x->info[n].value.data.uint32 = 0; - ++n; - - (void)strncpy(x->info[n].key, PMIX_NODEID, PMIX_MAX_KEYLEN); - x->info[n].value.type = PMIX_UINT32; - x->info[n].value.data.uint32 = 0; - ++n; - - (void)strncpy(x->info[n].key, PMIX_NODE_SIZE, PMIX_MAX_KEYLEN); - x->info[n].value.type = PMIX_UINT32; - x->info[n].value.data.uint32 = nprocs; - ++n; - (void)strncpy(x->info[n].key, PMIX_NUM_NODES, PMIX_MAX_KEYLEN); - x->info[n].value.type = PMIX_UINT32; - x->info[n].value.data.uint32 = 1; - ++n; - - (void)strncpy(x->info[n].key, PMIX_UNIV_SIZE, PMIX_MAX_KEYLEN); - x->info[n].value.type = PMIX_UINT32; - x->info[n].value.data.uint32 = nprocs; + n = 0; + (void)strncpy(x->info[n].key, PMIX_JOB_INFO_ARRAY, PMIX_MAX_KEYLEN); + x->info[n].value.type = PMIX_DATA_ARRAY; + if (model) { + PMIX_DATA_ARRAY_CREATE(x->info[n].value.data.darray, 10, PMIX_INFO); + } else { + PMIX_DATA_ARRAY_CREATE(x->info[n].value.data.darray, 9, PMIX_INFO); + } + iptr = (pmix_info_t*)x->info[n].value.data.darray->array; + PMIX_INFO_LOAD(&iptr[0], PMIX_NODE_MAP, regex, PMIX_STRING); + isv1 = &iptr[0]; + PMIX_INFO_LOAD(&iptr[1], PMIX_PROC_MAP, ppn, PMIX_STRING); + isv2 = &iptr[1]; + PMIX_INFO_LOAD(&iptr[2], PMIX_JOB_SIZE, &nprocs, PMIX_UINT32); + PMIX_INFO_LOAD(&iptr[3], PMIX_JOBID, "1234", PMIX_STRING); + PMIX_INFO_LOAD(&iptr[4], PMIX_UNIV_SIZE, &nprocs, PMIX_UINT32); + PMIX_INFO_LOAD(&iptr[5], PMIX_MAX_PROCS, &nprocs, PMIX_UINT32); + m = 1; + PMIX_INFO_LOAD(&iptr[6], PMIX_JOB_NUM_APPS, &m, PMIX_UINT32); + PMIX_INFO_LOAD(&iptr[7], PMIX_NUM_NODES, &nnodes, PMIX_UINT32); + PMIX_INFO_LOAD(&iptr[8], PMIX_SPAWNED, NULL, PMIX_BOOL); + if (model) { + PMIX_INFO_LOAD(&iptr[9], PMIX_PROGRAMMING_MODEL, "ompi", PMIX_STRING); + } ++n; - (void)strncpy(x->info[n].key, PMIX_MAX_PROCS, PMIX_MAX_KEYLEN); - x->info[n].value.type = PMIX_UINT32; - x->info[n].value.data.uint32 = nprocs; - ++n; + /* we have the required info to run setup_app, so do that now */ + PMIX_INFO_CREATE(iptr, 4); + PMIX_INFO_XFER(&iptr[0], isv1); + PMIX_INFO_XFER(&iptr[1], isv2); + PMIX_INFO_LOAD(&iptr[2], PMIX_SETUP_APP_ENVARS, NULL, PMIX_BOOL); + PMIX_LOAD_KEY(iptr[3].key, PMIX_ALLOC_NETWORK); + iptr[3].value.type = PMIX_DATA_ARRAY; + PMIX_DATA_ARRAY_CREATE(iptr[3].value.data.darray, 2, PMIX_INFO); + ip = (pmix_info_t*)iptr[3].value.data.darray->array; + asprintf(&rks, "%s.net", nspace); + PMIX_INFO_LOAD(&ip[0], PMIX_ALLOC_NETWORK_ID, rks, PMIX_STRING); + free(rks); + PMIX_INFO_LOAD(&ip[1], PMIX_ALLOC_NETWORK_SEC_KEY, NULL, PMIX_BOOL); + PMIX_CONSTRUCT(&cd, myxfer_t); + if (PMIX_SUCCESS != (rc = PMIx_server_setup_application(nspace, iptr, 4, + setup_cbfunc, &cd))) { + pmix_output(0, "[%s:%d] PMIx_server_setup_application failed: %s", __FILE__, __LINE__, PMIx_Error_string(rc)); + DEBUG_DESTRUCT_LOCK(&cd.lock); + } else { + DEBUG_WAIT_THREAD(&cd.lock); + } + PMIX_INFO_FREE(iptr, 4); - (void)strncpy(x->info[n].key, PMIX_JOB_NUM_APPS, PMIX_MAX_KEYLEN); - x->info[n].value.type = PMIX_UINT32; - x->info[n].value.data.uint32 = 1; - ++n; + /* use the results to setup the local subsystems */ + PMIX_CONSTRUCT(&lock, myxfer_t); + if (PMIX_SUCCESS != (rc = PMIx_server_setup_local_support(nspace, cd.info, cd.ninfo, + opcbfunc, &lock))) { + pmix_output(0, "[%s:%d] PMIx_server_setup_local_support failed: %s", __FILE__, __LINE__, PMIx_Error_string(rc)); + } else { + DEBUG_WAIT_THREAD(&lock.lock); + } + PMIX_DESTRUCT(&lock); + PMIX_DESTRUCT(&cd); - (void)strncpy(x->info[n].key, PMIX_LOCALLDR, PMIX_MAX_KEYLEN); - x->info[n].value.type = PMIX_PROC_RANK; - x->info[n].value.data.uint32 = 0; - ++n; + /* create the node-info arrays */ + for (m=0; m < nnodes; m++) { + (void)strncpy(x->info[n].key, PMIX_NODE_INFO_ARRAY, PMIX_MAX_KEYLEN); + x->info[n].value.type = PMIX_DATA_ARRAY; + PMIX_DATA_ARRAY_CREATE(x->info[n].value.data.darray, 3, PMIX_INFO); + iptr = (pmix_info_t*)x->info[n].value.data.darray->array; + if (xversion) { + PMIX_INFO_LOAD(&iptr[0], PMIX_HOSTNAME, pmix_globals.hostname, PMIX_STRING); + } else { + PMIX_INFO_LOAD(&iptr[0], PMIX_HOSTNAME, hostnames[m % 3], PMIX_STRING); + } + PMIX_INFO_LOAD(&iptr[1], PMIX_NODEID, &m, PMIX_UINT32); + PMIX_INFO_LOAD(&iptr[2], PMIX_NODE_SIZE, &nprocs, PMIX_UINT32); + ++n; + } /* add the proc-specific data */ for (m=0; m < nprocs; m++) { (void)strncpy(x->info[n].key, PMIX_PROC_DATA, PMIX_MAX_KEYLEN); x->info[n].value.type = PMIX_DATA_ARRAY; - PMIX_DATA_ARRAY_CREATE(array, 5, PMIX_INFO); + PMIX_DATA_ARRAY_CREATE(array, 6, PMIX_INFO); x->info[n].value.data.darray = array; info = (pmix_info_t*)array->array; k = 0; @@ -824,11 +882,21 @@ static void set_namespace(int nprocs, char *ranks, char *nspace, (void)strncpy(info[k].key, PMIX_NODEID, PMIX_MAX_KEYLEN); info[k].value.type = PMIX_UINT32; - info[k].value.data.uint32 = 0; + info[k].value.data.uint32 = m % 3; + ++k; + + (void)strncpy(info[k].key, PMIX_HOSTNAME, PMIX_MAX_KEYLEN); + info[k].value.type = PMIX_STRING; + if (xversion) { + info[k].value.data.string = strdup(pmix_globals.hostname); + } else { + info[k].value.data.string = strdup(hostnames[m % 3]); + } ++k; /* move to next proc */ ++n; } + PMIx_server_register_nspace(nspace, nprocs, x->info, x->ninfo, cbfunc, x); } @@ -1124,7 +1192,7 @@ static pmix_status_t spawn_fn(const pmix_proc_t *proc, x->spcbfunc = cbfunc; x->cbdata = cbdata; - set_namespace(2, "0,1", "DYNSPACE", spcbfunc, x); + set_namespace(2, "DYNSPACE", spcbfunc, x); return PMIX_SUCCESS; } diff --git a/opal/mca/pmix/pmix3x/pmix/test/test_cd.c b/opal/mca/pmix/pmix3x/pmix/test/test_cd.c index f1a800202e2..3feb4f50474 100644 --- a/opal/mca/pmix/pmix3x/pmix/test/test_cd.c +++ b/opal/mca/pmix/pmix3x/pmix/test/test_cd.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018 Intel, Inc. All rights reserved. + * Copyright (c) 2015-2019 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -45,14 +45,14 @@ int test_connect_disconnect(char *my_nspace, int my_rank) rc = PMIx_Connect(&proc, 1, NULL, 0); if (PMIX_SUCCESS != rc) { TEST_ERROR(("%s:%d: Connect blocking test failed.", my_nspace, my_rank)); - return PMIX_ERROR; + exit(PMIX_ERROR); } TEST_VERBOSE(("%s:%d: Connect blocking test succeded", my_nspace, my_rank)); rc = PMIx_Disconnect(&proc, 1, NULL, 0); if (PMIX_SUCCESS != rc) { TEST_ERROR(("%s:%d: Disconnect blocking test failed.", my_nspace, my_rank)); - return PMIX_ERROR; + exit(PMIX_ERROR); } TEST_VERBOSE(("%s:%d: Disconnect blocking test succeded.", my_nspace, my_rank)); @@ -64,7 +64,7 @@ int test_connect_disconnect(char *my_nspace, int my_rank) } if (PMIX_SUCCESS != rc) { TEST_ERROR(("%s:%d: Connect non-blocking test failed.", my_nspace, my_rank)); - return PMIX_ERROR; + exit(PMIX_ERROR); } TEST_VERBOSE(("%s:%d: Connect non-blocking test succeded.", my_nspace, my_rank)); @@ -76,7 +76,7 @@ int test_connect_disconnect(char *my_nspace, int my_rank) } if (PMIX_SUCCESS != rc) { TEST_ERROR(("%s:%d: Disconnect non-blocking test failed.", my_nspace, my_rank)); - return PMIX_ERROR; + exit(PMIX_ERROR); } TEST_VERBOSE(("%s:%d: Disconnect non-blocking test succeded.", my_nspace, my_rank)); return PMIX_SUCCESS; diff --git a/opal/mca/pmix/pmix3x/pmix/test/test_common.c b/opal/mca/pmix/pmix3x/pmix/test/test_common.c index 7b9ac8701d1..ca594bcad50 100644 --- a/opal/mca/pmix/pmix3x/pmix/test/test_common.c +++ b/opal/mca/pmix/pmix3x/pmix/test/test_common.c @@ -21,6 +21,7 @@ #include int pmix_test_verbose = 0; +test_params params; FILE *file; @@ -92,6 +93,10 @@ void parse_cmd(int argc, char **argv, test_params *params) if (NULL != argv[i]) { params->nservers = atoi(argv[i]); } + if (2 < params->nservers) { + fprintf(stderr, "Only support up to 2 servers\n"); + exit(1); + } } else if( 0 == strcmp(argv[i], "--verbose") || 0 == strcmp(argv[i],"-v") ){ TEST_VERBOSE_ON(); params->verbose = 1; diff --git a/opal/mca/pmix/pmix3x/pmix/test/test_common.h b/opal/mca/pmix/pmix3x/pmix/test/test_common.h index 490f68323be..2d6a33af0f5 100644 --- a/opal/mca/pmix/pmix3x/pmix/test/test_common.h +++ b/opal/mca/pmix/pmix3x/pmix/test/test_common.h @@ -133,6 +133,8 @@ typedef struct { uint32_t lsize; } test_params; +extern test_params params; + #define INIT_TEST_PARAMS(params) do { \ params.nprocs = 1; \ params.verbose = 0; \ @@ -219,7 +221,6 @@ extern pmix_list_t test_fences; extern pmix_list_t *noise_range; extern pmix_list_t key_replace; -#define NODE_NAME "node1" int get_total_ns_number(test_params params); int get_all_ranks_from_namespace(test_params params, char *nspace, pmix_proc_t **ranks, size_t *nranks); @@ -284,7 +285,8 @@ typedef struct { nanosleep(&ts,NULL); \ count++; \ } \ - PMIX_ACQUIRE_OBJECT(&cbdata); \ + rc = cbdata.status; \ + PMIX_ACQUIRE_OBJECT(&cbdata); \ } \ } \ if (PMIX_SUCCESS == rc) { \ diff --git a/opal/mca/pmix/pmix3x/pmix/test/test_error.c b/opal/mca/pmix/pmix3x/pmix/test/test_error.c index 24a63da4917..34fa7c3fb48 100644 --- a/opal/mca/pmix/pmix3x/pmix/test/test_error.c +++ b/opal/mca/pmix/pmix3x/pmix/test/test_error.c @@ -69,6 +69,9 @@ int test_error(char *my_nspace, int my_rank, test_params params) pmix_status_t status; pmix_proc_t source; + /* turn OFF event handler testing pending fix of timeout_errhandler */ + return PMIX_SUCCESS; + TEST_VERBOSE(("test-error: running error handling test cases")); /* register specific client error handlers and test their invocation * by trigerring events from server side*/ diff --git a/opal/mca/pmix/pmix3x/pmix/test/test_fence.c b/opal/mca/pmix/pmix3x/pmix/test/test_fence.c index a33d9618b71..f479ce59113 100644 --- a/opal/mca/pmix/pmix3x/pmix/test/test_fence.c +++ b/opal/mca/pmix/pmix3x/pmix/test/test_fence.c @@ -103,42 +103,42 @@ int test_fence(test_params params, char *my_nspace, pmix_rank_t my_rank) if (PMIX_SUCCESS != rc) { TEST_ERROR(("%s:%d: PMIx_Put failed: %d", my_nspace, my_rank, rc)); PMIX_LIST_DESTRUCT(&test_fences); - return rc; + exit(rc); } PUT(int, fence_num+my_rank, PMIX_GLOBAL, fence_num, put_ind++, params.use_same_keys); if (PMIX_SUCCESS != rc) { TEST_ERROR(("%s:%d: PMIx_Put failed: %d", my_nspace, my_rank, rc)); PMIX_LIST_DESTRUCT(&test_fences); - return rc; + exit(rc); } PUT(float, fence_num+1.1, PMIX_GLOBAL, fence_num, put_ind++, params.use_same_keys); if (PMIX_SUCCESS != rc) { TEST_ERROR(("%s:%d: PMIx_Put failed: %d", my_nspace, my_rank, rc)); PMIX_LIST_DESTRUCT(&test_fences); - return rc; + exit(rc); } PUT(uint32_t, fence_num+14, PMIX_GLOBAL, fence_num, put_ind++, params.use_same_keys); if (PMIX_SUCCESS != rc) { TEST_ERROR(("%s:%d: PMIx_Put failed: %d", my_nspace, my_rank, rc)); PMIX_LIST_DESTRUCT(&test_fences); - return rc; + exit(rc); } PUT(uint16_t, fence_num+15, PMIX_GLOBAL, fence_num, put_ind++, params.use_same_keys); if (PMIX_SUCCESS != rc) { TEST_ERROR(("%s:%d: PMIx_Put failed: %d", my_nspace, my_rank, rc)); PMIX_LIST_DESTRUCT(&test_fences); - return rc; + exit(rc); } /* Submit the data */ if (PMIX_SUCCESS != (rc = PMIx_Commit())) { TEST_ERROR(("%s:%d: PMIx_Commit failed: %d", my_nspace, my_rank, rc)); PMIX_LIST_DESTRUCT(&test_fences); - return rc; + exit(rc); } /* setup the fence */ @@ -157,7 +157,7 @@ int test_fence(test_params params, char *my_nspace, pmix_rank_t my_rank) TEST_ERROR(("%s:%d: PMIx_Fence failed: %d", my_nspace, my_rank, rc)); PMIX_LIST_DESTRUCT(&test_fences); PMIX_PROC_FREE(pcs, npcs); - return rc; + exit(rc); } /* replace all items in the list with PMIX_RANK_WILDCARD rank by real ranks to get their data. */ @@ -169,7 +169,7 @@ int test_fence(test_params params, char *my_nspace, pmix_rank_t my_rank) if (PMIX_SUCCESS != rc) { TEST_ERROR(("%s:%d: Can't parse --ns-dist value in order to get ranks for namespace %s", my_nspace, my_rank, p->proc.nspace)); PMIX_LIST_DESTRUCT(&test_fences); - return PMIX_ERROR; + exit(PMIX_ERROR); } pmix_list_remove_item(desc->participants, (pmix_list_item_t*)p); for (i = 0; i < nranks; i++) { @@ -192,35 +192,35 @@ int test_fence(test_params params, char *my_nspace, pmix_rank_t my_rank) TEST_ERROR(("%s:%d: PMIx_Get failed (%d) from %s:%d", my_nspace, my_rank, rc, p->proc.nspace, p->proc.rank)); PMIX_PROC_FREE(pcs, npcs); PMIX_LIST_DESTRUCT(&test_fences); - return rc; + exit(rc); } GET(int, (int)(fence_num+p->proc.rank), p->proc.nspace, p->proc.rank, fence_num, put_ind++, params.use_same_keys, 0, 0); if (PMIX_SUCCESS != rc) { TEST_ERROR(("%s:%d: PMIx_Get failed (%d) from %s:%d", my_nspace, my_rank, rc, p->proc.nspace, p->proc.rank)); PMIX_PROC_FREE(pcs, npcs); PMIX_LIST_DESTRUCT(&test_fences); - return rc; + exit(rc); } GET(float, fence_num+1.1, p->proc.nspace, p->proc.rank, fence_num, put_ind++, params.use_same_keys, 1, 0); if (PMIX_SUCCESS != rc) { TEST_ERROR(("%s:%d: PMIx_Get failed (%d) from %s:%d", my_nspace, my_rank, rc, p->proc.nspace, p->proc.rank)); PMIX_PROC_FREE(pcs, npcs); PMIX_LIST_DESTRUCT(&test_fences); - return rc; + exit(rc); } GET(uint32_t, (uint32_t)fence_num+14, p->proc.nspace, p->proc.rank, fence_num, put_ind++, params.use_same_keys, 0, 0); if (PMIX_SUCCESS != rc) { TEST_ERROR(("%s:%d: PMIx_Get failed (%d) from %s:%d", my_nspace, my_rank, rc, p->proc.nspace, p->proc.rank)); PMIX_PROC_FREE(pcs, npcs); PMIX_LIST_DESTRUCT(&test_fences); - return rc; + exit(rc); } GET(uint16_t, fence_num+15, p->proc.nspace, p->proc.rank, fence_num, put_ind++, params.use_same_keys, 1, 0); if (PMIX_SUCCESS != rc) { TEST_ERROR(("%s:%d: PMIx_Get failed (%d) from %s:%d", my_nspace, my_rank, rc, p->proc.nspace, p->proc.rank)); PMIX_PROC_FREE(pcs, npcs); PMIX_LIST_DESTRUCT(&test_fences); - return rc; + exit(rc); } } /* barrier across participating processes to prevent putting new values with the same key @@ -246,35 +246,35 @@ static int get_local_peers(char *my_nspace, int my_rank, pmix_rank_t **_peers, p (void)strncpy(proc.nspace, my_nspace, PMIX_MAX_NSLEN); proc.rank = PMIX_RANK_WILDCARD; - /* get number of neighbours on this node */ + /* get number of neighbors on this node */ if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_LOCAL_SIZE, NULL, 0, &val))) { TEST_ERROR(("%s:%d: PMIx_Get local peer # failed: %d", my_nspace, my_rank, rc)); - return rc; + exit(rc); } if (NULL == val) { TEST_ERROR(("%s:%d: PMIx_Get local peer # returned NULL value", my_nspace, my_rank)); - return PMIX_ERROR; + exit(PMIX_ERROR); } if (val->type != PMIX_UINT32 ) { TEST_ERROR(("%s:%d: local peer # attribute value type mismatch," " want %d get %d(%d)", my_nspace, my_rank, PMIX_UINT32, val->type)); - return PMIX_ERROR; + exit(PMIX_ERROR); } npeers = val->data.uint32; peers = malloc(sizeof(pmix_rank_t) * npeers); - /* get ranks of neighbours on this node */ + /* get ranks of neighbors on this node */ if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_LOCAL_PEERS, NULL, 0, &val))) { TEST_ERROR(("%s:%d: PMIx_Get local peers failed: %d", my_nspace, my_rank, rc)); free(peers); - return rc; + exit(rc); } if (NULL == val) { TEST_ERROR(("%s:%d: PMIx_Get local peers returned NULL value", my_nspace, my_rank)); free(peers); - return PMIX_ERROR; + exit(PMIX_ERROR); } if (val->type != PMIX_STRING ) { @@ -282,7 +282,7 @@ static int get_local_peers(char *my_nspace, int my_rank, pmix_rank_t **_peers, p " want %d get %d(%d)", my_nspace, my_rank, PMIX_UINT32, val->type)); free(peers); - return PMIX_ERROR; + exit(PMIX_ERROR); } *count = 0; @@ -293,7 +293,7 @@ static int get_local_peers(char *my_nspace, int my_rank, pmix_rank_t **_peers, p TEST_ERROR(("%s:%d: Bad peer ranks number: should be %d, actual %d (%s)", my_nspace, my_rank, npeers, *count, val->data.string)); free(peers); - return PMIX_ERROR; + exit(PMIX_ERROR); } token = strtok_r(str, ",", &sptr); str = NULL; @@ -302,7 +302,7 @@ static int get_local_peers(char *my_nspace, int my_rank, pmix_rank_t **_peers, p if( *eptr != '\0' ){ TEST_ERROR(("%s:%d: Bad peer ranks string", my_nspace, my_rank)); free(peers); - return PMIX_ERROR; + exit(PMIX_ERROR); } } @@ -312,7 +312,7 @@ static int get_local_peers(char *my_nspace, int my_rank, pmix_rank_t **_peers, p TEST_ERROR(("%s:%d: Bad peer ranks number: should be %d, actual %d (%s)", my_nspace, my_rank, npeers, *count, val->data.string)); free(peers); - return PMIX_ERROR; + exit(PMIX_ERROR); } *_peers = peers; return PMIX_SUCCESS; @@ -335,84 +335,83 @@ int test_job_fence(test_params params, char *my_nspace, pmix_rank_t my_rank) PUT(int, 12340 + i, PMIX_LOCAL, 100, i, 0); if (PMIX_SUCCESS != rc) { TEST_ERROR(("%s:%d: PMIx_Put failed: %d", my_nspace, my_rank, rc)); - return rc; + exit(rc); } (void)snprintf(sval, 50, "%s:%d", my_nspace, my_rank); PUT(string, sval, PMIX_REMOTE, 101, i, 0); if (PMIX_SUCCESS != rc) { TEST_ERROR(("%s:%d: PMIx_Put failed: %d", my_nspace, my_rank, rc)); - return PMIX_ERROR; + exit(PMIX_ERROR); } PUT(float, (float)12.15 + i, PMIX_GLOBAL, 102, i, 0); if (PMIX_SUCCESS != rc) { TEST_ERROR(("%s:%d: PMIx_Put failed: %d", my_nspace, my_rank, rc)); - return PMIX_ERROR; + exit(PMIX_ERROR); } } /* Submit the data */ if (PMIX_SUCCESS != (rc = PMIx_Commit())) { TEST_ERROR(("%s:%d: PMIx_Commit failed: %d", my_nspace, my_rank, rc)); - return PMIX_ERROR; + exit(PMIX_ERROR); } /* Perform a fence if was requested */ FENCE(!params.nonblocking, params.collect, NULL, 0); if (PMIX_SUCCESS != rc) { TEST_ERROR(("%s:%d: PMIx_Fence failed: %d", my_nspace, my_rank, rc)); - return rc; + exit(rc); } if (PMIX_SUCCESS != (rc = get_local_peers(my_nspace, my_rank, &peers, &npeers))) { - return PMIX_ERROR; + exit(rc); } /* Check the predefined output */ for (i=0; i < (int)params.ns_size; i++) { - for (j=0; j < 3; j++) { - int local = 0; pmix_rank_t k; for(k=0; k= ns_num) { TEST_ERROR(("%s:%d: get_total_ns_number function failed", my_nspace, my_rank)); - return PMIX_ERROR; + exit(PMIX_ERROR); } for (n = 0; n < ns_num; n++) { memset(nspace, 0, PMIX_MAX_NSLEN+1); @@ -103,16 +103,20 @@ int test_resolve_peers(char *my_nspace, int my_rank, test_params params) TEST_VERBOSE(("%s:%d: Connect to %s succeeded.", my_nspace, my_rank, nspace)); } else { TEST_ERROR(("%s:%d: Connect to %s failed.", my_nspace, my_rank, nspace)); - return PMIX_ERROR; + exit(rc); } - /* then resolve peers from this namespace. */ - rc = resolve_nspace(nspace, params, my_nspace, my_rank); - if (PMIX_SUCCESS == rc) { - TEST_VERBOSE(("%s:%d: Resolve peers succeeded for ns %s\n", my_nspace, my_rank, nspace)); - } else { - PMIx_Disconnect(procs, 2, NULL, 0); - break; + /* then resolve peers from this namespace - earlier versions cannot handle + * cross-nspace peer resolution because their test servers don't provide + * the info. So check for a marker of either 3.1.5 or above */ + if (NULL != getenv("PMIX_VERSION")) { + rc = resolve_nspace(nspace, params, my_nspace, my_rank); + if (PMIX_SUCCESS == rc) { + TEST_VERBOSE(("%s:%d: Resolve peers succeeded for ns %s\n", my_nspace, my_rank, nspace)); + } else { + TEST_ERROR(("%s:%d: Resolve peers failed for different namespace\n", my_nspace, my_rank)); + exit(rc); + } } /* disconnect from the processes of this namespace. */ @@ -120,8 +124,8 @@ int test_resolve_peers(char *my_nspace, int my_rank, test_params params) if (PMIX_SUCCESS == rc) { TEST_VERBOSE(("%s:%d: Disconnect from %s succeeded %s.", my_nspace, my_rank, nspace)); } else { - TEST_ERROR(("%s:%d: Disconnect from %s failed %s.", my_nspace, my_rank, nspace)); - return PMIX_ERROR; + TEST_ERROR(("%s:%d: Disconnect from %s failed.", my_nspace, my_rank, nspace)); + exit(rc); } } if (PMIX_SUCCESS == rc) { diff --git a/opal/mca/pmix/pmix3x/pmix/test/test_server.c b/opal/mca/pmix/pmix3x/pmix/test/test_server.c index 30d174a9567..0afda702b24 100644 --- a/opal/mca/pmix/pmix3x/pmix/test/test_server.c +++ b/opal/mca/pmix/pmix3x/pmix/test/test_server.c @@ -12,6 +12,7 @@ * */ +#define _GNU_SOURCE #include #include #include @@ -21,6 +22,7 @@ #include "pmix_server.h" #include "src/include/pmix_globals.h" +#include "src/util/error.h" #include "test_server.h" #include "test_common.h" @@ -28,6 +30,7 @@ #include "server_callbacks.h" int my_server_id = 0; +int test_fail = 0; server_info_t *my_server_info = NULL; pmix_list_t *server_list = NULL; @@ -41,10 +44,14 @@ static void sdes(server_info_t *s) event_del(s->evread); } s->evread = NULL; + if (NULL != s->hostname) { + free(s->hostname); + } } static void scon(server_info_t *s) { + s->hostname = NULL; s->idx = 0; s->pid = 0; s->rd_fd = -1; @@ -115,14 +122,29 @@ static void fill_seq_ranks_array(size_t nprocs, int base_rank, char **ranks) } } +static int server_find_id(const char *nspace, int rank) +{ + server_nspace_t *tmp; + + PMIX_LIST_FOREACH(tmp, server_nspace, server_nspace_t) { + if (0 == strcmp(tmp->name, nspace)) { + return tmp->task_map[rank]; + } + } + return -1; +} + static void set_namespace(int local_size, int univ_size, int base_rank, char *name) { size_t ninfo; pmix_info_t *info; ninfo = 8; - char *regex, *ppn; - char *ranks = NULL; + char *regex, *ppn, *tmp; + char *ranks = NULL, **nodes = NULL; + char **rks=NULL; + int i; + int rc; PMIX_INFO_CREATE(info, ninfo); pmix_strncpy(info[0].key, PMIX_UNIV_SIZE, PMIX_MAX_KEYLEN); @@ -145,23 +167,62 @@ static void set_namespace(int local_size, int univ_size, pmix_strncpy(info[3].key, PMIX_LOCAL_PEERS, PMIX_MAX_KEYLEN); info[3].value.type = PMIX_STRING; info[3].value.data.string = strdup(ranks); - free(ranks); - PMIx_generate_regex(NODE_NAME, ®ex); - pmix_strncpy(info[4].key, PMIX_NODE_MAP, PMIX_MAX_KEYLEN); - info[4].value.type = PMIX_STRING; - info[4].value.data.string = strdup(regex); + /* assemble the node and proc map info */ + if (1 == params.nservers) { + pmix_argv_append_nosize(&nodes, my_server_info->hostname); + } else { + char hostname[PMIX_MAXHOSTNAMELEN]; + for (i = 0; i < params.nservers; i++) { + snprintf(hostname, PMIX_MAXHOSTNAMELEN, "node%d", i); + pmix_argv_append_nosize(&nodes, hostname); + } + } + + if (NULL != nodes) { + tmp = pmix_argv_join(nodes, ','); + pmix_argv_free(nodes); + nodes = NULL; + if (PMIX_SUCCESS != (rc = PMIx_generate_regex(tmp, ®ex) )) { + PMIX_ERROR_LOG(rc); + return; + } + free(tmp); + PMIX_INFO_LOAD(&info[4], PMIX_NODE_MAP, regex, PMIX_STRING); + } - /* generate the global proc map */ - fill_seq_ranks_array(univ_size, 0, &ranks); - if (NULL == ranks) { - return; + /* generate the global proc map - if we have two + * servers, then the procs not on this server must + * be on the other */ + if (2 == params.nservers) { + pmix_argv_append_nosize(&rks, ranks); + free(ranks); + nodes = NULL; + if (0 == my_server_id) { + for (i=base_rank+local_size; i < univ_size; i++) { + asprintf(&ppn, "%d", i); + pmix_argv_append_nosize(&nodes, ppn); + free(ppn); + } + ppn = pmix_argv_join(nodes, ','); + pmix_argv_append_nosize(&rks, ppn); + free(ppn); + } else { + for (i=0; i < base_rank; i++) { + asprintf(&ppn, "%d", i); + pmix_argv_append_nosize(&nodes, ppn); + free(ppn); + } + ppn = pmix_argv_join(nodes, ','); + pmix_argv_prepend_nosize(&rks, ppn); + free(ppn); + } + ranks = pmix_argv_join(rks, ';'); } PMIx_generate_ppn(ranks, &ppn); free(ranks); - pmix_strncpy(info[5].key, PMIX_PROC_MAP, PMIX_MAX_KEYLEN); - info[5].value.type = PMIX_STRING; - info[5].value.data.string = strdup(ppn); + PMIX_INFO_LOAD(&info[5], PMIX_PROC_MAP, ppn, PMIX_STRING); + free(ppn); pmix_strncpy(info[6].key, PMIX_JOB_SIZE, PMIX_MAX_KEYLEN); info[6].value.type = PMIX_UINT32; @@ -171,7 +232,7 @@ static void set_namespace(int local_size, int univ_size, info[7].value.type = PMIX_UINT32; info[7].value.data.uint32 = getpid (); - int in_progress = 1, rc; + int in_progress = 1; if (PMIX_SUCCESS == (rc = PMIx_server_register_nspace(name, local_size, info, ninfo, release_cb, &in_progress))) { PMIX_WAIT_FOR_COMPLETION(in_progress); @@ -187,7 +248,7 @@ static void server_unpack_procs(char *buf, size_t size) char *nspace; while ((size_t)(ptr - buf) < size) { - ns_count = *(size_t *)ptr; + memcpy (&ns_count, ptr, sizeof(size_t)); ptr += sizeof(size_t); for (i = 0; i < ns_count; i++) { @@ -195,16 +256,16 @@ static void server_unpack_procs(char *buf, size_t size) size_t ltasks, ntasks; int server_id; - server_id = *(int *)ptr; + memcpy (&server_id, ptr, sizeof(int)); ptr += sizeof(int); nspace = ptr; ptr += PMIX_MAX_NSLEN+1; - ntasks = *(size_t *)ptr; + memcpy (&ntasks, ptr, sizeof(size_t)); ptr += sizeof(size_t); - ltasks = *(size_t *)ptr; + memcpy (<asks, ptr, sizeof(size_t)); ptr += sizeof(size_t); PMIX_LIST_FOREACH(tmp, server_nspace, server_nspace_t) { @@ -226,7 +287,8 @@ static void server_unpack_procs(char *buf, size_t size) } size_t i; for (i = 0; i < ltasks; i++) { - int rank = *(int *)ptr; + int rank; + memcpy (&rank, ptr, sizeof(int)); ptr += sizeof(int); if (ns_item->task_map[rank] >= 0) { continue; @@ -614,18 +676,6 @@ int server_fence_contrib(char *data, size_t ndata, return rc; } -static int server_find_id(const char *nspace, int rank) -{ - server_nspace_t *tmp; - - PMIX_LIST_FOREACH(tmp, server_nspace, server_nspace_t) { - if (0 == strcmp(tmp->name, nspace)) { - return tmp->task_map[rank]; - } - } - return -1; -} - static int server_pack_dmdx(int sender_id, const char *nspace, int rank, char **buf) { @@ -724,9 +774,73 @@ int server_dmdx_get(const char *nspace, int rank, return PMIX_ERROR; } +static void set_handler_default(int sig) +{ + struct sigaction act; + + act.sa_handler = SIG_DFL; + act.sa_flags = 0; + sigemptyset(&act.sa_mask); + + sigaction(sig, &act, (struct sigaction *)0); +} + +static pmix_event_t handler; +static void wait_signal_callback(int fd, short event, void *arg) +{ + pmix_event_t *sig = (pmix_event_t*) arg; + int status; + pid_t pid; + int i; + + if (SIGCHLD != pmix_event_get_signal(sig)) { + return; + } + + /* we can have multiple children leave but only get one + * sigchild callback, so reap all the waitpids until we + * don't get anything valid back */ + while (1) { + pid = waitpid(-1, &status, WNOHANG); + if (-1 == pid && EINTR == errno) { + /* try it again */ + continue; + } + /* if we got garbage, then nothing we can do */ + if (pid <= 0) { + goto done; + } + /* we are already in an event, so it is safe to access the list */ + for(i=0; i < cli_info_cnt; i++){ + if( cli_info[i].pid == pid ){ + /* found it! */ + if (WIFEXITED(status)) { + cli_info[i].exit_code = WEXITSTATUS(status); + } else { + if (WIFSIGNALED(status)) { + cli_info[i].exit_code = WTERMSIG(status) + 128; + } + } + cli_cleanup(&cli_info[i]); + cli_info[i].alive = false; + break; + } + } + } + done: + for(i=0; i < cli_info_cnt; i++){ + if (cli_info[i].alive) { + /* someone is still alive */ + return; + } + } + /* get here if nobody is still alive */ + test_complete = true; +} + int server_init(test_params *params) { - pmix_info_t info[1]; + pmix_info_t info[2]; int rc = PMIX_SUCCESS; /* fork/init servers procs */ @@ -754,7 +868,9 @@ int server_init(test_params *params) } if (pid == 0) { server_list = PMIX_NEW(pmix_list_t); + my_server_info = server_info; my_server_id = i; + asprintf(&server_info->hostname, "node%d", i); server_info->idx = 0; server_info->pid = getppid(); server_info->rd_fd = fd1[0]; @@ -765,6 +881,7 @@ int server_init(test_params *params) pmix_list_append(server_list, &server_info->super); break; } + asprintf(&server_info->hostname, "node%d", i); server_info->idx = i; server_info->pid = pid; server_info->wr_fd = fd1[1]; @@ -774,6 +891,7 @@ int server_init(test_params *params) close(fd2[1]); } else { my_server_info = server_info; + server_info->hostname = strdup("node0"); server_info->pid = getpid(); server_info->idx = 0; server_info->rd_fd = fd1[0]; @@ -791,13 +909,13 @@ int server_init(test_params *params) params->nprocs / params->nservers + 1 : params->nprocs / params->nservers; /* setup the server library */ - (void)strncpy(info[0].key, PMIX_SOCKET_MODE, PMIX_MAX_KEYLEN); - info[0].value.type = PMIX_UINT32; - info[0].value.data.uint32 = 0666; + uint32_t u32 = 0666; + PMIX_INFO_LOAD(&info[0], PMIX_SOCKET_MODE, &u32, PMIX_UINT32); + PMIX_INFO_LOAD(&info[1], PMIX_HOSTNAME, my_server_info->hostname, PMIX_STRING); server_nspace = PMIX_NEW(pmix_list_t); - if (PMIX_SUCCESS != (rc = PMIx_server_init(&mymodule, info, 1))) { + if (PMIX_SUCCESS != (rc = PMIx_server_init(&mymodule, info, 2))) { TEST_ERROR(("Init failed with error %d", rc)); goto error; } @@ -812,9 +930,17 @@ int server_init(test_params *params) } } +#if 0 /* register the errhandler */ PMIx_Register_event_handler(NULL, 0, NULL, 0, errhandler, errhandler_reg_callbk, NULL); +#endif + + /* setup to see sigchld on the forked tests */ + pmix_event_assign(&handler, pmix_globals.evbase, SIGCHLD, + EV_SIGNAL|EV_PERSIST, wait_signal_callback, &handler); + pmix_event_add(&handler, NULL); + if (0 != (rc = server_barrier())) { goto error; @@ -832,6 +958,7 @@ int server_finalize(test_params *params) int rc = PMIX_SUCCESS; int total_ret = 0; + total_ret = test_fail; if (0 != (rc = server_barrier())) { total_ret++; goto exit; @@ -852,11 +979,6 @@ int server_finalize(test_params *params) PMIX_LIST_RELEASE(server_list); TEST_VERBOSE(("SERVER %d FINALIZE PID:%d with status %d", my_server_id, getpid(), ret)); - if (0 == total_ret) { - TEST_OUTPUT(("Test finished OK!")); - } else { - rc = PMIX_ERROR; - } } PMIX_LIST_RELEASE(server_nspace); @@ -866,6 +988,11 @@ int server_finalize(test_params *params) total_ret += rc; goto exit; } + if (0 == total_ret) { + TEST_OUTPUT(("Test finished OK!")); + } else { + TEST_OUTPUT(("Test FAILED!")); + } exit: return total_ret; @@ -932,6 +1059,7 @@ int server_launch_clients(int local_size, int univ_size, int base_rank, cli_kill_all(); return rc; } + TEST_VERBOSE(("run %s:%d", proc.nspace, proc.rank)); cli_info[cli_counter].pid = fork(); if (cli_info[cli_counter].pid < 0) { @@ -966,6 +1094,15 @@ int server_launch_clients(int local_size, int univ_size, int base_rank, pmix_argv_append_nosize(&client_argv, digit); if (cli_info[cli_counter].pid == 0) { + sigset_t sigs; + set_handler_default(SIGTERM); + set_handler_default(SIGINT); + set_handler_default(SIGHUP); + set_handler_default(SIGPIPE); + set_handler_default(SIGCHLD); + sigprocmask(0, 0, &sigs); + sigprocmask(SIG_UNBLOCK, &sigs, 0); + if( !TEST_VERBOSE_GET() ){ // Hide clients stdout if (NULL == freopen("/dev/null","w", stdout)) { @@ -977,6 +1114,7 @@ int server_launch_clients(int local_size, int univ_size, int base_rank, TEST_ERROR(("execve() failed")); return 0; } + cli_info[cli_counter].alive = true; cli_info[cli_counter].state = CLI_FORKED; pmix_argv_free(client_argv); diff --git a/opal/mca/pmix/pmix3x/pmix/test/test_server.h b/opal/mca/pmix/pmix3x/pmix/test/test_server.h index 09767ea56f4..2cd78b88fec 100644 --- a/opal/mca/pmix/pmix3x/pmix/test/test_server.h +++ b/opal/mca/pmix/pmix3x/pmix/test/test_server.h @@ -2,7 +2,7 @@ * Copyright (c) 2018 Mellanox Technologies, Inc. * All rights reserved. * - * Copyright (c) 2018 Intel, Inc. All rights reserved. + * Copyright (c) 2018-2019 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -37,6 +37,7 @@ typedef struct { struct server_info_t { pmix_list_item_t super; + char *hostname; pid_t pid; int idx; int rd_fd; @@ -64,6 +65,7 @@ extern int my_server_id; extern pmix_list_t *server_list; extern server_info_t *my_server_info; extern pmix_list_t *server_nspace; +extern int test_fail; int server_init(test_params *params); int server_finalize(test_params *params); diff --git a/opal/mca/pmix/pmix3x/pmix/test/test_spawn.c b/opal/mca/pmix/pmix3x/pmix/test/test_spawn.c index 9251d4de0d5..8956752d81e 100644 --- a/opal/mca/pmix/pmix3x/pmix/test/test_spawn.c +++ b/opal/mca/pmix/pmix3x/pmix/test/test_spawn.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017 Intel, Inc. All rights reserved. + * Copyright (c) 2015-2019 Intel, Inc. All rights reserved. * Copyright (c) 2015 Mellanox Technologies, Inc. * All rights reserved. * $COPYRIGHT$ @@ -39,7 +39,7 @@ static int test_spawn_common(char *my_nspace, int my_rank, int blocking) if (blocking) { if (PMIX_SUCCESS != (rc = PMIx_Spawn(NULL, 0, apps, napps, nspace))) { PMIX_APP_FREE(apps, napps); - return rc; + exit(rc); } } else { spawn_cbdata cbdata; @@ -48,14 +48,14 @@ static int test_spawn_common(char *my_nspace, int my_rank, int blocking) rc = PMIx_Spawn_nb(NULL, 0, apps, napps, spawn_cb, (void*)&cbdata); if (PMIX_SUCCESS != rc) { PMIX_APP_FREE(apps, napps); - return rc; + exit(rc); } PMIX_WAIT_FOR_COMPLETION(cbdata.in_progress); strncpy(nspace, cbdata.nspace, strlen(cbdata.nspace)+1); } PMIX_APP_FREE(apps, napps); if (strncmp(nspace, "foobar", strlen(nspace)+1)) { - return PMIX_ERROR; + exit(PMIX_ERROR); } return rc; } @@ -66,13 +66,13 @@ int test_spawn(char *my_nspace, int my_rank) rc = test_spawn_common(my_nspace, my_rank, 1); if (PMIX_SUCCESS != rc) { TEST_ERROR(("%s:%d: Spawn blocking test failed.", my_nspace, my_rank)); - return PMIX_ERROR; + exit(rc); } TEST_VERBOSE(("%s:%d: Spawn blocking test succeded.", my_nspace, my_rank)); rc = test_spawn_common(my_nspace, my_rank, 0); if (PMIX_SUCCESS != rc) { TEST_ERROR(("%s:%d: Spawn non-blocking test failed.", my_nspace, my_rank)); - return PMIX_ERROR; + exit(rc); } TEST_VERBOSE(("%s:%d: Spawn non-blocking test succeded.", my_nspace, my_rank)); return PMIX_SUCCESS;