This repository was archived by the owner on Mar 20, 2023. It is now read-only.
Merged
Changes from all commits (25 commits)
57f12be
Simplify unified memory logic.
olupton Nov 23, 2021
30a69d5
Basic OpenACC -> OpenMP migration.
olupton Nov 18, 2021
9f14e63
nrn_{acc,omp}_pragma -> nrn_pragma_{acc,omp}.
olupton Nov 29, 2021
4d67bf0
Add --gpu to test.
olupton Nov 29, 2021
5d9c7e7
Default (BB5-valid) CORENRN_EXTERNAL_BENCHMARK_DATA.
olupton Nov 29, 2021
385a34b
Remove cuda_add_library.
olupton Nov 29, 2021
8c6210e
Define nrn_pragma_{acc,omp} in header.
olupton Nov 29, 2021
424b14f
Update NMODL with codegen fixes.
olupton Nov 30, 2021
4c86be3
Migrate more pragmas.
olupton Nov 30, 2021
4ffb0bc
Move input data.
olupton Dec 1, 2021
388fe26
Migrate more.
olupton Dec 1, 2021
e36f6ad
Migrate more directives.
olupton Dec 1, 2021
8cad9eb
Remove more OpenACC from the main simulation section.
olupton Dec 1, 2021
5c2a98c
Use nrn_pragma_acc for cell_permute=0 on GPU.
olupton Dec 6, 2021
fb1139e
OpenMP target offload for fast_imem.
olupton Dec 6, 2021
ee3588d
Don't print number of GPUs when quiet.
olupton Dec 6, 2021
5580951
OpenMP target offload for partrans.
olupton Dec 6, 2021
a62b2ad
OpenMP target offload for finitialize.
olupton Dec 6, 2021
6d06c02
Fixup for nrnthread_v_transfer + OpenMP.
olupton Dec 6, 2021
e548dbf
Fix a weird clang-formatting issue in nrn_acc_manager.cpp
pramodk Dec 6, 2021
184cdd7
Set OMP_NUM_THREADS=1 for lfp_test.
olupton Dec 6, 2021
587ff37
Address review comments.
olupton Dec 6, 2021
c951423
Drop nowait and depend clauses.
olupton Dec 6, 2021
6cb1b26
Rename CORENRN_PREFER_OPENMP_OFFLOAD.
olupton Dec 7, 2021
aefa0d4
Update NMODL with OpenMP codegen.
olupton Dec 7, 2021
3 changes: 2 additions & 1 deletion .clang-format.changes
@@ -1,2 +1,3 @@
SortIncludes: false
IndentCaseLabels: true
SortIncludes: false
StatementMacros: [nrn_pragma_acc, nrn_pragma_omp]
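The two `StatementMacros` named here are the heart of this PR: function-like macros that expand to `_Pragma` operators, so a single annotated source line can serve either backend, and clang-format needs to be told to treat them as statements. A minimal sketch of how such macros can be defined (commit 8c6210e moves the real definitions into a header; the exact guard conditions below are an assumption, not the verbatim header):

```cpp
// Sketch only: assumed form of the dual-backend pragma macros.
#define nrn_pragma_stringify(x) #x
#if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
    defined(_OPENMP)
// OpenMP target offload preferred: OpenACC annotations compile away.
#define nrn_pragma_acc(x)
#define nrn_pragma_omp(x) _Pragma(nrn_pragma_stringify(omp x))
#elif defined(CORENEURON_ENABLE_GPU) && defined(_OPENACC)
// OpenACC backend: OpenMP annotations compile away.
#define nrn_pragma_acc(x) _Pragma(nrn_pragma_stringify(acc x))
#define nrn_pragma_omp(x)
#else
// CPU-only build: both expand to nothing.
#define nrn_pragma_acc(x)
#define nrn_pragma_omp(x)
#endif
```

With this shape, a line like `nrn_pragma_acc(update device(x))` either becomes `#pragma acc update device(x)` or disappears entirely, which is why the two spellings can sit side by side throughout the diffs below.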
5 changes: 0 additions & 5 deletions .cmake-format.changes.yaml
@@ -1,9 +1,4 @@
additional_commands:
cuda_add_library:
pargs: '*'
flags: ["STATIC", "SHARED", "MODULE", "EXCLUDE_FROM_ALL"]
kwargs:
OPTIONS: '*'
cpp_cc_build_time_copy:
flags: ['NO_TARGET']
kwargs:
8 changes: 7 additions & 1 deletion CMake/OpenAccHelper.cmake
@@ -55,14 +55,20 @@ if(CORENRN_ENABLE_GPU)
# due to e.g. __CUDACC__ being defined. See https://github.com/BlueBrain/CoreNeuron/issues/607 for
# more information about this. -gpu=cudaX.Y ensures that OpenACC code is compiled with the same
# CUDA version as is used for the explicit CUDA code.
set(NVHPC_ACC_COMP_FLAGS "-acc -gpu=cuda${CORENRN_CUDA_VERSION_SHORT}")
set(NVHPC_ACC_COMP_FLAGS "-acc -Minfo=accel -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo")
set(NVHPC_ACC_LINK_FLAGS "-acc -cuda")
# Make sure that OpenACC code is generated for the same compute capabilities as the explicit CUDA
# code. Otherwise there may be confusing linker errors. We cannot rely on nvcc and nvc++ using the
# same default compute capabilities as each other, particularly on GPU-less build machines.
foreach(compute_capability ${CMAKE_CUDA_ARCHITECTURES})
string(APPEND NVHPC_ACC_COMP_FLAGS ",cc${compute_capability}")
endforeach()
if(CORENRN_ENABLE_OPENMP AND CORENRN_ENABLE_OPENMP_OFFLOAD)
# Enable OpenMP target offload to GPU and if both OpenACC and OpenMP directives are available
# for a region then prefer OpenMP.
add_compile_definitions(CORENEURON_PREFER_OPENMP_OFFLOAD)
string(APPEND NVHPC_ACC_COMP_FLAGS " -mp=gpu -Minfo=mp")
endif()
# avoid PGI adding standard compliant "-A" flags
set(CMAKE_CXX14_STANDARD_COMPILE_OPTION --c++14)
string(APPEND CMAKE_EXE_LINKER_FLAGS " ${NVHPC_ACC_LINK_FLAGS}")
5 changes: 3 additions & 2 deletions CMakeLists.txt
@@ -85,6 +85,7 @@ add_subdirectory(${CORENEURON_PROJECT_SOURCE_DIR}/external/CLI11)
# Build options
# =============================================================================
option(CORENRN_ENABLE_OPENMP "Build the CORE NEURON with OpenMP implementation" ON)
option(CORENRN_ENABLE_OPENMP_OFFLOAD "Prefer OpenMP target offload to OpenACC" ON)
option(CORENRN_ENABLE_TIMEOUT "Enable nrn_timeout implementation" ON)
option(CORENRN_ENABLE_REPORTING "Enable use of ReportingLib for soma reports" OFF)
option(CORENRN_ENABLE_MPI "Enable MPI-based execution" ON)
@@ -104,7 +105,7 @@ option(CORENRN_ENABLE_LEGACY_UNITS "Enable legacy FARADAY, R, etc" OFF)
option(CORENRN_ENABLE_PRCELLSTATE "Enable NRN_PRCELLSTATE debug feature" OFF)

set(CORENRN_EXTERNAL_BENCHMARK_DATA
""
"/gpfs/bbp.cscs.ch/project/proj12/nersc-gpu-hackathon-dec-2021"
CACHE PATH "Path to input data files and mechanisms for benchmarks")
set(CORENRN_NMODL_DIR
""
@@ -138,7 +139,7 @@ if(CORENRN_ENABLE_GPU)

# Set some sensible default CUDA architectures.
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
set(CMAKE_CUDA_ARCHITECTURES 60 70 80)
set(CMAKE_CUDA_ARCHITECTURES 70 80)
message(STATUS "Setting default CUDA architectures to ${CMAKE_CUDA_ARCHITECTURES}")
endif()

2 changes: 1 addition & 1 deletion coreneuron/CMakeLists.txt
@@ -325,7 +325,7 @@ if(NOT ${CORENRN_EXTERNAL_BENCHMARK_DATA} STREQUAL "")
benchmark_command
"'${CMAKE_BINARY_DIR}/benchmark/${CMAKE_SYSTEM_PROCESSOR}/special-core'"
" --datpath '${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark-all-440-cells-2-ranks'"
" --tstop 1 &&"
" --tstop 1 --gpu &&"
"diff out.dat '${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark-all-440-cells-2-ranks.gpu.spikes'"
)
add_test(NAME benchmark COMMAND sh -c "${benchmark_command}")
6 changes: 2 additions & 4 deletions coreneuron/apps/main1.cpp
@@ -558,10 +558,8 @@ extern "C" int run_solve_core(int argc, char** argv) {
#endif
bool compute_gpu = corenrn_param.gpu;

// clang-format off

#pragma acc update device(celsius, secondorder, pi) if (compute_gpu)
// clang-format on
nrn_pragma_acc(update device(celsius, secondorder, pi) if(compute_gpu))
nrn_pragma_omp(target update to(celsius, secondorder, pi) if(compute_gpu))
{
double v = corenrn_param.voltage;
double dt = corenrn_param.dt;
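The hunk above collapses a clang-format-guarded `#pragma acc update` into one macro pair that pushes the global scalars `celsius`, `secondorder` and `pi` to the device under either backend. A self-contained sketch of the same pattern for a single global (macro definitions condensed from the earlier sketch; the variable values and `compute_gpu` guard are illustrative):

```cpp
#include <cstdio>

#define nrn_pragma_stringify(x) #x
#ifdef _OPENMP
#define nrn_pragma_acc(x)
#define nrn_pragma_omp(x) _Pragma(nrn_pragma_stringify(omp x))
#else
#define nrn_pragma_acc(x) _Pragma(nrn_pragma_stringify(acc x))
#define nrn_pragma_omp(x)
#endif

double celsius = 34.0;
// Give the global a device copy under either backend.
nrn_pragma_acc(declare create(celsius))
nrn_pragma_omp(declare target(celsius))

int main() {
    const bool compute_gpu = true;
    celsius = 6.3;
    // Push the new host value to the device copy; a no-op when compute_gpu is false.
    nrn_pragma_acc(update device(celsius) if(compute_gpu))
    nrn_pragma_omp(target update to(celsius) if(compute_gpu))
    std::printf("host celsius = %g\n", celsius);
}
```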
100 changes: 31 additions & 69 deletions coreneuron/gpu/nrn_acc_manager.cpp
@@ -27,6 +27,9 @@
#ifdef _OPENACC
#include <openacc.h>
#endif
#ifdef CORENEURON_PREFER_OPENMP_OFFLOAD
#include <omp.h>
#endif

#ifdef CRAYPAT
#include <pat_api.h>
@@ -605,25 +608,36 @@ void update_net_receive_buffer(NrnThread* nt) {
// instance order to avoid race. setup _displ and _nrb_index
net_receive_buffer_order(nrb);

#ifdef _OPENACC
if (nt->compute_gpu) {
Instrumentor::phase p_net_receive_buffer_order("net-receive-buf-cpu2gpu");
// note that we don't update nrb itself, otherwise we lose pointers

// clang-format off

/* update scalar elements */
acc_update_device(&nrb->_cnt, sizeof(int));
acc_update_device(&nrb->_displ_cnt, sizeof(int));

acc_update_device(nrb->_pnt_index, sizeof(int) * nrb->_cnt);
acc_update_device(nrb->_weight_index, sizeof(int) * nrb->_cnt);
acc_update_device(nrb->_nrb_t, sizeof(double) * nrb->_cnt);
acc_update_device(nrb->_nrb_flag, sizeof(double) * nrb->_cnt);
acc_update_device(nrb->_displ, sizeof(int) * (nrb->_displ_cnt + 1));
acc_update_device(nrb->_nrb_index, sizeof(int) * nrb->_cnt);
nrn_pragma_acc(update device(nrb->_cnt,
nrb->_displ_cnt,
nrb->_pnt_index[:nrb->_cnt],
nrb->_weight_index[:nrb->_cnt],
nrb->_nrb_t[:nrb->_cnt],
nrb->_nrb_flag[:nrb->_cnt],
nrb->_displ[:nrb->_displ_cnt + 1],
nrb->_nrb_index[:nrb->_cnt])
async(nt->stream_id))
nrn_pragma_omp(target update to(nrb->_cnt,
nrb->_displ_cnt,
nrb->_pnt_index[:nrb->_cnt],
nrb->_weight_index[:nrb->_cnt],
nrb->_nrb_t[:nrb->_cnt],
nrb->_nrb_flag[:nrb->_cnt],
nrb->_displ[:nrb->_displ_cnt + 1],
nrb->_nrb_index[:nrb->_cnt]))
// clang-format on
}
#endif
}
}
nrn_pragma_acc(wait(nt->stream_id))
nrn_pragma_omp(taskwait)
}

void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb) {
@@ -894,67 +908,12 @@ void update_weights_from_gpu(NrnThread* threads, int nthreads) {
size_t n_weight = nt->n_weight;
if (nt->compute_gpu && n_weight > 0) {
double* weights = nt->weights;
// clang-format off

#pragma acc update host(weights [0:n_weight])
// clang-format on
nrn_pragma_acc(update host(weights [0:n_weight]))
nrn_pragma_omp(target update from(weights [0:n_weight]))
}
}
}

void update_matrix_from_gpu(NrnThread* _nt) {
#ifdef _OPENACC
if (_nt->compute_gpu && (_nt->end > 0)) {
/* before copying, make sure all computations in the stream are completed */

// clang-format off

#pragma acc wait(_nt->stream_id)

/* openacc routine doesn't allow async, use pragma */
// acc_update_self(_nt->_actual_rhs, 2*_nt->end*sizeof(double));

/* RHS and D are contiguous, copy them in one go!
* NOTE: in pragma you have to give actual pointer like below and not nt->rhs...
*/
double* rhs = _nt->_actual_rhs;
int ne = nrn_soa_padded_size(_nt->end, 0);

#pragma acc update host(rhs[0 : 2 * ne]) async(_nt->stream_id)
#pragma acc wait(_nt->stream_id)
// clang-format on
}
#else
(void) _nt;
#endif
}

void update_matrix_to_gpu(NrnThread* _nt) {
#ifdef _OPENACC
if (_nt->compute_gpu && (_nt->end > 0)) {
/* before copying, make sure all computations in the stream are completed */

// clang-format off

#pragma acc wait(_nt->stream_id)

/* in discussions with Michael we found that RHS is also needed on
* gpu because nrn_cap_jacob uses rhs which is being updated on GPU
*/
double* v = _nt->_actual_v;
double* rhs = _nt->_actual_rhs;
int ne = nrn_soa_padded_size(_nt->end, 0);

#pragma acc update device(v[0 : ne]) async(_nt->stream_id)
#pragma acc update device(rhs[0 : ne]) async(_nt->stream_id)
#pragma acc wait(_nt->stream_id)
// clang-format on
}
#else
(void) _nt;
#endif
}

/** Cleanup device memory that is being tracked by the OpenACC runtime.
*
* This function painstakingly calls `acc_delete` in reverse order on all
@@ -1343,8 +1302,11 @@ void init_gpu() {

int device_num = local_rank % num_devices_per_node;
acc_set_device_num(device_num, device_type);
#ifdef CORENEURON_PREFER_OPENMP_OFFLOAD
omp_set_default_device(device_num);
#endif

if (nrnmpi_myid == 0) {
if (nrnmpi_myid == 0 && !corenrn_param.is_quiet()) {
std::cout << " Info : " << num_devices_per_node << " GPUs shared by " << local_size
<< " ranks per node\n";
}
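In the `init_gpu` hunk just above, each rank picks a GPU round-robin by its node-local rank, and the new `CORENEURON_PREFER_OPENMP_OFFLOAD` branch points the OpenMP runtime at the same device so the two runtimes never disagree. The mapping in isolation (a sketch; `bind_rank_to_gpu` is a hypothetical helper, not a function from this PR):

```cpp
#include <openacc.h>
#ifdef CORENEURON_PREFER_OPENMP_OFFLOAD
#include <omp.h>
#endif

// Bind the calling rank to one of the node's GPUs, round-robin by local rank.
void bind_rank_to_gpu(int local_rank) {
    acc_device_t device_type = acc_get_device_type();
    int num_devices_per_node = acc_get_num_devices(device_type);
    int device_num = local_rank % num_devices_per_node;
    acc_set_device_num(device_num, device_type);
#ifdef CORENEURON_PREFER_OPENMP_OFFLOAD
    // Keep OpenMP target regions on the same device OpenACC just selected.
    omp_set_default_device(device_num);
#endif
}
```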
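A note on the synchronization idiom visible in `update_net_receive_buffer` above: the OpenACC updates are queued with `async(nt->stream_id)` and fenced by `wait`, while the OpenMP spelling issues a synchronous `target update` and ends with `taskwait`, which only matters when deferred target tasks exist (commit c951423 dropped the `nowait` and `depend` clauses that created them). A sketch of the pairing, assuming `buf` is already mapped on the device:

```cpp
#define nrn_pragma_stringify(x) #x
#ifdef _OPENMP
#define nrn_pragma_acc(x)
#define nrn_pragma_omp(x) _Pragma(nrn_pragma_stringify(omp x))
#else
#define nrn_pragma_acc(x) _Pragma(nrn_pragma_stringify(acc x))
#define nrn_pragma_omp(x)
#endif

// Push a host buffer to the device, then synchronize, under either backend.
void push_and_sync(double* buf, int n, int stream_id) {
    // OpenACC: enqueue asynchronously on the queue identified by stream_id.
    nrn_pragma_acc(update device(buf[0:n]) async(stream_id))
    // OpenMP: a plain target update is already synchronous.
    nrn_pragma_omp(target update to(buf[0:n]))
    // OpenACC: block until that queue drains.
    nrn_pragma_acc(wait(stream_id))
    // OpenMP: fence any deferred target tasks; a no-op here, mirroring the PR.
    nrn_pragma_omp(taskwait)
}
```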
2 changes: 0 additions & 2 deletions coreneuron/gpu/nrn_acc_manager.hpp
@@ -23,8 +23,6 @@ void update_nrnthreads_on_device(NrnThread* threads, int nthreads);
void modify_data_on_device(NrnThread* threads, int nthreads);
void dump_nt_to_file(char* filename, NrnThread* threads, int nthreads);

void update_matrix_from_gpu(NrnThread* _nt);
void update_matrix_to_gpu(NrnThread* _nt);
void update_net_receive_buffer(NrnThread* _nt);
void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml);
void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb);
3 changes: 0 additions & 3 deletions coreneuron/io/lfp.cpp
@@ -7,9 +7,6 @@


namespace coreneuron {
// extern variables require acc declare
#pragma acc declare create(pi)

namespace lfputils {

double line_source_lfp_factor(const Point3D& e_pos,
40 changes: 13 additions & 27 deletions coreneuron/mechanism/capac.cpp
@@ -12,25 +12,9 @@
#include "coreneuron/coreneuron.hpp"
#include "coreneuron/permute/data_layout.hpp"

// clang-format off

#if defined(_OPENACC)
#define _PRAGMA_FOR_INIT_ACC_LOOP_ \
_Pragma("acc parallel loop present(vdata[0:_cntml_padded*nparm]) if(_nt->compute_gpu)")
#define _PRAGMA_FOR_CUR_ACC_LOOP_ \
_Pragma( \
"acc parallel loop present(vdata[0:_cntml_padded*nparm], ni[0:_cntml_actual], _vec_rhs[0:_nt->end]) if(_nt->compute_gpu) async(stream_id)")
#define _PRAGMA_FOR_JACOB_ACC_LOOP_ \
_Pragma( \
"acc parallel loop present(vdata[0:_cntml_padded*nparm], ni[0:_cntml_actual], _vec_d[0:_nt->end]) if(_nt->compute_gpu) async(stream_id)")
#else
#define _PRAGMA_FOR_INIT_ACC_LOOP_ _Pragma("")
#define _PRAGMA_FOR_CUR_ACC_LOOP_ _Pragma("")
#define _PRAGMA_FOR_JACOB_ACC_LOOP_ _Pragma("")
#endif

// clang-format on

#define _PRAGMA_FOR_INIT_ACC_LOOP_ \
nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm]) if (_nt->compute_gpu)) \
nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
#define _STRIDE _cntml_padded + _iml

namespace coreneuron {
@@ -78,15 +62,16 @@ void nrn_jacob_capacitance(NrnThread* _nt, Memb_list* ml, int /* type */) {
(void) _cntml_padded; /* unused when layout=1*/

double* _vec_d = _nt->_actual_d;
#if defined(_OPENACC)
int stream_id = _nt->stream_id;
#endif

{ /*if (use_cachevec) {*/
int* ni = ml->nodeindices;

vdata = ml->data;
_PRAGMA_FOR_JACOB_ACC_LOOP_
nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm],
ni [0:_cntml_actual],
_vec_d [0:_nt->end]) if (_nt->compute_gpu)
async(_nt->stream_id))
nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
for (_iml = 0; _iml < _cntml_actual; _iml++) {
_vec_d[ni[_iml]] += cfac * cm;
}
@@ -126,12 +111,13 @@ void nrn_cur_capacitance(NrnThread* _nt, Memb_list* ml, int /* type */) {
/* no need to distinguish secondorder */
int* ni = ml->nodeindices;
double* _vec_rhs = _nt->_actual_rhs;
#if defined(_OPENACC)
int stream_id = _nt->stream_id;
#endif

vdata = ml->data;
_PRAGMA_FOR_CUR_ACC_LOOP_
nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm],
ni [0:_cntml_actual],
_vec_rhs [0:_nt->end]) if (_nt->compute_gpu)
async(_nt->stream_id))
nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
for (int _iml = 0; _iml < _cntml_actual; _iml++) {
i_cap = cfac * cm * _vec_rhs[ni[_iml]];
}
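The capac.cpp changes show the loop-level half of the migration: each OpenACC `parallel loop present(...)` gains an OpenMP `target teams distribute parallel for simd` twin, with the `present`/`async` bookkeeping kept on the OpenACC side only. A runnable toy version of the pattern (condensed macros as before; `copy`/`map` clauses stand in for `present` since nothing is pre-mapped here, and the array and coefficients are illustrative):

```cpp
#include <cstdio>

#define nrn_pragma_stringify(x) #x
#ifdef _OPENMP
#define nrn_pragma_acc(x)
#define nrn_pragma_omp(x) _Pragma(nrn_pragma_stringify(omp x))
#else
#define nrn_pragma_acc(x) _Pragma(nrn_pragma_stringify(acc x))
#define nrn_pragma_omp(x)
#endif

int main() {
    constexpr int n = 1024;
    static double vec_d[n] = {};
    const bool compute_gpu = true;
    const double cfac = 0.001, cm = 1.0;
    // One source loop, two possible offload lowerings.
    nrn_pragma_acc(parallel loop copy(vec_d[0:n]) if(compute_gpu))
    nrn_pragma_omp(target teams distribute parallel for simd map(tofrom: vec_d[0:n]) if(compute_gpu))
    for (int i = 0; i < n; ++i) {
        vec_d[i] += cfac * cm;
    }
    std::printf("vec_d[0] = %g\n", vec_d[0]);
}
```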