From d452e1a59e8a911ac6aa0cbd452b707f5032540d Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Tue, 23 Nov 2021 09:00:39 +0100 Subject: [PATCH 01/31] Update nmodl to hackathon_main. --- external/nmodl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/nmodl b/external/nmodl index 85dec3618..794b419f5 160000 --- a/external/nmodl +++ b/external/nmodl @@ -1 +1 @@ -Subproject commit 85dec36180cc8d012db3392c06c065d39de79960 +Subproject commit 794b419f5256f40efcdca1674f712a6e544c235a From 8ab49e9a22dc0e2c47773de55fdc20c3c95a5be8 Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Tue, 23 Nov 2021 17:53:09 +0100 Subject: [PATCH 02/31] [Hackathon] disable a lot of CI (#694) * Disable cmake-format and clang-format checks. * Disable GitLab CI except for NMODL + GPU. --- .../workflows/clang_cmake_format_check.yaml | 37 -------- .gitlab-ci.yml | 95 ------------------- 2 files changed, 132 deletions(-) delete mode 100644 .github/workflows/clang_cmake_format_check.yaml diff --git a/.github/workflows/clang_cmake_format_check.yaml b/.github/workflows/clang_cmake_format_check.yaml deleted file mode 100644 index b438a8080..000000000 --- a/.github/workflows/clang_cmake_format_check.yaml +++ /dev/null @@ -1,37 +0,0 @@ -name: clang-cmake-format-check - -concurrency: - group: ${{ github.workflow }}#${{ github.ref }} - cancel-in-progress: true - -on: - push: - -jobs: - build: - name: clang-cmake-format-check - runs-on: ubuntu-20.04 - steps: - - name: Fetch repository - uses: actions/checkout@v2 - - name: Install clang-format 11 - run: | - sudo apt-get update - sudo apt-get install clang-format-11 python3-pip libboost-all-dev libopenmpi-dev openmpi-bin - - name: Install cmake-format 0.6.13 - run: python3 -m pip install cmake-format==0.6.13 - - name: Configure - shell: bash - working-directory: ${{runner.workspace}}/CoreNeuron - run: | - export PATH=/home/runner/.local/bin:$PATH - mkdir BUILD && cd BUILD - cmake -DCORENRN_CLANG_FORMAT=ON -DCORENRN_CMAKE_FORMAT=ON -DCORENRN_ENABLE_MPI=ON -DCORENRN_ENABLE_OPENMP=OFF -DClangFormat_EXECUTABLE=$(which clang-format-11) -DCMakeFormat_EXECUTABLE=$(which cmake-format) .. 
- - name: Run clang-format - shell: bash - working-directory: ${{runner.workspace}}/CoreNeuron/BUILD - run: make check-clang-format VERBOSE=1 - - name: Run cmake-format - shell: bash - working-directory: ${{runner.workspace}}/CoreNeuron/BUILD - run: make check-cmake-format diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 5e3967f7d..1d89f8eca 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -35,9 +35,6 @@ spack_setup: - git diff - fi -.spack_intel: - variables: - SPACK_PACKAGE_COMPILER: intel .spack_nvhpc: variables: SPACK_PACKAGE_COMPILER: nvhpc @@ -50,16 +47,6 @@ spack_setup: variables: bb5_constraint: volta -build:nmodl:intel: - stage: build_nmodl - variables: - SPACK_PACKAGE: nmodl - SPACK_PACKAGE_REF: '' - SPACK_PACKAGE_SPEC: ~legacy-unit - extends: - - .spack_build - - .spack_intel - build:nmodl:gpu: stage: build_nmodl variables: @@ -71,23 +58,6 @@ build:nmodl:gpu: - .spack_build - .spack_nvhpc -build:coreneuron+nmodl:intel: - variables: - SPACK_PACKAGE: coreneuron - SPACK_PACKAGE_SPEC: +nmodl+tests~legacy-unit build_type=Debug - extends: - - .spack_build - - .spack_intel - needs: ["build:nmodl:intel"] - -build:coreneuron:intel: - variables: - SPACK_PACKAGE: coreneuron - SPACK_PACKAGE_SPEC: +tests~legacy-unit build_type=Debug - extends: - - .spack_build - - .spack_intel - build:coreneuron+nmodl:gpu: variables: SPACK_PACKAGE: coreneuron @@ -99,48 +69,10 @@ build:coreneuron+nmodl:gpu: - .spack_nvhpc needs: ["build:nmodl:gpu"] -build:coreneuron:gpu: - variables: - SPACK_PACKAGE: coreneuron - # +report pulls in a lot of dependencies and the tests fail. - # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type - SPACK_PACKAGE_SPEC: +gpu+tests~legacy-unit~report build_type=RelWithDebInfo - extends: - - .spack_build - - .spack_nvhpc - -test:coreneuron+nmodl:intel: - extends: [.ctest] - needs: ["build:coreneuron+nmodl:intel"] - -test:coreneuron:intel: - extends: [.ctest] - needs: ["build:coreneuron:intel"] - test:coreneuron+nmodl:gpu: extends: [.ctest, .gpu_node] needs: ["build:coreneuron+nmodl:gpu"] -test:coreneuron:gpu: - extends: [.ctest, .gpu_node] - needs: ["build:coreneuron:gpu"] - -build:neuron+nmodl:intel: - stage: build_neuron - extends: - - .spack_build - - .spack_neuron - - .spack_intel - needs: ["build:coreneuron+nmodl:intel"] - -build:neuron:intel: - stage: build_neuron - extends: - - .spack_build - - .spack_neuron - - .spack_intel - needs: ["build:coreneuron:intel"] - build:neuron+nmodl:gpu: stage: build_neuron extends: @@ -153,34 +85,7 @@ build:neuron+nmodl:gpu: - !reference [.spack_build, before_script] needs: ["build:coreneuron+nmodl:gpu"] -build:neuron:gpu: - stage: build_neuron - extends: - - .spack_build - - .spack_neuron - - .spack_nvhpc - before_script: - # Build py-cython and py-numpy with GCC instead of NVHPC. 
- - SPACK_PACKAGE_DEPENDENCIES="${SPACK_PACKAGE_DEPENDENCIES}^py-cython%gcc^py-numpy%gcc" - - !reference [.spack_build, before_script] - needs: ["build:coreneuron:gpu"] - -test:neuron+nmodl:intel: - stage: test_neuron - extends: [.ctest] - needs: ["build:neuron+nmodl:intel"] - -test:neuron:intel: - stage: test_neuron - extends: [.ctest] - needs: ["build:neuron:intel"] - test:neuron+nmodl:gpu: stage: test_neuron extends: [.ctest, .gpu_node] needs: ["build:neuron+nmodl:gpu"] - -test:neuron:gpu: - stage: test_neuron - extends: [.ctest, .gpu_node] - needs: ["build:neuron:gpu"] From 560cc3f2fff6b53c461022ec1f018e5d0781082e Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Thu, 25 Nov 2021 19:32:45 +0100 Subject: [PATCH 03/31] [Hackathon] Add a temporary option for benchmark data. (#695) * Add a hackathon-specific argument for benchmarks. * Add a reference comparison for channel-benchmark. --- CMakeLists.txt | 3 +++ coreneuron/CMakeLists.txt | 30 +++++++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b3edf3a5..4e53a5de6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -103,6 +103,9 @@ option(CORENRN_ENABLE_SHARED "Enable shared library build" ON) option(CORENRN_ENABLE_LEGACY_UNITS "Enable legacy FARADAY, R, etc" OFF) option(CORENRN_ENABLE_PRCELLSTATE "Enable NRN_PRCELLSTATE debug feature" OFF) +set(CORENRN_EXTERNAL_BENCHMARK_DATA + "" + CACHE PATH "Path to input data files and mechanisms for benchmarks") set(CORENRN_NMODL_DIR "" CACHE PATH "Path to nmodl source-to-source compiler installation") diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt index 6fd5c98a8..e7337331e 100644 --- a/coreneuron/CMakeLists.txt +++ b/coreneuron/CMakeLists.txt @@ -293,6 +293,33 @@ set_target_properties( # ============================================================================= # create special-core with halfgap.mod for tests # ============================================================================= +set(all_output_binaries) +if(NOT ${CORENRN_EXTERNAL_BENCHMARK_DATA} STREQUAL "") + # Hack for the december 2021 hackathon, build an extra special-core with channel-benchmark + # mechanisms. 
+ set(modfile_directory + "${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark/benchmark/channels/lib/modlib") + file(GLOB modfiles "${modfile_directory}/*.mod") + set(output_binaries "${CMAKE_BINARY_DIR}/benchmark/${CMAKE_SYSTEM_PROCESSOR}/special-core" + "${CMAKE_BINARY_DIR}/benchmark/${CMAKE_SYSTEM_PROCESSOR}/libcorenrnmech.a") + add_custom_command( + OUTPUT ${output_binaries} + DEPENDS scopmath coreneuron ${NMODL_TARGET_TO_DEPEND} ${modfiles} ${CORENEURON_BUILTIN_MODFILES} + COMMAND ${CMAKE_BINARY_DIR}/bin/nrnivmodl-core -b STATIC -m ${CORENRN_MOD2CPP_BINARY} -p 1 + "${modfile_directory}" + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/benchmark + COMMENT "Running nrnivmodl-core for channel-benchmark mechanisms") + list(APPEND all_output_binaries ${output_binaries}) + string( + CONCAT + benchmark_command + "'${CMAKE_BINARY_DIR}/benchmark/${CMAKE_SYSTEM_PROCESSOR}/special-core'" + " --datpath '${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark-all-440-cells-2-ranks'" + " --tstop 1 &&" + "diff out.dat '${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark-all-440-cells-2-ranks.gpu.spikes'" + ) + add_test(NAME benchmark COMMAND sh -c "${benchmark_command}") +endif() set(modfile_directory "${CORENEURON_PROJECT_SOURCE_DIR}/tests/integration/ring_gap/mod") file(GLOB modfiles "${modfile_directory}/*.mod") set(output_binaries "${CMAKE_BINARY_DIR}/bin/${CMAKE_SYSTEM_PROCESSOR}/special-core" @@ -304,7 +331,8 @@ add_custom_command( "${modfile_directory}" WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/bin COMMENT "Running nrnivmodl-core with halfgap.mod") -add_custom_target(nrniv-core ALL DEPENDS ${output_binaries}) +list(APPEND all_output_binaries ${output_binaries}) +add_custom_target(nrniv-core ALL DEPENDS ${all_output_binaries}) include_directories(${CORENEURON_PROJECT_SOURCE_DIR}) From de4e4337da16e380de81853b108cfacbeb2a6d8b Mon Sep 17 00:00:00 2001 From: Pramod Kumbhar Date: Fri, 26 Nov 2021 08:50:58 +0100 Subject: [PATCH 04/31] Minor changes for building on perlmutter (#697) * create build/benchmark folder before trying to use it * run nrnivmodl-core in parallel than serially (too slow) --- coreneuron/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt index e7337331e..5bea0569a 100644 --- a/coreneuron/CMakeLists.txt +++ b/coreneuron/CMakeLists.txt @@ -302,10 +302,11 @@ if(NOT ${CORENRN_EXTERNAL_BENCHMARK_DATA} STREQUAL "") file(GLOB modfiles "${modfile_directory}/*.mod") set(output_binaries "${CMAKE_BINARY_DIR}/benchmark/${CMAKE_SYSTEM_PROCESSOR}/special-core" "${CMAKE_BINARY_DIR}/benchmark/${CMAKE_SYSTEM_PROCESSOR}/libcorenrnmech.a") + file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/benchmark) add_custom_command( OUTPUT ${output_binaries} DEPENDS scopmath coreneuron ${NMODL_TARGET_TO_DEPEND} ${modfiles} ${CORENEURON_BUILTIN_MODFILES} - COMMAND ${CMAKE_BINARY_DIR}/bin/nrnivmodl-core -b STATIC -m ${CORENRN_MOD2CPP_BINARY} -p 1 + COMMAND ${CMAKE_BINARY_DIR}/bin/nrnivmodl-core -b STATIC -m ${CORENRN_MOD2CPP_BINARY} -p 6 "${modfile_directory}" WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/benchmark COMMENT "Running nrnivmodl-core for channel-benchmark mechanisms") From 81dd5ef4bbecbb3b8769d0753c6910785ca82b11 Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Mon, 29 Nov 2021 15:00:07 +0100 Subject: [PATCH 05/31] Enable OpenMP in CoreNEURON CI. 
(#698) --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 1d89f8eca..84e83c0ac 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -63,7 +63,7 @@ build:coreneuron+nmodl:gpu: SPACK_PACKAGE: coreneuron # +report pulls in a lot of dependencies and the tests fail. # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type - SPACK_PACKAGE_SPEC: +nmodl+gpu+tests~legacy-unit~report build_type=RelWithDebInfo + SPACK_PACKAGE_SPEC: +nmodl+openmp+gpu+tests~legacy-unit~report build_type=RelWithDebInfo extends: - .spack_build - .spack_nvhpc From 3e394c499e9746d3daee5793ae0816eabe39e2e1 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Mon, 29 Nov 2021 21:39:55 +0100 Subject: [PATCH 06/31] Set by default the number of warps to execute in a large reasonable number and update the related documentation (#700) --- coreneuron/apps/corenrn_parameters.cpp | 7 ++++++- coreneuron/apps/corenrn_parameters.hpp | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/coreneuron/apps/corenrn_parameters.cpp b/coreneuron/apps/corenrn_parameters.cpp index c0aa02ab0..4403f44db 100644 --- a/coreneuron/apps/corenrn_parameters.cpp +++ b/coreneuron/apps/corenrn_parameters.cpp @@ -47,7 +47,12 @@ corenrn_parameters::corenrn_parameters() { "Print number of instances of each mechanism and detailed memory stats."); auto sub_gpu = app.add_option_group("GPU", "Commands relative to GPU."); - sub_gpu->add_option("-W, --nwarp", this->nwarp, "Number of warps to balance.", true) + sub_gpu + ->add_option("-W, --nwarp", + this->nwarp, + "Number of warps to execute in parallel the Hines solver. Each warp solves a " + "group of cells. (Only used with cell permute 2)", + true) ->check(CLI::Range(0, 1'000'000)); sub_gpu ->add_option("-R, --cell-permute", diff --git a/coreneuron/apps/corenrn_parameters.hpp b/coreneuron/apps/corenrn_parameters.hpp index ea7ef8aba..21f2f7767 100644 --- a/coreneuron/apps/corenrn_parameters.hpp +++ b/coreneuron/apps/corenrn_parameters.hpp @@ -46,7 +46,7 @@ struct corenrn_parameters { unsigned ms_subint = 2; /// Number of multisend interval. 1 or 2 unsigned spkcompress = 0; /// Spike Compression unsigned cell_interleave_permute = 0; /// Cell interleaving permutation - unsigned nwarp = 0; /// Number of warps to balance for cell_interleave_permute == 2 + unsigned nwarp = 1024; /// Number of warps to balance for cell_interleave_permute == 2 unsigned num_gpus = 0; /// Number of gpus to use per node unsigned report_buff_size = report_buff_size_default; /// Size in MB of the report buffer. int seed = -1; /// Initialization seed for random number generator (int) From a8bb7164a5b3b8802c97ce11a0c083d463c7dbc5 Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Wed, 1 Dec 2021 21:05:19 +0100 Subject: [PATCH 07/31] Add memory pool for Random123 streams. (#702) * Add memory pool for Random123 streams. This speeds up initialisation when running on GPU. * Make Boost optional. 
--- coreneuron/CMakeLists.txt | 9 +++++ coreneuron/utils/randoms/nrnran123.cu | 55 ++++++++++++++++++++++++--- 2 files changed, 58 insertions(+), 6 deletions(-) diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt index 5bea0569a..60bd2b370 100644 --- a/coreneuron/CMakeLists.txt +++ b/coreneuron/CMakeLists.txt @@ -284,6 +284,15 @@ target_include_directories(coreneuron SYSTEM target_include_directories(coreneuron SYSTEM PRIVATE ${CORENEURON_PROJECT_SOURCE_DIR}/external/CLI11/include) +if(CORENRN_ENABLE_GPU) + # nrnran123.cpp possibly-temporarily uses Boost.Pool in GPU builds if it's available. + find_package(Boost QUIET) + if(Boost_FOUND) + target_include_directories(coreneuron SYSTEM PRIVATE ${Boost_INCLUDE_DIRS}) + target_compile_definitions(coreneuron PRIVATE CORENEURON_USE_BOOST_POOL) + endif() +endif() + set_target_properties( coreneuron scopmath PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib diff --git a/coreneuron/utils/randoms/nrnran123.cu b/coreneuron/utils/randoms/nrnran123.cu index b13dad7eb..9a3d205a3 100644 --- a/coreneuron/utils/randoms/nrnran123.cu +++ b/coreneuron/utils/randoms/nrnran123.cu @@ -15,6 +15,11 @@ #include #include +#ifdef CORENEURON_USE_BOOST_POOL +#include +#include +#endif + // In a GPU build this file will be compiled by NVCC as CUDA code // In a CPU build this file will be compiled by a C++ compiler as C++ code #ifdef __CUDACC__ @@ -24,6 +29,48 @@ #endif namespace { +#ifdef CORENEURON_USE_BOOST_POOL +/** Tag type for use with boost::fast_pool_allocator that forwards to + * coreneuron::[de]allocate_unified(). Using a Random123-specific type here + * makes sure that allocations do not come from the same global pool as other + * usage of boost pools for objects with sizeof == sizeof(nrnran123_State). + * + * The messy m_block_sizes map is just because `deallocate_unified` uses sized + * deallocations, but the Boost pool allocators don't. Because this is hidden + * behind the pool mechanism, these methods are not called very often and the + * overhead is minimal. + */ +struct random123_allocate_unified { + using size_type = std::size_t; + using difference_type = std::size_t; + static char* malloc(const size_type bytes) { + std::lock_guard const lock{m_mutex}; + static_cast(lock); + auto* buffer = coreneuron::allocate_unified(bytes); + m_block_sizes[buffer] = bytes; + return reinterpret_cast(buffer); + } + static void free(char* const block) { + std::lock_guard const lock{m_mutex}; + static_cast(lock); + auto const iter = m_block_sizes.find(block); + assert(iter != m_block_sizes.end()); + auto const size = iter->second; + m_block_sizes.erase(iter); + return coreneuron::deallocate_unified(block, size); + } + static std::mutex m_mutex; + static std::unordered_map m_block_sizes; +}; + +std::mutex random123_allocate_unified::m_mutex{}; +std::unordered_map random123_allocate_unified::m_block_sizes{}; + +using random123_allocator = + boost::fast_pool_allocator; +#else +using random123_allocator = coreneuron::unified_allocator; +#endif /* Global data structure per process. Using a unique_ptr here causes [minor] * problems because its destructor can be called very late during application * shutdown. 
If the destructor calls cudaFree and the CUDA runtime has already @@ -212,9 +259,7 @@ nrnran123_State* nrnran123_newstream3(uint32_t id1, #endif nrnran123_State* s{nullptr}; if (use_unified_memory) { - s = coreneuron::allocate_unique( - coreneuron::unified_allocator{}) - .release(); + s = coreneuron::allocate_unique(random123_allocator{}).release(); } else { s = new nrnran123_State{}; } @@ -244,9 +289,7 @@ void nrnran123_deletestream(nrnran123_State* s, bool use_unified_memory) { --g_instance_count; } if (use_unified_memory) { - std::unique_ptr>> - _{s}; + std::unique_ptr> _{s}; } else { delete s; } From 96498142ce60d15edbd4c5161c7153f41eda6b20 Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Thu, 2 Dec 2021 11:04:21 +0100 Subject: [PATCH 08/31] Fix Boost-free compilation. (#703) This was a silly bug in #702. --- coreneuron/utils/randoms/nrnran123.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/coreneuron/utils/randoms/nrnran123.cu b/coreneuron/utils/randoms/nrnran123.cu index 9a3d205a3..8a02c4e26 100644 --- a/coreneuron/utils/randoms/nrnran123.cu +++ b/coreneuron/utils/randoms/nrnran123.cu @@ -69,7 +69,7 @@ std::unordered_map random123_allocate_unified::m_block_sizes using random123_allocator = boost::fast_pool_allocator; #else -using random123_allocator = coreneuron::unified_allocator; +using random123_allocator = coreneuron::unified_allocator; #endif /* Global data structure per process. Using a unique_ptr here causes [minor] * problems because its destructor can be called very late during application From 21dc2c8b40bf817ccaeeab05ae91f0b4ce88145f Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Tue, 7 Dec 2021 13:13:36 +0100 Subject: [PATCH 09/31] Basic OpenACC -> OpenMP migration. (#693) * Simplify unified memory logic. * Pass -mp=gpu when we pass -acc * Pass -gpu=lineinfo for better debug information. * Pass -Minfo=accel,mp for better compile time diagnostics. * Add nrn_pragma_{acc,omp} macros for single-source Open{ACC,MP} support. * Call omp_set_default_device. * Drop cc60 because of OpenMP offload incompatibility. * Add --gpu to test. * Default (BB5-valid) CORENRN_EXTERNAL_BENCHMARK_DATA. * Remove cuda_add_library. * Don't print number of GPUs when quiet. * Set OMP_NUM_THREADS=1 for lfp_test. * Update NMODL to emit nrn_pragma{acc,omp} macros. 
Co-authored-by: Pramod Kumbhar --- .clang-format.changes | 3 +- .cmake-format.changes.yaml | 5 -- CMake/OpenAccHelper.cmake | 8 +- CMakeLists.txt | 5 +- coreneuron/CMakeLists.txt | 2 +- coreneuron/apps/main1.cpp | 6 +- coreneuron/gpu/nrn_acc_manager.cpp | 100 ++++++++----------------- coreneuron/gpu/nrn_acc_manager.hpp | 2 - coreneuron/io/lfp.cpp | 3 - coreneuron/mechanism/capac.cpp | 40 ++++------ coreneuron/mechanism/eion.cpp | 50 ++++++------- coreneuron/mechanism/register_mech.cpp | 4 - coreneuron/network/cvodestb.cpp | 8 +- coreneuron/network/netcvode.cpp | 49 ++++-------- coreneuron/network/partrans.cpp | 90 +++++++++++----------- coreneuron/nrnconf.h | 10 ++- coreneuron/permute/cellorder.cpp | 73 ++++++++---------- coreneuron/sim/fadvance_core.cpp | 70 +++++++---------- coreneuron/sim/fast_imem.cpp | 13 ++-- coreneuron/sim/finitialize.cpp | 9 +-- coreneuron/sim/solve_core.cpp | 45 ++++------- coreneuron/sim/treeset_core.cpp | 64 ++++++++-------- coreneuron/utils/memory.h | 3 +- coreneuron/utils/offload.hpp | 20 +++++ external/nmodl | 2 +- tests/unit/lfp/CMakeLists.txt | 1 + 26 files changed, 283 insertions(+), 402 deletions(-) create mode 100644 coreneuron/utils/offload.hpp diff --git a/.clang-format.changes b/.clang-format.changes index 01b58702d..4c2b11b59 100644 --- a/.clang-format.changes +++ b/.clang-format.changes @@ -1,2 +1,3 @@ -SortIncludes: false IndentCaseLabels: true +SortIncludes: false +StatementMacros: [nrn_pragma_acc, nrn_pragma_omp] diff --git a/.cmake-format.changes.yaml b/.cmake-format.changes.yaml index 19ea9c084..2f20247f7 100644 --- a/.cmake-format.changes.yaml +++ b/.cmake-format.changes.yaml @@ -1,9 +1,4 @@ additional_commands: - cuda_add_library: - pargs: '*' - flags: ["STATIC", "SHARED", "MODULE", "EXCLUDE_FROM_ALL"] - kwargs: - OPTIONS: '*' cpp_cc_build_time_copy: flags: ['NO_TARGET'] kwargs: diff --git a/CMake/OpenAccHelper.cmake b/CMake/OpenAccHelper.cmake index 7767a3672..c7f91a7c9 100644 --- a/CMake/OpenAccHelper.cmake +++ b/CMake/OpenAccHelper.cmake @@ -55,7 +55,7 @@ if(CORENRN_ENABLE_GPU) # due to e.g. __CUDACC__ being defined. See https://github.com/BlueBrain/CoreNeuron/issues/607 for # more information about this. -gpu=cudaX.Y ensures that OpenACC code is compiled with the same # CUDA version as is used for the explicit CUDA code. - set(NVHPC_ACC_COMP_FLAGS "-acc -gpu=cuda${CORENRN_CUDA_VERSION_SHORT}") + set(NVHPC_ACC_COMP_FLAGS "-acc -Minfo=accel -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo") set(NVHPC_ACC_LINK_FLAGS "-acc -cuda") # Make sure that OpenACC code is generated for the same compute capabilities as the explicit CUDA # code. Otherwise there may be confusing linker errors. We cannot rely on nvcc and nvc++ using the @@ -63,6 +63,12 @@ if(CORENRN_ENABLE_GPU) foreach(compute_capability ${CMAKE_CUDA_ARCHITECTURES}) string(APPEND NVHPC_ACC_COMP_FLAGS ",cc${compute_capability}") endforeach() + if(CORENRN_ENABLE_OPENMP AND CORENRN_ENABLE_OPENMP_OFFLOAD) + # Enable OpenMP target offload to GPU and if both OpenACC and OpenMP directives are available + # for a region then prefer OpenMP. 
+ add_compile_definitions(CORENEURON_PREFER_OPENMP_OFFLOAD) + string(APPEND NVHPC_ACC_COMP_FLAGS " -mp=gpu -Minfo=mp") + endif() # avoid PGI adding standard compliant "-A" flags set(CMAKE_CXX14_STANDARD_COMPILE_OPTION --c++14) string(APPEND CMAKE_EXE_LINKER_FLAGS " ${NVHPC_ACC_LINK_FLAGS}") diff --git a/CMakeLists.txt b/CMakeLists.txt index 4e53a5de6..963703975 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -85,6 +85,7 @@ add_subdirectory(${CORENEURON_PROJECT_SOURCE_DIR}/external/CLI11) # Build options # ============================================================================= option(CORENRN_ENABLE_OPENMP "Build the CORE NEURON with OpenMP implementation" ON) +option(CORENRN_ENABLE_OPENMP_OFFLOAD "Prefer OpenMP target offload to OpenACC" ON) option(CORENRN_ENABLE_TIMEOUT "Enable nrn_timeout implementation" ON) option(CORENRN_ENABLE_REPORTING "Enable use of ReportingLib for soma reports" OFF) option(CORENRN_ENABLE_MPI "Enable MPI-based execution" ON) @@ -104,7 +105,7 @@ option(CORENRN_ENABLE_LEGACY_UNITS "Enable legacy FARADAY, R, etc" OFF) option(CORENRN_ENABLE_PRCELLSTATE "Enable NRN_PRCELLSTATE debug feature" OFF) set(CORENRN_EXTERNAL_BENCHMARK_DATA - "" + "/gpfs/bbp.cscs.ch/project/proj12/nersc-gpu-hackathon-dec-2021" CACHE PATH "Path to input data files and mechanisms for benchmarks") set(CORENRN_NMODL_DIR "" @@ -138,7 +139,7 @@ if(CORENRN_ENABLE_GPU) # Set some sensible default CUDA architectures. if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - set(CMAKE_CUDA_ARCHITECTURES 60 70 80) + set(CMAKE_CUDA_ARCHITECTURES 70 80) message(STATUS "Setting default CUDA architectures to ${CMAKE_CUDA_ARCHITECTURES}") endif() diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt index 60bd2b370..2308ab99a 100644 --- a/coreneuron/CMakeLists.txt +++ b/coreneuron/CMakeLists.txt @@ -325,7 +325,7 @@ if(NOT ${CORENRN_EXTERNAL_BENCHMARK_DATA} STREQUAL "") benchmark_command "'${CMAKE_BINARY_DIR}/benchmark/${CMAKE_SYSTEM_PROCESSOR}/special-core'" " --datpath '${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark-all-440-cells-2-ranks'" - " --tstop 1 &&" + " --tstop 1 --gpu &&" "diff out.dat '${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark-all-440-cells-2-ranks.gpu.spikes'" ) add_test(NAME benchmark COMMAND sh -c "${benchmark_command}") diff --git a/coreneuron/apps/main1.cpp b/coreneuron/apps/main1.cpp index 0fdaa509b..6a4d43bea 100644 --- a/coreneuron/apps/main1.cpp +++ b/coreneuron/apps/main1.cpp @@ -558,10 +558,8 @@ extern "C" int run_solve_core(int argc, char** argv) { #endif bool compute_gpu = corenrn_param.gpu; - // clang-format off - - #pragma acc update device(celsius, secondorder, pi) if (compute_gpu) - // clang-format on + nrn_pragma_acc(update device(celsius, secondorder, pi) if(compute_gpu)) + nrn_pragma_omp(target update to(celsius, secondorder, pi) if(compute_gpu)) { double v = corenrn_param.voltage; double dt = corenrn_param.dt; diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp index ac98f5420..b249875dc 100644 --- a/coreneuron/gpu/nrn_acc_manager.cpp +++ b/coreneuron/gpu/nrn_acc_manager.cpp @@ -27,6 +27,9 @@ #ifdef _OPENACC #include #endif +#ifdef CORENEURON_PREFER_OPENMP_OFFLOAD +#include +#endif #ifdef CRAYPAT #include @@ -605,25 +608,36 @@ void update_net_receive_buffer(NrnThread* nt) { // instance order to avoid race. 
setup _displ and _nrb_index net_receive_buffer_order(nrb); -#ifdef _OPENACC if (nt->compute_gpu) { Instrumentor::phase p_net_receive_buffer_order("net-receive-buf-cpu2gpu"); // note that dont update nrb otherwise we lose pointers + // clang-format off + /* update scalar elements */ - acc_update_device(&nrb->_cnt, sizeof(int)); - acc_update_device(&nrb->_displ_cnt, sizeof(int)); - - acc_update_device(nrb->_pnt_index, sizeof(int) * nrb->_cnt); - acc_update_device(nrb->_weight_index, sizeof(int) * nrb->_cnt); - acc_update_device(nrb->_nrb_t, sizeof(double) * nrb->_cnt); - acc_update_device(nrb->_nrb_flag, sizeof(double) * nrb->_cnt); - acc_update_device(nrb->_displ, sizeof(int) * (nrb->_displ_cnt + 1)); - acc_update_device(nrb->_nrb_index, sizeof(int) * nrb->_cnt); + nrn_pragma_acc(update device(nrb->_cnt, + nrb->_displ_cnt, + nrb->_pnt_index[:nrb->_cnt], + nrb->_weight_index[:nrb->_cnt], + nrb->_nrb_t[:nrb->_cnt], + nrb->_nrb_flag[:nrb->_cnt], + nrb->_displ[:nrb->_displ_cnt + 1], + nrb->_nrb_index[:nrb->_cnt]) + async(nt->stream_id)) + nrn_pragma_omp(target update to(nrb->_cnt, + nrb->_displ_cnt, + nrb->_pnt_index[:nrb->_cnt], + nrb->_weight_index[:nrb->_cnt], + nrb->_nrb_t[:nrb->_cnt], + nrb->_nrb_flag[:nrb->_cnt], + nrb->_displ[:nrb->_displ_cnt + 1], + nrb->_nrb_index[:nrb->_cnt])) + // clang-format on } -#endif } } + nrn_pragma_acc(wait(nt->stream_id)) + nrn_pragma_omp(taskwait) } void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb) { @@ -894,67 +908,12 @@ void update_weights_from_gpu(NrnThread* threads, int nthreads) { size_t n_weight = nt->n_weight; if (nt->compute_gpu && n_weight > 0) { double* weights = nt->weights; - // clang-format off - - #pragma acc update host(weights [0:n_weight]) - // clang-format on + nrn_pragma_acc(update host(weights [0:n_weight])) + nrn_pragma_omp(target update from(weights [0:n_weight])) } } } -void update_matrix_from_gpu(NrnThread* _nt) { -#ifdef _OPENACC - if (_nt->compute_gpu && (_nt->end > 0)) { - /* before copying, make sure all computations in the stream are completed */ - - // clang-format off - - #pragma acc wait(_nt->stream_id) - - /* openacc routine doesn't allow asyn, use pragma */ - // acc_update_self(_nt->_actual_rhs, 2*_nt->end*sizeof(double)); - - /* RHS and D are contigious, copy them in one go! - * NOTE: in pragma you have to give actual pointer like below and not nt->rhs... - */ - double* rhs = _nt->_actual_rhs; - int ne = nrn_soa_padded_size(_nt->end, 0); - - #pragma acc update host(rhs[0 : 2 * ne]) async(_nt->stream_id) - #pragma acc wait(_nt->stream_id) - // clang-format on - } -#else - (void) _nt; -#endif -} - -void update_matrix_to_gpu(NrnThread* _nt) { -#ifdef _OPENACC - if (_nt->compute_gpu && (_nt->end > 0)) { - /* before copying, make sure all computations in the stream are completed */ - - // clang-format off - - #pragma acc wait(_nt->stream_id) - - /* while discussion with Michael we found that RHS is also needed on - * gpu because nrn_cap_jacob uses rhs which is being updated on GPU - */ - double* v = _nt->_actual_v; - double* rhs = _nt->_actual_rhs; - int ne = nrn_soa_padded_size(_nt->end, 0); - - #pragma acc update device(v[0 : ne]) async(_nt->stream_id) - #pragma acc update device(rhs[0 : ne]) async(_nt->stream_id) - #pragma acc wait(_nt->stream_id) - // clang-format on - } -#else - (void) _nt; -#endif -} - /** Cleanup device memory that is being tracked by the OpenACC runtime. 
* * This function painstakingly calls `acc_delete` in reverse order on all @@ -1343,8 +1302,11 @@ void init_gpu() { int device_num = local_rank % num_devices_per_node; acc_set_device_num(device_num, device_type); +#ifdef CORENEURON_PREFER_OPENMP_OFFLOAD + omp_set_default_device(device_num); +#endif - if (nrnmpi_myid == 0) { + if (nrnmpi_myid == 0 && !corenrn_param.is_quiet()) { std::cout << " Info : " << num_devices_per_node << " GPUs shared by " << local_size << " ranks per node\n"; } diff --git a/coreneuron/gpu/nrn_acc_manager.hpp b/coreneuron/gpu/nrn_acc_manager.hpp index 67e6a058c..354bdc208 100644 --- a/coreneuron/gpu/nrn_acc_manager.hpp +++ b/coreneuron/gpu/nrn_acc_manager.hpp @@ -23,8 +23,6 @@ void update_nrnthreads_on_device(NrnThread* threads, int nthreads); void modify_data_on_device(NrnThread* threads, int nthreads); void dump_nt_to_file(char* filename, NrnThread* threads, int nthreads); -void update_matrix_from_gpu(NrnThread* _nt); -void update_matrix_to_gpu(NrnThread* _nt); void update_net_receive_buffer(NrnThread* _nt); void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml); void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb); diff --git a/coreneuron/io/lfp.cpp b/coreneuron/io/lfp.cpp index 646fbf5a0..2a001b85a 100644 --- a/coreneuron/io/lfp.cpp +++ b/coreneuron/io/lfp.cpp @@ -7,9 +7,6 @@ namespace coreneuron { -// extern variables require acc declare -#pragma acc declare create(pi) - namespace lfputils { double line_source_lfp_factor(const Point3D& e_pos, diff --git a/coreneuron/mechanism/capac.cpp b/coreneuron/mechanism/capac.cpp index ee62f660d..42c65cb18 100644 --- a/coreneuron/mechanism/capac.cpp +++ b/coreneuron/mechanism/capac.cpp @@ -12,25 +12,9 @@ #include "coreneuron/coreneuron.hpp" #include "coreneuron/permute/data_layout.hpp" -// clang-format off - -#if defined(_OPENACC) -#define _PRAGMA_FOR_INIT_ACC_LOOP_ \ - _Pragma("acc parallel loop present(vdata[0:_cntml_padded*nparm]) if(_nt->compute_gpu)") -#define _PRAGMA_FOR_CUR_ACC_LOOP_ \ - _Pragma( \ - "acc parallel loop present(vdata[0:_cntml_padded*nparm], ni[0:_cntml_actual], _vec_rhs[0:_nt->end]) if(_nt->compute_gpu) async(stream_id)") -#define _PRAGMA_FOR_JACOB_ACC_LOOP_ \ - _Pragma( \ - "acc parallel loop present(vdata[0:_cntml_padded*nparm], ni[0:_cntml_actual], _vec_d[0:_nt->end]) if(_nt->compute_gpu) async(stream_id)") -#else -#define _PRAGMA_FOR_INIT_ACC_LOOP_ _Pragma("") -#define _PRAGMA_FOR_CUR_ACC_LOOP_ _Pragma("") -#define _PRAGMA_FOR_JACOB_ACC_LOOP_ _Pragma("") -#endif - -// clang-format on - +#define _PRAGMA_FOR_INIT_ACC_LOOP_ \ + nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm]) if (_nt->compute_gpu)) \ + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) #define _STRIDE _cntml_padded + _iml namespace coreneuron { @@ -78,15 +62,16 @@ void nrn_jacob_capacitance(NrnThread* _nt, Memb_list* ml, int /* type */) { (void) _cntml_padded; /* unused when layout=1*/ double* _vec_d = _nt->_actual_d; -#if defined(_OPENACC) - int stream_id = _nt->stream_id; -#endif { /*if (use_cachevec) {*/ int* ni = ml->nodeindices; vdata = ml->data; - _PRAGMA_FOR_JACOB_ACC_LOOP_ + nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm], + ni [0:_cntml_actual], + _vec_d [0:_nt->end]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (_iml = 0; _iml < _cntml_actual; _iml++) { _vec_d[ni[_iml]] += cfac * cm; } @@ -126,12 +111,13 @@ void nrn_cur_capacitance(NrnThread* 
_nt, Memb_list* ml, int /* type */) { /* no need to distinguish secondorder */ int* ni = ml->nodeindices; double* _vec_rhs = _nt->_actual_rhs; -#if defined(_OPENACC) - int stream_id = _nt->stream_id; -#endif vdata = ml->data; - _PRAGMA_FOR_CUR_ACC_LOOP_ + nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm], + ni [0:_cntml_actual], + _vec_rhs [0:_nt->end]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int _iml = 0; _iml < _cntml_actual; _iml++) { i_cap = cfac * cm * _vec_rhs[ni[_iml]]; } diff --git a/coreneuron/mechanism/eion.cpp b/coreneuron/mechanism/eion.cpp index 76adc9045..727f30ea6 100644 --- a/coreneuron/mechanism/eion.cpp +++ b/coreneuron/mechanism/eion.cpp @@ -19,26 +19,6 @@ #define _STRIDE _cntml_padded + _iml -// clang-format off - -#if defined(_OPENACC) -#define _PRAGMA_FOR_INIT_ACC_LOOP_ \ - _Pragma( \ - "acc parallel loop present(pd[0:_cntml_padded*5], ppd[0:1], nrn_ion_global_map[0:nrn_ion_global_map_size][0:ion_global_map_member_size]) if(nt->compute_gpu)") -#define _PRAGMA_FOR_CUR_ACC_LOOP_ \ - _Pragma( \ - "acc parallel loop present(pd[0:_cntml_padded*5], nrn_ion_global_map[0:nrn_ion_global_map_size][0:ion_global_map_member_size]) if(nt->compute_gpu) async(stream_id)") -#define _PRAGMA_FOR_SEC_ORDER_CUR_ACC_LOOP_ \ - _Pragma( \ - "acc parallel loop present(pd[0:_cntml_padded*5], ni[0:_cntml_actual], _vec_rhs[0:_nt->end]) if(_nt->compute_gpu) async(stream_id)") -#else -#define _PRAGMA_FOR_INIT_ACC_LOOP_ _Pragma("") -#define _PRAGMA_FOR_CUR_ACC_LOOP_ _Pragma("") -#define _PRAGMA_FOR_SEC_ORDER_CUR_ACC_LOOP_ _Pragma("") -#endif - -// clang-format on - namespace coreneuron { // for each ion it refers to internal concentration, external concentration, and charge, @@ -277,14 +257,16 @@ void nrn_cur_ion(NrnThread* nt, Memb_list* ml, int type) { double* pd; Datum* ppd; (void) nt; /* unused */ -#if defined(_OPENACC) - int stream_id = nt->stream_id; -#endif /*printf("ion_cur %s\n", memb_func[type].sym->name);*/ int _cntml_padded = ml->_nodecount_padded; pd = ml->data; ppd = ml->pdata; - _PRAGMA_FOR_CUR_ACC_LOOP_ + nrn_pragma_acc(parallel loop present( + pd [0:_cntml_padded * 5], + nrn_ion_global_map + [0:nrn_ion_global_map_size] [0:ion_global_map_member_size]) if (nt->compute_gpu) + async(nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) for (int _iml = 0; _iml < _cntml_actual; ++_iml) { dcurdv = 0.; cur = 0.; @@ -312,7 +294,16 @@ void nrn_init_ion(NrnThread* nt, Memb_list* ml, int type) { int _cntml_padded = ml->_nodecount_padded; pd = ml->data; ppd = ml->pdata; - _PRAGMA_FOR_INIT_ACC_LOOP_ + // There was no async(...) clause in the initial OpenACC implementation, so + // no `nowait` clause has been added to the OpenMP implementation. TODO: + // verify if this can be made asynchronous or if there is a strong reason it + // needs to be like this. 
+ nrn_pragma_acc(parallel loop present( + pd [0:_cntml_padded * 5], + ppd [0:1], + nrn_ion_global_map + [0:nrn_ion_global_map_size] [0:ion_global_map_member_size]) if (nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) for (int _iml = 0; _iml < _cntml_actual; ++_iml) { if (iontype & 04) { conci = conci0; @@ -332,9 +323,6 @@ void second_order_cur(NrnThread* _nt, int secondorder) { int _cntml_padded; double* pd; (void) _nt; /* unused */ -#if defined(_OPENACC) - int stream_id = _nt->stream_id; -#endif double* _vec_rhs = _nt->_actual_rhs; if (secondorder == 2) { @@ -345,7 +333,11 @@ void second_order_cur(NrnThread* _nt, int secondorder) { int* ni = ml->nodeindices; _cntml_padded = ml->_nodecount_padded; pd = ml->data; - _PRAGMA_FOR_SEC_ORDER_CUR_ACC_LOOP_ + nrn_pragma_acc(parallel loop present(pd [0:_cntml_padded * 5], + ni [0:_cntml_actual], + _vec_rhs [0:_nt->end]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int _iml = 0; _iml < _cntml_actual; ++_iml) { cur += dcurdv * (_vec_rhs[ni[_iml]]); } diff --git a/coreneuron/mechanism/register_mech.cpp b/coreneuron/mechanism/register_mech.cpp index 3acdff1ea..a8bff7a50 100644 --- a/coreneuron/mechanism/register_mech.cpp +++ b/coreneuron/mechanism/register_mech.cpp @@ -20,10 +20,6 @@ namespace coreneuron { int secondorder = 0; double t, dt, celsius, pi; -// declare copyin required for correct initialization -#pragma acc declare copyin(secondorder) -#pragma acc declare copyin(celsius) -#pragma acc declare copyin(pi) int rev_dt; using Pfrv = void (*)(); diff --git a/coreneuron/network/cvodestb.cpp b/coreneuron/network/cvodestb.cpp index 6ed52dc34..31b2fec54 100644 --- a/coreneuron/network/cvodestb.cpp +++ b/coreneuron/network/cvodestb.cpp @@ -61,11 +61,9 @@ void init_net_events() { NrnThread* nt = nrn_threads + ith; double* weights = nt->weights; int n_weight = nt->n_weight; - if (n_weight) { - // clang-format off - - #pragma acc update device(weights[0 : n_weight]) if (nt->compute_gpu) - // clang-format on + if (n_weight && nt->compute_gpu) { + nrn_pragma_acc(update device(weights[0:n_weight])) + nrn_pragma_omp(target update to(weights[0:n_weight])) } } #endif diff --git a/coreneuron/network/netcvode.cpp b/coreneuron/network/netcvode.cpp index 899bc1e14..ee2e5cb3e 100644 --- a/coreneuron/network/netcvode.cpp +++ b/coreneuron/network/netcvode.cpp @@ -531,28 +531,13 @@ void NetCvode::check_thresh(NrnThread* nt) { // for default method PreSynHelper* presyns_helper = nt->presyns_helper; double* actual_v = nt->_actual_v; -#if defined(_OPENACC) - int stream_id = nt->stream_id; -#endif - if (nt->ncell == 0) return; - //_net_send_buffer_cnt is no longer used in openacc kernel, remove this? - //#ifdef _OPENACC - // if(nt->compute_gpu) - // acc_update_device(&(nt->_net_send_buffer_cnt), sizeof(int)); - //#endif - - // on GPU... 
- // clang-format off - - #pragma acc parallel loop present( \ - nt[0:1], presyns_helper[0:nt->n_presyn], \ - presyns[0:nt->n_presyn], actual_v[0:nt->end]) \ - copy(net_send_buf_count) if (nt->compute_gpu) \ - async(stream_id) - // clang-format on + nrn_pragma_acc(parallel loop present( + nt [0:1], presyns_helper [0:nt->n_presyn], presyns [0:nt->n_presyn], actual_v [0:nt->end]) + copy(net_send_buf_count) if (nt->compute_gpu) async(nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd map(tofrom: net_send_buf_count) if(nt->compute_gpu)) for (int i = 0; i < nt->ncell; ++i) { PreSyn* ps = presyns + i; PreSynHelper* psh = presyns_helper + i; @@ -563,7 +548,7 @@ void NetCvode::check_thresh(NrnThread* nt) { // for default method int* flag = &(psh->flag_); if (pscheck(v, threshold, flag)) { -#ifndef _OPENACC +#ifndef CORENEURON_ENABLE_GPU nt->_net_send_buffer_cnt = net_send_buf_count; if (nt->_net_send_buffer_cnt >= nt->_net_send_buffer_size) { nt->_net_send_buffer_size *= 2; @@ -572,31 +557,23 @@ void NetCvode::check_thresh(NrnThread* nt) { // for default method } #endif - // clang-format off - - #pragma acc atomic capture - // clang-format on + nrn_pragma_acc(atomic capture) + nrn_pragma_omp(atomic capture) idx = net_send_buf_count++; nt->_net_send_buffer[idx] = i; } } - - // clang-format off - - #pragma acc wait(stream_id) - // clang-format on + nrn_pragma_acc(wait(nt->stream_id)) nt->_net_send_buffer_cnt = net_send_buf_count; - if (nt->_net_send_buffer_cnt) { -#ifdef _OPENACC + if (nt->compute_gpu && nt->_net_send_buffer_cnt) { +#ifdef CORENEURON_ENABLE_GPU int* nsbuffer = nt->_net_send_buffer; #endif - // clang-format off - - #pragma acc update host(nsbuffer[0:nt->_net_send_buffer_cnt]) if (nt->compute_gpu) async(stream_id) - #pragma acc wait(stream_id) - // clang-format on + nrn_pragma_acc(update host(nsbuffer [0:nt->_net_send_buffer_cnt]) async(nt->stream_id)) + nrn_pragma_acc(wait(nt->stream_id)) + nrn_pragma_omp(target update from(nsbuffer [0:nt->_net_send_buffer_cnt])) } // on CPU... diff --git a/coreneuron/network/partrans.cpp b/coreneuron/network/partrans.cpp index e74d866ce..1bd822f54 100644 --- a/coreneuron/network/partrans.cpp +++ b/coreneuron/network/partrans.cpp @@ -41,40 +41,39 @@ void nrnmpi_v_transfer() { // gather the source values. 
can be done in parallel for (int tid = 0; tid < nrn_nthread; ++tid) { auto& ttd = transfer_thread_data_[tid]; - auto& nt = nrn_threads[tid]; + auto* nt = &nrn_threads[tid]; int n = int(ttd.outsrc_indices.size()); if (n == 0) { continue; } - double* src_data = nt._data; + double* src_data = nt->_data; int* src_indices = ttd.src_indices.data(); // gather sources on gpu and copy to cpu, cpu scatters to outsrc_buf double* src_gather = ttd.src_gather.data(); size_t n_src_gather = ttd.src_gather.size(); - // clang-format off - #pragma acc parallel loop present( \ - src_indices[0:n_src_gather], src_data[0:nt._ndata], \ - src_gather[0 : n_src_gather]) /*copyout(src_gather[0:n_src_gather])*/ \ - if (nt.compute_gpu) async(nt.stream_id) + nrn_pragma_acc(parallel loop present(src_indices [0:n_src_gather], + src_data [0:nt->_ndata], + src_gather [0:n_src_gather]) if (nt->compute_gpu) + async(nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) for (int i = 0; i < n_src_gather; ++i) { src_gather[i] = src_data[src_indices[i]]; } - // do not know why the copyout above did not work - // and the following update is needed - #pragma acc update host(src_gather[0 : n_src_gather]) \ - if (nrn_threads[0].compute_gpu) \ - async(nt.stream_id) - // clang-format on + nrn_pragma_acc(update host(src_gather [0:n_src_gather]) if (nt->compute_gpu) + async(nt->stream_id)) + nrn_pragma_omp(target update from(src_gather [0:n_src_gather]) if (nt->compute_gpu)) } // copy gathered source values to outsrc_buf_ + bool compute_gpu = false; for (int tid = 0; tid < nrn_nthread; ++tid) { - // clang-format off - - #pragma acc wait(nrn_threads[tid].stream_id) - // clang-format on + if (nrn_threads[tid].compute_gpu) { + compute_gpu = true; + nrn_pragma_acc(wait(nrn_threads[tid].stream_id)) + nrn_pragma_omp(taskwait) + } TransferThreadData& ttd = transfer_thread_data_[tid]; size_t n_outsrc_indices = ttd.outsrc_indices.size(); int* outsrc_indices = ttd.outsrc_indices.data(); @@ -102,12 +101,8 @@ void nrnmpi_v_transfer() { } // insrc_buf_ will get copied to targets via nrnthread_v_transfer - // clang-format off - - #pragma acc update device( \ - insrc_buf_[0:n_insrc_buf]) \ - if (nrn_threads[0].compute_gpu) - // clang-format on + nrn_pragma_acc(update device(insrc_buf_ [0:n_insrc_buf]) if (compute_gpu)) + nrn_pragma_omp(target update to(insrc_buf_ [0:n_insrc_buf]) if (compute_gpu)) } void nrnthread_v_transfer(NrnThread* _nt) { @@ -119,33 +114,32 @@ void nrnthread_v_transfer(NrnThread* _nt) { int* insrc_indices = ttd.insrc_indices.data(); double* tar_data = _nt->_data; // last element in the displacement vector gives total length +#if defined(_OPENACC) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) int n_insrc_buf = insrcdspl_[nrnmpi_numprocs]; int ndata = _nt->_ndata; +#endif - // clang-format off - - #pragma acc parallel loop present( \ - insrc_indices[0:ntar], \ - tar_data[0:ndata], \ - insrc_buf_[0:n_insrc_buf]) \ - if (_nt->compute_gpu) \ - async(_nt->stream_id) - // clang-format on + nrn_pragma_acc(parallel loop present(insrc_indices [0:ntar], + tar_data [0:ndata], + insrc_buf_ [0:n_insrc_buf]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd map(to: tar_indices[0:ntar]) if(_nt->compute_gpu)) for (size_t i = 0; i < ntar; ++i) { tar_data[tar_indices[i]] = insrc_buf_[insrc_indices[i]]; } } +/// TODO: Corresponding exit data cluase for OpenACC/OpenMP is missing and hence +/// GPU buffers are not freed. 
void nrn_partrans::gap_update_indices() { // Ensure index vectors, src_gather, and insrc_buf_ are on the gpu. if (insrcdspl_) { int n_insrc_buf = insrcdspl_[nrnmpi_numprocs]; + nrn_pragma_acc(enter data create(insrc_buf_ [0:n_insrc_buf]) if (corenrn_param.gpu)) + // clang-format off + nrn_pragma_omp(target enter data map(alloc: insrc_buf_[0:n_insrc_buf]) + if(corenrn_param.gpu)) // clang-format off - - #pragma acc enter data create( \ - insrc_buf_[0:n_insrc_buf]) \ - if (nrn_threads[0].compute_gpu) - // clang-format on } for (int tid = 0; tid < nrn_nthread; ++tid) { TransferThreadData& ttd = transfer_thread_data_[tid]; @@ -154,21 +148,25 @@ void nrn_partrans::gap_update_indices() { size_t n_src_gather = ttd.src_gather.size(); NrnThread* nt = nrn_threads + tid; if (n_src_indices) { + int* src_indices = ttd.src_indices.data(); + double* src_gather = ttd.src_gather.data(); + nrn_pragma_acc(enter data copyin(src_indices[0:n_src_indices]) if(nt->compute_gpu)) + nrn_pragma_acc(enter data create(src_gather[0:n_src_gather]) if(nt->compute_gpu)) // clang-format off - - int *src_indices = ttd.src_indices.data(); - double *src_gather = ttd.src_gather.data(); - #pragma acc enter data copyin(src_indices[0 : n_src_indices]) if (nt->compute_gpu) - #pragma acc enter data create(src_gather[0 : n_src_gather]) if (nt->compute_gpu) + nrn_pragma_omp(target enter data map(to: src_indices [0:n_src_indices]) + map(alloc: src_gather[0:n_src_gather]) + if(nt->compute_gpu)) // clang-format on } if (ttd.insrc_indices.size()) { - // clang-format off - - int *insrc_indices = ttd.insrc_indices.data(); + int* insrc_indices = ttd.insrc_indices.data(); size_t n_insrc_indices = ttd.insrc_indices.size(); - #pragma acc enter data copyin(insrc_indices[0 : n_insrc_indices]) if (nt->compute_gpu) + nrn_pragma_acc( + enter data copyin(insrc_indices [0:n_insrc_indices]) if (nt->compute_gpu)) + // clang-format off + nrn_pragma_omp(target enter data map(to: insrc_indices[0:n_insrc_indices]) + if(nt->compute_gpu)) // clang-format on } } diff --git a/coreneuron/nrnconf.h b/coreneuron/nrnconf.h index 2c7fb8bb9..225d6d2ad 100644 --- a/coreneuron/nrnconf.h +++ b/coreneuron/nrnconf.h @@ -9,6 +9,8 @@ #ifndef _H_NRNCONF_ #define _H_NRNCONF_ +#include "coreneuron/utils/offload.hpp" + #include #include #include @@ -32,14 +34,16 @@ using Symbol = char; #define VECTORIZE 1 // extern variables require acc declare +nrn_pragma_omp(declare target) extern double celsius; -#pragma acc declare create(celsius) +nrn_pragma_acc(declare create(celsius)) extern double pi; -#pragma acc declare create(pi) +nrn_pragma_acc(declare create(pi)) extern int secondorder; -#pragma acc declare create(secondorder) +nrn_pragma_acc(declare create(secondorder)) +nrn_pragma_omp(end declare target) extern double t, dt; extern int rev_dt; diff --git a/coreneuron/permute/cellorder.cpp b/coreneuron/permute/cellorder.cpp index 2b6167f57..fd784fe38 100644 --- a/coreneuron/permute/cellorder.cpp +++ b/coreneuron/permute/cellorder.cpp @@ -6,8 +6,6 @@ # ============================================================================= */ -#include - #include "coreneuron/nrnconf.h" #include "coreneuron/sim/multicore.hpp" #include "coreneuron/utils/nrn_assert.h" @@ -15,6 +13,7 @@ #include "coreneuron/network/tnode.hpp" #include "coreneuron/utils/lpt.hpp" #include "coreneuron/utils/memory.h" +#include "coreneuron/utils/offload.hpp" #include "coreneuron/apps/corenrn_parameters.hpp" #include "coreneuron/permute/node_permute.h" // for print_quality @@ -22,6 +21,9 @@ #ifdef _OPENACC #include 
#endif + +#include + namespace coreneuron { int interleave_permute_type; InterleaveInfo* interleave_info; // nrn_nthread array @@ -488,8 +490,7 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid bool has_subtrees_to_compute = true; // clang-format off - - #pragma acc loop seq + nrn_pragma_acc(loop seq) for (; has_subtrees_to_compute; ) { // ncycle loop #if !defined(_OPENACC) // serial test, gpu does this in parallel @@ -500,9 +501,11 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid // what is the index int ip = GPU_PARENT(i); double p = GPU_A(i) / GPU_D(i); - #pragma acc atomic update + nrn_pragma_acc(atomic update) + nrn_pragma_omp(atomic update) GPU_D(ip) -= p * GPU_B(i); - #pragma acc atomic update + nrn_pragma_acc(atomic update) + nrn_pragma_omp(atomic update) GPU_RHS(ip) -= p * GPU_RHS(i); } #if !defined(_OPENACC) @@ -535,10 +538,7 @@ static void bksub_interleaved2(NrnThread* nt, #if !defined(_OPENACC) for (int i = root; i < lastroot; i += 1) { #else - // clang-format off - - #pragma acc loop seq - // clang-format on + nrn_pragma_acc(loop seq) for (int i = root; i < lastroot; i += warpsize) { #endif GPU_RHS(i) /= GPU_D(i); // the root @@ -596,21 +596,17 @@ void solve_interleaved2(int ith) { int* strides = ii.stride; // sum ncycles of these (bad since ncompart/warpsize) int* rootbegin = ii.firstnode; // nwarp+1 of these int* nodebegin = ii.lastnode; // nwarp+1 of these -#ifdef _OPENACC +#if defined(_OPENACC) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) int nstride = stridedispl[nwarp]; - int stream_id = nt->stream_id; -#endif - -#ifdef _OPENACC - // clang-format off - - #pragma acc parallel loop gang vector vector_length(warpsize) \ - present(nt[0:1], strides[0:nstride], \ - ncycles[0:nwarp], stridedispl[0:nwarp+1], \ - rootbegin[0:nwarp+1], nodebegin[0:nwarp+1]) \ - if (nt->compute_gpu) async(stream_id) -// clang-format on #endif + nrn_pragma_acc(parallel loop gang vector vector_length( + warpsize) present(nt [0:1], + strides [0:nstride], + ncycles [0:nwarp], + stridedispl [0:nwarp + 1], + rootbegin [0:nwarp + 1], + nodebegin [0:nwarp + 1]) if (nt->compute_gpu) async(nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) for (int icore = 0; icore < ncore; ++icore) { int iwarp = icore / warpsize; // figure out the >> value int ic = icore & (warpsize - 1); // figure out the & mask @@ -629,9 +625,7 @@ void solve_interleaved2(int ith) { } // serial test mode #endif } -#ifdef _OPENACC -#pragma acc wait(nt->stream_id) -#endif + nrn_pragma_acc(wait(nt->stream_id)) #ifdef _OPENACC } #endif @@ -656,28 +650,23 @@ void solve_interleaved1(int ith) { int* firstnode = ii.firstnode; int* lastnode = ii.lastnode; int* cellsize = ii.cellsize; -#if _OPENACC - int stream_id = nt->stream_id; -#endif -#ifdef _OPENACC - // clang-format off - - #pragma acc parallel loop present( \ - nt[0:1], stride[0:nstride], \ - firstnode[0:ncell], lastnode[0:ncell], \ - cellsize[0:ncell]) if (nt->compute_gpu) \ - async(stream_id) -// clang-format on -#endif + // OL211123: can we preserve the error checking behaviour of OpenACC's + // present clause with OpenMP? It is a bug if these data are not present, + // so diagnostics are helpful... 
+ nrn_pragma_acc(parallel loop present(nt [0:1], + stride [0:nstride], + firstnode [0:ncell], + lastnode [0:ncell], + cellsize [0:ncell]) if (nt->compute_gpu) + async(nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) for (int icell = 0; icell < ncell; ++icell) { int icellsize = cellsize[icell]; triang_interleaved(nt, icell, icellsize, nstride, stride, lastnode); bksub_interleaved(nt, icell, icellsize, nstride, stride, firstnode); } -#ifdef _OPENACC -#pragma acc wait(stream_id) -#endif + nrn_pragma_acc(wait(nt->stream_id)) } void solve_interleaved(int ith) { diff --git a/coreneuron/sim/fadvance_core.cpp b/coreneuron/sim/fadvance_core.cpp index 8f4ac14cf..a46f83535 100644 --- a/coreneuron/sim/fadvance_core.cpp +++ b/coreneuron/sim/fadvance_core.cpp @@ -78,10 +78,11 @@ void dt2thread(double adt) { /* copied from nrnoc/fadvance.c */ } else { nt->cj = 1.0 / dt; } + nrn_pragma_acc(update device(nt->_t, nt->_dt, nt->cj) + async(nt->stream_id) if (nt->compute_gpu)) // clang-format off - - #pragma acc update device(nt->_t, nt->_dt, nt->cj) \ - async(nt->stream_id) if(nt->compute_gpu) + nrn_pragma_omp(target update to(nt->_t, nt->_dt, nt->cj) + if(nt->compute_gpu)) // clang-format on } } @@ -201,35 +202,24 @@ void update(NrnThread* _nt) { double* vec_v = &(VEC_V(0)); double* vec_rhs = &(VEC_RHS(0)); int i2 = _nt->end; -#if defined(_OPENACC) - int stream_id = _nt->stream_id; -#endif /* do not need to worry about linmod or extracellular*/ if (secondorder) { - // clang-format off - - #pragma acc parallel loop present( \ - vec_v[0:i2], vec_rhs[0:i2]) \ - if (_nt->compute_gpu) async(stream_id) - // clang-format on + nrn_pragma_acc(parallel loop present(vec_v [0:i2], vec_rhs [0:i2]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int i = 0; i < i2; ++i) { vec_v[i] += 2. 
* vec_rhs[i]; } } else { - // clang-format off - - #pragma acc parallel loop present( \ - vec_v[0:i2], vec_rhs[0:i2]) \ - if (_nt->compute_gpu) async(stream_id) - // clang-format on + nrn_pragma_acc(parallel loop present(vec_v [0:i2], vec_rhs [0:i2]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int i = 0; i < i2; ++i) { vec_v[i] += vec_rhs[i]; } } - // update_matrix_to_gpu(_nt); - if (_nt->tml) { assert(_nt->tml->index == CAP); nrn_cur_capacitance(_nt, _nt->tml->ml, _nt->tml->index); @@ -304,10 +294,9 @@ void nrncore2nrn_send_values(NrnThread* nth) { // make sure we do not overflow the `varrays` buffers assert(vs < tr->bsize); - // clang-format off - - #pragma acc parallel loop present(tr[0:1]) if(nth->compute_gpu) async(nth->stream_id) - // clang-format on + nrn_pragma_acc(parallel loop present(tr [0:1]) if (nth->compute_gpu) + async(nth->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(nth->compute_gpu)) for (int i = 0; i < tr->n_trajec; ++i) { tr->varrays[i][vs] = *tr->gather[i]; } @@ -326,12 +315,12 @@ void nrncore2nrn_send_values(NrnThread* nth) { // https://github.com/BlueBrain/CoreNeuron/issues/611 for (int i = 0; i < tr->n_trajec; ++i) { double* gather_i = tr->gather[i]; - // clang-format off - - #pragma acc update self(gather_i[0:1]) if(nth->compute_gpu) async(nth->stream_id) + nrn_pragma_acc(update self(gather_i [0:1]) if (nth->compute_gpu) + async(nth->stream_id)) + nrn_pragma_omp(target update from(gather_i [0:1]) if (nth->compute_gpu)) } - #pragma acc wait(nth->stream_id) - // clang-format on + nrn_pragma_acc(wait(nth->stream_id)) + nrn_pragma_omp(taskwait) for (int i = 0; i < tr->n_trajec; ++i) { *(tr->scatter[i]) = *(tr->gather[i]); } @@ -351,15 +340,11 @@ static void* nrn_fixed_step_thread(NrnThread* nth) { nth->_t += .5 * nth->_dt; if (nth->ncell) { -#if defined(_OPENACC) - int stream_id = nth->stream_id; - /*@todo: do we need to update nth->_t on GPU: Yes (Michael, but can launch kernel) */ - // clang-format off - - #pragma acc update device(nth->_t) if (nth->compute_gpu) async(stream_id) - #pragma acc wait(stream_id) -// clang-format on -#endif + /*@todo: do we need to update nth->_t on GPU: Yes (Michael, but can + launch kernel) */ + nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->stream_id)) + nrn_pragma_acc(wait(nth->stream_id)) + nrn_pragma_omp(target update to(nth->_t) if (nth->compute_gpu)) fixed_play_continuous(nth); { @@ -393,12 +378,9 @@ void* nrn_fixed_step_lastpart(NrnThread* nth) { if (nth->ncell) { /*@todo: do we need to update nth->_t on GPU */ - // clang-format off - - #pragma acc update device(nth->_t) if (nth->compute_gpu) async(nth->stream_id) - #pragma acc wait(nth->stream_id) - // clang-format on - + nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->stream_id)) + nrn_pragma_acc(wait(nth->stream_id)) + nrn_pragma_omp(target update to(nth->_t) if (nth->compute_gpu)) fixed_play_continuous(nth); nonvint(nth); nrncore2nrn_send_values(nth); diff --git a/coreneuron/sim/fast_imem.cpp b/coreneuron/sim/fast_imem.cpp index 8dfb0cd76..1218b7967 100644 --- a/coreneuron/sim/fast_imem.cpp +++ b/coreneuron/sim/fast_imem.cpp @@ -50,10 +50,10 @@ void nrn_calc_fast_imem(NrnThread* nt) { double* fast_imem_d = nt->nrn_fast_imem->nrn_sav_d; double* fast_imem_rhs = nt->nrn_fast_imem->nrn_sav_rhs; -#pragma acc parallel loop present(vec_rhs, \ - vec_area, \ - fast_imem_d, \ - fast_imem_rhs) if (nt->compute_gpu) 
async(nt->stream_id) + nrn_pragma_acc( + parallel loop present(vec_rhs, vec_area, fast_imem_d, fast_imem_rhs) if (nt->compute_gpu) + async(nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) for (int i = i1; i < i3; ++i) { fast_imem_rhs[i] = (fast_imem_d[i] * vec_rhs[i] + fast_imem_rhs[i]) * vec_area[i] * 0.01; } @@ -68,8 +68,9 @@ void nrn_calc_fast_imem_init(NrnThread* nt) { double* vec_area = nt->_actual_area; double* fast_imem_rhs = nt->nrn_fast_imem->nrn_sav_rhs; -#pragma acc parallel loop present(vec_rhs, vec_area, fast_imem_rhs) if (nt->compute_gpu) \ - async(nt->stream_id) + nrn_pragma_acc(parallel loop present(vec_rhs, vec_area, fast_imem_rhs) if (nt->compute_gpu) + async(nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) for (int i = i1; i < i3; ++i) { fast_imem_rhs[i] = (vec_rhs[i] + fast_imem_rhs[i]) * vec_area[i] * 0.01; } diff --git a/coreneuron/sim/finitialize.cpp b/coreneuron/sim/finitialize.cpp index 1ae79a92f..d711ae247 100644 --- a/coreneuron/sim/finitialize.cpp +++ b/coreneuron/sim/finitialize.cpp @@ -53,12 +53,9 @@ void nrn_finitialize(int setv, double v) { if (setv) { for (auto _nt = nrn_threads; _nt < nrn_threads + nrn_nthread; ++_nt) { double* vec_v = &(VEC_V(0)); - // clang-format off - - #pragma acc parallel loop present( \ - _nt[0:1], vec_v[0:_nt->end]) \ - if (_nt->compute_gpu) - // clang-format on + nrn_pragma_acc( + parallel loop present(_nt [0:1], vec_v [0:_nt->end]) if (_nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int i = 0; i < _nt->end; ++i) { vec_v[i] = v; } diff --git a/coreneuron/sim/solve_core.cpp b/coreneuron/sim/solve_core.cpp index a24c8360f..60ba2b660 100644 --- a/coreneuron/sim/solve_core.cpp +++ b/coreneuron/sim/solve_core.cpp @@ -24,7 +24,9 @@ void nrn_solve_minimal(NrnThread* _nt) { } } -/** TODO loops are executed seq in OpenACC just for debugging, remove it! */ +/** @todo OpenACC GPU offload is sequential/slow. Because --cell-permute=0 and + * --gpu is forbidden anyway, no OpenMP target offload equivalent is implemented. 
+ */ /* triangularization of the matrix equations */ static void triang(NrnThread* _nt) { @@ -37,17 +39,9 @@ static void triang(NrnThread* _nt) { double* vec_rhs = &(VEC_RHS(0)); int* parent_index = _nt->_v_parent_index; -#if defined(_OPENACC) - int stream_id = _nt->stream_id; -#endif - /** @todo: just for benchmarking, otherwise produces wrong results */ - // clang-format off - - #pragma acc parallel loop seq present( \ - vec_a[0:i3], vec_b[0:i3], vec_d[0:i3], \ - vec_rhs[0:i3], parent_index[0:i3]) \ - async(stream_id) if (_nt->compute_gpu) - // clang-format on + nrn_pragma_acc(parallel loop seq present( + vec_a [0:i3], vec_b [0:i3], vec_d [0:i3], vec_rhs [0:i3], parent_index [0:i3]) + async(_nt->stream_id) if (_nt->compute_gpu)) for (int i = i3 - 1; i >= i2; --i) { double p = vec_a[i] / vec_d[i]; vec_d[parent_index[i]] -= p * vec_b[i]; @@ -66,33 +60,22 @@ static void bksub(NrnThread* _nt) { double* vec_rhs = &(VEC_RHS(0)); int* parent_index = _nt->_v_parent_index; -#if defined(_OPENACC) - int stream_id = _nt->stream_id; -#endif - /** @todo: just for benchmarking, otherwise produces wrong results */ - // clang-format off - - #pragma acc parallel loop seq present( \ - vec_d[0:i2], vec_rhs[0:i2]) \ - async(stream_id) if (_nt->compute_gpu) - // clang-format on + nrn_pragma_acc(parallel loop seq present(vec_d [0:i2], vec_rhs [0:i2]) + async(_nt->stream_id) if (_nt->compute_gpu)) for (int i = i1; i < i2; ++i) { vec_rhs[i] /= vec_d[i]; } - /** @todo: just for benchmarking, otherwise produces wrong results */ - // clang-format off - - #pragma acc parallel loop seq present( \ - vec_b[0:i3], vec_d[0:i3], vec_rhs[0:i3], \ - parent_index[0:i3]) async(stream_id) \ - if (_nt->compute_gpu) + nrn_pragma_acc( + parallel loop seq present(vec_b [0:i3], vec_d [0:i3], vec_rhs [0:i3], parent_index [0:i3]) + async(_nt->stream_id) if (_nt->compute_gpu)) for (int i = i2; i < i3; ++i) { vec_rhs[i] -= vec_b[i] * vec_rhs[parent_index[i]]; vec_rhs[i] /= vec_d[i]; } - #pragma acc wait(stream_id) - // clang-format on + if (_nt->compute_gpu) { + nrn_pragma_acc(wait(_nt->stream_id)) + } } } // namespace coreneuron diff --git a/coreneuron/sim/treeset_core.cpp b/coreneuron/sim/treeset_core.cpp index 943980bcd..bb92d2ab1 100644 --- a/coreneuron/sim/treeset_core.cpp +++ b/coreneuron/sim/treeset_core.cpp @@ -32,12 +32,9 @@ static void nrn_rhs(NrnThread* _nt) { double* vec_v = &(VEC_V(0)); int* parent_index = _nt->_v_parent_index; - // clang-format off - - #pragma acc parallel loop present( \ - vec_rhs[0:i3], vec_d[0:i3]) \ - if (_nt->compute_gpu) async(_nt->stream_id) - // clang-format on + nrn_pragma_acc(parallel loop present(vec_rhs [0:i3], vec_d [0:i3]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int i = i1; i < i3; ++i) { vec_rhs[i] = 0.; vec_d[i] = 0.; @@ -46,9 +43,10 @@ static void nrn_rhs(NrnThread* _nt) { if (_nt->nrn_fast_imem) { double* fast_imem_d = _nt->nrn_fast_imem->nrn_sav_d; double* fast_imem_rhs = _nt->nrn_fast_imem->nrn_sav_rhs; -#pragma acc parallel loop present(fast_imem_d [i1:i3], \ - fast_imem_rhs [i1:i3]) if (_nt->compute_gpu) \ - async(_nt->stream_id) + nrn_pragma_acc( + parallel loop present(fast_imem_d [i1:i3], fast_imem_rhs [i1:i3]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int i = i1; i < i3; ++i) { fast_imem_d[i] = 0.; fast_imem_rhs[i] = 0.; @@ -76,7 +74,9 @@ static void nrn_rhs(NrnThread* _nt) { so here we transform so 
it only has membrane current contribution */ double* p = _nt->nrn_fast_imem->nrn_sav_rhs; -#pragma acc parallel loop present(p, vec_rhs) if (_nt->compute_gpu) async(_nt->stream_id) + nrn_pragma_acc(parallel loop present(p, vec_rhs) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int i = i1; i < i3; ++i) { p[i] -= vec_rhs[i]; } @@ -86,22 +86,24 @@ static void nrn_rhs(NrnThread* _nt) { The extracellular mechanism contribution is already done. rhs += ai_j*(vi_j - vi) */ - // clang-format off - - #pragma acc parallel loop present( \ - vec_rhs[0:i3], vec_d[0:i3], \ - vec_a[0:i3], vec_b[0:i3], \ - vec_v[0:i3], parent_index[0:i3]) \ - if (_nt->compute_gpu) async(_nt->stream_id) + nrn_pragma_acc(parallel loop present(vec_rhs [0:i3], + vec_d [0:i3], + vec_a [0:i3], + vec_b [0:i3], + vec_v [0:i3], + parent_index [0:i3]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int i = i2; i < i3; ++i) { double dv = vec_v[parent_index[i]] - vec_v[i]; /* our connection coefficients are negative so */ - #pragma acc atomic update + nrn_pragma_acc(atomic update) + nrn_pragma_omp(atomic update) vec_rhs[i] -= vec_b[i] * dv; - #pragma acc atomic update + nrn_pragma_acc(atomic update) + nrn_pragma_omp(atomic update) vec_rhs[parent_index[i]] += vec_a[i] * dv; } - // clang-format on } /* calculate left hand side of @@ -150,34 +152,32 @@ static void nrn_lhs(NrnThread* _nt) { so here we transform so it only has membrane current contribution */ double* p = _nt->nrn_fast_imem->nrn_sav_d; -#pragma acc parallel loop present(p, vec_d) if (_nt->compute_gpu) async(_nt->stream_id) + nrn_pragma_acc(parallel loop present(p, vec_d) if (_nt->compute_gpu) async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int i = i1; i < i3; ++i) { p[i] += vec_d[i]; } } /* now add the axial currents */ - // clang-format off - - #pragma acc parallel loop present( \ - vec_d[0:i3], vec_a[0:i3], \ - vec_b[0:i3], parent_index[0:i3]) \ - if (_nt->compute_gpu) async(_nt->stream_id) + nrn_pragma_acc(parallel loop present( + vec_d [0:i3], vec_a [0:i3], vec_b [0:i3], parent_index [0:i3]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int i = i2; i < i3; ++i) { - #pragma acc atomic update + nrn_pragma_acc(atomic update) + nrn_pragma_omp(atomic update) vec_d[i] -= vec_b[i]; - #pragma acc atomic update + nrn_pragma_acc(atomic update) + nrn_pragma_omp(atomic update) vec_d[parent_index[i]] -= vec_a[i]; } - // clang-format on } /* for the fixed step method */ void* setup_tree_matrix_minimal(NrnThread* _nt) { nrn_rhs(_nt); nrn_lhs(_nt); - // update_matrix_from_gpu(_nt); - return nullptr; } } // namespace coreneuron diff --git a/coreneuron/utils/memory.h b/coreneuron/utils/memory.h index 965c06e78..2f0e24458 100644 --- a/coreneuron/utils/memory.h +++ b/coreneuron/utils/memory.h @@ -115,8 +115,7 @@ auto allocate_unique(const Alloc& alloc, Args&&... args) { } // namespace coreneuron /// for gpu builds with unified memory support -/// OL210812: why do we include __CUDACC__ here? 
-#if (defined(__CUDACC__) || defined(CORENEURON_UNIFIED_MEMORY)) +#ifdef CORENEURON_UNIFIED_MEMORY #include diff --git a/coreneuron/utils/offload.hpp b/coreneuron/utils/offload.hpp new file mode 100644 index 000000000..d90cc10fd --- /dev/null +++ b/coreneuron/utils/offload.hpp @@ -0,0 +1,20 @@ +/* +# ============================================================================= +# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL +# +# See top-level LICENSE file for details. +# ============================================================================= +*/ +#pragma once +#define nrn_pragma_stringify(x) #x +#if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) +#define nrn_pragma_acc(x) +#define nrn_pragma_omp(x) _Pragma(nrn_pragma_stringify(omp x)) +#elif defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENACC) +#define nrn_pragma_acc(x) _Pragma(nrn_pragma_stringify(acc x)) +#define nrn_pragma_omp(x) +#else +#define nrn_pragma_acc(x) +#define nrn_pragma_omp(x) +#endif diff --git a/external/nmodl b/external/nmodl index 794b419f5..a60c5e903 160000 --- a/external/nmodl +++ b/external/nmodl @@ -1 +1 @@ -Subproject commit 794b419f5256f40efcdca1674f712a6e544c235a +Subproject commit a60c5e903126ad95cfe2bceb904d0efe83ba9d8a diff --git a/tests/unit/lfp/CMakeLists.txt b/tests/unit/lfp/CMakeLists.txt index 3e2ac8e80..ec795f178 100644 --- a/tests/unit/lfp/CMakeLists.txt +++ b/tests/unit/lfp/CMakeLists.txt @@ -22,3 +22,4 @@ set_target_properties(lfp_test_bin PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS OFF) target_compile_options(lfp_test_bin PRIVATE ${CORENEURON_BOOST_UNIT_TEST_COMPILE_FLAGS}) add_dependencies(lfp_test_bin nrniv-core) add_test(NAME lfp_test COMMAND ${TEST_EXEC_PREFIX} $) +set_tests_properties(lfp_test PROPERTIES ENVIRONMENT OMP_NUM_THREADS=1) From 02abf78c1ffd57130bccccbe2c325f6bedb33a3e Mon Sep 17 00:00:00 2001 From: Pramod Kumbhar Date: Thu, 9 Dec 2021 14:13:32 +0100 Subject: [PATCH 10/31] GPU data management using OpenACC as well as OpenMP API (#704) * Add wrapper functions for using OpenMP or OpenACC API * Add -mp=gpu in order to link gpu runtime with tests as well * Avoid copying VecPlay members twice otherwise association fails with OpenMP * IvocVect members t_ and y_ were copied twice * only discon_indices_ is pointer and hence that needs to be copied --- CMake/OpenAccHelper.cmake | 1 + coreneuron/gpu/nrn_acc_manager.cpp | 908 ++++++++++++++++------------- coreneuron/utils/vrecord.cpp | 9 +- 3 files changed, 523 insertions(+), 395 deletions(-) diff --git a/CMake/OpenAccHelper.cmake b/CMake/OpenAccHelper.cmake index c7f91a7c9..e8fa6738a 100644 --- a/CMake/OpenAccHelper.cmake +++ b/CMake/OpenAccHelper.cmake @@ -68,6 +68,7 @@ if(CORENRN_ENABLE_GPU) # for a region then prefer OpenMP. 
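# Illustrative sketch of what this buys (names taken from coreneuron/utils/offload.hpp
# added above; expansion shown here is an approximation, not part of the build files):
# in a GPU build with OpenMP enabled, the nrn_pragma_* macros expand roughly as
#   nrn_pragma_omp(target teams distribute parallel for simd)
#       -> _Pragma("omp target teams distribute parallel for simd")
#   nrn_pragma_acc(parallel loop)
#       -> (nothing)
# so annotated loops use the OpenMP offload backend and the OpenACC spellings compile away.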
   add_compile_definitions(CORENEURON_PREFER_OPENMP_OFFLOAD)
   string(APPEND NVHPC_ACC_COMP_FLAGS " -mp=gpu -Minfo=mp")
+  string(APPEND NVHPC_ACC_LINK_FLAGS " -mp=gpu")
 endif()
 # avoid PGI adding standard compliant "-A" flags
 set(CMAKE_CXX14_STANDARD_COMPILE_OPTION --c++14)
diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
index b249875dc..089b90848 100644
--- a/coreneuron/gpu/nrn_acc_manager.cpp
+++ b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -36,13 +36,66 @@
 #endif
 namespace coreneuron {
 extern InterleaveInfo* interleave_info;
-void copy_ivoc_vect_to_device(const IvocVect& iv, IvocVect& div);
+void copy_ivoc_vect_to_device(const IvocVect& iv, IvocVect& div, bool vector_copy_needed = false);
 void delete_ivoc_vect_from_device(IvocVect&);
 void nrn_ion_global_map_copyto_device();
 void nrn_ion_global_map_delete_from_device();
 void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay);
 void nrn_VecPlay_delete_from_device(NrnThread* nt);
+void* cnrn_gpu_copyin(void* h_ptr, std::size_t len) {
+#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
+    return acc_copyin(h_ptr, len);
+#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
+    auto host_id = omp_get_initial_device();
+    auto device_id = omp_get_default_device();
+    auto* d_ptr = omp_target_alloc(len, device_id);
+    nrn_assert(d_ptr != nullptr);
+    nrn_assert(omp_target_memcpy(d_ptr, h_ptr, len, 0, 0, device_id, host_id) == 0);
+    nrn_assert(omp_target_associate_ptr(h_ptr, d_ptr, len, 0, device_id) == 0);
+    return d_ptr;
+#else
+    throw std::runtime_error("cnrn_gpu_copyin() not implemented without OpenACC/OpenMP and gpu build");
+#endif
+}
+
+void cnrn_memcpy_to_device(void* d_ptr, void* h_ptr, size_t len) {
+#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
+    acc_memcpy_to_device(d_ptr, h_ptr, len);
+#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
+    auto host_id = omp_get_initial_device();
+    auto device_id = omp_get_default_device();
+    omp_target_memcpy(d_ptr, h_ptr, len, 0, 0, device_id, host_id);
+#else
+    throw std::runtime_error("cnrn_memcpy_to_device() not implemented without OpenACC/OpenMP and gpu build");
+#endif
+}
+
+void cnrn_target_delete(void* h_ptr, size_t len) {
+#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
+    acc_delete(h_ptr, len);
+#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
+    (void)len;
+    auto device_id = omp_get_default_device();
+    omp_target_disassociate_ptr(h_ptr, device_id);
+    auto* d_ptr = omp_get_mapped_ptr(h_ptr, device_id);
+    omp_target_free(d_ptr, device_id);
+#else
+    throw std::runtime_error("cnrn_target_delete() not implemented without OpenACC/OpenMP and gpu build");
+#endif
+}
+
+void* cnrn_target_deviceptr(void* h_ptr) {
+#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
+    return acc_deviceptr(h_ptr);
+#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
+    auto device_id = omp_get_default_device();
+    return omp_get_mapped_ptr(h_ptr, device_id);
+#else
+    throw std::runtime_error("cnrn_target_deviceptr() not implemented without OpenACC/OpenMP and gpu build");
+#endif
+}
+
 /* note: threads here are corresponding to global nrn_threads array */
 void
setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { #ifdef _OPENACC @@ -61,13 +114,13 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { NrnThread* nt = threads + i; // NrnThread on host if (nt->n_presyn) { - PreSyn* d_presyns = (PreSyn*) acc_copyin(nt->presyns, sizeof(PreSyn) * nt->n_presyn); + PreSyn* d_presyns = (PreSyn*) cnrn_gpu_copyin(nt->presyns, sizeof(PreSyn) * nt->n_presyn); } if (nt->n_vecplay) { /* copy VecPlayContinuous instances */ /** just empty containers */ - void** d_vecplay = (void**) acc_copyin(nt->_vecplay, sizeof(void*) * nt->n_vecplay); + void** d_vecplay = (void**) cnrn_gpu_copyin(nt->_vecplay, sizeof(void*) * nt->n_vecplay); // note: we are using unified memory for NrnThread. Once VecPlay is copied to gpu, // we dont want to update nt->vecplay because it will also set gpu pointer of vecplay // inside nt on cpu (due to unified memory). @@ -85,7 +138,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { * find * corresponding NrnThread using Point_process in NET_RECEIVE block */ - NrnThread* d_threads = (NrnThread*) acc_copyin(threads, sizeof(NrnThread) * nthreads); + NrnThread* d_threads = (NrnThread*) cnrn_gpu_copyin(threads, sizeof(NrnThread) * nthreads); if (interleave_info == nullptr) { printf("\n Warning: No permutation data? Required for linear algebra!"); @@ -104,7 +157,8 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { /* -- copy _data to device -- */ /*copy all double data for thread */ - d__data = (double*) acc_copyin(nt->_data, nt->_ndata * sizeof(double)); + d__data = (double*) cnrn_gpu_copyin(nt->_data, nt->_ndata * sizeof(double)); + /* Here is the example of using OpenACC data enter/exit * Remember that we are not allowed to use nt->_data but we have to use: @@ -114,7 +168,8 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { */ /*update d_nt._data to point to device copy */ - acc_memcpy_to_device(&(d_nt->_data), &d__data, sizeof(double*)); + cnrn_memcpy_to_device(&(d_nt->_data), &d__data, sizeof(double*)); + auto host_id = omp_get_initial_device(); /* -- setup rhs, d, a, b, v, node_aread to point to device copy -- */ double* dptr; @@ -123,36 +178,36 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { int ne = nrn_soa_padded_size(nt->end, 0); dptr = d__data + 0 * ne; - acc_memcpy_to_device(&(d_nt->_actual_rhs), &(dptr), sizeof(double*)); + cnrn_memcpy_to_device(&(d_nt->_actual_rhs), &(dptr), sizeof(double*)); dptr = d__data + 1 * ne; - acc_memcpy_to_device(&(d_nt->_actual_d), &(dptr), sizeof(double*)); + cnrn_memcpy_to_device(&(d_nt->_actual_d), &(dptr), sizeof(double*)); dptr = d__data + 2 * ne; - acc_memcpy_to_device(&(d_nt->_actual_a), &(dptr), sizeof(double*)); + cnrn_memcpy_to_device(&(d_nt->_actual_a), &(dptr), sizeof(double*)); dptr = d__data + 3 * ne; - acc_memcpy_to_device(&(d_nt->_actual_b), &(dptr), sizeof(double*)); + cnrn_memcpy_to_device(&(d_nt->_actual_b), &(dptr), sizeof(double*)); dptr = d__data + 4 * ne; - acc_memcpy_to_device(&(d_nt->_actual_v), &(dptr), sizeof(double*)); + cnrn_memcpy_to_device(&(d_nt->_actual_v), &(dptr), sizeof(double*)); dptr = d__data + 5 * ne; - acc_memcpy_to_device(&(d_nt->_actual_area), &(dptr), sizeof(double*)); + cnrn_memcpy_to_device(&(d_nt->_actual_area), &(dptr), sizeof(double*)); if (nt->_actual_diam) { dptr = d__data + 6 * ne; - acc_memcpy_to_device(&(d_nt->_actual_diam), &(dptr), sizeof(double*)); + cnrn_memcpy_to_device(&(d_nt->_actual_diam), &(dptr), sizeof(double*)); } - int* 
d_v_parent_index = (int*) acc_copyin(nt->_v_parent_index, nt->end * sizeof(int)); - acc_memcpy_to_device(&(d_nt->_v_parent_index), &(d_v_parent_index), sizeof(int*)); + int* d_v_parent_index = (int*) cnrn_gpu_copyin(nt->_v_parent_index, nt->end * sizeof(int)); + cnrn_memcpy_to_device(&(d_nt->_v_parent_index), &(d_v_parent_index), sizeof(int*)); /* nt._ml_list is used in NET_RECEIVE block and should have valid membrane list id*/ - Memb_list** d_ml_list = (Memb_list**) acc_copyin(nt->_ml_list, + Memb_list** d_ml_list = (Memb_list**) cnrn_gpu_copyin(nt->_ml_list, corenrn.get_memb_funcs().size() * sizeof(Memb_list*)); - acc_memcpy_to_device(&(d_nt->_ml_list), &(d_ml_list), sizeof(Memb_list**)); + cnrn_memcpy_to_device(&(d_nt->_ml_list), &(d_ml_list), sizeof(Memb_list**)); /* -- copy NrnThreadMembList list ml to device -- */ @@ -163,26 +218,26 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { for (auto tml = nt->tml; tml; tml = tml->next) { /*copy tml to device*/ /*QUESTIONS: does tml will point to nullptr as in host ? : I assume so!*/ - auto d_tml = (NrnThreadMembList*) acc_copyin(tml, sizeof(NrnThreadMembList)); + auto d_tml = (NrnThreadMembList*) cnrn_gpu_copyin(tml, sizeof(NrnThreadMembList)); /*first tml is pointed by nt */ if (first_tml) { - acc_memcpy_to_device(&(d_nt->tml), &d_tml, sizeof(NrnThreadMembList*)); + cnrn_memcpy_to_device(&(d_nt->tml), &d_tml, sizeof(NrnThreadMembList*)); first_tml = false; } else { /*rest of tml forms linked list */ - acc_memcpy_to_device(&(d_last_tml->next), &d_tml, sizeof(NrnThreadMembList*)); + cnrn_memcpy_to_device(&(d_last_tml->next), &d_tml, sizeof(NrnThreadMembList*)); } // book keeping for linked-list d_last_tml = d_tml; /* now for every tml, there is a ml. copy that and setup pointer */ - auto d_ml = (Memb_list*) acc_copyin(tml->ml, sizeof(Memb_list)); - acc_memcpy_to_device(&(d_tml->ml), &d_ml, sizeof(Memb_list*)); + auto d_ml = (Memb_list*) cnrn_gpu_copyin(tml->ml, sizeof(Memb_list)); + cnrn_memcpy_to_device(&(d_tml->ml), &d_ml, sizeof(Memb_list*)); /* setup nt._ml_list */ - acc_memcpy_to_device(&(d_ml_list[tml->index]), &d_ml, sizeof(Memb_list*)); + cnrn_memcpy_to_device(&(d_ml_list[tml->index]), &d_ml, sizeof(Memb_list*)); int type = tml->index; int n = tml->ml->nodecount; @@ -191,26 +246,26 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { int is_art = corenrn.get_is_artificial()[type]; // get device pointer for corresponding mechanism data - dptr = (double*) acc_deviceptr(tml->ml->data); - acc_memcpy_to_device(&(d_ml->data), &(dptr), sizeof(double*)); + dptr = (double*) cnrn_target_deviceptr(tml->ml->data); + cnrn_memcpy_to_device(&(d_ml->data), &(dptr), sizeof(double*)); if (!is_art) { - int* d_nodeindices = (int*) acc_copyin(tml->ml->nodeindices, sizeof(int) * n); - acc_memcpy_to_device(&(d_ml->nodeindices), &d_nodeindices, sizeof(int*)); + int* d_nodeindices = (int*) cnrn_gpu_copyin(tml->ml->nodeindices, sizeof(int) * n); + cnrn_memcpy_to_device(&(d_ml->nodeindices), &d_nodeindices, sizeof(int*)); } if (szdp) { int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; - int* d_pdata = (int*) acc_copyin(tml->ml->pdata, sizeof(int) * pcnt); - acc_memcpy_to_device(&(d_ml->pdata), &d_pdata, sizeof(int*)); + int* d_pdata = (int*) cnrn_gpu_copyin(tml->ml->pdata, sizeof(int) * pcnt); + cnrn_memcpy_to_device(&(d_ml->pdata), &d_pdata, sizeof(int*)); } int ts = corenrn.get_memb_funcs()[type].thread_size_; if (ts) { - ThreadDatum* td = (ThreadDatum*) acc_copyin(tml->ml->_thread, + ThreadDatum* td = (ThreadDatum*) 
cnrn_gpu_copyin(tml->ml->_thread, ts * sizeof(ThreadDatum)); - acc_memcpy_to_device(&(d_ml->_thread), &td, sizeof(ThreadDatum*)); + cnrn_memcpy_to_device(&(d_ml->_thread), &td, sizeof(ThreadDatum*)); } NetReceiveBuffer_t *nrb, *d_nrb; @@ -222,28 +277,28 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { // if net receive buffer exist for mechanism if (nrb) { - d_nrb = (NetReceiveBuffer_t*) acc_copyin(nrb, sizeof(NetReceiveBuffer_t)); - acc_memcpy_to_device(&(d_ml->_net_receive_buffer), + d_nrb = (NetReceiveBuffer_t*) cnrn_gpu_copyin(nrb, sizeof(NetReceiveBuffer_t)); + cnrn_memcpy_to_device(&(d_ml->_net_receive_buffer), &d_nrb, sizeof(NetReceiveBuffer_t*)); - d_pnt_index = (int*) acc_copyin(nrb->_pnt_index, sizeof(int) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index, sizeof(int*)); + d_pnt_index = (int*) cnrn_gpu_copyin(nrb->_pnt_index, sizeof(int) * nrb->_size); + cnrn_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index, sizeof(int*)); - d_weight_index = (int*) acc_copyin(nrb->_weight_index, sizeof(int) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index, sizeof(int*)); + d_weight_index = (int*) cnrn_gpu_copyin(nrb->_weight_index, sizeof(int) * nrb->_size); + cnrn_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index, sizeof(int*)); - d_nrb_t = (double*) acc_copyin(nrb->_nrb_t, sizeof(double) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t, sizeof(double*)); + d_nrb_t = (double*) cnrn_gpu_copyin(nrb->_nrb_t, sizeof(double) * nrb->_size); + cnrn_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t, sizeof(double*)); - d_nrb_flag = (double*) acc_copyin(nrb->_nrb_flag, sizeof(double) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag, sizeof(double*)); + d_nrb_flag = (double*) cnrn_gpu_copyin(nrb->_nrb_flag, sizeof(double) * nrb->_size); + cnrn_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag, sizeof(double*)); - d_displ = (int*) acc_copyin(nrb->_displ, sizeof(int) * (nrb->_size + 1)); - acc_memcpy_to_device(&(d_nrb->_displ), &d_displ, sizeof(int*)); + d_displ = (int*) cnrn_gpu_copyin(nrb->_displ, sizeof(int) * (nrb->_size + 1)); + cnrn_memcpy_to_device(&(d_nrb->_displ), &d_displ, sizeof(int*)); - d_nrb_index = (int*) acc_copyin(nrb->_nrb_index, sizeof(int) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index, sizeof(int*)); + d_nrb_index = (int*) cnrn_gpu_copyin(nrb->_nrb_index, sizeof(int) * nrb->_size); + cnrn_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index, sizeof(int*)); } /* copy NetSendBuffer_t on to GPU */ @@ -255,26 +310,26 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { int* d_iptr; double* d_dptr; - d_nsb = (NetSendBuffer_t*) acc_copyin(nsb, sizeof(NetSendBuffer_t)); - acc_memcpy_to_device(&(d_ml->_net_send_buffer), &d_nsb, sizeof(NetSendBuffer_t*)); + d_nsb = (NetSendBuffer_t*) cnrn_gpu_copyin(nsb, sizeof(NetSendBuffer_t)); + cnrn_memcpy_to_device(&(d_ml->_net_send_buffer), &d_nsb, sizeof(NetSendBuffer_t*)); - d_iptr = (int*) acc_copyin(nsb->_sendtype, sizeof(int) * nsb->_size); - acc_memcpy_to_device(&(d_nsb->_sendtype), &d_iptr, sizeof(int*)); + d_iptr = (int*) cnrn_gpu_copyin(nsb->_sendtype, sizeof(int) * nsb->_size); + cnrn_memcpy_to_device(&(d_nsb->_sendtype), &d_iptr, sizeof(int*)); - d_iptr = (int*) acc_copyin(nsb->_vdata_index, sizeof(int) * nsb->_size); - acc_memcpy_to_device(&(d_nsb->_vdata_index), &d_iptr, sizeof(int*)); + d_iptr = (int*) cnrn_gpu_copyin(nsb->_vdata_index, sizeof(int) * nsb->_size); + 
cnrn_memcpy_to_device(&(d_nsb->_vdata_index), &d_iptr, sizeof(int*)); - d_iptr = (int*) acc_copyin(nsb->_pnt_index, sizeof(int) * nsb->_size); - acc_memcpy_to_device(&(d_nsb->_pnt_index), &d_iptr, sizeof(int*)); + d_iptr = (int*) cnrn_gpu_copyin(nsb->_pnt_index, sizeof(int) * nsb->_size); + cnrn_memcpy_to_device(&(d_nsb->_pnt_index), &d_iptr, sizeof(int*)); - d_iptr = (int*) acc_copyin(nsb->_weight_index, sizeof(int) * nsb->_size); - acc_memcpy_to_device(&(d_nsb->_weight_index), &d_iptr, sizeof(int*)); + d_iptr = (int*) cnrn_gpu_copyin(nsb->_weight_index, sizeof(int) * nsb->_size); + cnrn_memcpy_to_device(&(d_nsb->_weight_index), &d_iptr, sizeof(int*)); - d_dptr = (double*) acc_copyin(nsb->_nsb_t, sizeof(double) * nsb->_size); - acc_memcpy_to_device(&(d_nsb->_nsb_t), &d_dptr, sizeof(double*)); + d_dptr = (double*) cnrn_gpu_copyin(nsb->_nsb_t, sizeof(double) * nsb->_size); + cnrn_memcpy_to_device(&(d_nsb->_nsb_t), &d_dptr, sizeof(double*)); - d_dptr = (double*) acc_copyin(nsb->_nsb_flag, sizeof(double) * nsb->_size); - acc_memcpy_to_device(&(d_nsb->_nsb_flag), &d_dptr, sizeof(double*)); + d_dptr = (double*) cnrn_gpu_copyin(nsb->_nsb_flag, sizeof(double) * nsb->_size); + cnrn_memcpy_to_device(&(d_nsb->_nsb_flag), &d_dptr, sizeof(double*)); } } @@ -284,28 +339,28 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); /* copy shadow_rhs to device and fix-up the pointer */ - d_shadow_ptr = (double*) acc_copyin(nt->_shadow_rhs, pcnt * sizeof(double)); - acc_memcpy_to_device(&(d_nt->_shadow_rhs), &d_shadow_ptr, sizeof(double*)); + d_shadow_ptr = (double*) cnrn_gpu_copyin(nt->_shadow_rhs, pcnt * sizeof(double)); + cnrn_memcpy_to_device(&(d_nt->_shadow_rhs), &d_shadow_ptr, sizeof(double*)); /* copy shadow_d to device and fix-up the pointer */ - d_shadow_ptr = (double*) acc_copyin(nt->_shadow_d, pcnt * sizeof(double)); - acc_memcpy_to_device(&(d_nt->_shadow_d), &d_shadow_ptr, sizeof(double*)); + d_shadow_ptr = (double*) cnrn_gpu_copyin(nt->_shadow_d, pcnt * sizeof(double)); + cnrn_memcpy_to_device(&(d_nt->_shadow_d), &d_shadow_ptr, sizeof(double*)); } /* Fast membrane current calculation struct */ if (nt->nrn_fast_imem) { auto* d_fast_imem = reinterpret_cast( - acc_copyin(nt->nrn_fast_imem, sizeof(NrnFastImem))); - acc_memcpy_to_device(&(d_nt->nrn_fast_imem), &d_fast_imem, sizeof(NrnFastImem*)); + cnrn_gpu_copyin(nt->nrn_fast_imem, sizeof(NrnFastImem))); + cnrn_memcpy_to_device(&(d_nt->nrn_fast_imem), &d_fast_imem, sizeof(NrnFastImem*)); { auto* d_ptr = reinterpret_cast( - acc_copyin(nt->nrn_fast_imem->nrn_sav_rhs, nt->end * sizeof(double))); - acc_memcpy_to_device(&(d_fast_imem->nrn_sav_rhs), &d_ptr, sizeof(double*)); + cnrn_gpu_copyin(nt->nrn_fast_imem->nrn_sav_rhs, nt->end * sizeof(double))); + cnrn_memcpy_to_device(&(d_fast_imem->nrn_sav_rhs), &d_ptr, sizeof(double*)); } { auto* d_ptr = reinterpret_cast( - acc_copyin(nt->nrn_fast_imem->nrn_sav_d, nt->end * sizeof(double))); - acc_memcpy_to_device(&(d_fast_imem->nrn_sav_d), &d_ptr, sizeof(double*)); + cnrn_gpu_copyin(nt->nrn_fast_imem->nrn_sav_d, nt->end * sizeof(double))); + cnrn_memcpy_to_device(&(d_fast_imem->nrn_sav_d), &d_ptr, sizeof(double*)); } } @@ -313,21 +368,21 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { /* copy Point_processes array and fix the pointer to execute net_receive blocks on GPU */ Point_process* pntptr = - (Point_process*) acc_copyin(nt->pntprocs, nt->n_pntproc * sizeof(Point_process)); - acc_memcpy_to_device(&(d_nt->pntprocs), 
&pntptr, sizeof(Point_process*)); + (Point_process*) cnrn_gpu_copyin(nt->pntprocs, nt->n_pntproc * sizeof(Point_process)); + cnrn_memcpy_to_device(&(d_nt->pntprocs), &pntptr, sizeof(Point_process*)); } if (nt->n_weight) { /* copy weight vector used in NET_RECEIVE which is pointed by netcon.weight */ - double* d_weights = (double*) acc_copyin(nt->weights, sizeof(double) * nt->n_weight); - acc_memcpy_to_device(&(d_nt->weights), &d_weights, sizeof(double*)); + double* d_weights = (double*) cnrn_gpu_copyin(nt->weights, sizeof(double) * nt->n_weight); + cnrn_memcpy_to_device(&(d_nt->weights), &d_weights, sizeof(double*)); } if (nt->_nvdata) { /* copy vdata which is setup in bbcore_read. This contains cuda allocated * nrnran123_State * */ - void** d_vdata = (void**) acc_copyin(nt->_vdata, sizeof(void*) * nt->_nvdata); - acc_memcpy_to_device(&(d_nt->_vdata), &d_vdata, sizeof(void**)); + void** d_vdata = (void**) cnrn_gpu_copyin(nt->_vdata, sizeof(void*) * nt->_nvdata); + cnrn_memcpy_to_device(&(d_nt->_vdata), &d_vdata, sizeof(void**)); } if (nt->n_presyn) { @@ -337,24 +392,24 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { * to * VTable and alignment */ PreSynHelper* d_presyns_helper = - (PreSynHelper*) acc_copyin(nt->presyns_helper, sizeof(PreSynHelper) * nt->n_presyn); - acc_memcpy_to_device(&(d_nt->presyns_helper), &d_presyns_helper, sizeof(PreSynHelper*)); - PreSyn* d_presyns = (PreSyn*) acc_copyin(nt->presyns, sizeof(PreSyn) * nt->n_presyn); - acc_memcpy_to_device(&(d_nt->presyns), &d_presyns, sizeof(PreSyn*)); + (PreSynHelper*) cnrn_gpu_copyin(nt->presyns_helper, sizeof(PreSynHelper) * nt->n_presyn); + cnrn_memcpy_to_device(&(d_nt->presyns_helper), &d_presyns_helper, sizeof(PreSynHelper*)); + PreSyn* d_presyns = (PreSyn*) cnrn_gpu_copyin(nt->presyns, sizeof(PreSyn) * nt->n_presyn); + cnrn_memcpy_to_device(&(d_nt->presyns), &d_presyns, sizeof(PreSyn*)); } if (nt->_net_send_buffer_size) { /* copy send_receive buffer */ - int* d_net_send_buffer = (int*) acc_copyin(nt->_net_send_buffer, + int* d_net_send_buffer = (int*) cnrn_gpu_copyin(nt->_net_send_buffer, sizeof(int) * nt->_net_send_buffer_size); - acc_memcpy_to_device(&(d_nt->_net_send_buffer), &d_net_send_buffer, sizeof(int*)); + cnrn_memcpy_to_device(&(d_nt->_net_send_buffer), &d_net_send_buffer, sizeof(int*)); } if (nt->n_vecplay) { /* copy VecPlayContinuous instances */ /** just empty containers */ - void** d_vecplay = (void**) acc_copyin(nt->_vecplay, sizeof(void*) * nt->n_vecplay); - acc_memcpy_to_device(&(d_nt->_vecplay), &d_vecplay, sizeof(void**)); + void** d_vecplay = (void**) cnrn_gpu_copyin(nt->_vecplay, sizeof(void*) * nt->n_vecplay); + cnrn_memcpy_to_device(&(d_nt->_vecplay), &d_vecplay, sizeof(void**)); nrn_VecPlay_copyto_device(nt, d_vecplay); } @@ -363,41 +418,41 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { if (interleave_permute_type == 1) { /* todo: not necessary to setup pointers, just copy it */ InterleaveInfo* info = interleave_info + i; - InterleaveInfo* d_info = (InterleaveInfo*) acc_copyin(info, sizeof(InterleaveInfo)); + InterleaveInfo* d_info = (InterleaveInfo*) cnrn_gpu_copyin(info, sizeof(InterleaveInfo)); int* d_ptr = nullptr; - d_ptr = (int*) acc_copyin(info->stride, sizeof(int) * (info->nstride + 1)); - acc_memcpy_to_device(&(d_info->stride), &d_ptr, sizeof(int*)); + d_ptr = (int*) cnrn_gpu_copyin(info->stride, sizeof(int) * (info->nstride + 1)); + cnrn_memcpy_to_device(&(d_info->stride), &d_ptr, sizeof(int*)); - d_ptr = (int*) acc_copyin(info->firstnode, 
sizeof(int) * nt->ncell); - acc_memcpy_to_device(&(d_info->firstnode), &d_ptr, sizeof(int*)); + d_ptr = (int*) cnrn_gpu_copyin(info->firstnode, sizeof(int) * nt->ncell); + cnrn_memcpy_to_device(&(d_info->firstnode), &d_ptr, sizeof(int*)); - d_ptr = (int*) acc_copyin(info->lastnode, sizeof(int) * nt->ncell); - acc_memcpy_to_device(&(d_info->lastnode), &d_ptr, sizeof(int*)); + d_ptr = (int*) cnrn_gpu_copyin(info->lastnode, sizeof(int) * nt->ncell); + cnrn_memcpy_to_device(&(d_info->lastnode), &d_ptr, sizeof(int*)); - d_ptr = (int*) acc_copyin(info->cellsize, sizeof(int) * nt->ncell); - acc_memcpy_to_device(&(d_info->cellsize), &d_ptr, sizeof(int*)); + d_ptr = (int*) cnrn_gpu_copyin(info->cellsize, sizeof(int) * nt->ncell); + cnrn_memcpy_to_device(&(d_info->cellsize), &d_ptr, sizeof(int*)); } else if (interleave_permute_type == 2) { /* todo: not necessary to setup pointers, just copy it */ InterleaveInfo* info = interleave_info + i; - InterleaveInfo* d_info = (InterleaveInfo*) acc_copyin(info, sizeof(InterleaveInfo)); + InterleaveInfo* d_info = (InterleaveInfo*) cnrn_gpu_copyin(info, sizeof(InterleaveInfo)); int* d_ptr = nullptr; - d_ptr = (int*) acc_copyin(info->stride, sizeof(int) * info->nstride); - acc_memcpy_to_device(&(d_info->stride), &d_ptr, sizeof(int*)); + d_ptr = (int*) cnrn_gpu_copyin(info->stride, sizeof(int) * info->nstride); + cnrn_memcpy_to_device(&(d_info->stride), &d_ptr, sizeof(int*)); - d_ptr = (int*) acc_copyin(info->firstnode, sizeof(int) * (info->nwarp + 1)); - acc_memcpy_to_device(&(d_info->firstnode), &d_ptr, sizeof(int*)); + d_ptr = (int*) cnrn_gpu_copyin(info->firstnode, sizeof(int) * (info->nwarp + 1)); + cnrn_memcpy_to_device(&(d_info->firstnode), &d_ptr, sizeof(int*)); - d_ptr = (int*) acc_copyin(info->lastnode, sizeof(int) * (info->nwarp + 1)); - acc_memcpy_to_device(&(d_info->lastnode), &d_ptr, sizeof(int*)); + d_ptr = (int*) cnrn_gpu_copyin(info->lastnode, sizeof(int) * (info->nwarp + 1)); + cnrn_memcpy_to_device(&(d_info->lastnode), &d_ptr, sizeof(int*)); - d_ptr = (int*) acc_copyin(info->stridedispl, sizeof(int) * (info->nwarp + 1)); - acc_memcpy_to_device(&(d_info->stridedispl), &d_ptr, sizeof(int*)); + d_ptr = (int*) cnrn_gpu_copyin(info->stridedispl, sizeof(int) * (info->nwarp + 1)); + cnrn_memcpy_to_device(&(d_info->stridedispl), &d_ptr, sizeof(int*)); - d_ptr = (int*) acc_copyin(info->cellsize, sizeof(int) * info->nwarp); - acc_memcpy_to_device(&(d_info->cellsize), &d_ptr, sizeof(int*)); + d_ptr = (int*) cnrn_gpu_copyin(info->cellsize, sizeof(int) * info->nwarp); + cnrn_memcpy_to_device(&(d_info->cellsize), &d_ptr, sizeof(int*)); } else { printf("\n ERROR: only --cell_permute = [12] implemented"); abort(); @@ -412,21 +467,21 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { // Create a device-side copy of the `trajec_requests` struct and // make sure the device-side NrnThread object knows about it. auto* d_trajec_requests = reinterpret_cast( - acc_copyin(tr, sizeof(TrajectoryRequests))); - acc_memcpy_to_device(&(d_nt->trajec_requests), + cnrn_gpu_copyin(tr, sizeof(TrajectoryRequests))); + cnrn_memcpy_to_device(&(d_nt->trajec_requests), &d_trajec_requests, sizeof(TrajectoryRequests*)); // Initialise the double** gather member of the struct. 
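            // The lines below repeat the same deep-copy idiom used throughout this
            // function: copy the container to the device, then patch each pointer
            // member of the device copy with the matching device address. A minimal
            // sketch with a hypothetical struct (`Foo`, `foo`, `n` are illustrative
            // placeholders, not names from this file):
            // @code
            //   struct Foo { double* data; };  // host object `foo` with `n` doubles
            //   Foo* d_foo = (Foo*) cnrn_gpu_copyin(&foo, sizeof(Foo));
            //   double* d_data = (double*) cnrn_gpu_copyin(foo.data, n * sizeof(double));
            //   cnrn_memcpy_to_device(&(d_foo->data), &d_data, sizeof(double*));
            // @endcode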
             auto* d_tr_gather = reinterpret_cast<double**>(
-                acc_copyin(tr->gather, sizeof(double*) * tr->n_trajec));
-            acc_memcpy_to_device(&(d_trajec_requests->gather), &d_tr_gather, sizeof(double**));
+                cnrn_gpu_copyin(tr->gather, sizeof(double*) * tr->n_trajec));
+            cnrn_memcpy_to_device(&(d_trajec_requests->gather), &d_tr_gather, sizeof(double**));
             // Initialise the double** varrays member of the struct if it's
             // set.
             double** d_tr_varrays{nullptr};
             if (tr->varrays) {
                 d_tr_varrays = reinterpret_cast<double**>(
-                    acc_copyin(tr->varrays, sizeof(double*) * tr->n_trajec));
-                acc_memcpy_to_device(&(d_trajec_requests->varrays),
+                    cnrn_gpu_copyin(tr->varrays, sizeof(double*) * tr->n_trajec));
+                cnrn_memcpy_to_device(&(d_trajec_requests->varrays),
                                      &d_tr_varrays,
                                      sizeof(double**));
             }
@@ -436,13 +491,13 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
                     // make a device-side copy of it and store a pointer to it in
                     // the device-side version of tr->varrays.
                     auto* d_buf_traj_i = reinterpret_cast<double*>(
-                        acc_copyin(tr->varrays[i], tr->bsize * sizeof(double)));
-                    acc_memcpy_to_device(&(d_tr_varrays[i]), &d_buf_traj_i, sizeof(double*));
+                        cnrn_gpu_copyin(tr->varrays[i], tr->bsize * sizeof(double)));
+                    cnrn_memcpy_to_device(&(d_tr_varrays[i]), &d_buf_traj_i, sizeof(double*));
                 }
                 // tr->gather[i] is a double* referring to (host) data in the
                 // (host) _data block
-                auto* d_gather_i = acc_deviceptr(tr->gather[i]);
-                acc_memcpy_to_device(&(d_tr_gather[i]), &d_gather_i, sizeof(double*));
+                auto* d_gather_i = cnrn_target_deviceptr(tr->gather[i]);
+                cnrn_memcpy_to_device(&(d_tr_gather[i]), &d_gather_i, sizeof(double*));
             }
             // TODO: other `double** scatter` and `void** vpr` members of
             // the TrajectoryRequests struct are not copied to the device.
@@ -459,15 +514,21 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
 #endif
 }
-void copy_ivoc_vect_to_device(const IvocVect& from, IvocVect& to) {
+void copy_ivoc_vect_to_device(const IvocVect& from, IvocVect& to, bool vector_copy_needed) {
 #ifdef _OPENACC
-    IvocVect* d_iv = (IvocVect*) acc_copyin((void*) &from, sizeof(IvocVect));
-    acc_memcpy_to_device(&to, &d_iv, sizeof(IvocVect*));
-
+    /// by default `to` is the destination pointer on the device
+    IvocVect* d_iv = &to;
+
+    /// if we need to copy the IvocVect itself then the newly allocated vector
+    /// on the device is the new destination pointer
+    if(vector_copy_needed) {
+        d_iv = (IvocVect*) cnrn_gpu_copyin((void*) &from, sizeof(IvocVect));
+        cnrn_memcpy_to_device(&to, &d_iv, sizeof(IvocVect*));
+    }
     size_t n = from.size();
     if (n) {
-        double* d_data = (double*) acc_copyin((void*) from.data(), sizeof(double) * n);
-        acc_memcpy_to_device(&(d_iv->data_), &d_data, sizeof(double*));
+        double* d_data = (double*) cnrn_gpu_copyin((void*) from.data(), sizeof(double) * n);
+        cnrn_memcpy_to_device(&(d_iv->data_), &d_data, sizeof(double*));
     }
 #else
     (void) from;
@@ -479,9 +540,9 @@ void delete_ivoc_vect_from_device(IvocVect& vec) {
 #ifdef _OPENACC
     auto const n = vec.size();
     if (n) {
-        acc_delete(vec.data(), sizeof(double) * n);
+        cnrn_target_delete(vec.data(), sizeof(double) * n);
     }
-    acc_delete(&vec, sizeof(IvocVect));
+    cnrn_target_delete(&vec, sizeof(IvocVect));
 #else
     (void) vec;
 #endif
@@ -496,12 +557,12 @@ void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml) {
 #ifdef _OPENACC
     if (nt->compute_gpu) {
         // free existing vectors in buffers on gpu
-        acc_delete(nrb->_pnt_index, nrb->_size * sizeof(int));
-        acc_delete(nrb->_weight_index, nrb->_size * sizeof(int));
-        acc_delete(nrb->_nrb_t, nrb->_size * sizeof(double));
-
acc_delete(nrb->_nrb_flag, nrb->_size * sizeof(double)); - acc_delete(nrb->_displ, (nrb->_size + 1) * sizeof(int)); - acc_delete(nrb->_nrb_index, nrb->_size * sizeof(int)); + cnrn_target_delete(nrb->_pnt_index, nrb->_size * sizeof(int)); + cnrn_target_delete(nrb->_weight_index, nrb->_size * sizeof(int)); + cnrn_target_delete(nrb->_nrb_t, nrb->_size * sizeof(double)); + cnrn_target_delete(nrb->_nrb_flag, nrb->_size * sizeof(double)); + cnrn_target_delete(nrb->_displ, (nrb->_size + 1) * sizeof(int)); + cnrn_target_delete(nrb->_nrb_index, nrb->_size * sizeof(int)); } #endif @@ -520,28 +581,29 @@ void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml) { double *d_nrb_t, *d_nrb_flag; // update device copy - acc_update_device(nrb, sizeof(NetReceiveBuffer_t)); + nrn_pragma_acc(update device(nrb)); + nrn_pragma_omp(target update to(nrb)); - NetReceiveBuffer_t* d_nrb = (NetReceiveBuffer_t*) acc_deviceptr(nrb); + NetReceiveBuffer_t* d_nrb = (NetReceiveBuffer_t*) cnrn_target_deviceptr(nrb); // recopy the vectors in the buffer - d_pnt_index = (int*) acc_copyin(nrb->_pnt_index, sizeof(int) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index, sizeof(int*)); + d_pnt_index = (int*) cnrn_gpu_copyin(nrb->_pnt_index, sizeof(int) * nrb->_size); + cnrn_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index, sizeof(int*)); - d_weight_index = (int*) acc_copyin(nrb->_weight_index, sizeof(int) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index, sizeof(int*)); + d_weight_index = (int*) cnrn_gpu_copyin(nrb->_weight_index, sizeof(int) * nrb->_size); + cnrn_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index, sizeof(int*)); - d_nrb_t = (double*) acc_copyin(nrb->_nrb_t, sizeof(double) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t, sizeof(double*)); + d_nrb_t = (double*) cnrn_gpu_copyin(nrb->_nrb_t, sizeof(double) * nrb->_size); + cnrn_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t, sizeof(double*)); - d_nrb_flag = (double*) acc_copyin(nrb->_nrb_flag, sizeof(double) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag, sizeof(double*)); + d_nrb_flag = (double*) cnrn_gpu_copyin(nrb->_nrb_flag, sizeof(double) * nrb->_size); + cnrn_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag, sizeof(double*)); - d_displ = (int*) acc_copyin(nrb->_displ, sizeof(int) * (nrb->_size + 1)); - acc_memcpy_to_device(&(d_nrb->_displ), &d_displ, sizeof(int*)); + d_displ = (int*) cnrn_gpu_copyin(nrb->_displ, sizeof(int) * (nrb->_size + 1)); + cnrn_memcpy_to_device(&(d_nrb->_displ), &d_displ, sizeof(int*)); - d_nrb_index = (int*) acc_copyin(nrb->_nrb_index, sizeof(int) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index, sizeof(int*)); + d_nrb_index = (int*) cnrn_gpu_copyin(nrb->_nrb_index, sizeof(int) * nrb->_size); + cnrn_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index, sizeof(int*)); } #endif } @@ -655,13 +717,23 @@ void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb) { if (nsb->_cnt) { Instrumentor::phase p_net_receive_buffer_order("net-send-buf-gpu2cpu"); - acc_update_self(nsb->_sendtype, sizeof(int) * nsb->_cnt); - acc_update_self(nsb->_vdata_index, sizeof(int) * nsb->_cnt); - acc_update_self(nsb->_pnt_index, sizeof(int) * nsb->_cnt); - acc_update_self(nsb->_weight_index, sizeof(int) * nsb->_cnt); - acc_update_self(nsb->_nsb_t, sizeof(double) * nsb->_cnt); - acc_update_self(nsb->_nsb_flag, sizeof(double) * nsb->_cnt); } + nrn_pragma_acc(update self( + nsb->_sendtype[:nsb->_cnt], + 
nsb->_vdata_index[:nsb->_cnt], + nsb->_pnt_index[:nsb->_cnt], + nsb->_weight_index[:nsb->_cnt], + nsb->_nsb_t[:nsb->_cnt], + nsb->_nsb_flag[:nsb->_cnt]) + if nsb->_cnt) + nrn_pragma_omp(target update from( + nsb->_sendtype[:nsb->_cnt], + nsb->_vdata_index[:nsb->_cnt], + nsb->_pnt_index[:nsb->_cnt], + nsb->_weight_index[:nsb->_cnt], + nsb->_nsb_t[:nsb->_cnt], + nsb->_nsb_flag[:nsb->_cnt]) + if (nsb->_cnt)) #else (void) nt; (void) nsb; @@ -679,15 +751,23 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { int ne = nrn_soa_padded_size(nt->end, 0); - acc_update_self(nt->_actual_rhs, ne * sizeof(double)); - acc_update_self(nt->_actual_d, ne * sizeof(double)); - acc_update_self(nt->_actual_a, ne * sizeof(double)); - acc_update_self(nt->_actual_b, ne * sizeof(double)); - acc_update_self(nt->_actual_v, ne * sizeof(double)); - acc_update_self(nt->_actual_area, ne * sizeof(double)); - if (nt->_actual_diam) { - acc_update_self(nt->_actual_diam, ne * sizeof(double)); - } + nrn_pragma_acc(update self( + nt->_actual_rhs[:ne], + nt->_actual_d[:ne], + nt->_actual_a[:ne], + nt->_actual_b[:ne], + nt->_actual_v[:ne], + nt->_actual_area[:ne])) + nrn_pragma_omp(target update from( + nt->_actual_rhs[:ne], + nt->_actual_d[:ne], + nt->_actual_a[:ne], + nt->_actual_b[:ne], + nt->_actual_v[:ne], + nt->_actual_area[:ne])) + + nrn_pragma_acc(update self(nt->_actual_diam[:ne]) if nt->_actual_diam) + nrn_pragma_omp(target update from(nt->_actual_diam[:ne]) if (nt->_actual_diam != nullptr)) /* @todo: nt._ml_list[tml->index] = tml->ml; */ @@ -695,8 +775,10 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { for (auto tml = nt->tml; tml; tml = tml->next) { Memb_list* ml = tml->ml; - acc_update_self(&tml->index, sizeof(int)); - acc_update_self(&ml->nodecount, sizeof(int)); + nrn_pragma_acc(update self(&tml->index, + &ml->nodecount)) + nrn_pragma_omp(target update from(tml->index, + ml->nodecount)) int type = tml->index; int n = ml->nodecount; @@ -713,54 +795,72 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szp; - acc_update_self(ml->data, pcnt * sizeof(double)); - acc_update_self(ml->nodeindices, n * sizeof(int)); + nrn_pragma_acc(update self(ml->data[:pcnt], + ml->nodeindices[:n])) + nrn_pragma_omp(target update from(ml->data[:pcnt], + ml->nodeindices[:n])) - if (szdp) { - int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; - acc_update_self(ml->pdata, pcnt * sizeof(int)); - } + int dpcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; + nrn_pragma_acc(update self(ml->pdata[:dpcnt]) if szdp) + nrn_pragma_omp(target update from(ml->pdata[:dpcnt]) if (szdp)) auto nrb = tml->ml->_net_receive_buffer; - if (nrb) { - acc_update_self(&nrb->_cnt, sizeof(int)); - acc_update_self(&nrb->_size, sizeof(int)); - acc_update_self(&nrb->_pnt_offset, sizeof(int)); - acc_update_self(&nrb->_displ_cnt, sizeof(int)); - - acc_update_self(nrb->_pnt_index, sizeof(int) * nrb->_size); - acc_update_self(nrb->_weight_index, sizeof(int) * nrb->_size); - acc_update_self(nrb->_displ, sizeof(int) * (nrb->_size + 1)); - acc_update_self(nrb->_nrb_index, sizeof(int) * nrb->_size); - } - } - - if (nt->shadow_rhs_cnt) { - int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); - /* copy shadow_rhs to host */ - acc_update_self(nt->_shadow_rhs, pcnt * sizeof(double)); - /* copy shadow_d to host */ - acc_update_self(nt->_shadow_d, pcnt * sizeof(double)); - } - - if (nt->nrn_fast_imem) { - acc_update_self(nt->nrn_fast_imem->nrn_sav_rhs, nt->end * 
sizeof(double)); - acc_update_self(nt->nrn_fast_imem->nrn_sav_d, nt->end * sizeof(double)); - } - - if (nt->n_pntproc) { - acc_update_self(nt->pntprocs, nt->n_pntproc * sizeof(Point_process)); + nrn_pragma_acc(update self( + &nrb->_cnt, + &nrb->_size, + &nrb->_pnt_offset, + &nrb->_displ_cnt, + + nrb->_pnt_index[:nrb->_size], + nrb->_weight_index[:nrb->_size], + nrb->_displ[:nrb->_size + 1], + nrb->_nrb_index[:nrb->_size]) + if nrb) + nrn_pragma_omp(target update from( + nrb->_cnt, + nrb->_size, + nrb->_pnt_offset, + nrb->_displ_cnt, + + nrb->_pnt_index[:nrb->_size], + nrb->_weight_index[:nrb->_size], + nrb->_displ[:nrb->_size + 1], + nrb->_nrb_index[:nrb->_size]) + if (nrb != nullptr)) } - if (nt->n_weight) { - acc_update_self(nt->weights, sizeof(double) * nt->n_weight); - } - - if (nt->n_presyn) { - acc_update_self(nt->presyns_helper, sizeof(PreSynHelper) * nt->n_presyn); - acc_update_self(nt->presyns, sizeof(PreSyn) * nt->n_presyn); - } + int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); + /* copy shadow_rhs to host */ + /* copy shadow_d to host */ + nrn_pragma_acc(update self(nt->_shadow_rhs[:pcnt], + nt->_shadow_d[:pcnt]) + if nt->shadow_rhs_cnt) + nrn_pragma_omp(target update from(nt->_shadow_rhs[:pcnt], + nt->_shadow_d[:pcnt]) + if (nt->shadow_rhs_cnt)) + + nrn_pragma_acc(update self(nt->nrn_fast_imem->nrn_sav_rhs[:nt->end], + nt->nrn_fast_imem->nrn_sav_d[:nt->end]) + if nt->nrn_fast_imem) + nrn_pragma_omp(target update from(nt->nrn_fast_imem->nrn_sav_rhs[:nt->end], + nt->nrn_fast_imem->nrn_sav_d[:nt->end]) + if (nt->nrn_fast_imem != nullptr)) + + nrn_pragma_acc(update self(nt->pntprocs[:nt->n_pntproc]) if nt->n_pntproc) + nrn_pragma_omp(target update from(nt->pntprocs[:nt->n_pntproc]) if (nt->n_pntproc)) + + nrn_pragma_acc(update self(nt->weights[:nt->n_weight]) if nt->n_weight) + nrn_pragma_omp(target update from(nt->weights[:nt->n_weight]) if (nt->n_weight)) + + nrn_pragma_acc(update self( + nt->presyns_helper[:nt->n_presyn], + nt->presyns[:nt->n_presyn]) + if nt->n_presyn) + nrn_pragma_omp(target update from( + nt->presyns_helper[:nt->n_presyn], + nt->presyns[:nt->n_presyn]) + if (nt->n_presyn)) { TrajectoryRequests* tr = nt->trajec_requests; @@ -768,15 +868,17 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { // The full buffers have `bsize` entries, but only `vsize` // of them are valid. 
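                // The paired directives below express the same device-to-host copy
                // in both programming models; a minimal sketch (with placeholder
                // names `buf` and `count`, not taken from this file):
                // @code
                //   nrn_pragma_acc(update self(buf[:count]))           // OpenACC
                //   nrn_pragma_omp(target update from(buf[:count]))    // OpenMP
                // @endcode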
for (int i = 0; i < tr->n_trajec; ++i) { - acc_update_self(tr->varrays[i], tr->vsize * sizeof(double)); + nrn_pragma_acc(update self( + tr->varrays[i][:tr->vsize])) + nrn_pragma_omp(target update from( + tr->varrays[i][:tr->vsize])) } } } /* dont update vdata, its pointer array - if(nt->_nvdata) { - acc_update_self(nt->_vdata, sizeof(double)*nt->_nvdata); - } + nrn_pragma_acc(update self(nt->_vdata[:nt->_nvdata) if nt->_nvdata) + nrn_pragma_omp(target update from(nt->_vdata[:nt->_nvdata) if (nt->_nvdata)) */ } } @@ -797,15 +899,23 @@ void update_nrnthreads_on_device(NrnThread* threads, int nthreads) { int ne = nrn_soa_padded_size(nt->end, 0); - acc_update_device(nt->_actual_rhs, ne * sizeof(double)); - acc_update_device(nt->_actual_d, ne * sizeof(double)); - acc_update_device(nt->_actual_a, ne * sizeof(double)); - acc_update_device(nt->_actual_b, ne * sizeof(double)); - acc_update_device(nt->_actual_v, ne * sizeof(double)); - acc_update_device(nt->_actual_area, ne * sizeof(double)); - if (nt->_actual_diam) { - acc_update_device(nt->_actual_diam, ne * sizeof(double)); - } + nrn_pragma_acc(update device( + nt->_actual_rhs[:ne], + nt->_actual_d[:ne], + nt->_actual_a[:ne], + nt->_actual_b[:ne], + nt->_actual_v[:ne], + nt->_actual_area[:ne])) + nrn_pragma_omp(target update to( + nt->_actual_rhs[:ne], + nt->_actual_d[:ne], + nt->_actual_a[:ne], + nt->_actual_b[:ne], + nt->_actual_v[:ne], + nt->_actual_area[:ne])) + + nrn_pragma_acc(update device(nt->_actual_diam[:ne]) if nt->_actual_diam) + nrn_pragma_omp(target update to(nt->_actual_diam[:ne]) if (nt->_actual_diam != nullptr)) /* @todo: nt._ml_list[tml->index] = tml->ml; */ @@ -819,57 +929,70 @@ void update_nrnthreads_on_device(NrnThread* threads, int nthreads) { int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szp; - acc_update_device(ml->data, pcnt * sizeof(double)); - - if (!corenrn.get_is_artificial()[type]) { - acc_update_device(ml->nodeindices, n * sizeof(int)); - } + nrn_pragma_acc(update device(ml->data[:pcnt])) + nrn_pragma_omp(target update to(ml->data[:pcnt])) - if (szdp) { - int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; - acc_update_device(ml->pdata, pcnt * sizeof(int)); - } + nrn_pragma_acc(update device(ml->nodeindices[:n]) + if (!corenrn.get_is_artificial()[type])) + nrn_pragma_omp(target update to(ml->nodeindices[:n]) + if (!corenrn.get_is_artificial()[type])) + int dpcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; + nrn_pragma_acc(update device(ml->pdata[:dpcnt]) if szdp) + nrn_pragma_omp(target update to(ml->pdata[:dpcnt]) if (szdp)) auto nrb = tml->ml->_net_receive_buffer; - - if (nrb) { - acc_update_device(&nrb->_cnt, sizeof(int)); - acc_update_device(&nrb->_size, sizeof(int)); - acc_update_device(&nrb->_pnt_offset, sizeof(int)); - acc_update_device(&nrb->_displ_cnt, sizeof(int)); - - acc_update_device(nrb->_pnt_index, sizeof(int) * nrb->_size); - acc_update_device(nrb->_weight_index, sizeof(int) * nrb->_size); - acc_update_device(nrb->_displ, sizeof(int) * (nrb->_size + 1)); - acc_update_device(nrb->_nrb_index, sizeof(int) * nrb->_size); - } - } - - if (nt->shadow_rhs_cnt) { - int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); - /* copy shadow_rhs to host */ - acc_update_device(nt->_shadow_rhs, pcnt * sizeof(double)); - /* copy shadow_d to host */ - acc_update_device(nt->_shadow_d, pcnt * sizeof(double)); - } - - if (nt->nrn_fast_imem) { - acc_update_device(nt->nrn_fast_imem->nrn_sav_rhs, nt->end * sizeof(double)); - acc_update_device(nt->nrn_fast_imem->nrn_sav_d, nt->end * sizeof(double)); - } - - if 
(nt->n_pntproc) { - acc_update_device(nt->pntprocs, nt->n_pntproc * sizeof(Point_process)); - } - - if (nt->n_weight) { - acc_update_device(nt->weights, sizeof(double) * nt->n_weight); - } - - if (nt->n_presyn) { - acc_update_device(nt->presyns_helper, sizeof(PreSynHelper) * nt->n_presyn); - acc_update_device(nt->presyns, sizeof(PreSyn) * nt->n_presyn); + nrn_pragma_acc(update device(&nrb->_cnt, + &nrb->_size, + &nrb->_pnt_offset, + &nrb->_displ_cnt, + nrb->_pnt_index[:nrb->_size], + nrb->_weight_index[:nrb->_size], + nrb->_displ[:nrb->_size], + nrb->_nrb_index[:nrb->_size]) + if nrb) + nrn_pragma_omp(target update to(nrb->_cnt, + nrb->_size, + nrb->_pnt_offset, + nrb->_displ_cnt, + nrb->_pnt_index[:nrb->_size], + nrb->_weight_index[:nrb->_size], + nrb->_displ[:nrb->_size], + nrb->_nrb_index[:nrb->_size]) + if (nrb != nullptr)) } + int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); + /* copy shadow_rhs to host */ + nrn_pragma_acc(update device(nt->_shadow_rhs[:pcnt], + /* copy shadow_d to host */ + nt->_shadow_d[:pcnt]) + if nt->shadow_rhs_cnt) + nrn_pragma_omp(target update to(nt->_shadow_rhs[:pcnt], + /* copy shadow_d to host */ + nt->_shadow_d[:pcnt]) + if (nt->shadow_rhs_cnt)) + + + nrn_pragma_acc(update device(nt->nrn_fast_imem->nrn_sav_rhs[:nt->end], + nt->nrn_fast_imem->nrn_sav_d[:nt->end]) + if nt->nrn_fast_imem) + nrn_pragma_omp(target update to(nt->nrn_fast_imem->nrn_sav_rhs[:nt->end], + nt->nrn_fast_imem->nrn_sav_d[:nt->end]) + if (nt->nrn_fast_imem != nullptr)) + + nrn_pragma_acc(update device(nt->pntprocs[:nt->n_pntproc]) + if nt->n_pntproc) + nrn_pragma_omp(target update to(nt->pntprocs[:nt->n_pntproc]) + if (nt->n_pntproc)) + + nrn_pragma_acc(update device(nt->weights[:nt->n_weight]) if nt->n_weight) + nrn_pragma_omp(target update to(nt->weights[:nt->n_weight]) if (nt->n_weight)) + + nrn_pragma_acc(update device(nt->presyns_helper[:nt->n_presyn], + nt->presyns[:nt->n_presyn]) + if nt->n_presyn) + nrn_pragma_omp(target update to(nt->presyns_helper[:nt->n_presyn], + nt->presyns[:nt->n_presyn]) + if (nt->n_presyn)) { TrajectoryRequests* tr = nt->trajec_requests; @@ -877,15 +1000,15 @@ void update_nrnthreads_on_device(NrnThread* threads, int nthreads) { // The full buffers have `bsize` entries, but only `vsize` // of them are valid. for (int i = 0; i < tr->n_trajec; ++i) { - acc_update_device(tr->varrays[i], tr->vsize * sizeof(double)); + nrn_pragma_acc(update device(tr->varrays[i][:tr->vsize])) + nrn_pragma_omp(target update to(tr->varrays[i][:tr->vsize])) } } } /* don't and don't update vdata, its pointer array - if(nt->_nvdata) { - acc_update_device(nt->_vdata, sizeof(double)*nt->_nvdata); - } + nrn_pragma_acc(update device(nt->_vdata[:nt->_nvdata) if nt->_nvdata) + nrn_pragma_omp(target update tp(nt->_vdata[:nt->_nvdata) if (nt->_nvdata)) */ } } @@ -916,22 +1039,22 @@ void update_weights_from_gpu(NrnThread* threads, int nthreads) { /** Cleanup device memory that is being tracked by the OpenACC runtime. * - * This function painstakingly calls `acc_delete` in reverse order on all - * pointers that were passed to `acc_copyin` in `setup_nrnthreads_on_device`. + * This function painstakingly calls `cnrn_target_delete` in reverse order on all + * pointers that were passed to `cnrn_gpu_copyin` in `setup_nrnthreads_on_device`. * This cleanup ensures that if the GPU is initialised multiple times from the * same process then the OpenACC runtime will not be polluted with old * pointers, which can cause errors. In particular if we do: * @code * { * // ... 
some_ptr is dynamically allocated ... - * acc_copyin(some_ptr, some_size); + * cnrn_gpu_copyin(some_ptr, some_size); * // ... do some work ... - * // acc_delete(some_ptr); + * // cnrn_target_delete(some_ptr); * free(some_ptr); * } * { * // ... same_ptr_again is dynamically allocated at the same address ... - * acc_copyin(same_ptr_again, some_other_size); // ERROR + * cnrn_gpu_copyin(same_ptr_again, some_other_size); // ERROR * } * @endcode * the application will/may abort with an error such as: @@ -948,73 +1071,73 @@ void delete_nrnthreads_on_device(NrnThread* threads, int nthreads) { if (tr) { if (tr->varrays) { for (int i = 0; i < tr->n_trajec; ++i) { - acc_delete(tr->varrays[i], tr->bsize * sizeof(double)); + cnrn_target_delete(tr->varrays[i], tr->bsize * sizeof(double)); } - acc_delete(tr->varrays, sizeof(double*) * tr->n_trajec); + cnrn_target_delete(tr->varrays, sizeof(double*) * tr->n_trajec); } - acc_delete(tr->gather, sizeof(double*) * tr->n_trajec); - acc_delete(tr, sizeof(TrajectoryRequests)); + cnrn_target_delete(tr->gather, sizeof(double*) * tr->n_trajec); + cnrn_target_delete(tr, sizeof(TrajectoryRequests)); } } if (nt->_permute) { if (interleave_permute_type == 1) { InterleaveInfo* info = interleave_info + i; - acc_delete(info->cellsize, sizeof(int) * nt->ncell); - acc_delete(info->lastnode, sizeof(int) * nt->ncell); - acc_delete(info->firstnode, sizeof(int) * nt->ncell); - acc_delete(info->stride, sizeof(int) * (info->nstride + 1)); - acc_delete(info, sizeof(InterleaveInfo)); + cnrn_target_delete(info->cellsize, sizeof(int) * nt->ncell); + cnrn_target_delete(info->lastnode, sizeof(int) * nt->ncell); + cnrn_target_delete(info->firstnode, sizeof(int) * nt->ncell); + cnrn_target_delete(info->stride, sizeof(int) * (info->nstride + 1)); + cnrn_target_delete(info, sizeof(InterleaveInfo)); } else if (interleave_permute_type == 2) { InterleaveInfo* info = interleave_info + i; - acc_delete(info->cellsize, sizeof(int) * info->nwarp); - acc_delete(info->stridedispl, sizeof(int) * (info->nwarp + 1)); - acc_delete(info->lastnode, sizeof(int) * (info->nwarp + 1)); - acc_delete(info->firstnode, sizeof(int) * (info->nwarp + 1)); - acc_delete(info->stride, sizeof(int) * info->nstride); - acc_delete(info, sizeof(InterleaveInfo)); + cnrn_target_delete(info->cellsize, sizeof(int) * info->nwarp); + cnrn_target_delete(info->stridedispl, sizeof(int) * (info->nwarp + 1)); + cnrn_target_delete(info->lastnode, sizeof(int) * (info->nwarp + 1)); + cnrn_target_delete(info->firstnode, sizeof(int) * (info->nwarp + 1)); + cnrn_target_delete(info->stride, sizeof(int) * info->nstride); + cnrn_target_delete(info, sizeof(InterleaveInfo)); } } if (nt->n_vecplay) { nrn_VecPlay_delete_from_device(nt); - acc_delete(nt->_vecplay, sizeof(void*) * nt->n_vecplay); + cnrn_target_delete(nt->_vecplay, sizeof(void*) * nt->n_vecplay); } // Cleanup send_receive buffer. if (nt->_net_send_buffer_size) { - acc_delete(nt->_net_send_buffer, sizeof(int) * nt->_net_send_buffer_size); + cnrn_target_delete(nt->_net_send_buffer, sizeof(int) * nt->_net_send_buffer_size); } if (nt->n_presyn) { - acc_delete(nt->presyns, sizeof(PreSyn) * nt->n_presyn); - acc_delete(nt->presyns_helper, sizeof(PreSynHelper) * nt->n_presyn); + cnrn_target_delete(nt->presyns, sizeof(PreSyn) * nt->n_presyn); + cnrn_target_delete(nt->presyns_helper, sizeof(PreSynHelper) * nt->n_presyn); } // Cleanup data that's setup in bbcore_read. 
if (nt->_nvdata) { - acc_delete(nt->_vdata, sizeof(void*) * nt->_nvdata); + cnrn_target_delete(nt->_vdata, sizeof(void*) * nt->_nvdata); } // Cleanup weight vector used in NET_RECEIVE if (nt->n_weight) { - acc_delete(nt->weights, sizeof(double) * nt->n_weight); + cnrn_target_delete(nt->weights, sizeof(double) * nt->n_weight); } // Cleanup point processes if (nt->n_pntproc) { - acc_delete(nt->pntprocs, nt->n_pntproc * sizeof(Point_process)); + cnrn_target_delete(nt->pntprocs, nt->n_pntproc * sizeof(Point_process)); } if (nt->nrn_fast_imem) { - acc_delete(nt->nrn_fast_imem->nrn_sav_d, nt->end * sizeof(double)); - acc_delete(nt->nrn_fast_imem->nrn_sav_rhs, nt->end * sizeof(double)); - acc_delete(nt->nrn_fast_imem, sizeof(NrnFastImem)); + cnrn_target_delete(nt->nrn_fast_imem->nrn_sav_d, nt->end * sizeof(double)); + cnrn_target_delete(nt->nrn_fast_imem->nrn_sav_rhs, nt->end * sizeof(double)); + cnrn_target_delete(nt->nrn_fast_imem, sizeof(NrnFastImem)); } if (nt->shadow_rhs_cnt) { int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); - acc_delete(nt->_shadow_d, pcnt * sizeof(double)); - acc_delete(nt->_shadow_rhs, pcnt * sizeof(double)); + cnrn_target_delete(nt->_shadow_d, pcnt * sizeof(double)); + cnrn_target_delete(nt->_shadow_rhs, pcnt * sizeof(double)); } for (auto tml = nt->tml; tml; tml = tml->next) { @@ -1022,26 +1145,26 @@ void delete_nrnthreads_on_device(NrnThread* threads, int nthreads) { { NetSendBuffer_t* nsb{tml->ml->_net_send_buffer}; if (nsb) { - acc_delete(nsb->_nsb_flag, sizeof(double) * nsb->_size); - acc_delete(nsb->_nsb_t, sizeof(double) * nsb->_size); - acc_delete(nsb->_weight_index, sizeof(int) * nsb->_size); - acc_delete(nsb->_pnt_index, sizeof(int) * nsb->_size); - acc_delete(nsb->_vdata_index, sizeof(int) * nsb->_size); - acc_delete(nsb->_sendtype, sizeof(int) * nsb->_size); - acc_delete(nsb, sizeof(NetSendBuffer_t)); + cnrn_target_delete(nsb->_nsb_flag, sizeof(double) * nsb->_size); + cnrn_target_delete(nsb->_nsb_t, sizeof(double) * nsb->_size); + cnrn_target_delete(nsb->_weight_index, sizeof(int) * nsb->_size); + cnrn_target_delete(nsb->_pnt_index, sizeof(int) * nsb->_size); + cnrn_target_delete(nsb->_vdata_index, sizeof(int) * nsb->_size); + cnrn_target_delete(nsb->_sendtype, sizeof(int) * nsb->_size); + cnrn_target_delete(nsb, sizeof(NetSendBuffer_t)); } } // Cleanup the net receive buffer if it exists. 
{ NetReceiveBuffer_t* nrb{tml->ml->_net_receive_buffer}; if (nrb) { - acc_delete(nrb->_nrb_index, sizeof(int) * nrb->_size); - acc_delete(nrb->_displ, sizeof(int) * (nrb->_size + 1)); - acc_delete(nrb->_nrb_flag, sizeof(double) * nrb->_size); - acc_delete(nrb->_nrb_t, sizeof(double) * nrb->_size); - acc_delete(nrb->_weight_index, sizeof(int) * nrb->_size); - acc_delete(nrb->_pnt_index, sizeof(int) * nrb->_size); - acc_delete(nrb, sizeof(NetReceiveBuffer_t)); + cnrn_target_delete(nrb->_nrb_index, sizeof(int) * nrb->_size); + cnrn_target_delete(nrb->_displ, sizeof(int) * (nrb->_size + 1)); + cnrn_target_delete(nrb->_nrb_flag, sizeof(double) * nrb->_size); + cnrn_target_delete(nrb->_nrb_t, sizeof(double) * nrb->_size); + cnrn_target_delete(nrb->_weight_index, sizeof(int) * nrb->_size); + cnrn_target_delete(nrb->_pnt_index, sizeof(int) * nrb->_size); + cnrn_target_delete(nrb, sizeof(NetReceiveBuffer_t)); } } int type = tml->index; @@ -1050,23 +1173,23 @@ void delete_nrnthreads_on_device(NrnThread* threads, int nthreads) { int is_art = corenrn.get_is_artificial()[type]; int ts = corenrn.get_memb_funcs()[type].thread_size_; if (ts) { - acc_delete(tml->ml->_thread, ts * sizeof(ThreadDatum)); + cnrn_target_delete(tml->ml->_thread, ts * sizeof(ThreadDatum)); } if (szdp) { int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; - acc_delete(tml->ml->pdata, sizeof(int) * pcnt); + cnrn_target_delete(tml->ml->pdata, sizeof(int) * pcnt); } if (!is_art) { - acc_delete(tml->ml->nodeindices, sizeof(int) * n); + cnrn_target_delete(tml->ml->nodeindices, sizeof(int) * n); } - acc_delete(tml->ml, sizeof(Memb_list)); - acc_delete(tml, sizeof(NrnThreadMembList)); + cnrn_target_delete(tml->ml, sizeof(Memb_list)); + cnrn_target_delete(tml, sizeof(NrnThreadMembList)); } - acc_delete(nt->_ml_list, corenrn.get_memb_funcs().size() * sizeof(Memb_list*)); - acc_delete(nt->_v_parent_index, nt->end * sizeof(int)); - acc_delete(nt->_data, nt->_ndata * sizeof(double)); + cnrn_target_delete(nt->_ml_list, corenrn.get_memb_funcs().size() * sizeof(Memb_list*)); + cnrn_target_delete(nt->_v_parent_index, nt->end * sizeof(int)); + cnrn_target_delete(nt->_data, nt->_ndata * sizeof(double)); } - acc_delete(threads, sizeof(NrnThread) * nthreads); + cnrn_target_delete(threads, sizeof(NrnThread) * nthreads); nrn_ion_global_map_delete_from_device(); #endif } @@ -1082,34 +1205,34 @@ void nrn_newtonspace_copyto_device(NewtonSpace* ns) { int n = ns->n * ns->n_instance; // actually, the values of double do not matter, only the pointers. 
- NewtonSpace* d_ns = (NewtonSpace*) acc_copyin(ns, sizeof(NewtonSpace)); + NewtonSpace* d_ns = (NewtonSpace*) cnrn_gpu_copyin(ns, sizeof(NewtonSpace)); double* pd; - pd = (double*) acc_copyin(ns->delta_x, n * sizeof(double)); - acc_memcpy_to_device(&(d_ns->delta_x), &pd, sizeof(double*)); + pd = (double*) cnrn_gpu_copyin(ns->delta_x, n * sizeof(double)); + cnrn_memcpy_to_device(&(d_ns->delta_x), &pd, sizeof(double*)); - pd = (double*) acc_copyin(ns->high_value, n * sizeof(double)); - acc_memcpy_to_device(&(d_ns->high_value), &pd, sizeof(double*)); + pd = (double*) cnrn_gpu_copyin(ns->high_value, n * sizeof(double)); + cnrn_memcpy_to_device(&(d_ns->high_value), &pd, sizeof(double*)); - pd = (double*) acc_copyin(ns->low_value, n * sizeof(double)); - acc_memcpy_to_device(&(d_ns->low_value), &pd, sizeof(double*)); + pd = (double*) cnrn_gpu_copyin(ns->low_value, n * sizeof(double)); + cnrn_memcpy_to_device(&(d_ns->low_value), &pd, sizeof(double*)); - pd = (double*) acc_copyin(ns->rowmax, n * sizeof(double)); - acc_memcpy_to_device(&(d_ns->rowmax), &pd, sizeof(double*)); + pd = (double*) cnrn_gpu_copyin(ns->rowmax, n * sizeof(double)); + cnrn_memcpy_to_device(&(d_ns->rowmax), &pd, sizeof(double*)); - auto pint = (int*) acc_copyin(ns->perm, n * sizeof(int)); - acc_memcpy_to_device(&(d_ns->perm), &pint, sizeof(int*)); + auto pint = (int*) cnrn_gpu_copyin(ns->perm, n * sizeof(int)); + cnrn_memcpy_to_device(&(d_ns->perm), &pint, sizeof(int*)); - auto ppd = (double**) acc_copyin(ns->jacobian, ns->n * sizeof(double*)); - acc_memcpy_to_device(&(d_ns->jacobian), &ppd, sizeof(double**)); + auto ppd = (double**) cnrn_gpu_copyin(ns->jacobian, ns->n * sizeof(double*)); + cnrn_memcpy_to_device(&(d_ns->jacobian), &ppd, sizeof(double**)); // the actual jacobian doubles were allocated as a single array - double* d_jacdat = (double*) acc_copyin(ns->jacobian[0], ns->n * n * sizeof(double)); + double* d_jacdat = (double*) cnrn_gpu_copyin(ns->jacobian[0], ns->n * n * sizeof(double)); for (int i = 0; i < ns->n; ++i) { pd = d_jacdat + i * n; - acc_memcpy_to_device(&(ppd[i]), &pd, sizeof(double*)); + cnrn_memcpy_to_device(&(ppd[i]), &pd, sizeof(double*)); } #endif } @@ -1122,14 +1245,14 @@ void nrn_newtonspace_delete_from_device(NewtonSpace* ns) { return; } int n = ns->n * ns->n_instance; - acc_delete(ns->jacobian[0], ns->n * n * sizeof(double)); - acc_delete(ns->jacobian, ns->n * sizeof(double*)); - acc_delete(ns->perm, n * sizeof(int)); - acc_delete(ns->rowmax, n * sizeof(double)); - acc_delete(ns->low_value, n * sizeof(double)); - acc_delete(ns->high_value, n * sizeof(double)); - acc_delete(ns->delta_x, n * sizeof(double)); - acc_delete(ns, sizeof(NewtonSpace)); + cnrn_target_delete(ns->jacobian[0], ns->n * n * sizeof(double)); + cnrn_target_delete(ns->jacobian, ns->n * sizeof(double*)); + cnrn_target_delete(ns->perm, n * sizeof(int)); + cnrn_target_delete(ns->rowmax, n * sizeof(double)); + cnrn_target_delete(ns->low_value, n * sizeof(double)); + cnrn_target_delete(ns->high_value, n * sizeof(double)); + cnrn_target_delete(ns->delta_x, n * sizeof(double)); + cnrn_target_delete(ns, sizeof(NewtonSpace)); #endif } @@ -1142,76 +1265,76 @@ void nrn_sparseobj_copyto_device(SparseObj* so) { } unsigned n1 = so->neqn + 1; - SparseObj* d_so = (SparseObj*) acc_copyin(so, sizeof(SparseObj)); + SparseObj* d_so = (SparseObj*) cnrn_gpu_copyin(so, sizeof(SparseObj)); // only pointer fields in SparseObj that need setting up are // rowst, diag, rhs, ngetcall, coef_list // only pointer fields in Elm that need setting up are // 
r_down, c_right, value // do not care about the Elm* ptr value, just the space. - Elm** d_rowst = (Elm**) acc_copyin(so->rowst, n1 * sizeof(Elm*)); - acc_memcpy_to_device(&(d_so->rowst), &d_rowst, sizeof(Elm**)); + Elm** d_rowst = (Elm**) cnrn_gpu_copyin(so->rowst, n1 * sizeof(Elm*)); + cnrn_memcpy_to_device(&(d_so->rowst), &d_rowst, sizeof(Elm**)); - Elm** d_diag = (Elm**) acc_copyin(so->diag, n1 * sizeof(Elm*)); - acc_memcpy_to_device(&(d_so->diag), &d_diag, sizeof(Elm**)); + Elm** d_diag = (Elm**) cnrn_gpu_copyin(so->diag, n1 * sizeof(Elm*)); + cnrn_memcpy_to_device(&(d_so->diag), &d_diag, sizeof(Elm**)); - auto pu = (unsigned*) acc_copyin(so->ngetcall, so->_cntml_padded * sizeof(unsigned)); - acc_memcpy_to_device(&(d_so->ngetcall), &pu, sizeof(Elm**)); + auto pu = (unsigned*) cnrn_gpu_copyin(so->ngetcall, so->_cntml_padded * sizeof(unsigned)); + cnrn_memcpy_to_device(&(d_so->ngetcall), &pu, sizeof(Elm**)); - auto pd = (double*) acc_copyin(so->rhs, n1 * so->_cntml_padded * sizeof(double)); - acc_memcpy_to_device(&(d_so->rhs), &pd, sizeof(double*)); + auto pd = (double*) cnrn_gpu_copyin(so->rhs, n1 * so->_cntml_padded * sizeof(double)); + cnrn_memcpy_to_device(&(d_so->rhs), &pd, sizeof(double*)); - auto d_coef_list = (double**) acc_copyin(so->coef_list, so->coef_list_size * sizeof(double*)); - acc_memcpy_to_device(&(d_so->coef_list), &d_coef_list, sizeof(double**)); + auto d_coef_list = (double**) cnrn_gpu_copyin(so->coef_list, so->coef_list_size * sizeof(double*)); + cnrn_memcpy_to_device(&(d_so->coef_list), &d_coef_list, sizeof(double**)); // Fill in relevant Elm pointer values for (unsigned irow = 1; irow < n1; ++irow) { for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) { - Elm* pelm = (Elm*) acc_copyin(elm, sizeof(Elm)); + Elm* pelm = (Elm*) cnrn_gpu_copyin(elm, sizeof(Elm)); if (elm == so->rowst[irow]) { - acc_memcpy_to_device(&(d_rowst[irow]), &pelm, sizeof(Elm*)); + cnrn_memcpy_to_device(&(d_rowst[irow]), &pelm, sizeof(Elm*)); } else { - Elm* d_e = (Elm*) acc_deviceptr(elm->c_left); - acc_memcpy_to_device(&(pelm->c_left), &d_e, sizeof(Elm*)); + Elm* d_e = (Elm*) cnrn_target_deviceptr(elm->c_left); + cnrn_memcpy_to_device(&(pelm->c_left), &d_e, sizeof(Elm*)); } if (elm->col == elm->row) { - acc_memcpy_to_device(&(d_diag[irow]), &pelm, sizeof(Elm*)); + cnrn_memcpy_to_device(&(d_diag[irow]), &pelm, sizeof(Elm*)); } if (irow > 1) { if (elm->r_up) { - Elm* d_e = (Elm*) acc_deviceptr(elm->r_up); - acc_memcpy_to_device(&(pelm->r_up), &d_e, sizeof(Elm*)); + Elm* d_e = (Elm*) cnrn_target_deviceptr(elm->r_up); + cnrn_memcpy_to_device(&(pelm->r_up), &d_e, sizeof(Elm*)); } } - pd = (double*) acc_copyin(elm->value, so->_cntml_padded * sizeof(double)); - acc_memcpy_to_device(&(pelm->value), &pd, sizeof(double*)); + pd = (double*) cnrn_gpu_copyin(elm->value, so->_cntml_padded * sizeof(double)); + cnrn_memcpy_to_device(&(pelm->value), &pd, sizeof(double*)); } } // visit all the Elm again and fill in pelm->r_down and pelm->c_left for (unsigned irow = 1; irow < n1; ++irow) { for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) { - auto pelm = (Elm*) acc_deviceptr(elm); + auto pelm = (Elm*) cnrn_target_deviceptr(elm); if (elm->r_down) { - auto d_e = (Elm*) acc_deviceptr(elm->r_down); - acc_memcpy_to_device(&(pelm->r_down), &d_e, sizeof(Elm*)); + auto d_e = (Elm*) cnrn_target_deviceptr(elm->r_down); + cnrn_memcpy_to_device(&(pelm->r_down), &d_e, sizeof(Elm*)); } if (elm->c_right) { - auto d_e = (Elm*) acc_deviceptr(elm->c_right); - acc_memcpy_to_device(&(pelm->c_right), &d_e, 
sizeof(Elm*)); + auto d_e = (Elm*) cnrn_target_deviceptr(elm->c_right); + cnrn_memcpy_to_device(&(pelm->c_right), &d_e, sizeof(Elm*)); } } } // Fill in the d_so->coef_list for (unsigned i = 0; i < so->coef_list_size; ++i) { - pd = (double*) acc_deviceptr(so->coef_list[i]); - acc_memcpy_to_device(&(d_coef_list[i]), &pd, sizeof(double*)); + pd = (double*) cnrn_target_deviceptr(so->coef_list[i]); + cnrn_memcpy_to_device(&(d_coef_list[i]), &pd, sizeof(double*)); } #endif } @@ -1226,16 +1349,16 @@ void nrn_sparseobj_delete_from_device(SparseObj* so) { unsigned n1 = so->neqn + 1; for (unsigned irow = 1; irow < n1; ++irow) { for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) { - acc_delete(elm->value, so->_cntml_padded * sizeof(double)); - acc_delete(elm, sizeof(Elm)); + cnrn_target_delete(elm->value, so->_cntml_padded * sizeof(double)); + cnrn_target_delete(elm, sizeof(Elm)); } } - acc_delete(so->coef_list, so->coef_list_size * sizeof(double*)); - acc_delete(so->rhs, n1 * so->_cntml_padded * sizeof(double)); - acc_delete(so->ngetcall, so->_cntml_padded * sizeof(unsigned)); - acc_delete(so->diag, n1 * sizeof(Elm*)); - acc_delete(so->rowst, n1 * sizeof(Elm*)); - acc_delete(so, sizeof(SparseObj)); + cnrn_target_delete(so->coef_list, so->coef_list_size * sizeof(double*)); + cnrn_target_delete(so->rhs, n1 * so->_cntml_padded * sizeof(double)); + cnrn_target_delete(so->ngetcall, so->_cntml_padded * sizeof(unsigned)); + cnrn_target_delete(so->diag, n1 * sizeof(Elm*)); + cnrn_target_delete(so->rowst, n1 * sizeof(Elm*)); + cnrn_target_delete(so, sizeof(SparseObj)); #endif } @@ -1243,14 +1366,14 @@ void nrn_sparseobj_delete_from_device(SparseObj* so) { void nrn_ion_global_map_copyto_device() { if (nrn_ion_global_map_size) { - double** d_data = (double**) acc_copyin(nrn_ion_global_map, + double** d_data = (double**) cnrn_gpu_copyin(nrn_ion_global_map, sizeof(double*) * nrn_ion_global_map_size); for (int j = 0; j < nrn_ion_global_map_size; j++) { if (nrn_ion_global_map[j]) { - double* d_mechmap = (double*) acc_copyin(nrn_ion_global_map[j], + double* d_mechmap = (double*) cnrn_gpu_copyin(nrn_ion_global_map[j], ion_global_map_member_size * sizeof(double)); - acc_memcpy_to_device(&(d_data[j]), &d_mechmap, sizeof(double*)); + cnrn_memcpy_to_device(&(d_data[j]), &d_mechmap, sizeof(double*)); } } } @@ -1259,11 +1382,11 @@ void nrn_ion_global_map_copyto_device() { void nrn_ion_global_map_delete_from_device() { for (int j = 0; j < nrn_ion_global_map_size; j++) { if (nrn_ion_global_map[j]) { - acc_delete(nrn_ion_global_map[j], ion_global_map_member_size * sizeof(double)); + cnrn_target_delete(nrn_ion_global_map[j], ion_global_map_member_size * sizeof(double)); } } if (nrn_ion_global_map_size) { - acc_delete(nrn_ion_global_map, sizeof(double*) * nrn_ion_global_map_size); + cnrn_target_delete(nrn_ion_global_map, sizeof(double*) * nrn_ion_global_map_size); } } @@ -1317,8 +1440,8 @@ void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay) { VecPlayContinuous* vecplay_instance = (VecPlayContinuous*) nt->_vecplay[i]; /** just VecPlayContinuous object */ - void* d_p = (void*) acc_copyin(vecplay_instance, sizeof(VecPlayContinuous)); - acc_memcpy_to_device(&(d_vecplay[i]), &d_p, sizeof(void*)); + void* d_p = (void*) cnrn_gpu_copyin(vecplay_instance, sizeof(VecPlayContinuous)); + cnrn_memcpy_to_device(&(d_vecplay[i]), &d_p, sizeof(void*)); VecPlayContinuous* d_vecplay_instance = (VecPlayContinuous*) d_p; @@ -1327,32 +1450,33 @@ void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay) { 
copy_ivoc_vect_to_device(vecplay_instance->t_, d_vecplay_instance->t_); if (vecplay_instance->discon_indices_) { copy_ivoc_vect_to_device(*(vecplay_instance->discon_indices_), - *(d_vecplay_instance->discon_indices_)); + *(d_vecplay_instance->discon_indices_), + true); } /** copy PlayRecordEvent : todo: verify this */ - PlayRecordEvent* d_e_ = (PlayRecordEvent*) acc_copyin(vecplay_instance->e_, + PlayRecordEvent* d_e_ = (PlayRecordEvent*) cnrn_gpu_copyin(vecplay_instance->e_, sizeof(PlayRecordEvent)); - acc_memcpy_to_device(&(d_e_->plr_), &d_vecplay_instance, sizeof(VecPlayContinuous*)); - acc_memcpy_to_device(&(d_vecplay_instance->e_), &d_e_, sizeof(PlayRecordEvent*)); + cnrn_memcpy_to_device(&(d_e_->plr_), &d_vecplay_instance, sizeof(VecPlayContinuous*)); + cnrn_memcpy_to_device(&(d_vecplay_instance->e_), &d_e_, sizeof(PlayRecordEvent*)); /** copy pd_ : note that it's pointer inside ml->data and hence data itself is * already on GPU */ - double* d_pd_ = (double*) acc_deviceptr(vecplay_instance->pd_); - acc_memcpy_to_device(&(d_vecplay_instance->pd_), &d_pd_, sizeof(double*)); + double* d_pd_ = (double*) cnrn_target_deviceptr(vecplay_instance->pd_); + cnrn_memcpy_to_device(&(d_vecplay_instance->pd_), &d_pd_, sizeof(double*)); } } void nrn_VecPlay_delete_from_device(NrnThread* nt) { for (int i = 0; i < nt->n_vecplay; i++) { auto* vecplay_instance = reinterpret_cast(nt->_vecplay[i]); - acc_delete(vecplay_instance->e_, sizeof(PlayRecordEvent)); + cnrn_target_delete(vecplay_instance->e_, sizeof(PlayRecordEvent)); if (vecplay_instance->discon_indices_) { delete_ivoc_vect_from_device(*(vecplay_instance->discon_indices_)); } delete_ivoc_vect_from_device(vecplay_instance->t_); delete_ivoc_vect_from_device(vecplay_instance->y_); - acc_delete(vecplay_instance, sizeof(VecPlayContinuous)); + cnrn_target_delete(vecplay_instance, sizeof(VecPlayContinuous)); } } diff --git a/coreneuron/utils/vrecord.cpp b/coreneuron/utils/vrecord.cpp index 8af2b028e..a972e754a 100644 --- a/coreneuron/utils/vrecord.cpp +++ b/coreneuron/utils/vrecord.cpp @@ -78,7 +78,8 @@ void VecPlayContinuous::deliver(double tt, NetCvode* ns) { last_index_ = ubound_index_; // clang-format off - #pragma acc update device(last_index_) if (nt->compute_gpu) + nrn_pragma_acc(update device(last_index_) if (nt->compute_gpu)) + nrn_pragma_omp(target update to(last_index_) if (nt->compute_gpu)) // clang-format on if (discon_indices_) { if (discon_index_ < discon_indices_->size()) { @@ -96,7 +97,8 @@ void VecPlayContinuous::deliver(double tt, NetCvode* ns) { } // clang-format off - #pragma acc update device(ubound_index_) if (nt->compute_gpu) + nrn_pragma_acc(update device(ubound_index_) if (nt->compute_gpu)) + nrn_pragma_omp(target update to(ubound_index_) if (nt->compute_gpu)) // clang-format on continuous(tt); } @@ -105,7 +107,8 @@ void VecPlayContinuous::continuous(double tt) { NrnThread* nt = nrn_threads + ith_; // clang-format off - #pragma acc kernels present(this) if(nt->compute_gpu) + nrn_pragma_acc(kernels present(this) if(nt->compute_gpu)) + nrn_pragma_omp(target if(nt->compute_gpu)) { *pd_ = interpolate(tt); } From 57f77244fd91b7d05313588a38587c7b75327efa Mon Sep 17 00:00:00 2001 From: Christos Kotsalos Date: Fri, 10 Dec 2021 16:22:55 +0100 Subject: [PATCH 11/31] small openacc fixes (#707) --- coreneuron/gpu/nrn_acc_manager.cpp | 55 +++++++++++++++--------------- 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp index 089b90848..373fcdbc3 100644 --- 
a/coreneuron/gpu/nrn_acc_manager.cpp +++ b/coreneuron/gpu/nrn_acc_manager.cpp @@ -169,7 +169,6 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { /*update d_nt._data to point to device copy */ cnrn_memcpy_to_device(&(d_nt->_data), &d__data, sizeof(double*)); - auto host_id = omp_get_initial_device(); /* -- setup rhs, d, a, b, v, node_aread to point to device copy -- */ double* dptr; @@ -725,7 +724,7 @@ void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb) { nsb->_weight_index[:nsb->_cnt], nsb->_nsb_t[:nsb->_cnt], nsb->_nsb_flag[:nsb->_cnt]) - if nsb->_cnt) + if (nsb->_cnt)) nrn_pragma_omp(target update from( nsb->_sendtype[:nsb->_cnt], nsb->_vdata_index[:nsb->_cnt], @@ -766,7 +765,7 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { nt->_actual_v[:ne], nt->_actual_area[:ne])) - nrn_pragma_acc(update self(nt->_actual_diam[:ne]) if nt->_actual_diam) + nrn_pragma_acc(update self(nt->_actual_diam[:ne]) if (nt->_actual_diam != nullptr)) nrn_pragma_omp(target update from(nt->_actual_diam[:ne]) if (nt->_actual_diam != nullptr)) /* @todo: nt._ml_list[tml->index] = tml->ml; */ @@ -775,8 +774,8 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { for (auto tml = nt->tml; tml; tml = tml->next) { Memb_list* ml = tml->ml; - nrn_pragma_acc(update self(&tml->index, - &ml->nodecount)) + nrn_pragma_acc(update self(tml->index, + ml->nodecount)) nrn_pragma_omp(target update from(tml->index, ml->nodecount)) @@ -801,22 +800,22 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { ml->nodeindices[:n])) int dpcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; - nrn_pragma_acc(update self(ml->pdata[:dpcnt]) if szdp) + nrn_pragma_acc(update self(ml->pdata[:dpcnt]) if (szdp)) nrn_pragma_omp(target update from(ml->pdata[:dpcnt]) if (szdp)) auto nrb = tml->ml->_net_receive_buffer; nrn_pragma_acc(update self( - &nrb->_cnt, - &nrb->_size, - &nrb->_pnt_offset, - &nrb->_displ_cnt, + nrb->_cnt, + nrb->_size, + nrb->_pnt_offset, + nrb->_displ_cnt, nrb->_pnt_index[:nrb->_size], nrb->_weight_index[:nrb->_size], nrb->_displ[:nrb->_size + 1], nrb->_nrb_index[:nrb->_size]) - if nrb) + if (nrb != nullptr)) nrn_pragma_omp(target update from( nrb->_cnt, nrb->_size, @@ -835,28 +834,28 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { /* copy shadow_d to host */ nrn_pragma_acc(update self(nt->_shadow_rhs[:pcnt], nt->_shadow_d[:pcnt]) - if nt->shadow_rhs_cnt) + if (nt->shadow_rhs_cnt)) nrn_pragma_omp(target update from(nt->_shadow_rhs[:pcnt], nt->_shadow_d[:pcnt]) if (nt->shadow_rhs_cnt)) nrn_pragma_acc(update self(nt->nrn_fast_imem->nrn_sav_rhs[:nt->end], nt->nrn_fast_imem->nrn_sav_d[:nt->end]) - if nt->nrn_fast_imem) + if (nt->nrn_fast_imem != nullptr)) nrn_pragma_omp(target update from(nt->nrn_fast_imem->nrn_sav_rhs[:nt->end], nt->nrn_fast_imem->nrn_sav_d[:nt->end]) if (nt->nrn_fast_imem != nullptr)) - nrn_pragma_acc(update self(nt->pntprocs[:nt->n_pntproc]) if nt->n_pntproc) + nrn_pragma_acc(update self(nt->pntprocs[:nt->n_pntproc]) if (nt->n_pntproc)) nrn_pragma_omp(target update from(nt->pntprocs[:nt->n_pntproc]) if (nt->n_pntproc)) - nrn_pragma_acc(update self(nt->weights[:nt->n_weight]) if nt->n_weight) + nrn_pragma_acc(update self(nt->weights[:nt->n_weight]) if (nt->n_weight)) nrn_pragma_omp(target update from(nt->weights[:nt->n_weight]) if (nt->n_weight)) nrn_pragma_acc(update self( nt->presyns_helper[:nt->n_presyn], nt->presyns[:nt->n_presyn]) - if nt->n_presyn) + if (nt->n_presyn)) nrn_pragma_omp(target update 
from( nt->presyns_helper[:nt->n_presyn], nt->presyns[:nt->n_presyn]) @@ -914,7 +913,7 @@ void update_nrnthreads_on_device(NrnThread* threads, int nthreads) { nt->_actual_v[:ne], nt->_actual_area[:ne])) - nrn_pragma_acc(update device(nt->_actual_diam[:ne]) if nt->_actual_diam) + nrn_pragma_acc(update device(nt->_actual_diam[:ne]) if (nt->_actual_diam != nullptr)) nrn_pragma_omp(target update to(nt->_actual_diam[:ne]) if (nt->_actual_diam != nullptr)) /* @todo: nt._ml_list[tml->index] = tml->ml; */ @@ -937,19 +936,19 @@ void update_nrnthreads_on_device(NrnThread* threads, int nthreads) { nrn_pragma_omp(target update to(ml->nodeindices[:n]) if (!corenrn.get_is_artificial()[type])) int dpcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; - nrn_pragma_acc(update device(ml->pdata[:dpcnt]) if szdp) + nrn_pragma_acc(update device(ml->pdata[:dpcnt]) if (szdp)) nrn_pragma_omp(target update to(ml->pdata[:dpcnt]) if (szdp)) auto nrb = tml->ml->_net_receive_buffer; - nrn_pragma_acc(update device(&nrb->_cnt, - &nrb->_size, - &nrb->_pnt_offset, - &nrb->_displ_cnt, + nrn_pragma_acc(update device(nrb->_cnt, + nrb->_size, + nrb->_pnt_offset, + nrb->_displ_cnt, nrb->_pnt_index[:nrb->_size], nrb->_weight_index[:nrb->_size], nrb->_displ[:nrb->_size], nrb->_nrb_index[:nrb->_size]) - if nrb) + if (nrb != nullptr)) nrn_pragma_omp(target update to(nrb->_cnt, nrb->_size, nrb->_pnt_offset, @@ -965,7 +964,7 @@ void update_nrnthreads_on_device(NrnThread* threads, int nthreads) { nrn_pragma_acc(update device(nt->_shadow_rhs[:pcnt], /* copy shadow_d to host */ nt->_shadow_d[:pcnt]) - if nt->shadow_rhs_cnt) + if (nt->shadow_rhs_cnt)) nrn_pragma_omp(target update to(nt->_shadow_rhs[:pcnt], /* copy shadow_d to host */ nt->_shadow_d[:pcnt]) @@ -974,22 +973,22 @@ void update_nrnthreads_on_device(NrnThread* threads, int nthreads) { nrn_pragma_acc(update device(nt->nrn_fast_imem->nrn_sav_rhs[:nt->end], nt->nrn_fast_imem->nrn_sav_d[:nt->end]) - if nt->nrn_fast_imem) + if (nt->nrn_fast_imem != nullptr)) nrn_pragma_omp(target update to(nt->nrn_fast_imem->nrn_sav_rhs[:nt->end], nt->nrn_fast_imem->nrn_sav_d[:nt->end]) if (nt->nrn_fast_imem != nullptr)) nrn_pragma_acc(update device(nt->pntprocs[:nt->n_pntproc]) - if nt->n_pntproc) + if (nt->n_pntproc)) nrn_pragma_omp(target update to(nt->pntprocs[:nt->n_pntproc]) if (nt->n_pntproc)) - nrn_pragma_acc(update device(nt->weights[:nt->n_weight]) if nt->n_weight) + nrn_pragma_acc(update device(nt->weights[:nt->n_weight]) if (nt->n_weight)) nrn_pragma_omp(target update to(nt->weights[:nt->n_weight]) if (nt->n_weight)) nrn_pragma_acc(update device(nt->presyns_helper[:nt->n_presyn], nt->presyns[:nt->n_presyn]) - if nt->n_presyn) + if (nt->n_presyn)) nrn_pragma_omp(target update to(nt->presyns_helper[:nt->n_presyn], nt->presyns[:nt->n_presyn]) if (nt->n_presyn)) From 56889cccaafedfffe3948cb9e721a2b66a1bd14f Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Mon, 13 Dec 2021 11:33:51 +0100 Subject: [PATCH 12/31] Fixup to make the CI work better while finalising hackathon changes. 
---
 coreneuron/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt
index 2308ab99a..437eb8ea7 100644
--- a/coreneuron/CMakeLists.txt
+++ b/coreneuron/CMakeLists.txt
@@ -303,7 +303,7 @@ set_target_properties(
 # create special-core with halfgap.mod for tests
 # =============================================================================
 set(all_output_binaries)
-if(NOT ${CORENRN_EXTERNAL_BENCHMARK_DATA} STREQUAL "")
+if(EXISTS "${CORENRN_EXTERNAL_BENCHMARK_DATA}")
   # Hack for the december 2021 hackathon, build an extra special-core with channel-benchmark
   # mechanisms.
   set(modfile_directory

From 01a39d7d12f14a737b1e37cdecca1deadd21b102 Mon Sep 17 00:00:00 2001
From: Christos Kotsalos
Date: Mon, 13 Dec 2021 13:45:59 +0100
Subject: [PATCH 13/31] solve_interleaved2_launcher (CUDA interface) : fixing size of blocksPerGrid & threadsPerBlock (#710)

---
 coreneuron/permute/cellorder.cu | 35 ++++++++++++++++++++-------------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/coreneuron/permute/cellorder.cu b/coreneuron/permute/cellorder.cu
index 82198410f..1226b4bf7 100644
--- a/coreneuron/permute/cellorder.cu
+++ b/coreneuron/permute/cellorder.cu
@@ -72,25 +72,32 @@ __global__ void solve_interleaved2_kernel(NrnThread* nt, InterleaveInfo* ii, int
     int* rootbegin = ii->firstnode;  // nwarp+1 of these
     int* nodebegin = ii->lastnode;   // nwarp+1 of these
-    int iwarp = icore / warpsize;     // figure out the >> value
-    int ic = icore & (warpsize - 1);  // figure out the & mask
-    int ncycle = ncycles[iwarp];
-    int* stride = strides + stridedispl[iwarp];
-    int root = rootbegin[iwarp];
-    int lastroot = rootbegin[iwarp + 1];
-    int firstnode = nodebegin[iwarp];
-    int lastnode = nodebegin[iwarp + 1];
-
-    triang_interleaved2_device(nt, ic, ncycle, stride, lastnode);
-    bksub_interleaved2_device(nt, root + ic, lastroot, ic, ncycle, stride, firstnode);
+    while (icore < ncore) {
+        int iwarp = icore / warpsize;     // figure out the >> value
+        int ic = icore & (warpsize - 1);  // figure out the & mask
+        int ncycle = ncycles[iwarp];
+        int* stride = strides + stridedispl[iwarp];
+        int root = rootbegin[iwarp];
+        int lastroot = rootbegin[iwarp + 1];
+        int firstnode = nodebegin[iwarp];
+        int lastnode = nodebegin[iwarp + 1];
+
+        triang_interleaved2_device(nt, ic, ncycle, stride, lastnode);
+        bksub_interleaved2_device(nt, root + ic, lastroot, ic, ncycle, stride, firstnode);
+
+        icore += blockDim.x * gridDim.x;
+    }
 }

 void solve_interleaved2_launcher(NrnThread* nt, InterleaveInfo* info, int ncore, void* stream) {
     auto cuda_stream = static_cast<cudaStream_t>(stream);

-    int threadsPerBlock = warpsize;
-    // TODO: Should blocksPerGrid be a fixed number and have a while block inside the kernel?
-    int blocksPerGrid = (ncore + threadsPerBlock - 1) / threadsPerBlock;
+    // the selection of these parameters has been done after running the channel-benchmark for typical production runs, i.e.
+    // 1 MPI task with 1440 cells & 6 MPI tasks with 8800 cells.
+    // The main idea is to have multiple warps per SM and sufficient blocks to fill the GPU.
+    // In our case, given that multiple threads share the available GPUs, we "guarantee" a sufficient occupancy of the GPUs.
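    // For a sense of scale (illustrative arithmetic only, not figures taken from the
    // benchmark mentioned above): 512 blocks x 128 threads = 65,536 resident CUDA threads,
    // and the grid-stride loop added to the kernel makes each thread process icore,
    // icore + 65536, icore + 2 * 65536, ... A hypothetical ncore of 200,000 therefore
    // costs at most ceil(200000 / 65536) = 4 iterations per thread, while smaller inputs
    // simply leave the surplus threads idle after their first bounds check.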
+    int threadsPerBlock = 128;
+    int blocksPerGrid = 512;

     solve_interleaved2_kernel<<<blocksPerGrid, threadsPerBlock, 0, cuda_stream>>>(nt, info, ncore);

From 0fe815e525d00de617b20e6a6cc9e3213ddbe0b8 Mon Sep 17 00:00:00 2001
From: Nicolas Cornu
Date: Mon, 13 Dec 2021 16:08:08 +0100
Subject: [PATCH 14/31] OpenMP offload: use #pragma instead of runtime API (#708)

* Use #pragma omp instead of runtime API in `cnrn_target_{copyin,delete}`
* Fix `VecPlayContinuous::discon_indices_` device transfer.
* Name `cnrn_target_` wrappers more consistently.

Co-authored-by: Olli Lupton
---
 coreneuron/gpu/nrn_acc_manager.cpp | 611 ++++++++++++++---------------
 1 file changed, 291 insertions(+), 320 deletions(-)

diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
index 373fcdbc3..4fe0004fd 100644
--- a/coreneuron/gpu/nrn_acc_manager.cpp
+++ b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -36,63 +36,55 @@
 #endif
 namespace coreneuron {
 extern InterleaveInfo* interleave_info;
-void copy_ivoc_vect_to_device(const IvocVect& iv, IvocVect& div, bool vector_copy_needed = false);
+void copy_ivoc_vect_to_device(const IvocVect& iv, IvocVect& div);
 void delete_ivoc_vect_from_device(IvocVect&);
 void nrn_ion_global_map_copyto_device();
 void nrn_ion_global_map_delete_from_device();
 void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay);
 void nrn_VecPlay_delete_from_device(NrnThread* nt);

-void* cnrn_gpu_copyin(void* h_ptr, std::size_t len) {
+template <typename T>
+T* cnrn_target_deviceptr(const T* h_ptr) {
 #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
-    return acc_copyin(h_ptr, len);
+    return static_cast<T*>(acc_deviceptr(const_cast<T*>(h_ptr)));
 #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
-    auto host_id = omp_get_initial_device();
-    auto device_id = omp_get_default_device();
-    auto* d_ptr = omp_target_alloc(len, device_id);
-    nrn_assert(d_ptr != nullptr);
-    nrn_assert(omp_target_memcpy(d_ptr, h_ptr, len, 0, 0, device_id, host_id) == 0);
-    nrn_assert(omp_target_associate_ptr(h_ptr, d_ptr, len, 0, device_id) == 0);
-    return d_ptr;
+    return static_cast<T*>(omp_get_mapped_ptr(const_cast<T*>(h_ptr), omp_get_default_device()));
 #else
-    throw std::runtime_error("cnrn_gpu_copyin() not implemented without OpenACC/OpenMP and gpu build");
+    throw std::runtime_error("cnrn_target_deviceptr() not implemented without OpenACC/OpenMP and gpu build");
 #endif
 }

-void cnrn_memcpy_to_device(void* d_ptr, void* h_ptr, size_t len) {
+template <typename T>
+T* cnrn_target_copyin(const T* h_ptr, std::size_t len = 1) {
 #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
-    acc_memcpy_to_device(d_ptr, h_ptr, len);
+    return static_cast<T*>(acc_copyin(const_cast<T*>(h_ptr), len * sizeof(T)));
 #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
-    auto host_id = omp_get_initial_device();
-    auto device_id = omp_get_default_device();
-    omp_target_memcpy(d_ptr, h_ptr, len, 0, 0, device_id, host_id);
+    #pragma omp target enter data map(to:h_ptr[:len])
+    return cnrn_target_deviceptr(const_cast<T*>(h_ptr));
 #else
-    throw std::runtime_error("cnrn_memcpy_to_device() not implemented without OpenACC/OpenMP and gpu build");
+    throw std::runtime_error("cnrn_target_copyin() not implemented without OpenACC/OpenMP and gpu build");
 #endif
 }

-void cnrn_target_delete(void* h_ptr, size_t len) {
+template <typename T>
+void cnrn_target_delete(T* h_ptr, std::size_t len = 1) {
 #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
-    acc_delete(h_ptr, len);
+    acc_delete(h_ptr, len * sizeof(T));
 #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
-    (void)len;
-    auto device_id = omp_get_default_device();
-    omp_target_disassociate_ptr(h_ptr, device_id);
-    auto* d_ptr = omp_get_mapped_ptr(h_ptr, device_id);
-    omp_target_free(d_ptr, device_id);
+    #pragma omp target exit data map(delete: h_ptr[:len])
 #else
     throw std::runtime_error("cnrn_target_delete() not implemented without OpenACC/OpenMP and gpu build");
 #endif
 }

-void* cnrn_target_deviceptr(void* h_ptr) {
+template <typename T>
+void cnrn_target_memcpy_to_device(T* d_ptr, const T* h_ptr, std::size_t len = 1) {
 #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
-    return acc_deviceptr(h_ptr);
+    acc_memcpy_to_device(d_ptr, const_cast<T*>(h_ptr), len * sizeof(T));
 #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
-    auto device_id = omp_get_default_device();
-    return omp_get_mapped_ptr(h_ptr, device_id);
+    omp_target_memcpy(d_ptr, const_cast<T*>(h_ptr), len * sizeof(T), 0, 0, omp_get_default_device(), omp_get_initial_device());
 #else
-    throw std::runtime_error("cnrn_target_delete() not implemented without OpenACC/OpenMP and gpu build");
+    throw std::runtime_error("cnrn_target_memcpy_to_device() not implemented without OpenACC/OpenMP and gpu build");
 #endif
 }

@@ -114,13 +106,13 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
         NrnThread* nt = threads + i;  // NrnThread on host

         if (nt->n_presyn) {
-            PreSyn* d_presyns = (PreSyn*) cnrn_gpu_copyin(nt->presyns, sizeof(PreSyn) * nt->n_presyn);
+            PreSyn* d_presyns = cnrn_target_copyin(nt->presyns, nt->n_presyn);
         }

         if (nt->n_vecplay) {
             /* copy VecPlayContinuous instances */
             /** just empty containers */
-            void** d_vecplay = (void**) cnrn_gpu_copyin(nt->_vecplay, sizeof(void*) * nt->n_vecplay);
+            void** d_vecplay = cnrn_target_copyin(nt->_vecplay, nt->n_vecplay);
             // note: we are using unified memory for NrnThread. Once VecPlay is copied to gpu,
             // we dont want to update nt->vecplay because it will also set gpu pointer of vecplay
             // inside nt on cpu (due to unified memory).
@@ -138,7 +130,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
      * find
      * corresponding NrnThread using Point_process in NET_RECEIVE block
      */
-    NrnThread* d_threads = (NrnThread*) cnrn_gpu_copyin(threads, sizeof(NrnThread) * nthreads);
+    NrnThread* d_threads = cnrn_target_copyin(threads, nthreads);

     if (interleave_info == nullptr) {
         printf("\n Warning: No permutation data? 
Required for linear algebra!"); @@ -157,7 +149,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { /* -- copy _data to device -- */ /*copy all double data for thread */ - d__data = (double*) cnrn_gpu_copyin(nt->_data, nt->_ndata * sizeof(double)); + d__data = cnrn_target_copyin(nt->_data, nt->_ndata); /* Here is the example of using OpenACC data enter/exit @@ -168,7 +160,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { */ /*update d_nt._data to point to device copy */ - cnrn_memcpy_to_device(&(d_nt->_data), &d__data, sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_data), &d__data); /* -- setup rhs, d, a, b, v, node_aread to point to device copy -- */ double* dptr; @@ -177,36 +169,35 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { int ne = nrn_soa_padded_size(nt->end, 0); dptr = d__data + 0 * ne; - cnrn_memcpy_to_device(&(d_nt->_actual_rhs), &(dptr), sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_actual_rhs), &(dptr)); dptr = d__data + 1 * ne; - cnrn_memcpy_to_device(&(d_nt->_actual_d), &(dptr), sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_actual_d), &(dptr)); dptr = d__data + 2 * ne; - cnrn_memcpy_to_device(&(d_nt->_actual_a), &(dptr), sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_actual_a), &(dptr)); dptr = d__data + 3 * ne; - cnrn_memcpy_to_device(&(d_nt->_actual_b), &(dptr), sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_actual_b), &(dptr)); dptr = d__data + 4 * ne; - cnrn_memcpy_to_device(&(d_nt->_actual_v), &(dptr), sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_actual_v), &(dptr)); dptr = d__data + 5 * ne; - cnrn_memcpy_to_device(&(d_nt->_actual_area), &(dptr), sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_actual_area), &(dptr)); if (nt->_actual_diam) { dptr = d__data + 6 * ne; - cnrn_memcpy_to_device(&(d_nt->_actual_diam), &(dptr), sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_actual_diam), &(dptr)); } - int* d_v_parent_index = (int*) cnrn_gpu_copyin(nt->_v_parent_index, nt->end * sizeof(int)); - cnrn_memcpy_to_device(&(d_nt->_v_parent_index), &(d_v_parent_index), sizeof(int*)); + int* d_v_parent_index = cnrn_target_copyin(nt->_v_parent_index, nt->end); + cnrn_target_memcpy_to_device(&(d_nt->_v_parent_index), &(d_v_parent_index)); /* nt._ml_list is used in NET_RECEIVE block and should have valid membrane list id*/ - Memb_list** d_ml_list = (Memb_list**) cnrn_gpu_copyin(nt->_ml_list, - corenrn.get_memb_funcs().size() * - sizeof(Memb_list*)); - cnrn_memcpy_to_device(&(d_nt->_ml_list), &(d_ml_list), sizeof(Memb_list**)); + Memb_list** d_ml_list = cnrn_target_copyin(nt->_ml_list, + corenrn.get_memb_funcs().size()); + cnrn_target_memcpy_to_device(&(d_nt->_ml_list), &(d_ml_list)); /* -- copy NrnThreadMembList list ml to device -- */ @@ -217,26 +208,26 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { for (auto tml = nt->tml; tml; tml = tml->next) { /*copy tml to device*/ /*QUESTIONS: does tml will point to nullptr as in host ? 
: I assume so!*/ - auto d_tml = (NrnThreadMembList*) cnrn_gpu_copyin(tml, sizeof(NrnThreadMembList)); + auto d_tml = cnrn_target_copyin(tml); /*first tml is pointed by nt */ if (first_tml) { - cnrn_memcpy_to_device(&(d_nt->tml), &d_tml, sizeof(NrnThreadMembList*)); + cnrn_target_memcpy_to_device(&(d_nt->tml), &d_tml); first_tml = false; } else { /*rest of tml forms linked list */ - cnrn_memcpy_to_device(&(d_last_tml->next), &d_tml, sizeof(NrnThreadMembList*)); + cnrn_target_memcpy_to_device(&(d_last_tml->next), &d_tml); } // book keeping for linked-list d_last_tml = d_tml; /* now for every tml, there is a ml. copy that and setup pointer */ - auto d_ml = (Memb_list*) cnrn_gpu_copyin(tml->ml, sizeof(Memb_list)); - cnrn_memcpy_to_device(&(d_tml->ml), &d_ml, sizeof(Memb_list*)); + auto d_ml = cnrn_target_copyin(tml->ml); + cnrn_target_memcpy_to_device(&(d_tml->ml), &d_ml); /* setup nt._ml_list */ - cnrn_memcpy_to_device(&(d_ml_list[tml->index]), &d_ml, sizeof(Memb_list*)); + cnrn_target_memcpy_to_device(&(d_ml_list[tml->index]), &d_ml); int type = tml->index; int n = tml->ml->nodecount; @@ -245,26 +236,25 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { int is_art = corenrn.get_is_artificial()[type]; // get device pointer for corresponding mechanism data - dptr = (double*) cnrn_target_deviceptr(tml->ml->data); - cnrn_memcpy_to_device(&(d_ml->data), &(dptr), sizeof(double*)); + dptr = cnrn_target_deviceptr(tml->ml->data); + cnrn_target_memcpy_to_device(&(d_ml->data), &(dptr)); if (!is_art) { - int* d_nodeindices = (int*) cnrn_gpu_copyin(tml->ml->nodeindices, sizeof(int) * n); - cnrn_memcpy_to_device(&(d_ml->nodeindices), &d_nodeindices, sizeof(int*)); + int* d_nodeindices = cnrn_target_copyin(tml->ml->nodeindices, n); + cnrn_target_memcpy_to_device(&(d_ml->nodeindices), &d_nodeindices); } if (szdp) { int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; - int* d_pdata = (int*) cnrn_gpu_copyin(tml->ml->pdata, sizeof(int) * pcnt); - cnrn_memcpy_to_device(&(d_ml->pdata), &d_pdata, sizeof(int*)); + int* d_pdata = cnrn_target_copyin(tml->ml->pdata, pcnt); + cnrn_target_memcpy_to_device(&(d_ml->pdata), &d_pdata); } int ts = corenrn.get_memb_funcs()[type].thread_size_; if (ts) { - ThreadDatum* td = (ThreadDatum*) cnrn_gpu_copyin(tml->ml->_thread, - ts * sizeof(ThreadDatum)); - cnrn_memcpy_to_device(&(d_ml->_thread), &td, sizeof(ThreadDatum*)); + ThreadDatum* td = cnrn_target_copyin(tml->ml->_thread, ts); + cnrn_target_memcpy_to_device(&(d_ml->_thread), &td); } NetReceiveBuffer_t *nrb, *d_nrb; @@ -276,28 +266,26 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { // if net receive buffer exist for mechanism if (nrb) { - d_nrb = (NetReceiveBuffer_t*) cnrn_gpu_copyin(nrb, sizeof(NetReceiveBuffer_t)); - cnrn_memcpy_to_device(&(d_ml->_net_receive_buffer), - &d_nrb, - sizeof(NetReceiveBuffer_t*)); + d_nrb = cnrn_target_copyin(nrb); + cnrn_target_memcpy_to_device(&(d_ml->_net_receive_buffer), &d_nrb); - d_pnt_index = (int*) cnrn_gpu_copyin(nrb->_pnt_index, sizeof(int) * nrb->_size); - cnrn_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index, sizeof(int*)); + d_pnt_index = cnrn_target_copyin(nrb->_pnt_index, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index); - d_weight_index = (int*) cnrn_gpu_copyin(nrb->_weight_index, sizeof(int) * nrb->_size); - cnrn_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index, sizeof(int*)); + d_weight_index = cnrn_target_copyin(nrb->_weight_index, nrb->_size); + 
cnrn_target_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index); - d_nrb_t = (double*) cnrn_gpu_copyin(nrb->_nrb_t, sizeof(double) * nrb->_size); - cnrn_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t, sizeof(double*)); + d_nrb_t = cnrn_target_copyin(nrb->_nrb_t, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t); - d_nrb_flag = (double*) cnrn_gpu_copyin(nrb->_nrb_flag, sizeof(double) * nrb->_size); - cnrn_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag, sizeof(double*)); + d_nrb_flag = cnrn_target_copyin(nrb->_nrb_flag, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag); - d_displ = (int*) cnrn_gpu_copyin(nrb->_displ, sizeof(int) * (nrb->_size + 1)); - cnrn_memcpy_to_device(&(d_nrb->_displ), &d_displ, sizeof(int*)); + d_displ = cnrn_target_copyin(nrb->_displ, nrb->_size + 1); + cnrn_target_memcpy_to_device(&(d_nrb->_displ), &d_displ); - d_nrb_index = (int*) cnrn_gpu_copyin(nrb->_nrb_index, sizeof(int) * nrb->_size); - cnrn_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index, sizeof(int*)); + d_nrb_index = cnrn_target_copyin(nrb->_nrb_index, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index); } /* copy NetSendBuffer_t on to GPU */ @@ -309,26 +297,26 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { int* d_iptr; double* d_dptr; - d_nsb = (NetSendBuffer_t*) cnrn_gpu_copyin(nsb, sizeof(NetSendBuffer_t)); - cnrn_memcpy_to_device(&(d_ml->_net_send_buffer), &d_nsb, sizeof(NetSendBuffer_t*)); + d_nsb = cnrn_target_copyin(nsb); + cnrn_target_memcpy_to_device(&(d_ml->_net_send_buffer), &d_nsb); - d_iptr = (int*) cnrn_gpu_copyin(nsb->_sendtype, sizeof(int) * nsb->_size); - cnrn_memcpy_to_device(&(d_nsb->_sendtype), &d_iptr, sizeof(int*)); + d_iptr = cnrn_target_copyin(nsb->_sendtype, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_sendtype), &d_iptr); - d_iptr = (int*) cnrn_gpu_copyin(nsb->_vdata_index, sizeof(int) * nsb->_size); - cnrn_memcpy_to_device(&(d_nsb->_vdata_index), &d_iptr, sizeof(int*)); + d_iptr = cnrn_target_copyin(nsb->_vdata_index, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_vdata_index), &d_iptr); - d_iptr = (int*) cnrn_gpu_copyin(nsb->_pnt_index, sizeof(int) * nsb->_size); - cnrn_memcpy_to_device(&(d_nsb->_pnt_index), &d_iptr, sizeof(int*)); + d_iptr = cnrn_target_copyin(nsb->_pnt_index, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_pnt_index), &d_iptr); - d_iptr = (int*) cnrn_gpu_copyin(nsb->_weight_index, sizeof(int) * nsb->_size); - cnrn_memcpy_to_device(&(d_nsb->_weight_index), &d_iptr, sizeof(int*)); + d_iptr = cnrn_target_copyin(nsb->_weight_index, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_weight_index), &d_iptr); - d_dptr = (double*) cnrn_gpu_copyin(nsb->_nsb_t, sizeof(double) * nsb->_size); - cnrn_memcpy_to_device(&(d_nsb->_nsb_t), &d_dptr, sizeof(double*)); + d_dptr = cnrn_target_copyin(nsb->_nsb_t, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_nsb_t), &d_dptr); - d_dptr = (double*) cnrn_gpu_copyin(nsb->_nsb_flag, sizeof(double) * nsb->_size); - cnrn_memcpy_to_device(&(d_nsb->_nsb_flag), &d_dptr, sizeof(double*)); + d_dptr = cnrn_target_copyin(nsb->_nsb_flag, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_nsb_flag), &d_dptr); } } @@ -338,28 +326,25 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); /* copy shadow_rhs to device and fix-up the pointer */ - d_shadow_ptr = (double*) cnrn_gpu_copyin(nt->_shadow_rhs, pcnt * sizeof(double)); - 
cnrn_memcpy_to_device(&(d_nt->_shadow_rhs), &d_shadow_ptr, sizeof(double*)); + d_shadow_ptr = cnrn_target_copyin(nt->_shadow_rhs, pcnt); + cnrn_target_memcpy_to_device(&(d_nt->_shadow_rhs), &d_shadow_ptr); /* copy shadow_d to device and fix-up the pointer */ - d_shadow_ptr = (double*) cnrn_gpu_copyin(nt->_shadow_d, pcnt * sizeof(double)); - cnrn_memcpy_to_device(&(d_nt->_shadow_d), &d_shadow_ptr, sizeof(double*)); + d_shadow_ptr = cnrn_target_copyin(nt->_shadow_d, pcnt); + cnrn_target_memcpy_to_device(&(d_nt->_shadow_d), &d_shadow_ptr); } /* Fast membrane current calculation struct */ if (nt->nrn_fast_imem) { - auto* d_fast_imem = reinterpret_cast( - cnrn_gpu_copyin(nt->nrn_fast_imem, sizeof(NrnFastImem))); - cnrn_memcpy_to_device(&(d_nt->nrn_fast_imem), &d_fast_imem, sizeof(NrnFastImem*)); + NrnFastImem* d_fast_imem = cnrn_target_copyin(nt->nrn_fast_imem); + cnrn_target_memcpy_to_device(&(d_nt->nrn_fast_imem), &d_fast_imem); { - auto* d_ptr = reinterpret_cast( - cnrn_gpu_copyin(nt->nrn_fast_imem->nrn_sav_rhs, nt->end * sizeof(double))); - cnrn_memcpy_to_device(&(d_fast_imem->nrn_sav_rhs), &d_ptr, sizeof(double*)); + double* d_ptr = cnrn_target_copyin(nt->nrn_fast_imem->nrn_sav_rhs, nt->end); + cnrn_target_memcpy_to_device(&(d_fast_imem->nrn_sav_rhs), &d_ptr); } { - auto* d_ptr = reinterpret_cast( - cnrn_gpu_copyin(nt->nrn_fast_imem->nrn_sav_d, nt->end * sizeof(double))); - cnrn_memcpy_to_device(&(d_fast_imem->nrn_sav_d), &d_ptr, sizeof(double*)); + double* d_ptr = cnrn_target_copyin(nt->nrn_fast_imem->nrn_sav_d, nt->end); + cnrn_target_memcpy_to_device(&(d_fast_imem->nrn_sav_d), &d_ptr); } } @@ -367,21 +352,21 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { /* copy Point_processes array and fix the pointer to execute net_receive blocks on GPU */ Point_process* pntptr = - (Point_process*) cnrn_gpu_copyin(nt->pntprocs, nt->n_pntproc * sizeof(Point_process)); - cnrn_memcpy_to_device(&(d_nt->pntprocs), &pntptr, sizeof(Point_process*)); + cnrn_target_copyin(nt->pntprocs, nt->n_pntproc); + cnrn_target_memcpy_to_device(&(d_nt->pntprocs), &pntptr); } if (nt->n_weight) { /* copy weight vector used in NET_RECEIVE which is pointed by netcon.weight */ - double* d_weights = (double*) cnrn_gpu_copyin(nt->weights, sizeof(double) * nt->n_weight); - cnrn_memcpy_to_device(&(d_nt->weights), &d_weights, sizeof(double*)); + double* d_weights = cnrn_target_copyin(nt->weights, nt->n_weight); + cnrn_target_memcpy_to_device(&(d_nt->weights), &d_weights); } if (nt->_nvdata) { /* copy vdata which is setup in bbcore_read. 
This contains cuda allocated * nrnran123_State * */ - void** d_vdata = (void**) cnrn_gpu_copyin(nt->_vdata, sizeof(void*) * nt->_nvdata); - cnrn_memcpy_to_device(&(d_nt->_vdata), &d_vdata, sizeof(void**)); + void** d_vdata = cnrn_target_copyin(nt->_vdata, nt->_nvdata); + cnrn_target_memcpy_to_device(&(d_nt->_vdata), &d_vdata); } if (nt->n_presyn) { @@ -391,24 +376,24 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { * to * VTable and alignment */ PreSynHelper* d_presyns_helper = - (PreSynHelper*) cnrn_gpu_copyin(nt->presyns_helper, sizeof(PreSynHelper) * nt->n_presyn); - cnrn_memcpy_to_device(&(d_nt->presyns_helper), &d_presyns_helper, sizeof(PreSynHelper*)); - PreSyn* d_presyns = (PreSyn*) cnrn_gpu_copyin(nt->presyns, sizeof(PreSyn) * nt->n_presyn); - cnrn_memcpy_to_device(&(d_nt->presyns), &d_presyns, sizeof(PreSyn*)); + cnrn_target_copyin(nt->presyns_helper, nt->n_presyn); + cnrn_target_memcpy_to_device(&(d_nt->presyns_helper), &d_presyns_helper); + PreSyn* d_presyns = cnrn_target_copyin(nt->presyns, nt->n_presyn); + cnrn_target_memcpy_to_device(&(d_nt->presyns), &d_presyns); } if (nt->_net_send_buffer_size) { /* copy send_receive buffer */ - int* d_net_send_buffer = (int*) cnrn_gpu_copyin(nt->_net_send_buffer, - sizeof(int) * nt->_net_send_buffer_size); - cnrn_memcpy_to_device(&(d_nt->_net_send_buffer), &d_net_send_buffer, sizeof(int*)); + int* d_net_send_buffer = cnrn_target_copyin(nt->_net_send_buffer, + nt->_net_send_buffer_size); + cnrn_target_memcpy_to_device(&(d_nt->_net_send_buffer), &d_net_send_buffer); } if (nt->n_vecplay) { /* copy VecPlayContinuous instances */ /** just empty containers */ - void** d_vecplay = (void**) cnrn_gpu_copyin(nt->_vecplay, sizeof(void*) * nt->n_vecplay); - cnrn_memcpy_to_device(&(d_nt->_vecplay), &d_vecplay, sizeof(void**)); + void** d_vecplay = cnrn_target_copyin(nt->_vecplay, nt->n_vecplay); + cnrn_target_memcpy_to_device(&(d_nt->_vecplay), &d_vecplay); nrn_VecPlay_copyto_device(nt, d_vecplay); } @@ -417,41 +402,41 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { if (interleave_permute_type == 1) { /* todo: not necessary to setup pointers, just copy it */ InterleaveInfo* info = interleave_info + i; - InterleaveInfo* d_info = (InterleaveInfo*) cnrn_gpu_copyin(info, sizeof(InterleaveInfo)); int* d_ptr = nullptr; + InterleaveInfo* d_info = cnrn_target_copyin(info); - d_ptr = (int*) cnrn_gpu_copyin(info->stride, sizeof(int) * (info->nstride + 1)); - cnrn_memcpy_to_device(&(d_info->stride), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->stride, info->nstride + 1); + cnrn_target_memcpy_to_device(&(d_info->stride), &d_ptr); - d_ptr = (int*) cnrn_gpu_copyin(info->firstnode, sizeof(int) * nt->ncell); - cnrn_memcpy_to_device(&(d_info->firstnode), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->firstnode, nt->ncell); + cnrn_target_memcpy_to_device(&(d_info->firstnode), &d_ptr); - d_ptr = (int*) cnrn_gpu_copyin(info->lastnode, sizeof(int) * nt->ncell); - cnrn_memcpy_to_device(&(d_info->lastnode), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->lastnode, nt->ncell); + cnrn_target_memcpy_to_device(&(d_info->lastnode), &d_ptr); - d_ptr = (int*) cnrn_gpu_copyin(info->cellsize, sizeof(int) * nt->ncell); - cnrn_memcpy_to_device(&(d_info->cellsize), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->cellsize, nt->ncell); + cnrn_target_memcpy_to_device(&(d_info->cellsize), &d_ptr); } else if (interleave_permute_type == 2) { /* todo: not necessary to setup pointers, just copy it */ 
InterleaveInfo* info = interleave_info + i; - InterleaveInfo* d_info = (InterleaveInfo*) cnrn_gpu_copyin(info, sizeof(InterleaveInfo)); + InterleaveInfo* d_info = cnrn_target_copyin(info); int* d_ptr = nullptr; - d_ptr = (int*) cnrn_gpu_copyin(info->stride, sizeof(int) * info->nstride); - cnrn_memcpy_to_device(&(d_info->stride), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->stride, info->nstride); + cnrn_target_memcpy_to_device(&(d_info->stride), &d_ptr); - d_ptr = (int*) cnrn_gpu_copyin(info->firstnode, sizeof(int) * (info->nwarp + 1)); - cnrn_memcpy_to_device(&(d_info->firstnode), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->firstnode, info->nwarp + 1); + cnrn_target_memcpy_to_device(&(d_info->firstnode), &d_ptr); - d_ptr = (int*) cnrn_gpu_copyin(info->lastnode, sizeof(int) * (info->nwarp + 1)); - cnrn_memcpy_to_device(&(d_info->lastnode), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->lastnode, info->nwarp + 1); + cnrn_target_memcpy_to_device(&(d_info->lastnode), &d_ptr); - d_ptr = (int*) cnrn_gpu_copyin(info->stridedispl, sizeof(int) * (info->nwarp + 1)); - cnrn_memcpy_to_device(&(d_info->stridedispl), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->stridedispl, info->nwarp + 1); + cnrn_target_memcpy_to_device(&(d_info->stridedispl), &d_ptr); - d_ptr = (int*) cnrn_gpu_copyin(info->cellsize, sizeof(int) * info->nwarp); - cnrn_memcpy_to_device(&(d_info->cellsize), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->cellsize, info->nwarp); + cnrn_target_memcpy_to_device(&(d_info->cellsize), &d_ptr); } else { printf("\n ERROR: only --cell_permute = [12] implemented"); abort(); @@ -465,38 +450,30 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { if (tr) { // Create a device-side copy of the `trajec_requests` struct and // make sure the device-side NrnThread object knows about it. - auto* d_trajec_requests = reinterpret_cast( - cnrn_gpu_copyin(tr, sizeof(TrajectoryRequests))); - cnrn_memcpy_to_device(&(d_nt->trajec_requests), - &d_trajec_requests, - sizeof(TrajectoryRequests*)); + TrajectoryRequests* d_trajec_requests = cnrn_target_copyin(tr); + cnrn_target_memcpy_to_device(&(d_nt->trajec_requests), &d_trajec_requests); // Initialise the double** gather member of the struct. - auto* d_tr_gather = reinterpret_cast( - cnrn_gpu_copyin(tr->gather, sizeof(double*) * tr->n_trajec)); - cnrn_memcpy_to_device(&(d_trajec_requests->gather), &d_tr_gather, sizeof(double**)); + double** d_tr_gather = cnrn_target_copyin(tr->gather, tr->n_trajec); + cnrn_target_memcpy_to_device(&(d_trajec_requests->gather), &d_tr_gather); // Initialise the double** varrays member of the struct if it's // set. double** d_tr_varrays{nullptr}; if (tr->varrays) { - d_tr_varrays = reinterpret_cast( - cnrn_gpu_copyin(tr->varrays, sizeof(double*) * tr->n_trajec)); - cnrn_memcpy_to_device(&(d_trajec_requests->varrays), - &d_tr_varrays, - sizeof(double**)); + d_tr_varrays = cnrn_target_copyin(tr->varrays, tr->n_trajec); + cnrn_target_memcpy_to_device(&(d_trajec_requests->varrays), &d_tr_varrays); } for (int i = 0; i < tr->n_trajec; ++i) { if (tr->varrays) { // tr->varrays[i] is a buffer of tr->bsize doubles on the host, // make a device-side copy of it and store a pointer to it in // the device-side version of tr->varrays. 
- auto* d_buf_traj_i = reinterpret_cast( - cnrn_gpu_copyin(tr->varrays[i], tr->bsize * sizeof(double))); - cnrn_memcpy_to_device(&(d_tr_varrays[i]), &d_buf_traj_i, sizeof(double*)); + double* d_buf_traj_i = cnrn_target_copyin(tr->varrays[i], tr->bsize); + cnrn_target_memcpy_to_device(&(d_tr_varrays[i]), &d_buf_traj_i); } // tr->gather[i] is a double* referring to (host) data in the // (host) _data block auto* d_gather_i = cnrn_target_deviceptr(tr->gather[i]); - cnrn_memcpy_to_device(&(d_tr_gather[i]), &d_gather_i, sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_tr_gather[i]), &d_gather_i); } // TODO: other `double** scatter` and `void** vpr` members of // the TrajectoryRequests struct are not copied to the device. @@ -513,21 +490,15 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { #endif } -void copy_ivoc_vect_to_device(const IvocVect& from, IvocVect& to, bool vector_copy_needed) { +void copy_ivoc_vect_to_device(const IvocVect& from, IvocVect& to) { #ifdef _OPENACC /// by default `to` is desitionation pointer on a device IvocVect* d_iv = &to; - /// if we need to copy IvocVect vector then newly alloated vector - /// on the device is a new destination pointer - if(vector_copy_needed) { - d_iv = (IvocVect*) cnrn_gpu_copyin((void*) &from, sizeof(IvocVect)); - cnrn_memcpy_to_device(&to, &d_iv, sizeof(IvocVect*)); - } size_t n = from.size(); if (n) { - double* d_data = (double*) cnrn_gpu_copyin((void*) from.data(), sizeof(double) * n); - cnrn_memcpy_to_device(&(d_iv->data_), &d_data, sizeof(double*)); + double* d_data = cnrn_target_copyin(from.data(), n); + cnrn_target_memcpy_to_device(&(d_iv->data_), &d_data); } #else (void) from; @@ -539,9 +510,9 @@ void delete_ivoc_vect_from_device(IvocVect& vec) { #ifdef _OPENACC auto const n = vec.size(); if (n) { - cnrn_target_delete(vec.data(), sizeof(double) * n); + cnrn_target_delete(vec.data(), n); } - cnrn_target_delete(&vec, sizeof(IvocVect)); + cnrn_target_delete(&vec); #else (void) vec; #endif @@ -556,12 +527,12 @@ void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml) { #ifdef _OPENACC if (nt->compute_gpu) { // free existing vectors in buffers on gpu - cnrn_target_delete(nrb->_pnt_index, nrb->_size * sizeof(int)); - cnrn_target_delete(nrb->_weight_index, nrb->_size * sizeof(int)); - cnrn_target_delete(nrb->_nrb_t, nrb->_size * sizeof(double)); - cnrn_target_delete(nrb->_nrb_flag, nrb->_size * sizeof(double)); - cnrn_target_delete(nrb->_displ, (nrb->_size + 1) * sizeof(int)); - cnrn_target_delete(nrb->_nrb_index, nrb->_size * sizeof(int)); + cnrn_target_delete(nrb->_pnt_index, nrb->_size); + cnrn_target_delete(nrb->_weight_index, nrb->_size); + cnrn_target_delete(nrb->_nrb_t, nrb->_size); + cnrn_target_delete(nrb->_nrb_flag, nrb->_size); + cnrn_target_delete(nrb->_displ, nrb->_size + 1); + cnrn_target_delete(nrb->_nrb_index, nrb->_size); } #endif @@ -583,26 +554,26 @@ void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml) { nrn_pragma_acc(update device(nrb)); nrn_pragma_omp(target update to(nrb)); - NetReceiveBuffer_t* d_nrb = (NetReceiveBuffer_t*) cnrn_target_deviceptr(nrb); + NetReceiveBuffer_t* d_nrb = cnrn_target_deviceptr(nrb); // recopy the vectors in the buffer - d_pnt_index = (int*) cnrn_gpu_copyin(nrb->_pnt_index, sizeof(int) * nrb->_size); - cnrn_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index, sizeof(int*)); + d_pnt_index = cnrn_target_copyin(nrb->_pnt_index, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index); - d_weight_index = (int*) 
cnrn_gpu_copyin(nrb->_weight_index, sizeof(int) * nrb->_size); - cnrn_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index, sizeof(int*)); + d_weight_index = cnrn_target_copyin(nrb->_weight_index, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index); - d_nrb_t = (double*) cnrn_gpu_copyin(nrb->_nrb_t, sizeof(double) * nrb->_size); - cnrn_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t, sizeof(double*)); + d_nrb_t = cnrn_target_copyin(nrb->_nrb_t, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t); - d_nrb_flag = (double*) cnrn_gpu_copyin(nrb->_nrb_flag, sizeof(double) * nrb->_size); - cnrn_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag, sizeof(double*)); + d_nrb_flag = cnrn_target_copyin(nrb->_nrb_flag, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag); - d_displ = (int*) cnrn_gpu_copyin(nrb->_displ, sizeof(int) * (nrb->_size + 1)); - cnrn_memcpy_to_device(&(d_nrb->_displ), &d_displ, sizeof(int*)); + d_displ = cnrn_target_copyin(nrb->_displ, nrb->_size + 1); + cnrn_target_memcpy_to_device(&(d_nrb->_displ), &d_displ); - d_nrb_index = (int*) cnrn_gpu_copyin(nrb->_nrb_index, sizeof(int) * nrb->_size); - cnrn_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index, sizeof(int*)); + d_nrb_index = cnrn_target_copyin(nrb->_nrb_index, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index); } #endif } @@ -1039,21 +1010,21 @@ void update_weights_from_gpu(NrnThread* threads, int nthreads) { /** Cleanup device memory that is being tracked by the OpenACC runtime. * * This function painstakingly calls `cnrn_target_delete` in reverse order on all - * pointers that were passed to `cnrn_gpu_copyin` in `setup_nrnthreads_on_device`. + * pointers that were passed to `cnrn_target_copyin` in `setup_nrnthreads_on_device`. * This cleanup ensures that if the GPU is initialised multiple times from the * same process then the OpenACC runtime will not be polluted with old * pointers, which can cause errors. In particular if we do: * @code * { * // ... some_ptr is dynamically allocated ... - * cnrn_gpu_copyin(some_ptr, some_size); + * cnrn_target_copyin(some_ptr, some_size); * // ... do some work ... * // cnrn_target_delete(some_ptr); * free(some_ptr); * } * { * // ... same_ptr_again is dynamically allocated at the same address ... 
- * cnrn_gpu_copyin(same_ptr_again, some_other_size); // ERROR + * cnrn_target_copyin(same_ptr_again, some_other_size); // ERROR * } * @endcode * the application will/may abort with an error such as: @@ -1070,73 +1041,73 @@ void delete_nrnthreads_on_device(NrnThread* threads, int nthreads) { if (tr) { if (tr->varrays) { for (int i = 0; i < tr->n_trajec; ++i) { - cnrn_target_delete(tr->varrays[i], tr->bsize * sizeof(double)); + cnrn_target_delete(tr->varrays[i], tr->bsize); } - cnrn_target_delete(tr->varrays, sizeof(double*) * tr->n_trajec); + cnrn_target_delete(tr->varrays, tr->n_trajec); } - cnrn_target_delete(tr->gather, sizeof(double*) * tr->n_trajec); - cnrn_target_delete(tr, sizeof(TrajectoryRequests)); + cnrn_target_delete(tr->gather, tr->n_trajec); + cnrn_target_delete(tr); } } if (nt->_permute) { if (interleave_permute_type == 1) { InterleaveInfo* info = interleave_info + i; - cnrn_target_delete(info->cellsize, sizeof(int) * nt->ncell); - cnrn_target_delete(info->lastnode, sizeof(int) * nt->ncell); - cnrn_target_delete(info->firstnode, sizeof(int) * nt->ncell); - cnrn_target_delete(info->stride, sizeof(int) * (info->nstride + 1)); - cnrn_target_delete(info, sizeof(InterleaveInfo)); + cnrn_target_delete(info->cellsize, nt->ncell); + cnrn_target_delete(info->lastnode, nt->ncell); + cnrn_target_delete(info->firstnode, nt->ncell); + cnrn_target_delete(info->stride, info->nstride + 1); + cnrn_target_delete(info); } else if (interleave_permute_type == 2) { InterleaveInfo* info = interleave_info + i; - cnrn_target_delete(info->cellsize, sizeof(int) * info->nwarp); - cnrn_target_delete(info->stridedispl, sizeof(int) * (info->nwarp + 1)); - cnrn_target_delete(info->lastnode, sizeof(int) * (info->nwarp + 1)); - cnrn_target_delete(info->firstnode, sizeof(int) * (info->nwarp + 1)); - cnrn_target_delete(info->stride, sizeof(int) * info->nstride); - cnrn_target_delete(info, sizeof(InterleaveInfo)); + cnrn_target_delete(info->cellsize, info->nwarp); + cnrn_target_delete(info->stridedispl, info->nwarp + 1); + cnrn_target_delete(info->lastnode, info->nwarp + 1); + cnrn_target_delete(info->firstnode, info->nwarp + 1); + cnrn_target_delete(info->stride, info->nstride); + cnrn_target_delete(info); } } if (nt->n_vecplay) { nrn_VecPlay_delete_from_device(nt); - cnrn_target_delete(nt->_vecplay, sizeof(void*) * nt->n_vecplay); + cnrn_target_delete(nt->_vecplay, nt->n_vecplay); } // Cleanup send_receive buffer. if (nt->_net_send_buffer_size) { - cnrn_target_delete(nt->_net_send_buffer, sizeof(int) * nt->_net_send_buffer_size); + cnrn_target_delete(nt->_net_send_buffer, nt->_net_send_buffer_size); } if (nt->n_presyn) { - cnrn_target_delete(nt->presyns, sizeof(PreSyn) * nt->n_presyn); - cnrn_target_delete(nt->presyns_helper, sizeof(PreSynHelper) * nt->n_presyn); + cnrn_target_delete(nt->presyns, nt->n_presyn); + cnrn_target_delete(nt->presyns_helper, nt->n_presyn); } // Cleanup data that's setup in bbcore_read. 
if (nt->_nvdata) { - cnrn_target_delete(nt->_vdata, sizeof(void*) * nt->_nvdata); + cnrn_target_delete(nt->_vdata, nt->_nvdata); } // Cleanup weight vector used in NET_RECEIVE if (nt->n_weight) { - cnrn_target_delete(nt->weights, sizeof(double) * nt->n_weight); + cnrn_target_delete(nt->weights, nt->n_weight); } // Cleanup point processes if (nt->n_pntproc) { - cnrn_target_delete(nt->pntprocs, nt->n_pntproc * sizeof(Point_process)); + cnrn_target_delete(nt->pntprocs, nt->n_pntproc); } if (nt->nrn_fast_imem) { - cnrn_target_delete(nt->nrn_fast_imem->nrn_sav_d, nt->end * sizeof(double)); - cnrn_target_delete(nt->nrn_fast_imem->nrn_sav_rhs, nt->end * sizeof(double)); - cnrn_target_delete(nt->nrn_fast_imem, sizeof(NrnFastImem)); + cnrn_target_delete(nt->nrn_fast_imem->nrn_sav_d, nt->end); + cnrn_target_delete(nt->nrn_fast_imem->nrn_sav_rhs, nt->end); + cnrn_target_delete(nt->nrn_fast_imem); } if (nt->shadow_rhs_cnt) { int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); - cnrn_target_delete(nt->_shadow_d, pcnt * sizeof(double)); - cnrn_target_delete(nt->_shadow_rhs, pcnt * sizeof(double)); + cnrn_target_delete(nt->_shadow_d, pcnt); + cnrn_target_delete(nt->_shadow_rhs, pcnt); } for (auto tml = nt->tml; tml; tml = tml->next) { @@ -1144,26 +1115,26 @@ void delete_nrnthreads_on_device(NrnThread* threads, int nthreads) { { NetSendBuffer_t* nsb{tml->ml->_net_send_buffer}; if (nsb) { - cnrn_target_delete(nsb->_nsb_flag, sizeof(double) * nsb->_size); - cnrn_target_delete(nsb->_nsb_t, sizeof(double) * nsb->_size); - cnrn_target_delete(nsb->_weight_index, sizeof(int) * nsb->_size); - cnrn_target_delete(nsb->_pnt_index, sizeof(int) * nsb->_size); - cnrn_target_delete(nsb->_vdata_index, sizeof(int) * nsb->_size); - cnrn_target_delete(nsb->_sendtype, sizeof(int) * nsb->_size); - cnrn_target_delete(nsb, sizeof(NetSendBuffer_t)); + cnrn_target_delete(nsb->_nsb_flag, nsb->_size); + cnrn_target_delete(nsb->_nsb_t, nsb->_size); + cnrn_target_delete(nsb->_weight_index, nsb->_size); + cnrn_target_delete(nsb->_pnt_index, nsb->_size); + cnrn_target_delete(nsb->_vdata_index, nsb->_size); + cnrn_target_delete(nsb->_sendtype, nsb->_size); + cnrn_target_delete(nsb); } } // Cleanup the net receive buffer if it exists. 
{ NetReceiveBuffer_t* nrb{tml->ml->_net_receive_buffer}; if (nrb) { - cnrn_target_delete(nrb->_nrb_index, sizeof(int) * nrb->_size); - cnrn_target_delete(nrb->_displ, sizeof(int) * (nrb->_size + 1)); - cnrn_target_delete(nrb->_nrb_flag, sizeof(double) * nrb->_size); - cnrn_target_delete(nrb->_nrb_t, sizeof(double) * nrb->_size); - cnrn_target_delete(nrb->_weight_index, sizeof(int) * nrb->_size); - cnrn_target_delete(nrb->_pnt_index, sizeof(int) * nrb->_size); - cnrn_target_delete(nrb, sizeof(NetReceiveBuffer_t)); + cnrn_target_delete(nrb->_nrb_index, nrb->_size); + cnrn_target_delete(nrb->_displ, nrb->_size + 1); + cnrn_target_delete(nrb->_nrb_flag, nrb->_size); + cnrn_target_delete(nrb->_nrb_t, nrb->_size); + cnrn_target_delete(nrb->_weight_index, nrb->_size); + cnrn_target_delete(nrb->_pnt_index, nrb->_size); + cnrn_target_delete(nrb); } } int type = tml->index; @@ -1172,23 +1143,23 @@ void delete_nrnthreads_on_device(NrnThread* threads, int nthreads) { int is_art = corenrn.get_is_artificial()[type]; int ts = corenrn.get_memb_funcs()[type].thread_size_; if (ts) { - cnrn_target_delete(tml->ml->_thread, ts * sizeof(ThreadDatum)); + cnrn_target_delete(tml->ml->_thread, ts); } if (szdp) { int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; - cnrn_target_delete(tml->ml->pdata, sizeof(int) * pcnt); + cnrn_target_delete(tml->ml->pdata, pcnt); } if (!is_art) { - cnrn_target_delete(tml->ml->nodeindices, sizeof(int) * n); + cnrn_target_delete(tml->ml->nodeindices, n); } - cnrn_target_delete(tml->ml, sizeof(Memb_list)); - cnrn_target_delete(tml, sizeof(NrnThreadMembList)); + cnrn_target_delete(tml->ml); + cnrn_target_delete(tml); } - cnrn_target_delete(nt->_ml_list, corenrn.get_memb_funcs().size() * sizeof(Memb_list*)); - cnrn_target_delete(nt->_v_parent_index, nt->end * sizeof(int)); - cnrn_target_delete(nt->_data, nt->_ndata * sizeof(double)); + cnrn_target_delete(nt->_ml_list, corenrn.get_memb_funcs().size()); + cnrn_target_delete(nt->_v_parent_index, nt->end); + cnrn_target_delete(nt->_data, nt->_ndata); } - cnrn_target_delete(threads, sizeof(NrnThread) * nthreads); + cnrn_target_delete(threads, nthreads); nrn_ion_global_map_delete_from_device(); #endif } @@ -1204,34 +1175,34 @@ void nrn_newtonspace_copyto_device(NewtonSpace* ns) { int n = ns->n * ns->n_instance; // actually, the values of double do not matter, only the pointers. 
- NewtonSpace* d_ns = (NewtonSpace*) cnrn_gpu_copyin(ns, sizeof(NewtonSpace)); + NewtonSpace* d_ns = cnrn_target_copyin(ns); double* pd; - pd = (double*) cnrn_gpu_copyin(ns->delta_x, n * sizeof(double)); - cnrn_memcpy_to_device(&(d_ns->delta_x), &pd, sizeof(double*)); + pd = cnrn_target_copyin(ns->delta_x, n); + cnrn_target_memcpy_to_device(&(d_ns->delta_x), &pd); - pd = (double*) cnrn_gpu_copyin(ns->high_value, n * sizeof(double)); - cnrn_memcpy_to_device(&(d_ns->high_value), &pd, sizeof(double*)); + pd = cnrn_target_copyin(ns->high_value, n); + cnrn_target_memcpy_to_device(&(d_ns->high_value), &pd); - pd = (double*) cnrn_gpu_copyin(ns->low_value, n * sizeof(double)); - cnrn_memcpy_to_device(&(d_ns->low_value), &pd, sizeof(double*)); + pd = cnrn_target_copyin(ns->low_value, n); + cnrn_target_memcpy_to_device(&(d_ns->low_value), &pd); - pd = (double*) cnrn_gpu_copyin(ns->rowmax, n * sizeof(double)); - cnrn_memcpy_to_device(&(d_ns->rowmax), &pd, sizeof(double*)); + pd = cnrn_target_copyin(ns->rowmax, n); + cnrn_target_memcpy_to_device(&(d_ns->rowmax), &pd); - auto pint = (int*) cnrn_gpu_copyin(ns->perm, n * sizeof(int)); - cnrn_memcpy_to_device(&(d_ns->perm), &pint, sizeof(int*)); + auto pint = cnrn_target_copyin(ns->perm, n); + cnrn_target_memcpy_to_device(&(d_ns->perm), &pint); - auto ppd = (double**) cnrn_gpu_copyin(ns->jacobian, ns->n * sizeof(double*)); - cnrn_memcpy_to_device(&(d_ns->jacobian), &ppd, sizeof(double**)); + auto ppd = cnrn_target_copyin(ns->jacobian, ns->n); + cnrn_target_memcpy_to_device(&(d_ns->jacobian), &ppd); // the actual jacobian doubles were allocated as a single array - double* d_jacdat = (double*) cnrn_gpu_copyin(ns->jacobian[0], ns->n * n * sizeof(double)); + double* d_jacdat = cnrn_target_copyin(ns->jacobian[0], ns->n * n); for (int i = 0; i < ns->n; ++i) { pd = d_jacdat + i * n; - cnrn_memcpy_to_device(&(ppd[i]), &pd, sizeof(double*)); + cnrn_target_memcpy_to_device(&(ppd[i]), &pd); } #endif } @@ -1244,14 +1215,14 @@ void nrn_newtonspace_delete_from_device(NewtonSpace* ns) { return; } int n = ns->n * ns->n_instance; - cnrn_target_delete(ns->jacobian[0], ns->n * n * sizeof(double)); - cnrn_target_delete(ns->jacobian, ns->n * sizeof(double*)); - cnrn_target_delete(ns->perm, n * sizeof(int)); - cnrn_target_delete(ns->rowmax, n * sizeof(double)); - cnrn_target_delete(ns->low_value, n * sizeof(double)); - cnrn_target_delete(ns->high_value, n * sizeof(double)); - cnrn_target_delete(ns->delta_x, n * sizeof(double)); - cnrn_target_delete(ns, sizeof(NewtonSpace)); + cnrn_target_delete(ns->jacobian[0], ns->n * n); + cnrn_target_delete(ns->jacobian, ns->n); + cnrn_target_delete(ns->perm, n); + cnrn_target_delete(ns->rowmax, n); + cnrn_target_delete(ns->low_value, n); + cnrn_target_delete(ns->high_value, n); + cnrn_target_delete(ns->delta_x, n); + cnrn_target_delete(ns); #endif } @@ -1264,76 +1235,76 @@ void nrn_sparseobj_copyto_device(SparseObj* so) { } unsigned n1 = so->neqn + 1; - SparseObj* d_so = (SparseObj*) cnrn_gpu_copyin(so, sizeof(SparseObj)); + SparseObj* d_so = cnrn_target_copyin(so); // only pointer fields in SparseObj that need setting up are // rowst, diag, rhs, ngetcall, coef_list // only pointer fields in Elm that need setting up are // r_down, c_right, value // do not care about the Elm* ptr value, just the space. 
- Elm** d_rowst = (Elm**) cnrn_gpu_copyin(so->rowst, n1 * sizeof(Elm*)); - cnrn_memcpy_to_device(&(d_so->rowst), &d_rowst, sizeof(Elm**)); + Elm** d_rowst = cnrn_target_copyin(so->rowst, n1); + cnrn_target_memcpy_to_device(&(d_so->rowst), &d_rowst); - Elm** d_diag = (Elm**) cnrn_gpu_copyin(so->diag, n1 * sizeof(Elm*)); - cnrn_memcpy_to_device(&(d_so->diag), &d_diag, sizeof(Elm**)); + Elm** d_diag = cnrn_target_copyin(so->diag, n1); + cnrn_target_memcpy_to_device(&(d_so->diag), &d_diag); - auto pu = (unsigned*) cnrn_gpu_copyin(so->ngetcall, so->_cntml_padded * sizeof(unsigned)); - cnrn_memcpy_to_device(&(d_so->ngetcall), &pu, sizeof(Elm**)); + unsigned* pu = cnrn_target_copyin(so->ngetcall, so->_cntml_padded); + cnrn_target_memcpy_to_device(&(d_so->ngetcall), &pu); - auto pd = (double*) cnrn_gpu_copyin(so->rhs, n1 * so->_cntml_padded * sizeof(double)); - cnrn_memcpy_to_device(&(d_so->rhs), &pd, sizeof(double*)); + double* pd = cnrn_target_copyin(so->rhs, n1 * so->_cntml_padded); + cnrn_target_memcpy_to_device(&(d_so->rhs), &pd); - auto d_coef_list = (double**) cnrn_gpu_copyin(so->coef_list, so->coef_list_size * sizeof(double*)); - cnrn_memcpy_to_device(&(d_so->coef_list), &d_coef_list, sizeof(double**)); + double** d_coef_list = cnrn_target_copyin(so->coef_list, so->coef_list_size); + cnrn_target_memcpy_to_device(&(d_so->coef_list), &d_coef_list); // Fill in relevant Elm pointer values for (unsigned irow = 1; irow < n1; ++irow) { for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) { - Elm* pelm = (Elm*) cnrn_gpu_copyin(elm, sizeof(Elm)); + Elm* pelm = cnrn_target_copyin(elm); if (elm == so->rowst[irow]) { - cnrn_memcpy_to_device(&(d_rowst[irow]), &pelm, sizeof(Elm*)); + cnrn_target_memcpy_to_device(&(d_rowst[irow]), &pelm); } else { - Elm* d_e = (Elm*) cnrn_target_deviceptr(elm->c_left); - cnrn_memcpy_to_device(&(pelm->c_left), &d_e, sizeof(Elm*)); + Elm* d_e = cnrn_target_deviceptr(elm->c_left); + cnrn_target_memcpy_to_device(&(pelm->c_left), &d_e); } if (elm->col == elm->row) { - cnrn_memcpy_to_device(&(d_diag[irow]), &pelm, sizeof(Elm*)); + cnrn_target_memcpy_to_device(&(d_diag[irow]), &pelm); } if (irow > 1) { if (elm->r_up) { - Elm* d_e = (Elm*) cnrn_target_deviceptr(elm->r_up); - cnrn_memcpy_to_device(&(pelm->r_up), &d_e, sizeof(Elm*)); + Elm* d_e = cnrn_target_deviceptr(elm->r_up); + cnrn_target_memcpy_to_device(&(pelm->r_up), &d_e); } } - pd = (double*) cnrn_gpu_copyin(elm->value, so->_cntml_padded * sizeof(double)); - cnrn_memcpy_to_device(&(pelm->value), &pd, sizeof(double*)); + pd = cnrn_target_copyin(elm->value, so->_cntml_padded); + cnrn_target_memcpy_to_device(&(pelm->value), &pd); } } // visit all the Elm again and fill in pelm->r_down and pelm->c_left for (unsigned irow = 1; irow < n1; ++irow) { for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) { - auto pelm = (Elm*) cnrn_target_deviceptr(elm); + auto pelm = cnrn_target_deviceptr(elm); if (elm->r_down) { - auto d_e = (Elm*) cnrn_target_deviceptr(elm->r_down); - cnrn_memcpy_to_device(&(pelm->r_down), &d_e, sizeof(Elm*)); + auto d_e = cnrn_target_deviceptr(elm->r_down); + cnrn_target_memcpy_to_device(&(pelm->r_down), &d_e); } if (elm->c_right) { - auto d_e = (Elm*) cnrn_target_deviceptr(elm->c_right); - cnrn_memcpy_to_device(&(pelm->c_right), &d_e, sizeof(Elm*)); + auto d_e = cnrn_target_deviceptr(elm->c_right); + cnrn_target_memcpy_to_device(&(pelm->c_right), &d_e); } } } // Fill in the d_so->coef_list for (unsigned i = 0; i < so->coef_list_size; ++i) { - pd = (double*) cnrn_target_deviceptr(so->coef_list[i]); - 
cnrn_memcpy_to_device(&(d_coef_list[i]), &pd, sizeof(double*)); + pd = cnrn_target_deviceptr(so->coef_list[i]); + cnrn_target_memcpy_to_device(&(d_coef_list[i]), &pd); } #endif } @@ -1348,16 +1319,16 @@ void nrn_sparseobj_delete_from_device(SparseObj* so) { unsigned n1 = so->neqn + 1; for (unsigned irow = 1; irow < n1; ++irow) { for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) { - cnrn_target_delete(elm->value, so->_cntml_padded * sizeof(double)); - cnrn_target_delete(elm, sizeof(Elm)); + cnrn_target_delete(elm->value, so->_cntml_padded); + cnrn_target_delete(elm); } } - cnrn_target_delete(so->coef_list, so->coef_list_size * sizeof(double*)); - cnrn_target_delete(so->rhs, n1 * so->_cntml_padded * sizeof(double)); - cnrn_target_delete(so->ngetcall, so->_cntml_padded * sizeof(unsigned)); - cnrn_target_delete(so->diag, n1 * sizeof(Elm*)); - cnrn_target_delete(so->rowst, n1 * sizeof(Elm*)); - cnrn_target_delete(so, sizeof(SparseObj)); + cnrn_target_delete(so->coef_list, so->coef_list_size); + cnrn_target_delete(so->rhs, n1 * so->_cntml_padded); + cnrn_target_delete(so->ngetcall, so->_cntml_padded); + cnrn_target_delete(so->diag, n1); + cnrn_target_delete(so->rowst, n1); + cnrn_target_delete(so); #endif } @@ -1365,14 +1336,13 @@ void nrn_sparseobj_delete_from_device(SparseObj* so) { void nrn_ion_global_map_copyto_device() { if (nrn_ion_global_map_size) { - double** d_data = (double**) cnrn_gpu_copyin(nrn_ion_global_map, - sizeof(double*) * nrn_ion_global_map_size); + double** d_data = cnrn_target_copyin(nrn_ion_global_map, + nrn_ion_global_map_size); for (int j = 0; j < nrn_ion_global_map_size; j++) { if (nrn_ion_global_map[j]) { - double* d_mechmap = (double*) cnrn_gpu_copyin(nrn_ion_global_map[j], - ion_global_map_member_size * - sizeof(double)); - cnrn_memcpy_to_device(&(d_data[j]), &d_mechmap, sizeof(double*)); + double* d_mechmap = cnrn_target_copyin(nrn_ion_global_map[j], + ion_global_map_member_size); + cnrn_target_memcpy_to_device(&(d_data[j]), &d_mechmap); } } } @@ -1381,11 +1351,11 @@ void nrn_ion_global_map_copyto_device() { void nrn_ion_global_map_delete_from_device() { for (int j = 0; j < nrn_ion_global_map_size; j++) { if (nrn_ion_global_map[j]) { - cnrn_target_delete(nrn_ion_global_map[j], ion_global_map_member_size * sizeof(double)); + cnrn_target_delete(nrn_ion_global_map[j], ion_global_map_member_size); } } if (nrn_ion_global_map_size) { - cnrn_target_delete(nrn_ion_global_map, sizeof(double*) * nrn_ion_global_map_size); + cnrn_target_delete(nrn_ion_global_map, nrn_ion_global_map_size); } } @@ -1439,43 +1409,44 @@ void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay) { VecPlayContinuous* vecplay_instance = (VecPlayContinuous*) nt->_vecplay[i]; /** just VecPlayContinuous object */ - void* d_p = (void*) cnrn_gpu_copyin(vecplay_instance, sizeof(VecPlayContinuous)); - cnrn_memcpy_to_device(&(d_vecplay[i]), &d_p, sizeof(void*)); - - VecPlayContinuous* d_vecplay_instance = (VecPlayContinuous*) d_p; + VecPlayContinuous* d_vecplay_instance = cnrn_target_copyin(vecplay_instance); + cnrn_target_memcpy_to_device((VecPlayContinuous**) (&(d_vecplay[i])), &d_vecplay_instance); /** copy y_, t_ and discon_indices_ */ copy_ivoc_vect_to_device(vecplay_instance->y_, d_vecplay_instance->y_); copy_ivoc_vect_to_device(vecplay_instance->t_, d_vecplay_instance->t_); + // OL211213: beware, the test suite does not currently include anything + // with a non-null discon_indices_. 
if (vecplay_instance->discon_indices_) { + IvocVect* d_discon_indices = cnrn_target_copyin(vecplay_instance->discon_indices_); + cnrn_target_memcpy_to_device(&(d_vecplay_instance->discon_indices_), &d_discon_indices); copy_ivoc_vect_to_device(*(vecplay_instance->discon_indices_), - *(d_vecplay_instance->discon_indices_), - true); + *(d_vecplay_instance->discon_indices_)); } /** copy PlayRecordEvent : todo: verify this */ - PlayRecordEvent* d_e_ = (PlayRecordEvent*) cnrn_gpu_copyin(vecplay_instance->e_, - sizeof(PlayRecordEvent)); - cnrn_memcpy_to_device(&(d_e_->plr_), &d_vecplay_instance, sizeof(VecPlayContinuous*)); - cnrn_memcpy_to_device(&(d_vecplay_instance->e_), &d_e_, sizeof(PlayRecordEvent*)); + PlayRecordEvent* d_e_ = cnrn_target_copyin(vecplay_instance->e_); + + cnrn_target_memcpy_to_device(&(d_e_->plr_), (PlayRecord**) (&d_vecplay_instance)); + cnrn_target_memcpy_to_device(&(d_vecplay_instance->e_), &d_e_); /** copy pd_ : note that it's pointer inside ml->data and hence data itself is * already on GPU */ - double* d_pd_ = (double*) cnrn_target_deviceptr(vecplay_instance->pd_); - cnrn_memcpy_to_device(&(d_vecplay_instance->pd_), &d_pd_, sizeof(double*)); + double* d_pd_ = cnrn_target_deviceptr(vecplay_instance->pd_); + cnrn_target_memcpy_to_device(&(d_vecplay_instance->pd_), &d_pd_); } } void nrn_VecPlay_delete_from_device(NrnThread* nt) { for (int i = 0; i < nt->n_vecplay; i++) { auto* vecplay_instance = reinterpret_cast(nt->_vecplay[i]); - cnrn_target_delete(vecplay_instance->e_, sizeof(PlayRecordEvent)); + cnrn_target_delete(vecplay_instance->e_); if (vecplay_instance->discon_indices_) { delete_ivoc_vect_from_device(*(vecplay_instance->discon_indices_)); } delete_ivoc_vect_from_device(vecplay_instance->t_); delete_ivoc_vect_from_device(vecplay_instance->y_); - cnrn_target_delete(vecplay_instance, sizeof(VecPlayContinuous)); + cnrn_target_delete(vecplay_instance); } } From 78081b435ba165fb5e8ed58adaa2bc30d65a334b Mon Sep 17 00:00:00 2001 From: Nicolas Cornu Date: Tue, 14 Dec 2021 08:48:12 +0100 Subject: [PATCH 15/31] Remove unused GPU code (#711) We prefer selective host-to-device updates. 
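
Concretely, the selective path relies on the offload macros already in the tree; the sketch below is illustrative only (the wrapper function name and its placement are not part of this patch), assuming coreneuron/utils/offload.hpp and the NrnThread definition are in scope:

    // Hypothetical helper: push one host array back to the device on demand,
    // instead of re-copying every NrnThread member in bulk as the removed
    // update_nrnthreads_on_device() did.
    void update_weights_on_device(NrnThread* nt) {
        if (nt->compute_gpu && nt->n_weight) {
            nrn_pragma_acc(update device(nt->weights[:nt->n_weight]))
            nrn_pragma_omp(target update to(nt->weights[:nt->n_weight]))
        }
    }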
--- coreneuron/gpu/nrn_acc_manager.cpp | 130 ----------------------------- coreneuron/gpu/nrn_acc_manager.hpp | 6 +- 2 files changed, 3 insertions(+), 133 deletions(-) diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp index 4fe0004fd..e7bd09817 100644 --- a/coreneuron/gpu/nrn_acc_manager.cpp +++ b/coreneuron/gpu/nrn_acc_manager.cpp @@ -858,136 +858,6 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { #endif } -void update_nrnthreads_on_device(NrnThread* threads, int nthreads) { -#ifdef _OPENACC - - for (int i = 0; i < nthreads; i++) { - NrnThread* nt = threads + i; - - if (nt->compute_gpu && (nt->end > 0)) { - /* -- copy data to device -- */ - - int ne = nrn_soa_padded_size(nt->end, 0); - - nrn_pragma_acc(update device( - nt->_actual_rhs[:ne], - nt->_actual_d[:ne], - nt->_actual_a[:ne], - nt->_actual_b[:ne], - nt->_actual_v[:ne], - nt->_actual_area[:ne])) - nrn_pragma_omp(target update to( - nt->_actual_rhs[:ne], - nt->_actual_d[:ne], - nt->_actual_a[:ne], - nt->_actual_b[:ne], - nt->_actual_v[:ne], - nt->_actual_area[:ne])) - - nrn_pragma_acc(update device(nt->_actual_diam[:ne]) if (nt->_actual_diam != nullptr)) - nrn_pragma_omp(target update to(nt->_actual_diam[:ne]) if (nt->_actual_diam != nullptr)) - - /* @todo: nt._ml_list[tml->index] = tml->ml; */ - - /* -- copy NrnThreadMembList list ml to host -- */ - for (auto tml = nt->tml; tml; tml = tml->next) { - Memb_list* ml = tml->ml; - int type = tml->index; - int n = ml->nodecount; - int szp = corenrn.get_prop_param_size()[type]; - int szdp = corenrn.get_prop_dparam_size()[type]; - - int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szp; - - nrn_pragma_acc(update device(ml->data[:pcnt])) - nrn_pragma_omp(target update to(ml->data[:pcnt])) - - nrn_pragma_acc(update device(ml->nodeindices[:n]) - if (!corenrn.get_is_artificial()[type])) - nrn_pragma_omp(target update to(ml->nodeindices[:n]) - if (!corenrn.get_is_artificial()[type])) - int dpcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; - nrn_pragma_acc(update device(ml->pdata[:dpcnt]) if (szdp)) - nrn_pragma_omp(target update to(ml->pdata[:dpcnt]) if (szdp)) - - auto nrb = tml->ml->_net_receive_buffer; - nrn_pragma_acc(update device(nrb->_cnt, - nrb->_size, - nrb->_pnt_offset, - nrb->_displ_cnt, - nrb->_pnt_index[:nrb->_size], - nrb->_weight_index[:nrb->_size], - nrb->_displ[:nrb->_size], - nrb->_nrb_index[:nrb->_size]) - if (nrb != nullptr)) - nrn_pragma_omp(target update to(nrb->_cnt, - nrb->_size, - nrb->_pnt_offset, - nrb->_displ_cnt, - nrb->_pnt_index[:nrb->_size], - nrb->_weight_index[:nrb->_size], - nrb->_displ[:nrb->_size], - nrb->_nrb_index[:nrb->_size]) - if (nrb != nullptr)) - } - int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); - /* copy shadow_rhs to host */ - nrn_pragma_acc(update device(nt->_shadow_rhs[:pcnt], - /* copy shadow_d to host */ - nt->_shadow_d[:pcnt]) - if (nt->shadow_rhs_cnt)) - nrn_pragma_omp(target update to(nt->_shadow_rhs[:pcnt], - /* copy shadow_d to host */ - nt->_shadow_d[:pcnt]) - if (nt->shadow_rhs_cnt)) - - - nrn_pragma_acc(update device(nt->nrn_fast_imem->nrn_sav_rhs[:nt->end], - nt->nrn_fast_imem->nrn_sav_d[:nt->end]) - if (nt->nrn_fast_imem != nullptr)) - nrn_pragma_omp(target update to(nt->nrn_fast_imem->nrn_sav_rhs[:nt->end], - nt->nrn_fast_imem->nrn_sav_d[:nt->end]) - if (nt->nrn_fast_imem != nullptr)) - - nrn_pragma_acc(update device(nt->pntprocs[:nt->n_pntproc]) - if (nt->n_pntproc)) - nrn_pragma_omp(target update to(nt->pntprocs[:nt->n_pntproc]) - if (nt->n_pntproc)) - - 
nrn_pragma_acc(update device(nt->weights[:nt->n_weight]) if (nt->n_weight)) - nrn_pragma_omp(target update to(nt->weights[:nt->n_weight]) if (nt->n_weight)) - - nrn_pragma_acc(update device(nt->presyns_helper[:nt->n_presyn], - nt->presyns[:nt->n_presyn]) - if (nt->n_presyn)) - nrn_pragma_omp(target update to(nt->presyns_helper[:nt->n_presyn], - nt->presyns[:nt->n_presyn]) - if (nt->n_presyn)) - - { - TrajectoryRequests* tr = nt->trajec_requests; - if (tr && tr->varrays) { - // The full buffers have `bsize` entries, but only `vsize` - // of them are valid. - for (int i = 0; i < tr->n_trajec; ++i) { - nrn_pragma_acc(update device(tr->varrays[i][:tr->vsize])) - nrn_pragma_omp(target update to(tr->varrays[i][:tr->vsize])) - } - } - } - - /* don't and don't update vdata, its pointer array - nrn_pragma_acc(update device(nt->_vdata[:nt->_nvdata) if nt->_nvdata) - nrn_pragma_omp(target update tp(nt->_vdata[:nt->_nvdata) if (nt->_nvdata)) - */ - } - } -#else - (void) threads; - (void) nthreads; -#endif -} - /** * Copy weights from GPU to CPU * diff --git a/coreneuron/gpu/nrn_acc_manager.hpp b/coreneuron/gpu/nrn_acc_manager.hpp index 354bdc208..1334369e7 100644 --- a/coreneuron/gpu/nrn_acc_manager.hpp +++ b/coreneuron/gpu/nrn_acc_manager.hpp @@ -19,13 +19,13 @@ namespace coreneuron { void setup_nrnthreads_on_device(NrnThread* threads, int nthreads); void delete_nrnthreads_on_device(NrnThread* threads, int nthreads); void update_nrnthreads_on_host(NrnThread* threads, int nthreads); -void update_nrnthreads_on_device(NrnThread* threads, int nthreads); -void modify_data_on_device(NrnThread* threads, int nthreads); -void dump_nt_to_file(char* filename, NrnThread* threads, int nthreads); void update_net_receive_buffer(NrnThread* _nt); + +// Called by NModl void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml); void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb); + void update_weights_from_gpu(NrnThread* threads, int nthreads); void init_gpu(); From 781d34f615c2ac9cbc5f1bc05b87db2e334bb78f Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Tue, 14 Dec 2021 10:31:57 +0100 Subject: [PATCH 16/31] Fixes and improvements from LLVM/XLC work. (#716) Code fixes for XLC and Clang execution without build system changes. This mainly adds missing OpenMP pragmas and makes cnrn_target_ wrappers visible to NMODL. 
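
The recurring change follows one pattern: declarations that NMODL-generated device code must be able to call are bracketed with the offload macros so that both the OpenACC and the OpenMP offload builds emit device-side versions. A minimal sketch, with a hypothetical function name standing in for the generated kernels:

    nrn_pragma_omp(declare target)
    #pragma acc routine seq
    extern int example_deriv_kernel(_threadargsproto_); // hypothetical; real names are emitted by kinderiv.py
    nrn_pragma_omp(end declare target)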
--- CMake/MakefileBuildOptions.cmake | 1 + coreneuron/gpu/nrn_acc_manager.cpp | 55 ++--------------------- coreneuron/kinderiv.py | 6 +++ coreneuron/mechanism/eion.cpp | 2 + coreneuron/mechanism/mech/dimplic.cpp | 2 + coreneuron/mechanism/register_mech.cpp | 2 + coreneuron/network/cvodestb.cpp | 2 + coreneuron/network/netcvode.cpp | 2 +- coreneuron/sim/scopmath/crout_thread.cpp | 2 + coreneuron/sim/scopmath/newton_thread.cpp | 2 + coreneuron/sim/treeset_core.cpp | 12 ++--- coreneuron/utils/offload.hpp | 53 ++++++++++++++++++++++ extra/nrnivmodl_core_makefile.in | 4 +- 13 files changed, 85 insertions(+), 60 deletions(-) diff --git a/CMake/MakefileBuildOptions.cmake b/CMake/MakefileBuildOptions.cmake index fc0b0b551..009dd3215 100644 --- a/CMake/MakefileBuildOptions.cmake +++ b/CMake/MakefileBuildOptions.cmake @@ -75,6 +75,7 @@ string(TOUPPER "${CMAKE_BUILD_TYPE}" _BUILD_TYPE) set(CORENRN_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${_BUILD_TYPE}} ${CXX14_STD_FLAGS} ${NVHPC_ACC_COMP_FLAGS} ${NVHPC_CXX_INLINE_FLAGS}" ) +set(CORENRN_LD_FLAGS "${NVHPC_ACC_LINK_FLAGS}") # ============================================================================= # nmodl/mod2c related options : TODO diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp index e7bd09817..9bd635d77 100644 --- a/coreneuron/gpu/nrn_acc_manager.cpp +++ b/coreneuron/gpu/nrn_acc_manager.cpp @@ -24,16 +24,14 @@ #include "coreneuron/mpi/nrnmpidec.h" #include "coreneuron/utils/utils.hpp" +#ifdef CRAYPAT +#include +#endif + #ifdef _OPENACC #include #endif -#ifdef CORENEURON_PREFER_OPENMP_OFFLOAD -#include -#endif -#ifdef CRAYPAT -#include -#endif namespace coreneuron { extern InterleaveInfo* interleave_info; void copy_ivoc_vect_to_device(const IvocVect& iv, IvocVect& div); @@ -43,51 +41,6 @@ void nrn_ion_global_map_delete_from_device(); void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay); void nrn_VecPlay_delete_from_device(NrnThread* nt); -template -T* cnrn_target_deviceptr(const T* h_ptr) { -#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC) - return static_cast(acc_deviceptr(const_cast(h_ptr))); -#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) - return static_cast(omp_get_mapped_ptr(const_cast(h_ptr), omp_get_default_device())); -#else - throw std::runtime_error("cnrn_target_deviceptr() not implemented without OpenACC/OpenMP and gpu build"); -#endif -} - -template -T* cnrn_target_copyin(const T* h_ptr, std::size_t len = 1) { -#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC) - return static_cast(acc_copyin(const_cast(h_ptr), len * sizeof(T))); -#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) - #pragma omp target enter data map(to:h_ptr[:len]) - return cnrn_target_deviceptr(const_cast(h_ptr)); -#else - throw std::runtime_error("cnrn_target_copyin() not implemented without OpenACC/OpenMP and gpu build"); -#endif -} - -template -void cnrn_target_delete(T* h_ptr, std::size_t len = 1) { -#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC) - acc_delete(h_ptr, len * sizeof(T)); -#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) - #pragma omp target exit data map(delete: h_ptr[:len]) -#else - throw std::runtime_error("cnrn_target_delete() not implemented without OpenACC/OpenMP and gpu 
build"); -#endif -} - -template -void cnrn_target_memcpy_to_device(T* d_ptr, const T* h_ptr, std::size_t len = 1) { -#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC) - acc_memcpy_to_device(d_ptr, const_cast(h_ptr), len * sizeof(T)); -#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) - omp_target_memcpy(d_ptr, const_cast(h_ptr), len* sizeof(T), 0, 0, omp_get_default_device(), omp_get_initial_device()); -#else - throw std::runtime_error("cnrn_target_memcpy_to_device() not implemented without OpenACC/OpenMP and gpu build"); -#endif -} - /* note: threads here are corresponding to global nrn_threads array */ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { #ifdef _OPENACC diff --git a/coreneuron/kinderiv.py b/coreneuron/kinderiv.py index 35158908c..9b143c0cf 100644 --- a/coreneuron/kinderiv.py +++ b/coreneuron/kinderiv.py @@ -59,6 +59,9 @@ def write_out_kinderiv(fout): fout.write("\n/* declarations */\n") fout.write("\nnamespace coreneuron {\n") + if deriv or kin or euler: + fout.write('nrn_pragma_omp(declare target)\n') + for item in deriv: fout.write('#pragma acc routine seq\n') fout.write('extern int %s%s(_threadargsproto_);\n' % (item[0], item[1])) @@ -73,6 +76,9 @@ def write_out_kinderiv(fout): fout.write('#pragma acc routine seq\n') fout.write('extern int %s%s(_threadargsproto_);\n' % (item[0], item[1])) + if deriv or kin or euler: + fout.write('nrn_pragma_omp(end declare target)\n') + fout.write("\n/* callback indices */\n") derivoffset = 1 kinoffset = 1 diff --git a/coreneuron/mechanism/eion.cpp b/coreneuron/mechanism/eion.cpp index 727f30ea6..6cb3cf83d 100644 --- a/coreneuron/mechanism/eion.cpp +++ b/coreneuron/mechanism/eion.cpp @@ -177,6 +177,7 @@ double nrn_nernst(double ci, double co, double z, double celsius) { } } +nrn_pragma_omp(declare target) void nrn_wrote_conc(int type, double* p1, int p2, @@ -193,6 +194,7 @@ void nrn_wrote_conc(int type, pe[0] = nrn_nernst(pe[1 * _STRIDE], pe[2 * _STRIDE], gimap[type][2], celsius); } } +nrn_pragma_omp(end declare target) static double efun(double x) { if (fabs(x) < 1e-4) { diff --git a/coreneuron/mechanism/mech/dimplic.cpp b/coreneuron/mechanism/mech/dimplic.cpp index e3b08207e..de8970560 100644 --- a/coreneuron/mechanism/mech/dimplic.cpp +++ b/coreneuron/mechanism/mech/dimplic.cpp @@ -24,6 +24,7 @@ #include "coreneuron/mechanism/mech/mod2c_core_thread.hpp" #include "_kinderiv.h" namespace coreneuron { +nrn_pragma_omp(declare target) int derivimplicit_thread(int n, int* slist, int* dlist, DIFUN fun, _threadargsproto_) { difun(fun); return 0; @@ -48,5 +49,6 @@ int nrn_kinetic_steer(int fun, SparseObj* so, double* rhs, _threadargsproto_) { switch (fun) { _NRN_KINETIC_CASES } return 0; } +nrn_pragma_omp(end declare target) } // namespace coreneuron diff --git a/coreneuron/mechanism/register_mech.cpp b/coreneuron/mechanism/register_mech.cpp index a8bff7a50..433140b82 100644 --- a/coreneuron/mechanism/register_mech.cpp +++ b/coreneuron/mechanism/register_mech.cpp @@ -19,7 +19,9 @@ namespace coreneuron { int secondorder = 0; +nrn_pragma_omp(declare target) double t, dt, celsius, pi; +nrn_pragma_omp(end declare target) int rev_dt; using Pfrv = void (*)(); diff --git a/coreneuron/network/cvodestb.cpp b/coreneuron/network/cvodestb.cpp index 31b2fec54..97c70950e 100644 --- a/coreneuron/network/cvodestb.cpp +++ b/coreneuron/network/cvodestb.cpp @@ -86,6 +86,7 @@ void fixed_play_continuous(NrnThread* nt) { // NOTE : this 
implementation is duplicated in "coreneuron/mechanism/nrnoc_ml.ispc" // for the ISPC backend. If changes are required, make sure to change ISPC as well. +nrn_pragma_omp(declare target) int at_time(NrnThread* nt, double te) { double x = te - 1e-11; if (x <= nt->_t && x > (nt->_t - nt->_dt)) { @@ -93,5 +94,6 @@ int at_time(NrnThread* nt, double te) { } return 0; } +nrn_pragma_omp(end declare target) } // namespace coreneuron diff --git a/coreneuron/network/netcvode.cpp b/coreneuron/network/netcvode.cpp index ee2e5cb3e..dd521afde 100644 --- a/coreneuron/network/netcvode.cpp +++ b/coreneuron/network/netcvode.cpp @@ -537,7 +537,7 @@ void NetCvode::check_thresh(NrnThread* nt) { // for default method nrn_pragma_acc(parallel loop present( nt [0:1], presyns_helper [0:nt->n_presyn], presyns [0:nt->n_presyn], actual_v [0:nt->end]) copy(net_send_buf_count) if (nt->compute_gpu) async(nt->stream_id)) - nrn_pragma_omp(target teams distribute parallel for simd map(tofrom: net_send_buf_count) if(nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for map(tofrom: net_send_buf_count) if(nt->compute_gpu)) for (int i = 0; i < nt->ncell; ++i) { PreSyn* ps = presyns + i; PreSynHelper* psh = presyns_helper + i; diff --git a/coreneuron/sim/scopmath/crout_thread.cpp b/coreneuron/sim/scopmath/crout_thread.cpp index b180ea107..72a5c017f 100644 --- a/coreneuron/sim/scopmath/crout_thread.cpp +++ b/coreneuron/sim/scopmath/crout_thread.cpp @@ -50,6 +50,7 @@ namespace coreneuron { #define ix(arg) ((arg) *_STRIDE) /* having a differnt permutation per instance may not be a good idea */ +nrn_pragma_omp(declare target) int nrn_crout_thread(NewtonSpace* ns, int n, double** a, int* perm, _threadargsproto_) { int save_i = 0; @@ -224,4 +225,5 @@ void nrn_scopmath_solve_thread(int n, } } } +nrn_pragma_omp(end declare target) } // namespace coreneuron diff --git a/coreneuron/sim/scopmath/newton_thread.cpp b/coreneuron/sim/scopmath/newton_thread.cpp index 6c0f303ce..dc08ca04b 100644 --- a/coreneuron/sim/scopmath/newton_thread.cpp +++ b/coreneuron/sim/scopmath/newton_thread.cpp @@ -59,6 +59,7 @@ namespace coreneuron { #define ix(arg) ((arg) *_STRIDE) #define s_(arg) _p[s[arg] * _STRIDE] +nrn_pragma_omp(declare target) int nrn_newton_thread(NewtonSpace* ns, int n, int* s, @@ -136,6 +137,7 @@ int nrn_newton_thread(NewtonSpace* ns, return (error); } +nrn_pragma_omp(end declare target) /*------------------------------------------------------------*/ /* */ diff --git a/coreneuron/sim/treeset_core.cpp b/coreneuron/sim/treeset_core.cpp index bb92d2ab1..208058fe1 100644 --- a/coreneuron/sim/treeset_core.cpp +++ b/coreneuron/sim/treeset_core.cpp @@ -34,7 +34,7 @@ static void nrn_rhs(NrnThread* _nt) { nrn_pragma_acc(parallel loop present(vec_rhs [0:i3], vec_d [0:i3]) if (_nt->compute_gpu) async(_nt->stream_id)) - nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) for (int i = i1; i < i3; ++i) { vec_rhs[i] = 0.; vec_d[i] = 0.; @@ -46,7 +46,7 @@ static void nrn_rhs(NrnThread* _nt) { nrn_pragma_acc( parallel loop present(fast_imem_d [i1:i3], fast_imem_rhs [i1:i3]) if (_nt->compute_gpu) async(_nt->stream_id)) - nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) for (int i = i1; i < i3; ++i) { fast_imem_d[i] = 0.; fast_imem_rhs[i] = 0.; @@ -76,7 +76,7 @@ static void nrn_rhs(NrnThread* _nt) { double* p = 
_nt->nrn_fast_imem->nrn_sav_rhs; nrn_pragma_acc(parallel loop present(p, vec_rhs) if (_nt->compute_gpu) async(_nt->stream_id)) - nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) for (int i = i1; i < i3; ++i) { p[i] -= vec_rhs[i]; } @@ -93,7 +93,7 @@ static void nrn_rhs(NrnThread* _nt) { vec_v [0:i3], parent_index [0:i3]) if (_nt->compute_gpu) async(_nt->stream_id)) - nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) for (int i = i2; i < i3; ++i) { double dv = vec_v[parent_index[i]] - vec_v[i]; /* our connection coefficients are negative so */ @@ -153,7 +153,7 @@ static void nrn_lhs(NrnThread* _nt) { */ double* p = _nt->nrn_fast_imem->nrn_sav_d; nrn_pragma_acc(parallel loop present(p, vec_d) if (_nt->compute_gpu) async(_nt->stream_id)) - nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) for (int i = i1; i < i3; ++i) { p[i] += vec_d[i]; } @@ -163,7 +163,7 @@ static void nrn_lhs(NrnThread* _nt) { nrn_pragma_acc(parallel loop present( vec_d [0:i3], vec_a [0:i3], vec_b [0:i3], parent_index [0:i3]) if (_nt->compute_gpu) async(_nt->stream_id)) - nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) for (int i = i2; i < i3; ++i) { nrn_pragma_acc(atomic update) nrn_pragma_omp(atomic update) diff --git a/coreneuron/utils/offload.hpp b/coreneuron/utils/offload.hpp index d90cc10fd..7ec41f4f4 100644 --- a/coreneuron/utils/offload.hpp +++ b/coreneuron/utils/offload.hpp @@ -10,11 +10,64 @@ #if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) #define nrn_pragma_acc(x) #define nrn_pragma_omp(x) _Pragma(nrn_pragma_stringify(omp x)) +#include #elif defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ defined(_OPENACC) #define nrn_pragma_acc(x) _Pragma(nrn_pragma_stringify(acc x)) #define nrn_pragma_omp(x) +#include #else #define nrn_pragma_acc(x) #define nrn_pragma_omp(x) +#include #endif + +#include + +namespace coreneuron { +template +T* cnrn_target_deviceptr(const T* h_ptr) { +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC) + return static_cast(acc_deviceptr(const_cast(h_ptr))); +#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) + return static_cast(omp_get_mapped_ptr(const_cast(h_ptr), omp_get_default_device())); +#else + throw std::runtime_error("cnrn_target_deviceptr() not implemented without OpenACC/OpenMP and gpu build"); +#endif +} + +template +T* cnrn_target_copyin(const T* h_ptr, std::size_t len = 1) { +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC) + return static_cast(acc_copyin(const_cast(h_ptr), len * sizeof(T))); +#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) + #pragma omp target enter data map(to:h_ptr[:len]) + return cnrn_target_deviceptr(const_cast(h_ptr)); +#else + throw std::runtime_error("cnrn_target_copyin() not implemented without OpenACC/OpenMP and gpu build"); +#endif +} + +template +void cnrn_target_delete(T* h_ptr, std::size_t len = 1) { +#if defined(CORENEURON_ENABLE_GPU) && 
!defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC) + acc_delete(h_ptr, len * sizeof(T)); +#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) + #pragma omp target exit data map(delete: h_ptr[:len]) +#else + throw std::runtime_error("cnrn_target_delete() not implemented without OpenACC/OpenMP and gpu build"); +#endif +} + +template +void cnrn_target_memcpy_to_device(T* d_ptr, const T* h_ptr, std::size_t len = 1) { +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC) + acc_memcpy_to_device(d_ptr, const_cast(h_ptr), len * sizeof(T)); +#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) + omp_target_memcpy(d_ptr, const_cast(h_ptr), len* sizeof(T), 0, 0, omp_get_default_device(), omp_get_initial_device()); +#else + throw std::runtime_error("cnrn_target_memcpy_to_device() not implemented without OpenACC/OpenMP and gpu build"); +#endif +} + +} diff --git a/extra/nrnivmodl_core_makefile.in b/extra/nrnivmodl_core_makefile.in index 5bd424865..f51571ae8 100644 --- a/extra/nrnivmodl_core_makefile.in +++ b/extra/nrnivmodl_core_makefile.in @@ -73,8 +73,8 @@ endif CXXFLAGS = @CORENRN_CXX_FLAGS@ CXX_COMPILE_CMD = $(CXX) $(CXXFLAGS) @CMAKE_CXX_COMPILE_OPTIONS_PIC@ @CORENRN_COMMON_COMPILE_DEFS@ $(INCLUDES) -CXX_LINK_EXE_CMD = $(CXX) $(CXXFLAGS) @CMAKE_EXE_LINKER_FLAGS@ -CXX_SHARED_LIB_CMD = $(CXX) $(CXXFLAGS) @CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS@ @CMAKE_SHARED_LIBRARY_CXX_FLAGS@ @CMAKE_SHARED_LINKER_FLAGS@ +CXX_LINK_EXE_CMD = $(CXX) $(CXXFLAGS) @CORENRN_LD_FLAGS@ @CMAKE_EXE_LINKER_FLAGS@ +CXX_SHARED_LIB_CMD = $(CXX) $(CXXFLAGS) @CORENRN_LD_FLAGS@ @CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS@ @CMAKE_SHARED_LIBRARY_CXX_FLAGS@ @CMAKE_SHARED_LINKER_FLAGS@ # ISPC compilation and link commands ISPC = @CMAKE_ISPC_COMPILER@ From 1f01552833472d0ff0cf5ddc1b024d79bde55bb1 Mon Sep 17 00:00:00 2001 From: Nicolas Cornu Date: Thu, 16 Dec 2021 09:35:16 +0100 Subject: [PATCH 17/31] Use pragmas instead of omp_get_mapped_ptr (#705) omp_get_mapped_ptr was added in OpenMP 5.1 and is not widely supported. With this change then calling cnrn_target_deviceptr on a pointer that is not present on the device is a hard error instead of returning nullptr, so avoid calling it for artificial cells. --- coreneuron/gpu/nrn_acc_manager.cpp | 6 ++++-- coreneuron/utils/offload.hpp | 12 ++++++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp index 9bd635d77..2c18f22d9 100644 --- a/coreneuron/gpu/nrn_acc_manager.cpp +++ b/coreneuron/gpu/nrn_acc_manager.cpp @@ -188,8 +188,10 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { int szdp = corenrn.get_prop_dparam_size()[type]; int is_art = corenrn.get_is_artificial()[type]; - // get device pointer for corresponding mechanism data - dptr = cnrn_target_deviceptr(tml->ml->data); + // If the mechanism is artificial data are not inside nt->_data but in a newly + // allocated block. As we never run code for artificial cell inside GPU + // we don't copy it. + dptr = is_art ? 
nullptr : cnrn_target_deviceptr(tml->ml->data); cnrn_target_memcpy_to_device(&(d_ml->data), &(dptr)); diff --git a/coreneuron/utils/offload.hpp b/coreneuron/utils/offload.hpp index 7ec41f4f4..ad4189ec1 100644 --- a/coreneuron/utils/offload.hpp +++ b/coreneuron/utils/offload.hpp @@ -30,7 +30,15 @@ T* cnrn_target_deviceptr(const T* h_ptr) { #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC) return static_cast(acc_deviceptr(const_cast(h_ptr))); #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) - return static_cast(omp_get_mapped_ptr(const_cast(h_ptr), omp_get_default_device())); + T *d_ptr = nullptr; + T *_h_ptr = const_cast(h_ptr); + + nrn_pragma_omp(target data use_device_ptr(_h_ptr)) + { + d_ptr = _h_ptr; + } + + return d_ptr; #else throw std::runtime_error("cnrn_target_deviceptr() not implemented without OpenACC/OpenMP and gpu build"); #endif @@ -42,7 +50,7 @@ T* cnrn_target_copyin(const T* h_ptr, std::size_t len = 1) { return static_cast(acc_copyin(const_cast(h_ptr), len * sizeof(T))); #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) #pragma omp target enter data map(to:h_ptr[:len]) - return cnrn_target_deviceptr(const_cast(h_ptr)); + return cnrn_target_deviceptr(h_ptr); #else throw std::runtime_error("cnrn_target_copyin() not implemented without OpenACC/OpenMP and gpu build"); #endif From d03c45f85f39985b318180603fc958f6edf1d401 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Fri, 17 Dec 2021 12:13:21 +0200 Subject: [PATCH 18/31] GPU implementation improvements (#718) * Set nwarp to very big number for optimal parallelization and improve a bit grid config of CUDA solve_interleaved2 --- coreneuron/apps/corenrn_parameters.hpp | 4 ++-- coreneuron/permute/cellorder.cu | 17 +++++++++++------ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/coreneuron/apps/corenrn_parameters.hpp b/coreneuron/apps/corenrn_parameters.hpp index 21f2f7767..e22cf348d 100644 --- a/coreneuron/apps/corenrn_parameters.hpp +++ b/coreneuron/apps/corenrn_parameters.hpp @@ -46,8 +46,8 @@ struct corenrn_parameters { unsigned ms_subint = 2; /// Number of multisend interval. 1 or 2 unsigned spkcompress = 0; /// Spike Compression unsigned cell_interleave_permute = 0; /// Cell interleaving permutation - unsigned nwarp = 1024; /// Number of warps to balance for cell_interleave_permute == 2 - unsigned num_gpus = 0; /// Number of gpus to use per node + unsigned nwarp = 65536; /// Number of warps to balance for cell_interleave_permute == 2 + unsigned num_gpus = 0; /// Number of gpus to use per node unsigned report_buff_size = report_buff_size_default; /// Size in MB of the report buffer. int seed = -1; /// Initialization seed for random number generator (int) diff --git a/coreneuron/permute/cellorder.cu b/coreneuron/permute/cellorder.cu index 1226b4bf7..0c1b5af2e 100644 --- a/coreneuron/permute/cellorder.cu +++ b/coreneuron/permute/cellorder.cu @@ -92,12 +92,17 @@ __global__ void solve_interleaved2_kernel(NrnThread* nt, InterleaveInfo* ii, int void solve_interleaved2_launcher(NrnThread* nt, InterleaveInfo* info, int ncore, void* stream) { auto cuda_stream = static_cast(stream); - // the selection of these parameters has been done after running the channel-benchmark for typical production runs, i.e. - // 1 MPI task with 1440 cells & 6 MPI tasks with 8800 cells. - // The main idea is to have multiple warps per SM and sufficient blocks to fill the GPU. 
- // In our case, given that multiple threads share the available GPUs, we "guarantee" a sufficient occupancy of the GPUs. - int threadsPerBlock = 128; - int blocksPerGrid = 512; + /// the selection of these parameters has been done after running the channel-benchmark for + /// typical production runs, i.e. 1 MPI task with 1440 cells & 6 MPI tasks with 8800 cells. + /// In the OpenACC/OpenMP implementations threadsPerBlock is set to 32. From profiling the + /// channel-benchmark circuits mentioned above we figured out that the best performance was + /// achieved with this configuration + int threadsPerBlock = warpsize; + /// Max number of blocksPerGrid for NVIDIA GPUs is 65535, so we need to make sure that the + /// blocksPerGrid we launch the CUDA kernel with doesn't exceed this number + const auto maxBlocksPerGrid = 65535; + int provisionalBlocksPerGrid = (ncore + threadsPerBlock - 1) / threadsPerBlock; + int blocksPerGrid = provisionalBlocksPerGrid <= maxBlocksPerGrid ? provisionalBlocksPerGrid : maxBlocksPerGrid; solve_interleaved2_kernel<<>>(nt, info, ncore); From 3fc7037842c3d80372d6ea1f3860643585d0f46b Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Fri, 17 Dec 2021 14:53:39 +0100 Subject: [PATCH 19/31] More CI + disable OpenACC in OpenMP builds (#717) * Re-enable GitLab CI. * Add NMODL + OpenACC test. * Restore {clang,cmake}-format checks. * Prefer OpenACC with MOD2C. * Do not enable OpenACC in NMODL + OpenMP mode. * Convert more #pragma acc to nrn_pragma_acc(...). * Call cudaSetDevice in OpenMP mode. Co-authored-by: Ioannis Magkanaris --- .../workflows/clang_cmake_format_check.yaml | 37 +++ .gitlab-ci.yml | 132 +++++++++- CMake/OpenAccHelper.cmake | 12 +- CMakeLists.txt | 14 + coreneuron/CMakeLists.txt | 16 +- coreneuron/apps/main1.cpp | 11 +- coreneuron/gpu/nrn_acc_manager.cpp | 243 +++++++++--------- coreneuron/gpu/nrn_acc_manager.hpp | 4 - coreneuron/kinderiv.py | 8 +- .../mechanism/mech/mod2c_core_thread.hpp | 31 ++- coreneuron/mechanism/mechanism.hpp | 3 +- coreneuron/mechanism/membfunc.hpp | 13 +- coreneuron/network/cvodestb.cpp | 6 +- coreneuron/network/netcvode.cpp | 3 - coreneuron/network/partrans.cpp | 3 +- coreneuron/permute/cellorder.cpp | 27 +- coreneuron/sim/scopmath/newton_struct.h | 10 +- coreneuron/sim/scopmath/sparse_thread.cpp | 2 +- coreneuron/sim/scopmath/ssimplic_thread.cpp | 5 +- coreneuron/utils/ivocvect.cpp | 5 +- coreneuron/utils/ivocvect.hpp | 12 +- coreneuron/utils/offload.hpp | 61 +++-- coreneuron/utils/profile/profiler_interface.h | 6 +- coreneuron/utils/randoms/nrnran123.h | 13 +- external/nmodl | 2 +- 25 files changed, 456 insertions(+), 223 deletions(-) create mode 100644 .github/workflows/clang_cmake_format_check.yaml diff --git a/.github/workflows/clang_cmake_format_check.yaml b/.github/workflows/clang_cmake_format_check.yaml new file mode 100644 index 000000000..b438a8080 --- /dev/null +++ b/.github/workflows/clang_cmake_format_check.yaml @@ -0,0 +1,37 @@ +name: clang-cmake-format-check + +concurrency: + group: ${{ github.workflow }}#${{ github.ref }} + cancel-in-progress: true + +on: + push: + +jobs: + build: + name: clang-cmake-format-check + runs-on: ubuntu-20.04 + steps: + - name: Fetch repository + uses: actions/checkout@v2 + - name: Install clang-format 11 + run: | + sudo apt-get update + sudo apt-get install clang-format-11 python3-pip libboost-all-dev libopenmpi-dev openmpi-bin + - name: Install cmake-format 0.6.13 + run: python3 -m pip install cmake-format==0.6.13 + - name: Configure + shell: bash + working-directory: 
${{runner.workspace}}/CoreNeuron + run: | + export PATH=/home/runner/.local/bin:$PATH + mkdir BUILD && cd BUILD + cmake -DCORENRN_CLANG_FORMAT=ON -DCORENRN_CMAKE_FORMAT=ON -DCORENRN_ENABLE_MPI=ON -DCORENRN_ENABLE_OPENMP=OFF -DClangFormat_EXECUTABLE=$(which clang-format-11) -DCMakeFormat_EXECUTABLE=$(which cmake-format) .. + - name: Run clang-format + shell: bash + working-directory: ${{runner.workspace}}/CoreNeuron/BUILD + run: make check-clang-format VERBOSE=1 + - name: Run cmake-format + shell: bash + working-directory: ${{runner.workspace}}/CoreNeuron/BUILD + run: make check-cmake-format diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 84e83c0ac..8b434bf81 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -35,6 +35,9 @@ spack_setup: - git diff - fi +.spack_intel: + variables: + SPACK_PACKAGE_COMPILER: intel .spack_nvhpc: variables: SPACK_PACKAGE_COMPILER: nvhpc @@ -42,11 +45,21 @@ spack_setup: variables: SPACK_PACKAGE: neuron SPACK_PACKAGE_REF: '' - SPACK_PACKAGE_SPEC: +coreneuron+debug+tests~legacy-unit + SPACK_PACKAGE_SPEC: +coreneuron+debug+tests~legacy-unit model_tests=channel-benchmark,olfactory .gpu_node: variables: bb5_constraint: volta +build:nmodl:intel: + stage: build_nmodl + variables: + SPACK_PACKAGE: nmodl + SPACK_PACKAGE_REF: '' + SPACK_PACKAGE_SPEC: ~legacy-unit + extends: + - .spack_build + - .spack_intel + build:nmodl:gpu: stage: build_nmodl variables: @@ -58,21 +71,92 @@ build:nmodl:gpu: - .spack_build - .spack_nvhpc +build:coreneuron+nmodl:intel: + variables: + SPACK_PACKAGE: coreneuron + SPACK_PACKAGE_SPEC: +nmodl+tests~legacy-unit build_type=Debug + extends: + - .spack_build + - .spack_intel + needs: ["build:nmodl:intel"] + +build:coreneuron:intel: + variables: + SPACK_PACKAGE: coreneuron + SPACK_PACKAGE_SPEC: +tests~legacy-unit build_type=Debug + extends: + - .spack_build + - .spack_intel + build:coreneuron+nmodl:gpu: variables: SPACK_PACKAGE: coreneuron # +report pulls in a lot of dependencies and the tests fail. # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type - SPACK_PACKAGE_SPEC: +nmodl+openmp+gpu+tests~legacy-unit~report build_type=RelWithDebInfo + SPACK_PACKAGE_SPEC: +nmodl+openmp+gpu+tests~legacy-unit~report~sympy build_type=RelWithDebInfo extends: - .spack_build - .spack_nvhpc needs: ["build:nmodl:gpu"] +build:coreneuron+nmodl~openmp:gpu: + variables: + SPACK_PACKAGE: coreneuron + # +report pulls in a lot of dependencies and the tests fail. + # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type + # Sympy + OpenMP target offload does not currently work with NVHPC + SPACK_PACKAGE_SPEC: +nmodl~openmp+gpu+tests~legacy-unit~report+sympy build_type=RelWithDebInfo + extends: + - .spack_build + - .spack_nvhpc + needs: ["build:nmodl:gpu"] + +build:coreneuron:gpu: + variables: + SPACK_PACKAGE: coreneuron + # +report pulls in a lot of dependencies and the tests fail. 
+ # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type + SPACK_PACKAGE_SPEC: +gpu+openmp+tests~legacy-unit~report build_type=RelWithDebInfo + extends: + - .spack_build + - .spack_nvhpc + +test:coreneuron+nmodl:intel: + extends: [.ctest] + needs: ["build:coreneuron+nmodl:intel"] + +test:coreneuron:intel: + extends: [.ctest] + needs: ["build:coreneuron:intel"] + test:coreneuron+nmodl:gpu: extends: [.ctest, .gpu_node] needs: ["build:coreneuron+nmodl:gpu"] +test:coreneuron+nmodl~openmp:gpu: + extends: [.ctest, .gpu_node] + needs: ["build:coreneuron+nmodl~openmp:gpu"] + +test:coreneuron:gpu: + extends: [.ctest, .gpu_node] + needs: ["build:coreneuron:gpu"] + +build:neuron+nmodl:intel: + stage: build_neuron + extends: + - .spack_build + - .spack_neuron + - .spack_intel + needs: ["build:coreneuron+nmodl:intel"] + +build:neuron:intel: + stage: build_neuron + extends: + - .spack_build + - .spack_neuron + - .spack_intel + needs: ["build:coreneuron:intel"] + build:neuron+nmodl:gpu: stage: build_neuron extends: @@ -85,7 +169,51 @@ build:neuron+nmodl:gpu: - !reference [.spack_build, before_script] needs: ["build:coreneuron+nmodl:gpu"] +build:neuron+nmodl~openmp:gpu: + stage: build_neuron + extends: + - .spack_build + - .spack_neuron + - .spack_nvhpc + before_script: + # Build py-cython and py-numpy with GCC instead of NVHPC. + - SPACK_PACKAGE_DEPENDENCIES="${SPACK_PACKAGE_DEPENDENCIES}^py-cython%gcc^py-numpy%gcc" + - !reference [.spack_build, before_script] + needs: ["build:coreneuron+nmodl~openmp:gpu"] + +build:neuron:gpu: + stage: build_neuron + extends: + - .spack_build + - .spack_neuron + - .spack_nvhpc + before_script: + # Build py-cython and py-numpy with GCC instead of NVHPC. + - SPACK_PACKAGE_DEPENDENCIES="${SPACK_PACKAGE_DEPENDENCIES}^py-cython%gcc^py-numpy%gcc" + - !reference [.spack_build, before_script] + needs: ["build:coreneuron:gpu"] + +test:neuron+nmodl:intel: + stage: test_neuron + extends: [.ctest] + needs: ["build:neuron+nmodl:intel"] + +test:neuron:intel: + stage: test_neuron + extends: [.ctest] + needs: ["build:neuron:intel"] + test:neuron+nmodl:gpu: stage: test_neuron extends: [.ctest, .gpu_node] needs: ["build:neuron+nmodl:gpu"] + +test:neuron+nmodl~openmp:gpu: + stage: test_neuron + extends: [.ctest, .gpu_node] + needs: ["build:neuron+nmodl~openmp:gpu"] + +test:neuron:gpu: + stage: test_neuron + extends: [.ctest, .gpu_node] + needs: ["build:neuron:gpu"] diff --git a/CMake/OpenAccHelper.cmake b/CMake/OpenAccHelper.cmake index e8fa6738a..063b32003 100644 --- a/CMake/OpenAccHelper.cmake +++ b/CMake/OpenAccHelper.cmake @@ -55,20 +55,26 @@ if(CORENRN_ENABLE_GPU) # due to e.g. __CUDACC__ being defined. See https://github.com/BlueBrain/CoreNeuron/issues/607 for # more information about this. -gpu=cudaX.Y ensures that OpenACC code is compiled with the same # CUDA version as is used for the explicit CUDA code. - set(NVHPC_ACC_COMP_FLAGS "-acc -Minfo=accel -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo") - set(NVHPC_ACC_LINK_FLAGS "-acc -cuda") + set(NVHPC_ACC_COMP_FLAGS "-Minfo=accel -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo") + set(NVHPC_ACC_LINK_FLAGS "-cuda") # Make sure that OpenACC code is generated for the same compute capabilities as the explicit CUDA # code. Otherwise there may be confusing linker errors. We cannot rely on nvcc and nvc++ using the # same default compute capabilities as each other, particularly on GPU-less build machines. 
foreach(compute_capability ${CMAKE_CUDA_ARCHITECTURES}) string(APPEND NVHPC_ACC_COMP_FLAGS ",cc${compute_capability}") endforeach() - if(CORENRN_ENABLE_OPENMP AND CORENRN_ENABLE_OPENMP_OFFLOAD) + if(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenMP") # Enable OpenMP target offload to GPU and if both OpenACC and OpenMP directives are available # for a region then prefer OpenMP. add_compile_definitions(CORENEURON_PREFER_OPENMP_OFFLOAD) string(APPEND NVHPC_ACC_COMP_FLAGS " -mp=gpu -Minfo=mp") string(APPEND NVHPC_ACC_LINK_FLAGS " -mp=gpu") + elseif(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenACC") + # Only enable OpenACC offload for GPU + string(APPEND NVHPC_ACC_COMP_FLAGS " -acc") + string(APPEND NVHPC_ACC_LINK_FLAGS " -acc") + else() + message(FATAL_ERROR "${CORENRN_ACCELERATOR_OFFLOAD} not supported with NVHPC compilers") endif() # avoid PGI adding standard compliant "-A" flags set(CMAKE_CXX14_STANDARD_COMPILE_OPTION --c++14) diff --git a/CMakeLists.txt b/CMakeLists.txt index 963703975..df528a965 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -121,6 +121,7 @@ else() set(CORENRN_HAVE_NVHPC_COMPILER OFF) endif() +set(CORENRN_ACCELERATOR_OFFLOAD "Disabled") if(CORENRN_ENABLE_GPU) # Older CMake versions than 3.15 have not been tested for GPU/CUDA/OpenACC support after # https://github.com/BlueBrain/CoreNeuron/pull/609. @@ -189,6 +190,18 @@ if(CORENRN_ENABLE_GPU) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr -Xcudafe --diag_suppress=3057,--diag_suppress=3085" ) + + if(CORENRN_ENABLE_NMODL) + # NMODL supports both OpenACC and OpenMP target offload + if(CORENRN_ENABLE_OPENMP AND CORENRN_ENABLE_OPENMP_OFFLOAD) + set(CORENRN_ACCELERATOR_OFFLOAD "OpenMP") + else() + set(CORENRN_ACCELERATOR_OFFLOAD "OpenACC") + endif() + else() + # MOD2C only supports OpenACC offload + set(CORENRN_ACCELERATOR_OFFLOAD "OpenACC") + endif() endif() # ============================================================================= @@ -530,6 +543,7 @@ message(STATUS "MOD2CPP PATH | ${CORENRN_MOD2CPP_BINARY}") message(STATUS "GPU Support | ${CORENRN_ENABLE_GPU}") if(CORENRN_ENABLE_GPU) message(STATUS " CUDA | ${CUDAToolkit_LIBRARY_DIR}") + message(STATUS " Offload | ${CORENRN_ACCELERATOR_OFFLOAD}") message(STATUS " Unified Memory | ${CORENRN_ENABLE_CUDA_UNIFIED_MEMORY}") endif() message(STATUS "Auto Timeout | ${CORENRN_ENABLE_TIMEOUT}") diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt index 437eb8ea7..f42568a27 100644 --- a/coreneuron/CMakeLists.txt +++ b/coreneuron/CMakeLists.txt @@ -321,13 +321,15 @@ if(EXISTS "${CORENRN_EXTERNAL_BENCHMARK_DATA}") COMMENT "Running nrnivmodl-core for channel-benchmark mechanisms") list(APPEND all_output_binaries ${output_binaries}) string( - CONCAT - benchmark_command - "'${CMAKE_BINARY_DIR}/benchmark/${CMAKE_SYSTEM_PROCESSOR}/special-core'" - " --datpath '${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark-all-440-cells-2-ranks'" - " --tstop 1 --gpu &&" - "diff out.dat '${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark-all-440-cells-2-ranks.gpu.spikes'" - ) + CONCAT benchmark_command + "'${CMAKE_BINARY_DIR}/benchmark/${CMAKE_SYSTEM_PROCESSOR}/special-core'" + " --datpath '${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark-all-440-cells-2-ranks'" + " --tstop 1 --mpi") + if(CORENRN_ENABLE_GPU) + string(APPEND benchmark_command " --gpu") + endif() + string(APPEND benchmark_command " && diff out.dat " + "'${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark-all-440-cells-2-ranks.gpu.spikes'") add_test(NAME benchmark COMMAND sh -c "${benchmark_command}") 
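For orientation (not part of the patch): the nrn_pragma_acc()/nrn_pragma_omp() macros used throughout the C++ hunks below are steered by the CORENEURON_PREFER_OPENMP_OFFLOAD compile definition that the CMake logic above adds; they presumably live in coreneuron/utils/offload.hpp, of which only a part appears in this series. A minimal sketch of how such dual-backend macros can be defined and used, with the caveat that the real definitions may differ in detail:

// Sketch only: route one directive spelling to whichever offload backend is active.
#define nrn_pragma_stringify(x) #x
#if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
#define nrn_pragma_acc(x)
#define nrn_pragma_omp(x) _Pragma(nrn_pragma_stringify(omp x))
#elif defined(CORENEURON_ENABLE_GPU) && defined(_OPENACC)
#define nrn_pragma_acc(x) _Pragma(nrn_pragma_stringify(acc x))
#define nrn_pragma_omp(x)
#else
#define nrn_pragma_acc(x)
#define nrn_pragma_omp(x)
#endif

// Usage sketch: declare a helper as callable from device code under either backend.
nrn_pragma_omp(declare target)
nrn_pragma_acc(routine seq)
extern int example_device_helper(int);  // hypothetical function, for illustration only
nrn_pragma_omp(end declare target)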
endif() set(modfile_directory "${CORENEURON_PROJECT_SOURCE_DIR}/tests/integration/ring_gap/mod") diff --git a/coreneuron/apps/main1.cpp b/coreneuron/apps/main1.cpp index 6a4d43bea..5bfda9421 100644 --- a/coreneuron/apps/main1.cpp +++ b/coreneuron/apps/main1.cpp @@ -193,10 +193,11 @@ void nrn_init_and_load_data(int argc, // precedence is: set by user, globals.dat, 34.0 celsius = corenrn_param.celsius; -#if _OPENACC +#if CORENEURON_ENABLE_GPU if (!corenrn_param.gpu && corenrn_param.cell_interleave_permute == 2) { fprintf(stderr, - "compiled with _OPENACC does not allow the combination of --cell-permute=2 and " + "compiled with CORENEURON_ENABLE_GPU does not allow the combination of " + "--cell-permute=2 and " "missing --gpu\n"); exit(1); } @@ -497,7 +498,7 @@ extern "C" void mk_mech_init(int argc, char** argv) { } #endif -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU if (corenrn_param.gpu) { init_gpu(); } @@ -558,8 +559,8 @@ extern "C" int run_solve_core(int argc, char** argv) { #endif bool compute_gpu = corenrn_param.gpu; - nrn_pragma_acc(update device(celsius, secondorder, pi) if(compute_gpu)) - nrn_pragma_omp(target update to(celsius, secondorder, pi) if(compute_gpu)) + nrn_pragma_acc(update device(celsius, secondorder, pi) if (compute_gpu)) + nrn_pragma_omp(target update to(celsius, secondorder, pi) if (compute_gpu)) { double v = corenrn_param.voltage; double dt = corenrn_param.dt; diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp index 2c18f22d9..edf9b6d63 100644 --- a/coreneuron/gpu/nrn_acc_manager.cpp +++ b/coreneuron/gpu/nrn_acc_manager.cpp @@ -28,8 +28,8 @@ #include #endif -#ifdef _OPENACC -#include +#if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) +#include #endif namespace coreneuron { @@ -41,9 +41,44 @@ void nrn_ion_global_map_delete_from_device(); void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay); void nrn_VecPlay_delete_from_device(NrnThread* nt); +int cnrn_target_get_num_devices() { +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENACC) + // choose nvidia GPU by default + acc_device_t device_type = acc_device_nvidia; + // check how many gpu devices available per node + return acc_get_num_devices(device_type); +#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENMP) + return omp_get_num_devices(); +#else + throw std::runtime_error( + "cnrn_target_get_num_devices() not implemented without OpenACC/OpenMP and gpu build"); +#endif +} + +void cnrn_target_set_default_device(int device_num) { +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENACC) + acc_set_device_num(device_num, acc_device_nvidia); +#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENMP) + omp_set_default_device(device_num); + // It seems that with NVHPC 21.9 then only setting the default OpenMP device + // is not enough: there were errors on some nodes when not-the-0th GPU was + // used. These seemed to be related to the NMODL instance structs, which are + // allocated using cudaMallocManaged. 
+ auto const cuda_code = cudaSetDevice(device_num); + assert(cuda_code == cudaSuccess); +#else + throw std::runtime_error( + "cnrn_target_set_default_device() not implemented without OpenACC/OpenMP and gpu build"); +#endif +} + /* note: threads here are corresponding to global nrn_threads array */ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU // initialize NrnThreads for gpu execution // empty thread or only artificial cells should be on cpu for (int i = 0; i < nthreads; i++) { @@ -148,8 +183,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { cnrn_target_memcpy_to_device(&(d_nt->_v_parent_index), &(d_v_parent_index)); /* nt._ml_list is used in NET_RECEIVE block and should have valid membrane list id*/ - Memb_list** d_ml_list = cnrn_target_copyin(nt->_ml_list, - corenrn.get_memb_funcs().size()); + Memb_list** d_ml_list = cnrn_target_copyin(nt->_ml_list, corenrn.get_memb_funcs().size()); cnrn_target_memcpy_to_device(&(d_nt->_ml_list), &(d_ml_list)); /* -- copy NrnThreadMembList list ml to device -- */ @@ -306,8 +340,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { if (nt->n_pntproc) { /* copy Point_processes array and fix the pointer to execute net_receive blocks on GPU */ - Point_process* pntptr = - cnrn_target_copyin(nt->pntprocs, nt->n_pntproc); + Point_process* pntptr = cnrn_target_copyin(nt->pntprocs, nt->n_pntproc); cnrn_target_memcpy_to_device(&(d_nt->pntprocs), &pntptr); } @@ -330,8 +363,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { * while updating PreSyn objects which has virtual base class. May be this is issue due * to * VTable and alignment */ - PreSynHelper* d_presyns_helper = - cnrn_target_copyin(nt->presyns_helper, nt->n_presyn); + PreSynHelper* d_presyns_helper = cnrn_target_copyin(nt->presyns_helper, nt->n_presyn); cnrn_target_memcpy_to_device(&(d_nt->presyns_helper), &d_presyns_helper); PreSyn* d_presyns = cnrn_target_copyin(nt->presyns, nt->n_presyn); cnrn_target_memcpy_to_device(&(d_nt->presyns), &d_presyns); @@ -340,7 +372,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { if (nt->_net_send_buffer_size) { /* copy send_receive buffer */ int* d_net_send_buffer = cnrn_target_copyin(nt->_net_send_buffer, - nt->_net_send_buffer_size); + nt->_net_send_buffer_size); cnrn_target_memcpy_to_device(&(d_nt->_net_send_buffer), &d_net_send_buffer); } @@ -446,13 +478,13 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { } void copy_ivoc_vect_to_device(const IvocVect& from, IvocVect& to) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU /// by default `to` is desitionation pointer on a device IvocVect* d_iv = &to; size_t n = from.size(); if (n) { - double* d_data = cnrn_target_copyin(from.data(), n); + double* d_data = cnrn_target_copyin(from.data(), n); cnrn_target_memcpy_to_device(&(d_iv->data_), &d_data); } #else @@ -462,7 +494,7 @@ void copy_ivoc_vect_to_device(const IvocVect& from, IvocVect& to) { } void delete_ivoc_vect_from_device(IvocVect& vec) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU auto const n = vec.size(); if (n) { cnrn_target_delete(vec.data(), n); @@ -479,7 +511,7 @@ void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml) { return; } -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU if (nt->compute_gpu) { // free existing vectors in buffers on gpu cnrn_target_delete(nrb->_pnt_index, nrb->_size); @@ -500,7 +532,7 @@ void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml) { 
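As a usage illustration of the two wrappers introduced at the top of nrn_acc_manager.cpp (a sketch that mirrors the init_gpu() change later in this series rather than adding anything new): one GPU is chosen per rank, round-robin across the ranks sharing a node. local_rank is an assumed name for the rank's index on the node, and <stdexcept> is assumed to be available.

// Sketch: round-robin GPU assignment across the ranks that share a node.
int num_devices_per_node = cnrn_target_get_num_devices();
if (num_devices_per_node == 0) {
    // the real code reports the error through CoreNeuron's own abort path
    throw std::runtime_error("GPU execution requested but no GPU device was found");
}
cnrn_target_set_default_device(local_rank % num_devices_per_node);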
nrb->_displ = (int*) erealloc(nrb->_displ, (nrb->_size + 1) * sizeof(int)); nrb->_nrb_index = (int*) erealloc(nrb->_nrb_index, nrb->_size * sizeof(int)); -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU if (nt->compute_gpu) { int *d_weight_index, *d_pnt_index, *d_displ, *d_nrb_index; double *d_nrb_t, *d_nrb_flag; @@ -628,7 +660,7 @@ void update_net_receive_buffer(NrnThread* nt) { } void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU if (!nt->compute_gpu) return; @@ -643,22 +675,22 @@ void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb) { if (nsb->_cnt) { Instrumentor::phase p_net_receive_buffer_order("net-send-buf-gpu2cpu"); } - nrn_pragma_acc(update self( - nsb->_sendtype[:nsb->_cnt], - nsb->_vdata_index[:nsb->_cnt], - nsb->_pnt_index[:nsb->_cnt], - nsb->_weight_index[:nsb->_cnt], - nsb->_nsb_t[:nsb->_cnt], - nsb->_nsb_flag[:nsb->_cnt]) - if (nsb->_cnt)) - nrn_pragma_omp(target update from( - nsb->_sendtype[:nsb->_cnt], - nsb->_vdata_index[:nsb->_cnt], - nsb->_pnt_index[:nsb->_cnt], - nsb->_weight_index[:nsb->_cnt], - nsb->_nsb_t[:nsb->_cnt], - nsb->_nsb_flag[:nsb->_cnt]) - if (nsb->_cnt)) + // clang-format off + nrn_pragma_acc(update self(nsb->_sendtype[:nsb->_cnt], + nsb->_vdata_index[:nsb->_cnt], + nsb->_pnt_index[:nsb->_cnt], + nsb->_weight_index[:nsb->_cnt], + nsb->_nsb_t[:nsb->_cnt], + nsb->_nsb_flag[:nsb->_cnt]) + if (nsb->_cnt)) + nrn_pragma_omp(target update from(nsb->_sendtype[:nsb->_cnt], + nsb->_vdata_index[:nsb->_cnt], + nsb->_pnt_index[:nsb->_cnt], + nsb->_weight_index[:nsb->_cnt], + nsb->_nsb_t[:nsb->_cnt], + nsb->_nsb_flag[:nsb->_cnt]) + if (nsb->_cnt)) + // clang-format on #else (void) nt; (void) nsb; @@ -666,7 +698,7 @@ void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb) { } void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU for (int i = 0; i < nthreads; i++) { NrnThread* nt = threads + i; @@ -676,23 +708,24 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { int ne = nrn_soa_padded_size(nt->end, 0); - nrn_pragma_acc(update self( - nt->_actual_rhs[:ne], - nt->_actual_d[:ne], - nt->_actual_a[:ne], - nt->_actual_b[:ne], - nt->_actual_v[:ne], - nt->_actual_area[:ne])) - nrn_pragma_omp(target update from( - nt->_actual_rhs[:ne], - nt->_actual_d[:ne], - nt->_actual_a[:ne], - nt->_actual_b[:ne], - nt->_actual_v[:ne], - nt->_actual_area[:ne])) + // clang-format off + nrn_pragma_acc(update self(nt->_actual_rhs[:ne], + nt->_actual_d[:ne], + nt->_actual_a[:ne], + nt->_actual_b[:ne], + nt->_actual_v[:ne], + nt->_actual_area[:ne])) + nrn_pragma_omp(target update from(nt->_actual_rhs[:ne], + nt->_actual_d[:ne], + nt->_actual_a[:ne], + nt->_actual_b[:ne], + nt->_actual_v[:ne], + nt->_actual_area[:ne])) + // clang-format on nrn_pragma_acc(update self(nt->_actual_diam[:ne]) if (nt->_actual_diam != nullptr)) - nrn_pragma_omp(target update from(nt->_actual_diam[:ne]) if (nt->_actual_diam != nullptr)) + nrn_pragma_omp( + target update from(nt->_actual_diam[:ne]) if (nt->_actual_diam != nullptr)) /* @todo: nt._ml_list[tml->index] = tml->ml; */ @@ -700,10 +733,8 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { for (auto tml = nt->tml; tml; tml = tml->next) { Memb_list* ml = tml->ml; - nrn_pragma_acc(update self(tml->index, - ml->nodecount)) - nrn_pragma_omp(target update from(tml->index, - ml->nodecount)) + nrn_pragma_acc(update self(tml->index, ml->nodecount)) + nrn_pragma_omp(target 
update from(tml->index, ml->nodecount)) int type = tml->index; int n = ml->nodecount; @@ -720,10 +751,8 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szp; - nrn_pragma_acc(update self(ml->data[:pcnt], - ml->nodeindices[:n])) - nrn_pragma_omp(target update from(ml->data[:pcnt], - ml->nodeindices[:n])) + nrn_pragma_acc(update self(ml->data[:pcnt], ml->nodeindices[:n])) + nrn_pragma_omp(target update from(ml->data[:pcnt], ml->nodeindices[:n])) int dpcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; nrn_pragma_acc(update self(ml->pdata[:dpcnt]) if (szdp)) @@ -731,46 +760,44 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { auto nrb = tml->ml->_net_receive_buffer; - nrn_pragma_acc(update self( - nrb->_cnt, - nrb->_size, - nrb->_pnt_offset, - nrb->_displ_cnt, - - nrb->_pnt_index[:nrb->_size], - nrb->_weight_index[:nrb->_size], - nrb->_displ[:nrb->_size + 1], - nrb->_nrb_index[:nrb->_size]) - if (nrb != nullptr)) - nrn_pragma_omp(target update from( - nrb->_cnt, - nrb->_size, - nrb->_pnt_offset, - nrb->_displ_cnt, - - nrb->_pnt_index[:nrb->_size], - nrb->_weight_index[:nrb->_size], - nrb->_displ[:nrb->_size + 1], - nrb->_nrb_index[:nrb->_size]) - if (nrb != nullptr)) + // clang-format off + nrn_pragma_acc(update self(nrb->_cnt, + nrb->_size, + nrb->_pnt_offset, + nrb->_displ_cnt, + nrb->_pnt_index[:nrb->_size], + nrb->_weight_index[:nrb->_size], + nrb->_displ[:nrb->_size + 1], + nrb->_nrb_index[:nrb->_size]) + if (nrb != nullptr)) + nrn_pragma_omp(target update from(nrb->_cnt, + nrb->_size, + nrb->_pnt_offset, + nrb->_displ_cnt, + nrb->_pnt_index[:nrb->_size], + nrb->_weight_index[:nrb->_size], + nrb->_displ[:nrb->_size + 1], + nrb->_nrb_index[:nrb->_size]) + if (nrb != nullptr)) + // clang-format on } int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); /* copy shadow_rhs to host */ /* copy shadow_d to host */ - nrn_pragma_acc(update self(nt->_shadow_rhs[:pcnt], - nt->_shadow_d[:pcnt]) - if (nt->shadow_rhs_cnt)) - nrn_pragma_omp(target update from(nt->_shadow_rhs[:pcnt], - nt->_shadow_d[:pcnt]) - if (nt->shadow_rhs_cnt)) + nrn_pragma_acc( + update self(nt->_shadow_rhs[:pcnt], nt->_shadow_d[:pcnt]) if (nt->shadow_rhs_cnt)) + nrn_pragma_omp(target update from( + nt->_shadow_rhs[:pcnt], nt->_shadow_d[:pcnt]) if (nt->shadow_rhs_cnt)) + // clang-format off nrn_pragma_acc(update self(nt->nrn_fast_imem->nrn_sav_rhs[:nt->end], - nt->nrn_fast_imem->nrn_sav_d[:nt->end]) - if (nt->nrn_fast_imem != nullptr)) + nt->nrn_fast_imem->nrn_sav_d[:nt->end]) + if (nt->nrn_fast_imem != nullptr)) nrn_pragma_omp(target update from(nt->nrn_fast_imem->nrn_sav_rhs[:nt->end], - nt->nrn_fast_imem->nrn_sav_d[:nt->end]) - if (nt->nrn_fast_imem != nullptr)) + nt->nrn_fast_imem->nrn_sav_d[:nt->end]) + if (nt->nrn_fast_imem != nullptr)) + // clang-format on nrn_pragma_acc(update self(nt->pntprocs[:nt->n_pntproc]) if (nt->n_pntproc)) nrn_pragma_omp(target update from(nt->pntprocs[:nt->n_pntproc]) if (nt->n_pntproc)) @@ -779,13 +806,9 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { nrn_pragma_omp(target update from(nt->weights[:nt->n_weight]) if (nt->n_weight)) nrn_pragma_acc(update self( - nt->presyns_helper[:nt->n_presyn], - nt->presyns[:nt->n_presyn]) - if (nt->n_presyn)) + nt->presyns_helper[:nt->n_presyn], nt->presyns[:nt->n_presyn]) if (nt->n_presyn)) nrn_pragma_omp(target update from( - nt->presyns_helper[:nt->n_presyn], - nt->presyns[:nt->n_presyn]) - if (nt->n_presyn)) + nt->presyns_helper[:nt->n_presyn], 
nt->presyns[:nt->n_presyn]) if (nt->n_presyn)) { TrajectoryRequests* tr = nt->trajec_requests; @@ -793,10 +816,8 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { // The full buffers have `bsize` entries, but only `vsize` // of them are valid. for (int i = 0; i < tr->n_trajec; ++i) { - nrn_pragma_acc(update self( - tr->varrays[i][:tr->vsize])) - nrn_pragma_omp(target update from( - tr->varrays[i][:tr->vsize])) + nrn_pragma_acc(update self(tr->varrays[i][:tr->vsize])) + nrn_pragma_omp(target update from(tr->varrays[i][:tr->vsize])) } } } @@ -858,7 +879,7 @@ void update_weights_from_gpu(NrnThread* threads, int nthreads) { * the same process. */ void delete_nrnthreads_on_device(NrnThread* threads, int nthreads) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU for (int i = 0; i < nthreads; i++) { NrnThread* nt = threads + i; { @@ -991,7 +1012,7 @@ void delete_nrnthreads_on_device(NrnThread* threads, int nthreads) { void nrn_newtonspace_copyto_device(NewtonSpace* ns) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU // FIXME this check needs to be tweaked if we ever want to run with a mix // of CPU and GPU threads. if (nrn_threads[0].compute_gpu == 0) { @@ -1033,7 +1054,7 @@ void nrn_newtonspace_copyto_device(NewtonSpace* ns) { } void nrn_newtonspace_delete_from_device(NewtonSpace* ns) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU // FIXME this check needs to be tweaked if we ever want to run with a mix // of CPU and GPU threads. if (nrn_threads[0].compute_gpu == 0) { @@ -1052,7 +1073,7 @@ void nrn_newtonspace_delete_from_device(NewtonSpace* ns) { } void nrn_sparseobj_copyto_device(SparseObj* so) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU // FIXME this check needs to be tweaked if we ever want to run with a mix // of CPU and GPU threads. if (nrn_threads[0].compute_gpu == 0) { @@ -1135,7 +1156,7 @@ void nrn_sparseobj_copyto_device(SparseObj* so) { } void nrn_sparseobj_delete_from_device(SparseObj* so) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU // FIXME this check needs to be tweaked if we ever want to run with a mix // of CPU and GPU threads. 
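The update self(...) / target update from(...) pairs reformatted in the hunks above all share one shape. A minimal hedged sketch with hypothetical names (buf, cnt), assuming the nrn_pragma_* macros from coreneuron/utils/offload.hpp are in scope:

// Sketch: copy a device-resident array back to the host under whichever
// backend is active, and only when there is something to copy.
void update_buffer_on_host(double* buf, int cnt) {
    nrn_pragma_acc(update self(buf[:cnt]) if (cnt))
    nrn_pragma_omp(target update from(buf[:cnt]) if (cnt))
}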
if (nrn_threads[0].compute_gpu == 0) { @@ -1157,12 +1178,11 @@ void nrn_sparseobj_delete_from_device(SparseObj* so) { #endif } -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU void nrn_ion_global_map_copyto_device() { if (nrn_ion_global_map_size) { - double** d_data = cnrn_target_copyin(nrn_ion_global_map, - nrn_ion_global_map_size); + double** d_data = cnrn_target_copyin(nrn_ion_global_map, nrn_ion_global_map_size); for (int j = 0; j < nrn_ion_global_map_size; j++) { if (nrn_ion_global_map[j]) { double* d_mechmap = cnrn_target_copyin(nrn_ion_global_map[j], @@ -1185,11 +1205,8 @@ void nrn_ion_global_map_delete_from_device() { } void init_gpu() { - // choose nvidia GPU by default - acc_device_t device_type = acc_device_nvidia; - // check how many gpu devices available per node - int num_devices_per_node = acc_get_num_devices(device_type); + int num_devices_per_node = cnrn_target_get_num_devices(); // if no gpu found, can't run on GPU if (num_devices_per_node == 0) { @@ -1217,11 +1234,7 @@ void init_gpu() { } #endif - int device_num = local_rank % num_devices_per_node; - acc_set_device_num(device_num, device_type); -#ifdef CORENEURON_PREFER_OPENMP_OFFLOAD - omp_set_default_device(device_num); -#endif + cnrn_target_set_default_device(local_rank % num_devices_per_node); if (nrnmpi_myid == 0 && !corenrn_param.is_quiet()) { std::cout << " Info : " << num_devices_per_node << " GPUs shared by " << local_size diff --git a/coreneuron/gpu/nrn_acc_manager.hpp b/coreneuron/gpu/nrn_acc_manager.hpp index 1334369e7..72d222cdd 100644 --- a/coreneuron/gpu/nrn_acc_manager.hpp +++ b/coreneuron/gpu/nrn_acc_manager.hpp @@ -9,10 +9,6 @@ #ifndef _nrn_device_manager_ #define _nrn_device_manager_ -#if defined(_OPENACC) -#include -#endif - #include "coreneuron/sim/multicore.hpp" namespace coreneuron { diff --git a/coreneuron/kinderiv.py b/coreneuron/kinderiv.py index 9b143c0cf..67cd93ebb 100644 --- a/coreneuron/kinderiv.py +++ b/coreneuron/kinderiv.py @@ -63,17 +63,17 @@ def write_out_kinderiv(fout): fout.write('nrn_pragma_omp(declare target)\n') for item in deriv: - fout.write('#pragma acc routine seq\n') + fout.write('nrn_pragma_acc(routine seq)\n') fout.write('extern int %s%s(_threadargsproto_);\n' % (item[0], item[1])) - fout.write('#pragma acc routine seq\n') + fout.write('nrn_pragma_acc(routine seq)\n') fout.write('extern int _newton_%s%s(_threadargsproto_);\n' % (item[0], item[1])) for item in kin: - fout.write('#pragma acc routine seq\n') + fout.write('nrn_pragma_acc(routine seq)\n') fout.write('extern int %s%s(void*, double*, _threadargsproto_);\n' % (item[0], item[1])) for item in euler: - fout.write('#pragma acc routine seq\n') + fout.write('nrn_pragma_acc(routine seq)\n') fout.write('extern int %s%s(_threadargsproto_);\n' % (item[0], item[1])) if deriv or kin or euler: diff --git a/coreneuron/mechanism/mech/mod2c_core_thread.hpp b/coreneuron/mechanism/mech/mod2c_core_thread.hpp index e4dee09ac..4c572dd18 100644 --- a/coreneuron/mechanism/mech/mod2c_core_thread.hpp +++ b/coreneuron/mechanism/mech/mod2c_core_thread.hpp @@ -11,6 +11,7 @@ #include "coreneuron/sim/multicore.hpp" #include "coreneuron/mechanism/mechanism.hpp" +#include "coreneuron/utils/offload.hpp" namespace coreneuron { @@ -35,15 +36,17 @@ using DIFUN = int; using NEWTFUN = int; using SPFUN = int; using EULFUN = int; -#pragma acc routine seq +nrn_pragma_omp(declare target) +nrn_pragma_acc(routine seq) extern int nrn_derivimplicit_steer(int, _threadargsproto_); #define difun(arg) nrn_derivimplicit_steer(arg, _threadargs_); -#pragma acc routine 
seq +nrn_pragma_acc(routine seq) extern int nrn_newton_steer(int, _threadargsproto_); #define newtfun(arg) nrn_newton_steer(arg, _threadargs_); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern int nrn_euler_steer(int, _threadargsproto_); #define eulerfun(arg) nrn_euler_steer(arg, _threadargs_); +nrn_pragma_omp(end declare target) struct Elm { unsigned row; /* Row location */ @@ -89,15 +92,19 @@ struct SparseObj { /* all the state information */ int do_flag; }; -#pragma acc routine seq +nrn_pragma_acc(routine seq) +nrn_pragma_omp(declare target) extern double* _nrn_thread_getelm(SparseObj* so, int row, int col, int _iml); +nrn_pragma_omp(end declare target) extern void* nrn_cons_sparseobj(SPFUN, int, Memb_list*, _threadargsproto_); extern void _nrn_destroy_sparseobj_thread(SparseObj* so); -#pragma acc routine seq +nrn_pragma_acc(routine seq) +nrn_pragma_omp(declare target) extern int nrn_kinetic_steer(int, SparseObj*, double*, _threadargsproto_); +nrn_pragma_omp(end declare target) #define spfun(arg1, arg2, arg3) nrn_kinetic_steer(arg1, arg2, arg3, _threadargs_); // derived from nrn/src/scopmath/euler.c @@ -116,14 +123,15 @@ static inline int euler_thread(int neqn, int* var, int* der, DIFUN fun, _threada return 0; } -#pragma acc routine seq +nrn_pragma_omp(declare target) +nrn_pragma_acc(routine seq) extern int derivimplicit_thread(int, int*, int*, DIFUN, _threadargsproto_); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern int _ss_derivimplicit_thread(int n, int* slist, int* dlist, DIFUN fun, _threadargsproto_); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern int sparse_thread(SparseObj*, int, int*, int*, double*, double, SPFUN, int, _threadargsproto_); -#pragma acc routine seq +nrn_pragma_acc(routine seq) int _ss_sparse_thread(SparseObj*, int n, int* s, @@ -134,10 +142,11 @@ int _ss_sparse_thread(SparseObj*, int linflag, _threadargsproto_); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern double _modl_get_dt_thread(NrnThread*); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern void _modl_set_dt_thread(double, NrnThread*); +nrn_pragma_omp(end declare target) void nrn_sparseobj_copyto_device(SparseObj* so); void nrn_sparseobj_delete_from_device(SparseObj* so); diff --git a/coreneuron/mechanism/mechanism.hpp b/coreneuron/mechanism/mechanism.hpp index 3e7046e4e..62be093a3 100644 --- a/coreneuron/mechanism/mechanism.hpp +++ b/coreneuron/mechanism/mechanism.hpp @@ -16,6 +16,7 @@ namespace coreneuron { // OpenACC with PGI compiler has issue when union is used and hence use struct // \todo check if newer PGI versions has resolved this issue +// OL211214: bump #if defined(_OPENACC) struct ThreadDatum { int i; @@ -88,7 +89,7 @@ struct NetSendBuffer_t: MemoryManaged { } void grow() { -#if defined(_OPENACC) +#ifdef CORENEURON_ENABLE_GPU int cannot_reallocate_on_device = 0; assert(cannot_reallocate_on_device); #else diff --git a/coreneuron/mechanism/membfunc.hpp b/coreneuron/mechanism/membfunc.hpp index 7598edf50..ba7bf9281 100644 --- a/coreneuron/mechanism/membfunc.hpp +++ b/coreneuron/mechanism/membfunc.hpp @@ -11,6 +11,7 @@ #include #include "coreneuron/mechanism/mechanism.hpp" +#include "coreneuron/utils/offload.hpp" namespace coreneuron { using Pfrpdat = Datum* (*) (void); @@ -109,12 +110,14 @@ extern void hoc_register_watch_check(nrn_watch_check_t, int); extern void nrn_jacob_capacitance(NrnThread*, Memb_list*, int); extern void nrn_writes_conc(int, int); -#pragma acc routine seq +nrn_pragma_omp(declare target) +nrn_pragma_acc(routine 
seq) extern void nrn_wrote_conc(int, double*, int, int, double**, double, int); -#pragma acc routine seq +nrn_pragma_acc(routine seq) double nrn_nernst(double ci, double co, double z, double celsius); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern double nrn_ghk(double v, double ci, double co, double z); +nrn_pragma_omp(end declare target) extern void hoc_register_prop_size(int, int, int); extern void hoc_register_dparam_semantics(int type, int, const char* name); @@ -175,8 +178,10 @@ extern void artcell_net_move(void**, Point_process*, double); extern void nrn2ncs_outputevent(int netcon_output_index, double firetime); extern bool nrn_use_localgid_; extern void net_sem_from_gpu(int sendtype, int i_vdata, int, int ith, int ipnt, double, double); -#pragma acc routine seq +nrn_pragma_acc(routine seq) +nrn_pragma_omp(declare target) extern int at_time(NrnThread*, double); +nrn_pragma_omp(end declare target) // _OPENACC and/or NET_RECEIVE_BUFFERING extern void net_sem_from_gpu(int, int, int, int, int, double, double); diff --git a/coreneuron/network/cvodestb.cpp b/coreneuron/network/cvodestb.cpp index 97c70950e..31c18807e 100644 --- a/coreneuron/network/cvodestb.cpp +++ b/coreneuron/network/cvodestb.cpp @@ -55,15 +55,15 @@ void init_net_events() { net_cvode_instance->init_events(); } -#if defined(_OPENACC) +#ifdef CORENEURON_ENABLE_GPU /* weight vectors could be updated (from INITIAL block of NET_RECEIVE, update those on GPU's */ for (int ith = 0; ith < nrn_nthread; ++ith) { NrnThread* nt = nrn_threads + ith; double* weights = nt->weights; int n_weight = nt->n_weight; if (n_weight && nt->compute_gpu) { - nrn_pragma_acc(update device(weights[0:n_weight])) - nrn_pragma_omp(target update to(weights[0:n_weight])) + nrn_pragma_acc(update device(weights [0:n_weight])) + nrn_pragma_omp(target update to(weights [0:n_weight])) } } #endif diff --git a/coreneuron/network/netcvode.cpp b/coreneuron/network/netcvode.cpp index dd521afde..4fb1d165f 100644 --- a/coreneuron/network/netcvode.cpp +++ b/coreneuron/network/netcvode.cpp @@ -26,9 +26,6 @@ #include "coreneuron/coreneuron.hpp" #include "coreneuron/utils/nrnoc_aux.hpp" -#ifdef _OPENACC -#include -#endif namespace coreneuron { #define PP2NT(pp) (nrn_threads + (pp)->_tid) #define PP2t(pp) (PP2NT(pp)->_t) diff --git a/coreneuron/network/partrans.cpp b/coreneuron/network/partrans.cpp index 1bd822f54..abc3a5a03 100644 --- a/coreneuron/network/partrans.cpp +++ b/coreneuron/network/partrans.cpp @@ -114,7 +114,8 @@ void nrnthread_v_transfer(NrnThread* _nt) { int* insrc_indices = ttd.insrc_indices.data(); double* tar_data = _nt->_data; // last element in the displacement vector gives total length -#if defined(_OPENACC) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENACC) int n_insrc_buf = insrcdspl_[nrnmpi_numprocs]; int ndata = _nt->_ndata; #endif diff --git a/coreneuron/permute/cellorder.cpp b/coreneuron/permute/cellorder.cpp index fd784fe38..6b4014a64 100644 --- a/coreneuron/permute/cellorder.cpp +++ b/coreneuron/permute/cellorder.cpp @@ -446,7 +446,7 @@ static void triang_interleaved(NrnThread* nt, if (istride < icellsize) { // only first icellsize strides matter // what is the index int ip = GPU_PARENT(i); -#ifndef _OPENACC +#ifndef CORENEURON_ENABLE_GPU nrn_assert(ip >= 0); // if (ip < 0) return; #endif double p = GPU_A(i) / GPU_D(i); @@ -468,7 +468,7 @@ static void bksub_interleaved(NrnThread* nt, GPU_RHS(icell) /= GPU_D(icell); // the root for 
(int istride = 0; istride < icellsize; ++istride) { int ip = GPU_PARENT(i); -#ifndef _OPENACC +#ifndef CORENEURON_ENABLE_GPU nrn_assert(ip >= 0); #endif GPU_RHS(i) -= GPU_B(i) * GPU_RHS(ip); @@ -482,7 +482,7 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid int icycle = ncycle - 1; int istride = stride[icycle]; int i = lastnode - istride + icore; -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU int ii = i; #endif @@ -492,7 +492,7 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid // clang-format off nrn_pragma_acc(loop seq) for (; has_subtrees_to_compute; ) { // ncycle loop -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU // serial test, gpu does this in parallel for (int icore = 0; icore < warpsize; ++icore) { int i = ii + icore; @@ -508,7 +508,7 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid nrn_pragma_omp(atomic update) GPU_RHS(ip) -= p * GPU_RHS(i); } -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU } #endif // if finished with all tree depths then ready to break @@ -520,7 +520,7 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid --icycle; istride = stride[icycle]; i -= istride; -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU ii -= istride; #endif } @@ -535,7 +535,7 @@ static void bksub_interleaved2(NrnThread* nt, int ncycle, int* stride, int firstnode) { -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU for (int i = root; i < lastroot; i += 1) { #else nrn_pragma_acc(loop seq) @@ -545,12 +545,12 @@ static void bksub_interleaved2(NrnThread* nt, } int i = firstnode + icore; -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU int ii = i; #endif for (int icycle = 0; icycle < ncycle; ++icycle) { int istride = stride[icycle]; -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU // serial test, gpu does this in parallel for (int icore = 0; icore < warpsize; ++icore) { int i = ii + icore; @@ -561,7 +561,7 @@ static void bksub_interleaved2(NrnThread* nt, GPU_RHS(i) /= GPU_D(i); } i += istride; -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU } ii += istride; #endif @@ -596,7 +596,8 @@ void solve_interleaved2(int ith) { int* strides = ii.stride; // sum ncycles of these (bad since ncompart/warpsize) int* rootbegin = ii.firstnode; // nwarp+1 of these int* nodebegin = ii.lastnode; // nwarp+1 of these -#if defined(_OPENACC) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENACC) int nstride = stridedispl[nwarp]; #endif nrn_pragma_acc(parallel loop gang vector vector_length( @@ -616,12 +617,12 @@ void solve_interleaved2(int ith) { int lastroot = rootbegin[iwarp + 1]; int firstnode = nodebegin[iwarp]; int lastnode = nodebegin[iwarp + 1]; -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU if (ic == 0) { // serial test mode. 
triang and bksub do all cores in warp #endif triang_interleaved2(nt, ic, ncycle, stride, lastnode); bksub_interleaved2(nt, root + ic, lastroot, ic, ncycle, stride, firstnode); -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU } // serial test mode #endif } diff --git a/coreneuron/sim/scopmath/newton_struct.h b/coreneuron/sim/scopmath/newton_struct.h index 8cd52732c..d01bfb822 100644 --- a/coreneuron/sim/scopmath/newton_struct.h +++ b/coreneuron/sim/scopmath/newton_struct.h @@ -25,10 +25,11 @@ struct NewtonSpace { double* rowmax; }; -#pragma acc routine seq +nrn_pragma_omp(declare target) +nrn_pragma_acc(routine seq) extern int nrn_crout_thread(NewtonSpace* ns, int n, double** a, int* perm, _threadargsproto_); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern void nrn_scopmath_solve_thread(int n, double** a, double* value, @@ -37,7 +38,7 @@ extern void nrn_scopmath_solve_thread(int n, int* s, _threadargsproto_); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern int nrn_newton_thread(NewtonSpace* ns, int n, int* s, @@ -45,7 +46,7 @@ extern int nrn_newton_thread(NewtonSpace* ns, double* value, _threadargsproto_); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern void nrn_buildjacobian_thread(NewtonSpace* ns, int n, int* s, @@ -53,6 +54,7 @@ extern void nrn_buildjacobian_thread(NewtonSpace* ns, double* value, double** jacobian, _threadargsproto_); +nrn_pragma_omp(end declare target) extern NewtonSpace* nrn_cons_newtonspace(int n, int n_instance); extern void nrn_destroy_newtonspace(NewtonSpace* ns); diff --git a/coreneuron/sim/scopmath/sparse_thread.cpp b/coreneuron/sim/scopmath/sparse_thread.cpp index d936e269a..71643430a 100644 --- a/coreneuron/sim/scopmath/sparse_thread.cpp +++ b/coreneuron/sim/scopmath/sparse_thread.cpp @@ -105,7 +105,7 @@ static void check_assert(SparseObj* so); static void re_link(SparseObj* so, unsigned i); static SparseObj* create_sparseobj(); -#if defined(_OPENACC) +#ifdef CORENEURON_ENABLE_GPU #undef emalloc #undef ecalloc #define emalloc(arg) malloc(arg) diff --git a/coreneuron/sim/scopmath/ssimplic_thread.cpp b/coreneuron/sim/scopmath/ssimplic_thread.cpp index fe11411d0..511e45d2b 100644 --- a/coreneuron/sim/scopmath/ssimplic_thread.cpp +++ b/coreneuron/sim/scopmath/ssimplic_thread.cpp @@ -9,12 +9,15 @@ #include "coreneuron/mechanism/mech/cfile/scoplib.h" #include "coreneuron/mechanism/mech/mod2c_core_thread.hpp" #include "coreneuron/sim/scopmath/errcodes.h" +#include "coreneuron/utils/offload.hpp" namespace coreneuron { #define s_(arg) _p[s[arg] * _STRIDE] -#pragma acc routine seq +nrn_pragma_acc(routine seq) +nrn_pragma_omp(declare target) static int check_state(int, int*, _threadargsproto_); +nrn_pragma_omp(end declare target) int _ss_sparse_thread(SparseObj* v, int n, diff --git a/coreneuron/utils/ivocvect.cpp b/coreneuron/utils/ivocvect.cpp index 1315d409f..b51a96ab8 100644 --- a/coreneuron/utils/ivocvect.cpp +++ b/coreneuron/utils/ivocvect.cpp @@ -7,6 +7,7 @@ */ #include "coreneuron/utils/ivocvect.hpp" +#include "coreneuron/utils/offload.hpp" namespace coreneuron { IvocVect* vector_new(int n) { @@ -26,12 +27,12 @@ void* vector_new1(int n) { return (void*) (new IvocVect(n)); } -#pragma acc routine seq +nrn_pragma_acc(routine seq) int vector_capacity(void* v) { return ((IvocVect*) v)->size(); } -#pragma acc routine seq +nrn_pragma_acc(routine seq) double* vector_vec(void* v) { return ((IvocVect*) v)->data(); } diff --git a/coreneuron/utils/ivocvect.hpp b/coreneuron/utils/ivocvect.hpp index af4286e09..80440c74d 100644 --- 
a/coreneuron/utils/ivocvect.hpp +++ b/coreneuron/utils/ivocvect.hpp @@ -9,6 +9,8 @@ #ifndef ivoc_vector_h #define ivoc_vector_h +#include "coreneuron/utils/offload.hpp" + #include #include @@ -52,17 +54,17 @@ class fixed_vector { return data_[i]; } -#pragma acc routine seq + nrn_pragma_acc(routine seq) const T* data(void) const { return data_; } -#pragma acc routine seq + nrn_pragma_acc(routine seq) T* data(void) { return data_; } -#pragma acc routine seq + nrn_pragma_acc(routine seq) size_t size() const { return n_; } @@ -76,9 +78,9 @@ extern double* vector_vec(IvocVect* v); // retro-compatibility API extern void* vector_new1(int n); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern int vector_capacity(void* v); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern double* vector_vec(void* v); } // namespace coreneuron diff --git a/coreneuron/utils/offload.hpp b/coreneuron/utils/offload.hpp index ad4189ec1..078990107 100644 --- a/coreneuron/utils/offload.hpp +++ b/coreneuron/utils/offload.hpp @@ -27,55 +27,70 @@ namespace coreneuron { template T* cnrn_target_deviceptr(const T* h_ptr) { -#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC) +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENACC) return static_cast(acc_deviceptr(const_cast(h_ptr))); -#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) - T *d_ptr = nullptr; - T *_h_ptr = const_cast(h_ptr); +#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENMP) + T const* d_ptr{}; - nrn_pragma_omp(target data use_device_ptr(_h_ptr)) - { - d_ptr = _h_ptr; - } + nrn_pragma_omp(target data use_device_ptr(h_ptr)) + { d_ptr = h_ptr; } - return d_ptr; + return const_cast(d_ptr); #else - throw std::runtime_error("cnrn_target_deviceptr() not implemented without OpenACC/OpenMP and gpu build"); + throw std::runtime_error( + "cnrn_target_deviceptr() not implemented without OpenACC/OpenMP and gpu build"); #endif } template T* cnrn_target_copyin(const T* h_ptr, std::size_t len = 1) { -#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC) +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENACC) return static_cast(acc_copyin(const_cast(h_ptr), len * sizeof(T))); -#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) - #pragma omp target enter data map(to:h_ptr[:len]) +#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENMP) + nrn_pragma_omp(target enter data map(to : h_ptr[:len])) return cnrn_target_deviceptr(h_ptr); #else - throw std::runtime_error("cnrn_target_copyin() not implemented without OpenACC/OpenMP and gpu build"); + throw std::runtime_error( + "cnrn_target_copyin() not implemented without OpenACC/OpenMP and gpu build"); #endif } template void cnrn_target_delete(T* h_ptr, std::size_t len = 1) { -#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC) +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENACC) acc_delete(h_ptr, len * sizeof(T)); -#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) - #pragma omp target exit data map(delete: h_ptr[:len]) +#elif defined(CORENEURON_ENABLE_GPU) && 
defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENMP) + nrn_pragma_omp(target exit data map(delete : h_ptr[:len])) #else - throw std::runtime_error("cnrn_target_delete() not implemented without OpenACC/OpenMP and gpu build"); + throw std::runtime_error( + "cnrn_target_delete() not implemented without OpenACC/OpenMP and gpu build"); #endif } template void cnrn_target_memcpy_to_device(T* d_ptr, const T* h_ptr, std::size_t len = 1) { -#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC) +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENACC) acc_memcpy_to_device(d_ptr, const_cast(h_ptr), len * sizeof(T)); -#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) - omp_target_memcpy(d_ptr, const_cast(h_ptr), len* sizeof(T), 0, 0, omp_get_default_device(), omp_get_initial_device()); +#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENMP) + omp_target_memcpy(d_ptr, + const_cast(h_ptr), + len * sizeof(T), + 0, + 0, + omp_get_default_device(), + omp_get_initial_device()); #else - throw std::runtime_error("cnrn_target_memcpy_to_device() not implemented without OpenACC/OpenMP and gpu build"); + throw std::runtime_error( + "cnrn_target_memcpy_to_device() not implemented without OpenACC/OpenMP and gpu build"); #endif } -} +} // namespace coreneuron diff --git a/coreneuron/utils/profile/profiler_interface.h b/coreneuron/utils/profile/profiler_interface.h index f6a24eb2e..2c68a0ae1 100644 --- a/coreneuron/utils/profile/profiler_interface.h +++ b/coreneuron/utils/profile/profiler_interface.h @@ -15,7 +15,7 @@ #include #endif -#if defined(CORENEURON_CUDA_PROFILING) && (defined(__CUDACC__) || defined(_OPENACC)) +#ifdef CORENEURON_CUDA_PROFILING #include #endif @@ -163,7 +163,7 @@ struct Caliper { #endif -#if defined(CORENEURON_CUDA_PROFILING) && (defined(__CUDACC__) || defined(_OPENACC)) +#ifdef CORENEURON_CUDA_PROFILING struct CudaProfiling { inline static void phase_begin(const char* name){}; @@ -270,7 +270,7 @@ using InstrumentorImpl = detail::Instrumentor< #if defined CORENEURON_CALIPER detail::Caliper, #endif -#if defined(CORENEURON_CUDA_PROFILING) && (defined(__CUDACC__) || defined(_OPENACC)) +#ifdef CORENEURON_CUDA_PROFILING detail::CudaProfiling, #endif #if defined(CRAYPAT) diff --git a/coreneuron/utils/randoms/nrnran123.h b/coreneuron/utils/randoms/nrnran123.h index ab432f89c..c97592161 100644 --- a/coreneuron/utils/randoms/nrnran123.h +++ b/coreneuron/utils/randoms/nrnran123.h @@ -37,6 +37,8 @@ of the full distribution available from #define R123_USE_GNU_UINT128 1 #endif +#include "coreneuron/utils/offload.hpp" + #include #include @@ -46,17 +48,12 @@ of the full distribution available from #define CORENRN_HOST_DEVICE #endif -// Is there actually any harm leaving the pragma in when DISABLE_OPENACC is true? -#if defined(_OPENACC) && !defined(DISABLE_OPENACC) -#define CORENRN_HOST_DEVICE_ACC CORENRN_HOST_DEVICE _Pragma("acc routine seq") -#else -#define CORENRN_HOST_DEVICE_ACC CORENRN_HOST_DEVICE -#endif +#define CORENRN_HOST_DEVICE_ACC CORENRN_HOST_DEVICE nrn_pragma_acc(routine seq) // Some files are compiled with DISABLE_OPENACC, and some builds have no GPU // support at all. In these two cases, request that the random123 state is // allocated using new/delete instead of CUDA unified memory. 
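To make the intent of the cnrn_target_* wrappers above concrete, here is a hedged, self-contained sketch of the deep-copy idiom they support throughout nrn_acc_manager.cpp. The struct and function names are illustrative only, and a GPU-enabled build with coreneuron/utils/offload.hpp included is assumed.

#include <cstddef>

// Sketch: copy a struct with an interior pointer to the device, then tear it down.
struct Example {
    double* data;
    std::size_t n;
};

Example* copy_example_to_device(const Example* h) {
    Example* d = cnrn_target_copyin(h);                  // shallow copy of the struct
    double* d_data = cnrn_target_copyin(h->data, h->n);  // copy the payload array
    cnrn_target_memcpy_to_device(&d->data, &d_data);     // patch the device-side pointer
    return d;
}

void delete_example_from_device(Example* h) {
    // Teardown mirrors the copy, children first, passing the original host pointers.
    cnrn_target_delete(h->data, h->n);
    cnrn_target_delete(h);
}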
-#if (defined(__CUDACC__) || defined(_OPENACC)) && !defined(DISABLE_OPENACC) +#if defined(CORENEURON_ENABLE_GPU) && !defined(DISABLE_OPENACC) #define CORENRN_RAN123_USE_UNIFIED_MEMORY true #else #define CORENRN_RAN123_USE_UNIFIED_MEMORY false @@ -100,6 +97,7 @@ void nrnran123_deletestream(nrnran123_State* s, bool use_unified_memory = CORENRN_RAN123_USE_UNIFIED_MEMORY); /* minimal data stream */ +nrn_pragma_omp(declare target) CORENRN_HOST_DEVICE_ACC void nrnran123_getseq(nrnran123_State*, uint32_t* seq, char* which); CORENRN_HOST_DEVICE_ACC void nrnran123_getids(nrnran123_State*, uint32_t* id1, uint32_t* id2); CORENRN_HOST_DEVICE_ACC void nrnran123_getids3(nrnran123_State*, @@ -128,6 +126,7 @@ CORENRN_HOST_DEVICE_ACC nrnran123_array4x32 nrnran123_iran(uint32_t seq, uint32_t id1, uint32_t id2); CORENRN_HOST_DEVICE_ACC double nrnran123_uint2dbl(uint32_t); +nrn_pragma_omp(end declare target) } // namespace coreneuron #endif diff --git a/external/nmodl b/external/nmodl index a60c5e903..fc85090f3 160000 --- a/external/nmodl +++ b/external/nmodl @@ -1 +1 @@ -Subproject commit a60c5e903126ad95cfe2bceb904d0efe83ba9d8a +Subproject commit fc85090f3fbb5736f8647170d1151af85f891467 From 9a98f73117e43688e2f0963d4451b2043ae4241d Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Fri, 17 Dec 2021 15:16:34 +0100 Subject: [PATCH 20/31] NMODL -> hackathon_main. --- external/nmodl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/nmodl b/external/nmodl index fc85090f3..ddb0c518c 160000 --- a/external/nmodl +++ b/external/nmodl @@ -1 +1 @@ -Subproject commit fc85090f3fbb5736f8647170d1151af85f891467 +Subproject commit ddb0c518c1c227eb6df80dc8ddcc7598cde9e3ee From 5ce52d5569f9311ecea6c871136725d52d59ec93 Mon Sep 17 00:00:00 2001 From: Nicolas Cornu Date: Tue, 21 Dec 2021 14:05:15 +0100 Subject: [PATCH 21/31] Separate handling of ml inside nrn_acc_manager (#719) --- coreneuron/gpu/nrn_acc_manager.cpp | 377 +++++++++++++++-------------- external/nmodl | 2 +- 2 files changed, 194 insertions(+), 185 deletions(-) diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp index edf9b6d63..bafb17346 100644 --- a/coreneuron/gpu/nrn_acc_manager.cpp +++ b/coreneuron/gpu/nrn_acc_manager.cpp @@ -76,6 +76,189 @@ void cnrn_target_set_default_device(int device_num) { #endif } +static Memb_list* copy_ml_to_device(const Memb_list* ml, int type) { + // As we never run code for artificial cell inside GPU we don't copy it. 
+ int is_art = corenrn.get_is_artificial()[type]; + if (is_art) { + return nullptr; + } + + auto d_ml = cnrn_target_copyin(ml); + + int n = ml->nodecount; + int szp = corenrn.get_prop_param_size()[type]; + int szdp = corenrn.get_prop_dparam_size()[type]; + + double* dptr = cnrn_target_deviceptr(ml->data); + cnrn_target_memcpy_to_device(&(d_ml->data), &(dptr)); + + + int* d_nodeindices = cnrn_target_copyin(ml->nodeindices, n); + cnrn_target_memcpy_to_device(&(d_ml->nodeindices), &d_nodeindices); + + if (szdp) { + int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; + int* d_pdata = cnrn_target_copyin(ml->pdata, pcnt); + cnrn_target_memcpy_to_device(&(d_ml->pdata), &d_pdata); + } + + int ts = corenrn.get_memb_funcs()[type].thread_size_; + if (ts) { + ThreadDatum* td = cnrn_target_copyin(ml->_thread, ts); + cnrn_target_memcpy_to_device(&(d_ml->_thread), &td); + } + + // net_receive buffer associated with mechanism + NetReceiveBuffer_t* nrb = ml->_net_receive_buffer; + + // if net receive buffer exist for mechanism + if (nrb) { + NetReceiveBuffer_t* d_nrb = cnrn_target_copyin(nrb); + cnrn_target_memcpy_to_device(&(d_ml->_net_receive_buffer), &d_nrb); + + int* d_pnt_index = cnrn_target_copyin(nrb->_pnt_index, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index); + + int* d_weight_index = cnrn_target_copyin(nrb->_weight_index, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index); + + double* d_nrb_t = cnrn_target_copyin(nrb->_nrb_t, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t); + + double* d_nrb_flag = cnrn_target_copyin(nrb->_nrb_flag, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag); + + int* d_displ = cnrn_target_copyin(nrb->_displ, nrb->_size + 1); + cnrn_target_memcpy_to_device(&(d_nrb->_displ), &d_displ); + + int* d_nrb_index = cnrn_target_copyin(nrb->_nrb_index, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index); + } + + /* copy NetSendBuffer_t on to GPU */ + NetSendBuffer_t* nsb = ml->_net_send_buffer; + + if (nsb) { + NetSendBuffer_t* d_nsb; + int* d_iptr; + double* d_dptr; + + d_nsb = cnrn_target_copyin(nsb); + cnrn_target_memcpy_to_device(&(d_ml->_net_send_buffer), &d_nsb); + + d_iptr = cnrn_target_copyin(nsb->_sendtype, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_sendtype), &d_iptr); + + d_iptr = cnrn_target_copyin(nsb->_vdata_index, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_vdata_index), &d_iptr); + + d_iptr = cnrn_target_copyin(nsb->_pnt_index, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_pnt_index), &d_iptr); + + d_iptr = cnrn_target_copyin(nsb->_weight_index, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_weight_index), &d_iptr); + + d_dptr = cnrn_target_copyin(nsb->_nsb_t, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_nsb_t), &d_dptr); + + d_dptr = cnrn_target_copyin(nsb->_nsb_flag, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_nsb_flag), &d_dptr); + } + + return d_ml; +} + +static void update_ml_on_host(const Memb_list* ml, int type) { + int is_art = corenrn.get_is_artificial()[type]; + if (is_art) { + // Artificial mechanisms such as PatternStim and IntervalFire + // are not copied onto the GPU. They should not, therefore, be + // updated from the GPU. 
+ return; + } + + int n = ml->nodecount; + int szp = corenrn.get_prop_param_size()[type]; + int szdp = corenrn.get_prop_dparam_size()[type]; + + int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szp; + + nrn_pragma_acc(update self(ml->data[:pcnt], ml->nodeindices[:n])) + nrn_pragma_omp(target update from(ml->data[:pcnt], ml->nodeindices[:n])) + + int dpcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; + nrn_pragma_acc(update self(ml->pdata[:dpcnt]) if (szdp)) + nrn_pragma_omp(target update from(ml->pdata[:dpcnt]) if (szdp)) + + auto nrb = ml->_net_receive_buffer; + + // clang-format off + nrn_pragma_acc(update self(nrb->_cnt, + nrb->_size, + nrb->_pnt_offset, + nrb->_displ_cnt, + nrb->_pnt_index[:nrb->_size], + nrb->_weight_index[:nrb->_size], + nrb->_displ[:nrb->_size + 1], + nrb->_nrb_index[:nrb->_size]) + if (nrb != nullptr)) + nrn_pragma_omp(target update from(nrb->_cnt, + nrb->_size, + nrb->_pnt_offset, + nrb->_displ_cnt, + nrb->_pnt_index[:nrb->_size], + nrb->_weight_index[:nrb->_size], + nrb->_displ[:nrb->_size + 1], + nrb->_nrb_index[:nrb->_size]) + if (nrb != nullptr)) + // clang-format on +} + +static void delete_ml_from_device(Memb_list* ml, int type) { + int is_art = corenrn.get_is_artificial()[type]; + if (is_art) { + return; + } + // Cleanup the net send buffer if it exists + { + NetSendBuffer_t* nsb{ml->_net_send_buffer}; + if (nsb) { + cnrn_target_delete(nsb->_nsb_flag, nsb->_size); + cnrn_target_delete(nsb->_nsb_t, nsb->_size); + cnrn_target_delete(nsb->_weight_index, nsb->_size); + cnrn_target_delete(nsb->_pnt_index, nsb->_size); + cnrn_target_delete(nsb->_vdata_index, nsb->_size); + cnrn_target_delete(nsb->_sendtype, nsb->_size); + cnrn_target_delete(nsb); + } + } + // Cleanup the net receive buffer if it exists. + { + NetReceiveBuffer_t* nrb{ml->_net_receive_buffer}; + if (nrb) { + cnrn_target_delete(nrb->_nrb_index, nrb->_size); + cnrn_target_delete(nrb->_displ, nrb->_size + 1); + cnrn_target_delete(nrb->_nrb_flag, nrb->_size); + cnrn_target_delete(nrb->_nrb_t, nrb->_size); + cnrn_target_delete(nrb->_weight_index, nrb->_size); + cnrn_target_delete(nrb->_pnt_index, nrb->_size); + cnrn_target_delete(nrb); + } + } + int n = ml->nodecount; + int szdp = corenrn.get_prop_dparam_size()[type]; + int ts = corenrn.get_memb_funcs()[type].thread_size_; + if (ts) { + cnrn_target_delete(ml->_thread, ts); + } + if (szdp) { + int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; + cnrn_target_delete(ml->pdata, pcnt); + } + cnrn_target_delete(ml->nodeindices, n); + cnrn_target_delete(ml); +} + /* note: threads here are corresponding to global nrn_threads array */ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { #ifdef CORENEURON_ENABLE_GPU @@ -210,103 +393,10 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { d_last_tml = d_tml; /* now for every tml, there is a ml. copy that and setup pointer */ - auto d_ml = cnrn_target_copyin(tml->ml); + Memb_list* d_ml = copy_ml_to_device(tml->ml, tml->index); cnrn_target_memcpy_to_device(&(d_tml->ml), &d_ml); - /* setup nt._ml_list */ cnrn_target_memcpy_to_device(&(d_ml_list[tml->index]), &d_ml); - - int type = tml->index; - int n = tml->ml->nodecount; - int szp = corenrn.get_prop_param_size()[type]; - int szdp = corenrn.get_prop_dparam_size()[type]; - int is_art = corenrn.get_is_artificial()[type]; - - // If the mechanism is artificial data are not inside nt->_data but in a newly - // allocated block. As we never run code for artificial cell inside GPU - // we don't copy it. - dptr = is_art ? 
nullptr : cnrn_target_deviceptr(tml->ml->data); - cnrn_target_memcpy_to_device(&(d_ml->data), &(dptr)); - - - if (!is_art) { - int* d_nodeindices = cnrn_target_copyin(tml->ml->nodeindices, n); - cnrn_target_memcpy_to_device(&(d_ml->nodeindices), &d_nodeindices); - } - - if (szdp) { - int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; - int* d_pdata = cnrn_target_copyin(tml->ml->pdata, pcnt); - cnrn_target_memcpy_to_device(&(d_ml->pdata), &d_pdata); - } - - int ts = corenrn.get_memb_funcs()[type].thread_size_; - if (ts) { - ThreadDatum* td = cnrn_target_copyin(tml->ml->_thread, ts); - cnrn_target_memcpy_to_device(&(d_ml->_thread), &td); - } - - NetReceiveBuffer_t *nrb, *d_nrb; - int *d_weight_index, *d_pnt_index, *d_displ, *d_nrb_index; - double *d_nrb_t, *d_nrb_flag; - - // net_receive buffer associated with mechanism - nrb = tml->ml->_net_receive_buffer; - - // if net receive buffer exist for mechanism - if (nrb) { - d_nrb = cnrn_target_copyin(nrb); - cnrn_target_memcpy_to_device(&(d_ml->_net_receive_buffer), &d_nrb); - - d_pnt_index = cnrn_target_copyin(nrb->_pnt_index, nrb->_size); - cnrn_target_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index); - - d_weight_index = cnrn_target_copyin(nrb->_weight_index, nrb->_size); - cnrn_target_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index); - - d_nrb_t = cnrn_target_copyin(nrb->_nrb_t, nrb->_size); - cnrn_target_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t); - - d_nrb_flag = cnrn_target_copyin(nrb->_nrb_flag, nrb->_size); - cnrn_target_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag); - - d_displ = cnrn_target_copyin(nrb->_displ, nrb->_size + 1); - cnrn_target_memcpy_to_device(&(d_nrb->_displ), &d_displ); - - d_nrb_index = cnrn_target_copyin(nrb->_nrb_index, nrb->_size); - cnrn_target_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index); - } - - /* copy NetSendBuffer_t on to GPU */ - NetSendBuffer_t* nsb; - nsb = tml->ml->_net_send_buffer; - - if (nsb) { - NetSendBuffer_t* d_nsb; - int* d_iptr; - double* d_dptr; - - d_nsb = cnrn_target_copyin(nsb); - cnrn_target_memcpy_to_device(&(d_ml->_net_send_buffer), &d_nsb); - - d_iptr = cnrn_target_copyin(nsb->_sendtype, nsb->_size); - cnrn_target_memcpy_to_device(&(d_nsb->_sendtype), &d_iptr); - - d_iptr = cnrn_target_copyin(nsb->_vdata_index, nsb->_size); - cnrn_target_memcpy_to_device(&(d_nsb->_vdata_index), &d_iptr); - - d_iptr = cnrn_target_copyin(nsb->_pnt_index, nsb->_size); - cnrn_target_memcpy_to_device(&(d_nsb->_pnt_index), &d_iptr); - - d_iptr = cnrn_target_copyin(nsb->_weight_index, nsb->_size); - cnrn_target_memcpy_to_device(&(d_nsb->_weight_index), &d_iptr); - - d_dptr = cnrn_target_copyin(nsb->_nsb_t, nsb->_size); - cnrn_target_memcpy_to_device(&(d_nsb->_nsb_t), &d_dptr); - - d_dptr = cnrn_target_copyin(nsb->_nsb_flag, nsb->_size); - cnrn_target_memcpy_to_device(&(d_nsb->_nsb_flag), &d_dptr); - } } if (nt->shadow_rhs_cnt) { @@ -619,6 +709,10 @@ static void net_receive_buffer_order(NetReceiveBuffer_t* nrb) { void update_net_receive_buffer(NrnThread* nt) { Instrumentor::phase p_update_net_receive_buffer("update-net-receive-buf"); for (auto tml = nt->tml; tml; tml = tml->next) { + int is_art = corenrn.get_is_artificial()[tml->index]; + if (is_art) { + continue; + } // net_receive buffer to copy NetReceiveBuffer_t* nrb = tml->ml->_net_receive_buffer; @@ -731,55 +825,11 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { /* -- copy NrnThreadMembList list ml to host -- */ for (auto tml = nt->tml; tml; tml = tml->next) { - Memb_list* ml = tml->ml; - - 
nrn_pragma_acc(update self(tml->index, ml->nodecount)) - nrn_pragma_omp(target update from(tml->index, ml->nodecount)) - - int type = tml->index; - int n = ml->nodecount; - int szp = corenrn.get_prop_param_size()[type]; - int szdp = corenrn.get_prop_dparam_size()[type]; - int is_art = corenrn.get_is_artificial()[type]; - - // Artificial mechanisms such as PatternStim and IntervalFire - // are not copied onto the GPU. They should not, therefore, be - // updated from the GPU. - if (is_art) { - continue; + if (!corenrn.get_is_artificial()[tml->index]) { + nrn_pragma_acc(update self(tml->index, tml->ml->nodecount)) + nrn_pragma_omp(target update from(tml->index, tml->ml->nodecount)) } - - int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szp; - - nrn_pragma_acc(update self(ml->data[:pcnt], ml->nodeindices[:n])) - nrn_pragma_omp(target update from(ml->data[:pcnt], ml->nodeindices[:n])) - - int dpcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; - nrn_pragma_acc(update self(ml->pdata[:dpcnt]) if (szdp)) - nrn_pragma_omp(target update from(ml->pdata[:dpcnt]) if (szdp)) - - auto nrb = tml->ml->_net_receive_buffer; - - // clang-format off - nrn_pragma_acc(update self(nrb->_cnt, - nrb->_size, - nrb->_pnt_offset, - nrb->_displ_cnt, - nrb->_pnt_index[:nrb->_size], - nrb->_weight_index[:nrb->_size], - nrb->_displ[:nrb->_size + 1], - nrb->_nrb_index[:nrb->_size]) - if (nrb != nullptr)) - nrn_pragma_omp(target update from(nrb->_cnt, - nrb->_size, - nrb->_pnt_offset, - nrb->_displ_cnt, - nrb->_pnt_index[:nrb->_size], - nrb->_weight_index[:nrb->_size], - nrb->_displ[:nrb->_size + 1], - nrb->_nrb_index[:nrb->_size]) - if (nrb != nullptr)) - // clang-format on + update_ml_on_host(tml->ml, tml->index); } int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); @@ -957,48 +1007,7 @@ void delete_nrnthreads_on_device(NrnThread* threads, int nthreads) { } for (auto tml = nt->tml; tml; tml = tml->next) { - // Cleanup the net send buffer if it exists - { - NetSendBuffer_t* nsb{tml->ml->_net_send_buffer}; - if (nsb) { - cnrn_target_delete(nsb->_nsb_flag, nsb->_size); - cnrn_target_delete(nsb->_nsb_t, nsb->_size); - cnrn_target_delete(nsb->_weight_index, nsb->_size); - cnrn_target_delete(nsb->_pnt_index, nsb->_size); - cnrn_target_delete(nsb->_vdata_index, nsb->_size); - cnrn_target_delete(nsb->_sendtype, nsb->_size); - cnrn_target_delete(nsb); - } - } - // Cleanup the net receive buffer if it exists. 
- { - NetReceiveBuffer_t* nrb{tml->ml->_net_receive_buffer}; - if (nrb) { - cnrn_target_delete(nrb->_nrb_index, nrb->_size); - cnrn_target_delete(nrb->_displ, nrb->_size + 1); - cnrn_target_delete(nrb->_nrb_flag, nrb->_size); - cnrn_target_delete(nrb->_nrb_t, nrb->_size); - cnrn_target_delete(nrb->_weight_index, nrb->_size); - cnrn_target_delete(nrb->_pnt_index, nrb->_size); - cnrn_target_delete(nrb); - } - } - int type = tml->index; - int n = tml->ml->nodecount; - int szdp = corenrn.get_prop_dparam_size()[type]; - int is_art = corenrn.get_is_artificial()[type]; - int ts = corenrn.get_memb_funcs()[type].thread_size_; - if (ts) { - cnrn_target_delete(tml->ml->_thread, ts); - } - if (szdp) { - int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; - cnrn_target_delete(tml->ml->pdata, pcnt); - } - if (!is_art) { - cnrn_target_delete(tml->ml->nodeindices, n); - } - cnrn_target_delete(tml->ml); + delete_ml_from_device(tml->ml, tml->index); cnrn_target_delete(tml); } cnrn_target_delete(nt->_ml_list, corenrn.get_memb_funcs().size()); diff --git a/external/nmodl b/external/nmodl index ddb0c518c..8535e828a 160000 --- a/external/nmodl +++ b/external/nmodl @@ -1 +1 @@ -Subproject commit ddb0c518c1c227eb6df80dc8ddcc7598cde9e3ee +Subproject commit 8535e828a7f1a4e12ffabd59c90233efc2993608 From a6c70784fe9a9961d0bf8e179cc62d50628f49c9 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 21 Dec 2021 17:19:29 +0100 Subject: [PATCH 22/31] Fixing jenkins tests --- coreneuron/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt index f42568a27..55d2baa3c 100644 --- a/coreneuron/CMakeLists.txt +++ b/coreneuron/CMakeLists.txt @@ -320,13 +320,15 @@ if(EXISTS "${CORENRN_EXTERNAL_BENCHMARK_DATA}") WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/benchmark COMMENT "Running nrnivmodl-core for channel-benchmark mechanisms") list(APPEND all_output_binaries ${output_binaries}) + list(JOIN TEST_EXEC_PREFIX " " BENCHMARK_SRUN_COMAND) string( CONCAT benchmark_command + "OMP_NUM_THREADS=1 ${BENCHMARK_SRUN_COMAND} " "'${CMAKE_BINARY_DIR}/benchmark/${CMAKE_SYSTEM_PROCESSOR}/special-core'" " --datpath '${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark-all-440-cells-2-ranks'" " --tstop 1 --mpi") if(CORENRN_ENABLE_GPU) - string(APPEND benchmark_command " --gpu") + string(APPEND benchmark_command " --gpu --cell-permute=2") endif() string(APPEND benchmark_command " && diff out.dat " "'${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark-all-440-cells-2-ranks.gpu.spikes'") From 6b8b6c3afc394029714c0886b28dceec860a1ead Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Tue, 21 Dec 2021 19:20:03 +0100 Subject: [PATCH 23/31] Address review comments. 
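Note on the eion.cpp hunks in this patch: they only reformat the paired offload directives, wrapping them in clang-format off/on so the hand-aligned argument lists survive the formatter; the pragmas themselves are unchanged. For context, a minimal sketch of the nrn_pragma_acc / nrn_pragma_omp idiom, assuming simplified macro definitions (the real definitions live in CoreNEURON's offload header and differ in detail):

    // Sketch only: exactly one of the two macros expands to a real _Pragma per build,
    // so each annotated loop carries both an OpenACC and an OpenMP spelling in the source.
    #define nrn_pragma_stringify(x) #x
    #if defined(USE_OPENMP_OFFLOAD)  /* placeholder for the real backend-selection macro */
    #define nrn_pragma_acc(x)
    #define nrn_pragma_omp(x) _Pragma(nrn_pragma_stringify(omp x))
    #else
    #define nrn_pragma_acc(x) _Pragma(nrn_pragma_stringify(acc x))
    #define nrn_pragma_omp(x)
    #endif

    void scale(double* pd, int n, bool compute_gpu) {
        // The inactive annotation disappears at preprocessing time.
        nrn_pragma_acc(parallel loop present(pd[0:n]) if(compute_gpu))
        nrn_pragma_omp(target teams distribute parallel for simd if(compute_gpu))
        for (int i = 0; i < n; ++i) {
            pd[i] *= 2.0;
        }
    }

Commas inside nested parentheses (e.g. present(a, b)) do not split the single macro argument, which is why the one-argument form of the macros is sufficient.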
--- CMake/OpenAccHelper.cmake | 2 +- CMakeLists.txt | 3 --- coreneuron/CMakeLists.txt | 36 ++---------------------------- coreneuron/mechanism/eion.cpp | 24 +++++++++++--------- coreneuron/mechanism/mechanism.hpp | 1 - coreneuron/sim/fadvance_core.cpp | 1 - 6 files changed, 17 insertions(+), 50 deletions(-) diff --git a/CMake/OpenAccHelper.cmake b/CMake/OpenAccHelper.cmake index 063b32003..2d8158be9 100644 --- a/CMake/OpenAccHelper.cmake +++ b/CMake/OpenAccHelper.cmake @@ -94,7 +94,7 @@ if(CORENRN_ENABLE_GPU) GLOBAL PROPERTY CORENEURON_LIB_LINK_FLAGS - "${NVHPC_ACC_COMP_FLAGS} ${NVHPC_ACC_LINK_FLAGS} -rdynamic -lrt -Wl,--whole-archive -L${CMAKE_HOST_SYSTEM_PROCESSOR} -lcorenrnmech -L${CMAKE_INSTALL_PREFIX}/lib -lcoreneuron -Wl,--no-whole-archive" + "${NVHPC_ACC_LINK_FLAGS} -rdynamic -lrt -Wl,--whole-archive -L${CMAKE_HOST_SYSTEM_PROCESSOR} -lcorenrnmech -L${CMAKE_INSTALL_PREFIX}/lib -lcoreneuron -Wl,--no-whole-archive" ) else() set_property(GLOBAL PROPERTY CORENEURON_LIB_LINK_FLAGS diff --git a/CMakeLists.txt b/CMakeLists.txt index df528a965..d3e1950d0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -104,9 +104,6 @@ option(CORENRN_ENABLE_SHARED "Enable shared library build" ON) option(CORENRN_ENABLE_LEGACY_UNITS "Enable legacy FARADAY, R, etc" OFF) option(CORENRN_ENABLE_PRCELLSTATE "Enable NRN_PRCELLSTATE debug feature" OFF) -set(CORENRN_EXTERNAL_BENCHMARK_DATA - "/gpfs/bbp.cscs.ch/project/proj12/nersc-gpu-hackathon-dec-2021" - CACHE PATH "Path to input data files and mechanisms for benchmarks") set(CORENRN_NMODL_DIR "" CACHE PATH "Path to nmodl source-to-source compiler installation") diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt index 55d2baa3c..d370df1df 100644 --- a/coreneuron/CMakeLists.txt +++ b/coreneuron/CMakeLists.txt @@ -288,6 +288,7 @@ if(CORENRN_ENABLE_GPU) # nrnran123.cpp possibly-temporarily uses Boost.Pool in GPU builds if it's available. find_package(Boost QUIET) if(Boost_FOUND) + message(STATUS "Boost found, enabling use of memory pools for Random123...") target_include_directories(coreneuron SYSTEM PRIVATE ${Boost_INCLUDE_DIRS}) target_compile_definitions(coreneuron PRIVATE CORENEURON_USE_BOOST_POOL) endif() @@ -302,38 +303,6 @@ set_target_properties( # ============================================================================= # create special-core with halfgap.mod for tests # ============================================================================= -set(all_output_binaries) -if(EXISTS "${CORENRN_EXTERNAL_BENCHMARK_DATA}") - # Hack for the december 2021 hackathon, build an extra special-core with channel-benchmark - # mechanisms. 
- set(modfile_directory - "${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark/benchmark/channels/lib/modlib") - file(GLOB modfiles "${modfile_directory}/*.mod") - set(output_binaries "${CMAKE_BINARY_DIR}/benchmark/${CMAKE_SYSTEM_PROCESSOR}/special-core" - "${CMAKE_BINARY_DIR}/benchmark/${CMAKE_SYSTEM_PROCESSOR}/libcorenrnmech.a") - file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/benchmark) - add_custom_command( - OUTPUT ${output_binaries} - DEPENDS scopmath coreneuron ${NMODL_TARGET_TO_DEPEND} ${modfiles} ${CORENEURON_BUILTIN_MODFILES} - COMMAND ${CMAKE_BINARY_DIR}/bin/nrnivmodl-core -b STATIC -m ${CORENRN_MOD2CPP_BINARY} -p 6 - "${modfile_directory}" - WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/benchmark - COMMENT "Running nrnivmodl-core for channel-benchmark mechanisms") - list(APPEND all_output_binaries ${output_binaries}) - list(JOIN TEST_EXEC_PREFIX " " BENCHMARK_SRUN_COMAND) - string( - CONCAT benchmark_command - "OMP_NUM_THREADS=1 ${BENCHMARK_SRUN_COMAND} " - "'${CMAKE_BINARY_DIR}/benchmark/${CMAKE_SYSTEM_PROCESSOR}/special-core'" - " --datpath '${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark-all-440-cells-2-ranks'" - " --tstop 1 --mpi") - if(CORENRN_ENABLE_GPU) - string(APPEND benchmark_command " --gpu --cell-permute=2") - endif() - string(APPEND benchmark_command " && diff out.dat " - "'${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark-all-440-cells-2-ranks.gpu.spikes'") - add_test(NAME benchmark COMMAND sh -c "${benchmark_command}") -endif() set(modfile_directory "${CORENEURON_PROJECT_SOURCE_DIR}/tests/integration/ring_gap/mod") file(GLOB modfiles "${modfile_directory}/*.mod") set(output_binaries "${CMAKE_BINARY_DIR}/bin/${CMAKE_SYSTEM_PROCESSOR}/special-core" @@ -345,8 +314,7 @@ add_custom_command( "${modfile_directory}" WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/bin COMMENT "Running nrnivmodl-core with halfgap.mod") -list(APPEND all_output_binaries ${output_binaries}) -add_custom_target(nrniv-core ALL DEPENDS ${all_output_binaries}) +add_custom_target(nrniv-core ALL DEPENDS ${output_binaries}) include_directories(${CORENEURON_PROJECT_SOURCE_DIR}) diff --git a/coreneuron/mechanism/eion.cpp b/coreneuron/mechanism/eion.cpp index 6cb3cf83d..8b58e858d 100644 --- a/coreneuron/mechanism/eion.cpp +++ b/coreneuron/mechanism/eion.cpp @@ -263,11 +263,13 @@ void nrn_cur_ion(NrnThread* nt, Memb_list* ml, int type) { int _cntml_padded = ml->_nodecount_padded; pd = ml->data; ppd = ml->pdata; - nrn_pragma_acc(parallel loop present( - pd [0:_cntml_padded * 5], - nrn_ion_global_map - [0:nrn_ion_global_map_size] [0:ion_global_map_member_size]) if (nt->compute_gpu) - async(nt->stream_id)) + // clang-format off + nrn_pragma_acc(parallel loop present(pd[0:_cntml_padded * 5], + nrn_ion_global_map[0:nrn_ion_global_map_size] + [0:ion_global_map_member_size]) + if (nt->compute_gpu) + async(nt->stream_id)) + // clang-format on nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) for (int _iml = 0; _iml < _cntml_actual; ++_iml) { dcurdv = 0.; @@ -300,11 +302,13 @@ void nrn_init_ion(NrnThread* nt, Memb_list* ml, int type) { // no `nowait` clause has been added to the OpenMP implementation. TODO: // verify if this can be made asynchronous or if there is a strong reason it // needs to be like this. 
- nrn_pragma_acc(parallel loop present( - pd [0:_cntml_padded * 5], - ppd [0:1], - nrn_ion_global_map - [0:nrn_ion_global_map_size] [0:ion_global_map_member_size]) if (nt->compute_gpu)) + // clang-format off + nrn_pragma_acc(parallel loop present(pd[0:_cntml_padded * 5], + ppd[0:1], + nrn_ion_global_map[0:nrn_ion_global_map_size] + [0:ion_global_map_member_size]) + if (nt->compute_gpu)) + // clang-format on nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) for (int _iml = 0; _iml < _cntml_actual; ++_iml) { if (iontype & 04) { diff --git a/coreneuron/mechanism/mechanism.hpp b/coreneuron/mechanism/mechanism.hpp index 62be093a3..65d7b29ce 100644 --- a/coreneuron/mechanism/mechanism.hpp +++ b/coreneuron/mechanism/mechanism.hpp @@ -16,7 +16,6 @@ namespace coreneuron { // OpenACC with PGI compiler has issue when union is used and hence use struct // \todo check if newer PGI versions has resolved this issue -// OL211214: bump #if defined(_OPENACC) struct ThreadDatum { int i; diff --git a/coreneuron/sim/fadvance_core.cpp b/coreneuron/sim/fadvance_core.cpp index a46f83535..ab6fc4bfb 100644 --- a/coreneuron/sim/fadvance_core.cpp +++ b/coreneuron/sim/fadvance_core.cpp @@ -320,7 +320,6 @@ void nrncore2nrn_send_values(NrnThread* nth) { nrn_pragma_omp(target update from(gather_i [0:1]) if (nth->compute_gpu)) } nrn_pragma_acc(wait(nth->stream_id)) - nrn_pragma_omp(taskwait) for (int i = 0; i < tr->n_trajec; ++i) { *(tr->scatter[i]) = *(tr->gather[i]); } From 531c4fe7bf6ca81a91e7dcf3523d23ab9a4ca298 Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Tue, 21 Dec 2021 19:47:44 +0100 Subject: [PATCH 24/31] Add CUDA toolkit includes. Presumably this was working before because our nvhpc localrc files accidentally included CUDA include directories before https://github.com/BlueBrain/spack/pull/1392. --- CMake/OpenAccHelper.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CMake/OpenAccHelper.cmake b/CMake/OpenAccHelper.cmake index 2d8158be9..78d02777c 100644 --- a/CMake/OpenAccHelper.cmake +++ b/CMake/OpenAccHelper.cmake @@ -10,6 +10,10 @@ if(CORENRN_ENABLE_GPU) # Enable cudaProfiler{Start,Stop}() behind the Instrumentor::phase... APIs add_compile_definitions(CORENEURON_CUDA_PROFILING CORENEURON_ENABLE_GPU) + # Plain C++ code in CoreNEURON may need to use CUDA runtime APIs for, for + # example, starting and stopping profiling. This makes sure those headers can + # be found. + include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) # cuda unified memory support if(CORENRN_ENABLE_CUDA_UNIFIED_MEMORY) add_compile_definitions(CORENEURON_UNIFIED_MEMORY) From e3aeafc93f0c7a83817501511bc3a7fe168ba52c Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Tue, 21 Dec 2021 19:56:33 +0100 Subject: [PATCH 25/31] Fixup cmake-format. --- CMake/OpenAccHelper.cmake | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/CMake/OpenAccHelper.cmake b/CMake/OpenAccHelper.cmake index 78d02777c..99469f0cc 100644 --- a/CMake/OpenAccHelper.cmake +++ b/CMake/OpenAccHelper.cmake @@ -10,9 +10,8 @@ if(CORENRN_ENABLE_GPU) # Enable cudaProfiler{Start,Stop}() behind the Instrumentor::phase... APIs add_compile_definitions(CORENEURON_CUDA_PROFILING CORENEURON_ENABLE_GPU) - # Plain C++ code in CoreNEURON may need to use CUDA runtime APIs for, for - # example, starting and stopping profiling. This makes sure those headers can - # be found. + # Plain C++ code in CoreNEURON may need to use CUDA runtime APIs for, for example, starting and + # stopping profiling. 
This makes sure those headers can be found. include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) # cuda unified memory support if(CORENRN_ENABLE_CUDA_UNIFIED_MEMORY) From 9fddc7de7319127e08dc28722b692f610a0cac44 Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Wed, 22 Dec 2021 10:23:17 +0100 Subject: [PATCH 26/31] Compile with -cuda. (#721) * Compile NVHPC+Open{ACC,MP} with -cuda. * Pull in NMODL+Eigen fixes to make this work. --- CMake/OpenAccHelper.cmake | 2 +- external/nmodl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMake/OpenAccHelper.cmake b/CMake/OpenAccHelper.cmake index 99469f0cc..225b5ff45 100644 --- a/CMake/OpenAccHelper.cmake +++ b/CMake/OpenAccHelper.cmake @@ -58,7 +58,7 @@ if(CORENRN_ENABLE_GPU) # due to e.g. __CUDACC__ being defined. See https://github.com/BlueBrain/CoreNeuron/issues/607 for # more information about this. -gpu=cudaX.Y ensures that OpenACC code is compiled with the same # CUDA version as is used for the explicit CUDA code. - set(NVHPC_ACC_COMP_FLAGS "-Minfo=accel -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo") + set(NVHPC_ACC_COMP_FLAGS "-cuda -Minfo=accel -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo") set(NVHPC_ACC_LINK_FLAGS "-cuda") # Make sure that OpenACC code is generated for the same compute capabilities as the explicit CUDA # code. Otherwise there may be confusing linker errors. We cannot rely on nvcc and nvc++ using the diff --git a/external/nmodl b/external/nmodl index 8535e828a..5ebca71ff 160000 --- a/external/nmodl +++ b/external/nmodl @@ -1 +1 @@ -Subproject commit 8535e828a7f1a4e12ffabd59c90233efc2993608 +Subproject commit 5ebca71ffc43e8cfa9ebbee5b15628bf81a546ce From 1fbba172ff017d6fc5a68441b64f08d766cd0831 Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Wed, 22 Dec 2021 10:38:14 +0100 Subject: [PATCH 27/31] Cleanup CMake for GPU offload. --- CMake/MakefileBuildOptions.cmake | 1 - CMake/OpenAccHelper.cmake | 24 +++++++++--------------- extra/nrnivmodl_core_makefile.in | 4 ++-- 3 files changed, 11 insertions(+), 18 deletions(-) diff --git a/CMake/MakefileBuildOptions.cmake b/CMake/MakefileBuildOptions.cmake index 009dd3215..fc0b0b551 100644 --- a/CMake/MakefileBuildOptions.cmake +++ b/CMake/MakefileBuildOptions.cmake @@ -75,7 +75,6 @@ string(TOUPPER "${CMAKE_BUILD_TYPE}" _BUILD_TYPE) set(CORENRN_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${_BUILD_TYPE}} ${CXX14_STD_FLAGS} ${NVHPC_ACC_COMP_FLAGS} ${NVHPC_CXX_INLINE_FLAGS}" ) -set(CORENRN_LD_FLAGS "${NVHPC_ACC_LINK_FLAGS}") # ============================================================================= # nmodl/mod2c related options : TODO diff --git a/CMake/OpenAccHelper.cmake b/CMake/OpenAccHelper.cmake index 225b5ff45..5838742f8 100644 --- a/CMake/OpenAccHelper.cmake +++ b/CMake/OpenAccHelper.cmake @@ -50,16 +50,12 @@ if(CORENRN_ENABLE_GPU) endif() set(CORENRN_CUDA_VERSION_SHORT "${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}") endif() - # -acc enables OpenACC support, -cuda links CUDA libraries and (very importantly!) seems to be - # required to make the NVHPC compiler do the device code linking. Otherwise the explicit CUDA - # device code (.cu files in libcoreneuron) has to be linked in a separate, earlier, step, which - # apparently causes problems with interoperability with OpenACC. Passing -cuda to nvc++ when - # compiling (as opposed to linking) seems to enable CUDA C++ support, which has other consequences - # due to e.g. __CUDACC__ being defined. 
See https://github.com/BlueBrain/CoreNeuron/issues/607 for - # more information about this. -gpu=cudaX.Y ensures that OpenACC code is compiled with the same - # CUDA version as is used for the explicit CUDA code. - set(NVHPC_ACC_COMP_FLAGS "-cuda -Minfo=accel -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo") - set(NVHPC_ACC_LINK_FLAGS "-cuda") + # -cuda links CUDA libraries and also seems to be important to make the NVHPC do the device code + # linking. Without this, we had problems with linking between the explicit CUDA (.cu) device code + # and offloaded OpenACC/OpenMP code. Using -cuda when compiling seems to improve error messages in + # some cases, and to be recommended by NVIDIA. We pass -gpu=cudaX.Y to ensure that OpenACC/OpenMP + # code is compiled with the same CUDA version as the explicit CUDA code. + set(NVHPC_ACC_COMP_FLAGS "-cuda -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo") # Make sure that OpenACC code is generated for the same compute capabilities as the explicit CUDA # code. Otherwise there may be confusing linker errors. We cannot rely on nvcc and nvc++ using the # same default compute capabilities as each other, particularly on GPU-less build machines. @@ -70,18 +66,16 @@ if(CORENRN_ENABLE_GPU) # Enable OpenMP target offload to GPU and if both OpenACC and OpenMP directives are available # for a region then prefer OpenMP. add_compile_definitions(CORENEURON_PREFER_OPENMP_OFFLOAD) - string(APPEND NVHPC_ACC_COMP_FLAGS " -mp=gpu -Minfo=mp") - string(APPEND NVHPC_ACC_LINK_FLAGS " -mp=gpu") + string(APPEND NVHPC_ACC_COMP_FLAGS " -mp=gpu") elseif(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenACC") # Only enable OpenACC offload for GPU string(APPEND NVHPC_ACC_COMP_FLAGS " -acc") - string(APPEND NVHPC_ACC_LINK_FLAGS " -acc") else() message(FATAL_ERROR "${CORENRN_ACCELERATOR_OFFLOAD} not supported with NVHPC compilers") endif() # avoid PGI adding standard compliant "-A" flags set(CMAKE_CXX14_STANDARD_COMPILE_OPTION --c++14) - string(APPEND CMAKE_EXE_LINKER_FLAGS " ${NVHPC_ACC_LINK_FLAGS}") + string(APPEND CMAKE_EXE_LINKER_FLAGS " ${NVHPC_ACC_COMP_FLAGS}") # Use `-Mautoinline` option to compile .cpp files generated from .mod files only. This is # especially needed when we compile with -O0 or -O1 optimisation level where we get link errors. 
# Use of `-Mautoinline` ensure that the necessary functions like `net_receive_kernel` are inlined @@ -97,7 +91,7 @@ if(CORENRN_ENABLE_GPU) GLOBAL PROPERTY CORENEURON_LIB_LINK_FLAGS - "${NVHPC_ACC_LINK_FLAGS} -rdynamic -lrt -Wl,--whole-archive -L${CMAKE_HOST_SYSTEM_PROCESSOR} -lcorenrnmech -L${CMAKE_INSTALL_PREFIX}/lib -lcoreneuron -Wl,--no-whole-archive" + "${NVHPC_ACC_COMP_FLAGS} -rdynamic -lrt -Wl,--whole-archive -L${CMAKE_HOST_SYSTEM_PROCESSOR} -lcorenrnmech -L${CMAKE_INSTALL_PREFIX}/lib -lcoreneuron -Wl,--no-whole-archive" ) else() set_property(GLOBAL PROPERTY CORENEURON_LIB_LINK_FLAGS diff --git a/extra/nrnivmodl_core_makefile.in b/extra/nrnivmodl_core_makefile.in index f51571ae8..fc339fb04 100644 --- a/extra/nrnivmodl_core_makefile.in +++ b/extra/nrnivmodl_core_makefile.in @@ -73,8 +73,8 @@ endif CXXFLAGS = @CORENRN_CXX_FLAGS@ CXX_COMPILE_CMD = $(CXX) $(CXXFLAGS) @CMAKE_CXX_COMPILE_OPTIONS_PIC@ @CORENRN_COMMON_COMPILE_DEFS@ $(INCLUDES) -CXX_LINK_EXE_CMD = $(CXX) $(CXXFLAGS) @CORENRN_LD_FLAGS@ @CMAKE_EXE_LINKER_FLAGS@ -CXX_SHARED_LIB_CMD = $(CXX) $(CXXFLAGS) @CORENRN_LD_FLAGS@ @CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS@ @CMAKE_SHARED_LIBRARY_CXX_FLAGS@ @CMAKE_SHARED_LINKER_FLAGS@ +CXX_LINK_EXE_CMD = $(CXX) $(CXXFLAGS) @CMAKE_EXE_LINKER_FLAGS@ +CXX_SHARED_LIB_CMD = $(CXX) $(CXXFLAGS) CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS@ @CMAKE_SHARED_LIBRARY_CXX_FLAGS@ @CMAKE_SHARED_LINKER_FLAGS@ # ISPC compilation and link commands ISPC = @CMAKE_ISPC_COMPILER@ From 847d415d0a4e5b626a342ae405e05abd03f2e5c1 Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Wed, 22 Dec 2021 11:36:14 +0100 Subject: [PATCH 28/31] fixup --- coreneuron/gpu/nrn_acc_manager.cpp | 4 ++-- extra/nrnivmodl_core_makefile.in | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp index bafb17346..33b526676 100644 --- a/coreneuron/gpu/nrn_acc_manager.cpp +++ b/coreneuron/gpu/nrn_acc_manager.cpp @@ -68,8 +68,8 @@ void cnrn_target_set_default_device(int device_num) { // is not enough: there were errors on some nodes when not-the-0th GPU was // used. These seemed to be related to the NMODL instance structs, which are // allocated using cudaMallocManaged. 
- auto const cuda_code = cudaSetDevice(device_num); - assert(cuda_code == cudaSuccess); + //auto const cuda_code = cudaSetDevice(device_num); + //assert(cuda_code == cudaSuccess); #else throw std::runtime_error( "cnrn_target_set_default_device() not implemented without OpenACC/OpenMP and gpu build"); diff --git a/extra/nrnivmodl_core_makefile.in b/extra/nrnivmodl_core_makefile.in index fc339fb04..5bd424865 100644 --- a/extra/nrnivmodl_core_makefile.in +++ b/extra/nrnivmodl_core_makefile.in @@ -74,7 +74,7 @@ endif CXXFLAGS = @CORENRN_CXX_FLAGS@ CXX_COMPILE_CMD = $(CXX) $(CXXFLAGS) @CMAKE_CXX_COMPILE_OPTIONS_PIC@ @CORENRN_COMMON_COMPILE_DEFS@ $(INCLUDES) CXX_LINK_EXE_CMD = $(CXX) $(CXXFLAGS) @CMAKE_EXE_LINKER_FLAGS@ -CXX_SHARED_LIB_CMD = $(CXX) $(CXXFLAGS) CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS@ @CMAKE_SHARED_LIBRARY_CXX_FLAGS@ @CMAKE_SHARED_LINKER_FLAGS@ +CXX_SHARED_LIB_CMD = $(CXX) $(CXXFLAGS) @CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS@ @CMAKE_SHARED_LIBRARY_CXX_FLAGS@ @CMAKE_SHARED_LINKER_FLAGS@ # ISPC compilation and link commands ISPC = @CMAKE_ISPC_COMPILER@ From 53b0c5fda91e98c24000238285f7ef1330fc06a4 Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Wed, 22 Dec 2021 11:38:54 +0100 Subject: [PATCH 29/31] fixup the fixup :facepalm: --- coreneuron/gpu/nrn_acc_manager.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp index 33b526676..bafb17346 100644 --- a/coreneuron/gpu/nrn_acc_manager.cpp +++ b/coreneuron/gpu/nrn_acc_manager.cpp @@ -68,8 +68,8 @@ void cnrn_target_set_default_device(int device_num) { // is not enough: there were errors on some nodes when not-the-0th GPU was // used. These seemed to be related to the NMODL instance structs, which are // allocated using cudaMallocManaged. - //auto const cuda_code = cudaSetDevice(device_num); - //assert(cuda_code == cudaSuccess); + auto const cuda_code = cudaSetDevice(device_num); + assert(cuda_code == cudaSuccess); #else throw std::runtime_error( "cnrn_target_set_default_device() not implemented without OpenACC/OpenMP and gpu build"); From 2c7377c40d885d86ddac419c07c0be6dbd0b7ed9 Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Wed, 22 Dec 2021 18:00:56 +0100 Subject: [PATCH 30/31] NMODL -> master after #783. --- external/nmodl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/nmodl b/external/nmodl index 5ebca71ff..46f8baf2b 160000 --- a/external/nmodl +++ b/external/nmodl @@ -1 +1 @@ -Subproject commit 5ebca71ffc43e8cfa9ebbee5b15628bf81a546ce +Subproject commit 46f8baf2bbeaa0d21559d6306ec37b94c601f1ee From 5c5b8a37c8bc659a2cecaaf54b16775851939d44 Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Wed, 22 Dec 2021 18:02:05 +0100 Subject: [PATCH 31/31] Drop two OpenMP taskwait directives. 
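Presumably these taskwait directives were redundant: on the affected code paths the OpenMP target constructs carry no nowait clause and are therefore already synchronous, while the OpenACC backend still needs the explicit wait on its async queue. A minimal sketch of the resulting pattern, assuming the same nrn_pragma_* macros as sketched earlier (illustrative, not the exact CoreNEURON code):

    void flush_to_host(double* buf, int n, int stream_id, bool compute_gpu) {
        // OpenACC backend: asynchronous update on a named queue ...
        nrn_pragma_acc(update self(buf[0:n]) if(compute_gpu) async(stream_id))
        // ... OpenMP backend: no nowait clause, so the transfer is complete here
        // and no trailing taskwait is required.
        nrn_pragma_omp(target update from(buf[0:n]) if(compute_gpu))
        // Only the OpenACC queue needs an explicit synchronisation point.
        nrn_pragma_acc(wait(stream_id))
    }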
--- coreneuron/gpu/nrn_acc_manager.cpp | 1 - coreneuron/network/partrans.cpp | 1 - 2 files changed, 2 deletions(-) diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp index bafb17346..d5e723527 100644 --- a/coreneuron/gpu/nrn_acc_manager.cpp +++ b/coreneuron/gpu/nrn_acc_manager.cpp @@ -750,7 +750,6 @@ void update_net_receive_buffer(NrnThread* nt) { } } nrn_pragma_acc(wait(nt->stream_id)) - nrn_pragma_omp(taskwait) } void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb) { diff --git a/coreneuron/network/partrans.cpp b/coreneuron/network/partrans.cpp index abc3a5a03..4c517e999 100644 --- a/coreneuron/network/partrans.cpp +++ b/coreneuron/network/partrans.cpp @@ -72,7 +72,6 @@ void nrnmpi_v_transfer() { if (nrn_threads[tid].compute_gpu) { compute_gpu = true; nrn_pragma_acc(wait(nrn_threads[tid].stream_id)) - nrn_pragma_omp(taskwait) } TransferThreadData& ttd = transfer_thread_data_[tid]; size_t n_outsrc_indices = ttd.outsrc_indices.size();
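
A closing note on the device-mirroring idiom used throughout the nrn_acc_manager.cpp hunks in this series: structures such as Memb_list, NetReceiveBuffer_t and NetSendBuffer_t are copied to the GPU with cnrn_target_copyin, their pointer members are then patched with cnrn_target_memcpy_to_device so that the device copy points at device arrays rather than at host memory, and cnrn_target_delete releases the allocations in reverse order. A minimal sketch of that idiom written directly against the OpenACC runtime API (the assumption that the cnrn_target_* wrappers reduce to these calls, and the Buffer type itself, are illustrative; the real wrappers also support the OpenMP offload backend):

    #include <openacc.h>
    #include <cstddef>

    struct Buffer {
        std::size_t size;
        double* data;  // host pointer; the device copy must be re-pointed
    };

    Buffer* mirror_to_device(Buffer* h_buf) {
        // 1. Copy the struct; the device copy's data member still holds the host address.
        auto* d_buf = static_cast<Buffer*>(acc_copyin(h_buf, sizeof(Buffer)));
        // 2. Copy the array the struct points to.
        auto* d_data = static_cast<double*>(acc_copyin(h_buf->data, h_buf->size * sizeof(double)));
        // 3. Overwrite the device-side pointer member with the device address.
        acc_memcpy_to_device(&d_buf->data, &d_data, sizeof(double*));
        return d_buf;
    }

    void delete_from_device(Buffer* h_buf) {
        // Release in reverse order of creation: the array first, then the struct.
        acc_delete(h_buf->data, h_buf->size * sizeof(double));
        acc_delete(h_buf, sizeof(Buffer));
    }

Patching the pointer on the device (step 3) is what lets kernels traverse the mirrored structure without unified memory; it is also why the deletion code walks the same members in reverse before deleting the enclosing struct.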