diff --git a/.clang-format.changes b/.clang-format.changes
index 01b58702d..4c2b11b59 100644
--- a/.clang-format.changes
+++ b/.clang-format.changes
@@ -1,2 +1,3 @@
-SortIncludes: false
 IndentCaseLabels: true
+SortIncludes: false
+StatementMacros: [nrn_pragma_acc, nrn_pragma_omp]
diff --git a/.cmake-format.changes.yaml b/.cmake-format.changes.yaml
index 19ea9c084..2f20247f7 100644
--- a/.cmake-format.changes.yaml
+++ b/.cmake-format.changes.yaml
@@ -1,9 +1,4 @@
 additional_commands:
-  cuda_add_library:
-    pargs: '*'
-    flags: ["STATIC", "SHARED", "MODULE", "EXCLUDE_FROM_ALL"]
-    kwargs:
-      OPTIONS: '*'
   cpp_cc_build_time_copy:
     flags: ['NO_TARGET']
     kwargs:
diff --git a/CMake/OpenAccHelper.cmake b/CMake/OpenAccHelper.cmake
index 7767a3672..c7f91a7c9 100644
--- a/CMake/OpenAccHelper.cmake
+++ b/CMake/OpenAccHelper.cmake
@@ -55,7 +55,7 @@ if(CORENRN_ENABLE_GPU)
   # due to e.g. __CUDACC__ being defined. See https://github.com/BlueBrain/CoreNeuron/issues/607 for
   # more information about this. -gpu=cudaX.Y ensures that OpenACC code is compiled with the same
   # CUDA version as is used for the explicit CUDA code.
-  set(NVHPC_ACC_COMP_FLAGS "-acc -gpu=cuda${CORENRN_CUDA_VERSION_SHORT}")
+  set(NVHPC_ACC_COMP_FLAGS "-acc -Minfo=accel -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo")
   set(NVHPC_ACC_LINK_FLAGS "-acc -cuda")
   # Make sure that OpenACC code is generated for the same compute capabilities as the explicit CUDA
   # code. Otherwise there may be confusing linker errors. We cannot rely on nvcc and nvc++ using the
@@ -63,6 +63,12 @@ if(CORENRN_ENABLE_GPU)
   foreach(compute_capability ${CMAKE_CUDA_ARCHITECTURES})
     string(APPEND NVHPC_ACC_COMP_FLAGS ",cc${compute_capability}")
   endforeach()
+  if(CORENRN_ENABLE_OPENMP AND CORENRN_ENABLE_OPENMP_OFFLOAD)
+    # Enable OpenMP target offload to GPU and if both OpenACC and OpenMP directives are available
+    # for a region then prefer OpenMP.
+    add_compile_definitions(CORENEURON_PREFER_OPENMP_OFFLOAD)
+    string(APPEND NVHPC_ACC_COMP_FLAGS " -mp=gpu -Minfo=mp")
+  endif()
   # avoid PGI adding standard compliant "-A" flags
   set(CMAKE_CXX14_STANDARD_COMPILE_OPTION --c++14)
   string(APPEND CMAKE_EXE_LINKER_FLAGS " ${NVHPC_ACC_LINK_FLAGS}")
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4e53a5de6..963703975 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -85,6 +85,7 @@ add_subdirectory(${CORENEURON_PROJECT_SOURCE_DIR}/external/CLI11)
 # Build options
 # =============================================================================
 option(CORENRN_ENABLE_OPENMP "Build the CORE NEURON with OpenMP implementation" ON)
+option(CORENRN_ENABLE_OPENMP_OFFLOAD "Prefer OpenMP target offload to OpenACC" ON)
 option(CORENRN_ENABLE_TIMEOUT "Enable nrn_timeout implementation" ON)
 option(CORENRN_ENABLE_REPORTING "Enable use of ReportingLib for soma reports" OFF)
 option(CORENRN_ENABLE_MPI "Enable MPI-based execution" ON)
@@ -104,7 +105,7 @@
 option(CORENRN_ENABLE_LEGACY_UNITS "Enable legacy FARADAY, R, etc" OFF)
 option(CORENRN_ENABLE_PRCELLSTATE "Enable NRN_PRCELLSTATE debug feature" OFF)
 set(CORENRN_EXTERNAL_BENCHMARK_DATA
-    ""
+    "/gpfs/bbp.cscs.ch/project/proj12/nersc-gpu-hackathon-dec-2021"
     CACHE PATH "Path to input data files and mechanisms for benchmarks")
 set(CORENRN_NMODL_DIR
     ""
@@ -138,7 +139,7 @@ if(CORENRN_ENABLE_GPU)
   # Set some sensible default CUDA architectures.
   if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-    set(CMAKE_CUDA_ARCHITECTURES 60 70 80)
+    set(CMAKE_CUDA_ARCHITECTURES 70 80)
     message(STATUS "Setting default CUDA architectures to ${CMAKE_CUDA_ARCHITECTURES}")
   endif()
diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt
index 60bd2b370..2308ab99a 100644
--- a/coreneuron/CMakeLists.txt
+++ b/coreneuron/CMakeLists.txt
@@ -325,7 +325,7 @@ if(NOT ${CORENRN_EXTERNAL_BENCHMARK_DATA} STREQUAL "")
       benchmark_command
       "'${CMAKE_BINARY_DIR}/benchmark/${CMAKE_SYSTEM_PROCESSOR}/special-core'"
       " --datpath '${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark-all-440-cells-2-ranks'"
-      " --tstop 1 &&"
+      " --tstop 1 --gpu &&"
       "diff out.dat '${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark-all-440-cells-2-ranks.gpu.spikes'"
   )
   add_test(NAME benchmark COMMAND sh -c "${benchmark_command}")
diff --git a/coreneuron/apps/main1.cpp b/coreneuron/apps/main1.cpp
index 0fdaa509b..6a4d43bea 100644
--- a/coreneuron/apps/main1.cpp
+++ b/coreneuron/apps/main1.cpp
@@ -558,10 +558,8 @@ extern "C" int run_solve_core(int argc, char** argv) {
 #endif
         bool compute_gpu = corenrn_param.gpu;
 
-        // clang-format off
-
-        #pragma acc update device(celsius, secondorder, pi) if (compute_gpu)
-        // clang-format on
+        nrn_pragma_acc(update device(celsius, secondorder, pi) if(compute_gpu))
+        nrn_pragma_omp(target update to(celsius, secondorder, pi) if(compute_gpu))
         {
             double v = corenrn_param.voltage;
             double dt = corenrn_param.dt;
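
The two directives added to main1.cpp above illustrate the convention used for every data-movement pragma in this patch: OpenACC `update device(...)` pairs with OpenMP `target update to(...)`, and `update host(...)`/`update self(...)` pair with `target update from(...)`. Exactly one of the two macros expands in any given build (see coreneuron/utils/offload.hpp at the end of this diff). A minimal self-contained sketch of the idiom; the function and variable names here are illustrative, not from the patch:

    #include "coreneuron/utils/offload.hpp"

    // Refresh the device copy of a host array section; expands to an OpenACC
    // or OpenMP directive depending on the build, and to nothing on CPU-only
    // builds. `push_to_device`, `values` and `n` are hypothetical names.
    void push_to_device(double* values, int n, bool compute_gpu) {
        nrn_pragma_acc(update device(values[0:n]) if(compute_gpu))
        nrn_pragma_omp(target update to(values[0:n]) if(compute_gpu))
    }
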
diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
index ac98f5420..b249875dc 100644
--- a/coreneuron/gpu/nrn_acc_manager.cpp
+++ b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -27,6 +27,9 @@
 #ifdef _OPENACC
 #include <openacc.h>
 #endif
+#ifdef CORENEURON_PREFER_OPENMP_OFFLOAD
+#include <omp.h>
+#endif
 
 #ifdef CRAYPAT
 #include <pat_api.h>
@@ -605,25 +608,36 @@ void update_net_receive_buffer(NrnThread* nt) {
         // instance order to avoid race. setup _displ and _nrb_index
         net_receive_buffer_order(nrb);
 
-#ifdef _OPENACC
         if (nt->compute_gpu) {
             Instrumentor::phase p_net_receive_buffer_order("net-receive-buf-cpu2gpu");
 
             // note that dont update nrb otherwise we lose pointers
+            // clang-format off
+
             /* update scalar elements */
-            acc_update_device(&nrb->_cnt, sizeof(int));
-            acc_update_device(&nrb->_displ_cnt, sizeof(int));
-
-            acc_update_device(nrb->_pnt_index, sizeof(int) * nrb->_cnt);
-            acc_update_device(nrb->_weight_index, sizeof(int) * nrb->_cnt);
-            acc_update_device(nrb->_nrb_t, sizeof(double) * nrb->_cnt);
-            acc_update_device(nrb->_nrb_flag, sizeof(double) * nrb->_cnt);
-            acc_update_device(nrb->_displ, sizeof(int) * (nrb->_displ_cnt + 1));
-            acc_update_device(nrb->_nrb_index, sizeof(int) * nrb->_cnt);
+            nrn_pragma_acc(update device(nrb->_cnt,
+                                         nrb->_displ_cnt,
+                                         nrb->_pnt_index[:nrb->_cnt],
+                                         nrb->_weight_index[:nrb->_cnt],
+                                         nrb->_nrb_t[:nrb->_cnt],
+                                         nrb->_nrb_flag[:nrb->_cnt],
+                                         nrb->_displ[:nrb->_displ_cnt + 1],
+                                         nrb->_nrb_index[:nrb->_cnt])
+                               async(nt->stream_id))
+            nrn_pragma_omp(target update to(nrb->_cnt,
+                                            nrb->_displ_cnt,
+                                            nrb->_pnt_index[:nrb->_cnt],
+                                            nrb->_weight_index[:nrb->_cnt],
+                                            nrb->_nrb_t[:nrb->_cnt],
+                                            nrb->_nrb_flag[:nrb->_cnt],
+                                            nrb->_displ[:nrb->_displ_cnt + 1],
+                                            nrb->_nrb_index[:nrb->_cnt]))
+            // clang-format on
         }
-#endif
         }
     }
+    nrn_pragma_acc(wait(nt->stream_id))
+    nrn_pragma_omp(taskwait)
 }
 
 void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb) {
@@ -894,67 +908,12 @@ void update_weights_from_gpu(NrnThread* threads, int nthreads) {
         size_t n_weight = nt->n_weight;
         if (nt->compute_gpu && n_weight > 0) {
             double* weights = nt->weights;
-            // clang-format off
-
-            #pragma acc update host(weights [0:n_weight])
-            // clang-format on
+            nrn_pragma_acc(update host(weights [0:n_weight]))
+            nrn_pragma_omp(target update from(weights [0:n_weight]))
         }
     }
 }
 
-void update_matrix_from_gpu(NrnThread* _nt) {
-#ifdef _OPENACC
-    if (_nt->compute_gpu && (_nt->end > 0)) {
-        /* before copying, make sure all computations in the stream are completed */
-
-        // clang-format off
-
-        #pragma acc wait(_nt->stream_id)
-
-        /* openacc routine doesn't allow asyn, use pragma */
-        // acc_update_self(_nt->_actual_rhs, 2*_nt->end*sizeof(double));
-
-        /* RHS and D are contigious, copy them in one go!
-         * NOTE: in pragma you have to give actual pointer like below and not nt->rhs...
-         */
-        double* rhs = _nt->_actual_rhs;
-        int ne = nrn_soa_padded_size(_nt->end, 0);
-
-        #pragma acc update host(rhs[0 : 2 * ne]) async(_nt->stream_id)
-        #pragma acc wait(_nt->stream_id)
-        // clang-format on
-    }
-#else
-    (void) _nt;
-#endif
-}
-
-void update_matrix_to_gpu(NrnThread* _nt) {
-#ifdef _OPENACC
-    if (_nt->compute_gpu && (_nt->end > 0)) {
-        /* before copying, make sure all computations in the stream are completed */
-
-        // clang-format off
-
-        #pragma acc wait(_nt->stream_id)
-
-        /* while discussion with Michael we found that RHS is also needed on
-         * gpu because nrn_cap_jacob uses rhs which is being updated on GPU
-         */
-        double* v = _nt->_actual_v;
-        double* rhs = _nt->_actual_rhs;
-        int ne = nrn_soa_padded_size(_nt->end, 0);
-
-        #pragma acc update device(v[0 : ne]) async(_nt->stream_id)
-        #pragma acc update device(rhs[0 : ne]) async(_nt->stream_id)
-        #pragma acc wait(_nt->stream_id)
-        // clang-format on
-    }
-#else
-    (void) _nt;
-#endif
-}
-
 /** Cleanup device memory that is being tracked by the OpenACC runtime.
  *
  * This function painstakingly calls `acc_delete` in reverse order on all
@@ -1343,8 +1302,11 @@ void init_gpu() {
     int device_num = local_rank % num_devices_per_node;
     acc_set_device_num(device_num, device_type);
+#ifdef CORENEURON_PREFER_OPENMP_OFFLOAD
+    omp_set_default_device(device_num);
+#endif
 
-    if (nrnmpi_myid == 0) {
+    if (nrnmpi_myid == 0 && !corenrn_param.is_quiet()) {
         std::cout << " Info : " << num_devices_per_node << " GPUs shared by " << local_size
                   << " ranks per node\n";
     }
diff --git a/coreneuron/gpu/nrn_acc_manager.hpp b/coreneuron/gpu/nrn_acc_manager.hpp
index 67e6a058c..354bdc208 100644
--- a/coreneuron/gpu/nrn_acc_manager.hpp
+++ b/coreneuron/gpu/nrn_acc_manager.hpp
@@ -23,8 +23,6 @@
 void update_nrnthreads_on_device(NrnThread* threads, int nthreads);
 void modify_data_on_device(NrnThread* threads, int nthreads);
 void dump_nt_to_file(char* filename, NrnThread* threads, int nthreads);
 
-void update_matrix_from_gpu(NrnThread* _nt);
-void update_matrix_to_gpu(NrnThread* _nt);
 void update_net_receive_buffer(NrnThread* _nt);
 void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml);
 void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb);
diff --git a/coreneuron/io/lfp.cpp b/coreneuron/io/lfp.cpp
index 646fbf5a0..2a001b85a 100644
--- a/coreneuron/io/lfp.cpp
+++ b/coreneuron/io/lfp.cpp
@@ -7,9 +7,6 @@
 
 namespace coreneuron {
 
-// extern variables require acc declare
-#pragma acc declare create(pi)
-
 namespace lfputils {
 
 double line_source_lfp_factor(const Point3D& e_pos,
diff --git a/coreneuron/mechanism/capac.cpp b/coreneuron/mechanism/capac.cpp
index ee62f660d..42c65cb18 100644
--- a/coreneuron/mechanism/capac.cpp
+++ b/coreneuron/mechanism/capac.cpp
@@ -12,25 +12,9 @@
 #include "coreneuron/coreneuron.hpp"
 #include "coreneuron/permute/data_layout.hpp"
 
-// clang-format off
-
-#if defined(_OPENACC)
-#define _PRAGMA_FOR_INIT_ACC_LOOP_ \
-    _Pragma("acc parallel loop present(vdata[0:_cntml_padded*nparm]) if(_nt->compute_gpu)")
-#define _PRAGMA_FOR_CUR_ACC_LOOP_ \
-    _Pragma( \
-        "acc parallel loop present(vdata[0:_cntml_padded*nparm], ni[0:_cntml_actual], _vec_rhs[0:_nt->end]) if(_nt->compute_gpu) async(stream_id)")
-#define _PRAGMA_FOR_JACOB_ACC_LOOP_ \
-    _Pragma( \
-        "acc parallel loop present(vdata[0:_cntml_padded*nparm], ni[0:_cntml_actual], _vec_d[0:_nt->end]) if(_nt->compute_gpu) async(stream_id)")
-#else
-#define _PRAGMA_FOR_INIT_ACC_LOOP_ _Pragma("")
-#define _PRAGMA_FOR_CUR_ACC_LOOP_ _Pragma("")
-#define _PRAGMA_FOR_JACOB_ACC_LOOP_ _Pragma("")
-#endif
-
-// clang-format on
-
+#define _PRAGMA_FOR_INIT_ACC_LOOP_                                                               \
+    nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm]) if (_nt->compute_gpu)) \
+    nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
 #define _STRIDE _cntml_padded + _iml
 
 namespace coreneuron {
@@ -78,15 +62,16 @@ void nrn_jacob_capacitance(NrnThread* _nt, Memb_list* ml, int /* type */) {
     (void) _cntml_padded; /* unused when layout=1*/
 
     double* _vec_d = _nt->_actual_d;
-#if defined(_OPENACC)
-    int stream_id = _nt->stream_id;
-#endif
 
     {  /*if (use_cachevec) {*/
         int* ni = ml->nodeindices;
 
         vdata = ml->data;
-        _PRAGMA_FOR_JACOB_ACC_LOOP_
+        nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm],
+                                             ni [0:_cntml_actual],
+                                             _vec_d [0:_nt->end]) if (_nt->compute_gpu)
+                           async(_nt->stream_id))
+        nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
         for (_iml = 0; _iml < _cntml_actual; _iml++) {
             _vec_d[ni[_iml]] += cfac * cm;
         }
@@ -126,12 +111,13 @@ void nrn_cur_capacitance(NrnThread* _nt, Memb_list* ml, int /* type */) {
         /* no need to distinguish secondorder */
         int* ni = ml->nodeindices;
         double* _vec_rhs = _nt->_actual_rhs;
-#if defined(_OPENACC)
-        int stream_id = _nt->stream_id;
-#endif
         vdata = ml->data;
-        _PRAGMA_FOR_CUR_ACC_LOOP_
+        nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm],
+                                             ni [0:_cntml_actual],
+                                             _vec_rhs [0:_nt->end]) if (_nt->compute_gpu)
+                           async(_nt->stream_id))
+        nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
         for (int _iml = 0; _iml < _cntml_actual; _iml++) {
             i_cap = cfac * cm * _vec_rhs[ni[_iml]];
         }
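
capac.cpp shows the compute-kernel half of the convention: each OpenACC `parallel loop` gains an OpenMP `target teams distribute parallel for simd` twin. Note the asymmetry: the OpenACC directives keep their explicit `present(...)` and `async(...)` clauses, while the OpenMP translations rely on data already being mapped and run synchronously. A sketch of the pattern with illustrative names, assuming the offload.hpp macros added later in this diff:

    #include "coreneuron/utils/offload.hpp"

    // Scale one array into another on the device when compute_gpu is set;
    // in a CPU-only build both macros vanish and this is a plain host loop.
    // `scale`, `x`, `y`, `a` and `n` are hypothetical names.
    void scale(const double* x, double* y, double a, int n, bool compute_gpu) {
        nrn_pragma_acc(parallel loop present(x[0:n], y[0:n]) if(compute_gpu))
        nrn_pragma_omp(target teams distribute parallel for simd if(compute_gpu))
        for (int i = 0; i < n; ++i) {
            y[i] = a * x[i];
        }
    }
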
diff --git a/coreneuron/mechanism/eion.cpp b/coreneuron/mechanism/eion.cpp
index 76adc9045..727f30ea6 100644
--- a/coreneuron/mechanism/eion.cpp
+++ b/coreneuron/mechanism/eion.cpp
@@ -19,26 +19,6 @@
 
 #define _STRIDE _cntml_padded + _iml
 
-// clang-format off
-
-#if defined(_OPENACC)
-#define _PRAGMA_FOR_INIT_ACC_LOOP_ \
-    _Pragma( \
-        "acc parallel loop present(pd[0:_cntml_padded*5], ppd[0:1], nrn_ion_global_map[0:nrn_ion_global_map_size][0:ion_global_map_member_size]) if(nt->compute_gpu)")
-#define _PRAGMA_FOR_CUR_ACC_LOOP_ \
-    _Pragma( \
-        "acc parallel loop present(pd[0:_cntml_padded*5], nrn_ion_global_map[0:nrn_ion_global_map_size][0:ion_global_map_member_size]) if(nt->compute_gpu) async(stream_id)")
-#define _PRAGMA_FOR_SEC_ORDER_CUR_ACC_LOOP_ \
-    _Pragma( \
-        "acc parallel loop present(pd[0:_cntml_padded*5], ni[0:_cntml_actual], _vec_rhs[0:_nt->end]) if(_nt->compute_gpu) async(stream_id)")
-#else
-#define _PRAGMA_FOR_INIT_ACC_LOOP_ _Pragma("")
-#define _PRAGMA_FOR_CUR_ACC_LOOP_ _Pragma("")
-#define _PRAGMA_FOR_SEC_ORDER_CUR_ACC_LOOP_ _Pragma("")
-#endif
-
-// clang-format on
-
 namespace coreneuron {
 
 // for each ion it refers to internal concentration, external concentration, and charge,
@@ -277,14 +257,16 @@ void nrn_cur_ion(NrnThread* nt, Memb_list* ml, int type) {
     double* pd;
     Datum* ppd;
     (void) nt; /* unused */
-#if defined(_OPENACC)
-    int stream_id = nt->stream_id;
-#endif
     /*printf("ion_cur %s\n", memb_func[type].sym->name);*/
     int _cntml_padded = ml->_nodecount_padded;
     pd = ml->data;
    ppd = ml->pdata;
-    _PRAGMA_FOR_CUR_ACC_LOOP_
+    nrn_pragma_acc(parallel loop present(
+        pd [0:_cntml_padded * 5],
+        nrn_ion_global_map
+        [0:nrn_ion_global_map_size] [0:ion_global_map_member_size]) if (nt->compute_gpu)
+                       async(nt->stream_id))
+    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
     for (int _iml = 0; _iml < _cntml_actual; ++_iml) {
         dcurdv = 0.;
         cur = 0.;
@@ -312,7 +294,16 @@ void nrn_init_ion(NrnThread* nt, Memb_list* ml, int type) {
     int _cntml_padded = ml->_nodecount_padded;
     pd = ml->data;
     ppd = ml->pdata;
-    _PRAGMA_FOR_INIT_ACC_LOOP_
+    // There was no async(...) clause in the initial OpenACC implementation, so
+    // no `nowait` clause has been added to the OpenMP implementation. TODO:
+    // verify if this can be made asynchronous or if there is a strong reason it
+    // needs to be like this.
+    nrn_pragma_acc(parallel loop present(
+        pd [0:_cntml_padded * 5],
+        ppd [0:1],
+        nrn_ion_global_map
+        [0:nrn_ion_global_map_size] [0:ion_global_map_member_size]) if (nt->compute_gpu))
+    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
     for (int _iml = 0; _iml < _cntml_actual; ++_iml) {
         if (iontype & 04) {
             conci = conci0;
@@ -332,9 +323,6 @@ void second_order_cur(NrnThread* _nt, int secondorder) {
     int _cntml_padded;
     double* pd;
     (void) _nt; /* unused */
-#if defined(_OPENACC)
-    int stream_id = _nt->stream_id;
-#endif
     double* _vec_rhs = _nt->_actual_rhs;
 
     if (secondorder == 2) {
@@ -345,7 +333,11 @@ void second_order_cur(NrnThread* _nt, int secondorder) {
             int* ni = ml->nodeindices;
             _cntml_padded = ml->_nodecount_padded;
             pd = ml->data;
-            _PRAGMA_FOR_SEC_ORDER_CUR_ACC_LOOP_
+            nrn_pragma_acc(parallel loop present(pd [0:_cntml_padded * 5],
+                                                 ni [0:_cntml_actual],
+                                                 _vec_rhs [0:_nt->end]) if (_nt->compute_gpu)
+                               async(_nt->stream_id))
+            nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
             for (int _iml = 0; _iml < _cntml_actual; ++_iml) {
                 cur += dcurdv * (_vec_rhs[ni[_iml]]);
             }
diff --git a/coreneuron/mechanism/register_mech.cpp b/coreneuron/mechanism/register_mech.cpp
index 3acdff1ea..a8bff7a50 100644
--- a/coreneuron/mechanism/register_mech.cpp
+++ b/coreneuron/mechanism/register_mech.cpp
@@ -20,10 +20,6 @@
 namespace coreneuron {
 int secondorder = 0;
 double t, dt, celsius, pi;
-// declare copyin required for correct initialization
-#pragma acc declare copyin(secondorder)
-#pragma acc declare copyin(celsius)
-#pragma acc declare copyin(pi)
 int rev_dt;
 
 using Pfrv = void (*)();
diff --git a/coreneuron/network/cvodestb.cpp b/coreneuron/network/cvodestb.cpp
index 6ed52dc34..31b2fec54 100644
--- a/coreneuron/network/cvodestb.cpp
+++ b/coreneuron/network/cvodestb.cpp
@@ -61,11 +61,9 @@ void init_net_events() {
         NrnThread* nt = nrn_threads + ith;
         double* weights = nt->weights;
         int n_weight = nt->n_weight;
-        if (n_weight) {
-            // clang-format off
-
-            #pragma acc update device(weights[0 : n_weight]) if (nt->compute_gpu)
-            // clang-format on
+        if (n_weight && nt->compute_gpu) {
+            nrn_pragma_acc(update device(weights[0:n_weight]))
+            nrn_pragma_omp(target update to(weights[0:n_weight]))
         }
     }
 #endif
diff --git a/coreneuron/network/netcvode.cpp b/coreneuron/network/netcvode.cpp
index 899bc1e14..ee2e5cb3e 100644
--- a/coreneuron/network/netcvode.cpp
+++ b/coreneuron/network/netcvode.cpp
@@ -531,28 +531,13 @@ void NetCvode::check_thresh(NrnThread* nt) {  // for default method
     PreSynHelper* presyns_helper = nt->presyns_helper;
     double* actual_v = nt->_actual_v;
 
-#if defined(_OPENACC)
-    int stream_id = nt->stream_id;
-#endif
-
     if (nt->ncell == 0)
         return;
 
-    //_net_send_buffer_cnt is no longer used in openacc kernel, remove this?
-    //#ifdef _OPENACC
-    //    if(nt->compute_gpu)
-    //        acc_update_device(&(nt->_net_send_buffer_cnt), sizeof(int));
-    //#endif
-
-    // on GPU...
-    // clang-format off
-
-    #pragma acc parallel loop present( \
-        nt[0:1], presyns_helper[0:nt->n_presyn], \
-        presyns[0:nt->n_presyn], actual_v[0:nt->end]) \
-        copy(net_send_buf_count) if (nt->compute_gpu) \
-        async(stream_id)
-    // clang-format on
+    nrn_pragma_acc(parallel loop present(
+        nt [0:1], presyns_helper [0:nt->n_presyn], presyns [0:nt->n_presyn], actual_v [0:nt->end])
+                       copy(net_send_buf_count) if (nt->compute_gpu) async(nt->stream_id))
+    nrn_pragma_omp(target teams distribute parallel for simd map(tofrom: net_send_buf_count) if(nt->compute_gpu))
     for (int i = 0; i < nt->ncell; ++i) {
         PreSyn* ps = presyns + i;
         PreSynHelper* psh = presyns_helper + i;
@@ -563,7 +548,7 @@ void NetCvode::check_thresh(NrnThread* nt) {  // for default method
         int* flag = &(psh->flag_);
 
         if (pscheck(v, threshold, flag)) {
-#ifndef _OPENACC
+#ifndef CORENEURON_ENABLE_GPU
             nt->_net_send_buffer_cnt = net_send_buf_count;
             if (nt->_net_send_buffer_cnt >= nt->_net_send_buffer_size) {
                 nt->_net_send_buffer_size *= 2;
@@ -572,31 +557,23 @@ void NetCvode::check_thresh(NrnThread* nt) {  // for default method
             }
 #endif
 
-            // clang-format off
-
-            #pragma acc atomic capture
-            // clang-format on
+            nrn_pragma_acc(atomic capture)
+            nrn_pragma_omp(atomic capture)
             idx = net_send_buf_count++;
             nt->_net_send_buffer[idx] = i;
         }
     }
-
-    // clang-format off
-
-    #pragma acc wait(stream_id)
-    // clang-format on
+    nrn_pragma_acc(wait(nt->stream_id))
     nt->_net_send_buffer_cnt = net_send_buf_count;
 
-    if (nt->_net_send_buffer_cnt) {
-#ifdef _OPENACC
+    if (nt->compute_gpu && nt->_net_send_buffer_cnt) {
+#ifdef CORENEURON_ENABLE_GPU
         int* nsbuffer = nt->_net_send_buffer;
 #endif
-        // clang-format off
-
-        #pragma acc update host(nsbuffer[0:nt->_net_send_buffer_cnt]) if (nt->compute_gpu) async(stream_id)
-        #pragma acc wait(stream_id)
-        // clang-format on
+        nrn_pragma_acc(update host(nsbuffer [0:nt->_net_send_buffer_cnt]) async(nt->stream_id))
+        nrn_pragma_acc(wait(nt->stream_id))
+        nrn_pragma_omp(target update from(nsbuffer [0:nt->_net_send_buffer_cnt]))
     }
 
     // on CPU...
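
check_thresh() above relies on atomic capture to let many device threads append to a shared buffer without racing: each thread atomically reserves a unique index while incrementing the shared counter. A sketch of that idiom in isolation, with illustrative names and assuming the offload.hpp macros:

    #include "coreneuron/utils/offload.hpp"

    // Reserve a slot in `buffer` and record `cell_id`; the old counter value
    // is captured into `idx` and the increment is atomic under both backends.
    // `append_fired`, `buffer`, `count` and `cell_id` are hypothetical names.
    void append_fired(int* buffer, int& count, int cell_id) {
        int idx;
        nrn_pragma_acc(atomic capture)
        nrn_pragma_omp(atomic capture)
        idx = count++;
        buffer[idx] = cell_id;
    }
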
diff --git a/coreneuron/network/partrans.cpp b/coreneuron/network/partrans.cpp
index e74d866ce..1bd822f54 100644
--- a/coreneuron/network/partrans.cpp
+++ b/coreneuron/network/partrans.cpp
@@ -41,40 +41,39 @@ void nrnmpi_v_transfer() {
     // gather the source values. can be done in parallel
     for (int tid = 0; tid < nrn_nthread; ++tid) {
         auto& ttd = transfer_thread_data_[tid];
-        auto& nt = nrn_threads[tid];
+        auto* nt = &nrn_threads[tid];
         int n = int(ttd.outsrc_indices.size());
         if (n == 0) {
             continue;
         }
-        double* src_data = nt._data;
+        double* src_data = nt->_data;
         int* src_indices = ttd.src_indices.data();
 
         // gather sources on gpu and copy to cpu, cpu scatters to outsrc_buf
         double* src_gather = ttd.src_gather.data();
         size_t n_src_gather = ttd.src_gather.size();
 
-        // clang-format off
-        #pragma acc parallel loop present( \
-            src_indices[0:n_src_gather], src_data[0:nt._ndata], \
-            src_gather[0 : n_src_gather]) /*copyout(src_gather[0:n_src_gather])*/ \
-            if (nt.compute_gpu) async(nt.stream_id)
+        nrn_pragma_acc(parallel loop present(src_indices [0:n_src_gather],
+                                             src_data [0:nt->_ndata],
+                                             src_gather [0:n_src_gather]) if (nt->compute_gpu)
+                           async(nt->stream_id))
+        nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
         for (int i = 0; i < n_src_gather; ++i) {
             src_gather[i] = src_data[src_indices[i]];
         }
-        // do not know why the copyout above did not work
-        // and the following update is needed
-        #pragma acc update host(src_gather[0 : n_src_gather]) \
-            if (nrn_threads[0].compute_gpu) \
-            async(nt.stream_id)
-        // clang-format on
+        nrn_pragma_acc(update host(src_gather [0:n_src_gather]) if (nt->compute_gpu)
+                           async(nt->stream_id))
+        nrn_pragma_omp(target update from(src_gather [0:n_src_gather]) if (nt->compute_gpu))
     }
 
     // copy gathered source values to outsrc_buf_
+    bool compute_gpu = false;
     for (int tid = 0; tid < nrn_nthread; ++tid) {
-        // clang-format off
-
-        #pragma acc wait(nrn_threads[tid].stream_id)
-        // clang-format on
+        if (nrn_threads[tid].compute_gpu) {
+            compute_gpu = true;
+            nrn_pragma_acc(wait(nrn_threads[tid].stream_id))
+            nrn_pragma_omp(taskwait)
+        }
         TransferThreadData& ttd = transfer_thread_data_[tid];
         size_t n_outsrc_indices = ttd.outsrc_indices.size();
         int* outsrc_indices = ttd.outsrc_indices.data();
@@ -102,12 +101,8 @@ void nrnmpi_v_transfer() {
     }
 
     // insrc_buf_ will get copied to targets via nrnthread_v_transfer
-    // clang-format off
-
-    #pragma acc update device( \
-        insrc_buf_[0:n_insrc_buf]) \
-        if (nrn_threads[0].compute_gpu)
-    // clang-format on
+    nrn_pragma_acc(update device(insrc_buf_ [0:n_insrc_buf]) if (compute_gpu))
+    nrn_pragma_omp(target update to(insrc_buf_ [0:n_insrc_buf]) if (compute_gpu))
 }
 
 void nrnthread_v_transfer(NrnThread* _nt) {
@@ -119,33 +114,32 @@ void nrnthread_v_transfer(NrnThread* _nt) {
     int* insrc_indices = ttd.insrc_indices.data();
     double* tar_data = _nt->_data;
     // last element in the displacement vector gives total length
+#if defined(_OPENACC) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD)
     int n_insrc_buf = insrcdspl_[nrnmpi_numprocs];
     int ndata = _nt->_ndata;
+#endif
 
-    // clang-format off
-
-    #pragma acc parallel loop present( \
-        insrc_indices[0:ntar], \
-        tar_data[0:ndata], \
-        insrc_buf_[0:n_insrc_buf]) \
-        if (_nt->compute_gpu) \
-        async(_nt->stream_id)
-    // clang-format on
+    nrn_pragma_acc(parallel loop present(insrc_indices [0:ntar],
+                                         tar_data [0:ndata],
+                                         insrc_buf_ [0:n_insrc_buf]) if (_nt->compute_gpu)
+                       async(_nt->stream_id))
+    nrn_pragma_omp(target teams distribute parallel for simd map(to: tar_indices[0:ntar]) if(_nt->compute_gpu))
     for (size_t i = 0; i < ntar; ++i) {
         tar_data[tar_indices[i]] = insrc_buf_[insrc_indices[i]];
     }
 }
 
+/// TODO: Corresponding exit data clause for OpenACC/OpenMP is missing and hence
+/// GPU buffers are not freed.
 void nrn_partrans::gap_update_indices() {
     // Ensure index vectors, src_gather, and insrc_buf_ are on the gpu.
     if (insrcdspl_) {
         int n_insrc_buf = insrcdspl_[nrnmpi_numprocs];
+        nrn_pragma_acc(enter data create(insrc_buf_ [0:n_insrc_buf]) if (corenrn_param.gpu))
+        // clang-format off
+        nrn_pragma_omp(target enter data map(alloc: insrc_buf_[0:n_insrc_buf])
+                                         if(corenrn_param.gpu))
+        // clang-format on
-        // clang-format off
-
-        #pragma acc enter data create( \
-            insrc_buf_[0:n_insrc_buf]) \
-            if (nrn_threads[0].compute_gpu)
-        // clang-format on
     }
     for (int tid = 0; tid < nrn_nthread; ++tid) {
         TransferThreadData& ttd = transfer_thread_data_[tid];
@@ -154,21 +148,25 @@ void nrn_partrans::gap_update_indices() {
         size_t n_src_gather = ttd.src_gather.size();
         NrnThread* nt = nrn_threads + tid;
         if (n_src_indices) {
+            int* src_indices = ttd.src_indices.data();
+            double* src_gather = ttd.src_gather.data();
+            nrn_pragma_acc(enter data copyin(src_indices[0:n_src_indices]) if(nt->compute_gpu))
+            nrn_pragma_acc(enter data create(src_gather[0:n_src_gather]) if(nt->compute_gpu))
             // clang-format off
-
-            int *src_indices = ttd.src_indices.data();
-            double *src_gather = ttd.src_gather.data();
-            #pragma acc enter data copyin(src_indices[0 : n_src_indices]) if (nt->compute_gpu)
-            #pragma acc enter data create(src_gather[0 : n_src_gather]) if (nt->compute_gpu)
+            nrn_pragma_omp(target enter data map(to: src_indices [0:n_src_indices])
+                                             map(alloc: src_gather[0:n_src_gather])
+                                             if(nt->compute_gpu))
             // clang-format on
         }
 
         if (ttd.insrc_indices.size()) {
-            // clang-format off
-
-            int *insrc_indices = ttd.insrc_indices.data();
+            int* insrc_indices = ttd.insrc_indices.data();
             size_t n_insrc_indices = ttd.insrc_indices.size();
-            #pragma acc enter data copyin(insrc_indices[0 : n_insrc_indices]) if (nt->compute_gpu)
+            nrn_pragma_acc(
+                enter data copyin(insrc_indices [0:n_insrc_indices]) if (nt->compute_gpu))
+            // clang-format off
+            nrn_pragma_omp(target enter data map(to: insrc_indices[0:n_insrc_indices])
+                                             if(nt->compute_gpu))
+            // clang-format on
         }
     }
diff --git a/coreneuron/nrnconf.h b/coreneuron/nrnconf.h
index 2c7fb8bb9..225d6d2ad 100644
--- a/coreneuron/nrnconf.h
+++ b/coreneuron/nrnconf.h
@@ -9,6 +9,8 @@
 #ifndef _H_NRNCONF_
 #define _H_NRNCONF_
 
+#include "coreneuron/utils/offload.hpp"
+
 #include <cmath>
 #include <cstdio>
 #include <cstdlib>
@@ -32,14 +34,16 @@
 using Symbol = char;
 
 #define VECTORIZE 1
 
 // extern variables require acc declare
+nrn_pragma_omp(declare target)
 extern double celsius;
-#pragma acc declare create(celsius)
+nrn_pragma_acc(declare create(celsius))
 
 extern double pi;
-#pragma acc declare create(pi)
+nrn_pragma_acc(declare create(pi))
 
 extern int secondorder;
-#pragma acc declare create(secondorder)
+nrn_pragma_acc(declare create(secondorder))
+nrn_pragma_omp(end declare target)
 
 extern double t, dt;
 extern int rev_dt;
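
The nrnconf.h hunk shows how file-scope globals get a device mirror under both backends: OpenACC wants a `declare create(...)` per variable, whereas OpenMP brackets the declarations in a `declare target` region. Either way, the `update device`/`target update to` directives in main1.cpp are what keep the mirrors in sync. A reduced sketch of the same pattern with a hypothetical variable name:

    #include "coreneuron/utils/offload.hpp"

    nrn_pragma_omp(declare target)
    extern double example_constant;  // `example_constant` is illustrative only
    nrn_pragma_acc(declare create(example_constant))
    nrn_pragma_omp(end declare target)
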
diff --git a/coreneuron/permute/cellorder.cpp b/coreneuron/permute/cellorder.cpp
index 2b6167f57..fd784fe38 100644
--- a/coreneuron/permute/cellorder.cpp
+++ b/coreneuron/permute/cellorder.cpp
@@ -6,8 +6,6 @@
 # =============================================================================
 */
 
-#include <set>
-
 #include "coreneuron/nrnconf.h"
 #include "coreneuron/sim/multicore.hpp"
 #include "coreneuron/utils/nrn_assert.h"
@@ -15,6 +13,7 @@
 #include "coreneuron/network/tnode.hpp"
 #include "coreneuron/utils/lpt.hpp"
 #include "coreneuron/utils/memory.h"
+#include "coreneuron/utils/offload.hpp"
 #include "coreneuron/apps/corenrn_parameters.hpp"
 
 #include "coreneuron/permute/node_permute.h"  // for print_quality
@@ -22,6 +21,9 @@
 #ifdef _OPENACC
 #include <openacc.h>
 #endif
+
+#include <set>
+
 namespace coreneuron {
 int interleave_permute_type;
 InterleaveInfo* interleave_info;  // nrn_nthread array
@@ -488,8 +490,7 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid
     bool has_subtrees_to_compute = true;
 
     // clang-format off
-
-    #pragma acc loop seq
+    nrn_pragma_acc(loop seq)
     for (; has_subtrees_to_compute; ) {  // ncycle loop
 #if !defined(_OPENACC)
         // serial test, gpu does this in parallel
@@ -500,9 +501,11 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid
             // what is the index
             int ip = GPU_PARENT(i);
             double p = GPU_A(i) / GPU_D(i);
-            #pragma acc atomic update
+            nrn_pragma_acc(atomic update)
+            nrn_pragma_omp(atomic update)
             GPU_D(ip) -= p * GPU_B(i);
-            #pragma acc atomic update
+            nrn_pragma_acc(atomic update)
+            nrn_pragma_omp(atomic update)
             GPU_RHS(ip) -= p * GPU_RHS(i);
         }
 #if !defined(_OPENACC)
@@ -535,10 +538,7 @@ static void bksub_interleaved2(NrnThread* nt,
 #if !defined(_OPENACC)
     for (int i = root; i < lastroot; i += 1) {
 #else
-    // clang-format off
-
-    #pragma acc loop seq
-    // clang-format on
+    nrn_pragma_acc(loop seq)
     for (int i = root; i < lastroot; i += warpsize) {
 #endif
         GPU_RHS(i) /= GPU_D(i);  // the root
@@ -596,21 +596,17 @@ void solve_interleaved2(int ith) {
     int* strides = ii.stride;       // sum ncycles of these (bad since ncompart/warpsize)
     int* rootbegin = ii.firstnode;  // nwarp+1 of these
     int* nodebegin = ii.lastnode;   // nwarp+1 of these
-#ifdef _OPENACC
+#if defined(_OPENACC) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD)
     int nstride = stridedispl[nwarp];
-    int stream_id = nt->stream_id;
-#endif
-
-#ifdef _OPENACC
-    // clang-format off
-
-    #pragma acc parallel loop gang vector vector_length(warpsize) \
-        present(nt[0:1], strides[0:nstride], \
-        ncycles[0:nwarp], stridedispl[0:nwarp+1], \
-        rootbegin[0:nwarp+1], nodebegin[0:nwarp+1]) \
-        if (nt->compute_gpu) async(stream_id)
-// clang-format on
 #endif
+    nrn_pragma_acc(parallel loop gang vector vector_length(
+        warpsize) present(nt [0:1],
+                          strides [0:nstride],
+                          ncycles [0:nwarp],
+                          stridedispl [0:nwarp + 1],
+                          rootbegin [0:nwarp + 1],
+                          nodebegin [0:nwarp + 1]) if (nt->compute_gpu) async(nt->stream_id))
+    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
     for (int icore = 0; icore < ncore; ++icore) {
         int iwarp = icore / warpsize;     // figure out the >> value
         int ic = icore & (warpsize - 1);  // figure out the & mask
@@ -629,9 +625,7 @@ void solve_interleaved2(int ith) {
         }  // serial test mode
 #endif
     }
-#ifdef _OPENACC
-#pragma acc wait(nt->stream_id)
-#endif
+    nrn_pragma_acc(wait(nt->stream_id))
 #ifdef _OPENACC
     }
 #endif
@@ -656,28 +650,23 @@ void solve_interleaved1(int ith) {
     int* firstnode = ii.firstnode;
     int* lastnode = ii.lastnode;
     int* cellsize = ii.cellsize;
-#if _OPENACC
-    int stream_id = nt->stream_id;
-#endif
 
-#ifdef _OPENACC
-    // clang-format off
-
-    #pragma acc parallel loop present( \
-        nt[0:1], stride[0:nstride], \
-        firstnode[0:ncell], lastnode[0:ncell], \
-        cellsize[0:ncell]) if (nt->compute_gpu) \
-        async(stream_id)
-// clang-format on
-#endif
+    // OL211123: can we preserve the error checking behaviour of OpenACC's
+    // present clause with OpenMP? It is a bug if these data are not present,
+    // so diagnostics are helpful...
+    nrn_pragma_acc(parallel loop present(nt [0:1],
+                                         stride [0:nstride],
+                                         firstnode [0:ncell],
+                                         lastnode [0:ncell],
+                                         cellsize [0:ncell]) if (nt->compute_gpu)
+                       async(nt->stream_id))
+    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
     for (int icell = 0; icell < ncell; ++icell) {
         int icellsize = cellsize[icell];
         triang_interleaved(nt, icell, icellsize, nstride, stride, lastnode);
         bksub_interleaved(nt, icell, icellsize, nstride, stride, firstnode);
     }
-#ifdef _OPENACC
-#pragma acc wait(stream_id)
-#endif
+    nrn_pragma_acc(wait(nt->stream_id))
 }
 
 void solve_interleaved(int ith) {
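
The solver loops above also illustrate the synchronisation asymmetry of this patch: OpenACC kernels are enqueued with `async(nt->stream_id)` and drained with `wait(...)`, while the OpenMP translations are synchronous, so each `wait` either collapses to a `taskwait` or disappears entirely. A sketch, with illustrative names and assuming the offload.hpp macros:

    #include "coreneuron/utils/offload.hpp"

    // Block until work queued on this thread's device queue has finished.
    // `finish_queue` and `stream_id` are hypothetical names.
    void finish_queue(int stream_id, bool compute_gpu) {
        if (compute_gpu) {
            nrn_pragma_acc(wait(stream_id))  // drain the OpenACC async queue
            nrn_pragma_omp(taskwait)         // the OpenMP path queued nothing asynchronously
        }
    }
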
diff --git a/coreneuron/sim/fadvance_core.cpp b/coreneuron/sim/fadvance_core.cpp
index 8f4ac14cf..a46f83535 100644
--- a/coreneuron/sim/fadvance_core.cpp
+++ b/coreneuron/sim/fadvance_core.cpp
@@ -78,10 +78,11 @@ void dt2thread(double adt) { /* copied from nrnoc/fadvance.c */
         } else {
             nt->cj = 1.0 / dt;
         }
+        nrn_pragma_acc(update device(nt->_t, nt->_dt, nt->cj)
+                           async(nt->stream_id) if (nt->compute_gpu))
         // clang-format off
-
-        #pragma acc update device(nt->_t, nt->_dt, nt->cj) \
-            async(nt->stream_id) if(nt->compute_gpu)
+        nrn_pragma_omp(target update to(nt->_t, nt->_dt, nt->cj)
+                                 if(nt->compute_gpu))
         // clang-format on
     }
 }
@@ -201,35 +202,24 @@ void update(NrnThread* _nt) {
     double* vec_v = &(VEC_V(0));
     double* vec_rhs = &(VEC_RHS(0));
     int i2 = _nt->end;
-#if defined(_OPENACC)
-    int stream_id = _nt->stream_id;
-#endif
 
     /* do not need to worry about linmod or extracellular*/
     if (secondorder) {
-        // clang-format off
-
-        #pragma acc parallel loop present( \
-            vec_v[0:i2], vec_rhs[0:i2]) \
-            if (_nt->compute_gpu) async(stream_id)
-        // clang-format on
+        nrn_pragma_acc(parallel loop present(vec_v [0:i2], vec_rhs [0:i2]) if (_nt->compute_gpu)
+                           async(_nt->stream_id))
+        nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
         for (int i = 0; i < i2; ++i) {
            vec_v[i] += 2. * vec_rhs[i];
        }
    } else {
-        // clang-format off
-
-        #pragma acc parallel loop present( \
-            vec_v[0:i2], vec_rhs[0:i2]) \
-            if (_nt->compute_gpu) async(stream_id)
-        // clang-format on
+        nrn_pragma_acc(parallel loop present(vec_v [0:i2], vec_rhs [0:i2]) if (_nt->compute_gpu)
+                           async(_nt->stream_id))
+        nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
         for (int i = 0; i < i2; ++i) {
             vec_v[i] += vec_rhs[i];
         }
     }
 
-    // update_matrix_to_gpu(_nt);
-
     if (_nt->tml) {
         assert(_nt->tml->index == CAP);
         nrn_cur_capacitance(_nt, _nt->tml->ml, _nt->tml->index);
@@ -304,10 +294,9 @@ void nrncore2nrn_send_values(NrnThread* nth) {
             // make sure we do not overflow the `varrays` buffers
             assert(vs < tr->bsize);
 
-            // clang-format off
-
-            #pragma acc parallel loop present(tr[0:1]) if(nth->compute_gpu) async(nth->stream_id)
-            // clang-format on
+            nrn_pragma_acc(parallel loop present(tr [0:1]) if (nth->compute_gpu)
+                               async(nth->stream_id))
+            nrn_pragma_omp(target teams distribute parallel for simd if(nth->compute_gpu))
             for (int i = 0; i < tr->n_trajec; ++i) {
                 tr->varrays[i][vs] = *tr->gather[i];
             }
@@ -326,12 +315,12 @@ void nrncore2nrn_send_values(NrnThread* nth) {
             // https://github.com/BlueBrain/CoreNeuron/issues/611
             for (int i = 0; i < tr->n_trajec; ++i) {
                 double* gather_i = tr->gather[i];
-                // clang-format off
-
-                #pragma acc update self(gather_i[0:1]) if(nth->compute_gpu) async(nth->stream_id)
+                nrn_pragma_acc(update self(gather_i [0:1]) if (nth->compute_gpu)
+                                   async(nth->stream_id))
+                nrn_pragma_omp(target update from(gather_i [0:1]) if (nth->compute_gpu))
             }
-            #pragma acc wait(nth->stream_id)
-            // clang-format on
+            nrn_pragma_acc(wait(nth->stream_id))
+            nrn_pragma_omp(taskwait)
             for (int i = 0; i < tr->n_trajec; ++i) {
                 *(tr->scatter[i]) = *(tr->gather[i]);
             }
@@ -351,15 +340,11 @@ static void* nrn_fixed_step_thread(NrnThread* nth) {
     nth->_t += .5 * nth->_dt;
 
     if (nth->ncell) {
-#if defined(_OPENACC)
-        int stream_id = nth->stream_id;
-        /*@todo: do we need to update nth->_t on GPU: Yes (Michael, but can launch kernel) */
-        // clang-format off
-
-        #pragma acc update device(nth->_t) if (nth->compute_gpu) async(stream_id)
-        #pragma acc wait(stream_id)
-// clang-format on
-#endif
+        /*@todo: do we need to update nth->_t on GPU: Yes (Michael, but can
+           launch kernel) */
+        nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->stream_id))
+        nrn_pragma_acc(wait(nth->stream_id))
+        nrn_pragma_omp(target update to(nth->_t) if (nth->compute_gpu))
         fixed_play_continuous(nth);
 
         {
@@ -393,12 +378,9 @@ void* nrn_fixed_step_lastpart(NrnThread* nth) {
 
     if (nth->ncell) {
         /*@todo: do we need to update nth->_t on GPU */
-        // clang-format off
-
-        #pragma acc update device(nth->_t) if (nth->compute_gpu) async(nth->stream_id)
-        #pragma acc wait(nth->stream_id)
-        // clang-format on
-
+        nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->stream_id))
+        nrn_pragma_acc(wait(nth->stream_id))
+        nrn_pragma_omp(target update to(nth->_t) if (nth->compute_gpu))
         fixed_play_continuous(nth);
         nonvint(nth);
         nrncore2nrn_send_values(nth);
diff --git a/coreneuron/sim/fast_imem.cpp b/coreneuron/sim/fast_imem.cpp
index 8dfb0cd76..1218b7967 100644
--- a/coreneuron/sim/fast_imem.cpp
+++ b/coreneuron/sim/fast_imem.cpp
@@ -50,10 +50,10 @@ void nrn_calc_fast_imem(NrnThread* nt) {
     double* fast_imem_d = nt->nrn_fast_imem->nrn_sav_d;
     double* fast_imem_rhs = nt->nrn_fast_imem->nrn_sav_rhs;
 
-#pragma acc parallel loop present(vec_rhs, \
-                                  vec_area, \
-                                  fast_imem_d, \
-                                  fast_imem_rhs) if (nt->compute_gpu) async(nt->stream_id)
+    nrn_pragma_acc(
+        parallel loop present(vec_rhs, vec_area, fast_imem_d, fast_imem_rhs) if (nt->compute_gpu)
+            async(nt->stream_id))
+    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
     for (int i = i1; i < i3; ++i) {
         fast_imem_rhs[i] = (fast_imem_d[i] * vec_rhs[i] + fast_imem_rhs[i]) * vec_area[i] * 0.01;
     }
@@ -68,8 +68,9 @@ void nrn_calc_fast_imem_init(NrnThread* nt) {
     double* vec_area = nt->_actual_area;
     double* fast_imem_rhs = nt->nrn_fast_imem->nrn_sav_rhs;
 
-#pragma acc parallel loop present(vec_rhs, vec_area, fast_imem_rhs) if (nt->compute_gpu) \
-    async(nt->stream_id)
+    nrn_pragma_acc(parallel loop present(vec_rhs, vec_area, fast_imem_rhs) if (nt->compute_gpu)
+                       async(nt->stream_id))
+    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
     for (int i = i1; i < i3; ++i) {
         fast_imem_rhs[i] = (vec_rhs[i] + fast_imem_rhs[i]) * vec_area[i] * 0.01;
     }
diff --git a/coreneuron/sim/finitialize.cpp b/coreneuron/sim/finitialize.cpp
index 1ae79a92f..d711ae247 100644
--- a/coreneuron/sim/finitialize.cpp
+++ b/coreneuron/sim/finitialize.cpp
@@ -53,12 +53,9 @@ void nrn_finitialize(int setv, double v) {
     if (setv) {
         for (auto _nt = nrn_threads; _nt < nrn_threads + nrn_nthread; ++_nt) {
             double* vec_v = &(VEC_V(0));
-            // clang-format off
-
-            #pragma acc parallel loop present( \
-                _nt[0:1], vec_v[0:_nt->end]) \
-                if (_nt->compute_gpu)
-            // clang-format on
+            nrn_pragma_acc(
+                parallel loop present(_nt [0:1], vec_v [0:_nt->end]) if (_nt->compute_gpu))
+            nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
             for (int i = 0; i < _nt->end; ++i) {
                 vec_v[i] = v;
             }
diff --git a/coreneuron/sim/solve_core.cpp b/coreneuron/sim/solve_core.cpp
index a24c8360f..60ba2b660 100644
--- a/coreneuron/sim/solve_core.cpp
+++ b/coreneuron/sim/solve_core.cpp
@@ -24,7 +24,9 @@ void nrn_solve_minimal(NrnThread* _nt) {
     }
 }
 
-/** TODO loops are executed seq in OpenACC just for debugging, remove it! */
+/** @todo OpenACC GPU offload is sequential/slow. Because --cell-permute=0 and
+ *  --gpu is forbidden anyway, no OpenMP target offload equivalent is implemented.
+ */
 
 /* triangularization of the matrix equations */
 static void triang(NrnThread* _nt) {
@@ -37,17 +39,9 @@ static void triang(NrnThread* _nt) {
     double* vec_rhs = &(VEC_RHS(0));
     int* parent_index = _nt->_v_parent_index;
 
-#if defined(_OPENACC)
-    int stream_id = _nt->stream_id;
-#endif
-
-    /** @todo: just for benchmarking, otherwise produces wrong results */
-    // clang-format off
-
-    #pragma acc parallel loop seq present( \
-        vec_a[0:i3], vec_b[0:i3], vec_d[0:i3], \
-        vec_rhs[0:i3], parent_index[0:i3]) \
-        async(stream_id) if (_nt->compute_gpu)
-    // clang-format on
+    nrn_pragma_acc(parallel loop seq present(
+        vec_a [0:i3], vec_b [0:i3], vec_d [0:i3], vec_rhs [0:i3], parent_index [0:i3])
+                       async(_nt->stream_id) if (_nt->compute_gpu))
     for (int i = i3 - 1; i >= i2; --i) {
         double p = vec_a[i] / vec_d[i];
         vec_d[parent_index[i]] -= p * vec_b[i];
@@ -66,33 +60,22 @@ static void bksub(NrnThread* _nt) {
     double* vec_rhs = &(VEC_RHS(0));
     int* parent_index = _nt->_v_parent_index;
 
-#if defined(_OPENACC)
-    int stream_id = _nt->stream_id;
-#endif
-
-    /** @todo: just for benchmarking, otherwise produces wrong results */
-    // clang-format off
-
-    #pragma acc parallel loop seq present( \
-        vec_d[0:i2], vec_rhs[0:i2]) \
-        async(stream_id) if (_nt->compute_gpu)
-    // clang-format on
+    nrn_pragma_acc(parallel loop seq present(vec_d [0:i2], vec_rhs [0:i2])
+                       async(_nt->stream_id) if (_nt->compute_gpu))
     for (int i = i1; i < i2; ++i) {
         vec_rhs[i] /= vec_d[i];
     }
 
-    /** @todo: just for benchmarking, otherwise produces wrong results */
-    // clang-format off
-
-    #pragma acc parallel loop seq present( \
-        vec_b[0:i3], vec_d[0:i3], vec_rhs[0:i3], \
-        parent_index[0:i3]) async(stream_id) \
-        if (_nt->compute_gpu)
+    nrn_pragma_acc(
+        parallel loop seq present(vec_b [0:i3], vec_d [0:i3], vec_rhs [0:i3], parent_index [0:i3])
+            async(_nt->stream_id) if (_nt->compute_gpu))
     for (int i = i2; i < i3; ++i) {
         vec_rhs[i] -= vec_b[i] * vec_rhs[parent_index[i]];
         vec_rhs[i] /= vec_d[i];
     }
 
-    #pragma acc wait(stream_id)
-    // clang-format on
+    if (_nt->compute_gpu) {
+        nrn_pragma_acc(wait(_nt->stream_id))
+    }
 }
 }  // namespace coreneuron
diff --git a/coreneuron/sim/treeset_core.cpp b/coreneuron/sim/treeset_core.cpp
index 943980bcd..bb92d2ab1 100644
--- a/coreneuron/sim/treeset_core.cpp
+++ b/coreneuron/sim/treeset_core.cpp
@@ -32,12 +32,9 @@ static void nrn_rhs(NrnThread* _nt) {
     double* vec_v = &(VEC_V(0));
     int* parent_index = _nt->_v_parent_index;
 
-    // clang-format off
-
-    #pragma acc parallel loop present( \
-        vec_rhs[0:i3], vec_d[0:i3]) \
-        if (_nt->compute_gpu) async(_nt->stream_id)
-    // clang-format on
+    nrn_pragma_acc(parallel loop present(vec_rhs [0:i3], vec_d [0:i3]) if (_nt->compute_gpu)
+                       async(_nt->stream_id))
+    nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
     for (int i = i1; i < i3; ++i) {
         vec_rhs[i] = 0.;
         vec_d[i] = 0.;
@@ -46,9 +43,10 @@ static void nrn_rhs(NrnThread* _nt) {
     if (_nt->nrn_fast_imem) {
         double* fast_imem_d = _nt->nrn_fast_imem->nrn_sav_d;
         double* fast_imem_rhs = _nt->nrn_fast_imem->nrn_sav_rhs;
-#pragma acc parallel loop present(fast_imem_d [i1:i3], \
-                                  fast_imem_rhs [i1:i3]) if (_nt->compute_gpu) \
-    async(_nt->stream_id)
+        nrn_pragma_acc(
+            parallel loop present(fast_imem_d [i1:i3], fast_imem_rhs [i1:i3]) if (_nt->compute_gpu)
+                async(_nt->stream_id))
+        nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
         for (int i = i1; i < i3; ++i) {
             fast_imem_d[i] = 0.;
             fast_imem_rhs[i] = 0.;
@@ -76,7 +74,9 @@ static void nrn_rhs(NrnThread* _nt) {
     so here we transform so it only has membrane current contribution
     */
         double* p = _nt->nrn_fast_imem->nrn_sav_rhs;
-#pragma acc parallel loop present(p, vec_rhs) if (_nt->compute_gpu) async(_nt->stream_id)
+        nrn_pragma_acc(parallel loop present(p, vec_rhs) if (_nt->compute_gpu)
+                           async(_nt->stream_id))
+        nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
         for (int i = i1; i < i3; ++i) {
             p[i] -= vec_rhs[i];
         }
@@ -86,22 +86,24 @@ static void nrn_rhs(NrnThread* _nt) {
     The extracellular mechanism contribution is already done.
         rhs += ai_j*(vi_j - vi)
     */
-    // clang-format off
-
-    #pragma acc parallel loop present( \
-        vec_rhs[0:i3], vec_d[0:i3], \
-        vec_a[0:i3], vec_b[0:i3], \
-        vec_v[0:i3], parent_index[0:i3]) \
-        if (_nt->compute_gpu) async(_nt->stream_id)
+    nrn_pragma_acc(parallel loop present(vec_rhs [0:i3],
+                                         vec_d [0:i3],
+                                         vec_a [0:i3],
+                                         vec_b [0:i3],
+                                         vec_v [0:i3],
+                                         parent_index [0:i3]) if (_nt->compute_gpu)
+                       async(_nt->stream_id))
+    nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
     for (int i = i2; i < i3; ++i) {
         double dv = vec_v[parent_index[i]] - vec_v[i];
         /* our connection coefficients are negative so */
-        #pragma acc atomic update
+        nrn_pragma_acc(atomic update)
+        nrn_pragma_omp(atomic update)
         vec_rhs[i] -= vec_b[i] * dv;
-        #pragma acc atomic update
+        nrn_pragma_acc(atomic update)
+        nrn_pragma_omp(atomic update)
         vec_rhs[parent_index[i]] += vec_a[i] * dv;
     }
-    // clang-format on
 }
 
 /* calculate left hand side of
@@ -150,34 +152,32 @@ static void nrn_lhs(NrnThread* _nt) {
     so here we transform so it only has membrane current contribution
     */
         double* p = _nt->nrn_fast_imem->nrn_sav_d;
-#pragma acc parallel loop present(p, vec_d) if (_nt->compute_gpu) async(_nt->stream_id)
+        nrn_pragma_acc(parallel loop present(p, vec_d) if (_nt->compute_gpu) async(_nt->stream_id))
+        nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
         for (int i = i1; i < i3; ++i) {
             p[i] += vec_d[i];
         }
     }
 
     /* now add the axial currents */
-    // clang-format off
-
-    #pragma acc parallel loop present( \
-        vec_d[0:i3], vec_a[0:i3], \
-        vec_b[0:i3], parent_index[0:i3]) \
-        if (_nt->compute_gpu) async(_nt->stream_id)
+    nrn_pragma_acc(parallel loop present(
+        vec_d [0:i3], vec_a [0:i3], vec_b [0:i3], parent_index [0:i3]) if (_nt->compute_gpu)
+                       async(_nt->stream_id))
+    nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
     for (int i = i2; i < i3; ++i) {
-        #pragma acc atomic update
+        nrn_pragma_acc(atomic update)
+        nrn_pragma_omp(atomic update)
         vec_d[i] -= vec_b[i];
-        #pragma acc atomic update
+        nrn_pragma_acc(atomic update)
+        nrn_pragma_omp(atomic update)
         vec_d[parent_index[i]] -= vec_a[i];
     }
-    // clang-format on
 }
 
 /* for the fixed step method */
 void* setup_tree_matrix_minimal(NrnThread* _nt) {
     nrn_rhs(_nt);
     nrn_lhs(_nt);
-    // update_matrix_from_gpu(_nt);
-
     return nullptr;
 }
 }  // namespace coreneuron
diff --git a/coreneuron/utils/memory.h b/coreneuron/utils/memory.h
index 965c06e78..2f0e24458 100644
--- a/coreneuron/utils/memory.h
+++ b/coreneuron/utils/memory.h
@@ -115,8 +115,7 @@ auto allocate_unique(const Alloc& alloc, Args&&... args) {
 }  // namespace coreneuron
 
 /// for gpu builds with unified memory support
-/// OL210812: why do we include __CUDACC__ here?
-#if (defined(__CUDACC__) || defined(CORENEURON_UNIFIED_MEMORY))
+#ifdef CORENEURON_UNIFIED_MEMORY
 
 #include <cuda_runtime_api.h>
diff --git a/coreneuron/utils/offload.hpp b/coreneuron/utils/offload.hpp
new file mode 100644
index 000000000..d90cc10fd
--- /dev/null
+++ b/coreneuron/utils/offload.hpp
@@ -0,0 +1,20 @@
+/*
+# =============================================================================
+# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
+#
+# See top-level LICENSE file for details.
+# =============================================================================
+*/
+#pragma once
+#define nrn_pragma_stringify(x) #x
+#if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
+#define nrn_pragma_acc(x)
+#define nrn_pragma_omp(x) _Pragma(nrn_pragma_stringify(omp x))
+#elif defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
+    defined(_OPENACC)
+#define nrn_pragma_acc(x) _Pragma(nrn_pragma_stringify(acc x))
+#define nrn_pragma_omp(x)
+#else
+#define nrn_pragma_acc(x)
+#define nrn_pragma_omp(x)
+#endif
diff --git a/external/nmodl b/external/nmodl
index 794b419f5..a60c5e903 160000
--- a/external/nmodl
+++ b/external/nmodl
@@ -1 +1 @@
-Subproject commit 794b419f5256f40efcdca1674f712a6e544c235a
+Subproject commit a60c5e903126ad95cfe2bceb904d0efe83ba9d8a
diff --git a/tests/unit/lfp/CMakeLists.txt b/tests/unit/lfp/CMakeLists.txt
index 3e2ac8e80..ec795f178 100644
--- a/tests/unit/lfp/CMakeLists.txt
+++ b/tests/unit/lfp/CMakeLists.txt
@@ -22,3 +22,4 @@ set_target_properties(lfp_test_bin PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS OFF)
 target_compile_options(lfp_test_bin PRIVATE ${CORENEURON_BOOST_UNIT_TEST_COMPILE_FLAGS})
 add_dependencies(lfp_test_bin nrniv-core)
 add_test(NAME lfp_test COMMAND ${TEST_EXEC_PREFIX} $<TARGET_FILE:lfp_test_bin>)
+set_tests_properties(lfp_test PROPERTIES ENVIRONMENT OMP_NUM_THREADS=1)
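
Taken together, offload.hpp is the crux of the patch: every annotated region carries both directive families, and the build configuration decides which one survives preprocessing. The `CORENEURON_PREFER_OPENMP_OFFLOAD` definition added in OpenAccHelper.cmake, together with `-acc -mp=gpu`, selects the OpenMP branch; OpenACC-only GPU builds keep the `acc` pragma; CPU builds keep neither. Given the header above, a doubly-annotated loop reads like this (illustrative kernel, not from the patch):

    #include "coreneuron/utils/offload.hpp"

    // With CORENEURON_PREFER_OPENMP_OFFLOAD this compiles as an OpenMP target
    // kernel; with plain OpenACC GPU builds as an acc parallel loop; with
    // neither, as an ordinary host loop. `axpy` is a hypothetical name.
    void axpy(double a, const double* x, double* y, int n, bool compute_gpu) {
        nrn_pragma_acc(parallel loop present(x[0:n], y[0:n]) if(compute_gpu))
        nrn_pragma_omp(target teams distribute parallel for simd if(compute_gpu))
        for (int i = 0; i < n; ++i) {
            y[i] += a * x[i];
        }
    }
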