From 0c42f406d0a3c5af9f6aa74b4f97a58138a09611 Mon Sep 17 00:00:00 2001
From: Nicolas Cornu <nicolas.cornu@epfl.ch>
Date: Fri, 10 Dec 2021 14:23:40 +0100
Subject: [PATCH 1/5] Make a try with pragmas

---
 coreneuron/gpu/nrn_acc_manager.cpp | 366 ++++++++++++++---------------
 1 file changed, 178 insertions(+), 188 deletions(-)
diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
index 373fcdbc3..d8eded736 100644
--- a/coreneuron/gpu/nrn_acc_manager.cpp
+++ b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -43,56 +43,49 @@ void nrn_ion_global_map_delete_from_device();
 void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay);
 void nrn_VecPlay_delete_from_device(NrnThread* nt);
 
-void* cnrn_gpu_copyin(void* h_ptr, std::size_t len) {
+void* cnrn_target_deviceptr(void* h_ptr) {
 #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
-    return acc_copyin(h_ptr, len);
+    return acc_deviceptr(h_ptr);
 #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
-    auto host_id = omp_get_initial_device();
     auto device_id = omp_get_default_device();
-    auto* d_ptr = omp_target_alloc(len, device_id);
-    nrn_assert(d_ptr != nullptr);
-    nrn_assert(omp_target_memcpy(d_ptr, h_ptr, len, 0, 0, device_id, host_id) == 0);
-    nrn_assert(omp_target_associate_ptr(h_ptr, d_ptr, len, 0, device_id) == 0);
-    return d_ptr;
+    return omp_get_mapped_ptr(h_ptr, device_id);
 #else
-    throw std::runtime_error("cnrn_gpu_copyin() not implemented without OpenACC/OpenMP and gpu build");
+    throw std::runtime_error("cnrn_target_delete() not implemented without OpenACC/OpenMP and gpu build");
 #endif
 }
 
-void cnrn_memcpy_to_device(void* d_ptr, void* h_ptr, size_t len) {
+template <typename T>
+void* cnrn_gpu_copyin(T* h_ptr, std::size_t len = 1) {
 #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
-    acc_memcpy_to_device(d_ptr, h_ptr, len);
+    return acc_copyin(h_ptr, len * sizeof(T));
 #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
-    auto host_id = omp_get_initial_device();
-    auto device_id = omp_get_default_device();
-    omp_target_memcpy(d_ptr, h_ptr, len, 0, 0, device_id, host_id);
+    #pragma omp target enter data map(to:h_ptr[:len])
+    return cnrn_target_deviceptr(h_ptr);
 #else
-    throw std::runtime_error("cnrn_memcpy_to_device() not implemented without OpenACC/OpenMP and gpu build");
+    throw std::runtime_error("cnrn_gpu_copyin() not implemented without OpenACC/OpenMP and gpu build");
 #endif
 }
 
-void cnrn_target_delete(void* h_ptr, size_t len) {
+template <typename T>
+void cnrn_target_delete(T* h_ptr, size_t len = 1) {
 #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
-    acc_delete(h_ptr, len);
+    acc_delete(h_ptr, len * sizeof(T));
 #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
-    (void)len;
-    auto device_id = omp_get_default_device();
-    omp_target_disassociate_ptr(h_ptr, device_id);
-    auto* d_ptr = omp_get_mapped_ptr(h_ptr, device_id);
-    omp_target_free(d_ptr, device_id);
+    #pragma omp target exit data map(delete: h_ptr[:len])
 #else
     throw std::runtime_error("cnrn_target_delete() not implemented without OpenACC/OpenMP and gpu build");
 #endif
 }
 
-void* cnrn_target_deviceptr(void* h_ptr) {
+void cnrn_memcpy_to_device(void* d_ptr, void* h_ptr, size_t len) {
 #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
-    return acc_deviceptr(h_ptr);
+    acc_memcpy_to_device(d_ptr, h_ptr, len);
 #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
+    auto host_id = omp_get_initial_device();
     auto device_id = omp_get_default_device();
-    return omp_get_mapped_ptr(h_ptr, device_id);
+    omp_target_memcpy(d_ptr, h_ptr, len, 0, 0, device_id, host_id);
 #else
-    throw std::runtime_error("cnrn_target_delete() not implemented without OpenACC/OpenMP and gpu build");
+    throw std::runtime_error("cnrn_memcpy_to_device() not implemented without OpenACC/OpenMP and gpu build");
 #endif
 }
 
@@ -114,13 +107,13 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
         NrnThread* nt = threads + i;  // NrnThread on host
 
         if (nt->n_presyn) {
-            PreSyn* d_presyns = (PreSyn*) cnrn_gpu_copyin(nt->presyns, sizeof(PreSyn) * nt->n_presyn);
+            PreSyn* d_presyns = (PreSyn*) cnrn_gpu_copyin(nt->presyns, nt->n_presyn);
         }
 
         if (nt->n_vecplay) {
             /* copy VecPlayContinuous instances */
             /** just empty containers */
-            void** d_vecplay = (void**) cnrn_gpu_copyin(nt->_vecplay, sizeof(void*) * nt->n_vecplay);
+            void** d_vecplay = (void**) cnrn_gpu_copyin(nt->_vecplay, nt->n_vecplay);
             // note: we are using unified memory for NrnThread. Once VecPlay is copied to gpu,
             // we dont want to update nt->vecplay because it will also set gpu pointer of vecplay
             // inside nt on cpu (due to unified memory).
@@ -138,7 +131,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
      * find
      * corresponding NrnThread using Point_process in NET_RECEIVE block
      */
-    NrnThread* d_threads = (NrnThread*) cnrn_gpu_copyin(threads, sizeof(NrnThread) * nthreads);
+    NrnThread* d_threads = (NrnThread*) cnrn_gpu_copyin(threads, nthreads);
 
     if (interleave_info == nullptr) {
         printf("\n Warning: No permutation data? Required for linear algebra!");
@@ -157,7 +150,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
         /* -- copy _data to device -- */
 
         /*copy all double data for thread */
-        d__data = (double*) cnrn_gpu_copyin(nt->_data, nt->_ndata * sizeof(double));
+        d__data = (double*) cnrn_gpu_copyin(nt->_data, nt->_ndata);
 
 
         /* Here is the example of using OpenACC data enter/exit
@@ -199,13 +192,12 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
             cnrn_memcpy_to_device(&(d_nt->_actual_diam), &(dptr), sizeof(double*));
         }
 
-        int* d_v_parent_index = (int*) cnrn_gpu_copyin(nt->_v_parent_index, nt->end * sizeof(int));
+        int* d_v_parent_index = (int*) cnrn_gpu_copyin(nt->_v_parent_index, nt->end);
         cnrn_memcpy_to_device(&(d_nt->_v_parent_index), &(d_v_parent_index), sizeof(int*));
 
         /* nt._ml_list is used in NET_RECEIVE block and should have valid membrane list id*/
         Memb_list** d_ml_list = (Memb_list**) cnrn_gpu_copyin(nt->_ml_list,
-                                                         corenrn.get_memb_funcs().size() *
-                                                             sizeof(Memb_list*));
+                                                         corenrn.get_memb_funcs().size());
         cnrn_memcpy_to_device(&(d_nt->_ml_list), &(d_ml_list), sizeof(Memb_list**));
 
         /* -- copy NrnThreadMembList list ml to device -- */
@@ -217,7 +209,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
         for (auto tml = nt->tml; tml; tml = tml->next) {
             /*copy tml to device*/
             /*QUESTIONS: does tml will point to nullptr as in host ? : I assume so!*/
-            auto d_tml = (NrnThreadMembList*) cnrn_gpu_copyin(tml, sizeof(NrnThreadMembList));
+            auto d_tml = (NrnThreadMembList*) cnrn_gpu_copyin(tml);
 
             /*first tml is pointed by nt */
             if (first_tml) {
@@ -232,7 +224,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
             d_last_tml = d_tml;
 
             /* now for every tml, there is a ml. copy that and setup pointer */
-            auto d_ml = (Memb_list*) cnrn_gpu_copyin(tml->ml, sizeof(Memb_list));
+            auto d_ml = (Memb_list*) cnrn_gpu_copyin(tml->ml);
             cnrn_memcpy_to_device(&(d_tml->ml), &d_ml, sizeof(Memb_list*));
 
             /* setup nt._ml_list */
@@ -250,20 +242,19 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
 
 
             if (!is_art) {
-                int* d_nodeindices = (int*) cnrn_gpu_copyin(tml->ml->nodeindices, sizeof(int) * n);
+                int* d_nodeindices = (int*) cnrn_gpu_copyin(tml->ml->nodeindices, n);
                 cnrn_memcpy_to_device(&(d_ml->nodeindices), &d_nodeindices, sizeof(int*));
             }
 
             if (szdp) {
                 int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp;
-                int* d_pdata = (int*) cnrn_gpu_copyin(tml->ml->pdata, sizeof(int) * pcnt);
+                int* d_pdata = (int*) cnrn_gpu_copyin(tml->ml->pdata, pcnt);
                 cnrn_memcpy_to_device(&(d_ml->pdata), &d_pdata, sizeof(int*));
             }
 
             int ts = corenrn.get_memb_funcs()[type].thread_size_;
             if (ts) {
-                ThreadDatum* td = (ThreadDatum*) cnrn_gpu_copyin(tml->ml->_thread,
-                                                            ts * sizeof(ThreadDatum));
+                ThreadDatum* td = (ThreadDatum*) cnrn_gpu_copyin(tml->ml->_thread, ts);
                 cnrn_memcpy_to_device(&(d_ml->_thread), &td, sizeof(ThreadDatum*));
             }
 
@@ -276,27 +267,27 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
 
             // if net receive buffer exist for mechanism
             if (nrb) {
-                d_nrb = (NetReceiveBuffer_t*) cnrn_gpu_copyin(nrb, sizeof(NetReceiveBuffer_t));
+                d_nrb = (NetReceiveBuffer_t*) cnrn_gpu_copyin(nrb);
                 cnrn_memcpy_to_device(&(d_ml->_net_receive_buffer),
                                      &d_nrb,
                                      sizeof(NetReceiveBuffer_t*));
 
-                d_pnt_index = (int*) cnrn_gpu_copyin(nrb->_pnt_index, sizeof(int) * nrb->_size);
+                d_pnt_index = (int*) cnrn_gpu_copyin(nrb->_pnt_index, nrb->_size);
                 cnrn_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index, sizeof(int*));
 
-                d_weight_index = (int*) cnrn_gpu_copyin(nrb->_weight_index, sizeof(int) * nrb->_size);
+                d_weight_index = (int*) cnrn_gpu_copyin(nrb->_weight_index, nrb->_size);
                 cnrn_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index, sizeof(int*));
 
-                d_nrb_t = (double*) cnrn_gpu_copyin(nrb->_nrb_t, sizeof(double) * nrb->_size);
+                d_nrb_t = (double*) cnrn_gpu_copyin(nrb->_nrb_t, nrb->_size);
                 cnrn_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t, sizeof(double*));
 
-                d_nrb_flag = (double*) cnrn_gpu_copyin(nrb->_nrb_flag, sizeof(double) * nrb->_size);
+                d_nrb_flag = (double*) cnrn_gpu_copyin(nrb->_nrb_flag, nrb->_size);
                 cnrn_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag, sizeof(double*));
 
-                d_displ = (int*) cnrn_gpu_copyin(nrb->_displ, sizeof(int) * (nrb->_size + 1));
+                d_displ = (int*) cnrn_gpu_copyin(nrb->_displ, nrb->_size + 1);
                 cnrn_memcpy_to_device(&(d_nrb->_displ), &d_displ, sizeof(int*));
 
-                d_nrb_index = (int*) cnrn_gpu_copyin(nrb->_nrb_index, sizeof(int) * nrb->_size);
+                d_nrb_index = (int*) cnrn_gpu_copyin(nrb->_nrb_index, nrb->_size);
                 cnrn_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index, sizeof(int*));
             }
 
@@ -309,25 +300,25 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
                 int* d_iptr;
                 double* d_dptr;
 
-                d_nsb = (NetSendBuffer_t*) cnrn_gpu_copyin(nsb, sizeof(NetSendBuffer_t));
+                d_nsb = (NetSendBuffer_t*) cnrn_gpu_copyin(nsb);
                 cnrn_memcpy_to_device(&(d_ml->_net_send_buffer), &d_nsb, sizeof(NetSendBuffer_t*));
 
-                d_iptr = (int*) cnrn_gpu_copyin(nsb->_sendtype, sizeof(int) * nsb->_size);
+                d_iptr = (int*) cnrn_gpu_copyin(nsb->_sendtype, nsb->_size);
                 cnrn_memcpy_to_device(&(d_nsb->_sendtype), &d_iptr, sizeof(int*));
 
-                d_iptr = (int*) cnrn_gpu_copyin(nsb->_vdata_index, sizeof(int) * nsb->_size);
+                d_iptr = (int*) cnrn_gpu_copyin(nsb->_vdata_index, nsb->_size);
                 cnrn_memcpy_to_device(&(d_nsb->_vdata_index), &d_iptr, sizeof(int*));
 
-                d_iptr = (int*) cnrn_gpu_copyin(nsb->_pnt_index, sizeof(int) * nsb->_size);
+                d_iptr = (int*) cnrn_gpu_copyin(nsb->_pnt_index, nsb->_size);
                 cnrn_memcpy_to_device(&(d_nsb->_pnt_index), &d_iptr, sizeof(int*));
 
-                d_iptr = (int*) cnrn_gpu_copyin(nsb->_weight_index, sizeof(int) * nsb->_size);
+                d_iptr = (int*) cnrn_gpu_copyin(nsb->_weight_index, nsb->_size);
                 cnrn_memcpy_to_device(&(d_nsb->_weight_index), &d_iptr, sizeof(int*));
 
-                d_dptr = (double*) cnrn_gpu_copyin(nsb->_nsb_t, sizeof(double) * nsb->_size);
+                d_dptr = (double*) cnrn_gpu_copyin(nsb->_nsb_t, nsb->_size);
                 cnrn_memcpy_to_device(&(d_nsb->_nsb_t), &d_dptr, sizeof(double*));
 
-                d_dptr = (double*) cnrn_gpu_copyin(nsb->_nsb_flag, sizeof(double) * nsb->_size);
+                d_dptr = (double*) cnrn_gpu_copyin(nsb->_nsb_flag, nsb->_size);
                 cnrn_memcpy_to_device(&(d_nsb->_nsb_flag), &d_dptr, sizeof(double*));
             }
         }
@@ -338,27 +329,27 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
             int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0);
 
             /* copy shadow_rhs to device and fix-up the pointer */
-            d_shadow_ptr = (double*) cnrn_gpu_copyin(nt->_shadow_rhs, pcnt * sizeof(double));
+            d_shadow_ptr = (double*) cnrn_gpu_copyin(nt->_shadow_rhs, pcnt);
             cnrn_memcpy_to_device(&(d_nt->_shadow_rhs), &d_shadow_ptr, sizeof(double*));
 
             /* copy shadow_d to device and fix-up the pointer */
-            d_shadow_ptr = (double*) cnrn_gpu_copyin(nt->_shadow_d, pcnt * sizeof(double));
+            d_shadow_ptr = (double*) cnrn_gpu_copyin(nt->_shadow_d, pcnt);
             cnrn_memcpy_to_device(&(d_nt->_shadow_d), &d_shadow_ptr, sizeof(double*));
         }
 
         /* Fast membrane current calculation struct */
         if (nt->nrn_fast_imem) {
             auto* d_fast_imem = reinterpret_cast<NrnFastImem*>(
-                cnrn_gpu_copyin(nt->nrn_fast_imem, sizeof(NrnFastImem)));
+                cnrn_gpu_copyin(nt->nrn_fast_imem));
             cnrn_memcpy_to_device(&(d_nt->nrn_fast_imem), &d_fast_imem, sizeof(NrnFastImem*));
             {
                 auto* d_ptr = reinterpret_cast<double*>(
-                    cnrn_gpu_copyin(nt->nrn_fast_imem->nrn_sav_rhs, nt->end * sizeof(double)));
+                    cnrn_gpu_copyin(nt->nrn_fast_imem->nrn_sav_rhs, nt->end));
                 cnrn_memcpy_to_device(&(d_fast_imem->nrn_sav_rhs), &d_ptr, sizeof(double*));
             }
             {
                 auto* d_ptr = reinterpret_cast<double*>(
-                    cnrn_gpu_copyin(nt->nrn_fast_imem->nrn_sav_d, nt->end * sizeof(double)));
+                    cnrn_gpu_copyin(nt->nrn_fast_imem->nrn_sav_d, nt->end));
                 cnrn_memcpy_to_device(&(d_fast_imem->nrn_sav_d), &d_ptr, sizeof(double*));
             }
         }
@@ -367,20 +358,20 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
             /* copy Point_processes array and fix the pointer to execute net_receive blocks on GPU
              */
             Point_process* pntptr =
-                (Point_process*) cnrn_gpu_copyin(nt->pntprocs, nt->n_pntproc * sizeof(Point_process));
+                (Point_process*) cnrn_gpu_copyin(nt->pntprocs, nt->n_pntproc);
             cnrn_memcpy_to_device(&(d_nt->pntprocs), &pntptr, sizeof(Point_process*));
         }
 
         if (nt->n_weight) {
             /* copy weight vector used in NET_RECEIVE which is pointed by netcon.weight */
-            double* d_weights = (double*) cnrn_gpu_copyin(nt->weights, sizeof(double) * nt->n_weight);
+            double* d_weights = (double*) cnrn_gpu_copyin(nt->weights, nt->n_weight);
             cnrn_memcpy_to_device(&(d_nt->weights), &d_weights, sizeof(double*));
         }
 
         if (nt->_nvdata) {
             /* copy vdata which is setup in bbcore_read. This contains cuda allocated
              * nrnran123_State * */
-            void** d_vdata = (void**) cnrn_gpu_copyin(nt->_vdata, sizeof(void*) * nt->_nvdata);
+            void** d_vdata = (void**) cnrn_gpu_copyin(nt->_vdata, nt->_nvdata);
             cnrn_memcpy_to_device(&(d_nt->_vdata), &d_vdata, sizeof(void**));
         }
 
@@ -391,23 +382,23 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
              * to
              * VTable and alignment */
             PreSynHelper* d_presyns_helper =
-                (PreSynHelper*) cnrn_gpu_copyin(nt->presyns_helper, sizeof(PreSynHelper) * nt->n_presyn);
+                (PreSynHelper*) cnrn_gpu_copyin(nt->presyns_helper, nt->n_presyn);
             cnrn_memcpy_to_device(&(d_nt->presyns_helper), &d_presyns_helper, sizeof(PreSynHelper*));
-            PreSyn* d_presyns = (PreSyn*) cnrn_gpu_copyin(nt->presyns, sizeof(PreSyn) * nt->n_presyn);
+            PreSyn* d_presyns = (PreSyn*) cnrn_gpu_copyin(nt->presyns, nt->n_presyn);
             cnrn_memcpy_to_device(&(d_nt->presyns), &d_presyns, sizeof(PreSyn*));
         }
 
         if (nt->_net_send_buffer_size) {
             /* copy send_receive buffer */
             int* d_net_send_buffer = (int*) cnrn_gpu_copyin(nt->_net_send_buffer,
-                                                       sizeof(int) * nt->_net_send_buffer_size);
+                    nt->_net_send_buffer_size);
             cnrn_memcpy_to_device(&(d_nt->_net_send_buffer), &d_net_send_buffer, sizeof(int*));
         }
 
         if (nt->n_vecplay) {
             /* copy VecPlayContinuous instances */
             /** just empty containers */
-            void** d_vecplay = (void**) cnrn_gpu_copyin(nt->_vecplay, sizeof(void*) * nt->n_vecplay);
+            void** d_vecplay = (void**) cnrn_gpu_copyin(nt->_vecplay, nt->n_vecplay);
             cnrn_memcpy_to_device(&(d_nt->_vecplay), &d_vecplay, sizeof(void**));
 
             nrn_VecPlay_copyto_device(nt, d_vecplay);
@@ -417,40 +408,40 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
             if (interleave_permute_type == 1) {
                 /* todo: not necessary to setup pointers, just copy it */
                 InterleaveInfo* info = interleave_info + i;
-                InterleaveInfo* d_info = (InterleaveInfo*) cnrn_gpu_copyin(info, sizeof(InterleaveInfo));
                 int* d_ptr = nullptr;
+                InterleaveInfo* d_info = (InterleaveInfo*) cnrn_gpu_copyin(info);
 
-                d_ptr = (int*) cnrn_gpu_copyin(info->stride, sizeof(int) * (info->nstride + 1));
+                d_ptr = (int*) cnrn_gpu_copyin(info->stride, info->nstride + 1);
                 cnrn_memcpy_to_device(&(d_info->stride), &d_ptr, sizeof(int*));
 
-                d_ptr = (int*) cnrn_gpu_copyin(info->firstnode, sizeof(int) * nt->ncell);
+                d_ptr = (int*) cnrn_gpu_copyin(info->firstnode, nt->ncell);
                 cnrn_memcpy_to_device(&(d_info->firstnode), &d_ptr, sizeof(int*));
 
-                d_ptr = (int*) cnrn_gpu_copyin(info->lastnode, sizeof(int) * nt->ncell);
+                d_ptr = (int*) cnrn_gpu_copyin(info->lastnode, nt->ncell);
                 cnrn_memcpy_to_device(&(d_info->lastnode), &d_ptr, sizeof(int*));
 
-                d_ptr = (int*) cnrn_gpu_copyin(info->cellsize, sizeof(int) * nt->ncell);
+                d_ptr = (int*) cnrn_gpu_copyin(info->cellsize, nt->ncell);
                 cnrn_memcpy_to_device(&(d_info->cellsize), &d_ptr, sizeof(int*));
 
             } else if (interleave_permute_type == 2) {
                 /* todo: not necessary to setup pointers, just copy it */
                 InterleaveInfo* info = interleave_info + i;
-                InterleaveInfo* d_info = (InterleaveInfo*) cnrn_gpu_copyin(info, sizeof(InterleaveInfo));
+                InterleaveInfo* d_info = (InterleaveInfo*) cnrn_gpu_copyin(info);
                 int* d_ptr = nullptr;
 
-                d_ptr = (int*) cnrn_gpu_copyin(info->stride, sizeof(int) * info->nstride);
+                d_ptr = (int*) cnrn_gpu_copyin(info->stride, info->nstride);
                 cnrn_memcpy_to_device(&(d_info->stride), &d_ptr, sizeof(int*));
 
-                d_ptr = (int*) cnrn_gpu_copyin(info->firstnode, sizeof(int) * (info->nwarp + 1));
+                d_ptr = (int*) cnrn_gpu_copyin(info->firstnode, info->nwarp + 1);
                 cnrn_memcpy_to_device(&(d_info->firstnode), &d_ptr, sizeof(int*));
 
-                d_ptr = (int*) cnrn_gpu_copyin(info->lastnode, sizeof(int) * (info->nwarp + 1));
+                d_ptr = (int*) cnrn_gpu_copyin(info->lastnode, info->nwarp + 1);
                 cnrn_memcpy_to_device(&(d_info->lastnode), &d_ptr, sizeof(int*));
 
-                d_ptr = (int*) cnrn_gpu_copyin(info->stridedispl, sizeof(int) * (info->nwarp + 1));
+                d_ptr = (int*) cnrn_gpu_copyin(info->stridedispl, info->nwarp + 1);
                 cnrn_memcpy_to_device(&(d_info->stridedispl), &d_ptr, sizeof(int*));
 
-                d_ptr = (int*) cnrn_gpu_copyin(info->cellsize, sizeof(int) * info->nwarp);
+                d_ptr = (int*) cnrn_gpu_copyin(info->cellsize, info->nwarp);
                 cnrn_memcpy_to_device(&(d_info->cellsize), &d_ptr, sizeof(int*));
             } else {
                 printf("\n ERROR: only --cell_permute = [12] implemented");
@@ -466,20 +457,20 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
                 // Create a device-side copy of the `trajec_requests` struct and
                 // make sure the device-side NrnThread object knows about it.
                 auto* d_trajec_requests = reinterpret_cast<TrajectoryRequests*>(
-                    cnrn_gpu_copyin(tr, sizeof(TrajectoryRequests)));
+                    cnrn_gpu_copyin(tr));
                 cnrn_memcpy_to_device(&(d_nt->trajec_requests),
                                      &d_trajec_requests,
                                      sizeof(TrajectoryRequests*));
                 // Initialise the double** gather member of the struct.
                 auto* d_tr_gather = reinterpret_cast<double**>(
-                    cnrn_gpu_copyin(tr->gather, sizeof(double*) * tr->n_trajec));
+                    cnrn_gpu_copyin(tr->gather, tr->n_trajec));
                 cnrn_memcpy_to_device(&(d_trajec_requests->gather), &d_tr_gather, sizeof(double**));
                 // Initialise the double** varrays member of the struct if it's
                 // set.
                 double** d_tr_varrays{nullptr};
                 if (tr->varrays) {
                     d_tr_varrays = reinterpret_cast<double**>(
-                        cnrn_gpu_copyin(tr->varrays, sizeof(double*) * tr->n_trajec));
+                        cnrn_gpu_copyin(tr->varrays, tr->n_trajec));
                     cnrn_memcpy_to_device(&(d_trajec_requests->varrays),
                                          &d_tr_varrays,
                                          sizeof(double**));
@@ -490,7 +481,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
                         // make a device-side copy of it and store a pointer to it in
                         // the device-side version of tr->varrays.
                         auto* d_buf_traj_i = reinterpret_cast<double*>(
-                            cnrn_gpu_copyin(tr->varrays[i], tr->bsize * sizeof(double)));
+                            cnrn_gpu_copyin(tr->varrays[i], tr->bsize));
                         cnrn_memcpy_to_device(&(d_tr_varrays[i]), &d_buf_traj_i, sizeof(double*));
                     }
                     // tr->gather[i] is a double* referring to (host) data in the
@@ -521,12 +512,12 @@ void copy_ivoc_vect_to_device(const IvocVect& from, IvocVect& to, bool vector_co
     /// if we need to copy IvocVect vector then newly alloated vector
     /// on the device is a new destination pointer
     if(vector_copy_needed) {
-        d_iv = (IvocVect*) cnrn_gpu_copyin((void*) &from, sizeof(IvocVect));
+        d_iv = (IvocVect*) cnrn_gpu_copyin((void*) &from);
         cnrn_memcpy_to_device(&to, &d_iv, sizeof(IvocVect*));
     }
     size_t n = from.size();
     if (n) {
-        double* d_data = (double*) cnrn_gpu_copyin((void*) from.data(), sizeof(double) * n);
+        double* d_data = (double*) cnrn_gpu_copyin((void*) from.data(), n);
         cnrn_memcpy_to_device(&(d_iv->data_), &d_data, sizeof(double*));
     }
 #else
@@ -539,9 +530,9 @@ void delete_ivoc_vect_from_device(IvocVect& vec) {
 #ifdef _OPENACC
     auto const n = vec.size();
     if (n) {
-        cnrn_target_delete(vec.data(), sizeof(double) * n);
+        cnrn_target_delete(vec.data(), n);
     }
-    cnrn_target_delete(&vec, sizeof(IvocVect));
+    cnrn_target_delete(&vec);
 #else
     (void) vec;
 #endif
@@ -556,12 +547,12 @@ void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml) {
 #ifdef _OPENACC
     if (nt->compute_gpu) {
         // free existing vectors in buffers on gpu
-        cnrn_target_delete(nrb->_pnt_index, nrb->_size * sizeof(int));
-        cnrn_target_delete(nrb->_weight_index, nrb->_size * sizeof(int));
-        cnrn_target_delete(nrb->_nrb_t, nrb->_size * sizeof(double));
-        cnrn_target_delete(nrb->_nrb_flag, nrb->_size * sizeof(double));
-        cnrn_target_delete(nrb->_displ, (nrb->_size + 1) * sizeof(int));
-        cnrn_target_delete(nrb->_nrb_index, nrb->_size * sizeof(int));
+        cnrn_target_delete(nrb->_pnt_index, nrb->_size);
+        cnrn_target_delete(nrb->_weight_index, nrb->_size);
+        cnrn_target_delete(nrb->_nrb_t, nrb->_size);
+        cnrn_target_delete(nrb->_nrb_flag, nrb->_size);
+        cnrn_target_delete(nrb->_displ, nrb->_size + 1);
+        cnrn_target_delete(nrb->_nrb_index, nrb->_size);
     }
 #endif
 
@@ -586,22 +577,22 @@ void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml) {
         NetReceiveBuffer_t* d_nrb = (NetReceiveBuffer_t*) cnrn_target_deviceptr(nrb);
 
         // recopy the vectors in the buffer
-        d_pnt_index = (int*) cnrn_gpu_copyin(nrb->_pnt_index, sizeof(int) * nrb->_size);
+        d_pnt_index = (int*) cnrn_gpu_copyin(nrb->_pnt_index, nrb->_size);
         cnrn_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index, sizeof(int*));
 
-        d_weight_index = (int*) cnrn_gpu_copyin(nrb->_weight_index, sizeof(int) * nrb->_size);
+        d_weight_index = (int*) cnrn_gpu_copyin(nrb->_weight_index, nrb->_size);
         cnrn_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index, sizeof(int*));
 
-        d_nrb_t = (double*) cnrn_gpu_copyin(nrb->_nrb_t, sizeof(double) * nrb->_size);
+        d_nrb_t = (double*) cnrn_gpu_copyin(nrb->_nrb_t, nrb->_size);
         cnrn_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t, sizeof(double*));
 
-        d_nrb_flag = (double*) cnrn_gpu_copyin(nrb->_nrb_flag, sizeof(double) * nrb->_size);
+        d_nrb_flag = (double*) cnrn_gpu_copyin(nrb->_nrb_flag, nrb->_size);
         cnrn_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag, sizeof(double*));
 
-        d_displ = (int*) cnrn_gpu_copyin(nrb->_displ, sizeof(int) * (nrb->_size + 1));
+        d_displ = (int*) cnrn_gpu_copyin(nrb->_displ, nrb->_size + 1);
         cnrn_memcpy_to_device(&(d_nrb->_displ), &d_displ, sizeof(int*));
 
-        d_nrb_index = (int*) cnrn_gpu_copyin(nrb->_nrb_index, sizeof(int) * nrb->_size);
+        d_nrb_index = (int*) cnrn_gpu_copyin(nrb->_nrb_index, nrb->_size);
         cnrn_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index, sizeof(int*));
     }
 #endif
@@ -1070,73 +1061,73 @@ void delete_nrnthreads_on_device(NrnThread* threads, int nthreads) {
             if (tr) {
                 if (tr->varrays) {
                     for (int i = 0; i < tr->n_trajec; ++i) {
-                        cnrn_target_delete(tr->varrays[i], tr->bsize * sizeof(double));
+                        cnrn_target_delete(tr->varrays[i], tr->bsize);
                     }
-                    cnrn_target_delete(tr->varrays, sizeof(double*) * tr->n_trajec);
+                    cnrn_target_delete(tr->varrays, tr->n_trajec);
                 }
-                cnrn_target_delete(tr->gather, sizeof(double*) * tr->n_trajec);
-                cnrn_target_delete(tr, sizeof(TrajectoryRequests));
+                cnrn_target_delete(tr->gather, tr->n_trajec);
+                cnrn_target_delete(tr);
             }
         }
         if (nt->_permute) {
             if (interleave_permute_type == 1) {
                 InterleaveInfo* info = interleave_info + i;
-                cnrn_target_delete(info->cellsize, sizeof(int) * nt->ncell);
-                cnrn_target_delete(info->lastnode, sizeof(int) * nt->ncell);
-                cnrn_target_delete(info->firstnode, sizeof(int) * nt->ncell);
-                cnrn_target_delete(info->stride, sizeof(int) * (info->nstride + 1));
-                cnrn_target_delete(info, sizeof(InterleaveInfo));
+                cnrn_target_delete(info->cellsize, nt->ncell);
+                cnrn_target_delete(info->lastnode, nt->ncell);
+                cnrn_target_delete(info->firstnode, nt->ncell);
+                cnrn_target_delete(info->stride, info->nstride + 1);
+                cnrn_target_delete(info);
             } else if (interleave_permute_type == 2) {
                 InterleaveInfo* info = interleave_info + i;
-                cnrn_target_delete(info->cellsize, sizeof(int) * info->nwarp);
-                cnrn_target_delete(info->stridedispl, sizeof(int) * (info->nwarp + 1));
-                cnrn_target_delete(info->lastnode, sizeof(int) * (info->nwarp + 1));
-                cnrn_target_delete(info->firstnode, sizeof(int) * (info->nwarp + 1));
-                cnrn_target_delete(info->stride, sizeof(int) * info->nstride);
-                cnrn_target_delete(info, sizeof(InterleaveInfo));
+                cnrn_target_delete(info->cellsize, info->nwarp);
+                cnrn_target_delete(info->stridedispl, info->nwarp + 1);
+                cnrn_target_delete(info->lastnode, info->nwarp + 1);
+                cnrn_target_delete(info->firstnode, info->nwarp + 1);
+                cnrn_target_delete(info->stride, info->nstride);
+                cnrn_target_delete(info);
             }
         }
 
         if (nt->n_vecplay) {
             nrn_VecPlay_delete_from_device(nt);
-            cnrn_target_delete(nt->_vecplay, sizeof(void*) * nt->n_vecplay);
+            cnrn_target_delete(nt->_vecplay, nt->n_vecplay);
         }
 
         // Cleanup send_receive buffer.
         if (nt->_net_send_buffer_size) {
-            cnrn_target_delete(nt->_net_send_buffer, sizeof(int) * nt->_net_send_buffer_size);
+            cnrn_target_delete(nt->_net_send_buffer, nt->_net_send_buffer_size);
         }
 
         if (nt->n_presyn) {
-            cnrn_target_delete(nt->presyns, sizeof(PreSyn) * nt->n_presyn);
-            cnrn_target_delete(nt->presyns_helper, sizeof(PreSynHelper) * nt->n_presyn);
+            cnrn_target_delete(nt->presyns, nt->n_presyn);
+            cnrn_target_delete(nt->presyns_helper, nt->n_presyn);
         }
 
         // Cleanup data that's setup in bbcore_read.
         if (nt->_nvdata) {
-            cnrn_target_delete(nt->_vdata, sizeof(void*) * nt->_nvdata);
+            cnrn_target_delete(nt->_vdata, nt->_nvdata);
         }
 
         // Cleanup weight vector used in NET_RECEIVE
         if (nt->n_weight) {
-            cnrn_target_delete(nt->weights, sizeof(double) * nt->n_weight);
+            cnrn_target_delete(nt->weights, nt->n_weight);
         }
 
         // Cleanup point processes
         if (nt->n_pntproc) {
-            cnrn_target_delete(nt->pntprocs, nt->n_pntproc * sizeof(Point_process));
+            cnrn_target_delete(nt->pntprocs, nt->n_pntproc);
         }
 
         if (nt->nrn_fast_imem) {
-            cnrn_target_delete(nt->nrn_fast_imem->nrn_sav_d, nt->end * sizeof(double));
-            cnrn_target_delete(nt->nrn_fast_imem->nrn_sav_rhs, nt->end * sizeof(double));
-            cnrn_target_delete(nt->nrn_fast_imem, sizeof(NrnFastImem));
+            cnrn_target_delete(nt->nrn_fast_imem->nrn_sav_d, nt->end);
+            cnrn_target_delete(nt->nrn_fast_imem->nrn_sav_rhs, nt->end);
+            cnrn_target_delete(nt->nrn_fast_imem);
         }
 
         if (nt->shadow_rhs_cnt) {
             int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0);
-            cnrn_target_delete(nt->_shadow_d, pcnt * sizeof(double));
-            cnrn_target_delete(nt->_shadow_rhs, pcnt * sizeof(double));
+            cnrn_target_delete(nt->_shadow_d, pcnt);
+            cnrn_target_delete(nt->_shadow_rhs, pcnt);
         }
 
         for (auto tml = nt->tml; tml; tml = tml->next) {
@@ -1144,26 +1135,26 @@ void delete_nrnthreads_on_device(NrnThread* threads, int nthreads) {
             {
                 NetSendBuffer_t* nsb{tml->ml->_net_send_buffer};
                 if (nsb) {
-                    cnrn_target_delete(nsb->_nsb_flag, sizeof(double) * nsb->_size);
-                    cnrn_target_delete(nsb->_nsb_t, sizeof(double) * nsb->_size);
-                    cnrn_target_delete(nsb->_weight_index, sizeof(int) * nsb->_size);
-                    cnrn_target_delete(nsb->_pnt_index, sizeof(int) * nsb->_size);
-                    cnrn_target_delete(nsb->_vdata_index, sizeof(int) * nsb->_size);
-                    cnrn_target_delete(nsb->_sendtype, sizeof(int) * nsb->_size);
-                    cnrn_target_delete(nsb, sizeof(NetSendBuffer_t));
+                    cnrn_target_delete(nsb->_nsb_flag, nsb->_size);
+                    cnrn_target_delete(nsb->_nsb_t, nsb->_size);
+                    cnrn_target_delete(nsb->_weight_index, nsb->_size);
+                    cnrn_target_delete(nsb->_pnt_index, nsb->_size);
+                    cnrn_target_delete(nsb->_vdata_index, nsb->_size);
+                    cnrn_target_delete(nsb->_sendtype, nsb->_size);
+                    cnrn_target_delete(nsb);
                 }
             }
             // Cleanup the net receive buffer if it exists.
             {
                 NetReceiveBuffer_t* nrb{tml->ml->_net_receive_buffer};
                 if (nrb) {
-                    cnrn_target_delete(nrb->_nrb_index, sizeof(int) * nrb->_size);
-                    cnrn_target_delete(nrb->_displ, sizeof(int) * (nrb->_size + 1));
-                    cnrn_target_delete(nrb->_nrb_flag, sizeof(double) * nrb->_size);
-                    cnrn_target_delete(nrb->_nrb_t, sizeof(double) * nrb->_size);
-                    cnrn_target_delete(nrb->_weight_index, sizeof(int) * nrb->_size);
-                    cnrn_target_delete(nrb->_pnt_index, sizeof(int) * nrb->_size);
-                    cnrn_target_delete(nrb, sizeof(NetReceiveBuffer_t));
+                    cnrn_target_delete(nrb->_nrb_index, nrb->_size);
+                    cnrn_target_delete(nrb->_displ, nrb->_size + 1);
+                    cnrn_target_delete(nrb->_nrb_flag, nrb->_size);
+                    cnrn_target_delete(nrb->_nrb_t, nrb->_size);
+                    cnrn_target_delete(nrb->_weight_index, nrb->_size);
+                    cnrn_target_delete(nrb->_pnt_index, nrb->_size);
+                    cnrn_target_delete(nrb);
                 }
             }
             int type = tml->index;
@@ -1172,23 +1163,23 @@ void delete_nrnthreads_on_device(NrnThread* threads, int nthreads) {
             int is_art = corenrn.get_is_artificial()[type];
             int ts = corenrn.get_memb_funcs()[type].thread_size_;
             if (ts) {
-                cnrn_target_delete(tml->ml->_thread, ts * sizeof(ThreadDatum));
+                cnrn_target_delete(tml->ml->_thread, ts);
             }
             if (szdp) {
                 int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp;
-                cnrn_target_delete(tml->ml->pdata, sizeof(int) * pcnt);
+                cnrn_target_delete(tml->ml->pdata, pcnt);
             }
             if (!is_art) {
-                cnrn_target_delete(tml->ml->nodeindices, sizeof(int) * n);
+                cnrn_target_delete(tml->ml->nodeindices, n);
             }
-            cnrn_target_delete(tml->ml, sizeof(Memb_list));
-            cnrn_target_delete(tml, sizeof(NrnThreadMembList));
+            cnrn_target_delete(tml->ml);
+            cnrn_target_delete(tml);
         }
-        cnrn_target_delete(nt->_ml_list, corenrn.get_memb_funcs().size() * sizeof(Memb_list*));
-        cnrn_target_delete(nt->_v_parent_index, nt->end * sizeof(int));
-        cnrn_target_delete(nt->_data, nt->_ndata * sizeof(double));
+        cnrn_target_delete(nt->_ml_list, corenrn.get_memb_funcs().size());
+        cnrn_target_delete(nt->_v_parent_index, nt->end);
+        cnrn_target_delete(nt->_data, nt->_ndata);
     }
-    cnrn_target_delete(threads, sizeof(NrnThread) * nthreads);
+    cnrn_target_delete(threads, nthreads);
     nrn_ion_global_map_delete_from_device();
 #endif
 }
@@ -1204,30 +1195,30 @@ void nrn_newtonspace_copyto_device(NewtonSpace* ns) {
 
     int n = ns->n * ns->n_instance;
     // actually, the values of double do not matter, only the  pointers.
-    NewtonSpace* d_ns = (NewtonSpace*) cnrn_gpu_copyin(ns, sizeof(NewtonSpace));
+    NewtonSpace* d_ns = (NewtonSpace*) cnrn_gpu_copyin(ns);
 
     double* pd;
 
-    pd = (double*) cnrn_gpu_copyin(ns->delta_x, n * sizeof(double));
+    pd = (double*) cnrn_gpu_copyin(ns->delta_x, n);
     cnrn_memcpy_to_device(&(d_ns->delta_x), &pd, sizeof(double*));
 
-    pd = (double*) cnrn_gpu_copyin(ns->high_value, n * sizeof(double));
+    pd = (double*) cnrn_gpu_copyin(ns->high_value, n);
     cnrn_memcpy_to_device(&(d_ns->high_value), &pd, sizeof(double*));
 
-    pd = (double*) cnrn_gpu_copyin(ns->low_value, n * sizeof(double));
+    pd = (double*) cnrn_gpu_copyin(ns->low_value, n);
     cnrn_memcpy_to_device(&(d_ns->low_value), &pd, sizeof(double*));
 
-    pd = (double*) cnrn_gpu_copyin(ns->rowmax, n * sizeof(double));
+    pd = (double*) cnrn_gpu_copyin(ns->rowmax, n);
     cnrn_memcpy_to_device(&(d_ns->rowmax), &pd, sizeof(double*));
 
-    auto pint = (int*) cnrn_gpu_copyin(ns->perm, n * sizeof(int));
+    auto pint = (int*) cnrn_gpu_copyin(ns->perm, n);
     cnrn_memcpy_to_device(&(d_ns->perm), &pint, sizeof(int*));
 
-    auto ppd = (double**) cnrn_gpu_copyin(ns->jacobian, ns->n * sizeof(double*));
+    auto ppd = (double**) cnrn_gpu_copyin(ns->jacobian, ns->n);
     cnrn_memcpy_to_device(&(d_ns->jacobian), &ppd, sizeof(double**));
 
     // the actual jacobian doubles were allocated as a single array
-    double* d_jacdat = (double*) cnrn_gpu_copyin(ns->jacobian[0], ns->n * n * sizeof(double));
+    double* d_jacdat = (double*) cnrn_gpu_copyin(ns->jacobian[0], ns->n * n);
 
     for (int i = 0; i < ns->n; ++i) {
         pd = d_jacdat + i * n;
@@ -1244,14 +1235,14 @@ void nrn_newtonspace_delete_from_device(NewtonSpace* ns) {
         return;
     }
     int n = ns->n * ns->n_instance;
-    cnrn_target_delete(ns->jacobian[0], ns->n * n * sizeof(double));
-    cnrn_target_delete(ns->jacobian, ns->n * sizeof(double*));
-    cnrn_target_delete(ns->perm, n * sizeof(int));
-    cnrn_target_delete(ns->rowmax, n * sizeof(double));
-    cnrn_target_delete(ns->low_value, n * sizeof(double));
-    cnrn_target_delete(ns->high_value, n * sizeof(double));
-    cnrn_target_delete(ns->delta_x, n * sizeof(double));
-    cnrn_target_delete(ns, sizeof(NewtonSpace));
+    cnrn_target_delete(ns->jacobian[0], ns->n * n);
+    cnrn_target_delete(ns->jacobian, ns->n);
+    cnrn_target_delete(ns->perm, n);
+    cnrn_target_delete(ns->rowmax, n);
+    cnrn_target_delete(ns->low_value, n);
+    cnrn_target_delete(ns->high_value, n);
+    cnrn_target_delete(ns->delta_x, n);
+    cnrn_target_delete(ns);
 #endif
 }
 
@@ -1264,33 +1255,33 @@ void nrn_sparseobj_copyto_device(SparseObj* so) {
     }
 
     unsigned n1 = so->neqn + 1;
-    SparseObj* d_so = (SparseObj*) cnrn_gpu_copyin(so, sizeof(SparseObj));
+    SparseObj* d_so = (SparseObj*) cnrn_gpu_copyin(so);
     // only pointer fields in SparseObj that need setting up are
     //   rowst, diag, rhs, ngetcall, coef_list
     // only pointer fields in Elm that need setting up are
     //   r_down, c_right, value
     // do not care about the Elm* ptr value, just the space.
 
-    Elm** d_rowst = (Elm**) cnrn_gpu_copyin(so->rowst, n1 * sizeof(Elm*));
+    Elm** d_rowst = (Elm**) cnrn_gpu_copyin(so->rowst, n1);
     cnrn_memcpy_to_device(&(d_so->rowst), &d_rowst, sizeof(Elm**));
 
-    Elm** d_diag = (Elm**) cnrn_gpu_copyin(so->diag, n1 * sizeof(Elm*));
+    Elm** d_diag = (Elm**) cnrn_gpu_copyin(so->diag, n1);
     cnrn_memcpy_to_device(&(d_so->diag), &d_diag, sizeof(Elm**));
 
-    auto pu = (unsigned*) cnrn_gpu_copyin(so->ngetcall, so->_cntml_padded * sizeof(unsigned));
+    auto pu = (unsigned*) cnrn_gpu_copyin(so->ngetcall, so->_cntml_padded);
     cnrn_memcpy_to_device(&(d_so->ngetcall), &pu, sizeof(Elm**));
 
-    auto pd = (double*) cnrn_gpu_copyin(so->rhs, n1 * so->_cntml_padded * sizeof(double));
+    auto pd = (double*) cnrn_gpu_copyin(so->rhs, n1 * so->_cntml_padded);
     cnrn_memcpy_to_device(&(d_so->rhs), &pd, sizeof(double*));
 
-    auto d_coef_list = (double**) cnrn_gpu_copyin(so->coef_list, so->coef_list_size * sizeof(double*));
+    auto d_coef_list = (double**) cnrn_gpu_copyin(so->coef_list, so->coef_list_size);
     cnrn_memcpy_to_device(&(d_so->coef_list), &d_coef_list, sizeof(double**));
 
     // Fill in relevant Elm pointer values
 
     for (unsigned irow = 1; irow < n1; ++irow) {
         for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) {
-            Elm* pelm = (Elm*) cnrn_gpu_copyin(elm, sizeof(Elm));
+            Elm* pelm = (Elm*) cnrn_gpu_copyin(elm);
 
             if (elm == so->rowst[irow]) {
                 cnrn_memcpy_to_device(&(d_rowst[irow]), &pelm, sizeof(Elm*));
@@ -1310,7 +1301,7 @@ void nrn_sparseobj_copyto_device(SparseObj* so) {
                 }
             }
 
-            pd = (double*) cnrn_gpu_copyin(elm->value, so->_cntml_padded * sizeof(double));
+            pd = (double*) cnrn_gpu_copyin(elm->value, so->_cntml_padded);
             cnrn_memcpy_to_device(&(pelm->value), &pd, sizeof(double*));
         }
     }
@@ -1348,16 +1339,16 @@ void nrn_sparseobj_delete_from_device(SparseObj* so) {
     unsigned n1 = so->neqn + 1;
     for (unsigned irow = 1; irow < n1; ++irow) {
         for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) {
-            cnrn_target_delete(elm->value, so->_cntml_padded * sizeof(double));
-            cnrn_target_delete(elm, sizeof(Elm));
+            cnrn_target_delete(elm->value, so->_cntml_padded);
+            cnrn_target_delete(elm);
         }
     }
-    cnrn_target_delete(so->coef_list, so->coef_list_size * sizeof(double*));
-    cnrn_target_delete(so->rhs, n1 * so->_cntml_padded * sizeof(double));
-    cnrn_target_delete(so->ngetcall, so->_cntml_padded * sizeof(unsigned));
-    cnrn_target_delete(so->diag, n1 * sizeof(Elm*));
-    cnrn_target_delete(so->rowst, n1 * sizeof(Elm*));
-    cnrn_target_delete(so, sizeof(SparseObj));
+    cnrn_target_delete(so->coef_list, so->coef_list_size);
+    cnrn_target_delete(so->rhs, n1 * so->_cntml_padded);
+    cnrn_target_delete(so->ngetcall, so->_cntml_padded);
+    cnrn_target_delete(so->diag, n1);
+    cnrn_target_delete(so->rowst, n1);
+    cnrn_target_delete(so);
 #endif
 }
 
@@ -1366,12 +1357,11 @@ void nrn_sparseobj_delete_from_device(SparseObj* so) {
 void nrn_ion_global_map_copyto_device() {
     if (nrn_ion_global_map_size) {
         double** d_data = (double**) cnrn_gpu_copyin(nrn_ion_global_map,
-                                                sizeof(double*) * nrn_ion_global_map_size);
+                                                nrn_ion_global_map_size);
         for (int j = 0; j < nrn_ion_global_map_size; j++) {
             if (nrn_ion_global_map[j]) {
                 double* d_mechmap = (double*) cnrn_gpu_copyin(nrn_ion_global_map[j],
-                                                         ion_global_map_member_size *
-                                                             sizeof(double));
+                                                         ion_global_map_member_size);
                 cnrn_memcpy_to_device(&(d_data[j]), &d_mechmap, sizeof(double*));
             }
         }
@@ -1381,11 +1371,11 @@ void nrn_ion_global_map_copyto_device() {
 void nrn_ion_global_map_delete_from_device() {
     for (int j = 0; j < nrn_ion_global_map_size; j++) {
         if (nrn_ion_global_map[j]) {
-            cnrn_target_delete(nrn_ion_global_map[j], ion_global_map_member_size * sizeof(double));
+            cnrn_target_delete(nrn_ion_global_map[j], ion_global_map_member_size);
         }
     }
     if (nrn_ion_global_map_size) {
-        cnrn_target_delete(nrn_ion_global_map, sizeof(double*) * nrn_ion_global_map_size);
+        cnrn_target_delete(nrn_ion_global_map, nrn_ion_global_map_size);
     }
 }
 
@@ -1439,7 +1429,7 @@ void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay) {
         VecPlayContinuous* vecplay_instance = (VecPlayContinuous*) nt->_vecplay[i];
 
         /** just VecPlayContinuous object */
-        void* d_p = (void*) cnrn_gpu_copyin(vecplay_instance, sizeof(VecPlayContinuous));
+        void* d_p = (void*) cnrn_gpu_copyin(vecplay_instance);
         cnrn_memcpy_to_device(&(d_vecplay[i]), &d_p, sizeof(void*));
 
         VecPlayContinuous* d_vecplay_instance = (VecPlayContinuous*) d_p;
@@ -1454,8 +1444,8 @@ void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay) {
         }
 
         /** copy PlayRecordEvent : todo: verify this */
-        PlayRecordEvent* d_e_ = (PlayRecordEvent*) cnrn_gpu_copyin(vecplay_instance->e_,
-                                                              sizeof(PlayRecordEvent));
+        PlayRecordEvent* d_e_ = (PlayRecordEvent*) cnrn_gpu_copyin(vecplay_instance->e_);
+                                                   
         cnrn_memcpy_to_device(&(d_e_->plr_), &d_vecplay_instance, sizeof(VecPlayContinuous*));
         cnrn_memcpy_to_device(&(d_vecplay_instance->e_), &d_e_, sizeof(PlayRecordEvent*));
 
@@ -1469,13 +1459,13 @@ void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay) {
 void nrn_VecPlay_delete_from_device(NrnThread* nt) {
     for (int i = 0; i < nt->n_vecplay; i++) {
         auto* vecplay_instance = reinterpret_cast<VecPlayContinuous*>(nt->_vecplay[i]);
-        cnrn_target_delete(vecplay_instance->e_, sizeof(PlayRecordEvent));
+        cnrn_target_delete(vecplay_instance->e_);
         if (vecplay_instance->discon_indices_) {
             delete_ivoc_vect_from_device(*(vecplay_instance->discon_indices_));
         }
         delete_ivoc_vect_from_device(vecplay_instance->t_);
         delete_ivoc_vect_from_device(vecplay_instance->y_);
-        cnrn_target_delete(vecplay_instance, sizeof(VecPlayContinuous));
+        cnrn_target_delete(vecplay_instance);
     }
 }
 

From 84f032295812405cd7f13724248145795b85fd49 Mon Sep 17 00:00:00 2001
From: Nicolas Cornu <nicolas.cornu@epfl.ch>
Date: Fri, 10 Dec 2021 17:42:47 +0100
Subject: [PATCH 2/5] Simplifying pointers

---
 coreneuron/gpu/nrn_acc_manager.cpp | 173 ++++++++++++++---------------
 1 file changed, 83 insertions(+), 90 deletions(-)

diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
index d8eded736..3b076c172 100644
--- a/coreneuron/gpu/nrn_acc_manager.cpp
+++ b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -55,12 +55,12 @@ void* cnrn_target_deviceptr(void* h_ptr) {
 }
 
 template <typename T>
-void* cnrn_gpu_copyin(T* h_ptr, std::size_t len = 1) {
+T* cnrn_gpu_copyin(T* h_ptr, std::size_t len = 1) {
 #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
-    return acc_copyin(h_ptr, len * sizeof(T));
+    return static_cast<T*>(acc_copyin(h_ptr, len * sizeof(T)));
 #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
     #pragma omp target enter data map(to:h_ptr[:len])
-    return cnrn_target_deviceptr(h_ptr);
+    return static_cast<T*>(cnrn_target_deviceptr(h_ptr));
 #else
     throw std::runtime_error("cnrn_gpu_copyin() not implemented without OpenACC/OpenMP and gpu build");
 #endif
@@ -107,13 +107,13 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
         NrnThread* nt = threads + i;  // NrnThread on host
 
         if (nt->n_presyn) {
-            PreSyn* d_presyns = (PreSyn*) cnrn_gpu_copyin(nt->presyns, nt->n_presyn);
+            PreSyn* d_presyns = cnrn_gpu_copyin(nt->presyns, nt->n_presyn);
         }
 
         if (nt->n_vecplay) {
             /* copy VecPlayContinuous instances */
             /** just empty containers */
-            void** d_vecplay = (void**) cnrn_gpu_copyin(nt->_vecplay, nt->n_vecplay);
+            void** d_vecplay = cnrn_gpu_copyin(nt->_vecplay, nt->n_vecplay);
             // note: we are using unified memory for NrnThread. Once VecPlay is copied to gpu,
             // we dont want to update nt->vecplay because it will also set gpu pointer of vecplay
             // inside nt on cpu (due to unified memory).
@@ -131,7 +131,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
      * find
      * corresponding NrnThread using Point_process in NET_RECEIVE block
      */
-    NrnThread* d_threads = (NrnThread*) cnrn_gpu_copyin(threads, nthreads);
+    NrnThread* d_threads = cnrn_gpu_copyin(threads, nthreads);
 
     if (interleave_info == nullptr) {
         printf("\n Warning: No permutation data? Required for linear algebra!");
@@ -150,7 +150,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
         /* -- copy _data to device -- */
 
         /*copy all double data for thread */
-        d__data = (double*) cnrn_gpu_copyin(nt->_data, nt->_ndata);
+        d__data = cnrn_gpu_copyin(nt->_data, nt->_ndata);
 
 
         /* Here is the example of using OpenACC data enter/exit
@@ -192,11 +192,11 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
             cnrn_memcpy_to_device(&(d_nt->_actual_diam), &(dptr), sizeof(double*));
         }
 
-        int* d_v_parent_index = (int*) cnrn_gpu_copyin(nt->_v_parent_index, nt->end);
+        int* d_v_parent_index = cnrn_gpu_copyin(nt->_v_parent_index, nt->end);
         cnrn_memcpy_to_device(&(d_nt->_v_parent_index), &(d_v_parent_index), sizeof(int*));
 
         /* nt._ml_list is used in NET_RECEIVE block and should have valid membrane list id*/
-        Memb_list** d_ml_list = (Memb_list**) cnrn_gpu_copyin(nt->_ml_list,
+        Memb_list** d_ml_list = cnrn_gpu_copyin(nt->_ml_list,
                                                          corenrn.get_memb_funcs().size());
         cnrn_memcpy_to_device(&(d_nt->_ml_list), &(d_ml_list), sizeof(Memb_list**));
 
@@ -209,7 +209,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
         for (auto tml = nt->tml; tml; tml = tml->next) {
             /*copy tml to device*/
             /*QUESTIONS: does tml will point to nullptr as in host ? : I assume so!*/
-            auto d_tml = (NrnThreadMembList*) cnrn_gpu_copyin(tml);
+            auto d_tml = cnrn_gpu_copyin(tml);
 
             /*first tml is pointed by nt */
             if (first_tml) {
@@ -224,7 +224,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
             d_last_tml = d_tml;
 
             /* now for every tml, there is a ml. copy that and setup pointer */
-            auto d_ml = (Memb_list*) cnrn_gpu_copyin(tml->ml);
+            auto d_ml = cnrn_gpu_copyin(tml->ml);
             cnrn_memcpy_to_device(&(d_tml->ml), &d_ml, sizeof(Memb_list*));
 
             /* setup nt._ml_list */
@@ -242,19 +242,19 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
 
 
             if (!is_art) {
-                int* d_nodeindices = (int*) cnrn_gpu_copyin(tml->ml->nodeindices, n);
+                int* d_nodeindices = cnrn_gpu_copyin(tml->ml->nodeindices, n);
                 cnrn_memcpy_to_device(&(d_ml->nodeindices), &d_nodeindices, sizeof(int*));
             }
 
             if (szdp) {
                 int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp;
-                int* d_pdata = (int*) cnrn_gpu_copyin(tml->ml->pdata, pcnt);
+                int* d_pdata = cnrn_gpu_copyin(tml->ml->pdata, pcnt);
                 cnrn_memcpy_to_device(&(d_ml->pdata), &d_pdata, sizeof(int*));
             }
 
             int ts = corenrn.get_memb_funcs()[type].thread_size_;
             if (ts) {
-                ThreadDatum* td = (ThreadDatum*) cnrn_gpu_copyin(tml->ml->_thread, ts);
+                ThreadDatum* td = cnrn_gpu_copyin(tml->ml->_thread, ts);
                 cnrn_memcpy_to_device(&(d_ml->_thread), &td, sizeof(ThreadDatum*));
             }
 
@@ -267,27 +267,27 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
 
             // if net receive buffer exist for mechanism
             if (nrb) {
-                d_nrb = (NetReceiveBuffer_t*) cnrn_gpu_copyin(nrb);
+                d_nrb = cnrn_gpu_copyin(nrb);
                 cnrn_memcpy_to_device(&(d_ml->_net_receive_buffer),
                                      &d_nrb,
                                      sizeof(NetReceiveBuffer_t*));
 
-                d_pnt_index = (int*) cnrn_gpu_copyin(nrb->_pnt_index, nrb->_size);
+                d_pnt_index = cnrn_gpu_copyin(nrb->_pnt_index, nrb->_size);
                 cnrn_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index, sizeof(int*));
 
-                d_weight_index = (int*) cnrn_gpu_copyin(nrb->_weight_index, nrb->_size);
+                d_weight_index = cnrn_gpu_copyin(nrb->_weight_index, nrb->_size);
                 cnrn_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index, sizeof(int*));
 
-                d_nrb_t = (double*) cnrn_gpu_copyin(nrb->_nrb_t, nrb->_size);
+                d_nrb_t = cnrn_gpu_copyin(nrb->_nrb_t, nrb->_size);
                 cnrn_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t, sizeof(double*));
 
-                d_nrb_flag = (double*) cnrn_gpu_copyin(nrb->_nrb_flag, nrb->_size);
+                d_nrb_flag = cnrn_gpu_copyin(nrb->_nrb_flag, nrb->_size);
                 cnrn_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag, sizeof(double*));
 
-                d_displ = (int*) cnrn_gpu_copyin(nrb->_displ, nrb->_size + 1);
+                d_displ = cnrn_gpu_copyin(nrb->_displ, nrb->_size + 1);
                 cnrn_memcpy_to_device(&(d_nrb->_displ), &d_displ, sizeof(int*));
 
-                d_nrb_index = (int*) cnrn_gpu_copyin(nrb->_nrb_index, nrb->_size);
+                d_nrb_index = cnrn_gpu_copyin(nrb->_nrb_index, nrb->_size);
                 cnrn_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index, sizeof(int*));
             }
 
@@ -300,25 +300,25 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
                 int* d_iptr;
                 double* d_dptr;
 
-                d_nsb = (NetSendBuffer_t*) cnrn_gpu_copyin(nsb);
+                d_nsb = cnrn_gpu_copyin(nsb);
                 cnrn_memcpy_to_device(&(d_ml->_net_send_buffer), &d_nsb, sizeof(NetSendBuffer_t*));
 
-                d_iptr = (int*) cnrn_gpu_copyin(nsb->_sendtype, nsb->_size);
+                d_iptr = cnrn_gpu_copyin(nsb->_sendtype, nsb->_size);
                 cnrn_memcpy_to_device(&(d_nsb->_sendtype), &d_iptr, sizeof(int*));
 
-                d_iptr = (int*) cnrn_gpu_copyin(nsb->_vdata_index, nsb->_size);
+                d_iptr = cnrn_gpu_copyin(nsb->_vdata_index, nsb->_size);
                 cnrn_memcpy_to_device(&(d_nsb->_vdata_index), &d_iptr, sizeof(int*));
 
-                d_iptr = (int*) cnrn_gpu_copyin(nsb->_pnt_index, nsb->_size);
+                d_iptr = cnrn_gpu_copyin(nsb->_pnt_index, nsb->_size);
                 cnrn_memcpy_to_device(&(d_nsb->_pnt_index), &d_iptr, sizeof(int*));
 
-                d_iptr = (int*) cnrn_gpu_copyin(nsb->_weight_index, nsb->_size);
+                d_iptr = cnrn_gpu_copyin(nsb->_weight_index, nsb->_size);
                 cnrn_memcpy_to_device(&(d_nsb->_weight_index), &d_iptr, sizeof(int*));
 
-                d_dptr = (double*) cnrn_gpu_copyin(nsb->_nsb_t, nsb->_size);
+                d_dptr = cnrn_gpu_copyin(nsb->_nsb_t, nsb->_size);
                 cnrn_memcpy_to_device(&(d_nsb->_nsb_t), &d_dptr, sizeof(double*));
 
-                d_dptr = (double*) cnrn_gpu_copyin(nsb->_nsb_flag, nsb->_size);
+                d_dptr = cnrn_gpu_copyin(nsb->_nsb_flag, nsb->_size);
                 cnrn_memcpy_to_device(&(d_nsb->_nsb_flag), &d_dptr, sizeof(double*));
             }
         }
@@ -329,27 +329,24 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
             int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0);
 
             /* copy shadow_rhs to device and fix-up the pointer */
-            d_shadow_ptr = (double*) cnrn_gpu_copyin(nt->_shadow_rhs, pcnt);
+            d_shadow_ptr = cnrn_gpu_copyin(nt->_shadow_rhs, pcnt);
             cnrn_memcpy_to_device(&(d_nt->_shadow_rhs), &d_shadow_ptr, sizeof(double*));
 
             /* copy shadow_d to device and fix-up the pointer */
-            d_shadow_ptr = (double*) cnrn_gpu_copyin(nt->_shadow_d, pcnt);
+            d_shadow_ptr = cnrn_gpu_copyin(nt->_shadow_d, pcnt);
             cnrn_memcpy_to_device(&(d_nt->_shadow_d), &d_shadow_ptr, sizeof(double*));
         }
 
         /* Fast membrane current calculation struct */
         if (nt->nrn_fast_imem) {
-            auto* d_fast_imem = reinterpret_cast<NrnFastImem*>(
-                cnrn_gpu_copyin(nt->nrn_fast_imem));
+            NrnFastImem* d_fast_imem = cnrn_gpu_copyin(nt->nrn_fast_imem);
             cnrn_memcpy_to_device(&(d_nt->nrn_fast_imem), &d_fast_imem, sizeof(NrnFastImem*));
             {
-                auto* d_ptr = reinterpret_cast<double*>(
-                    cnrn_gpu_copyin(nt->nrn_fast_imem->nrn_sav_rhs, nt->end));
+                double* d_ptr = cnrn_gpu_copyin(nt->nrn_fast_imem->nrn_sav_rhs, nt->end);
                 cnrn_memcpy_to_device(&(d_fast_imem->nrn_sav_rhs), &d_ptr, sizeof(double*));
             }
             {
-                auto* d_ptr = reinterpret_cast<double*>(
-                    cnrn_gpu_copyin(nt->nrn_fast_imem->nrn_sav_d, nt->end));
+                double* d_ptr = cnrn_gpu_copyin(nt->nrn_fast_imem->nrn_sav_d, nt->end);
                 cnrn_memcpy_to_device(&(d_fast_imem->nrn_sav_d), &d_ptr, sizeof(double*));
             }
         }
@@ -358,20 +355,20 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
             /* copy Point_processes array and fix the pointer to execute net_receive blocks on GPU
              */
             Point_process* pntptr =
-                (Point_process*) cnrn_gpu_copyin(nt->pntprocs, nt->n_pntproc);
+                cnrn_gpu_copyin(nt->pntprocs, nt->n_pntproc);
             cnrn_memcpy_to_device(&(d_nt->pntprocs), &pntptr, sizeof(Point_process*));
         }
 
         if (nt->n_weight) {
             /* copy weight vector used in NET_RECEIVE which is pointed by netcon.weight */
-            double* d_weights = (double*) cnrn_gpu_copyin(nt->weights, nt->n_weight);
+            double* d_weights = cnrn_gpu_copyin(nt->weights, nt->n_weight);
             cnrn_memcpy_to_device(&(d_nt->weights), &d_weights, sizeof(double*));
         }
 
         if (nt->_nvdata) {
             /* copy vdata which is setup in bbcore_read. This contains cuda allocated
              * nrnran123_State * */
-            void** d_vdata = (void**) cnrn_gpu_copyin(nt->_vdata, nt->_nvdata);
+            void** d_vdata = cnrn_gpu_copyin(nt->_vdata, nt->_nvdata);
             cnrn_memcpy_to_device(&(d_nt->_vdata), &d_vdata, sizeof(void**));
         }
 
@@ -382,15 +379,15 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
              * to
              * VTable and alignment */
             PreSynHelper* d_presyns_helper =
-                (PreSynHelper*) cnrn_gpu_copyin(nt->presyns_helper, nt->n_presyn);
+                cnrn_gpu_copyin(nt->presyns_helper, nt->n_presyn);
             cnrn_memcpy_to_device(&(d_nt->presyns_helper), &d_presyns_helper, sizeof(PreSynHelper*));
-            PreSyn* d_presyns = (PreSyn*) cnrn_gpu_copyin(nt->presyns, nt->n_presyn);
+            PreSyn* d_presyns = cnrn_gpu_copyin(nt->presyns, nt->n_presyn);
             cnrn_memcpy_to_device(&(d_nt->presyns), &d_presyns, sizeof(PreSyn*));
         }
 
         if (nt->_net_send_buffer_size) {
             /* copy send_receive buffer */
-            int* d_net_send_buffer = (int*) cnrn_gpu_copyin(nt->_net_send_buffer,
+            int* d_net_send_buffer = cnrn_gpu_copyin(nt->_net_send_buffer,
                     nt->_net_send_buffer_size);
             cnrn_memcpy_to_device(&(d_nt->_net_send_buffer), &d_net_send_buffer, sizeof(int*));
         }
@@ -398,7 +395,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
         if (nt->n_vecplay) {
             /* copy VecPlayContinuous instances */
             /** just empty containers */
-            void** d_vecplay = (void**) cnrn_gpu_copyin(nt->_vecplay, nt->n_vecplay);
+            void** d_vecplay = cnrn_gpu_copyin(nt->_vecplay, nt->n_vecplay);
             cnrn_memcpy_to_device(&(d_nt->_vecplay), &d_vecplay, sizeof(void**));
 
             nrn_VecPlay_copyto_device(nt, d_vecplay);
@@ -409,39 +406,39 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
                 /* todo: not necessary to setup pointers, just copy it */
                 InterleaveInfo* info = interleave_info + i;
                 int* d_ptr = nullptr;
-                InterleaveInfo* d_info = (InterleaveInfo*) cnrn_gpu_copyin(info);
+                InterleaveInfo* d_info = cnrn_gpu_copyin(info);
 
-                d_ptr = (int*) cnrn_gpu_copyin(info->stride, info->nstride + 1);
+                d_ptr = cnrn_gpu_copyin(info->stride, info->nstride + 1);
                 cnrn_memcpy_to_device(&(d_info->stride), &d_ptr, sizeof(int*));
 
-                d_ptr = (int*) cnrn_gpu_copyin(info->firstnode, nt->ncell);
+                d_ptr = cnrn_gpu_copyin(info->firstnode, nt->ncell);
                 cnrn_memcpy_to_device(&(d_info->firstnode), &d_ptr, sizeof(int*));
 
-                d_ptr = (int*) cnrn_gpu_copyin(info->lastnode, nt->ncell);
+                d_ptr = cnrn_gpu_copyin(info->lastnode, nt->ncell);
                 cnrn_memcpy_to_device(&(d_info->lastnode), &d_ptr, sizeof(int*));
 
-                d_ptr = (int*) cnrn_gpu_copyin(info->cellsize, nt->ncell);
+                d_ptr = cnrn_gpu_copyin(info->cellsize, nt->ncell);
                 cnrn_memcpy_to_device(&(d_info->cellsize), &d_ptr, sizeof(int*));
 
             } else if (interleave_permute_type == 2) {
                 /* todo: not necessary to setup pointers, just copy it */
                 InterleaveInfo* info = interleave_info + i;
-                InterleaveInfo* d_info = (InterleaveInfo*) cnrn_gpu_copyin(info);
+                InterleaveInfo* d_info = cnrn_gpu_copyin(info);
                 int* d_ptr = nullptr;
 
-                d_ptr = (int*) cnrn_gpu_copyin(info->stride, info->nstride);
+                d_ptr = cnrn_gpu_copyin(info->stride, info->nstride);
                 cnrn_memcpy_to_device(&(d_info->stride), &d_ptr, sizeof(int*));
 
-                d_ptr = (int*) cnrn_gpu_copyin(info->firstnode, info->nwarp + 1);
+                d_ptr = cnrn_gpu_copyin(info->firstnode, info->nwarp + 1);
                 cnrn_memcpy_to_device(&(d_info->firstnode), &d_ptr, sizeof(int*));
 
-                d_ptr = (int*) cnrn_gpu_copyin(info->lastnode, info->nwarp + 1);
+                d_ptr = cnrn_gpu_copyin(info->lastnode, info->nwarp + 1);
                 cnrn_memcpy_to_device(&(d_info->lastnode), &d_ptr, sizeof(int*));
 
-                d_ptr = (int*) cnrn_gpu_copyin(info->stridedispl, info->nwarp + 1);
+                d_ptr = cnrn_gpu_copyin(info->stridedispl, info->nwarp + 1);
                 cnrn_memcpy_to_device(&(d_info->stridedispl), &d_ptr, sizeof(int*));
 
-                d_ptr = (int*) cnrn_gpu_copyin(info->cellsize, info->nwarp);
+                d_ptr = cnrn_gpu_copyin(info->cellsize, info->nwarp);
                 cnrn_memcpy_to_device(&(d_info->cellsize), &d_ptr, sizeof(int*));
             } else {
                 printf("\n ERROR: only --cell_permute = [12] implemented");
@@ -456,21 +453,18 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
             if (tr) {
                 // Create a device-side copy of the `trajec_requests` struct and
                 // make sure the device-side NrnThread object knows about it.
-                auto* d_trajec_requests = reinterpret_cast<TrajectoryRequests*>(
-                    cnrn_gpu_copyin(tr));
+                TrajectoryRequests* d_trajec_requests = cnrn_gpu_copyin(tr);
                 cnrn_memcpy_to_device(&(d_nt->trajec_requests),
                                      &d_trajec_requests,
                                      sizeof(TrajectoryRequests*));
                 // Initialise the double** gather member of the struct.
-                auto* d_tr_gather = reinterpret_cast<double**>(
-                    cnrn_gpu_copyin(tr->gather, tr->n_trajec));
+                double** d_tr_gather = cnrn_gpu_copyin(tr->gather, tr->n_trajec);
                 cnrn_memcpy_to_device(&(d_trajec_requests->gather), &d_tr_gather, sizeof(double**));
                 // Initialise the double** varrays member of the struct if it's
                 // set.
                 double** d_tr_varrays{nullptr};
                 if (tr->varrays) {
-                    d_tr_varrays = reinterpret_cast<double**>(
-                        cnrn_gpu_copyin(tr->varrays, tr->n_trajec));
+                    d_tr_varrays = cnrn_gpu_copyin(tr->varrays, tr->n_trajec);
                     cnrn_memcpy_to_device(&(d_trajec_requests->varrays),
                                          &d_tr_varrays,
                                          sizeof(double**));
@@ -480,8 +474,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
                         // tr->varrays[i] is a buffer of tr->bsize doubles on the host,
                         // make a device-side copy of it and store a pointer to it in
                         // the device-side version of tr->varrays.
-                        auto* d_buf_traj_i = reinterpret_cast<double*>(
-                            cnrn_gpu_copyin(tr->varrays[i], tr->bsize));
+                        double* d_buf_traj_i = cnrn_gpu_copyin(tr->varrays[i], tr->bsize);
                         cnrn_memcpy_to_device(&(d_tr_varrays[i]), &d_buf_traj_i, sizeof(double*));
                     }
                     // tr->gather[i] is a double* referring to (host) data in the
@@ -512,12 +505,12 @@ void copy_ivoc_vect_to_device(const IvocVect& from, IvocVect& to, bool vector_co
     /// if we need to copy IvocVect vector then newly alloated vector
     /// on the device is a new destination pointer
     if(vector_copy_needed) {
-        d_iv = (IvocVect*) cnrn_gpu_copyin((void*) &from);
+        d_iv = cnrn_gpu_copyin(const_cast<IvocVect*>(&from));
         cnrn_memcpy_to_device(&to, &d_iv, sizeof(IvocVect*));
     }
     size_t n = from.size();
     if (n) {
-        double* d_data = (double*) cnrn_gpu_copyin((void*) from.data(), n);
+        double* d_data =  cnrn_gpu_copyin(const_cast<double*>(from.data()), n);
         cnrn_memcpy_to_device(&(d_iv->data_), &d_data, sizeof(double*));
     }
 #else
@@ -577,22 +570,22 @@ void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml) {
         NetReceiveBuffer_t* d_nrb = (NetReceiveBuffer_t*) cnrn_target_deviceptr(nrb);
 
         // recopy the vectors in the buffer
-        d_pnt_index = (int*) cnrn_gpu_copyin(nrb->_pnt_index, nrb->_size);
+        d_pnt_index = cnrn_gpu_copyin(nrb->_pnt_index, nrb->_size);
         cnrn_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index, sizeof(int*));
 
-        d_weight_index = (int*) cnrn_gpu_copyin(nrb->_weight_index, nrb->_size);
+        d_weight_index = cnrn_gpu_copyin(nrb->_weight_index, nrb->_size);
         cnrn_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index, sizeof(int*));
 
-        d_nrb_t = (double*) cnrn_gpu_copyin(nrb->_nrb_t, nrb->_size);
+        d_nrb_t = cnrn_gpu_copyin(nrb->_nrb_t, nrb->_size);
         cnrn_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t, sizeof(double*));
 
-        d_nrb_flag = (double*) cnrn_gpu_copyin(nrb->_nrb_flag, nrb->_size);
+        d_nrb_flag = cnrn_gpu_copyin(nrb->_nrb_flag, nrb->_size);
         cnrn_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag, sizeof(double*));
 
-        d_displ = (int*) cnrn_gpu_copyin(nrb->_displ, nrb->_size + 1);
+        d_displ = cnrn_gpu_copyin(nrb->_displ, nrb->_size + 1);
         cnrn_memcpy_to_device(&(d_nrb->_displ), &d_displ, sizeof(int*));
 
-        d_nrb_index = (int*) cnrn_gpu_copyin(nrb->_nrb_index, nrb->_size);
+        d_nrb_index = cnrn_gpu_copyin(nrb->_nrb_index, nrb->_size);
         cnrn_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index, sizeof(int*));
     }
 #endif
@@ -1195,30 +1188,30 @@ void nrn_newtonspace_copyto_device(NewtonSpace* ns) {
 
     int n = ns->n * ns->n_instance;
     // actually, the values of double do not matter, only the  pointers.
-    NewtonSpace* d_ns = (NewtonSpace*) cnrn_gpu_copyin(ns);
+    NewtonSpace* d_ns = cnrn_gpu_copyin(ns);
 
     double* pd;
 
-    pd = (double*) cnrn_gpu_copyin(ns->delta_x, n);
+    pd = cnrn_gpu_copyin(ns->delta_x, n);
     cnrn_memcpy_to_device(&(d_ns->delta_x), &pd, sizeof(double*));
 
-    pd = (double*) cnrn_gpu_copyin(ns->high_value, n);
+    pd = cnrn_gpu_copyin(ns->high_value, n);
     cnrn_memcpy_to_device(&(d_ns->high_value), &pd, sizeof(double*));
 
-    pd = (double*) cnrn_gpu_copyin(ns->low_value, n);
+    pd = cnrn_gpu_copyin(ns->low_value, n);
     cnrn_memcpy_to_device(&(d_ns->low_value), &pd, sizeof(double*));
 
-    pd = (double*) cnrn_gpu_copyin(ns->rowmax, n);
+    pd = cnrn_gpu_copyin(ns->rowmax, n);
     cnrn_memcpy_to_device(&(d_ns->rowmax), &pd, sizeof(double*));
 
-    auto pint = (int*) cnrn_gpu_copyin(ns->perm, n);
+    auto pint = cnrn_gpu_copyin(ns->perm, n);
     cnrn_memcpy_to_device(&(d_ns->perm), &pint, sizeof(int*));
 
-    auto ppd = (double**) cnrn_gpu_copyin(ns->jacobian, ns->n);
+    auto ppd = cnrn_gpu_copyin(ns->jacobian, ns->n);
     cnrn_memcpy_to_device(&(d_ns->jacobian), &ppd, sizeof(double**));
 
     // the actual jacobian doubles were allocated as a single array
-    double* d_jacdat = (double*) cnrn_gpu_copyin(ns->jacobian[0], ns->n * n);
+    double* d_jacdat = cnrn_gpu_copyin(ns->jacobian[0], ns->n * n);
 
     for (int i = 0; i < ns->n; ++i) {
         pd = d_jacdat + i * n;
@@ -1255,33 +1248,33 @@ void nrn_sparseobj_copyto_device(SparseObj* so) {
     }
 
     unsigned n1 = so->neqn + 1;
-    SparseObj* d_so = (SparseObj*) cnrn_gpu_copyin(so);
+    SparseObj* d_so = cnrn_gpu_copyin(so);
     // only pointer fields in SparseObj that need setting up are
     //   rowst, diag, rhs, ngetcall, coef_list
     // only pointer fields in Elm that need setting up are
     //   r_down, c_right, value
     // do not care about the Elm* ptr value, just the space.
 
-    Elm** d_rowst = (Elm**) cnrn_gpu_copyin(so->rowst, n1);
+    Elm** d_rowst = cnrn_gpu_copyin(so->rowst, n1);
     cnrn_memcpy_to_device(&(d_so->rowst), &d_rowst, sizeof(Elm**));
 
-    Elm** d_diag = (Elm**) cnrn_gpu_copyin(so->diag, n1);
+    Elm** d_diag = cnrn_gpu_copyin(so->diag, n1);
     cnrn_memcpy_to_device(&(d_so->diag), &d_diag, sizeof(Elm**));
 
-    auto pu = (unsigned*) cnrn_gpu_copyin(so->ngetcall, so->_cntml_padded);
+    unsigned* pu = cnrn_gpu_copyin(so->ngetcall, so->_cntml_padded);
     cnrn_memcpy_to_device(&(d_so->ngetcall), &pu, sizeof(Elm**));
 
-    auto pd = (double*) cnrn_gpu_copyin(so->rhs, n1 * so->_cntml_padded);
+    double* pd = cnrn_gpu_copyin(so->rhs, n1 * so->_cntml_padded);
     cnrn_memcpy_to_device(&(d_so->rhs), &pd, sizeof(double*));
 
-    auto d_coef_list = (double**) cnrn_gpu_copyin(so->coef_list, so->coef_list_size);
+    double** d_coef_list = cnrn_gpu_copyin(so->coef_list, so->coef_list_size);
     cnrn_memcpy_to_device(&(d_so->coef_list), &d_coef_list, sizeof(double**));
 
     // Fill in relevant Elm pointer values
 
     for (unsigned irow = 1; irow < n1; ++irow) {
         for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) {
-            Elm* pelm = (Elm*) cnrn_gpu_copyin(elm);
+            Elm* pelm = cnrn_gpu_copyin(elm);
 
             if (elm == so->rowst[irow]) {
                 cnrn_memcpy_to_device(&(d_rowst[irow]), &pelm, sizeof(Elm*));
@@ -1301,7 +1294,7 @@ void nrn_sparseobj_copyto_device(SparseObj* so) {
                 }
             }
 
-            pd = (double*) cnrn_gpu_copyin(elm->value, so->_cntml_padded);
+            pd = cnrn_gpu_copyin(elm->value, so->_cntml_padded);
             cnrn_memcpy_to_device(&(pelm->value), &pd, sizeof(double*));
         }
     }
@@ -1356,11 +1349,11 @@ void nrn_sparseobj_delete_from_device(SparseObj* so) {
 
 void nrn_ion_global_map_copyto_device() {
     if (nrn_ion_global_map_size) {
-        double** d_data = (double**) cnrn_gpu_copyin(nrn_ion_global_map,
+        double** d_data = cnrn_gpu_copyin(nrn_ion_global_map,
                                                 nrn_ion_global_map_size);
         for (int j = 0; j < nrn_ion_global_map_size; j++) {
             if (nrn_ion_global_map[j]) {
-                double* d_mechmap = (double*) cnrn_gpu_copyin(nrn_ion_global_map[j],
+                double* d_mechmap = cnrn_gpu_copyin(nrn_ion_global_map[j],
                                                          ion_global_map_member_size);
                 cnrn_memcpy_to_device(&(d_data[j]), &d_mechmap, sizeof(double*));
             }
@@ -1429,7 +1422,7 @@ void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay) {
         VecPlayContinuous* vecplay_instance = (VecPlayContinuous*) nt->_vecplay[i];
 
         /** just VecPlayContinuous object */
-        void* d_p = (void*) cnrn_gpu_copyin(vecplay_instance);
+        void* d_p = cnrn_gpu_copyin(vecplay_instance);
         cnrn_memcpy_to_device(&(d_vecplay[i]), &d_p, sizeof(void*));
 
         VecPlayContinuous* d_vecplay_instance = (VecPlayContinuous*) d_p;
@@ -1444,7 +1437,7 @@ void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay) {
         }
 
         /** copy PlayRecordEvent : todo: verify this */
-        PlayRecordEvent* d_e_ = (PlayRecordEvent*) cnrn_gpu_copyin(vecplay_instance->e_);
+        PlayRecordEvent* d_e_ = cnrn_gpu_copyin(vecplay_instance->e_);
                                                    
         cnrn_memcpy_to_device(&(d_e_->plr_), &d_vecplay_instance, sizeof(VecPlayContinuous*));
         cnrn_memcpy_to_device(&(d_vecplay_instance->e_), &d_e_, sizeof(PlayRecordEvent*));

From f34aabf46393efe374eedd06a64c5bda5516f2d3 Mon Sep 17 00:00:00 2001
From: Nicolas Cornu <nicolas.cornu@epfl.ch>
Date: Mon, 13 Dec 2021 11:27:21 +0100
Subject: [PATCH 3/5] More template, more const

---
 coreneuron/gpu/nrn_acc_manager.cpp | 409 ++++++++++++++---------------
 1 file changed, 200 insertions(+), 209 deletions(-)

diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
index 3b076c172..0b5aa2cc6 100644
--- a/coreneuron/gpu/nrn_acc_manager.cpp
+++ b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -43,31 +43,31 @@ void nrn_ion_global_map_delete_from_device();
 void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay);
 void nrn_VecPlay_delete_from_device(NrnThread* nt);
 
-void* cnrn_target_deviceptr(void* h_ptr) {
+template <typename T>
+T* cnrn_target_deviceptr(const T* h_ptr) {
 #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
-    return acc_deviceptr(h_ptr);
+    return acc_deviceptr(static_cast<T*>(h_ptr));
 #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
-    auto device_id = omp_get_default_device();
-    return omp_get_mapped_ptr(h_ptr, device_id);
+    return static_cast<T*>(omp_get_mapped_ptr(const_cast<T*>(h_ptr), omp_get_default_device()));
 #else
-    throw std::runtime_error("cnrn_target_delete() not implemented without OpenACC/OpenMP and gpu build");
+    throw std::runtime_error("cnrn_target_deviceptr() not implemented without OpenACC/OpenMP and gpu build");
 #endif
 }
 
 template <typename T>
-T* cnrn_gpu_copyin(T* h_ptr, std::size_t len = 1) {
+T* cnrn_target_copyin(const T* h_ptr, std::size_t len = 1) {
 #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
-    return static_cast<T*>(acc_copyin(h_ptr, len * sizeof(T)));
+    return static_cast<T*>(acc_copyin(const_cast<T*>(h_ptr), len * sizeof(T)));
 #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
     #pragma omp target enter data map(to:h_ptr[:len])
-    return static_cast<T*>(cnrn_target_deviceptr(h_ptr));
+    return cnrn_target_deviceptr(const_cast<T*>(h_ptr));
 #else
-    throw std::runtime_error("cnrn_gpu_copyin() not implemented without OpenACC/OpenMP and gpu build");
+    throw std::runtime_error("cnrn_target_copyin() not implemented without OpenACC/OpenMP and gpu build");
 #endif
 }
 
 template <typename T>
-void cnrn_target_delete(T* h_ptr, size_t len = 1) {
+void cnrn_target_delete(T* h_ptr, std::size_t len = 1) {
 #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
     acc_delete(h_ptr, len * sizeof(T));
 #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
@@ -77,15 +77,14 @@ void cnrn_target_delete(T* h_ptr, size_t len = 1) {
 #endif
 }
 
-void cnrn_memcpy_to_device(void* d_ptr, void* h_ptr, size_t len) {
+template <typename T>
+void cnrn_target_memcpy_to_device(T* d_ptr, const T* h_ptr, std::size_t len = 1) {
 #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
-    acc_memcpy_to_device(d_ptr, h_ptr, len);
+    acc_memcpy_to_device(d_ptr, const_cast<T*>(h_ptr), len * sizeof(T));
 #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
-    auto host_id = omp_get_initial_device();
-    auto device_id = omp_get_default_device();
-    omp_target_memcpy(d_ptr, h_ptr, len, 0, 0, device_id, host_id);
+    omp_target_memcpy(d_ptr, const_cast<T*>(h_ptr), len* sizeof(T), 0, 0, omp_get_default_device(), omp_get_initial_device());
 #else
-    throw std::runtime_error("cnrn_memcpy_to_device() not implemented without OpenACC/OpenMP and gpu build");
+    throw std::runtime_error("cnrn_target_memcpy_to_device() not implemented without OpenACC/OpenMP and gpu build");
 #endif
 }
 
@@ -107,13 +106,13 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
         NrnThread* nt = threads + i;  // NrnThread on host
 
         if (nt->n_presyn) {
-            PreSyn* d_presyns = cnrn_gpu_copyin(nt->presyns, nt->n_presyn);
+            PreSyn* d_presyns = cnrn_target_copyin(nt->presyns, nt->n_presyn);
         }
 
         if (nt->n_vecplay) {
             /* copy VecPlayContinuous instances */
             /** just empty containers */
-            void** d_vecplay = cnrn_gpu_copyin(nt->_vecplay, nt->n_vecplay);
+            void** d_vecplay = cnrn_target_copyin(nt->_vecplay, nt->n_vecplay);
             // note: we are using unified memory for NrnThread. Once VecPlay is copied to gpu,
             // we dont want to update nt->vecplay because it will also set gpu pointer of vecplay
             // inside nt on cpu (due to unified memory).
@@ -131,7 +130,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
      * find
      * corresponding NrnThread using Point_process in NET_RECEIVE block
      */
-    NrnThread* d_threads = cnrn_gpu_copyin(threads, nthreads);
+    NrnThread* d_threads = cnrn_target_copyin(threads, nthreads);
 
     if (interleave_info == nullptr) {
         printf("\n Warning: No permutation data? Required for linear algebra!");
@@ -150,7 +149,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
         /* -- copy _data to device -- */
 
         /*copy all double data for thread */
-        d__data = cnrn_gpu_copyin(nt->_data, nt->_ndata);
+        d__data = cnrn_target_copyin(nt->_data, nt->_ndata);
 
 
         /* Here is the example of using OpenACC data enter/exit
@@ -161,7 +160,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
          */
 
         /*update d_nt._data to point to device copy */
-        cnrn_memcpy_to_device(&(d_nt->_data), &d__data, sizeof(double*));
+        cnrn_target_memcpy_to_device(&(d_nt->_data), &d__data);
 
         /* -- setup rhs, d, a, b, v, node_aread to point to device copy -- */
         double* dptr;
@@ -170,35 +169,35 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
         int ne = nrn_soa_padded_size(nt->end, 0);
 
         dptr = d__data + 0 * ne;
-        cnrn_memcpy_to_device(&(d_nt->_actual_rhs), &(dptr), sizeof(double*));
+        cnrn_target_memcpy_to_device(&(d_nt->_actual_rhs), &(dptr));
 
         dptr = d__data + 1 * ne;
-        cnrn_memcpy_to_device(&(d_nt->_actual_d), &(dptr), sizeof(double*));
+        cnrn_target_memcpy_to_device(&(d_nt->_actual_d), &(dptr));
 
         dptr = d__data + 2 * ne;
-        cnrn_memcpy_to_device(&(d_nt->_actual_a), &(dptr), sizeof(double*));
+        cnrn_target_memcpy_to_device(&(d_nt->_actual_a), &(dptr));
 
         dptr = d__data + 3 * ne;
-        cnrn_memcpy_to_device(&(d_nt->_actual_b), &(dptr), sizeof(double*));
+        cnrn_target_memcpy_to_device(&(d_nt->_actual_b), &(dptr));
 
         dptr = d__data + 4 * ne;
-        cnrn_memcpy_to_device(&(d_nt->_actual_v), &(dptr), sizeof(double*));
+        cnrn_target_memcpy_to_device(&(d_nt->_actual_v), &(dptr));
 
         dptr = d__data + 5 * ne;
-        cnrn_memcpy_to_device(&(d_nt->_actual_area), &(dptr), sizeof(double*));
+        cnrn_target_memcpy_to_device(&(d_nt->_actual_area), &(dptr));
 
         if (nt->_actual_diam) {
             dptr = d__data + 6 * ne;
-            cnrn_memcpy_to_device(&(d_nt->_actual_diam), &(dptr), sizeof(double*));
+            cnrn_target_memcpy_to_device(&(d_nt->_actual_diam), &(dptr));
         }
 
-        int* d_v_parent_index = cnrn_gpu_copyin(nt->_v_parent_index, nt->end);
-        cnrn_memcpy_to_device(&(d_nt->_v_parent_index), &(d_v_parent_index), sizeof(int*));
+        int* d_v_parent_index = cnrn_target_copyin(nt->_v_parent_index, nt->end);
+        cnrn_target_memcpy_to_device(&(d_nt->_v_parent_index), &(d_v_parent_index));
 
         /* nt._ml_list is used in NET_RECEIVE block and should have valid membrane list id*/
-        Memb_list** d_ml_list = cnrn_gpu_copyin(nt->_ml_list,
+        Memb_list** d_ml_list = cnrn_target_copyin(nt->_ml_list,
                                                          corenrn.get_memb_funcs().size());
-        cnrn_memcpy_to_device(&(d_nt->_ml_list), &(d_ml_list), sizeof(Memb_list**));
+        cnrn_target_memcpy_to_device(&(d_nt->_ml_list), &(d_ml_list));
 
         /* -- copy NrnThreadMembList list ml to device -- */
 
@@ -209,26 +208,26 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
         for (auto tml = nt->tml; tml; tml = tml->next) {
             /*copy tml to device*/
             /*QUESTIONS: does tml will point to nullptr as in host ? : I assume so!*/
-            auto d_tml = cnrn_gpu_copyin(tml);
+            auto d_tml = cnrn_target_copyin(tml);
 
             /*first tml is pointed by nt */
             if (first_tml) {
-                cnrn_memcpy_to_device(&(d_nt->tml), &d_tml, sizeof(NrnThreadMembList*));
+                cnrn_target_memcpy_to_device(&(d_nt->tml), &d_tml);
                 first_tml = false;
             } else {
                 /*rest of tml forms linked list */
-                cnrn_memcpy_to_device(&(d_last_tml->next), &d_tml, sizeof(NrnThreadMembList*));
+                cnrn_target_memcpy_to_device(&(d_last_tml->next), &d_tml);
             }
 
             // book keeping for linked-list
             d_last_tml = d_tml;
 
             /* now for every tml, there is a ml. copy that and setup pointer */
-            auto d_ml = cnrn_gpu_copyin(tml->ml);
-            cnrn_memcpy_to_device(&(d_tml->ml), &d_ml, sizeof(Memb_list*));
+            auto d_ml = cnrn_target_copyin(tml->ml);
+            cnrn_target_memcpy_to_device(&(d_tml->ml), &d_ml);
 
             /* setup nt._ml_list */
-            cnrn_memcpy_to_device(&(d_ml_list[tml->index]), &d_ml, sizeof(Memb_list*));
+            cnrn_target_memcpy_to_device(&(d_ml_list[tml->index]), &d_ml);
 
             int type = tml->index;
             int n = tml->ml->nodecount;
@@ -237,25 +236,25 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
             int is_art = corenrn.get_is_artificial()[type];
 
             // get device pointer for corresponding mechanism data
-            dptr = (double*) cnrn_target_deviceptr(tml->ml->data);
-            cnrn_memcpy_to_device(&(d_ml->data), &(dptr), sizeof(double*));
+            dptr = cnrn_target_deviceptr(tml->ml->data);
+            cnrn_target_memcpy_to_device(&(d_ml->data), &(dptr));
 
 
             if (!is_art) {
-                int* d_nodeindices = cnrn_gpu_copyin(tml->ml->nodeindices, n);
-                cnrn_memcpy_to_device(&(d_ml->nodeindices), &d_nodeindices, sizeof(int*));
+                int* d_nodeindices = cnrn_target_copyin(tml->ml->nodeindices, n);
+                cnrn_target_memcpy_to_device(&(d_ml->nodeindices), &d_nodeindices);
             }
 
             if (szdp) {
                 int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp;
-                int* d_pdata = cnrn_gpu_copyin(tml->ml->pdata, pcnt);
-                cnrn_memcpy_to_device(&(d_ml->pdata), &d_pdata, sizeof(int*));
+                int* d_pdata = cnrn_target_copyin(tml->ml->pdata, pcnt);
+                cnrn_target_memcpy_to_device(&(d_ml->pdata), &d_pdata);
             }
 
             int ts = corenrn.get_memb_funcs()[type].thread_size_;
             if (ts) {
-                ThreadDatum* td = cnrn_gpu_copyin(tml->ml->_thread, ts);
-                cnrn_memcpy_to_device(&(d_ml->_thread), &td, sizeof(ThreadDatum*));
+                ThreadDatum* td = cnrn_target_copyin(tml->ml->_thread, ts);
+                cnrn_target_memcpy_to_device(&(d_ml->_thread), &td);
             }
 
             NetReceiveBuffer_t *nrb, *d_nrb;
@@ -267,28 +266,26 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
 
             // if net receive buffer exist for mechanism
             if (nrb) {
-                d_nrb = cnrn_gpu_copyin(nrb);
-                cnrn_memcpy_to_device(&(d_ml->_net_receive_buffer),
-                                     &d_nrb,
-                                     sizeof(NetReceiveBuffer_t*));
+                d_nrb = cnrn_target_copyin(nrb);
+                cnrn_target_memcpy_to_device(&(d_ml->_net_receive_buffer), &d_nrb);
 
-                d_pnt_index = cnrn_gpu_copyin(nrb->_pnt_index, nrb->_size);
-                cnrn_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index, sizeof(int*));
+                d_pnt_index = cnrn_target_copyin(nrb->_pnt_index, nrb->_size);
+                cnrn_target_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index);
 
-                d_weight_index = cnrn_gpu_copyin(nrb->_weight_index, nrb->_size);
-                cnrn_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index, sizeof(int*));
+                d_weight_index = cnrn_target_copyin(nrb->_weight_index, nrb->_size);
+                cnrn_target_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index);
 
-                d_nrb_t = cnrn_gpu_copyin(nrb->_nrb_t, nrb->_size);
-                cnrn_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t, sizeof(double*));
+                d_nrb_t = cnrn_target_copyin(nrb->_nrb_t, nrb->_size);
+                cnrn_target_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t);
 
-                d_nrb_flag = cnrn_gpu_copyin(nrb->_nrb_flag, nrb->_size);
-                cnrn_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag, sizeof(double*));
+                d_nrb_flag = cnrn_target_copyin(nrb->_nrb_flag, nrb->_size);
+                cnrn_target_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag);
 
-                d_displ = cnrn_gpu_copyin(nrb->_displ, nrb->_size + 1);
-                cnrn_memcpy_to_device(&(d_nrb->_displ), &d_displ, sizeof(int*));
+                d_displ = cnrn_target_copyin(nrb->_displ, nrb->_size + 1);
+                cnrn_target_memcpy_to_device(&(d_nrb->_displ), &d_displ);
 
-                d_nrb_index = cnrn_gpu_copyin(nrb->_nrb_index, nrb->_size);
-                cnrn_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index, sizeof(int*));
+                d_nrb_index = cnrn_target_copyin(nrb->_nrb_index, nrb->_size);
+                cnrn_target_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index);
             }
 
             /* copy NetSendBuffer_t on to GPU */
@@ -300,26 +297,26 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
                 int* d_iptr;
                 double* d_dptr;
 
-                d_nsb = cnrn_gpu_copyin(nsb);
-                cnrn_memcpy_to_device(&(d_ml->_net_send_buffer), &d_nsb, sizeof(NetSendBuffer_t*));
+                d_nsb = cnrn_target_copyin(nsb);
+                cnrn_target_memcpy_to_device(&(d_ml->_net_send_buffer), &d_nsb);
 
-                d_iptr = cnrn_gpu_copyin(nsb->_sendtype, nsb->_size);
-                cnrn_memcpy_to_device(&(d_nsb->_sendtype), &d_iptr, sizeof(int*));
+                d_iptr = cnrn_target_copyin(nsb->_sendtype, nsb->_size);
+                cnrn_target_memcpy_to_device(&(d_nsb->_sendtype), &d_iptr);
 
-                d_iptr = cnrn_gpu_copyin(nsb->_vdata_index, nsb->_size);
-                cnrn_memcpy_to_device(&(d_nsb->_vdata_index), &d_iptr, sizeof(int*));
+                d_iptr = cnrn_target_copyin(nsb->_vdata_index, nsb->_size);
+                cnrn_target_memcpy_to_device(&(d_nsb->_vdata_index), &d_iptr);
 
-                d_iptr = cnrn_gpu_copyin(nsb->_pnt_index, nsb->_size);
-                cnrn_memcpy_to_device(&(d_nsb->_pnt_index), &d_iptr, sizeof(int*));
+                d_iptr = cnrn_target_copyin(nsb->_pnt_index, nsb->_size);
+                cnrn_target_memcpy_to_device(&(d_nsb->_pnt_index), &d_iptr);
 
-                d_iptr = cnrn_gpu_copyin(nsb->_weight_index, nsb->_size);
-                cnrn_memcpy_to_device(&(d_nsb->_weight_index), &d_iptr, sizeof(int*));
+                d_iptr = cnrn_target_copyin(nsb->_weight_index, nsb->_size);
+                cnrn_target_memcpy_to_device(&(d_nsb->_weight_index), &d_iptr);
 
-                d_dptr = cnrn_gpu_copyin(nsb->_nsb_t, nsb->_size);
-                cnrn_memcpy_to_device(&(d_nsb->_nsb_t), &d_dptr, sizeof(double*));
+                d_dptr = cnrn_target_copyin(nsb->_nsb_t, nsb->_size);
+                cnrn_target_memcpy_to_device(&(d_nsb->_nsb_t), &d_dptr);
 
-                d_dptr = cnrn_gpu_copyin(nsb->_nsb_flag, nsb->_size);
-                cnrn_memcpy_to_device(&(d_nsb->_nsb_flag), &d_dptr, sizeof(double*));
+                d_dptr = cnrn_target_copyin(nsb->_nsb_flag, nsb->_size);
+                cnrn_target_memcpy_to_device(&(d_nsb->_nsb_flag), &d_dptr);
             }
         }
 
@@ -329,25 +326,25 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
             int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0);
 
             /* copy shadow_rhs to device and fix-up the pointer */
-            d_shadow_ptr = cnrn_gpu_copyin(nt->_shadow_rhs, pcnt);
-            cnrn_memcpy_to_device(&(d_nt->_shadow_rhs), &d_shadow_ptr, sizeof(double*));
+            d_shadow_ptr = cnrn_target_copyin(nt->_shadow_rhs, pcnt);
+            cnrn_target_memcpy_to_device(&(d_nt->_shadow_rhs), &d_shadow_ptr);
 
             /* copy shadow_d to device and fix-up the pointer */
-            d_shadow_ptr = cnrn_gpu_copyin(nt->_shadow_d, pcnt);
-            cnrn_memcpy_to_device(&(d_nt->_shadow_d), &d_shadow_ptr, sizeof(double*));
+            d_shadow_ptr = cnrn_target_copyin(nt->_shadow_d, pcnt);
+            cnrn_target_memcpy_to_device(&(d_nt->_shadow_d), &d_shadow_ptr);
         }
 
         /* Fast membrane current calculation struct */
         if (nt->nrn_fast_imem) {
-            NrnFastImem* d_fast_imem = cnrn_gpu_copyin(nt->nrn_fast_imem);
-            cnrn_memcpy_to_device(&(d_nt->nrn_fast_imem), &d_fast_imem, sizeof(NrnFastImem*));
+            NrnFastImem* d_fast_imem = cnrn_target_copyin(nt->nrn_fast_imem);
+            cnrn_target_memcpy_to_device(&(d_nt->nrn_fast_imem), &d_fast_imem);
             {
-                double* d_ptr = cnrn_gpu_copyin(nt->nrn_fast_imem->nrn_sav_rhs, nt->end);
-                cnrn_memcpy_to_device(&(d_fast_imem->nrn_sav_rhs), &d_ptr, sizeof(double*));
+                double* d_ptr = cnrn_target_copyin(nt->nrn_fast_imem->nrn_sav_rhs, nt->end);
+                cnrn_target_memcpy_to_device(&(d_fast_imem->nrn_sav_rhs), &d_ptr);
             }
             {
-                double* d_ptr = cnrn_gpu_copyin(nt->nrn_fast_imem->nrn_sav_d, nt->end);
-                cnrn_memcpy_to_device(&(d_fast_imem->nrn_sav_d), &d_ptr, sizeof(double*));
+                double* d_ptr = cnrn_target_copyin(nt->nrn_fast_imem->nrn_sav_d, nt->end);
+                cnrn_target_memcpy_to_device(&(d_fast_imem->nrn_sav_d), &d_ptr);
             }
         }
 
@@ -355,21 +352,21 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
             /* copy Point_processes array and fix the pointer to execute net_receive blocks on GPU
              */
             Point_process* pntptr =
-                cnrn_gpu_copyin(nt->pntprocs, nt->n_pntproc);
-            cnrn_memcpy_to_device(&(d_nt->pntprocs), &pntptr, sizeof(Point_process*));
+                cnrn_target_copyin(nt->pntprocs, nt->n_pntproc);
+            cnrn_target_memcpy_to_device(&(d_nt->pntprocs), &pntptr);
         }
 
         if (nt->n_weight) {
             /* copy weight vector used in NET_RECEIVE which is pointed by netcon.weight */
-            double* d_weights = cnrn_gpu_copyin(nt->weights, nt->n_weight);
-            cnrn_memcpy_to_device(&(d_nt->weights), &d_weights, sizeof(double*));
+            double* d_weights = cnrn_target_copyin(nt->weights, nt->n_weight);
+            cnrn_target_memcpy_to_device(&(d_nt->weights), &d_weights);
         }
 
         if (nt->_nvdata) {
             /* copy vdata which is setup in bbcore_read. This contains cuda allocated
              * nrnran123_State * */
-            void** d_vdata = cnrn_gpu_copyin(nt->_vdata, nt->_nvdata);
-            cnrn_memcpy_to_device(&(d_nt->_vdata), &d_vdata, sizeof(void**));
+            void** d_vdata = cnrn_target_copyin(nt->_vdata, nt->_nvdata);
+            cnrn_target_memcpy_to_device(&(d_nt->_vdata), &d_vdata);
         }
 
         if (nt->n_presyn) {
@@ -379,24 +376,24 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
              * to
              * VTable and alignment */
             PreSynHelper* d_presyns_helper =
-                cnrn_gpu_copyin(nt->presyns_helper, nt->n_presyn);
-            cnrn_memcpy_to_device(&(d_nt->presyns_helper), &d_presyns_helper, sizeof(PreSynHelper*));
-            PreSyn* d_presyns = cnrn_gpu_copyin(nt->presyns, nt->n_presyn);
-            cnrn_memcpy_to_device(&(d_nt->presyns), &d_presyns, sizeof(PreSyn*));
+                cnrn_target_copyin(nt->presyns_helper, nt->n_presyn);
+            cnrn_target_memcpy_to_device(&(d_nt->presyns_helper), &d_presyns_helper);
+            PreSyn* d_presyns = cnrn_target_copyin(nt->presyns, nt->n_presyn);
+            cnrn_target_memcpy_to_device(&(d_nt->presyns), &d_presyns);
         }
 
         if (nt->_net_send_buffer_size) {
             /* copy send_receive buffer */
-            int* d_net_send_buffer = cnrn_gpu_copyin(nt->_net_send_buffer,
+            int* d_net_send_buffer = cnrn_target_copyin(nt->_net_send_buffer,
                     nt->_net_send_buffer_size);
-            cnrn_memcpy_to_device(&(d_nt->_net_send_buffer), &d_net_send_buffer, sizeof(int*));
+            cnrn_target_memcpy_to_device(&(d_nt->_net_send_buffer), &d_net_send_buffer);
         }
 
         if (nt->n_vecplay) {
             /* copy VecPlayContinuous instances */
             /** just empty containers */
-            void** d_vecplay = cnrn_gpu_copyin(nt->_vecplay, nt->n_vecplay);
-            cnrn_memcpy_to_device(&(d_nt->_vecplay), &d_vecplay, sizeof(void**));
+            void** d_vecplay = cnrn_target_copyin(nt->_vecplay, nt->n_vecplay);
+            cnrn_target_memcpy_to_device(&(d_nt->_vecplay), &d_vecplay);
 
             nrn_VecPlay_copyto_device(nt, d_vecplay);
         }
@@ -406,40 +403,40 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
                 /* todo: not necessary to setup pointers, just copy it */
                 InterleaveInfo* info = interleave_info + i;
                 int* d_ptr = nullptr;
-                InterleaveInfo* d_info = cnrn_gpu_copyin(info);
+                InterleaveInfo* d_info = cnrn_target_copyin(info);
 
-                d_ptr = cnrn_gpu_copyin(info->stride, info->nstride + 1);
-                cnrn_memcpy_to_device(&(d_info->stride), &d_ptr, sizeof(int*));
+                d_ptr = cnrn_target_copyin(info->stride, info->nstride + 1);
+                cnrn_target_memcpy_to_device(&(d_info->stride), &d_ptr);
 
-                d_ptr = cnrn_gpu_copyin(info->firstnode, nt->ncell);
-                cnrn_memcpy_to_device(&(d_info->firstnode), &d_ptr, sizeof(int*));
+                d_ptr = cnrn_target_copyin(info->firstnode, nt->ncell);
+                cnrn_target_memcpy_to_device(&(d_info->firstnode), &d_ptr);
 
-                d_ptr = cnrn_gpu_copyin(info->lastnode, nt->ncell);
-                cnrn_memcpy_to_device(&(d_info->lastnode), &d_ptr, sizeof(int*));
+                d_ptr = cnrn_target_copyin(info->lastnode, nt->ncell);
+                cnrn_target_memcpy_to_device(&(d_info->lastnode), &d_ptr);
 
-                d_ptr = cnrn_gpu_copyin(info->cellsize, nt->ncell);
-                cnrn_memcpy_to_device(&(d_info->cellsize), &d_ptr, sizeof(int*));
+                d_ptr = cnrn_target_copyin(info->cellsize, nt->ncell);
+                cnrn_target_memcpy_to_device(&(d_info->cellsize), &d_ptr);
 
             } else if (interleave_permute_type == 2) {
                 /* todo: not necessary to setup pointers, just copy it */
                 InterleaveInfo* info = interleave_info + i;
-                InterleaveInfo* d_info = cnrn_gpu_copyin(info);
+                InterleaveInfo* d_info = cnrn_target_copyin(info);
                 int* d_ptr = nullptr;
 
-                d_ptr = cnrn_gpu_copyin(info->stride, info->nstride);
-                cnrn_memcpy_to_device(&(d_info->stride), &d_ptr, sizeof(int*));
+                d_ptr = cnrn_target_copyin(info->stride, info->nstride);
+                cnrn_target_memcpy_to_device(&(d_info->stride), &d_ptr);
 
-                d_ptr = cnrn_gpu_copyin(info->firstnode, info->nwarp + 1);
-                cnrn_memcpy_to_device(&(d_info->firstnode), &d_ptr, sizeof(int*));
+                d_ptr = cnrn_target_copyin(info->firstnode, info->nwarp + 1);
+                cnrn_target_memcpy_to_device(&(d_info->firstnode), &d_ptr);
 
-                d_ptr = cnrn_gpu_copyin(info->lastnode, info->nwarp + 1);
-                cnrn_memcpy_to_device(&(d_info->lastnode), &d_ptr, sizeof(int*));
+                d_ptr = cnrn_target_copyin(info->lastnode, info->nwarp + 1);
+                cnrn_target_memcpy_to_device(&(d_info->lastnode), &d_ptr);
 
-                d_ptr = cnrn_gpu_copyin(info->stridedispl, info->nwarp + 1);
-                cnrn_memcpy_to_device(&(d_info->stridedispl), &d_ptr, sizeof(int*));
+                d_ptr = cnrn_target_copyin(info->stridedispl, info->nwarp + 1);
+                cnrn_target_memcpy_to_device(&(d_info->stridedispl), &d_ptr);
 
-                d_ptr = cnrn_gpu_copyin(info->cellsize, info->nwarp);
-                cnrn_memcpy_to_device(&(d_info->cellsize), &d_ptr, sizeof(int*));
+                d_ptr = cnrn_target_copyin(info->cellsize, info->nwarp);
+                cnrn_target_memcpy_to_device(&(d_info->cellsize), &d_ptr);
             } else {
                 printf("\n ERROR: only --cell_permute = [12] implemented");
                 abort();
@@ -453,34 +450,30 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
             if (tr) {
                 // Create a device-side copy of the `trajec_requests` struct and
                 // make sure the device-side NrnThread object knows about it.
-                TrajectoryRequests* d_trajec_requests = cnrn_gpu_copyin(tr);
-                cnrn_memcpy_to_device(&(d_nt->trajec_requests),
-                                     &d_trajec_requests,
-                                     sizeof(TrajectoryRequests*));
+                TrajectoryRequests* d_trajec_requests = cnrn_target_copyin(tr);
+                cnrn_target_memcpy_to_device(&(d_nt->trajec_requests), &d_trajec_requests);
                 // Initialise the double** gather member of the struct.
-                double** d_tr_gather = cnrn_gpu_copyin(tr->gather, tr->n_trajec);
-                cnrn_memcpy_to_device(&(d_trajec_requests->gather), &d_tr_gather, sizeof(double**));
+                double** d_tr_gather = cnrn_target_copyin(tr->gather, tr->n_trajec);
+                cnrn_target_memcpy_to_device(&(d_trajec_requests->gather), &d_tr_gather);
                 // Initialise the double** varrays member of the struct if it's
                 // set.
                 double** d_tr_varrays{nullptr};
                 if (tr->varrays) {
-                    d_tr_varrays = cnrn_gpu_copyin(tr->varrays, tr->n_trajec);
-                    cnrn_memcpy_to_device(&(d_trajec_requests->varrays),
-                                         &d_tr_varrays,
-                                         sizeof(double**));
+                    d_tr_varrays = cnrn_target_copyin(tr->varrays, tr->n_trajec);
+                    cnrn_target_memcpy_to_device(&(d_trajec_requests->varrays), &d_tr_varrays);
                 }
                 for (int i = 0; i < tr->n_trajec; ++i) {
                     if (tr->varrays) {
                         // tr->varrays[i] is a buffer of tr->bsize doubles on the host,
                         // make a device-side copy of it and store a pointer to it in
                         // the device-side version of tr->varrays.
-                        double* d_buf_traj_i = cnrn_gpu_copyin(tr->varrays[i], tr->bsize);
-                        cnrn_memcpy_to_device(&(d_tr_varrays[i]), &d_buf_traj_i, sizeof(double*));
+                        double* d_buf_traj_i = cnrn_target_copyin(tr->varrays[i], tr->bsize);
+                        cnrn_target_memcpy_to_device(&(d_tr_varrays[i]), &d_buf_traj_i);
                     }
                     // tr->gather[i] is a double* referring to (host) data in the
                     // (host) _data block
                     auto* d_gather_i = cnrn_target_deviceptr(tr->gather[i]);
-                    cnrn_memcpy_to_device(&(d_tr_gather[i]), &d_gather_i, sizeof(double*));
+                    cnrn_target_memcpy_to_device(&(d_tr_gather[i]), &d_gather_i);
                 }
                 // TODO: other `double** scatter` and `void** vpr` members of
                 // the TrajectoryRequests struct are not copied to the device.
@@ -505,13 +498,13 @@ void copy_ivoc_vect_to_device(const IvocVect& from, IvocVect& to, bool vector_co
     /// if we need to copy IvocVect vector then newly alloated vector
     /// on the device is a new destination pointer
     if(vector_copy_needed) {
-        d_iv = cnrn_gpu_copyin(const_cast<IvocVect*>(&from));
-        cnrn_memcpy_to_device(&to, &d_iv, sizeof(IvocVect*));
+        d_iv = cnrn_target_copyin(&from);
+        cnrn_target_memcpy_to_device(&to, d_iv);
     }
     size_t n = from.size();
     if (n) {
-        double* d_data =  cnrn_gpu_copyin(const_cast<double*>(from.data()), n);
-        cnrn_memcpy_to_device(&(d_iv->data_), &d_data, sizeof(double*));
+        double* d_data =  cnrn_target_copyin(from.data(), n);
+        cnrn_target_memcpy_to_device(&(d_iv->data_), &d_data);
     }
 #else
     (void) from;
@@ -567,26 +560,26 @@ void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml) {
         nrn_pragma_acc(update device(nrb));
         nrn_pragma_omp(target update to(nrb));
 
-        NetReceiveBuffer_t* d_nrb = (NetReceiveBuffer_t*) cnrn_target_deviceptr(nrb);
+        NetReceiveBuffer_t* d_nrb = cnrn_target_deviceptr(nrb);
 
         // recopy the vectors in the buffer
-        d_pnt_index = cnrn_gpu_copyin(nrb->_pnt_index, nrb->_size);
-        cnrn_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index, sizeof(int*));
+        d_pnt_index = cnrn_target_copyin(nrb->_pnt_index, nrb->_size);
+        cnrn_target_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index);
 
-        d_weight_index = cnrn_gpu_copyin(nrb->_weight_index, nrb->_size);
-        cnrn_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index, sizeof(int*));
+        d_weight_index = cnrn_target_copyin(nrb->_weight_index, nrb->_size);
+        cnrn_target_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index);
 
-        d_nrb_t = cnrn_gpu_copyin(nrb->_nrb_t, nrb->_size);
-        cnrn_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t, sizeof(double*));
+        d_nrb_t = cnrn_target_copyin(nrb->_nrb_t, nrb->_size);
+        cnrn_target_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t);
 
-        d_nrb_flag = cnrn_gpu_copyin(nrb->_nrb_flag, nrb->_size);
-        cnrn_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag, sizeof(double*));
+        d_nrb_flag = cnrn_target_copyin(nrb->_nrb_flag, nrb->_size);
+        cnrn_target_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag);
 
-        d_displ = cnrn_gpu_copyin(nrb->_displ, nrb->_size + 1);
-        cnrn_memcpy_to_device(&(d_nrb->_displ), &d_displ, sizeof(int*));
+        d_displ = cnrn_target_copyin(nrb->_displ, nrb->_size + 1);
+        cnrn_target_memcpy_to_device(&(d_nrb->_displ), &d_displ);
 
-        d_nrb_index = cnrn_gpu_copyin(nrb->_nrb_index, nrb->_size);
-        cnrn_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index, sizeof(int*));
+        d_nrb_index = cnrn_target_copyin(nrb->_nrb_index, nrb->_size);
+        cnrn_target_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index);
     }
 #endif
 }
@@ -1023,21 +1016,21 @@ void update_weights_from_gpu(NrnThread* threads, int nthreads) {
 /** Cleanup device memory that is being tracked by the OpenACC runtime.
  *
  *  This function painstakingly calls `cnrn_target_delete` in reverse order on all
- *  pointers that were passed to `cnrn_gpu_copyin` in `setup_nrnthreads_on_device`.
+ *  pointers that were passed to `cnrn_target_copyin` in `setup_nrnthreads_on_device`.
  *  This cleanup ensures that if the GPU is initialised multiple times from the
  *  same process then the OpenACC runtime will not be polluted with old
  *  pointers, which can cause errors. In particular if we do:
  *  @code
  *    {
  *      // ... some_ptr is dynamically allocated ...
- *      cnrn_gpu_copyin(some_ptr, some_size);
+ *      cnrn_target_copyin(some_ptr, some_size);
  *      // ... do some work ...
  *      // cnrn_target_delete(some_ptr);
  *      free(some_ptr);
  *    }
  *    {
  *      // ... same_ptr_again is dynamically allocated at the same address ...
- *      cnrn_gpu_copyin(same_ptr_again, some_other_size); // ERROR
+ *      cnrn_target_copyin(same_ptr_again, some_other_size); // ERROR
  *    }
  *  @endcode
  *  the application will/may abort with an error such as:
@@ -1188,34 +1181,34 @@ void nrn_newtonspace_copyto_device(NewtonSpace* ns) {
 
     int n = ns->n * ns->n_instance;
     // actually, the values of double do not matter, only the  pointers.
-    NewtonSpace* d_ns = cnrn_gpu_copyin(ns);
+    NewtonSpace* d_ns = cnrn_target_copyin(ns);
 
     double* pd;
 
-    pd = cnrn_gpu_copyin(ns->delta_x, n);
-    cnrn_memcpy_to_device(&(d_ns->delta_x), &pd, sizeof(double*));
+    pd = cnrn_target_copyin(ns->delta_x, n);
+    cnrn_target_memcpy_to_device(&(d_ns->delta_x), &pd);
 
-    pd = cnrn_gpu_copyin(ns->high_value, n);
-    cnrn_memcpy_to_device(&(d_ns->high_value), &pd, sizeof(double*));
+    pd = cnrn_target_copyin(ns->high_value, n);
+    cnrn_target_memcpy_to_device(&(d_ns->high_value), &pd);
 
-    pd = cnrn_gpu_copyin(ns->low_value, n);
-    cnrn_memcpy_to_device(&(d_ns->low_value), &pd, sizeof(double*));
+    pd = cnrn_target_copyin(ns->low_value, n);
+    cnrn_target_memcpy_to_device(&(d_ns->low_value), &pd);
 
-    pd = cnrn_gpu_copyin(ns->rowmax, n);
-    cnrn_memcpy_to_device(&(d_ns->rowmax), &pd, sizeof(double*));
+    pd = cnrn_target_copyin(ns->rowmax, n);
+    cnrn_target_memcpy_to_device(&(d_ns->rowmax), &pd);
 
-    auto pint = cnrn_gpu_copyin(ns->perm, n);
-    cnrn_memcpy_to_device(&(d_ns->perm), &pint, sizeof(int*));
+    auto pint = cnrn_target_copyin(ns->perm, n);
+    cnrn_target_memcpy_to_device(&(d_ns->perm), &pint);
 
-    auto ppd = cnrn_gpu_copyin(ns->jacobian, ns->n);
-    cnrn_memcpy_to_device(&(d_ns->jacobian), &ppd, sizeof(double**));
+    auto ppd = cnrn_target_copyin(ns->jacobian, ns->n);
+    cnrn_target_memcpy_to_device(&(d_ns->jacobian), &ppd);
 
     // the actual jacobian doubles were allocated as a single array
-    double* d_jacdat = cnrn_gpu_copyin(ns->jacobian[0], ns->n * n);
+    double* d_jacdat = cnrn_target_copyin(ns->jacobian[0], ns->n * n);
 
     for (int i = 0; i < ns->n; ++i) {
         pd = d_jacdat + i * n;
-        cnrn_memcpy_to_device(&(ppd[i]), &pd, sizeof(double*));
+        cnrn_target_memcpy_to_device(&(ppd[i]), &pd);
     }
 #endif
 }
@@ -1248,76 +1241,76 @@ void nrn_sparseobj_copyto_device(SparseObj* so) {
     }
 
     unsigned n1 = so->neqn + 1;
-    SparseObj* d_so = cnrn_gpu_copyin(so);
+    SparseObj* d_so = cnrn_target_copyin(so);
     // only pointer fields in SparseObj that need setting up are
     //   rowst, diag, rhs, ngetcall, coef_list
     // only pointer fields in Elm that need setting up are
     //   r_down, c_right, value
     // do not care about the Elm* ptr value, just the space.
 
-    Elm** d_rowst = cnrn_gpu_copyin(so->rowst, n1);
-    cnrn_memcpy_to_device(&(d_so->rowst), &d_rowst, sizeof(Elm**));
+    Elm** d_rowst = cnrn_target_copyin(so->rowst, n1);
+    cnrn_target_memcpy_to_device(&(d_so->rowst), &d_rowst);
 
-    Elm** d_diag = cnrn_gpu_copyin(so->diag, n1);
-    cnrn_memcpy_to_device(&(d_so->diag), &d_diag, sizeof(Elm**));
+    Elm** d_diag = cnrn_target_copyin(so->diag, n1);
+    cnrn_target_memcpy_to_device(&(d_so->diag), &d_diag);
 
-    unsigned* pu = cnrn_gpu_copyin(so->ngetcall, so->_cntml_padded);
-    cnrn_memcpy_to_device(&(d_so->ngetcall), &pu, sizeof(Elm**));
+    unsigned* pu = cnrn_target_copyin(so->ngetcall, so->_cntml_padded);
+    cnrn_target_memcpy_to_device(&(d_so->ngetcall), &pu);
 
-    double* pd = cnrn_gpu_copyin(so->rhs, n1 * so->_cntml_padded);
-    cnrn_memcpy_to_device(&(d_so->rhs), &pd, sizeof(double*));
+    double* pd = cnrn_target_copyin(so->rhs, n1 * so->_cntml_padded);
+    cnrn_target_memcpy_to_device(&(d_so->rhs), &pd);
 
-    double** d_coef_list = cnrn_gpu_copyin(so->coef_list, so->coef_list_size);
-    cnrn_memcpy_to_device(&(d_so->coef_list), &d_coef_list, sizeof(double**));
+    double** d_coef_list = cnrn_target_copyin(so->coef_list, so->coef_list_size);
+    cnrn_target_memcpy_to_device(&(d_so->coef_list), &d_coef_list);
 
     // Fill in relevant Elm pointer values
 
     for (unsigned irow = 1; irow < n1; ++irow) {
         for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) {
-            Elm* pelm = cnrn_gpu_copyin(elm);
+            Elm* pelm = cnrn_target_copyin(elm);
 
             if (elm == so->rowst[irow]) {
-                cnrn_memcpy_to_device(&(d_rowst[irow]), &pelm, sizeof(Elm*));
+                cnrn_target_memcpy_to_device(&(d_rowst[irow]), &pelm);
             } else {
-                Elm* d_e = (Elm*) cnrn_target_deviceptr(elm->c_left);
-                cnrn_memcpy_to_device(&(pelm->c_left), &d_e, sizeof(Elm*));
+                Elm* d_e = cnrn_target_deviceptr(elm->c_left);
+                cnrn_target_memcpy_to_device(&(pelm->c_left), &d_e);
             }
 
             if (elm->col == elm->row) {
-                cnrn_memcpy_to_device(&(d_diag[irow]), &pelm, sizeof(Elm*));
+                cnrn_target_memcpy_to_device(&(d_diag[irow]), &pelm);
             }
 
             if (irow > 1) {
                 if (elm->r_up) {
-                    Elm* d_e = (Elm*) cnrn_target_deviceptr(elm->r_up);
-                    cnrn_memcpy_to_device(&(pelm->r_up), &d_e, sizeof(Elm*));
+                    Elm* d_e = cnrn_target_deviceptr(elm->r_up);
+                    cnrn_target_memcpy_to_device(&(pelm->r_up), &d_e);
                 }
             }
 
-            pd = cnrn_gpu_copyin(elm->value, so->_cntml_padded);
-            cnrn_memcpy_to_device(&(pelm->value), &pd, sizeof(double*));
+            pd = cnrn_target_copyin(elm->value, so->_cntml_padded);
+            cnrn_target_memcpy_to_device(&(pelm->value), &pd);
         }
     }
 
     // visit all the Elm again and fill in pelm->r_down and pelm->c_left
     for (unsigned irow = 1; irow < n1; ++irow) {
         for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) {
-            auto pelm = (Elm*) cnrn_target_deviceptr(elm);
+            auto pelm = cnrn_target_deviceptr(elm);
             if (elm->r_down) {
-                auto d_e = (Elm*) cnrn_target_deviceptr(elm->r_down);
-                cnrn_memcpy_to_device(&(pelm->r_down), &d_e, sizeof(Elm*));
+                auto d_e = cnrn_target_deviceptr(elm->r_down);
+                cnrn_target_memcpy_to_device(&(pelm->r_down), &d_e);
             }
             if (elm->c_right) {
-                auto d_e = (Elm*) cnrn_target_deviceptr(elm->c_right);
-                cnrn_memcpy_to_device(&(pelm->c_right), &d_e, sizeof(Elm*));
+                auto d_e = cnrn_target_deviceptr(elm->c_right);
+                cnrn_target_memcpy_to_device(&(pelm->c_right), &d_e);
             }
         }
     }
 
     // Fill in the d_so->coef_list
     for (unsigned i = 0; i < so->coef_list_size; ++i) {
-        pd = (double*) cnrn_target_deviceptr(so->coef_list[i]);
-        cnrn_memcpy_to_device(&(d_coef_list[i]), &pd, sizeof(double*));
+        pd = cnrn_target_deviceptr(so->coef_list[i]);
+        cnrn_target_memcpy_to_device(&(d_coef_list[i]), &pd);
     }
 #endif
 }
@@ -1349,13 +1342,13 @@ void nrn_sparseobj_delete_from_device(SparseObj* so) {
 
 void nrn_ion_global_map_copyto_device() {
     if (nrn_ion_global_map_size) {
-        double** d_data = cnrn_gpu_copyin(nrn_ion_global_map,
-                                                nrn_ion_global_map_size);
+        double** d_data = cnrn_target_copyin(nrn_ion_global_map,
+                                             nrn_ion_global_map_size);
         for (int j = 0; j < nrn_ion_global_map_size; j++) {
             if (nrn_ion_global_map[j]) {
-                double* d_mechmap = cnrn_gpu_copyin(nrn_ion_global_map[j],
-                                                         ion_global_map_member_size);
-                cnrn_memcpy_to_device(&(d_data[j]), &d_mechmap, sizeof(double*));
+                double* d_mechmap = cnrn_target_copyin(nrn_ion_global_map[j],
+                                                       ion_global_map_member_size);
+                cnrn_target_memcpy_to_device(&(d_data[j]), &d_mechmap);
             }
         }
     }
@@ -1422,10 +1415,8 @@ void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay) {
         VecPlayContinuous* vecplay_instance = (VecPlayContinuous*) nt->_vecplay[i];
 
         /** just VecPlayContinuous object */
-        void* d_p = cnrn_gpu_copyin(vecplay_instance);
-        cnrn_memcpy_to_device(&(d_vecplay[i]), &d_p, sizeof(void*));
-
-        VecPlayContinuous* d_vecplay_instance = (VecPlayContinuous*) d_p;
+        VecPlayContinuous* d_vecplay_instance = cnrn_target_copyin(vecplay_instance);
+        cnrn_target_memcpy_to_device((VecPlayContinuous**)(&(d_vecplay[i])), &d_vecplay_instance);
 
         /** copy y_, t_ and discon_indices_ */
         copy_ivoc_vect_to_device(vecplay_instance->y_, d_vecplay_instance->y_);
@@ -1437,15 +1428,15 @@ void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay) {
         }
 
         /** copy PlayRecordEvent : todo: verify this */
-        PlayRecordEvent* d_e_ = cnrn_gpu_copyin(vecplay_instance->e_);
+        PlayRecordEvent* d_e_ = cnrn_target_copyin(vecplay_instance->e_);
                                                    
-        cnrn_memcpy_to_device(&(d_e_->plr_), &d_vecplay_instance, sizeof(VecPlayContinuous*));
-        cnrn_memcpy_to_device(&(d_vecplay_instance->e_), &d_e_, sizeof(PlayRecordEvent*));
+        cnrn_target_memcpy_to_device(&(d_e_->plr_), (PlayRecord**)(&d_vecplay_instance));
+        cnrn_target_memcpy_to_device(&(d_vecplay_instance->e_), &d_e_);
 
         /** copy pd_ : note that it's pointer inside ml->data and hence data itself is
          * already on GPU */
-        double* d_pd_ = (double*) cnrn_target_deviceptr(vecplay_instance->pd_);
-        cnrn_memcpy_to_device(&(d_vecplay_instance->pd_), &d_pd_, sizeof(double*));
+        double* d_pd_ = cnrn_target_deviceptr(vecplay_instance->pd_);
+        cnrn_target_memcpy_to_device(&(d_vecplay_instance->pd_), &d_pd_);
     }
 }
 

From 4b9af99321cdcacae4154cb1a366e92fca25b02a Mon Sep 17 00:00:00 2001
From: Nicolas Cornu <nicolas.cornu@epfl.ch>
Date: Mon, 13 Dec 2021 12:39:32 +0100
Subject: [PATCH 4/5] Fix static_cast to const_cast

---
 coreneuron/gpu/nrn_acc_manager.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
index 0b5aa2cc6..8ed02d3e7 100644
--- a/coreneuron/gpu/nrn_acc_manager.cpp
+++ b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -46,7 +46,7 @@ void nrn_VecPlay_delete_from_device(NrnThread* nt);
 template <typename T>
 T* cnrn_target_deviceptr(const T* h_ptr) {
 #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
-    return acc_deviceptr(static_cast<T*>(h_ptr));
+    return acc_deviceptr(const_cast<T*>(h_ptr));
 #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
     return static_cast<T*>(omp_get_mapped_ptr(const_cast<T*>(h_ptr), omp_get_default_device()));
 #else

From b1ac85392bd36ae282b03cd46be3590d177d2e8c Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Mon, 13 Dec 2021 15:11:03 +0100
Subject: [PATCH 5/5] Fix VecPlayContinuous::discon_indices_ and OpenACC
 compilation.

---
 coreneuron/gpu/nrn_acc_manager.cpp | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
index 8ed02d3e7..4fe0004fd 100644
--- a/coreneuron/gpu/nrn_acc_manager.cpp
+++ b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -36,7 +36,7 @@
 #endif
 namespace coreneuron {
 extern InterleaveInfo* interleave_info;
-void copy_ivoc_vect_to_device(const IvocVect& iv, IvocVect& div, bool vector_copy_needed = false);
+void copy_ivoc_vect_to_device(const IvocVect& iv, IvocVect& div);
 void delete_ivoc_vect_from_device(IvocVect&);
 void nrn_ion_global_map_copyto_device();
 void nrn_ion_global_map_delete_from_device();
@@ -46,7 +46,7 @@ void nrn_VecPlay_delete_from_device(NrnThread* nt);
 template <typename T>
 T* cnrn_target_deviceptr(const T* h_ptr) {
 #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
-    return acc_deviceptr(const_cast<T*>(h_ptr));
+    return static_cast<T*>(acc_deviceptr(const_cast<T*>(h_ptr)));
 #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
     return static_cast<T*>(omp_get_mapped_ptr(const_cast<T*>(h_ptr), omp_get_default_device()));
 #else
@@ -490,17 +490,11 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
 #endif
 }
 
-void copy_ivoc_vect_to_device(const IvocVect& from, IvocVect& to, bool vector_copy_needed) {
+void copy_ivoc_vect_to_device(const IvocVect& from, IvocVect& to) {
 #ifdef _OPENACC
     /// by default `to` is desitionation pointer on a device
     IvocVect* d_iv = &to;
 
-    /// if we need to copy IvocVect vector then newly alloated vector
-    /// on the device is a new destination pointer
-    if(vector_copy_needed) {
-        d_iv = cnrn_target_copyin(&from);
-        cnrn_target_memcpy_to_device(&to, d_iv);
-    }
     size_t n = from.size();
     if (n) {
         double* d_data =  cnrn_target_copyin(from.data(), n);
@@ -1416,21 +1410,24 @@ void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay) {
 
         /** just VecPlayContinuous object */
         VecPlayContinuous* d_vecplay_instance = cnrn_target_copyin(vecplay_instance);
-        cnrn_target_memcpy_to_device((VecPlayContinuous**)(&(d_vecplay[i])), &d_vecplay_instance);
+        cnrn_target_memcpy_to_device((VecPlayContinuous**) (&(d_vecplay[i])), &d_vecplay_instance);
 
         /** copy y_, t_ and discon_indices_ */
         copy_ivoc_vect_to_device(vecplay_instance->y_, d_vecplay_instance->y_);
         copy_ivoc_vect_to_device(vecplay_instance->t_, d_vecplay_instance->t_);
+        // OL211213: beware, the test suite does not currently include anything
+        // with a non-null discon_indices_.
         if (vecplay_instance->discon_indices_) {
+            IvocVect* d_discon_indices = cnrn_target_copyin(vecplay_instance->discon_indices_);
+            cnrn_target_memcpy_to_device(&(d_vecplay_instance->discon_indices_), &d_discon_indices);
             copy_ivoc_vect_to_device(*(vecplay_instance->discon_indices_),
-                                     *(d_vecplay_instance->discon_indices_),
-                                     true);
+                                     *(d_vecplay_instance->discon_indices_));
         }
 
         /** copy PlayRecordEvent : todo: verify this */
         PlayRecordEvent* d_e_ = cnrn_target_copyin(vecplay_instance->e_);
-                                                   
-        cnrn_target_memcpy_to_device(&(d_e_->plr_), (PlayRecord**)(&d_vecplay_instance));
+
+        cnrn_target_memcpy_to_device(&(d_e_->plr_), (PlayRecord**) (&d_vecplay_instance));
         cnrn_target_memcpy_to_device(&(d_vecplay_instance->e_), &d_e_);
 
         /** copy pd_ : note that it's pointer inside ml->data and hence data itself is