From 0c42f406d0a3c5af9f6aa74b4f97a58138a09611 Mon Sep 17 00:00:00 2001 From: Nicolas Cornu Date: Fri, 10 Dec 2021 14:23:40 +0100 Subject: [PATCH 1/5] Make a try with pragmas --- coreneuron/gpu/nrn_acc_manager.cpp | 366 ++++++++++++++--------------- 1 file changed, 178 insertions(+), 188 deletions(-) diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp index 373fcdbc3..d8eded736 100644 --- a/coreneuron/gpu/nrn_acc_manager.cpp +++ b/coreneuron/gpu/nrn_acc_manager.cpp @@ -43,56 +43,49 @@ void nrn_ion_global_map_delete_from_device(); void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay); void nrn_VecPlay_delete_from_device(NrnThread* nt); -void* cnrn_gpu_copyin(void* h_ptr, std::size_t len) { +void* cnrn_target_deviceptr(void* h_ptr) { #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC) - return acc_copyin(h_ptr, len); + return acc_deviceptr(h_ptr); #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) - auto host_id = omp_get_initial_device(); auto device_id = omp_get_default_device(); - auto* d_ptr = omp_target_alloc(len, device_id); - nrn_assert(d_ptr != nullptr); - nrn_assert(omp_target_memcpy(d_ptr, h_ptr, len, 0, 0, device_id, host_id) == 0); - nrn_assert(omp_target_associate_ptr(h_ptr, d_ptr, len, 0, device_id) == 0); - return d_ptr; + return omp_get_mapped_ptr(h_ptr, device_id); #else - throw std::runtime_error("cnrn_gpu_copyin() not implemented without OpenACC/OpenMP and gpu build"); + throw std::runtime_error("cnrn_target_delete() not implemented without OpenACC/OpenMP and gpu build"); #endif } -void cnrn_memcpy_to_device(void* d_ptr, void* h_ptr, size_t len) { +template +void* cnrn_gpu_copyin(T* h_ptr, std::size_t len = 1) { #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC) - acc_memcpy_to_device(d_ptr, h_ptr, len); + return acc_copyin(h_ptr, len * sizeof(T)); #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) - auto host_id = omp_get_initial_device(); - auto device_id = omp_get_default_device(); - omp_target_memcpy(d_ptr, h_ptr, len, 0, 0, device_id, host_id); + #pragma omp target enter data map(to:h_ptr[:len]) + return cnrn_target_deviceptr(h_ptr); #else - throw std::runtime_error("cnrn_memcpy_to_device() not implemented without OpenACC/OpenMP and gpu build"); + throw std::runtime_error("cnrn_gpu_copyin() not implemented without OpenACC/OpenMP and gpu build"); #endif } -void cnrn_target_delete(void* h_ptr, size_t len) { +template +void cnrn_target_delete(T* h_ptr, size_t len = 1) { #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC) - acc_delete(h_ptr, len); + acc_delete(h_ptr, len * sizeof(T)); #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) - (void)len; - auto device_id = omp_get_default_device(); - omp_target_disassociate_ptr(h_ptr, device_id); - auto* d_ptr = omp_get_mapped_ptr(h_ptr, device_id); - omp_target_free(d_ptr, device_id); + #pragma omp target exit data map(delete: h_ptr[:len]) #else throw std::runtime_error("cnrn_target_delete() not implemented without OpenACC/OpenMP and gpu build"); #endif } -void* cnrn_target_deviceptr(void* h_ptr) { +void cnrn_memcpy_to_device(void* d_ptr, void* h_ptr, size_t len) { #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC) - return acc_deviceptr(h_ptr); + acc_memcpy_to_device(d_ptr, h_ptr, len); #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) + auto host_id = omp_get_initial_device(); auto device_id = omp_get_default_device(); - return omp_get_mapped_ptr(h_ptr, device_id); + omp_target_memcpy(d_ptr, h_ptr, len, 0, 0, device_id, host_id); #else - throw std::runtime_error("cnrn_target_delete() not implemented without OpenACC/OpenMP and gpu build"); + throw std::runtime_error("cnrn_memcpy_to_device() not implemented without OpenACC/OpenMP and gpu build"); #endif } @@ -114,13 +107,13 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { NrnThread* nt = threads + i; // NrnThread on host if (nt->n_presyn) { - PreSyn* d_presyns = (PreSyn*) cnrn_gpu_copyin(nt->presyns, sizeof(PreSyn) * nt->n_presyn); + PreSyn* d_presyns = (PreSyn*) cnrn_gpu_copyin(nt->presyns, nt->n_presyn); } if (nt->n_vecplay) { /* copy VecPlayContinuous instances */ /** just empty containers */ - void** d_vecplay = (void**) cnrn_gpu_copyin(nt->_vecplay, sizeof(void*) * nt->n_vecplay); + void** d_vecplay = (void**) cnrn_gpu_copyin(nt->_vecplay, nt->n_vecplay); // note: we are using unified memory for NrnThread. Once VecPlay is copied to gpu, // we dont want to update nt->vecplay because it will also set gpu pointer of vecplay // inside nt on cpu (due to unified memory). @@ -138,7 +131,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { * find * corresponding NrnThread using Point_process in NET_RECEIVE block */ - NrnThread* d_threads = (NrnThread*) cnrn_gpu_copyin(threads, sizeof(NrnThread) * nthreads); + NrnThread* d_threads = (NrnThread*) cnrn_gpu_copyin(threads, nthreads); if (interleave_info == nullptr) { printf("\n Warning: No permutation data? Required for linear algebra!"); @@ -157,7 +150,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { /* -- copy _data to device -- */ /*copy all double data for thread */ - d__data = (double*) cnrn_gpu_copyin(nt->_data, nt->_ndata * sizeof(double)); + d__data = (double*) cnrn_gpu_copyin(nt->_data, nt->_ndata); /* Here is the example of using OpenACC data enter/exit @@ -199,13 +192,12 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { cnrn_memcpy_to_device(&(d_nt->_actual_diam), &(dptr), sizeof(double*)); } - int* d_v_parent_index = (int*) cnrn_gpu_copyin(nt->_v_parent_index, nt->end * sizeof(int)); + int* d_v_parent_index = (int*) cnrn_gpu_copyin(nt->_v_parent_index, nt->end); cnrn_memcpy_to_device(&(d_nt->_v_parent_index), &(d_v_parent_index), sizeof(int*)); /* nt._ml_list is used in NET_RECEIVE block and should have valid membrane list id*/ Memb_list** d_ml_list = (Memb_list**) cnrn_gpu_copyin(nt->_ml_list, - corenrn.get_memb_funcs().size() * - sizeof(Memb_list*)); + corenrn.get_memb_funcs().size()); cnrn_memcpy_to_device(&(d_nt->_ml_list), &(d_ml_list), sizeof(Memb_list**)); /* -- copy NrnThreadMembList list ml to device -- */ @@ -217,7 +209,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { for (auto tml = nt->tml; tml; tml = tml->next) { /*copy tml to device*/ /*QUESTIONS: does tml will point to nullptr as in host ? : I assume so!*/ - auto d_tml = (NrnThreadMembList*) cnrn_gpu_copyin(tml, sizeof(NrnThreadMembList)); + auto d_tml = (NrnThreadMembList*) cnrn_gpu_copyin(tml); /*first tml is pointed by nt */ if (first_tml) { @@ -232,7 +224,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { d_last_tml = d_tml; /* now for every tml, there is a ml. copy that and setup pointer */ - auto d_ml = (Memb_list*) cnrn_gpu_copyin(tml->ml, sizeof(Memb_list)); + auto d_ml = (Memb_list*) cnrn_gpu_copyin(tml->ml); cnrn_memcpy_to_device(&(d_tml->ml), &d_ml, sizeof(Memb_list*)); /* setup nt._ml_list */ @@ -250,20 +242,19 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { if (!is_art) { - int* d_nodeindices = (int*) cnrn_gpu_copyin(tml->ml->nodeindices, sizeof(int) * n); + int* d_nodeindices = (int*) cnrn_gpu_copyin(tml->ml->nodeindices, n); cnrn_memcpy_to_device(&(d_ml->nodeindices), &d_nodeindices, sizeof(int*)); } if (szdp) { int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; - int* d_pdata = (int*) cnrn_gpu_copyin(tml->ml->pdata, sizeof(int) * pcnt); + int* d_pdata = (int*) cnrn_gpu_copyin(tml->ml->pdata, pcnt); cnrn_memcpy_to_device(&(d_ml->pdata), &d_pdata, sizeof(int*)); } int ts = corenrn.get_memb_funcs()[type].thread_size_; if (ts) { - ThreadDatum* td = (ThreadDatum*) cnrn_gpu_copyin(tml->ml->_thread, - ts * sizeof(ThreadDatum)); + ThreadDatum* td = (ThreadDatum*) cnrn_gpu_copyin(tml->ml->_thread, ts); cnrn_memcpy_to_device(&(d_ml->_thread), &td, sizeof(ThreadDatum*)); } @@ -276,27 +267,27 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { // if net receive buffer exist for mechanism if (nrb) { - d_nrb = (NetReceiveBuffer_t*) cnrn_gpu_copyin(nrb, sizeof(NetReceiveBuffer_t)); + d_nrb = (NetReceiveBuffer_t*) cnrn_gpu_copyin(nrb); cnrn_memcpy_to_device(&(d_ml->_net_receive_buffer), &d_nrb, sizeof(NetReceiveBuffer_t*)); - d_pnt_index = (int*) cnrn_gpu_copyin(nrb->_pnt_index, sizeof(int) * nrb->_size); + d_pnt_index = (int*) cnrn_gpu_copyin(nrb->_pnt_index, nrb->_size); cnrn_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index, sizeof(int*)); - d_weight_index = (int*) cnrn_gpu_copyin(nrb->_weight_index, sizeof(int) * nrb->_size); + d_weight_index = (int*) cnrn_gpu_copyin(nrb->_weight_index, nrb->_size); cnrn_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index, sizeof(int*)); - d_nrb_t = (double*) cnrn_gpu_copyin(nrb->_nrb_t, sizeof(double) * nrb->_size); + d_nrb_t = (double*) cnrn_gpu_copyin(nrb->_nrb_t, nrb->_size); cnrn_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t, sizeof(double*)); - d_nrb_flag = (double*) cnrn_gpu_copyin(nrb->_nrb_flag, sizeof(double) * nrb->_size); + d_nrb_flag = (double*) cnrn_gpu_copyin(nrb->_nrb_flag, nrb->_size); cnrn_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag, sizeof(double*)); - d_displ = (int*) cnrn_gpu_copyin(nrb->_displ, sizeof(int) * (nrb->_size + 1)); + d_displ = (int*) cnrn_gpu_copyin(nrb->_displ, nrb->_size + 1); cnrn_memcpy_to_device(&(d_nrb->_displ), &d_displ, sizeof(int*)); - d_nrb_index = (int*) cnrn_gpu_copyin(nrb->_nrb_index, sizeof(int) * nrb->_size); + d_nrb_index = (int*) cnrn_gpu_copyin(nrb->_nrb_index, nrb->_size); cnrn_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index, sizeof(int*)); } @@ -309,25 +300,25 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { int* d_iptr; double* d_dptr; - d_nsb = (NetSendBuffer_t*) cnrn_gpu_copyin(nsb, sizeof(NetSendBuffer_t)); + d_nsb = (NetSendBuffer_t*) cnrn_gpu_copyin(nsb); cnrn_memcpy_to_device(&(d_ml->_net_send_buffer), &d_nsb, sizeof(NetSendBuffer_t*)); - d_iptr = (int*) cnrn_gpu_copyin(nsb->_sendtype, sizeof(int) * nsb->_size); + d_iptr = (int*) cnrn_gpu_copyin(nsb->_sendtype, nsb->_size); cnrn_memcpy_to_device(&(d_nsb->_sendtype), &d_iptr, sizeof(int*)); - d_iptr = (int*) cnrn_gpu_copyin(nsb->_vdata_index, sizeof(int) * nsb->_size); + d_iptr = (int*) cnrn_gpu_copyin(nsb->_vdata_index, nsb->_size); cnrn_memcpy_to_device(&(d_nsb->_vdata_index), &d_iptr, sizeof(int*)); - d_iptr = (int*) cnrn_gpu_copyin(nsb->_pnt_index, sizeof(int) * nsb->_size); + d_iptr = (int*) cnrn_gpu_copyin(nsb->_pnt_index, nsb->_size); cnrn_memcpy_to_device(&(d_nsb->_pnt_index), &d_iptr, sizeof(int*)); - d_iptr = (int*) cnrn_gpu_copyin(nsb->_weight_index, sizeof(int) * nsb->_size); + d_iptr = (int*) cnrn_gpu_copyin(nsb->_weight_index, nsb->_size); cnrn_memcpy_to_device(&(d_nsb->_weight_index), &d_iptr, sizeof(int*)); - d_dptr = (double*) cnrn_gpu_copyin(nsb->_nsb_t, sizeof(double) * nsb->_size); + d_dptr = (double*) cnrn_gpu_copyin(nsb->_nsb_t, nsb->_size); cnrn_memcpy_to_device(&(d_nsb->_nsb_t), &d_dptr, sizeof(double*)); - d_dptr = (double*) cnrn_gpu_copyin(nsb->_nsb_flag, sizeof(double) * nsb->_size); + d_dptr = (double*) cnrn_gpu_copyin(nsb->_nsb_flag, nsb->_size); cnrn_memcpy_to_device(&(d_nsb->_nsb_flag), &d_dptr, sizeof(double*)); } } @@ -338,27 +329,27 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); /* copy shadow_rhs to device and fix-up the pointer */ - d_shadow_ptr = (double*) cnrn_gpu_copyin(nt->_shadow_rhs, pcnt * sizeof(double)); + d_shadow_ptr = (double*) cnrn_gpu_copyin(nt->_shadow_rhs, pcnt); cnrn_memcpy_to_device(&(d_nt->_shadow_rhs), &d_shadow_ptr, sizeof(double*)); /* copy shadow_d to device and fix-up the pointer */ - d_shadow_ptr = (double*) cnrn_gpu_copyin(nt->_shadow_d, pcnt * sizeof(double)); + d_shadow_ptr = (double*) cnrn_gpu_copyin(nt->_shadow_d, pcnt); cnrn_memcpy_to_device(&(d_nt->_shadow_d), &d_shadow_ptr, sizeof(double*)); } /* Fast membrane current calculation struct */ if (nt->nrn_fast_imem) { auto* d_fast_imem = reinterpret_cast( - cnrn_gpu_copyin(nt->nrn_fast_imem, sizeof(NrnFastImem))); + cnrn_gpu_copyin(nt->nrn_fast_imem)); cnrn_memcpy_to_device(&(d_nt->nrn_fast_imem), &d_fast_imem, sizeof(NrnFastImem*)); { auto* d_ptr = reinterpret_cast( - cnrn_gpu_copyin(nt->nrn_fast_imem->nrn_sav_rhs, nt->end * sizeof(double))); + cnrn_gpu_copyin(nt->nrn_fast_imem->nrn_sav_rhs, nt->end)); cnrn_memcpy_to_device(&(d_fast_imem->nrn_sav_rhs), &d_ptr, sizeof(double*)); } { auto* d_ptr = reinterpret_cast( - cnrn_gpu_copyin(nt->nrn_fast_imem->nrn_sav_d, nt->end * sizeof(double))); + cnrn_gpu_copyin(nt->nrn_fast_imem->nrn_sav_d, nt->end)); cnrn_memcpy_to_device(&(d_fast_imem->nrn_sav_d), &d_ptr, sizeof(double*)); } } @@ -367,20 +358,20 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { /* copy Point_processes array and fix the pointer to execute net_receive blocks on GPU */ Point_process* pntptr = - (Point_process*) cnrn_gpu_copyin(nt->pntprocs, nt->n_pntproc * sizeof(Point_process)); + (Point_process*) cnrn_gpu_copyin(nt->pntprocs, nt->n_pntproc); cnrn_memcpy_to_device(&(d_nt->pntprocs), &pntptr, sizeof(Point_process*)); } if (nt->n_weight) { /* copy weight vector used in NET_RECEIVE which is pointed by netcon.weight */ - double* d_weights = (double*) cnrn_gpu_copyin(nt->weights, sizeof(double) * nt->n_weight); + double* d_weights = (double*) cnrn_gpu_copyin(nt->weights, nt->n_weight); cnrn_memcpy_to_device(&(d_nt->weights), &d_weights, sizeof(double*)); } if (nt->_nvdata) { /* copy vdata which is setup in bbcore_read. This contains cuda allocated * nrnran123_State * */ - void** d_vdata = (void**) cnrn_gpu_copyin(nt->_vdata, sizeof(void*) * nt->_nvdata); + void** d_vdata = (void**) cnrn_gpu_copyin(nt->_vdata, nt->_nvdata); cnrn_memcpy_to_device(&(d_nt->_vdata), &d_vdata, sizeof(void**)); } @@ -391,23 +382,23 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { * to * VTable and alignment */ PreSynHelper* d_presyns_helper = - (PreSynHelper*) cnrn_gpu_copyin(nt->presyns_helper, sizeof(PreSynHelper) * nt->n_presyn); + (PreSynHelper*) cnrn_gpu_copyin(nt->presyns_helper, nt->n_presyn); cnrn_memcpy_to_device(&(d_nt->presyns_helper), &d_presyns_helper, sizeof(PreSynHelper*)); - PreSyn* d_presyns = (PreSyn*) cnrn_gpu_copyin(nt->presyns, sizeof(PreSyn) * nt->n_presyn); + PreSyn* d_presyns = (PreSyn*) cnrn_gpu_copyin(nt->presyns, nt->n_presyn); cnrn_memcpy_to_device(&(d_nt->presyns), &d_presyns, sizeof(PreSyn*)); } if (nt->_net_send_buffer_size) { /* copy send_receive buffer */ int* d_net_send_buffer = (int*) cnrn_gpu_copyin(nt->_net_send_buffer, - sizeof(int) * nt->_net_send_buffer_size); + nt->_net_send_buffer_size); cnrn_memcpy_to_device(&(d_nt->_net_send_buffer), &d_net_send_buffer, sizeof(int*)); } if (nt->n_vecplay) { /* copy VecPlayContinuous instances */ /** just empty containers */ - void** d_vecplay = (void**) cnrn_gpu_copyin(nt->_vecplay, sizeof(void*) * nt->n_vecplay); + void** d_vecplay = (void**) cnrn_gpu_copyin(nt->_vecplay, nt->n_vecplay); cnrn_memcpy_to_device(&(d_nt->_vecplay), &d_vecplay, sizeof(void**)); nrn_VecPlay_copyto_device(nt, d_vecplay); @@ -417,40 +408,40 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { if (interleave_permute_type == 1) { /* todo: not necessary to setup pointers, just copy it */ InterleaveInfo* info = interleave_info + i; - InterleaveInfo* d_info = (InterleaveInfo*) cnrn_gpu_copyin(info, sizeof(InterleaveInfo)); int* d_ptr = nullptr; + InterleaveInfo* d_info = (InterleaveInfo*) cnrn_gpu_copyin(info); - d_ptr = (int*) cnrn_gpu_copyin(info->stride, sizeof(int) * (info->nstride + 1)); + d_ptr = (int*) cnrn_gpu_copyin(info->stride, info->nstride + 1); cnrn_memcpy_to_device(&(d_info->stride), &d_ptr, sizeof(int*)); - d_ptr = (int*) cnrn_gpu_copyin(info->firstnode, sizeof(int) * nt->ncell); + d_ptr = (int*) cnrn_gpu_copyin(info->firstnode, nt->ncell); cnrn_memcpy_to_device(&(d_info->firstnode), &d_ptr, sizeof(int*)); - d_ptr = (int*) cnrn_gpu_copyin(info->lastnode, sizeof(int) * nt->ncell); + d_ptr = (int*) cnrn_gpu_copyin(info->lastnode, nt->ncell); cnrn_memcpy_to_device(&(d_info->lastnode), &d_ptr, sizeof(int*)); - d_ptr = (int*) cnrn_gpu_copyin(info->cellsize, sizeof(int) * nt->ncell); + d_ptr = (int*) cnrn_gpu_copyin(info->cellsize, nt->ncell); cnrn_memcpy_to_device(&(d_info->cellsize), &d_ptr, sizeof(int*)); } else if (interleave_permute_type == 2) { /* todo: not necessary to setup pointers, just copy it */ InterleaveInfo* info = interleave_info + i; - InterleaveInfo* d_info = (InterleaveInfo*) cnrn_gpu_copyin(info, sizeof(InterleaveInfo)); + InterleaveInfo* d_info = (InterleaveInfo*) cnrn_gpu_copyin(info); int* d_ptr = nullptr; - d_ptr = (int*) cnrn_gpu_copyin(info->stride, sizeof(int) * info->nstride); + d_ptr = (int*) cnrn_gpu_copyin(info->stride, info->nstride); cnrn_memcpy_to_device(&(d_info->stride), &d_ptr, sizeof(int*)); - d_ptr = (int*) cnrn_gpu_copyin(info->firstnode, sizeof(int) * (info->nwarp + 1)); + d_ptr = (int*) cnrn_gpu_copyin(info->firstnode, info->nwarp + 1); cnrn_memcpy_to_device(&(d_info->firstnode), &d_ptr, sizeof(int*)); - d_ptr = (int*) cnrn_gpu_copyin(info->lastnode, sizeof(int) * (info->nwarp + 1)); + d_ptr = (int*) cnrn_gpu_copyin(info->lastnode, info->nwarp + 1); cnrn_memcpy_to_device(&(d_info->lastnode), &d_ptr, sizeof(int*)); - d_ptr = (int*) cnrn_gpu_copyin(info->stridedispl, sizeof(int) * (info->nwarp + 1)); + d_ptr = (int*) cnrn_gpu_copyin(info->stridedispl, info->nwarp + 1); cnrn_memcpy_to_device(&(d_info->stridedispl), &d_ptr, sizeof(int*)); - d_ptr = (int*) cnrn_gpu_copyin(info->cellsize, sizeof(int) * info->nwarp); + d_ptr = (int*) cnrn_gpu_copyin(info->cellsize, info->nwarp); cnrn_memcpy_to_device(&(d_info->cellsize), &d_ptr, sizeof(int*)); } else { printf("\n ERROR: only --cell_permute = [12] implemented"); @@ -466,20 +457,20 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { // Create a device-side copy of the `trajec_requests` struct and // make sure the device-side NrnThread object knows about it. auto* d_trajec_requests = reinterpret_cast( - cnrn_gpu_copyin(tr, sizeof(TrajectoryRequests))); + cnrn_gpu_copyin(tr)); cnrn_memcpy_to_device(&(d_nt->trajec_requests), &d_trajec_requests, sizeof(TrajectoryRequests*)); // Initialise the double** gather member of the struct. auto* d_tr_gather = reinterpret_cast( - cnrn_gpu_copyin(tr->gather, sizeof(double*) * tr->n_trajec)); + cnrn_gpu_copyin(tr->gather, tr->n_trajec)); cnrn_memcpy_to_device(&(d_trajec_requests->gather), &d_tr_gather, sizeof(double**)); // Initialise the double** varrays member of the struct if it's // set. double** d_tr_varrays{nullptr}; if (tr->varrays) { d_tr_varrays = reinterpret_cast( - cnrn_gpu_copyin(tr->varrays, sizeof(double*) * tr->n_trajec)); + cnrn_gpu_copyin(tr->varrays, tr->n_trajec)); cnrn_memcpy_to_device(&(d_trajec_requests->varrays), &d_tr_varrays, sizeof(double**)); @@ -490,7 +481,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { // make a device-side copy of it and store a pointer to it in // the device-side version of tr->varrays. auto* d_buf_traj_i = reinterpret_cast( - cnrn_gpu_copyin(tr->varrays[i], tr->bsize * sizeof(double))); + cnrn_gpu_copyin(tr->varrays[i], tr->bsize)); cnrn_memcpy_to_device(&(d_tr_varrays[i]), &d_buf_traj_i, sizeof(double*)); } // tr->gather[i] is a double* referring to (host) data in the @@ -521,12 +512,12 @@ void copy_ivoc_vect_to_device(const IvocVect& from, IvocVect& to, bool vector_co /// if we need to copy IvocVect vector then newly alloated vector /// on the device is a new destination pointer if(vector_copy_needed) { - d_iv = (IvocVect*) cnrn_gpu_copyin((void*) &from, sizeof(IvocVect)); + d_iv = (IvocVect*) cnrn_gpu_copyin((void*) &from); cnrn_memcpy_to_device(&to, &d_iv, sizeof(IvocVect*)); } size_t n = from.size(); if (n) { - double* d_data = (double*) cnrn_gpu_copyin((void*) from.data(), sizeof(double) * n); + double* d_data = (double*) cnrn_gpu_copyin((void*) from.data(), n); cnrn_memcpy_to_device(&(d_iv->data_), &d_data, sizeof(double*)); } #else @@ -539,9 +530,9 @@ void delete_ivoc_vect_from_device(IvocVect& vec) { #ifdef _OPENACC auto const n = vec.size(); if (n) { - cnrn_target_delete(vec.data(), sizeof(double) * n); + cnrn_target_delete(vec.data(), n); } - cnrn_target_delete(&vec, sizeof(IvocVect)); + cnrn_target_delete(&vec); #else (void) vec; #endif @@ -556,12 +547,12 @@ void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml) { #ifdef _OPENACC if (nt->compute_gpu) { // free existing vectors in buffers on gpu - cnrn_target_delete(nrb->_pnt_index, nrb->_size * sizeof(int)); - cnrn_target_delete(nrb->_weight_index, nrb->_size * sizeof(int)); - cnrn_target_delete(nrb->_nrb_t, nrb->_size * sizeof(double)); - cnrn_target_delete(nrb->_nrb_flag, nrb->_size * sizeof(double)); - cnrn_target_delete(nrb->_displ, (nrb->_size + 1) * sizeof(int)); - cnrn_target_delete(nrb->_nrb_index, nrb->_size * sizeof(int)); + cnrn_target_delete(nrb->_pnt_index, nrb->_size); + cnrn_target_delete(nrb->_weight_index, nrb->_size); + cnrn_target_delete(nrb->_nrb_t, nrb->_size); + cnrn_target_delete(nrb->_nrb_flag, nrb->_size); + cnrn_target_delete(nrb->_displ, nrb->_size + 1); + cnrn_target_delete(nrb->_nrb_index, nrb->_size); } #endif @@ -586,22 +577,22 @@ void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml) { NetReceiveBuffer_t* d_nrb = (NetReceiveBuffer_t*) cnrn_target_deviceptr(nrb); // recopy the vectors in the buffer - d_pnt_index = (int*) cnrn_gpu_copyin(nrb->_pnt_index, sizeof(int) * nrb->_size); + d_pnt_index = (int*) cnrn_gpu_copyin(nrb->_pnt_index, nrb->_size); cnrn_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index, sizeof(int*)); - d_weight_index = (int*) cnrn_gpu_copyin(nrb->_weight_index, sizeof(int) * nrb->_size); + d_weight_index = (int*) cnrn_gpu_copyin(nrb->_weight_index, nrb->_size); cnrn_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index, sizeof(int*)); - d_nrb_t = (double*) cnrn_gpu_copyin(nrb->_nrb_t, sizeof(double) * nrb->_size); + d_nrb_t = (double*) cnrn_gpu_copyin(nrb->_nrb_t, nrb->_size); cnrn_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t, sizeof(double*)); - d_nrb_flag = (double*) cnrn_gpu_copyin(nrb->_nrb_flag, sizeof(double) * nrb->_size); + d_nrb_flag = (double*) cnrn_gpu_copyin(nrb->_nrb_flag, nrb->_size); cnrn_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag, sizeof(double*)); - d_displ = (int*) cnrn_gpu_copyin(nrb->_displ, sizeof(int) * (nrb->_size + 1)); + d_displ = (int*) cnrn_gpu_copyin(nrb->_displ, nrb->_size + 1); cnrn_memcpy_to_device(&(d_nrb->_displ), &d_displ, sizeof(int*)); - d_nrb_index = (int*) cnrn_gpu_copyin(nrb->_nrb_index, sizeof(int) * nrb->_size); + d_nrb_index = (int*) cnrn_gpu_copyin(nrb->_nrb_index, nrb->_size); cnrn_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index, sizeof(int*)); } #endif @@ -1070,73 +1061,73 @@ void delete_nrnthreads_on_device(NrnThread* threads, int nthreads) { if (tr) { if (tr->varrays) { for (int i = 0; i < tr->n_trajec; ++i) { - cnrn_target_delete(tr->varrays[i], tr->bsize * sizeof(double)); + cnrn_target_delete(tr->varrays[i], tr->bsize); } - cnrn_target_delete(tr->varrays, sizeof(double*) * tr->n_trajec); + cnrn_target_delete(tr->varrays, tr->n_trajec); } - cnrn_target_delete(tr->gather, sizeof(double*) * tr->n_trajec); - cnrn_target_delete(tr, sizeof(TrajectoryRequests)); + cnrn_target_delete(tr->gather, tr->n_trajec); + cnrn_target_delete(tr); } } if (nt->_permute) { if (interleave_permute_type == 1) { InterleaveInfo* info = interleave_info + i; - cnrn_target_delete(info->cellsize, sizeof(int) * nt->ncell); - cnrn_target_delete(info->lastnode, sizeof(int) * nt->ncell); - cnrn_target_delete(info->firstnode, sizeof(int) * nt->ncell); - cnrn_target_delete(info->stride, sizeof(int) * (info->nstride + 1)); - cnrn_target_delete(info, sizeof(InterleaveInfo)); + cnrn_target_delete(info->cellsize, nt->ncell); + cnrn_target_delete(info->lastnode, nt->ncell); + cnrn_target_delete(info->firstnode, nt->ncell); + cnrn_target_delete(info->stride, info->nstride + 1); + cnrn_target_delete(info); } else if (interleave_permute_type == 2) { InterleaveInfo* info = interleave_info + i; - cnrn_target_delete(info->cellsize, sizeof(int) * info->nwarp); - cnrn_target_delete(info->stridedispl, sizeof(int) * (info->nwarp + 1)); - cnrn_target_delete(info->lastnode, sizeof(int) * (info->nwarp + 1)); - cnrn_target_delete(info->firstnode, sizeof(int) * (info->nwarp + 1)); - cnrn_target_delete(info->stride, sizeof(int) * info->nstride); - cnrn_target_delete(info, sizeof(InterleaveInfo)); + cnrn_target_delete(info->cellsize, info->nwarp); + cnrn_target_delete(info->stridedispl, info->nwarp + 1); + cnrn_target_delete(info->lastnode, info->nwarp + 1); + cnrn_target_delete(info->firstnode, info->nwarp + 1); + cnrn_target_delete(info->stride, info->nstride); + cnrn_target_delete(info); } } if (nt->n_vecplay) { nrn_VecPlay_delete_from_device(nt); - cnrn_target_delete(nt->_vecplay, sizeof(void*) * nt->n_vecplay); + cnrn_target_delete(nt->_vecplay, nt->n_vecplay); } // Cleanup send_receive buffer. if (nt->_net_send_buffer_size) { - cnrn_target_delete(nt->_net_send_buffer, sizeof(int) * nt->_net_send_buffer_size); + cnrn_target_delete(nt->_net_send_buffer, nt->_net_send_buffer_size); } if (nt->n_presyn) { - cnrn_target_delete(nt->presyns, sizeof(PreSyn) * nt->n_presyn); - cnrn_target_delete(nt->presyns_helper, sizeof(PreSynHelper) * nt->n_presyn); + cnrn_target_delete(nt->presyns, nt->n_presyn); + cnrn_target_delete(nt->presyns_helper, nt->n_presyn); } // Cleanup data that's setup in bbcore_read. if (nt->_nvdata) { - cnrn_target_delete(nt->_vdata, sizeof(void*) * nt->_nvdata); + cnrn_target_delete(nt->_vdata, nt->_nvdata); } // Cleanup weight vector used in NET_RECEIVE if (nt->n_weight) { - cnrn_target_delete(nt->weights, sizeof(double) * nt->n_weight); + cnrn_target_delete(nt->weights, nt->n_weight); } // Cleanup point processes if (nt->n_pntproc) { - cnrn_target_delete(nt->pntprocs, nt->n_pntproc * sizeof(Point_process)); + cnrn_target_delete(nt->pntprocs, nt->n_pntproc); } if (nt->nrn_fast_imem) { - cnrn_target_delete(nt->nrn_fast_imem->nrn_sav_d, nt->end * sizeof(double)); - cnrn_target_delete(nt->nrn_fast_imem->nrn_sav_rhs, nt->end * sizeof(double)); - cnrn_target_delete(nt->nrn_fast_imem, sizeof(NrnFastImem)); + cnrn_target_delete(nt->nrn_fast_imem->nrn_sav_d, nt->end); + cnrn_target_delete(nt->nrn_fast_imem->nrn_sav_rhs, nt->end); + cnrn_target_delete(nt->nrn_fast_imem); } if (nt->shadow_rhs_cnt) { int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); - cnrn_target_delete(nt->_shadow_d, pcnt * sizeof(double)); - cnrn_target_delete(nt->_shadow_rhs, pcnt * sizeof(double)); + cnrn_target_delete(nt->_shadow_d, pcnt); + cnrn_target_delete(nt->_shadow_rhs, pcnt); } for (auto tml = nt->tml; tml; tml = tml->next) { @@ -1144,26 +1135,26 @@ void delete_nrnthreads_on_device(NrnThread* threads, int nthreads) { { NetSendBuffer_t* nsb{tml->ml->_net_send_buffer}; if (nsb) { - cnrn_target_delete(nsb->_nsb_flag, sizeof(double) * nsb->_size); - cnrn_target_delete(nsb->_nsb_t, sizeof(double) * nsb->_size); - cnrn_target_delete(nsb->_weight_index, sizeof(int) * nsb->_size); - cnrn_target_delete(nsb->_pnt_index, sizeof(int) * nsb->_size); - cnrn_target_delete(nsb->_vdata_index, sizeof(int) * nsb->_size); - cnrn_target_delete(nsb->_sendtype, sizeof(int) * nsb->_size); - cnrn_target_delete(nsb, sizeof(NetSendBuffer_t)); + cnrn_target_delete(nsb->_nsb_flag, nsb->_size); + cnrn_target_delete(nsb->_nsb_t, nsb->_size); + cnrn_target_delete(nsb->_weight_index, nsb->_size); + cnrn_target_delete(nsb->_pnt_index, nsb->_size); + cnrn_target_delete(nsb->_vdata_index, nsb->_size); + cnrn_target_delete(nsb->_sendtype, nsb->_size); + cnrn_target_delete(nsb); } } // Cleanup the net receive buffer if it exists. { NetReceiveBuffer_t* nrb{tml->ml->_net_receive_buffer}; if (nrb) { - cnrn_target_delete(nrb->_nrb_index, sizeof(int) * nrb->_size); - cnrn_target_delete(nrb->_displ, sizeof(int) * (nrb->_size + 1)); - cnrn_target_delete(nrb->_nrb_flag, sizeof(double) * nrb->_size); - cnrn_target_delete(nrb->_nrb_t, sizeof(double) * nrb->_size); - cnrn_target_delete(nrb->_weight_index, sizeof(int) * nrb->_size); - cnrn_target_delete(nrb->_pnt_index, sizeof(int) * nrb->_size); - cnrn_target_delete(nrb, sizeof(NetReceiveBuffer_t)); + cnrn_target_delete(nrb->_nrb_index, nrb->_size); + cnrn_target_delete(nrb->_displ, nrb->_size + 1); + cnrn_target_delete(nrb->_nrb_flag, nrb->_size); + cnrn_target_delete(nrb->_nrb_t, nrb->_size); + cnrn_target_delete(nrb->_weight_index, nrb->_size); + cnrn_target_delete(nrb->_pnt_index, nrb->_size); + cnrn_target_delete(nrb); } } int type = tml->index; @@ -1172,23 +1163,23 @@ void delete_nrnthreads_on_device(NrnThread* threads, int nthreads) { int is_art = corenrn.get_is_artificial()[type]; int ts = corenrn.get_memb_funcs()[type].thread_size_; if (ts) { - cnrn_target_delete(tml->ml->_thread, ts * sizeof(ThreadDatum)); + cnrn_target_delete(tml->ml->_thread, ts); } if (szdp) { int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; - cnrn_target_delete(tml->ml->pdata, sizeof(int) * pcnt); + cnrn_target_delete(tml->ml->pdata, pcnt); } if (!is_art) { - cnrn_target_delete(tml->ml->nodeindices, sizeof(int) * n); + cnrn_target_delete(tml->ml->nodeindices, n); } - cnrn_target_delete(tml->ml, sizeof(Memb_list)); - cnrn_target_delete(tml, sizeof(NrnThreadMembList)); + cnrn_target_delete(tml->ml); + cnrn_target_delete(tml); } - cnrn_target_delete(nt->_ml_list, corenrn.get_memb_funcs().size() * sizeof(Memb_list*)); - cnrn_target_delete(nt->_v_parent_index, nt->end * sizeof(int)); - cnrn_target_delete(nt->_data, nt->_ndata * sizeof(double)); + cnrn_target_delete(nt->_ml_list, corenrn.get_memb_funcs().size()); + cnrn_target_delete(nt->_v_parent_index, nt->end); + cnrn_target_delete(nt->_data, nt->_ndata); } - cnrn_target_delete(threads, sizeof(NrnThread) * nthreads); + cnrn_target_delete(threads, nthreads); nrn_ion_global_map_delete_from_device(); #endif } @@ -1204,30 +1195,30 @@ void nrn_newtonspace_copyto_device(NewtonSpace* ns) { int n = ns->n * ns->n_instance; // actually, the values of double do not matter, only the pointers. - NewtonSpace* d_ns = (NewtonSpace*) cnrn_gpu_copyin(ns, sizeof(NewtonSpace)); + NewtonSpace* d_ns = (NewtonSpace*) cnrn_gpu_copyin(ns); double* pd; - pd = (double*) cnrn_gpu_copyin(ns->delta_x, n * sizeof(double)); + pd = (double*) cnrn_gpu_copyin(ns->delta_x, n); cnrn_memcpy_to_device(&(d_ns->delta_x), &pd, sizeof(double*)); - pd = (double*) cnrn_gpu_copyin(ns->high_value, n * sizeof(double)); + pd = (double*) cnrn_gpu_copyin(ns->high_value, n); cnrn_memcpy_to_device(&(d_ns->high_value), &pd, sizeof(double*)); - pd = (double*) cnrn_gpu_copyin(ns->low_value, n * sizeof(double)); + pd = (double*) cnrn_gpu_copyin(ns->low_value, n); cnrn_memcpy_to_device(&(d_ns->low_value), &pd, sizeof(double*)); - pd = (double*) cnrn_gpu_copyin(ns->rowmax, n * sizeof(double)); + pd = (double*) cnrn_gpu_copyin(ns->rowmax, n); cnrn_memcpy_to_device(&(d_ns->rowmax), &pd, sizeof(double*)); - auto pint = (int*) cnrn_gpu_copyin(ns->perm, n * sizeof(int)); + auto pint = (int*) cnrn_gpu_copyin(ns->perm, n); cnrn_memcpy_to_device(&(d_ns->perm), &pint, sizeof(int*)); - auto ppd = (double**) cnrn_gpu_copyin(ns->jacobian, ns->n * sizeof(double*)); + auto ppd = (double**) cnrn_gpu_copyin(ns->jacobian, ns->n); cnrn_memcpy_to_device(&(d_ns->jacobian), &ppd, sizeof(double**)); // the actual jacobian doubles were allocated as a single array - double* d_jacdat = (double*) cnrn_gpu_copyin(ns->jacobian[0], ns->n * n * sizeof(double)); + double* d_jacdat = (double*) cnrn_gpu_copyin(ns->jacobian[0], ns->n * n); for (int i = 0; i < ns->n; ++i) { pd = d_jacdat + i * n; @@ -1244,14 +1235,14 @@ void nrn_newtonspace_delete_from_device(NewtonSpace* ns) { return; } int n = ns->n * ns->n_instance; - cnrn_target_delete(ns->jacobian[0], ns->n * n * sizeof(double)); - cnrn_target_delete(ns->jacobian, ns->n * sizeof(double*)); - cnrn_target_delete(ns->perm, n * sizeof(int)); - cnrn_target_delete(ns->rowmax, n * sizeof(double)); - cnrn_target_delete(ns->low_value, n * sizeof(double)); - cnrn_target_delete(ns->high_value, n * sizeof(double)); - cnrn_target_delete(ns->delta_x, n * sizeof(double)); - cnrn_target_delete(ns, sizeof(NewtonSpace)); + cnrn_target_delete(ns->jacobian[0], ns->n * n); + cnrn_target_delete(ns->jacobian, ns->n); + cnrn_target_delete(ns->perm, n); + cnrn_target_delete(ns->rowmax, n); + cnrn_target_delete(ns->low_value, n); + cnrn_target_delete(ns->high_value, n); + cnrn_target_delete(ns->delta_x, n); + cnrn_target_delete(ns); #endif } @@ -1264,33 +1255,33 @@ void nrn_sparseobj_copyto_device(SparseObj* so) { } unsigned n1 = so->neqn + 1; - SparseObj* d_so = (SparseObj*) cnrn_gpu_copyin(so, sizeof(SparseObj)); + SparseObj* d_so = (SparseObj*) cnrn_gpu_copyin(so); // only pointer fields in SparseObj that need setting up are // rowst, diag, rhs, ngetcall, coef_list // only pointer fields in Elm that need setting up are // r_down, c_right, value // do not care about the Elm* ptr value, just the space. - Elm** d_rowst = (Elm**) cnrn_gpu_copyin(so->rowst, n1 * sizeof(Elm*)); + Elm** d_rowst = (Elm**) cnrn_gpu_copyin(so->rowst, n1); cnrn_memcpy_to_device(&(d_so->rowst), &d_rowst, sizeof(Elm**)); - Elm** d_diag = (Elm**) cnrn_gpu_copyin(so->diag, n1 * sizeof(Elm*)); + Elm** d_diag = (Elm**) cnrn_gpu_copyin(so->diag, n1); cnrn_memcpy_to_device(&(d_so->diag), &d_diag, sizeof(Elm**)); - auto pu = (unsigned*) cnrn_gpu_copyin(so->ngetcall, so->_cntml_padded * sizeof(unsigned)); + auto pu = (unsigned*) cnrn_gpu_copyin(so->ngetcall, so->_cntml_padded); cnrn_memcpy_to_device(&(d_so->ngetcall), &pu, sizeof(Elm**)); - auto pd = (double*) cnrn_gpu_copyin(so->rhs, n1 * so->_cntml_padded * sizeof(double)); + auto pd = (double*) cnrn_gpu_copyin(so->rhs, n1 * so->_cntml_padded); cnrn_memcpy_to_device(&(d_so->rhs), &pd, sizeof(double*)); - auto d_coef_list = (double**) cnrn_gpu_copyin(so->coef_list, so->coef_list_size * sizeof(double*)); + auto d_coef_list = (double**) cnrn_gpu_copyin(so->coef_list, so->coef_list_size); cnrn_memcpy_to_device(&(d_so->coef_list), &d_coef_list, sizeof(double**)); // Fill in relevant Elm pointer values for (unsigned irow = 1; irow < n1; ++irow) { for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) { - Elm* pelm = (Elm*) cnrn_gpu_copyin(elm, sizeof(Elm)); + Elm* pelm = (Elm*) cnrn_gpu_copyin(elm); if (elm == so->rowst[irow]) { cnrn_memcpy_to_device(&(d_rowst[irow]), &pelm, sizeof(Elm*)); @@ -1310,7 +1301,7 @@ void nrn_sparseobj_copyto_device(SparseObj* so) { } } - pd = (double*) cnrn_gpu_copyin(elm->value, so->_cntml_padded * sizeof(double)); + pd = (double*) cnrn_gpu_copyin(elm->value, so->_cntml_padded); cnrn_memcpy_to_device(&(pelm->value), &pd, sizeof(double*)); } } @@ -1348,16 +1339,16 @@ void nrn_sparseobj_delete_from_device(SparseObj* so) { unsigned n1 = so->neqn + 1; for (unsigned irow = 1; irow < n1; ++irow) { for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) { - cnrn_target_delete(elm->value, so->_cntml_padded * sizeof(double)); - cnrn_target_delete(elm, sizeof(Elm)); + cnrn_target_delete(elm->value, so->_cntml_padded); + cnrn_target_delete(elm); } } - cnrn_target_delete(so->coef_list, so->coef_list_size * sizeof(double*)); - cnrn_target_delete(so->rhs, n1 * so->_cntml_padded * sizeof(double)); - cnrn_target_delete(so->ngetcall, so->_cntml_padded * sizeof(unsigned)); - cnrn_target_delete(so->diag, n1 * sizeof(Elm*)); - cnrn_target_delete(so->rowst, n1 * sizeof(Elm*)); - cnrn_target_delete(so, sizeof(SparseObj)); + cnrn_target_delete(so->coef_list, so->coef_list_size); + cnrn_target_delete(so->rhs, n1 * so->_cntml_padded); + cnrn_target_delete(so->ngetcall, so->_cntml_padded); + cnrn_target_delete(so->diag, n1); + cnrn_target_delete(so->rowst, n1); + cnrn_target_delete(so); #endif } @@ -1366,12 +1357,11 @@ void nrn_sparseobj_delete_from_device(SparseObj* so) { void nrn_ion_global_map_copyto_device() { if (nrn_ion_global_map_size) { double** d_data = (double**) cnrn_gpu_copyin(nrn_ion_global_map, - sizeof(double*) * nrn_ion_global_map_size); + nrn_ion_global_map_size); for (int j = 0; j < nrn_ion_global_map_size; j++) { if (nrn_ion_global_map[j]) { double* d_mechmap = (double*) cnrn_gpu_copyin(nrn_ion_global_map[j], - ion_global_map_member_size * - sizeof(double)); + ion_global_map_member_size); cnrn_memcpy_to_device(&(d_data[j]), &d_mechmap, sizeof(double*)); } } @@ -1381,11 +1371,11 @@ void nrn_ion_global_map_copyto_device() { void nrn_ion_global_map_delete_from_device() { for (int j = 0; j < nrn_ion_global_map_size; j++) { if (nrn_ion_global_map[j]) { - cnrn_target_delete(nrn_ion_global_map[j], ion_global_map_member_size * sizeof(double)); + cnrn_target_delete(nrn_ion_global_map[j], ion_global_map_member_size); } } if (nrn_ion_global_map_size) { - cnrn_target_delete(nrn_ion_global_map, sizeof(double*) * nrn_ion_global_map_size); + cnrn_target_delete(nrn_ion_global_map, nrn_ion_global_map_size); } } @@ -1439,7 +1429,7 @@ void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay) { VecPlayContinuous* vecplay_instance = (VecPlayContinuous*) nt->_vecplay[i]; /** just VecPlayContinuous object */ - void* d_p = (void*) cnrn_gpu_copyin(vecplay_instance, sizeof(VecPlayContinuous)); + void* d_p = (void*) cnrn_gpu_copyin(vecplay_instance); cnrn_memcpy_to_device(&(d_vecplay[i]), &d_p, sizeof(void*)); VecPlayContinuous* d_vecplay_instance = (VecPlayContinuous*) d_p; @@ -1454,8 +1444,8 @@ void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay) { } /** copy PlayRecordEvent : todo: verify this */ - PlayRecordEvent* d_e_ = (PlayRecordEvent*) cnrn_gpu_copyin(vecplay_instance->e_, - sizeof(PlayRecordEvent)); + PlayRecordEvent* d_e_ = (PlayRecordEvent*) cnrn_gpu_copyin(vecplay_instance->e_); + cnrn_memcpy_to_device(&(d_e_->plr_), &d_vecplay_instance, sizeof(VecPlayContinuous*)); cnrn_memcpy_to_device(&(d_vecplay_instance->e_), &d_e_, sizeof(PlayRecordEvent*)); @@ -1469,13 +1459,13 @@ void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay) { void nrn_VecPlay_delete_from_device(NrnThread* nt) { for (int i = 0; i < nt->n_vecplay; i++) { auto* vecplay_instance = reinterpret_cast(nt->_vecplay[i]); - cnrn_target_delete(vecplay_instance->e_, sizeof(PlayRecordEvent)); + cnrn_target_delete(vecplay_instance->e_); if (vecplay_instance->discon_indices_) { delete_ivoc_vect_from_device(*(vecplay_instance->discon_indices_)); } delete_ivoc_vect_from_device(vecplay_instance->t_); delete_ivoc_vect_from_device(vecplay_instance->y_); - cnrn_target_delete(vecplay_instance, sizeof(VecPlayContinuous)); + cnrn_target_delete(vecplay_instance); } } From 84f032295812405cd7f13724248145795b85fd49 Mon Sep 17 00:00:00 2001 From: Nicolas Cornu Date: Fri, 10 Dec 2021 17:42:47 +0100 Subject: [PATCH 2/5] Simplifying pointers --- coreneuron/gpu/nrn_acc_manager.cpp | 173 ++++++++++++++--------------- 1 file changed, 83 insertions(+), 90 deletions(-) diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp index d8eded736..3b076c172 100644 --- a/coreneuron/gpu/nrn_acc_manager.cpp +++ b/coreneuron/gpu/nrn_acc_manager.cpp @@ -55,12 +55,12 @@ void* cnrn_target_deviceptr(void* h_ptr) { } template -void* cnrn_gpu_copyin(T* h_ptr, std::size_t len = 1) { +T* cnrn_gpu_copyin(T* h_ptr, std::size_t len = 1) { #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC) - return acc_copyin(h_ptr, len * sizeof(T)); + return static_cast(acc_copyin(h_ptr, len * sizeof(T))); #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) #pragma omp target enter data map(to:h_ptr[:len]) - return cnrn_target_deviceptr(h_ptr); + return static_cast(cnrn_target_deviceptr(h_ptr)); #else throw std::runtime_error("cnrn_gpu_copyin() not implemented without OpenACC/OpenMP and gpu build"); #endif @@ -107,13 +107,13 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { NrnThread* nt = threads + i; // NrnThread on host if (nt->n_presyn) { - PreSyn* d_presyns = (PreSyn*) cnrn_gpu_copyin(nt->presyns, nt->n_presyn); + PreSyn* d_presyns = cnrn_gpu_copyin(nt->presyns, nt->n_presyn); } if (nt->n_vecplay) { /* copy VecPlayContinuous instances */ /** just empty containers */ - void** d_vecplay = (void**) cnrn_gpu_copyin(nt->_vecplay, nt->n_vecplay); + void** d_vecplay = cnrn_gpu_copyin(nt->_vecplay, nt->n_vecplay); // note: we are using unified memory for NrnThread. Once VecPlay is copied to gpu, // we dont want to update nt->vecplay because it will also set gpu pointer of vecplay // inside nt on cpu (due to unified memory). @@ -131,7 +131,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { * find * corresponding NrnThread using Point_process in NET_RECEIVE block */ - NrnThread* d_threads = (NrnThread*) cnrn_gpu_copyin(threads, nthreads); + NrnThread* d_threads = cnrn_gpu_copyin(threads, nthreads); if (interleave_info == nullptr) { printf("\n Warning: No permutation data? Required for linear algebra!"); @@ -150,7 +150,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { /* -- copy _data to device -- */ /*copy all double data for thread */ - d__data = (double*) cnrn_gpu_copyin(nt->_data, nt->_ndata); + d__data = cnrn_gpu_copyin(nt->_data, nt->_ndata); /* Here is the example of using OpenACC data enter/exit @@ -192,11 +192,11 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { cnrn_memcpy_to_device(&(d_nt->_actual_diam), &(dptr), sizeof(double*)); } - int* d_v_parent_index = (int*) cnrn_gpu_copyin(nt->_v_parent_index, nt->end); + int* d_v_parent_index = cnrn_gpu_copyin(nt->_v_parent_index, nt->end); cnrn_memcpy_to_device(&(d_nt->_v_parent_index), &(d_v_parent_index), sizeof(int*)); /* nt._ml_list is used in NET_RECEIVE block and should have valid membrane list id*/ - Memb_list** d_ml_list = (Memb_list**) cnrn_gpu_copyin(nt->_ml_list, + Memb_list** d_ml_list = cnrn_gpu_copyin(nt->_ml_list, corenrn.get_memb_funcs().size()); cnrn_memcpy_to_device(&(d_nt->_ml_list), &(d_ml_list), sizeof(Memb_list**)); @@ -209,7 +209,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { for (auto tml = nt->tml; tml; tml = tml->next) { /*copy tml to device*/ /*QUESTIONS: does tml will point to nullptr as in host ? : I assume so!*/ - auto d_tml = (NrnThreadMembList*) cnrn_gpu_copyin(tml); + auto d_tml = cnrn_gpu_copyin(tml); /*first tml is pointed by nt */ if (first_tml) { @@ -224,7 +224,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { d_last_tml = d_tml; /* now for every tml, there is a ml. copy that and setup pointer */ - auto d_ml = (Memb_list*) cnrn_gpu_copyin(tml->ml); + auto d_ml = cnrn_gpu_copyin(tml->ml); cnrn_memcpy_to_device(&(d_tml->ml), &d_ml, sizeof(Memb_list*)); /* setup nt._ml_list */ @@ -242,19 +242,19 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { if (!is_art) { - int* d_nodeindices = (int*) cnrn_gpu_copyin(tml->ml->nodeindices, n); + int* d_nodeindices = cnrn_gpu_copyin(tml->ml->nodeindices, n); cnrn_memcpy_to_device(&(d_ml->nodeindices), &d_nodeindices, sizeof(int*)); } if (szdp) { int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; - int* d_pdata = (int*) cnrn_gpu_copyin(tml->ml->pdata, pcnt); + int* d_pdata = cnrn_gpu_copyin(tml->ml->pdata, pcnt); cnrn_memcpy_to_device(&(d_ml->pdata), &d_pdata, sizeof(int*)); } int ts = corenrn.get_memb_funcs()[type].thread_size_; if (ts) { - ThreadDatum* td = (ThreadDatum*) cnrn_gpu_copyin(tml->ml->_thread, ts); + ThreadDatum* td = cnrn_gpu_copyin(tml->ml->_thread, ts); cnrn_memcpy_to_device(&(d_ml->_thread), &td, sizeof(ThreadDatum*)); } @@ -267,27 +267,27 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { // if net receive buffer exist for mechanism if (nrb) { - d_nrb = (NetReceiveBuffer_t*) cnrn_gpu_copyin(nrb); + d_nrb = cnrn_gpu_copyin(nrb); cnrn_memcpy_to_device(&(d_ml->_net_receive_buffer), &d_nrb, sizeof(NetReceiveBuffer_t*)); - d_pnt_index = (int*) cnrn_gpu_copyin(nrb->_pnt_index, nrb->_size); + d_pnt_index = cnrn_gpu_copyin(nrb->_pnt_index, nrb->_size); cnrn_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index, sizeof(int*)); - d_weight_index = (int*) cnrn_gpu_copyin(nrb->_weight_index, nrb->_size); + d_weight_index = cnrn_gpu_copyin(nrb->_weight_index, nrb->_size); cnrn_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index, sizeof(int*)); - d_nrb_t = (double*) cnrn_gpu_copyin(nrb->_nrb_t, nrb->_size); + d_nrb_t = cnrn_gpu_copyin(nrb->_nrb_t, nrb->_size); cnrn_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t, sizeof(double*)); - d_nrb_flag = (double*) cnrn_gpu_copyin(nrb->_nrb_flag, nrb->_size); + d_nrb_flag = cnrn_gpu_copyin(nrb->_nrb_flag, nrb->_size); cnrn_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag, sizeof(double*)); - d_displ = (int*) cnrn_gpu_copyin(nrb->_displ, nrb->_size + 1); + d_displ = cnrn_gpu_copyin(nrb->_displ, nrb->_size + 1); cnrn_memcpy_to_device(&(d_nrb->_displ), &d_displ, sizeof(int*)); - d_nrb_index = (int*) cnrn_gpu_copyin(nrb->_nrb_index, nrb->_size); + d_nrb_index = cnrn_gpu_copyin(nrb->_nrb_index, nrb->_size); cnrn_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index, sizeof(int*)); } @@ -300,25 +300,25 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { int* d_iptr; double* d_dptr; - d_nsb = (NetSendBuffer_t*) cnrn_gpu_copyin(nsb); + d_nsb = cnrn_gpu_copyin(nsb); cnrn_memcpy_to_device(&(d_ml->_net_send_buffer), &d_nsb, sizeof(NetSendBuffer_t*)); - d_iptr = (int*) cnrn_gpu_copyin(nsb->_sendtype, nsb->_size); + d_iptr = cnrn_gpu_copyin(nsb->_sendtype, nsb->_size); cnrn_memcpy_to_device(&(d_nsb->_sendtype), &d_iptr, sizeof(int*)); - d_iptr = (int*) cnrn_gpu_copyin(nsb->_vdata_index, nsb->_size); + d_iptr = cnrn_gpu_copyin(nsb->_vdata_index, nsb->_size); cnrn_memcpy_to_device(&(d_nsb->_vdata_index), &d_iptr, sizeof(int*)); - d_iptr = (int*) cnrn_gpu_copyin(nsb->_pnt_index, nsb->_size); + d_iptr = cnrn_gpu_copyin(nsb->_pnt_index, nsb->_size); cnrn_memcpy_to_device(&(d_nsb->_pnt_index), &d_iptr, sizeof(int*)); - d_iptr = (int*) cnrn_gpu_copyin(nsb->_weight_index, nsb->_size); + d_iptr = cnrn_gpu_copyin(nsb->_weight_index, nsb->_size); cnrn_memcpy_to_device(&(d_nsb->_weight_index), &d_iptr, sizeof(int*)); - d_dptr = (double*) cnrn_gpu_copyin(nsb->_nsb_t, nsb->_size); + d_dptr = cnrn_gpu_copyin(nsb->_nsb_t, nsb->_size); cnrn_memcpy_to_device(&(d_nsb->_nsb_t), &d_dptr, sizeof(double*)); - d_dptr = (double*) cnrn_gpu_copyin(nsb->_nsb_flag, nsb->_size); + d_dptr = cnrn_gpu_copyin(nsb->_nsb_flag, nsb->_size); cnrn_memcpy_to_device(&(d_nsb->_nsb_flag), &d_dptr, sizeof(double*)); } } @@ -329,27 +329,24 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); /* copy shadow_rhs to device and fix-up the pointer */ - d_shadow_ptr = (double*) cnrn_gpu_copyin(nt->_shadow_rhs, pcnt); + d_shadow_ptr = cnrn_gpu_copyin(nt->_shadow_rhs, pcnt); cnrn_memcpy_to_device(&(d_nt->_shadow_rhs), &d_shadow_ptr, sizeof(double*)); /* copy shadow_d to device and fix-up the pointer */ - d_shadow_ptr = (double*) cnrn_gpu_copyin(nt->_shadow_d, pcnt); + d_shadow_ptr = cnrn_gpu_copyin(nt->_shadow_d, pcnt); cnrn_memcpy_to_device(&(d_nt->_shadow_d), &d_shadow_ptr, sizeof(double*)); } /* Fast membrane current calculation struct */ if (nt->nrn_fast_imem) { - auto* d_fast_imem = reinterpret_cast( - cnrn_gpu_copyin(nt->nrn_fast_imem)); + NrnFastImem* d_fast_imem = cnrn_gpu_copyin(nt->nrn_fast_imem); cnrn_memcpy_to_device(&(d_nt->nrn_fast_imem), &d_fast_imem, sizeof(NrnFastImem*)); { - auto* d_ptr = reinterpret_cast( - cnrn_gpu_copyin(nt->nrn_fast_imem->nrn_sav_rhs, nt->end)); + double* d_ptr = cnrn_gpu_copyin(nt->nrn_fast_imem->nrn_sav_rhs, nt->end); cnrn_memcpy_to_device(&(d_fast_imem->nrn_sav_rhs), &d_ptr, sizeof(double*)); } { - auto* d_ptr = reinterpret_cast( - cnrn_gpu_copyin(nt->nrn_fast_imem->nrn_sav_d, nt->end)); + double* d_ptr = cnrn_gpu_copyin(nt->nrn_fast_imem->nrn_sav_d, nt->end); cnrn_memcpy_to_device(&(d_fast_imem->nrn_sav_d), &d_ptr, sizeof(double*)); } } @@ -358,20 +355,20 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { /* copy Point_processes array and fix the pointer to execute net_receive blocks on GPU */ Point_process* pntptr = - (Point_process*) cnrn_gpu_copyin(nt->pntprocs, nt->n_pntproc); + cnrn_gpu_copyin(nt->pntprocs, nt->n_pntproc); cnrn_memcpy_to_device(&(d_nt->pntprocs), &pntptr, sizeof(Point_process*)); } if (nt->n_weight) { /* copy weight vector used in NET_RECEIVE which is pointed by netcon.weight */ - double* d_weights = (double*) cnrn_gpu_copyin(nt->weights, nt->n_weight); + double* d_weights = cnrn_gpu_copyin(nt->weights, nt->n_weight); cnrn_memcpy_to_device(&(d_nt->weights), &d_weights, sizeof(double*)); } if (nt->_nvdata) { /* copy vdata which is setup in bbcore_read. This contains cuda allocated * nrnran123_State * */ - void** d_vdata = (void**) cnrn_gpu_copyin(nt->_vdata, nt->_nvdata); + void** d_vdata = cnrn_gpu_copyin(nt->_vdata, nt->_nvdata); cnrn_memcpy_to_device(&(d_nt->_vdata), &d_vdata, sizeof(void**)); } @@ -382,15 +379,15 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { * to * VTable and alignment */ PreSynHelper* d_presyns_helper = - (PreSynHelper*) cnrn_gpu_copyin(nt->presyns_helper, nt->n_presyn); + cnrn_gpu_copyin(nt->presyns_helper, nt->n_presyn); cnrn_memcpy_to_device(&(d_nt->presyns_helper), &d_presyns_helper, sizeof(PreSynHelper*)); - PreSyn* d_presyns = (PreSyn*) cnrn_gpu_copyin(nt->presyns, nt->n_presyn); + PreSyn* d_presyns = cnrn_gpu_copyin(nt->presyns, nt->n_presyn); cnrn_memcpy_to_device(&(d_nt->presyns), &d_presyns, sizeof(PreSyn*)); } if (nt->_net_send_buffer_size) { /* copy send_receive buffer */ - int* d_net_send_buffer = (int*) cnrn_gpu_copyin(nt->_net_send_buffer, + int* d_net_send_buffer = cnrn_gpu_copyin(nt->_net_send_buffer, nt->_net_send_buffer_size); cnrn_memcpy_to_device(&(d_nt->_net_send_buffer), &d_net_send_buffer, sizeof(int*)); } @@ -398,7 +395,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { if (nt->n_vecplay) { /* copy VecPlayContinuous instances */ /** just empty containers */ - void** d_vecplay = (void**) cnrn_gpu_copyin(nt->_vecplay, nt->n_vecplay); + void** d_vecplay = cnrn_gpu_copyin(nt->_vecplay, nt->n_vecplay); cnrn_memcpy_to_device(&(d_nt->_vecplay), &d_vecplay, sizeof(void**)); nrn_VecPlay_copyto_device(nt, d_vecplay); @@ -409,39 +406,39 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { /* todo: not necessary to setup pointers, just copy it */ InterleaveInfo* info = interleave_info + i; int* d_ptr = nullptr; - InterleaveInfo* d_info = (InterleaveInfo*) cnrn_gpu_copyin(info); + InterleaveInfo* d_info = cnrn_gpu_copyin(info); - d_ptr = (int*) cnrn_gpu_copyin(info->stride, info->nstride + 1); + d_ptr = cnrn_gpu_copyin(info->stride, info->nstride + 1); cnrn_memcpy_to_device(&(d_info->stride), &d_ptr, sizeof(int*)); - d_ptr = (int*) cnrn_gpu_copyin(info->firstnode, nt->ncell); + d_ptr = cnrn_gpu_copyin(info->firstnode, nt->ncell); cnrn_memcpy_to_device(&(d_info->firstnode), &d_ptr, sizeof(int*)); - d_ptr = (int*) cnrn_gpu_copyin(info->lastnode, nt->ncell); + d_ptr = cnrn_gpu_copyin(info->lastnode, nt->ncell); cnrn_memcpy_to_device(&(d_info->lastnode), &d_ptr, sizeof(int*)); - d_ptr = (int*) cnrn_gpu_copyin(info->cellsize, nt->ncell); + d_ptr = cnrn_gpu_copyin(info->cellsize, nt->ncell); cnrn_memcpy_to_device(&(d_info->cellsize), &d_ptr, sizeof(int*)); } else if (interleave_permute_type == 2) { /* todo: not necessary to setup pointers, just copy it */ InterleaveInfo* info = interleave_info + i; - InterleaveInfo* d_info = (InterleaveInfo*) cnrn_gpu_copyin(info); + InterleaveInfo* d_info = cnrn_gpu_copyin(info); int* d_ptr = nullptr; - d_ptr = (int*) cnrn_gpu_copyin(info->stride, info->nstride); + d_ptr = cnrn_gpu_copyin(info->stride, info->nstride); cnrn_memcpy_to_device(&(d_info->stride), &d_ptr, sizeof(int*)); - d_ptr = (int*) cnrn_gpu_copyin(info->firstnode, info->nwarp + 1); + d_ptr = cnrn_gpu_copyin(info->firstnode, info->nwarp + 1); cnrn_memcpy_to_device(&(d_info->firstnode), &d_ptr, sizeof(int*)); - d_ptr = (int*) cnrn_gpu_copyin(info->lastnode, info->nwarp + 1); + d_ptr = cnrn_gpu_copyin(info->lastnode, info->nwarp + 1); cnrn_memcpy_to_device(&(d_info->lastnode), &d_ptr, sizeof(int*)); - d_ptr = (int*) cnrn_gpu_copyin(info->stridedispl, info->nwarp + 1); + d_ptr = cnrn_gpu_copyin(info->stridedispl, info->nwarp + 1); cnrn_memcpy_to_device(&(d_info->stridedispl), &d_ptr, sizeof(int*)); - d_ptr = (int*) cnrn_gpu_copyin(info->cellsize, info->nwarp); + d_ptr = cnrn_gpu_copyin(info->cellsize, info->nwarp); cnrn_memcpy_to_device(&(d_info->cellsize), &d_ptr, sizeof(int*)); } else { printf("\n ERROR: only --cell_permute = [12] implemented"); @@ -456,21 +453,18 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { if (tr) { // Create a device-side copy of the `trajec_requests` struct and // make sure the device-side NrnThread object knows about it. - auto* d_trajec_requests = reinterpret_cast( - cnrn_gpu_copyin(tr)); + TrajectoryRequests* d_trajec_requests = cnrn_gpu_copyin(tr); cnrn_memcpy_to_device(&(d_nt->trajec_requests), &d_trajec_requests, sizeof(TrajectoryRequests*)); // Initialise the double** gather member of the struct. - auto* d_tr_gather = reinterpret_cast( - cnrn_gpu_copyin(tr->gather, tr->n_trajec)); + double** d_tr_gather = cnrn_gpu_copyin(tr->gather, tr->n_trajec); cnrn_memcpy_to_device(&(d_trajec_requests->gather), &d_tr_gather, sizeof(double**)); // Initialise the double** varrays member of the struct if it's // set. double** d_tr_varrays{nullptr}; if (tr->varrays) { - d_tr_varrays = reinterpret_cast( - cnrn_gpu_copyin(tr->varrays, tr->n_trajec)); + d_tr_varrays = cnrn_gpu_copyin(tr->varrays, tr->n_trajec); cnrn_memcpy_to_device(&(d_trajec_requests->varrays), &d_tr_varrays, sizeof(double**)); @@ -480,8 +474,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { // tr->varrays[i] is a buffer of tr->bsize doubles on the host, // make a device-side copy of it and store a pointer to it in // the device-side version of tr->varrays. - auto* d_buf_traj_i = reinterpret_cast( - cnrn_gpu_copyin(tr->varrays[i], tr->bsize)); + double* d_buf_traj_i = cnrn_gpu_copyin(tr->varrays[i], tr->bsize); cnrn_memcpy_to_device(&(d_tr_varrays[i]), &d_buf_traj_i, sizeof(double*)); } // tr->gather[i] is a double* referring to (host) data in the @@ -512,12 +505,12 @@ void copy_ivoc_vect_to_device(const IvocVect& from, IvocVect& to, bool vector_co /// if we need to copy IvocVect vector then newly alloated vector /// on the device is a new destination pointer if(vector_copy_needed) { - d_iv = (IvocVect*) cnrn_gpu_copyin((void*) &from); + d_iv = cnrn_gpu_copyin(const_cast(&from)); cnrn_memcpy_to_device(&to, &d_iv, sizeof(IvocVect*)); } size_t n = from.size(); if (n) { - double* d_data = (double*) cnrn_gpu_copyin((void*) from.data(), n); + double* d_data = cnrn_gpu_copyin(const_cast(from.data()), n); cnrn_memcpy_to_device(&(d_iv->data_), &d_data, sizeof(double*)); } #else @@ -577,22 +570,22 @@ void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml) { NetReceiveBuffer_t* d_nrb = (NetReceiveBuffer_t*) cnrn_target_deviceptr(nrb); // recopy the vectors in the buffer - d_pnt_index = (int*) cnrn_gpu_copyin(nrb->_pnt_index, nrb->_size); + d_pnt_index = cnrn_gpu_copyin(nrb->_pnt_index, nrb->_size); cnrn_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index, sizeof(int*)); - d_weight_index = (int*) cnrn_gpu_copyin(nrb->_weight_index, nrb->_size); + d_weight_index = cnrn_gpu_copyin(nrb->_weight_index, nrb->_size); cnrn_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index, sizeof(int*)); - d_nrb_t = (double*) cnrn_gpu_copyin(nrb->_nrb_t, nrb->_size); + d_nrb_t = cnrn_gpu_copyin(nrb->_nrb_t, nrb->_size); cnrn_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t, sizeof(double*)); - d_nrb_flag = (double*) cnrn_gpu_copyin(nrb->_nrb_flag, nrb->_size); + d_nrb_flag = cnrn_gpu_copyin(nrb->_nrb_flag, nrb->_size); cnrn_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag, sizeof(double*)); - d_displ = (int*) cnrn_gpu_copyin(nrb->_displ, nrb->_size + 1); + d_displ = cnrn_gpu_copyin(nrb->_displ, nrb->_size + 1); cnrn_memcpy_to_device(&(d_nrb->_displ), &d_displ, sizeof(int*)); - d_nrb_index = (int*) cnrn_gpu_copyin(nrb->_nrb_index, nrb->_size); + d_nrb_index = cnrn_gpu_copyin(nrb->_nrb_index, nrb->_size); cnrn_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index, sizeof(int*)); } #endif @@ -1195,30 +1188,30 @@ void nrn_newtonspace_copyto_device(NewtonSpace* ns) { int n = ns->n * ns->n_instance; // actually, the values of double do not matter, only the pointers. - NewtonSpace* d_ns = (NewtonSpace*) cnrn_gpu_copyin(ns); + NewtonSpace* d_ns = cnrn_gpu_copyin(ns); double* pd; - pd = (double*) cnrn_gpu_copyin(ns->delta_x, n); + pd = cnrn_gpu_copyin(ns->delta_x, n); cnrn_memcpy_to_device(&(d_ns->delta_x), &pd, sizeof(double*)); - pd = (double*) cnrn_gpu_copyin(ns->high_value, n); + pd = cnrn_gpu_copyin(ns->high_value, n); cnrn_memcpy_to_device(&(d_ns->high_value), &pd, sizeof(double*)); - pd = (double*) cnrn_gpu_copyin(ns->low_value, n); + pd = cnrn_gpu_copyin(ns->low_value, n); cnrn_memcpy_to_device(&(d_ns->low_value), &pd, sizeof(double*)); - pd = (double*) cnrn_gpu_copyin(ns->rowmax, n); + pd = cnrn_gpu_copyin(ns->rowmax, n); cnrn_memcpy_to_device(&(d_ns->rowmax), &pd, sizeof(double*)); - auto pint = (int*) cnrn_gpu_copyin(ns->perm, n); + auto pint = cnrn_gpu_copyin(ns->perm, n); cnrn_memcpy_to_device(&(d_ns->perm), &pint, sizeof(int*)); - auto ppd = (double**) cnrn_gpu_copyin(ns->jacobian, ns->n); + auto ppd = cnrn_gpu_copyin(ns->jacobian, ns->n); cnrn_memcpy_to_device(&(d_ns->jacobian), &ppd, sizeof(double**)); // the actual jacobian doubles were allocated as a single array - double* d_jacdat = (double*) cnrn_gpu_copyin(ns->jacobian[0], ns->n * n); + double* d_jacdat = cnrn_gpu_copyin(ns->jacobian[0], ns->n * n); for (int i = 0; i < ns->n; ++i) { pd = d_jacdat + i * n; @@ -1255,33 +1248,33 @@ void nrn_sparseobj_copyto_device(SparseObj* so) { } unsigned n1 = so->neqn + 1; - SparseObj* d_so = (SparseObj*) cnrn_gpu_copyin(so); + SparseObj* d_so = cnrn_gpu_copyin(so); // only pointer fields in SparseObj that need setting up are // rowst, diag, rhs, ngetcall, coef_list // only pointer fields in Elm that need setting up are // r_down, c_right, value // do not care about the Elm* ptr value, just the space. - Elm** d_rowst = (Elm**) cnrn_gpu_copyin(so->rowst, n1); + Elm** d_rowst = cnrn_gpu_copyin(so->rowst, n1); cnrn_memcpy_to_device(&(d_so->rowst), &d_rowst, sizeof(Elm**)); - Elm** d_diag = (Elm**) cnrn_gpu_copyin(so->diag, n1); + Elm** d_diag = cnrn_gpu_copyin(so->diag, n1); cnrn_memcpy_to_device(&(d_so->diag), &d_diag, sizeof(Elm**)); - auto pu = (unsigned*) cnrn_gpu_copyin(so->ngetcall, so->_cntml_padded); + unsigned* pu = cnrn_gpu_copyin(so->ngetcall, so->_cntml_padded); cnrn_memcpy_to_device(&(d_so->ngetcall), &pu, sizeof(Elm**)); - auto pd = (double*) cnrn_gpu_copyin(so->rhs, n1 * so->_cntml_padded); + double* pd = cnrn_gpu_copyin(so->rhs, n1 * so->_cntml_padded); cnrn_memcpy_to_device(&(d_so->rhs), &pd, sizeof(double*)); - auto d_coef_list = (double**) cnrn_gpu_copyin(so->coef_list, so->coef_list_size); + double** d_coef_list = cnrn_gpu_copyin(so->coef_list, so->coef_list_size); cnrn_memcpy_to_device(&(d_so->coef_list), &d_coef_list, sizeof(double**)); // Fill in relevant Elm pointer values for (unsigned irow = 1; irow < n1; ++irow) { for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) { - Elm* pelm = (Elm*) cnrn_gpu_copyin(elm); + Elm* pelm = cnrn_gpu_copyin(elm); if (elm == so->rowst[irow]) { cnrn_memcpy_to_device(&(d_rowst[irow]), &pelm, sizeof(Elm*)); @@ -1301,7 +1294,7 @@ void nrn_sparseobj_copyto_device(SparseObj* so) { } } - pd = (double*) cnrn_gpu_copyin(elm->value, so->_cntml_padded); + pd = cnrn_gpu_copyin(elm->value, so->_cntml_padded); cnrn_memcpy_to_device(&(pelm->value), &pd, sizeof(double*)); } } @@ -1356,11 +1349,11 @@ void nrn_sparseobj_delete_from_device(SparseObj* so) { void nrn_ion_global_map_copyto_device() { if (nrn_ion_global_map_size) { - double** d_data = (double**) cnrn_gpu_copyin(nrn_ion_global_map, + double** d_data = cnrn_gpu_copyin(nrn_ion_global_map, nrn_ion_global_map_size); for (int j = 0; j < nrn_ion_global_map_size; j++) { if (nrn_ion_global_map[j]) { - double* d_mechmap = (double*) cnrn_gpu_copyin(nrn_ion_global_map[j], + double* d_mechmap = cnrn_gpu_copyin(nrn_ion_global_map[j], ion_global_map_member_size); cnrn_memcpy_to_device(&(d_data[j]), &d_mechmap, sizeof(double*)); } @@ -1429,7 +1422,7 @@ void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay) { VecPlayContinuous* vecplay_instance = (VecPlayContinuous*) nt->_vecplay[i]; /** just VecPlayContinuous object */ - void* d_p = (void*) cnrn_gpu_copyin(vecplay_instance); + void* d_p = cnrn_gpu_copyin(vecplay_instance); cnrn_memcpy_to_device(&(d_vecplay[i]), &d_p, sizeof(void*)); VecPlayContinuous* d_vecplay_instance = (VecPlayContinuous*) d_p; @@ -1444,7 +1437,7 @@ void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay) { } /** copy PlayRecordEvent : todo: verify this */ - PlayRecordEvent* d_e_ = (PlayRecordEvent*) cnrn_gpu_copyin(vecplay_instance->e_); + PlayRecordEvent* d_e_ = cnrn_gpu_copyin(vecplay_instance->e_); cnrn_memcpy_to_device(&(d_e_->plr_), &d_vecplay_instance, sizeof(VecPlayContinuous*)); cnrn_memcpy_to_device(&(d_vecplay_instance->e_), &d_e_, sizeof(PlayRecordEvent*)); From f34aabf46393efe374eedd06a64c5bda5516f2d3 Mon Sep 17 00:00:00 2001 From: Nicolas Cornu Date: Mon, 13 Dec 2021 11:27:21 +0100 Subject: [PATCH 3/5] More template, more const --- coreneuron/gpu/nrn_acc_manager.cpp | 409 ++++++++++++++--------------- 1 file changed, 200 insertions(+), 209 deletions(-) diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp index 3b076c172..0b5aa2cc6 100644 --- a/coreneuron/gpu/nrn_acc_manager.cpp +++ b/coreneuron/gpu/nrn_acc_manager.cpp @@ -43,31 +43,31 @@ void nrn_ion_global_map_delete_from_device(); void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay); void nrn_VecPlay_delete_from_device(NrnThread* nt); -void* cnrn_target_deviceptr(void* h_ptr) { +template +T* cnrn_target_deviceptr(const T* h_ptr) { #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC) - return acc_deviceptr(h_ptr); + return acc_deviceptr(static_cast(h_ptr)); #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) - auto device_id = omp_get_default_device(); - return omp_get_mapped_ptr(h_ptr, device_id); + return static_cast(omp_get_mapped_ptr(const_cast(h_ptr), omp_get_default_device())); #else - throw std::runtime_error("cnrn_target_delete() not implemented without OpenACC/OpenMP and gpu build"); + throw std::runtime_error("cnrn_target_deviceptr() not implemented without OpenACC/OpenMP and gpu build"); #endif } template -T* cnrn_gpu_copyin(T* h_ptr, std::size_t len = 1) { +T* cnrn_target_copyin(const T* h_ptr, std::size_t len = 1) { #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC) - return static_cast(acc_copyin(h_ptr, len * sizeof(T))); + return static_cast(acc_copyin(const_cast(h_ptr), len * sizeof(T))); #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) #pragma omp target enter data map(to:h_ptr[:len]) - return static_cast(cnrn_target_deviceptr(h_ptr)); + return cnrn_target_deviceptr(const_cast(h_ptr)); #else - throw std::runtime_error("cnrn_gpu_copyin() not implemented without OpenACC/OpenMP and gpu build"); + throw std::runtime_error("cnrn_target_copyin() not implemented without OpenACC/OpenMP and gpu build"); #endif } template -void cnrn_target_delete(T* h_ptr, size_t len = 1) { +void cnrn_target_delete(T* h_ptr, std::size_t len = 1) { #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC) acc_delete(h_ptr, len * sizeof(T)); #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) @@ -77,15 +77,14 @@ void cnrn_target_delete(T* h_ptr, size_t len = 1) { #endif } -void cnrn_memcpy_to_device(void* d_ptr, void* h_ptr, size_t len) { +template +void cnrn_target_memcpy_to_device(T* d_ptr, const T* h_ptr, std::size_t len = 1) { #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC) - acc_memcpy_to_device(d_ptr, h_ptr, len); + acc_memcpy_to_device(d_ptr, const_cast(h_ptr), len * sizeof(T)); #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) - auto host_id = omp_get_initial_device(); - auto device_id = omp_get_default_device(); - omp_target_memcpy(d_ptr, h_ptr, len, 0, 0, device_id, host_id); + omp_target_memcpy(d_ptr, const_cast(h_ptr), len* sizeof(T), 0, 0, omp_get_default_device(), omp_get_initial_device()); #else - throw std::runtime_error("cnrn_memcpy_to_device() not implemented without OpenACC/OpenMP and gpu build"); + throw std::runtime_error("cnrn_target_memcpy_to_device() not implemented without OpenACC/OpenMP and gpu build"); #endif } @@ -107,13 +106,13 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { NrnThread* nt = threads + i; // NrnThread on host if (nt->n_presyn) { - PreSyn* d_presyns = cnrn_gpu_copyin(nt->presyns, nt->n_presyn); + PreSyn* d_presyns = cnrn_target_copyin(nt->presyns, nt->n_presyn); } if (nt->n_vecplay) { /* copy VecPlayContinuous instances */ /** just empty containers */ - void** d_vecplay = cnrn_gpu_copyin(nt->_vecplay, nt->n_vecplay); + void** d_vecplay = cnrn_target_copyin(nt->_vecplay, nt->n_vecplay); // note: we are using unified memory for NrnThread. Once VecPlay is copied to gpu, // we dont want to update nt->vecplay because it will also set gpu pointer of vecplay // inside nt on cpu (due to unified memory). @@ -131,7 +130,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { * find * corresponding NrnThread using Point_process in NET_RECEIVE block */ - NrnThread* d_threads = cnrn_gpu_copyin(threads, nthreads); + NrnThread* d_threads = cnrn_target_copyin(threads, nthreads); if (interleave_info == nullptr) { printf("\n Warning: No permutation data? Required for linear algebra!"); @@ -150,7 +149,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { /* -- copy _data to device -- */ /*copy all double data for thread */ - d__data = cnrn_gpu_copyin(nt->_data, nt->_ndata); + d__data = cnrn_target_copyin(nt->_data, nt->_ndata); /* Here is the example of using OpenACC data enter/exit @@ -161,7 +160,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { */ /*update d_nt._data to point to device copy */ - cnrn_memcpy_to_device(&(d_nt->_data), &d__data, sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_data), &d__data); /* -- setup rhs, d, a, b, v, node_aread to point to device copy -- */ double* dptr; @@ -170,35 +169,35 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { int ne = nrn_soa_padded_size(nt->end, 0); dptr = d__data + 0 * ne; - cnrn_memcpy_to_device(&(d_nt->_actual_rhs), &(dptr), sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_actual_rhs), &(dptr)); dptr = d__data + 1 * ne; - cnrn_memcpy_to_device(&(d_nt->_actual_d), &(dptr), sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_actual_d), &(dptr)); dptr = d__data + 2 * ne; - cnrn_memcpy_to_device(&(d_nt->_actual_a), &(dptr), sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_actual_a), &(dptr)); dptr = d__data + 3 * ne; - cnrn_memcpy_to_device(&(d_nt->_actual_b), &(dptr), sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_actual_b), &(dptr)); dptr = d__data + 4 * ne; - cnrn_memcpy_to_device(&(d_nt->_actual_v), &(dptr), sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_actual_v), &(dptr)); dptr = d__data + 5 * ne; - cnrn_memcpy_to_device(&(d_nt->_actual_area), &(dptr), sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_actual_area), &(dptr)); if (nt->_actual_diam) { dptr = d__data + 6 * ne; - cnrn_memcpy_to_device(&(d_nt->_actual_diam), &(dptr), sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_actual_diam), &(dptr)); } - int* d_v_parent_index = cnrn_gpu_copyin(nt->_v_parent_index, nt->end); - cnrn_memcpy_to_device(&(d_nt->_v_parent_index), &(d_v_parent_index), sizeof(int*)); + int* d_v_parent_index = cnrn_target_copyin(nt->_v_parent_index, nt->end); + cnrn_target_memcpy_to_device(&(d_nt->_v_parent_index), &(d_v_parent_index)); /* nt._ml_list is used in NET_RECEIVE block and should have valid membrane list id*/ - Memb_list** d_ml_list = cnrn_gpu_copyin(nt->_ml_list, + Memb_list** d_ml_list = cnrn_target_copyin(nt->_ml_list, corenrn.get_memb_funcs().size()); - cnrn_memcpy_to_device(&(d_nt->_ml_list), &(d_ml_list), sizeof(Memb_list**)); + cnrn_target_memcpy_to_device(&(d_nt->_ml_list), &(d_ml_list)); /* -- copy NrnThreadMembList list ml to device -- */ @@ -209,26 +208,26 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { for (auto tml = nt->tml; tml; tml = tml->next) { /*copy tml to device*/ /*QUESTIONS: does tml will point to nullptr as in host ? : I assume so!*/ - auto d_tml = cnrn_gpu_copyin(tml); + auto d_tml = cnrn_target_copyin(tml); /*first tml is pointed by nt */ if (first_tml) { - cnrn_memcpy_to_device(&(d_nt->tml), &d_tml, sizeof(NrnThreadMembList*)); + cnrn_target_memcpy_to_device(&(d_nt->tml), &d_tml); first_tml = false; } else { /*rest of tml forms linked list */ - cnrn_memcpy_to_device(&(d_last_tml->next), &d_tml, sizeof(NrnThreadMembList*)); + cnrn_target_memcpy_to_device(&(d_last_tml->next), &d_tml); } // book keeping for linked-list d_last_tml = d_tml; /* now for every tml, there is a ml. copy that and setup pointer */ - auto d_ml = cnrn_gpu_copyin(tml->ml); - cnrn_memcpy_to_device(&(d_tml->ml), &d_ml, sizeof(Memb_list*)); + auto d_ml = cnrn_target_copyin(tml->ml); + cnrn_target_memcpy_to_device(&(d_tml->ml), &d_ml); /* setup nt._ml_list */ - cnrn_memcpy_to_device(&(d_ml_list[tml->index]), &d_ml, sizeof(Memb_list*)); + cnrn_target_memcpy_to_device(&(d_ml_list[tml->index]), &d_ml); int type = tml->index; int n = tml->ml->nodecount; @@ -237,25 +236,25 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { int is_art = corenrn.get_is_artificial()[type]; // get device pointer for corresponding mechanism data - dptr = (double*) cnrn_target_deviceptr(tml->ml->data); - cnrn_memcpy_to_device(&(d_ml->data), &(dptr), sizeof(double*)); + dptr = cnrn_target_deviceptr(tml->ml->data); + cnrn_target_memcpy_to_device(&(d_ml->data), &(dptr)); if (!is_art) { - int* d_nodeindices = cnrn_gpu_copyin(tml->ml->nodeindices, n); - cnrn_memcpy_to_device(&(d_ml->nodeindices), &d_nodeindices, sizeof(int*)); + int* d_nodeindices = cnrn_target_copyin(tml->ml->nodeindices, n); + cnrn_target_memcpy_to_device(&(d_ml->nodeindices), &d_nodeindices); } if (szdp) { int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; - int* d_pdata = cnrn_gpu_copyin(tml->ml->pdata, pcnt); - cnrn_memcpy_to_device(&(d_ml->pdata), &d_pdata, sizeof(int*)); + int* d_pdata = cnrn_target_copyin(tml->ml->pdata, pcnt); + cnrn_target_memcpy_to_device(&(d_ml->pdata), &d_pdata); } int ts = corenrn.get_memb_funcs()[type].thread_size_; if (ts) { - ThreadDatum* td = cnrn_gpu_copyin(tml->ml->_thread, ts); - cnrn_memcpy_to_device(&(d_ml->_thread), &td, sizeof(ThreadDatum*)); + ThreadDatum* td = cnrn_target_copyin(tml->ml->_thread, ts); + cnrn_target_memcpy_to_device(&(d_ml->_thread), &td); } NetReceiveBuffer_t *nrb, *d_nrb; @@ -267,28 +266,26 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { // if net receive buffer exist for mechanism if (nrb) { - d_nrb = cnrn_gpu_copyin(nrb); - cnrn_memcpy_to_device(&(d_ml->_net_receive_buffer), - &d_nrb, - sizeof(NetReceiveBuffer_t*)); + d_nrb = cnrn_target_copyin(nrb); + cnrn_target_memcpy_to_device(&(d_ml->_net_receive_buffer), &d_nrb); - d_pnt_index = cnrn_gpu_copyin(nrb->_pnt_index, nrb->_size); - cnrn_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index, sizeof(int*)); + d_pnt_index = cnrn_target_copyin(nrb->_pnt_index, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index); - d_weight_index = cnrn_gpu_copyin(nrb->_weight_index, nrb->_size); - cnrn_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index, sizeof(int*)); + d_weight_index = cnrn_target_copyin(nrb->_weight_index, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index); - d_nrb_t = cnrn_gpu_copyin(nrb->_nrb_t, nrb->_size); - cnrn_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t, sizeof(double*)); + d_nrb_t = cnrn_target_copyin(nrb->_nrb_t, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t); - d_nrb_flag = cnrn_gpu_copyin(nrb->_nrb_flag, nrb->_size); - cnrn_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag, sizeof(double*)); + d_nrb_flag = cnrn_target_copyin(nrb->_nrb_flag, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag); - d_displ = cnrn_gpu_copyin(nrb->_displ, nrb->_size + 1); - cnrn_memcpy_to_device(&(d_nrb->_displ), &d_displ, sizeof(int*)); + d_displ = cnrn_target_copyin(nrb->_displ, nrb->_size + 1); + cnrn_target_memcpy_to_device(&(d_nrb->_displ), &d_displ); - d_nrb_index = cnrn_gpu_copyin(nrb->_nrb_index, nrb->_size); - cnrn_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index, sizeof(int*)); + d_nrb_index = cnrn_target_copyin(nrb->_nrb_index, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index); } /* copy NetSendBuffer_t on to GPU */ @@ -300,26 +297,26 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { int* d_iptr; double* d_dptr; - d_nsb = cnrn_gpu_copyin(nsb); - cnrn_memcpy_to_device(&(d_ml->_net_send_buffer), &d_nsb, sizeof(NetSendBuffer_t*)); + d_nsb = cnrn_target_copyin(nsb); + cnrn_target_memcpy_to_device(&(d_ml->_net_send_buffer), &d_nsb); - d_iptr = cnrn_gpu_copyin(nsb->_sendtype, nsb->_size); - cnrn_memcpy_to_device(&(d_nsb->_sendtype), &d_iptr, sizeof(int*)); + d_iptr = cnrn_target_copyin(nsb->_sendtype, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_sendtype), &d_iptr); - d_iptr = cnrn_gpu_copyin(nsb->_vdata_index, nsb->_size); - cnrn_memcpy_to_device(&(d_nsb->_vdata_index), &d_iptr, sizeof(int*)); + d_iptr = cnrn_target_copyin(nsb->_vdata_index, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_vdata_index), &d_iptr); - d_iptr = cnrn_gpu_copyin(nsb->_pnt_index, nsb->_size); - cnrn_memcpy_to_device(&(d_nsb->_pnt_index), &d_iptr, sizeof(int*)); + d_iptr = cnrn_target_copyin(nsb->_pnt_index, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_pnt_index), &d_iptr); - d_iptr = cnrn_gpu_copyin(nsb->_weight_index, nsb->_size); - cnrn_memcpy_to_device(&(d_nsb->_weight_index), &d_iptr, sizeof(int*)); + d_iptr = cnrn_target_copyin(nsb->_weight_index, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_weight_index), &d_iptr); - d_dptr = cnrn_gpu_copyin(nsb->_nsb_t, nsb->_size); - cnrn_memcpy_to_device(&(d_nsb->_nsb_t), &d_dptr, sizeof(double*)); + d_dptr = cnrn_target_copyin(nsb->_nsb_t, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_nsb_t), &d_dptr); - d_dptr = cnrn_gpu_copyin(nsb->_nsb_flag, nsb->_size); - cnrn_memcpy_to_device(&(d_nsb->_nsb_flag), &d_dptr, sizeof(double*)); + d_dptr = cnrn_target_copyin(nsb->_nsb_flag, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_nsb_flag), &d_dptr); } } @@ -329,25 +326,25 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); /* copy shadow_rhs to device and fix-up the pointer */ - d_shadow_ptr = cnrn_gpu_copyin(nt->_shadow_rhs, pcnt); - cnrn_memcpy_to_device(&(d_nt->_shadow_rhs), &d_shadow_ptr, sizeof(double*)); + d_shadow_ptr = cnrn_target_copyin(nt->_shadow_rhs, pcnt); + cnrn_target_memcpy_to_device(&(d_nt->_shadow_rhs), &d_shadow_ptr); /* copy shadow_d to device and fix-up the pointer */ - d_shadow_ptr = cnrn_gpu_copyin(nt->_shadow_d, pcnt); - cnrn_memcpy_to_device(&(d_nt->_shadow_d), &d_shadow_ptr, sizeof(double*)); + d_shadow_ptr = cnrn_target_copyin(nt->_shadow_d, pcnt); + cnrn_target_memcpy_to_device(&(d_nt->_shadow_d), &d_shadow_ptr); } /* Fast membrane current calculation struct */ if (nt->nrn_fast_imem) { - NrnFastImem* d_fast_imem = cnrn_gpu_copyin(nt->nrn_fast_imem); - cnrn_memcpy_to_device(&(d_nt->nrn_fast_imem), &d_fast_imem, sizeof(NrnFastImem*)); + NrnFastImem* d_fast_imem = cnrn_target_copyin(nt->nrn_fast_imem); + cnrn_target_memcpy_to_device(&(d_nt->nrn_fast_imem), &d_fast_imem); { - double* d_ptr = cnrn_gpu_copyin(nt->nrn_fast_imem->nrn_sav_rhs, nt->end); - cnrn_memcpy_to_device(&(d_fast_imem->nrn_sav_rhs), &d_ptr, sizeof(double*)); + double* d_ptr = cnrn_target_copyin(nt->nrn_fast_imem->nrn_sav_rhs, nt->end); + cnrn_target_memcpy_to_device(&(d_fast_imem->nrn_sav_rhs), &d_ptr); } { - double* d_ptr = cnrn_gpu_copyin(nt->nrn_fast_imem->nrn_sav_d, nt->end); - cnrn_memcpy_to_device(&(d_fast_imem->nrn_sav_d), &d_ptr, sizeof(double*)); + double* d_ptr = cnrn_target_copyin(nt->nrn_fast_imem->nrn_sav_d, nt->end); + cnrn_target_memcpy_to_device(&(d_fast_imem->nrn_sav_d), &d_ptr); } } @@ -355,21 +352,21 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { /* copy Point_processes array and fix the pointer to execute net_receive blocks on GPU */ Point_process* pntptr = - cnrn_gpu_copyin(nt->pntprocs, nt->n_pntproc); - cnrn_memcpy_to_device(&(d_nt->pntprocs), &pntptr, sizeof(Point_process*)); + cnrn_target_copyin(nt->pntprocs, nt->n_pntproc); + cnrn_target_memcpy_to_device(&(d_nt->pntprocs), &pntptr); } if (nt->n_weight) { /* copy weight vector used in NET_RECEIVE which is pointed by netcon.weight */ - double* d_weights = cnrn_gpu_copyin(nt->weights, nt->n_weight); - cnrn_memcpy_to_device(&(d_nt->weights), &d_weights, sizeof(double*)); + double* d_weights = cnrn_target_copyin(nt->weights, nt->n_weight); + cnrn_target_memcpy_to_device(&(d_nt->weights), &d_weights); } if (nt->_nvdata) { /* copy vdata which is setup in bbcore_read. This contains cuda allocated * nrnran123_State * */ - void** d_vdata = cnrn_gpu_copyin(nt->_vdata, nt->_nvdata); - cnrn_memcpy_to_device(&(d_nt->_vdata), &d_vdata, sizeof(void**)); + void** d_vdata = cnrn_target_copyin(nt->_vdata, nt->_nvdata); + cnrn_target_memcpy_to_device(&(d_nt->_vdata), &d_vdata); } if (nt->n_presyn) { @@ -379,24 +376,24 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { * to * VTable and alignment */ PreSynHelper* d_presyns_helper = - cnrn_gpu_copyin(nt->presyns_helper, nt->n_presyn); - cnrn_memcpy_to_device(&(d_nt->presyns_helper), &d_presyns_helper, sizeof(PreSynHelper*)); - PreSyn* d_presyns = cnrn_gpu_copyin(nt->presyns, nt->n_presyn); - cnrn_memcpy_to_device(&(d_nt->presyns), &d_presyns, sizeof(PreSyn*)); + cnrn_target_copyin(nt->presyns_helper, nt->n_presyn); + cnrn_target_memcpy_to_device(&(d_nt->presyns_helper), &d_presyns_helper); + PreSyn* d_presyns = cnrn_target_copyin(nt->presyns, nt->n_presyn); + cnrn_target_memcpy_to_device(&(d_nt->presyns), &d_presyns); } if (nt->_net_send_buffer_size) { /* copy send_receive buffer */ - int* d_net_send_buffer = cnrn_gpu_copyin(nt->_net_send_buffer, + int* d_net_send_buffer = cnrn_target_copyin(nt->_net_send_buffer, nt->_net_send_buffer_size); - cnrn_memcpy_to_device(&(d_nt->_net_send_buffer), &d_net_send_buffer, sizeof(int*)); + cnrn_target_memcpy_to_device(&(d_nt->_net_send_buffer), &d_net_send_buffer); } if (nt->n_vecplay) { /* copy VecPlayContinuous instances */ /** just empty containers */ - void** d_vecplay = cnrn_gpu_copyin(nt->_vecplay, nt->n_vecplay); - cnrn_memcpy_to_device(&(d_nt->_vecplay), &d_vecplay, sizeof(void**)); + void** d_vecplay = cnrn_target_copyin(nt->_vecplay, nt->n_vecplay); + cnrn_target_memcpy_to_device(&(d_nt->_vecplay), &d_vecplay); nrn_VecPlay_copyto_device(nt, d_vecplay); } @@ -406,40 +403,40 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { /* todo: not necessary to setup pointers, just copy it */ InterleaveInfo* info = interleave_info + i; int* d_ptr = nullptr; - InterleaveInfo* d_info = cnrn_gpu_copyin(info); + InterleaveInfo* d_info = cnrn_target_copyin(info); - d_ptr = cnrn_gpu_copyin(info->stride, info->nstride + 1); - cnrn_memcpy_to_device(&(d_info->stride), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->stride, info->nstride + 1); + cnrn_target_memcpy_to_device(&(d_info->stride), &d_ptr); - d_ptr = cnrn_gpu_copyin(info->firstnode, nt->ncell); - cnrn_memcpy_to_device(&(d_info->firstnode), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->firstnode, nt->ncell); + cnrn_target_memcpy_to_device(&(d_info->firstnode), &d_ptr); - d_ptr = cnrn_gpu_copyin(info->lastnode, nt->ncell); - cnrn_memcpy_to_device(&(d_info->lastnode), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->lastnode, nt->ncell); + cnrn_target_memcpy_to_device(&(d_info->lastnode), &d_ptr); - d_ptr = cnrn_gpu_copyin(info->cellsize, nt->ncell); - cnrn_memcpy_to_device(&(d_info->cellsize), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->cellsize, nt->ncell); + cnrn_target_memcpy_to_device(&(d_info->cellsize), &d_ptr); } else if (interleave_permute_type == 2) { /* todo: not necessary to setup pointers, just copy it */ InterleaveInfo* info = interleave_info + i; - InterleaveInfo* d_info = cnrn_gpu_copyin(info); + InterleaveInfo* d_info = cnrn_target_copyin(info); int* d_ptr = nullptr; - d_ptr = cnrn_gpu_copyin(info->stride, info->nstride); - cnrn_memcpy_to_device(&(d_info->stride), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->stride, info->nstride); + cnrn_target_memcpy_to_device(&(d_info->stride), &d_ptr); - d_ptr = cnrn_gpu_copyin(info->firstnode, info->nwarp + 1); - cnrn_memcpy_to_device(&(d_info->firstnode), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->firstnode, info->nwarp + 1); + cnrn_target_memcpy_to_device(&(d_info->firstnode), &d_ptr); - d_ptr = cnrn_gpu_copyin(info->lastnode, info->nwarp + 1); - cnrn_memcpy_to_device(&(d_info->lastnode), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->lastnode, info->nwarp + 1); + cnrn_target_memcpy_to_device(&(d_info->lastnode), &d_ptr); - d_ptr = cnrn_gpu_copyin(info->stridedispl, info->nwarp + 1); - cnrn_memcpy_to_device(&(d_info->stridedispl), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->stridedispl, info->nwarp + 1); + cnrn_target_memcpy_to_device(&(d_info->stridedispl), &d_ptr); - d_ptr = cnrn_gpu_copyin(info->cellsize, info->nwarp); - cnrn_memcpy_to_device(&(d_info->cellsize), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->cellsize, info->nwarp); + cnrn_target_memcpy_to_device(&(d_info->cellsize), &d_ptr); } else { printf("\n ERROR: only --cell_permute = [12] implemented"); abort(); @@ -453,34 +450,30 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { if (tr) { // Create a device-side copy of the `trajec_requests` struct and // make sure the device-side NrnThread object knows about it. - TrajectoryRequests* d_trajec_requests = cnrn_gpu_copyin(tr); - cnrn_memcpy_to_device(&(d_nt->trajec_requests), - &d_trajec_requests, - sizeof(TrajectoryRequests*)); + TrajectoryRequests* d_trajec_requests = cnrn_target_copyin(tr); + cnrn_target_memcpy_to_device(&(d_nt->trajec_requests), &d_trajec_requests); // Initialise the double** gather member of the struct. - double** d_tr_gather = cnrn_gpu_copyin(tr->gather, tr->n_trajec); - cnrn_memcpy_to_device(&(d_trajec_requests->gather), &d_tr_gather, sizeof(double**)); + double** d_tr_gather = cnrn_target_copyin(tr->gather, tr->n_trajec); + cnrn_target_memcpy_to_device(&(d_trajec_requests->gather), &d_tr_gather); // Initialise the double** varrays member of the struct if it's // set. double** d_tr_varrays{nullptr}; if (tr->varrays) { - d_tr_varrays = cnrn_gpu_copyin(tr->varrays, tr->n_trajec); - cnrn_memcpy_to_device(&(d_trajec_requests->varrays), - &d_tr_varrays, - sizeof(double**)); + d_tr_varrays = cnrn_target_copyin(tr->varrays, tr->n_trajec); + cnrn_target_memcpy_to_device(&(d_trajec_requests->varrays), &d_tr_varrays); } for (int i = 0; i < tr->n_trajec; ++i) { if (tr->varrays) { // tr->varrays[i] is a buffer of tr->bsize doubles on the host, // make a device-side copy of it and store a pointer to it in // the device-side version of tr->varrays. - double* d_buf_traj_i = cnrn_gpu_copyin(tr->varrays[i], tr->bsize); - cnrn_memcpy_to_device(&(d_tr_varrays[i]), &d_buf_traj_i, sizeof(double*)); + double* d_buf_traj_i = cnrn_target_copyin(tr->varrays[i], tr->bsize); + cnrn_target_memcpy_to_device(&(d_tr_varrays[i]), &d_buf_traj_i); } // tr->gather[i] is a double* referring to (host) data in the // (host) _data block auto* d_gather_i = cnrn_target_deviceptr(tr->gather[i]); - cnrn_memcpy_to_device(&(d_tr_gather[i]), &d_gather_i, sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_tr_gather[i]), &d_gather_i); } // TODO: other `double** scatter` and `void** vpr` members of // the TrajectoryRequests struct are not copied to the device. @@ -505,13 +498,13 @@ void copy_ivoc_vect_to_device(const IvocVect& from, IvocVect& to, bool vector_co /// if we need to copy IvocVect vector then newly alloated vector /// on the device is a new destination pointer if(vector_copy_needed) { - d_iv = cnrn_gpu_copyin(const_cast(&from)); - cnrn_memcpy_to_device(&to, &d_iv, sizeof(IvocVect*)); + d_iv = cnrn_target_copyin(&from); + cnrn_target_memcpy_to_device(&to, d_iv); } size_t n = from.size(); if (n) { - double* d_data = cnrn_gpu_copyin(const_cast(from.data()), n); - cnrn_memcpy_to_device(&(d_iv->data_), &d_data, sizeof(double*)); + double* d_data = cnrn_target_copyin(from.data(), n); + cnrn_target_memcpy_to_device(&(d_iv->data_), &d_data); } #else (void) from; @@ -567,26 +560,26 @@ void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml) { nrn_pragma_acc(update device(nrb)); nrn_pragma_omp(target update to(nrb)); - NetReceiveBuffer_t* d_nrb = (NetReceiveBuffer_t*) cnrn_target_deviceptr(nrb); + NetReceiveBuffer_t* d_nrb = cnrn_target_deviceptr(nrb); // recopy the vectors in the buffer - d_pnt_index = cnrn_gpu_copyin(nrb->_pnt_index, nrb->_size); - cnrn_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index, sizeof(int*)); + d_pnt_index = cnrn_target_copyin(nrb->_pnt_index, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index); - d_weight_index = cnrn_gpu_copyin(nrb->_weight_index, nrb->_size); - cnrn_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index, sizeof(int*)); + d_weight_index = cnrn_target_copyin(nrb->_weight_index, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index); - d_nrb_t = cnrn_gpu_copyin(nrb->_nrb_t, nrb->_size); - cnrn_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t, sizeof(double*)); + d_nrb_t = cnrn_target_copyin(nrb->_nrb_t, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t); - d_nrb_flag = cnrn_gpu_copyin(nrb->_nrb_flag, nrb->_size); - cnrn_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag, sizeof(double*)); + d_nrb_flag = cnrn_target_copyin(nrb->_nrb_flag, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag); - d_displ = cnrn_gpu_copyin(nrb->_displ, nrb->_size + 1); - cnrn_memcpy_to_device(&(d_nrb->_displ), &d_displ, sizeof(int*)); + d_displ = cnrn_target_copyin(nrb->_displ, nrb->_size + 1); + cnrn_target_memcpy_to_device(&(d_nrb->_displ), &d_displ); - d_nrb_index = cnrn_gpu_copyin(nrb->_nrb_index, nrb->_size); - cnrn_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index, sizeof(int*)); + d_nrb_index = cnrn_target_copyin(nrb->_nrb_index, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index); } #endif } @@ -1023,21 +1016,21 @@ void update_weights_from_gpu(NrnThread* threads, int nthreads) { /** Cleanup device memory that is being tracked by the OpenACC runtime. * * This function painstakingly calls `cnrn_target_delete` in reverse order on all - * pointers that were passed to `cnrn_gpu_copyin` in `setup_nrnthreads_on_device`. + * pointers that were passed to `cnrn_target_copyin` in `setup_nrnthreads_on_device`. * This cleanup ensures that if the GPU is initialised multiple times from the * same process then the OpenACC runtime will not be polluted with old * pointers, which can cause errors. In particular if we do: * @code * { * // ... some_ptr is dynamically allocated ... - * cnrn_gpu_copyin(some_ptr, some_size); + * cnrn_target_copyin(some_ptr, some_size); * // ... do some work ... * // cnrn_target_delete(some_ptr); * free(some_ptr); * } * { * // ... same_ptr_again is dynamically allocated at the same address ... - * cnrn_gpu_copyin(same_ptr_again, some_other_size); // ERROR + * cnrn_target_copyin(same_ptr_again, some_other_size); // ERROR * } * @endcode * the application will/may abort with an error such as: @@ -1188,34 +1181,34 @@ void nrn_newtonspace_copyto_device(NewtonSpace* ns) { int n = ns->n * ns->n_instance; // actually, the values of double do not matter, only the pointers. - NewtonSpace* d_ns = cnrn_gpu_copyin(ns); + NewtonSpace* d_ns = cnrn_target_copyin(ns); double* pd; - pd = cnrn_gpu_copyin(ns->delta_x, n); - cnrn_memcpy_to_device(&(d_ns->delta_x), &pd, sizeof(double*)); + pd = cnrn_target_copyin(ns->delta_x, n); + cnrn_target_memcpy_to_device(&(d_ns->delta_x), &pd); - pd = cnrn_gpu_copyin(ns->high_value, n); - cnrn_memcpy_to_device(&(d_ns->high_value), &pd, sizeof(double*)); + pd = cnrn_target_copyin(ns->high_value, n); + cnrn_target_memcpy_to_device(&(d_ns->high_value), &pd); - pd = cnrn_gpu_copyin(ns->low_value, n); - cnrn_memcpy_to_device(&(d_ns->low_value), &pd, sizeof(double*)); + pd = cnrn_target_copyin(ns->low_value, n); + cnrn_target_memcpy_to_device(&(d_ns->low_value), &pd); - pd = cnrn_gpu_copyin(ns->rowmax, n); - cnrn_memcpy_to_device(&(d_ns->rowmax), &pd, sizeof(double*)); + pd = cnrn_target_copyin(ns->rowmax, n); + cnrn_target_memcpy_to_device(&(d_ns->rowmax), &pd); - auto pint = cnrn_gpu_copyin(ns->perm, n); - cnrn_memcpy_to_device(&(d_ns->perm), &pint, sizeof(int*)); + auto pint = cnrn_target_copyin(ns->perm, n); + cnrn_target_memcpy_to_device(&(d_ns->perm), &pint); - auto ppd = cnrn_gpu_copyin(ns->jacobian, ns->n); - cnrn_memcpy_to_device(&(d_ns->jacobian), &ppd, sizeof(double**)); + auto ppd = cnrn_target_copyin(ns->jacobian, ns->n); + cnrn_target_memcpy_to_device(&(d_ns->jacobian), &ppd); // the actual jacobian doubles were allocated as a single array - double* d_jacdat = cnrn_gpu_copyin(ns->jacobian[0], ns->n * n); + double* d_jacdat = cnrn_target_copyin(ns->jacobian[0], ns->n * n); for (int i = 0; i < ns->n; ++i) { pd = d_jacdat + i * n; - cnrn_memcpy_to_device(&(ppd[i]), &pd, sizeof(double*)); + cnrn_target_memcpy_to_device(&(ppd[i]), &pd); } #endif } @@ -1248,76 +1241,76 @@ void nrn_sparseobj_copyto_device(SparseObj* so) { } unsigned n1 = so->neqn + 1; - SparseObj* d_so = cnrn_gpu_copyin(so); + SparseObj* d_so = cnrn_target_copyin(so); // only pointer fields in SparseObj that need setting up are // rowst, diag, rhs, ngetcall, coef_list // only pointer fields in Elm that need setting up are // r_down, c_right, value // do not care about the Elm* ptr value, just the space. - Elm** d_rowst = cnrn_gpu_copyin(so->rowst, n1); - cnrn_memcpy_to_device(&(d_so->rowst), &d_rowst, sizeof(Elm**)); + Elm** d_rowst = cnrn_target_copyin(so->rowst, n1); + cnrn_target_memcpy_to_device(&(d_so->rowst), &d_rowst); - Elm** d_diag = cnrn_gpu_copyin(so->diag, n1); - cnrn_memcpy_to_device(&(d_so->diag), &d_diag, sizeof(Elm**)); + Elm** d_diag = cnrn_target_copyin(so->diag, n1); + cnrn_target_memcpy_to_device(&(d_so->diag), &d_diag); - unsigned* pu = cnrn_gpu_copyin(so->ngetcall, so->_cntml_padded); - cnrn_memcpy_to_device(&(d_so->ngetcall), &pu, sizeof(Elm**)); + unsigned* pu = cnrn_target_copyin(so->ngetcall, so->_cntml_padded); + cnrn_target_memcpy_to_device(&(d_so->ngetcall), &pu); - double* pd = cnrn_gpu_copyin(so->rhs, n1 * so->_cntml_padded); - cnrn_memcpy_to_device(&(d_so->rhs), &pd, sizeof(double*)); + double* pd = cnrn_target_copyin(so->rhs, n1 * so->_cntml_padded); + cnrn_target_memcpy_to_device(&(d_so->rhs), &pd); - double** d_coef_list = cnrn_gpu_copyin(so->coef_list, so->coef_list_size); - cnrn_memcpy_to_device(&(d_so->coef_list), &d_coef_list, sizeof(double**)); + double** d_coef_list = cnrn_target_copyin(so->coef_list, so->coef_list_size); + cnrn_target_memcpy_to_device(&(d_so->coef_list), &d_coef_list); // Fill in relevant Elm pointer values for (unsigned irow = 1; irow < n1; ++irow) { for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) { - Elm* pelm = cnrn_gpu_copyin(elm); + Elm* pelm = cnrn_target_copyin(elm); if (elm == so->rowst[irow]) { - cnrn_memcpy_to_device(&(d_rowst[irow]), &pelm, sizeof(Elm*)); + cnrn_target_memcpy_to_device(&(d_rowst[irow]), &pelm); } else { - Elm* d_e = (Elm*) cnrn_target_deviceptr(elm->c_left); - cnrn_memcpy_to_device(&(pelm->c_left), &d_e, sizeof(Elm*)); + Elm* d_e = cnrn_target_deviceptr(elm->c_left); + cnrn_target_memcpy_to_device(&(pelm->c_left), &d_e); } if (elm->col == elm->row) { - cnrn_memcpy_to_device(&(d_diag[irow]), &pelm, sizeof(Elm*)); + cnrn_target_memcpy_to_device(&(d_diag[irow]), &pelm); } if (irow > 1) { if (elm->r_up) { - Elm* d_e = (Elm*) cnrn_target_deviceptr(elm->r_up); - cnrn_memcpy_to_device(&(pelm->r_up), &d_e, sizeof(Elm*)); + Elm* d_e = cnrn_target_deviceptr(elm->r_up); + cnrn_target_memcpy_to_device(&(pelm->r_up), &d_e); } } - pd = cnrn_gpu_copyin(elm->value, so->_cntml_padded); - cnrn_memcpy_to_device(&(pelm->value), &pd, sizeof(double*)); + pd = cnrn_target_copyin(elm->value, so->_cntml_padded); + cnrn_target_memcpy_to_device(&(pelm->value), &pd); } } // visit all the Elm again and fill in pelm->r_down and pelm->c_left for (unsigned irow = 1; irow < n1; ++irow) { for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) { - auto pelm = (Elm*) cnrn_target_deviceptr(elm); + auto pelm = cnrn_target_deviceptr(elm); if (elm->r_down) { - auto d_e = (Elm*) cnrn_target_deviceptr(elm->r_down); - cnrn_memcpy_to_device(&(pelm->r_down), &d_e, sizeof(Elm*)); + auto d_e = cnrn_target_deviceptr(elm->r_down); + cnrn_target_memcpy_to_device(&(pelm->r_down), &d_e); } if (elm->c_right) { - auto d_e = (Elm*) cnrn_target_deviceptr(elm->c_right); - cnrn_memcpy_to_device(&(pelm->c_right), &d_e, sizeof(Elm*)); + auto d_e = cnrn_target_deviceptr(elm->c_right); + cnrn_target_memcpy_to_device(&(pelm->c_right), &d_e); } } } // Fill in the d_so->coef_list for (unsigned i = 0; i < so->coef_list_size; ++i) { - pd = (double*) cnrn_target_deviceptr(so->coef_list[i]); - cnrn_memcpy_to_device(&(d_coef_list[i]), &pd, sizeof(double*)); + pd = cnrn_target_deviceptr(so->coef_list[i]); + cnrn_target_memcpy_to_device(&(d_coef_list[i]), &pd); } #endif } @@ -1349,13 +1342,13 @@ void nrn_sparseobj_delete_from_device(SparseObj* so) { void nrn_ion_global_map_copyto_device() { if (nrn_ion_global_map_size) { - double** d_data = cnrn_gpu_copyin(nrn_ion_global_map, - nrn_ion_global_map_size); + double** d_data = cnrn_target_copyin(nrn_ion_global_map, + nrn_ion_global_map_size); for (int j = 0; j < nrn_ion_global_map_size; j++) { if (nrn_ion_global_map[j]) { - double* d_mechmap = cnrn_gpu_copyin(nrn_ion_global_map[j], - ion_global_map_member_size); - cnrn_memcpy_to_device(&(d_data[j]), &d_mechmap, sizeof(double*)); + double* d_mechmap = cnrn_target_copyin(nrn_ion_global_map[j], + ion_global_map_member_size); + cnrn_target_memcpy_to_device(&(d_data[j]), &d_mechmap); } } } @@ -1422,10 +1415,8 @@ void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay) { VecPlayContinuous* vecplay_instance = (VecPlayContinuous*) nt->_vecplay[i]; /** just VecPlayContinuous object */ - void* d_p = cnrn_gpu_copyin(vecplay_instance); - cnrn_memcpy_to_device(&(d_vecplay[i]), &d_p, sizeof(void*)); - - VecPlayContinuous* d_vecplay_instance = (VecPlayContinuous*) d_p; + VecPlayContinuous* d_vecplay_instance = cnrn_target_copyin(vecplay_instance); + cnrn_target_memcpy_to_device((VecPlayContinuous**)(&(d_vecplay[i])), &d_vecplay_instance); /** copy y_, t_ and discon_indices_ */ copy_ivoc_vect_to_device(vecplay_instance->y_, d_vecplay_instance->y_); @@ -1437,15 +1428,15 @@ void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay) { } /** copy PlayRecordEvent : todo: verify this */ - PlayRecordEvent* d_e_ = cnrn_gpu_copyin(vecplay_instance->e_); + PlayRecordEvent* d_e_ = cnrn_target_copyin(vecplay_instance->e_); - cnrn_memcpy_to_device(&(d_e_->plr_), &d_vecplay_instance, sizeof(VecPlayContinuous*)); - cnrn_memcpy_to_device(&(d_vecplay_instance->e_), &d_e_, sizeof(PlayRecordEvent*)); + cnrn_target_memcpy_to_device(&(d_e_->plr_), (PlayRecord**)(&d_vecplay_instance)); + cnrn_target_memcpy_to_device(&(d_vecplay_instance->e_), &d_e_); /** copy pd_ : note that it's pointer inside ml->data and hence data itself is * already on GPU */ - double* d_pd_ = (double*) cnrn_target_deviceptr(vecplay_instance->pd_); - cnrn_memcpy_to_device(&(d_vecplay_instance->pd_), &d_pd_, sizeof(double*)); + double* d_pd_ = cnrn_target_deviceptr(vecplay_instance->pd_); + cnrn_target_memcpy_to_device(&(d_vecplay_instance->pd_), &d_pd_); } } From 4b9af99321cdcacae4154cb1a366e92fca25b02a Mon Sep 17 00:00:00 2001 From: Nicolas Cornu Date: Mon, 13 Dec 2021 12:39:32 +0100 Subject: [PATCH 4/5] Fix static_cast to const_cast --- coreneuron/gpu/nrn_acc_manager.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp index 0b5aa2cc6..8ed02d3e7 100644 --- a/coreneuron/gpu/nrn_acc_manager.cpp +++ b/coreneuron/gpu/nrn_acc_manager.cpp @@ -46,7 +46,7 @@ void nrn_VecPlay_delete_from_device(NrnThread* nt); template T* cnrn_target_deviceptr(const T* h_ptr) { #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC) - return acc_deviceptr(static_cast(h_ptr)); + return acc_deviceptr(const_cast(h_ptr)); #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) return static_cast(omp_get_mapped_ptr(const_cast(h_ptr), omp_get_default_device())); #else From b1ac85392bd36ae282b03cd46be3590d177d2e8c Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Mon, 13 Dec 2021 15:11:03 +0100 Subject: [PATCH 5/5] Fix VecPlayContinuous::discon_indices_ and OpenACC compilation. --- coreneuron/gpu/nrn_acc_manager.cpp | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp index 8ed02d3e7..4fe0004fd 100644 --- a/coreneuron/gpu/nrn_acc_manager.cpp +++ b/coreneuron/gpu/nrn_acc_manager.cpp @@ -36,7 +36,7 @@ #endif namespace coreneuron { extern InterleaveInfo* interleave_info; -void copy_ivoc_vect_to_device(const IvocVect& iv, IvocVect& div, bool vector_copy_needed = false); +void copy_ivoc_vect_to_device(const IvocVect& iv, IvocVect& div); void delete_ivoc_vect_from_device(IvocVect&); void nrn_ion_global_map_copyto_device(); void nrn_ion_global_map_delete_from_device(); @@ -46,7 +46,7 @@ void nrn_VecPlay_delete_from_device(NrnThread* nt); template T* cnrn_target_deviceptr(const T* h_ptr) { #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC) - return acc_deviceptr(const_cast(h_ptr)); + return static_cast(acc_deviceptr(const_cast(h_ptr))); #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) return static_cast(omp_get_mapped_ptr(const_cast(h_ptr), omp_get_default_device())); #else @@ -490,17 +490,11 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { #endif } -void copy_ivoc_vect_to_device(const IvocVect& from, IvocVect& to, bool vector_copy_needed) { +void copy_ivoc_vect_to_device(const IvocVect& from, IvocVect& to) { #ifdef _OPENACC /// by default `to` is desitionation pointer on a device IvocVect* d_iv = &to; - /// if we need to copy IvocVect vector then newly alloated vector - /// on the device is a new destination pointer - if(vector_copy_needed) { - d_iv = cnrn_target_copyin(&from); - cnrn_target_memcpy_to_device(&to, d_iv); - } size_t n = from.size(); if (n) { double* d_data = cnrn_target_copyin(from.data(), n); @@ -1416,21 +1410,24 @@ void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay) { /** just VecPlayContinuous object */ VecPlayContinuous* d_vecplay_instance = cnrn_target_copyin(vecplay_instance); - cnrn_target_memcpy_to_device((VecPlayContinuous**)(&(d_vecplay[i])), &d_vecplay_instance); + cnrn_target_memcpy_to_device((VecPlayContinuous**) (&(d_vecplay[i])), &d_vecplay_instance); /** copy y_, t_ and discon_indices_ */ copy_ivoc_vect_to_device(vecplay_instance->y_, d_vecplay_instance->y_); copy_ivoc_vect_to_device(vecplay_instance->t_, d_vecplay_instance->t_); + // OL211213: beware, the test suite does not currently include anything + // with a non-null discon_indices_. if (vecplay_instance->discon_indices_) { + IvocVect* d_discon_indices = cnrn_target_copyin(vecplay_instance->discon_indices_); + cnrn_target_memcpy_to_device(&(d_vecplay_instance->discon_indices_), &d_discon_indices); copy_ivoc_vect_to_device(*(vecplay_instance->discon_indices_), - *(d_vecplay_instance->discon_indices_), - true); + *(d_vecplay_instance->discon_indices_)); } /** copy PlayRecordEvent : todo: verify this */ PlayRecordEvent* d_e_ = cnrn_target_copyin(vecplay_instance->e_); - - cnrn_target_memcpy_to_device(&(d_e_->plr_), (PlayRecord**)(&d_vecplay_instance)); + + cnrn_target_memcpy_to_device(&(d_e_->plr_), (PlayRecord**) (&d_vecplay_instance)); cnrn_target_memcpy_to_device(&(d_vecplay_instance->e_), &d_e_); /** copy pd_ : note that it's pointer inside ml->data and hence data itself is