3131namespace coreneuron {
3232extern InterleaveInfo* interleave_info;
3333void copy_ivoc_vect_to_device (const IvocVect& iv, IvocVect& div);
34+ void delete_ivoc_vect_from_device (IvocVect&);
3435void nrn_ion_global_map_copyto_device ();
36+ void nrn_ion_global_map_delete_from_device ();
3537void nrn_VecPlay_copyto_device (NrnThread* nt, void ** d_vecplay);
38+ void nrn_VecPlay_delete_from_device (NrnThread* nt);
3639
3740/* note: threads here are corresponding to global nrn_threads array */
3841void setup_nrnthreads_on_device (NrnThread* threads, int nthreads) {
@@ -415,6 +418,18 @@ void copy_ivoc_vect_to_device(const IvocVect& from, IvocVect& to) {
415418#endif
416419}
417420
421+ void delete_ivoc_vect_from_device (IvocVect& vec) {
422+ #ifdef _OPENACC
423+ auto const n = vec.size ();
424+ if (n) {
425+ acc_delete (vec.data (), sizeof (double ) * n);
426+ }
427+ acc_delete (&vec, sizeof (IvocVect));
428+ #else
429+ (void ) vec;
430+ #endif
431+ }
432+
418433void realloc_net_receive_buffer (NrnThread* nt, Memb_list* ml) {
419434 NetReceiveBuffer_t* nrb = ml->_net_receive_buffer ;
420435 if (!nrb) {
@@ -610,19 +625,17 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) {
610625 int is_art = corenrn.get_is_artificial ()[type];
611626 int layout = corenrn.get_mech_data_layout ()[type];
612627
613- // PatternStim is a special mechanim of type artificial cell
614- // and it's not copied on GPU. So we shouldn't update it from GPU.
615- if (type == nrn_get_mechtype (" PatternStim" )) {
628+ // Artificial mechanisms such as PatternStim and IntervalFire
629+ // are not copied onto the GPU. They should not, therefore, be
630+ // updated from the GPU.
631+ if (is_art) {
616632 continue ;
617633 }
618634
619635 int pcnt = nrn_soa_padded_size (n, layout) * szp;
620636
621637 acc_update_self (ml->data , pcnt * sizeof (double ));
622-
623- if (!is_art) {
624- acc_update_self (ml->nodeindices , n * sizeof (int ));
625- }
638+ acc_update_self (ml->nodeindices , n * sizeof (int ));
626639
627640 if (szdp) {
628641 int pcnt = nrn_soa_padded_size (n, layout) * szdp;
@@ -854,17 +867,153 @@ void update_matrix_to_gpu(NrnThread* _nt) {
854867#endif
855868}
856869
857- void finalize_data_on_device () {
858- /* @todo: when we have used random123 on gpu and we do this finalize,
859- I am seeing cuCtxDestroy returned CUDA_ERROR_INVALID_CONTEXT error.
860- THis might be due to the fact that the cuda apis (e.g. free is not
861- called yet for Ramdom123 data / streams etc. So handle this better!
862- */
863- return ;
870+ /* * Cleanup device memory that is being tracked by the OpenACC runtime.
871+ *
872+ * This function painstakingly calls `acc_delete` in reverse order on all
873+ * pointers that were passed to `acc_copyin` in `setup_nrnthreads_on_device`.
874+ * This cleanup ensures that if the GPU is initialised multiple times from the
875+ * same process then the OpenACC runtime will not be polluted with old
876+ * pointers, which can cause errors. In particular if we do:
877+ * @code
878+ * {
879+ * // ... some_ptr is dynamically allocated ...
880+ * acc_copyin(some_ptr, some_size);
881+ * // ... do some work ...
882+ * // acc_delete(some_ptr);
883+ * free(some_ptr);
884+ * }
885+ * {
886+ * // ... same_ptr_again is dynamically allocated at the same address ...
887+ * acc_copyin(same_ptr_again, some_other_size); // ERROR
888+ * }
889+ * @endcode
890+ * the application will/may abort with an error such as:
891+ * FATAL ERROR: variable in data clause is partially present on the device.
892+ * The pattern above is typical of calling CoreNEURON on GPU multiple times in
893+ * the same process.
894+ */
void delete_nrnthreads_on_device(NrnThread* threads, int nthreads) {
#ifdef _OPENACC
    // Walk every thread and acc_delete its device allocations in the reverse
    // of the order used by setup_nrnthreads_on_device: members/children are
    // deleted before the structures that point to them.
    for (int i = 0; i < nthreads; i++) {
        NrnThread* nt = threads + i;

        // Interleave/permutation metadata is only present when a permutation
        // was applied; the two permute types copy different members.
        if (nt->_permute) {
            if (interleave_permute_type == 1) {
                InterleaveInfo* info = interleave_info + i;
                acc_delete(info->cellsize, sizeof(int) * nt->ncell);
                acc_delete(info->lastnode, sizeof(int) * nt->ncell);
                acc_delete(info->firstnode, sizeof(int) * nt->ncell);
                acc_delete(info->stride, sizeof(int) * (info->nstride + 1));
                acc_delete(info, sizeof(InterleaveInfo));
            } else if (interleave_permute_type == 2) {
                // Warp-based interleaving: arrays are sized by nwarp, not ncell.
                InterleaveInfo* info = interleave_info + i;
                acc_delete(info->cellsize, sizeof(int) * info->nwarp);
                acc_delete(info->stridedispl, sizeof(int) * (info->nwarp + 1));
                acc_delete(info->lastnode, sizeof(int) * (info->nwarp + 1));
                acc_delete(info->firstnode, sizeof(int) * (info->nwarp + 1));
                acc_delete(info->stride, sizeof(int) * info->nstride);
                acc_delete(info, sizeof(InterleaveInfo));
            }
        }

        // VecPlay instances first (they hold nested device objects), then the
        // array of pointers to them.
        if (nt->n_vecplay) {
            nrn_VecPlay_delete_from_device(nt);
            acc_delete(nt->_vecplay, sizeof(void*) * nt->n_vecplay);
        }

        // Cleanup send_receive buffer.
        if (nt->_net_send_buffer_size) {
            acc_delete(nt->_net_send_buffer, sizeof(int) * nt->_net_send_buffer_size);
        }

        if (nt->n_presyn) {
            acc_delete(nt->presyns, sizeof(PreSyn) * nt->n_presyn);
            acc_delete(nt->presyns_helper, sizeof(PreSynHelper) * nt->n_presyn);
        }

        // Cleanup data that's setup in bbcore_read.
        if (nt->_nvdata) {
            acc_delete(nt->_vdata, sizeof(void*) * nt->_nvdata);
        }

        // Cleanup weight vector used in NET_RECEIVE
        if (nt->n_weight) {
            acc_delete(nt->weights, sizeof(double) * nt->n_weight);
        }

        // Cleanup point processes
        if (nt->n_pntproc) {
            acc_delete(nt->pntprocs, nt->n_pntproc * sizeof(Point_process));
        }

        // Shadow vectors were copied with SoA padding; use the same padded
        // size here so the byte counts match the original copyin.
        if (nt->shadow_rhs_cnt) {
            int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0);
            acc_delete(nt->_shadow_d, pcnt * sizeof(double));
            acc_delete(nt->_shadow_rhs, pcnt * sizeof(double));
        }

        // Per-mechanism data: buffers first, then pdata/nodeindices, then the
        // Memb_list and list node themselves. Host pointers stay valid, so
        // advancing tml->next after acc_delete(tml, ...) is safe.
        for (auto tml = nt->tml; tml; tml = tml->next) {
            // Cleanup the net send buffer if it exists
            {
                NetSendBuffer_t* nsb{tml->ml->_net_send_buffer};
                if (nsb) {
                    acc_delete(nsb->_nsb_flag, sizeof(double) * nsb->_size);
                    acc_delete(nsb->_nsb_t, sizeof(double) * nsb->_size);
                    acc_delete(nsb->_weight_index, sizeof(int) * nsb->_size);
                    acc_delete(nsb->_pnt_index, sizeof(int) * nsb->_size);
                    acc_delete(nsb->_vdata_index, sizeof(int) * nsb->_size);
                    acc_delete(nsb->_sendtype, sizeof(int) * nsb->_size);
                    acc_delete(nsb, sizeof(NetSendBuffer_t));
                }
            }
            // Cleanup the net receive buffer if it exists.
            {
                NetReceiveBuffer_t* nrb{tml->ml->_net_receive_buffer};
                if (nrb) {
                    // _nrb_index and _displ are sized _size + 1 (displacement
                    // arrays carry a trailing sentinel entry).
                    acc_delete(nrb->_nrb_index, sizeof(int) * (nrb->_size + 1));
                    acc_delete(nrb->_displ, sizeof(int) * (nrb->_size + 1));
                    acc_delete(nrb->_nrb_flag, sizeof(double) * nrb->_size);
                    acc_delete(nrb->_nrb_t, sizeof(double) * nrb->_size);
                    acc_delete(nrb->_weight_index, sizeof(int) * nrb->_size);
                    acc_delete(nrb->_pnt_index, sizeof(int) * nrb->_size);
                    acc_delete(nrb, sizeof(NetReceiveBuffer_t));
                }
            }
            int type = tml->index;
            int n = tml->ml->nodecount;
            int szdp = corenrn.get_prop_dparam_size()[type];
            int is_art = corenrn.get_is_artificial()[type];
            int layout = corenrn.get_mech_data_layout()[type];
            int ts = corenrn.get_memb_funcs()[type].thread_size_;
            if (ts) {
                acc_delete(tml->ml->_thread, ts * sizeof(ThreadDatum));
            }
            if (szdp) {
                int pcnt = nrn_soa_padded_size(n, layout) * szdp;
                acc_delete(tml->ml->pdata, sizeof(int) * pcnt);
            }
            // Artificial cells have no nodeindices on the device.
            if (!is_art) {
                acc_delete(tml->ml->nodeindices, sizeof(int) * n);
            }
            acc_delete(tml->ml, sizeof(Memb_list));
            acc_delete(tml, sizeof(NrnThreadMembList));
        }
        acc_delete(nt->_ml_list, corenrn.get_memb_funcs().size() * sizeof(Memb_list*));
        acc_delete(nt->_v_parent_index, nt->end * sizeof(int));
        acc_delete(nt->_data, nt->_ndata * sizeof(double));
    }
    // The NrnThread array itself was copied in as one contiguous block.
    acc_delete(threads, sizeof(NrnThread) * nthreads);
    nrn_ion_global_map_delete_from_device();

    // Tear down the OpenACC runtime for the device entirely, so a subsequent
    // re-initialisation in the same process starts from a clean state.
    acc_shutdown(acc_device_nvidia);
#endif
}
8651011
1012+
8661013void nrn_newtonspace_copyto_device (NewtonSpace* ns) {
8671014#ifdef _OPENACC
1015+ // FIXME this check needs to be tweaked if we ever want to run with a mix
1016+ // of CPU and GPU threads.
8681017 if (nrn_threads[0 ].compute_gpu == 0 ) {
8691018 return ;
8701019 }
@@ -903,8 +1052,29 @@ void nrn_newtonspace_copyto_device(NewtonSpace* ns) {
9031052#endif
9041053}
9051054
1055+ void nrn_newtonspace_delete_from_device (NewtonSpace* ns) {
1056+ #ifdef _OPENACC
1057+ // FIXME this check needs to be tweaked if we ever want to run with a mix
1058+ // of CPU and GPU threads.
1059+ if (nrn_threads[0 ].compute_gpu == 0 ) {
1060+ return ;
1061+ }
1062+ int n = ns->n * ns->n_instance ;
1063+ acc_delete (ns->jacobian [0 ], ns->n * n * sizeof (double ));
1064+ acc_delete (ns->jacobian , ns->n * sizeof (double *));
1065+ acc_delete (ns->perm , n * sizeof (int ));
1066+ acc_delete (ns->rowmax , n * sizeof (double ));
1067+ acc_delete (ns->low_value , n * sizeof (double ));
1068+ acc_delete (ns->high_value , n * sizeof (double ));
1069+ acc_delete (ns->delta_x , n * sizeof (double ));
1070+ acc_delete (ns, sizeof (NewtonSpace));
1071+ #endif
1072+ }
1073+
9061074void nrn_sparseobj_copyto_device (SparseObj* so) {
9071075#ifdef _OPENACC
1076+ // FIXME this check needs to be tweaked if we ever want to run with a mix
1077+ // of CPU and GPU threads.
9081078 if (nrn_threads[0 ].compute_gpu == 0 ) {
9091079 return ;
9101080 }
@@ -984,6 +1154,29 @@ void nrn_sparseobj_copyto_device(SparseObj* so) {
9841154#endif
9851155}
9861156
1157+ void nrn_sparseobj_delete_from_device (SparseObj* so) {
1158+ #ifdef _OPENACC
1159+ // FIXME this check needs to be tweaked if we ever want to run with a mix
1160+ // of CPU and GPU threads.
1161+ if (nrn_threads[0 ].compute_gpu == 0 ) {
1162+ return ;
1163+ }
1164+ unsigned n1 = so->neqn + 1 ;
1165+ for (unsigned irow = 1 ; irow < n1; ++irow) {
1166+ for (Elm* elm = so->rowst [irow]; elm; elm = elm->c_right ) {
1167+ acc_delete (elm->value , so->_cntml_padded * sizeof (double ));
1168+ acc_delete (elm, sizeof (Elm));
1169+ }
1170+ }
1171+ acc_delete (so->coef_list , so->coef_list_size * sizeof (double *));
1172+ acc_delete (so->rhs , n1 * so->_cntml_padded * sizeof (double ));
1173+ acc_delete (so->ngetcall , so->_cntml_padded * sizeof (unsigned ));
1174+ acc_delete (so->diag , n1 * sizeof (Elm*));
1175+ acc_delete (so->rowst , n1 * sizeof (Elm*));
1176+ acc_delete (so, sizeof (SparseObj));
1177+ #endif
1178+ }
1179+
9871180#ifdef _OPENACC
9881181
9891182void nrn_ion_global_map_copyto_device () {
@@ -1001,6 +1194,17 @@ void nrn_ion_global_map_copyto_device() {
10011194 }
10021195}
10031196
1197+ void nrn_ion_global_map_delete_from_device () {
1198+ for (int j = 0 ; j < nrn_ion_global_map_size; j++) {
1199+ if (nrn_ion_global_map[j]) {
1200+ acc_delete (nrn_ion_global_map[j], ion_global_map_member_size * sizeof (double ));
1201+ }
1202+ }
1203+ if (nrn_ion_global_map_size) {
1204+ acc_delete (nrn_ion_global_map, sizeof (double *) * nrn_ion_global_map_size);
1205+ }
1206+ }
1207+
10041208void init_gpu () {
10051209 // choose nvidia GPU by default
10061210 acc_device_t device_type = acc_device_nvidia;
@@ -1057,5 +1261,19 @@ void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay) {
10571261 acc_memcpy_to_device (&(d_vecplay_instance->pd_ ), &d_pd_, sizeof (double *));
10581262 }
10591263}
1264+
1265+ void nrn_VecPlay_delete_from_device (NrnThread* nt) {
1266+ for (int i = 0 ; i < nt->n_vecplay ; i++) {
1267+ auto * vecplay_instance = reinterpret_cast <VecPlayContinuous*>(nt->_vecplay [i]);
1268+ acc_delete (vecplay_instance->e_ , sizeof (PlayRecordEvent));
1269+ if (vecplay_instance->discon_indices_ ) {
1270+ delete_ivoc_vect_from_device (*(vecplay_instance->discon_indices_ ));
1271+ }
1272+ delete_ivoc_vect_from_device (vecplay_instance->t_ );
1273+ delete_ivoc_vect_from_device (vecplay_instance->y_ );
1274+ acc_delete (vecplay_instance, sizeof (VecPlayContinuous));
1275+ }
1276+ }
1277+
10601278#endif
10611279} // namespace coreneuron
0 commit comments