Skip to content
This repository was archived by the owner on Mar 20, 2023. It is now read-only.

Commit b18835b

Browse files
authored
Add OpenACC data cleanup. (#515)
* Add OpenACC data cleanup. Ensure that calls to acc_copyin(...) have corresponding calls to acc_delete(...), fixing errors (variable in data clause is partially present on the device) when repeatedly running CoreNEURON on GPU in the same process. Also fix handling of artificial cells to avoid problems with IntervalFire. * Address comment, add more acc_delete calls. This adds `nrn_newtonspace_delete_from_device` and `nrn_sparseobj_delete_from_device` methods that include `acc_delete` counterparts to `acc_copyin` calls.
1 parent aa264c2 commit b18835b

File tree

7 files changed

+242
-16
lines changed

7 files changed

+242
-16
lines changed

coreneuron/apps/main1.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -608,6 +608,11 @@ extern "C" int run_solve_core(int argc, char** argv) {
608608
finalize_report();
609609
}
610610

611+
// cleanup threads on GPU
612+
if (corenrn_param.gpu) {
613+
delete_nrnthreads_on_device(nrn_threads, nrn_nthread);
614+
}
615+
611616
// Cleaning the memory
612617
nrn_cleanup();
613618

@@ -621,7 +626,6 @@ extern "C" int run_solve_core(int argc, char** argv) {
621626
}
622627
#endif
623628

624-
finalize_data_on_device();
625629
Instrumentor::phase_end("main");
626630

627631
return 0;

coreneuron/gpu/nrn_acc_manager.cpp

Lines changed: 232 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,11 @@
3131
namespace coreneuron {
3232
extern InterleaveInfo* interleave_info;
3333
void copy_ivoc_vect_to_device(const IvocVect& iv, IvocVect& div);
34+
void delete_ivoc_vect_from_device(IvocVect&);
3435
void nrn_ion_global_map_copyto_device();
36+
void nrn_ion_global_map_delete_from_device();
3537
void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay);
38+
void nrn_VecPlay_delete_from_device(NrnThread* nt);
3639

3740
/* note: threads here are corresponding to global nrn_threads array */
3841
void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
@@ -415,6 +418,18 @@ void copy_ivoc_vect_to_device(const IvocVect& from, IvocVect& to) {
415418
#endif
416419
}
417420

421+
void delete_ivoc_vect_from_device(IvocVect& vec) {
422+
#ifdef _OPENACC
423+
auto const n = vec.size();
424+
if (n) {
425+
acc_delete(vec.data(), sizeof(double) * n);
426+
}
427+
acc_delete(&vec, sizeof(IvocVect));
428+
#else
429+
(void) vec;
430+
#endif
431+
}
432+
418433
void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml) {
419434
NetReceiveBuffer_t* nrb = ml->_net_receive_buffer;
420435
if (!nrb) {
@@ -610,19 +625,17 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) {
610625
int is_art = corenrn.get_is_artificial()[type];
611626
int layout = corenrn.get_mech_data_layout()[type];
612627

613-
// PatternStim is a special mechanism of type artificial cell
614-
// and it's not copied on GPU. So we shouldn't update it from GPU.
615-
if (type == nrn_get_mechtype("PatternStim")) {
628+
// Artificial mechanisms such as PatternStim and IntervalFire
629+
// are not copied onto the GPU. They should not, therefore, be
630+
// updated from the GPU.
631+
if (is_art) {
616632
continue;
617633
}
618634

619635
int pcnt = nrn_soa_padded_size(n, layout) * szp;
620636

621637
acc_update_self(ml->data, pcnt * sizeof(double));
622-
623-
if (!is_art) {
624-
acc_update_self(ml->nodeindices, n * sizeof(int));
625-
}
638+
acc_update_self(ml->nodeindices, n * sizeof(int));
626639

627640
if (szdp) {
628641
int pcnt = nrn_soa_padded_size(n, layout) * szdp;
@@ -854,17 +867,153 @@ void update_matrix_to_gpu(NrnThread* _nt) {
854867
#endif
855868
}
856869

857-
void finalize_data_on_device() {
858-
/*@todo: when we have used random123 on gpu and we do this finalize,
859-
I am seeing cuCtxDestroy returned CUDA_ERROR_INVALID_CONTEXT error.
860-
This might be due to the fact that the CUDA APIs (e.g. free) are not
861-
called yet for Random123 data / streams etc. So handle this better!
862-
*/
863-
return;
870+
/** Cleanup device memory that is being tracked by the OpenACC runtime.
871+
*
872+
* This function painstakingly calls `acc_delete` in reverse order on all
873+
* pointers that were passed to `acc_copyin` in `setup_nrnthreads_on_device`.
874+
* This cleanup ensures that if the GPU is initialised multiple times from the
875+
* same process then the OpenACC runtime will not be polluted with old
876+
* pointers, which can cause errors. In particular if we do:
877+
* @code
878+
* {
879+
* // ... some_ptr is dynamically allocated ...
880+
* acc_copyin(some_ptr, some_size);
881+
* // ... do some work ...
882+
* // acc_delete(some_ptr);
883+
* free(some_ptr);
884+
* }
885+
* {
886+
* // ... same_ptr_again is dynamically allocated at the same address ...
887+
* acc_copyin(same_ptr_again, some_other_size); // ERROR
888+
* }
889+
* @endcode
890+
* the application will/may abort with an error such as:
891+
* FATAL ERROR: variable in data clause is partially present on the device.
892+
* The pattern above is typical of calling CoreNEURON on GPU multiple times in
893+
* the same process.
894+
*/
895+
void delete_nrnthreads_on_device(NrnThread* threads, int nthreads) {
896+
#ifdef _OPENACC
897+
for (int i = 0; i < nthreads; i++) {
898+
NrnThread* nt = threads + i;
899+
900+
if (nt->_permute) {
901+
if (interleave_permute_type == 1) {
902+
InterleaveInfo* info = interleave_info + i;
903+
acc_delete(info->cellsize, sizeof(int) * nt->ncell);
904+
acc_delete(info->lastnode, sizeof(int) * nt->ncell);
905+
acc_delete(info->firstnode, sizeof(int) * nt->ncell);
906+
acc_delete(info->stride, sizeof(int) * (info->nstride + 1));
907+
acc_delete(info, sizeof(InterleaveInfo));
908+
} else if (interleave_permute_type == 2) {
909+
InterleaveInfo* info = interleave_info + i;
910+
acc_delete(info->cellsize, sizeof(int) * info->nwarp);
911+
acc_delete(info->stridedispl, sizeof(int) * (info->nwarp + 1));
912+
acc_delete(info->lastnode, sizeof(int) * (info->nwarp + 1));
913+
acc_delete(info->firstnode, sizeof(int) * (info->nwarp + 1));
914+
acc_delete(info->stride, sizeof(int) * info->nstride);
915+
acc_delete(info, sizeof(InterleaveInfo));
916+
}
917+
}
918+
919+
if (nt->n_vecplay) {
920+
nrn_VecPlay_delete_from_device(nt);
921+
acc_delete(nt->_vecplay, sizeof(void*) * nt->n_vecplay);
922+
}
923+
924+
// Cleanup send_receive buffer.
925+
if (nt->_net_send_buffer_size) {
926+
acc_delete(nt->_net_send_buffer, sizeof(int) * nt->_net_send_buffer_size);
927+
}
928+
929+
if (nt->n_presyn) {
930+
acc_delete(nt->presyns, sizeof(PreSyn) * nt->n_presyn);
931+
acc_delete(nt->presyns_helper, sizeof(PreSynHelper) * nt->n_presyn);
932+
}
933+
934+
// Cleanup data that's setup in bbcore_read.
935+
if (nt->_nvdata) {
936+
acc_delete(nt->_vdata, sizeof(void*) * nt->_nvdata);
937+
}
938+
939+
// Cleanup weight vector used in NET_RECEIVE
940+
if (nt->n_weight) {
941+
acc_delete(nt->weights, sizeof(double) * nt->n_weight);
942+
}
943+
944+
// Cleanup point processes
945+
if (nt->n_pntproc) {
946+
acc_delete(nt->pntprocs, nt->n_pntproc * sizeof(Point_process));
947+
}
948+
949+
if (nt->shadow_rhs_cnt) {
950+
int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0);
951+
acc_delete(nt->_shadow_d, pcnt * sizeof(double));
952+
acc_delete(nt->_shadow_rhs, pcnt * sizeof(double));
953+
}
954+
955+
for (auto tml = nt->tml; tml; tml = tml->next) {
956+
// Cleanup the net send buffer if it exists
957+
{
958+
NetSendBuffer_t* nsb{tml->ml->_net_send_buffer};
959+
if (nsb) {
960+
acc_delete(nsb->_nsb_flag, sizeof(double) * nsb->_size);
961+
acc_delete(nsb->_nsb_t, sizeof(double) * nsb->_size);
962+
acc_delete(nsb->_weight_index, sizeof(int) * nsb->_size);
963+
acc_delete(nsb->_pnt_index, sizeof(int) * nsb->_size);
964+
acc_delete(nsb->_vdata_index, sizeof(int) * nsb->_size);
965+
acc_delete(nsb->_sendtype, sizeof(int) * nsb->_size);
966+
acc_delete(nsb, sizeof(NetSendBuffer_t));
967+
}
968+
}
969+
// Cleanup the net receive buffer if it exists.
970+
{
971+
NetReceiveBuffer_t* nrb{tml->ml->_net_receive_buffer};
972+
if (nrb) {
973+
acc_delete(nrb->_nrb_index, sizeof(int) * (nrb->_size + 1));
974+
acc_delete(nrb->_displ, sizeof(int) * (nrb->_size + 1));
975+
acc_delete(nrb->_nrb_flag, sizeof(double) * nrb->_size);
976+
acc_delete(nrb->_nrb_t, sizeof(double) * nrb->_size);
977+
acc_delete(nrb->_weight_index, sizeof(int) * nrb->_size);
978+
acc_delete(nrb->_pnt_index, sizeof(int) * nrb->_size);
979+
acc_delete(nrb, sizeof(NetReceiveBuffer_t));
980+
}
981+
}
982+
int type = tml->index;
983+
int n = tml->ml->nodecount;
984+
int szdp = corenrn.get_prop_dparam_size()[type];
985+
int is_art = corenrn.get_is_artificial()[type];
986+
int layout = corenrn.get_mech_data_layout()[type];
987+
int ts = corenrn.get_memb_funcs()[type].thread_size_;
988+
if (ts) {
989+
acc_delete(tml->ml->_thread, ts * sizeof(ThreadDatum));
990+
}
991+
if (szdp) {
992+
int pcnt = nrn_soa_padded_size(n, layout) * szdp;
993+
acc_delete(tml->ml->pdata, sizeof(int) * pcnt);
994+
}
995+
if (!is_art) {
996+
acc_delete(tml->ml->nodeindices, sizeof(int) * n);
997+
}
998+
acc_delete(tml->ml, sizeof(Memb_list));
999+
acc_delete(tml, sizeof(NrnThreadMembList));
1000+
}
1001+
acc_delete(nt->_ml_list, corenrn.get_memb_funcs().size() * sizeof(Memb_list*));
1002+
acc_delete(nt->_v_parent_index, nt->end * sizeof(int));
1003+
acc_delete(nt->_data, nt->_ndata * sizeof(double));
1004+
}
1005+
acc_delete(threads, sizeof(NrnThread) * nthreads);
1006+
nrn_ion_global_map_delete_from_device();
1007+
1008+
acc_shutdown(acc_device_nvidia);
1009+
#endif
8641010
}
8651011

1012+
8661013
void nrn_newtonspace_copyto_device(NewtonSpace* ns) {
8671014
#ifdef _OPENACC
1015+
// FIXME this check needs to be tweaked if we ever want to run with a mix
1016+
// of CPU and GPU threads.
8681017
if (nrn_threads[0].compute_gpu == 0) {
8691018
return;
8701019
}
@@ -903,8 +1052,29 @@ void nrn_newtonspace_copyto_device(NewtonSpace* ns) {
9031052
#endif
9041053
}
9051054

1055+
void nrn_newtonspace_delete_from_device(NewtonSpace* ns) {
1056+
#ifdef _OPENACC
1057+
// FIXME this check needs to be tweaked if we ever want to run with a mix
1058+
// of CPU and GPU threads.
1059+
if (nrn_threads[0].compute_gpu == 0) {
1060+
return;
1061+
}
1062+
int n = ns->n * ns->n_instance;
1063+
acc_delete(ns->jacobian[0], ns->n * n * sizeof(double));
1064+
acc_delete(ns->jacobian, ns->n * sizeof(double*));
1065+
acc_delete(ns->perm, n * sizeof(int));
1066+
acc_delete(ns->rowmax, n * sizeof(double));
1067+
acc_delete(ns->low_value, n * sizeof(double));
1068+
acc_delete(ns->high_value, n * sizeof(double));
1069+
acc_delete(ns->delta_x, n * sizeof(double));
1070+
acc_delete(ns, sizeof(NewtonSpace));
1071+
#endif
1072+
}
1073+
9061074
void nrn_sparseobj_copyto_device(SparseObj* so) {
9071075
#ifdef _OPENACC
1076+
// FIXME this check needs to be tweaked if we ever want to run with a mix
1077+
// of CPU and GPU threads.
9081078
if (nrn_threads[0].compute_gpu == 0) {
9091079
return;
9101080
}
@@ -984,6 +1154,29 @@ void nrn_sparseobj_copyto_device(SparseObj* so) {
9841154
#endif
9851155
}
9861156

1157+
void nrn_sparseobj_delete_from_device(SparseObj* so) {
1158+
#ifdef _OPENACC
1159+
// FIXME this check needs to be tweaked if we ever want to run with a mix
1160+
// of CPU and GPU threads.
1161+
if (nrn_threads[0].compute_gpu == 0) {
1162+
return;
1163+
}
1164+
unsigned n1 = so->neqn + 1;
1165+
for (unsigned irow = 1; irow < n1; ++irow) {
1166+
for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) {
1167+
acc_delete(elm->value, so->_cntml_padded * sizeof(double));
1168+
acc_delete(elm, sizeof(Elm));
1169+
}
1170+
}
1171+
acc_delete(so->coef_list, so->coef_list_size * sizeof(double*));
1172+
acc_delete(so->rhs, n1 * so->_cntml_padded * sizeof(double));
1173+
acc_delete(so->ngetcall, so->_cntml_padded * sizeof(unsigned));
1174+
acc_delete(so->diag, n1 * sizeof(Elm*));
1175+
acc_delete(so->rowst, n1 * sizeof(Elm*));
1176+
acc_delete(so, sizeof(SparseObj));
1177+
#endif
1178+
}
1179+
9871180
#ifdef _OPENACC
9881181

9891182
void nrn_ion_global_map_copyto_device() {
@@ -1001,6 +1194,17 @@ void nrn_ion_global_map_copyto_device() {
10011194
}
10021195
}
10031196

1197+
void nrn_ion_global_map_delete_from_device() {
1198+
for (int j = 0; j < nrn_ion_global_map_size; j++) {
1199+
if (nrn_ion_global_map[j]) {
1200+
acc_delete(nrn_ion_global_map[j], ion_global_map_member_size * sizeof(double));
1201+
}
1202+
}
1203+
if (nrn_ion_global_map_size) {
1204+
acc_delete(nrn_ion_global_map, sizeof(double*) * nrn_ion_global_map_size);
1205+
}
1206+
}
1207+
10041208
void init_gpu() {
10051209
// choose nvidia GPU by default
10061210
acc_device_t device_type = acc_device_nvidia;
@@ -1057,5 +1261,19 @@ void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay) {
10571261
acc_memcpy_to_device(&(d_vecplay_instance->pd_), &d_pd_, sizeof(double*));
10581262
}
10591263
}
1264+
1265+
void nrn_VecPlay_delete_from_device(NrnThread* nt) {
1266+
for (int i = 0; i < nt->n_vecplay; i++) {
1267+
auto* vecplay_instance = reinterpret_cast<VecPlayContinuous*>(nt->_vecplay[i]);
1268+
acc_delete(vecplay_instance->e_, sizeof(PlayRecordEvent));
1269+
if (vecplay_instance->discon_indices_) {
1270+
delete_ivoc_vect_from_device(*(vecplay_instance->discon_indices_));
1271+
}
1272+
delete_ivoc_vect_from_device(vecplay_instance->t_);
1273+
delete_ivoc_vect_from_device(vecplay_instance->y_);
1274+
acc_delete(vecplay_instance, sizeof(VecPlayContinuous));
1275+
}
1276+
}
1277+
10601278
#endif
10611279
} // namespace coreneuron

coreneuron/gpu/nrn_acc_manager.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,11 @@
1717

1818
namespace coreneuron {
1919
void setup_nrnthreads_on_device(NrnThread* threads, int nthreads);
20+
void delete_nrnthreads_on_device(NrnThread* threads, int nthreads);
2021
void update_nrnthreads_on_host(NrnThread* threads, int nthreads);
2122
void update_nrnthreads_on_device(NrnThread* threads, int nthreads);
2223
void modify_data_on_device(NrnThread* threads, int nthreads);
2324
void dump_nt_to_file(char* filename, NrnThread* threads, int nthreads);
24-
void finalize_data_on_device();
2525

2626
void update_matrix_from_gpu(NrnThread* _nt);
2727
void update_matrix_to_gpu(NrnThread* _nt);

coreneuron/mechanism/mech/mod2c_core_thread.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,7 @@ extern double _modl_get_dt_thread(NrnThread*);
148148
extern void _modl_set_dt_thread(double, NrnThread*);
149149

150150
void nrn_sparseobj_copyto_device(SparseObj* so);
151+
void nrn_sparseobj_delete_from_device(SparseObj* so);
151152

152153
} // namespace coreneuron
153154

coreneuron/sim/scopmath/newton_struct.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ extern NewtonSpace* nrn_cons_newtonspace(int n, int n_instance);
5858
extern void nrn_destroy_newtonspace(NewtonSpace* ns);
5959

6060
void nrn_newtonspace_copyto_device(NewtonSpace* ns);
61+
void nrn_newtonspace_delete_from_device(NewtonSpace* ns);
6162

6263
} // namespace coreneuron
6364

coreneuron/sim/scopmath/newton_thread.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,7 @@ NewtonSpace* nrn_cons_newtonspace(int n, int n_instance) {
234234
}
235235

236236
void nrn_destroy_newtonspace(NewtonSpace* ns) {
237+
nrn_newtonspace_delete_from_device(ns);
237238
free((char*) ns->perm);
238239
freevector(ns->delta_x);
239240
freematrix(ns->jacobian);

coreneuron/sim/scopmath/sparse_thread.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -763,6 +763,7 @@ void _nrn_destroy_sparseobj_thread(SparseObj* so) {
763763
if (!so) {
764764
return;
765765
}
766+
nrn_sparseobj_delete_from_device(so);
766767
if (so->rowst)
767768
Free(so->rowst);
768769
if (so->diag)

0 commit comments

Comments
 (0)