Skip to content
This repository was archived by the owner on Mar 20, 2023. It is now read-only.

Commit b18835b

Browse files
authored
Add OpenACC data cleanup. (#515)
* Add OpenACC data cleanup. Ensure that calls to acc_copyin(...) have corresponding calls to acc_delete(...), fixing errors (variable in data clause is partially present on the device) when repeatedly running CoreNEURON on GPU in the same process. Also fix handling of artificial cells to avoid problems with IntervalFire. * Address comment, add more acc_delete calls. This adds `nrn_newtonspace_delete_from_device` and `nrn_sparseobj_delete_from_device` methods that include `acc_delete` counterparts to `acc_copyin` calls.
1 parent aa264c2 commit b18835b

File tree

7 files changed

+242
-16
lines changed

7 files changed

+242
-16
lines changed

coreneuron/apps/main1.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -608,6 +608,11 @@ extern "C" int run_solve_core(int argc, char** argv) {
608608
finalize_report();
609609
}
610610

611+
// cleanup threads on GPU
612+
if (corenrn_param.gpu) {
613+
delete_nrnthreads_on_device(nrn_threads, nrn_nthread);
614+
}
615+
611616
// Cleaning the memory
612617
nrn_cleanup();
613618

@@ -621,7 +626,6 @@ extern "C" int run_solve_core(int argc, char** argv) {
621626
}
622627
#endif
623628

624-
finalize_data_on_device();
625629
Instrumentor::phase_end("main");
626630

627631
return 0;

coreneuron/gpu/nrn_acc_manager.cpp

Lines changed: 232 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,11 @@
3131
namespace coreneuron {
3232
extern InterleaveInfo* interleave_info;
3333
void copy_ivoc_vect_to_device(const IvocVect& iv, IvocVect& div);
34+
void delete_ivoc_vect_from_device(IvocVect&);
3435
void nrn_ion_global_map_copyto_device();
36+
void nrn_ion_global_map_delete_from_device();
3537
void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay);
38+
void nrn_VecPlay_delete_from_device(NrnThread* nt);
3639

3740
/* note: threads here are corresponding to global nrn_threads array */
3841
void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
@@ -415,6 +418,18 @@ void copy_ivoc_vect_to_device(const IvocVect& from, IvocVect& to) {
415418
#endif
416419
}
417420

421+
void delete_ivoc_vect_from_device(IvocVect& vec) {
422+
#ifdef _OPENACC
423+
auto const n = vec.size();
424+
if (n) {
425+
acc_delete(vec.data(), sizeof(double) * n);
426+
}
427+
acc_delete(&vec, sizeof(IvocVect));
428+
#else
429+
(void) vec;
430+
#endif
431+
}
432+
418433
void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml) {
419434
NetReceiveBuffer_t* nrb = ml->_net_receive_buffer;
420435
if (!nrb) {
@@ -610,19 +625,17 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) {
610625
int is_art = corenrn.get_is_artificial()[type];
611626
int layout = corenrn.get_mech_data_layout()[type];
612627

613-
// PatternStim is a special mechanism of type artificial cell
614-
// and it's not copied on GPU. So we shouldn't update it from GPU.
615-
if (type == nrn_get_mechtype("PatternStim")) {
628+
// Artificial mechanisms such as PatternStim and IntervalFire
629+
// are not copied onto the GPU. They should not, therefore, be
630+
// updated from the GPU.
631+
if (is_art) {
616632
continue;
617633
}
618634

619635
int pcnt = nrn_soa_padded_size(n, layout) * szp;
620636

621637
acc_update_self(ml->data, pcnt * sizeof(double));
622-
623-
if (!is_art) {
624-
acc_update_self(ml->nodeindices, n * sizeof(int));
625-
}
638+
acc_update_self(ml->nodeindices, n * sizeof(int));
626639

627640
if (szdp) {
628641
int pcnt = nrn_soa_padded_size(n, layout) * szdp;
@@ -854,17 +867,153 @@ void update_matrix_to_gpu(NrnThread* _nt) {
854867
#endif
855868
}
856869

857-
void finalize_data_on_device() {
858-
/*@todo: when we have used random123 on gpu and we do this finalize,
859-
I am seeing cuCtxDestroy returned CUDA_ERROR_INVALID_CONTEXT error.
860-
This might be due to the fact that the CUDA APIs (e.g. free) are not
861-
called yet for Random123 data / streams etc. So handle this better!
862-
*/
863-
return;
870+
/** Cleanup device memory that is being tracked by the OpenACC runtime.
871+
*
872+
* This function painstakingly calls `acc_delete` in reverse order on all
873+
* pointers that were passed to `acc_copyin` in `setup_nrnthreads_on_device`.
874+
* This cleanup ensures that if the GPU is initialised multiple times from the
875+
* same process then the OpenACC runtime will not be polluted with old
876+
* pointers, which can cause errors. In particular if we do:
877+
* @code
878+
* {
879+
* // ... some_ptr is dynamically allocated ...
880+
* acc_copyin(some_ptr, some_size);
881+
* // ... do some work ...
882+
* // acc_delete(some_ptr);
883+
* free(some_ptr);
884+
* }
885+
* {
886+
* // ... same_ptr_again is dynamically allocated at the same address ...
887+
* acc_copyin(same_ptr_again, some_other_size); // ERROR
888+
* }
889+
* @endcode
890+
* the application will/may abort with an error such as:
891+
* FATAL ERROR: variable in data clause is partially present on the device.
892+
* The pattern above is typical of calling CoreNEURON on GPU multiple times in
893+
* the same process.
894+
*/
895+
void delete_nrnthreads_on_device(NrnThread* threads, int nthreads) {
896+
#ifdef _OPENACC
897+
for (int i = 0; i < nthreads; i++) {
898+
NrnThread* nt = threads + i;
899+
900+
if (nt->_permute) {
901+
if (interleave_permute_type == 1) {
902+
InterleaveInfo* info = interleave_info + i;
903+
acc_delete(info->cellsize, sizeof(int) * nt->ncell);
904+
acc_delete(info->lastnode, sizeof(int) * nt->ncell);
905+
acc_delete(info->firstnode, sizeof(int) * nt->ncell);
906+
acc_delete(info->stride, sizeof(int) * (info->nstride + 1));
907+
acc_delete(info, sizeof(InterleaveInfo));
908+
} else if (interleave_permute_type == 2) {
909+
InterleaveInfo* info = interleave_info + i;
910+
acc_delete(info->cellsize, sizeof(int) * info->nwarp);
911+
acc_delete(info->stridedispl, sizeof(int) * (info->nwarp + 1));
912+
acc_delete(info->lastnode, sizeof(int) * (info->nwarp + 1));
913+
acc_delete(info->firstnode, sizeof(int) * (info->nwarp + 1));
914+
acc_delete(info->stride, sizeof(int) * info->nstride);
915+
acc_delete(info, sizeof(InterleaveInfo));
916+
}
917+
}
918+
919+
if (nt->n_vecplay) {
920+
nrn_VecPlay_delete_from_device(nt);
921+
acc_delete(nt->_vecplay, sizeof(void*) * nt->n_vecplay);
922+
}
923+
924+
// Cleanup send_receive buffer.
925+
if (nt->_net_send_buffer_size) {
926+
acc_delete(nt->_net_send_buffer, sizeof(int) * nt->_net_send_buffer_size);
927+
}
928+
929+
if (nt->n_presyn) {
930+
acc_delete(nt->presyns, sizeof(PreSyn) * nt->n_presyn);
931+
acc_delete(nt->presyns_helper, sizeof(PreSynHelper) * nt->n_presyn);
932+
}
933+
934+
// Cleanup data that's setup in bbcore_read.
935+
if (nt->_nvdata) {
936+
acc_delete(nt->_vdata, sizeof(void*) * nt->_nvdata);
937+
}
938+
939+
// Cleanup weight vector used in NET_RECEIVE
940+
if (nt->n_weight) {
941+
acc_delete(nt->weights, sizeof(double) * nt->n_weight);
942+
}
943+
944+
// Cleanup point processes
945+
if (nt->n_pntproc) {
946+
acc_delete(nt->pntprocs, nt->n_pntproc * sizeof(Point_process));
947+
}
948+
949+
if (nt->shadow_rhs_cnt) {
950+
int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0);
951+
acc_delete(nt->_shadow_d, pcnt * sizeof(double));
952+
acc_delete(nt->_shadow_rhs, pcnt * sizeof(double));
953+
}
954+
955+
for (auto tml = nt->tml; tml; tml = tml->next) {
956+
// Cleanup the net send buffer if it exists
957+
{
958+
NetSendBuffer_t* nsb{tml->ml->_net_send_buffer};
959+
if (nsb) {
960+
acc_delete(nsb->_nsb_flag, sizeof(double) * nsb->_size);
961+
acc_delete(nsb->_nsb_t, sizeof(double) * nsb->_size);
962+
acc_delete(nsb->_weight_index, sizeof(int) * nsb->_size);
963+
acc_delete(nsb->_pnt_index, sizeof(int) * nsb->_size);
964+
acc_delete(nsb->_vdata_index, sizeof(int) * nsb->_size);
965+
acc_delete(nsb->_sendtype, sizeof(int) * nsb->_size);
966+
acc_delete(nsb, sizeof(NetSendBuffer_t));
967+
}
968+
}
969+
// Cleanup the net receive buffer if it exists.
970+
{
971+
NetReceiveBuffer_t* nrb{tml->ml->_net_receive_buffer};
972+
if (nrb) {
973+
acc_delete(nrb->_nrb_index, sizeof(int) * (nrb->_size + 1));
974+
acc_delete(nrb->_displ, sizeof(int) * (nrb->_size + 1));
975+
acc_delete(nrb->_nrb_flag, sizeof(double) * nrb->_size);
976+
acc_delete(nrb->_nrb_t, sizeof(double) * nrb->_size);
977+
acc_delete(nrb->_weight_index, sizeof(int) * nrb->_size);
978+
acc_delete(nrb->_pnt_index, sizeof(int) * nrb->_size);
979+
acc_delete(nrb, sizeof(NetReceiveBuffer_t));
980+
}
981+
}
982+
int type = tml->index;
983+
int n = tml->ml->nodecount;
984+
int szdp = corenrn.get_prop_dparam_size()[type];
985+
int is_art = corenrn.get_is_artificial()[type];
986+
int layout = corenrn.get_mech_data_layout()[type];
987+
int ts = corenrn.get_memb_funcs()[type].thread_size_;
988+
if (ts) {
989+
acc_delete(tml->ml->_thread, ts * sizeof(ThreadDatum));
990+
}
991+
if (szdp) {
992+
int pcnt = nrn_soa_padded_size(n, layout) * szdp;
993+
acc_delete(tml->ml->pdata, sizeof(int) * pcnt);
994+
}
995+
if (!is_art) {
996+
acc_delete(tml->ml->nodeindices, sizeof(int) * n);
997+
}
998+
acc_delete(tml->ml, sizeof(Memb_list));
999+
acc_delete(tml, sizeof(NrnThreadMembList));
1000+
}
1001+
acc_delete(nt->_ml_list, corenrn.get_memb_funcs().size() * sizeof(Memb_list*));
1002+
acc_delete(nt->_v_parent_index, nt->end * sizeof(int));
1003+
acc_delete(nt->_data, nt->_ndata * sizeof(double));
1004+
}
1005+
acc_delete(threads, sizeof(NrnThread) * nthreads);
1006+
nrn_ion_global_map_delete_from_device();
1007+
1008+
acc_shutdown(acc_device_nvidia);
1009+
#endif
8641010
}
8651011

1012+
8661013
void nrn_newtonspace_copyto_device(NewtonSpace* ns) {
8671014
#ifdef _OPENACC
1015+
// FIXME this check needs to be tweaked if we ever want to run with a mix
1016+
// of CPU and GPU threads.
8681017
if (nrn_threads[0].compute_gpu == 0) {
8691018
return;
8701019
}
@@ -903,8 +1052,29 @@ void nrn_newtonspace_copyto_device(NewtonSpace* ns) {
9031052
#endif
9041053
}
9051054

1055+
void nrn_newtonspace_delete_from_device(NewtonSpace* ns) {
1056+
#ifdef _OPENACC
1057+
// FIXME this check needs to be tweaked if we ever want to run with a mix
1058+
// of CPU and GPU threads.
1059+
if (nrn_threads[0].compute_gpu == 0) {
1060+
return;
1061+
}
1062+
int n = ns->n * ns->n_instance;
1063+
acc_delete(ns->jacobian[0], ns->n * n * sizeof(double));
1064+
acc_delete(ns->jacobian, ns->n * sizeof(double*));
1065+
acc_delete(ns->perm, n * sizeof(int));
1066+
acc_delete(ns->rowmax, n * sizeof(double));
1067+
acc_delete(ns->low_value, n * sizeof(double));
1068+
acc_delete(ns->high_value, n * sizeof(double));
1069+
acc_delete(ns->delta_x, n * sizeof(double));
1070+
acc_delete(ns, sizeof(NewtonSpace));
1071+
#endif
1072+
}
1073+
9061074
void nrn_sparseobj_copyto_device(SparseObj* so) {
9071075
#ifdef _OPENACC
1076+
// FIXME this check needs to be tweaked if we ever want to run with a mix
1077+
// of CPU and GPU threads.
9081078
if (nrn_threads[0].compute_gpu == 0) {
9091079
return;
9101080
}
@@ -984,6 +1154,29 @@ void nrn_sparseobj_copyto_device(SparseObj* so) {
9841154
#endif
9851155
}
9861156

1157+
void nrn_sparseobj_delete_from_device(SparseObj* so) {
1158+
#ifdef _OPENACC
1159+
// FIXME this check needs to be tweaked if we ever want to run with a mix
1160+
// of CPU and GPU threads.
1161+
if (nrn_threads[0].compute_gpu == 0) {
1162+
return;
1163+
}
1164+
unsigned n1 = so->neqn + 1;
1165+
for (unsigned irow = 1; irow < n1; ++irow) {
1166+
for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) {
1167+
acc_delete(elm->value, so->_cntml_padded * sizeof(double));
1168+
acc_delete(elm, sizeof(Elm));
1169+
}
1170+
}
1171+
acc_delete(so->coef_list, so->coef_list_size * sizeof(double*));
1172+
acc_delete(so->rhs, n1 * so->_cntml_padded * sizeof(double));
1173+
acc_delete(so->ngetcall, so->_cntml_padded * sizeof(unsigned));
1174+
acc_delete(so->diag, n1 * sizeof(Elm*));
1175+
acc_delete(so->rowst, n1 * sizeof(Elm*));
1176+
acc_delete(so, sizeof(SparseObj));
1177+
#endif
1178+
}
1179+
9871180
#ifdef _OPENACC
9881181

9891182
void nrn_ion_global_map_copyto_device() {
@@ -1001,6 +1194,17 @@ void nrn_ion_global_map_copyto_device() {
10011194
}
10021195
}
10031196

1197+
void nrn_ion_global_map_delete_from_device() {
1198+
for (int j = 0; j < nrn_ion_global_map_size; j++) {
1199+
if (nrn_ion_global_map[j]) {
1200+
acc_delete(nrn_ion_global_map[j], ion_global_map_member_size * sizeof(double));
1201+
}
1202+
}
1203+
if (nrn_ion_global_map_size) {
1204+
acc_delete(nrn_ion_global_map, sizeof(double*) * nrn_ion_global_map_size);
1205+
}
1206+
}
1207+
10041208
void init_gpu() {
10051209
// choose nvidia GPU by default
10061210
acc_device_t device_type = acc_device_nvidia;
@@ -1057,5 +1261,19 @@ void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay) {
10571261
acc_memcpy_to_device(&(d_vecplay_instance->pd_), &d_pd_, sizeof(double*));
10581262
}
10591263
}
1264+
1265+
void nrn_VecPlay_delete_from_device(NrnThread* nt) {
1266+
for (int i = 0; i < nt->n_vecplay; i++) {
1267+
auto* vecplay_instance = reinterpret_cast<VecPlayContinuous*>(nt->_vecplay[i]);
1268+
acc_delete(vecplay_instance->e_, sizeof(PlayRecordEvent));
1269+
if (vecplay_instance->discon_indices_) {
1270+
delete_ivoc_vect_from_device(*(vecplay_instance->discon_indices_));
1271+
}
1272+
delete_ivoc_vect_from_device(vecplay_instance->t_);
1273+
delete_ivoc_vect_from_device(vecplay_instance->y_);
1274+
acc_delete(vecplay_instance, sizeof(VecPlayContinuous));
1275+
}
1276+
}
1277+
10601278
#endif
10611279
} // namespace coreneuron

coreneuron/gpu/nrn_acc_manager.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,11 @@
1717

1818
namespace coreneuron {
1919
void setup_nrnthreads_on_device(NrnThread* threads, int nthreads);
20+
void delete_nrnthreads_on_device(NrnThread* threads, int nthreads);
2021
void update_nrnthreads_on_host(NrnThread* threads, int nthreads);
2122
void update_nrnthreads_on_device(NrnThread* threads, int nthreads);
2223
void modify_data_on_device(NrnThread* threads, int nthreads);
2324
void dump_nt_to_file(char* filename, NrnThread* threads, int nthreads);
24-
void finalize_data_on_device();
2525

2626
void update_matrix_from_gpu(NrnThread* _nt);
2727
void update_matrix_to_gpu(NrnThread* _nt);

coreneuron/mechanism/mech/mod2c_core_thread.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,7 @@ extern double _modl_get_dt_thread(NrnThread*);
148148
extern void _modl_set_dt_thread(double, NrnThread*);
149149

150150
void nrn_sparseobj_copyto_device(SparseObj* so);
151+
void nrn_sparseobj_delete_from_device(SparseObj* so);
151152

152153
} // namespace coreneuron
153154

coreneuron/sim/scopmath/newton_struct.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ extern NewtonSpace* nrn_cons_newtonspace(int n, int n_instance);
5858
extern void nrn_destroy_newtonspace(NewtonSpace* ns);
5959

6060
void nrn_newtonspace_copyto_device(NewtonSpace* ns);
61+
void nrn_newtonspace_delete_from_device(NewtonSpace* ns);
6162

6263
} // namespace coreneuron
6364

coreneuron/sim/scopmath/newton_thread.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,7 @@ NewtonSpace* nrn_cons_newtonspace(int n, int n_instance) {
234234
}
235235

236236
void nrn_destroy_newtonspace(NewtonSpace* ns) {
237+
nrn_newtonspace_delete_from_device(ns);
237238
free((char*) ns->perm);
238239
freevector(ns->delta_x);
239240
freematrix(ns->jacobian);

coreneuron/sim/scopmath/sparse_thread.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -763,6 +763,7 @@ void _nrn_destroy_sparseobj_thread(SparseObj* so) {
763763
if (!so) {
764764
return;
765765
}
766+
nrn_sparseobj_delete_from_device(so);
766767
if (so->rowst)
767768
Free(so->rowst);
768769
if (so->diag)

0 commit comments

Comments
 (0)