Enable fast_imem on GPU.

olupton · olupton · commit 30845ddd379b · 2021-06-02T13:42:11.000+02:00
Updates mod2c/nmodl submodule commits to include relevant fixes, BlueBrain/mod2c#64 and BlueBrain/nmodl#681. Closes #197.
diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -291,6 +291,23 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
             acc_memcpy_to_device(&(d_nt->_shadow_d), &d_shadow_ptr, sizeof(double*));
         }
 
+        /* Fast membrane current calculation struct */
+        if (nt->nrn_fast_imem) {
+            auto* d_fast_imem = reinterpret_cast<NrnFastImem*>(
+                acc_copyin(nt->nrn_fast_imem, sizeof(NrnFastImem)));
+            acc_memcpy_to_device(&(d_nt->nrn_fast_imem), &d_fast_imem, sizeof(NrnFastImem*));
+            {
+                auto* d_ptr = reinterpret_cast<double*>(
+                    acc_copyin(nt->nrn_fast_imem->nrn_sav_rhs, nt->end * sizeof(double)));
+                acc_memcpy_to_device(&(d_fast_imem->nrn_sav_rhs), &d_ptr, sizeof(double*));
+            }
+            {
+                auto* d_ptr = reinterpret_cast<double*>(
+                    acc_copyin(nt->nrn_fast_imem->nrn_sav_d, nt->end * sizeof(double)));
+                acc_memcpy_to_device(&(d_fast_imem->nrn_sav_d), &d_ptr, sizeof(double*));
+            }
+        }
+
         if (nt->n_pntproc) {
             /* copy Point_processes array and fix the pointer to execute net_receive blocks on GPU
              */
@@ -659,6 +676,11 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) {
                 acc_update_self(nt->_shadow_d, pcnt * sizeof(double));
             }
 
+            if (nt->nrn_fast_imem) {
+                acc_update_self(nt->nrn_fast_imem->nrn_sav_rhs, nt->end * sizeof(double));
+                acc_update_self(nt->nrn_fast_imem->nrn_sav_d, nt->end * sizeof(double));
+            }
+
             if (nt->n_pntproc) {
                 acc_update_self(nt->pntprocs, nt->n_pntproc * sizeof(Point_process));
             }
@@ -748,6 +770,11 @@ void update_nrnthreads_on_device(NrnThread* threads, int nthreads) {
                 acc_update_device(nt->_shadow_d, pcnt * sizeof(double));
             }
 
+            if (nt->nrn_fast_imem) {
+                acc_update_device(nt->nrn_fast_imem->nrn_sav_rhs, nt->end * sizeof(double));
+                acc_update_device(nt->nrn_fast_imem->nrn_sav_d, nt->end * sizeof(double));
+            }
+
             if (nt->n_pntproc) {
                 acc_update_device(nt->pntprocs, nt->n_pntproc * sizeof(Point_process));
             }
@@ -787,6 +814,19 @@ void update_voltage_from_gpu(NrnThread* nt) {
     }
 }
 
+/**
+ * @brief Copy fast_imem vectors from GPU to CPU.
+ *
+ * Refreshes the host copies of nt->nrn_fast_imem->nrn_sav_d and
+ * nt->nrn_fast_imem->nrn_sav_rhs (nt->end doubles each) through an OpenACC
+ * `update host` directive. Nothing is done unless nt->compute_gpu is set,
+ * nt->end is positive and nt->nrn_fast_imem is non-null.
+ *
+ * @param nt Thread whose fast_imem arrays are refreshed on the host.
+ *
+ * NOTE(review): the directive carries no async or wait clause, whereas the
+ * loops that write these arrays are launched with async(nt->stream_id);
+ * presumably the caller has already synchronized that queue - confirm.
+ *
+ */
+void update_fast_imem_from_gpu(NrnThread* nt) {
+    if (nt->compute_gpu && nt->end > 0 && nt->nrn_fast_imem) {
+        int num_fast_imem = nt->end;
+        double* fast_imem_d = nt->nrn_fast_imem->nrn_sav_d;
+        double* fast_imem_rhs = nt->nrn_fast_imem->nrn_sav_rhs;
+#pragma acc update host(fast_imem_d [0:num_fast_imem], fast_imem_rhs [0:num_fast_imem])
+    }
+}
+
 /**
  * Copy weights from GPU to CPU
  *
@@ -940,6 +980,11 @@ void delete_nrnthreads_on_device(NrnThread* threads, int nthreads) {
             acc_delete(nt->pntprocs, nt->n_pntproc * sizeof(Point_process));
         }
 
+        /* Release everything setup_nrnthreads_on_device copied in for fast_imem:
+         * the two per-node arrays first, then the NrnFastImem struct itself.
+         * The struct is copied to the device separately from its arrays, so it
+         * has to be deleted explicitly or its device copy is leaked. */
+        if (nt->nrn_fast_imem) {
+            acc_delete(nt->nrn_fast_imem->nrn_sav_d, nt->end * sizeof(double));
+            acc_delete(nt->nrn_fast_imem->nrn_sav_rhs, nt->end * sizeof(double));
+            acc_delete(nt->nrn_fast_imem, sizeof(NrnFastImem));
+        }
+
         if (nt->shadow_rhs_cnt) {
             int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0);
             acc_delete(nt->_shadow_d, pcnt * sizeof(double));
diff --git a/coreneuron/gpu/nrn_acc_manager.hpp b/coreneuron/gpu/nrn_acc_manager.hpp
@@ -29,6 +29,7 @@ void update_net_receive_buffer(NrnThread* _nt);
 void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml);
 void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb);
 void update_voltage_from_gpu(NrnThread* nt);
+void update_fast_imem_from_gpu(NrnThread* nt);
 void update_weights_from_gpu(NrnThread* threads, int nthreads);
 void init_gpu();
 
diff --git a/coreneuron/sim/fadvance_core.cpp b/coreneuron/sim/fadvance_core.cpp
@@ -298,6 +298,9 @@ void nrncore2nrn_send_values(NrnThread* nth) {
         //       Currently we are updating voltages if there is any trajectory
         //       requested by NEURON.
         update_voltage_from_gpu(nth);
+        // \todo Check if this information has been requested by the user for
+        //       this NrnThread object.
+        update_fast_imem_from_gpu(nth);
 
         if (tr->varrays) {  // full trajectories into Vector data
             double** va = tr->varrays;
diff --git a/coreneuron/sim/fast_imem.cpp b/coreneuron/sim/fast_imem.cpp
@@ -50,6 +50,10 @@ void nrn_calc_fast_imem(NrnThread* nt) {
 
     double* fast_imem_d = nt->nrn_fast_imem->nrn_sav_d;
     double* fast_imem_rhs = nt->nrn_fast_imem->nrn_sav_rhs;
+#pragma acc parallel loop present(vec_rhs,     \
+                                  vec_area,    \
+                                  fast_imem_d, \
+                                  fast_imem_rhs) if (nt->compute_gpu) async(nt->stream_id)
     for (int i = i1; i < i3; ++i) {
         fast_imem_rhs[i] = (fast_imem_d[i] * vec_rhs[i] + fast_imem_rhs[i]) * vec_area[i] * 0.01;
     }
diff --git a/coreneuron/sim/treeset_core.cpp b/coreneuron/sim/treeset_core.cpp
@@ -44,9 +44,14 @@ static void nrn_rhs(NrnThread* _nt) {
     }
 
     if (_nt->nrn_fast_imem) {
+        double* fast_imem_d = _nt->nrn_fast_imem->nrn_sav_d;
+        double* fast_imem_rhs = _nt->nrn_fast_imem->nrn_sav_rhs;
+#pragma acc parallel loop present(fast_imem_d [i1:i3],                         \
+                                  fast_imem_rhs [i1:i3]) if (_nt->compute_gpu) \
+    async(_nt->stream_id)
         for (int i = i1; i < i3; ++i) {
-            _nt->nrn_fast_imem->nrn_sav_rhs[i] = 0.;
-            _nt->nrn_fast_imem->nrn_sav_d[i] = 0.;
+            fast_imem_d[i] = 0.;
+            fast_imem_rhs[i] = 0.;
         }
     }
 
@@ -71,6 +76,7 @@ static void nrn_rhs(NrnThread* _nt) {
            so here we transform so it only has membrane current contribution
         */
         double* p = _nt->nrn_fast_imem->nrn_sav_rhs;
+#pragma acc parallel loop present(p, vec_rhs) if (_nt->compute_gpu) async(_nt->stream_id)
         for (int i = i1; i < i3; ++i) {
             p[i] -= vec_rhs[i];
         }
@@ -144,6 +150,7 @@ static void nrn_lhs(NrnThread* _nt) {
            so here we transform so it only has membrane current contribution
         */
         double* p = _nt->nrn_fast_imem->nrn_sav_d;
+#pragma acc parallel loop present(p, vec_d) if (_nt->compute_gpu) async(_nt->stream_id)
         for (int i = i1; i < i3; ++i) {
             p[i] += vec_d[i];
         }
diff --git a/external/mod2c b/external/mod2c
@@ -1 +1 @@
-Subproject commit 92d62cc780ec09acde0524246cb34403bcf03fb3
+Subproject commit ea286d1b8e34ade50f58a4e07bb66890aa521dbc
diff --git a/external/nmodl b/external/nmodl
@@ -1 +1 @@
-Subproject commit 2586c2e66ffd4d8e67244f8447ef56c9e79575a9
+Subproject commit 6300b47f9484836aafeaf964326daf8bf36fc63a

Original file line number	Diff line number	Diff line change
`@@ -44,9 +44,14 @@ static void nrn_rhs(NrnThread* _nt) {`
`44`	`44`	`}`
`45`	`45`
`46`	`46`	`if (_nt->nrn_fast_imem) {`
	`47`	`+ double* fast_imem_d = _nt->nrn_fast_imem->nrn_sav_d;`
	`48`	`+ double* fast_imem_rhs = _nt->nrn_fast_imem->nrn_sav_rhs;`
	`49`	`+#pragma acc parallel loop present(fast_imem_d [i1:i3], \`
	`50`	`+ fast_imem_rhs [i1:i3]) if (_nt->compute_gpu) \`
	`51`	`+ async(_nt->stream_id)`
`47`	`52`	`for (int i = i1; i < i3; ++i) {`
`48`		`- _nt->nrn_fast_imem->nrn_sav_rhs[i] = 0.;`
`49`		`- _nt->nrn_fast_imem->nrn_sav_d[i] = 0.;`
	`53`	`+ fast_imem_d[i] = 0.;`
	`54`	`+ fast_imem_rhs[i] = 0.;`
`50`	`55`	`}`
`51`	`56`	`}`
`52`	`57`
`@@ -71,6 +76,7 @@ static void nrn_rhs(NrnThread* _nt) {`
`71`	`76`	`so here we transform so it only has membrane current contribution`
`72`	`77`	`*/`
`73`	`78`	`double* p = _nt->nrn_fast_imem->nrn_sav_rhs;`
	`79`	`+#pragma acc parallel loop present(p, vec_rhs) if (_nt->compute_gpu) async(_nt->stream_id)`
`74`	`80`	`for (int i = i1; i < i3; ++i) {`
`75`	`81`	`p[i] -= vec_rhs[i];`
`76`	`82`	`}`
`@@ -144,6 +150,7 @@ static void nrn_lhs(NrnThread* _nt) {`
`144`	`150`	`so here we transform so it only has membrane current contribution`
`145`	`151`	`*/`
`146`	`152`	`double* p = _nt->nrn_fast_imem->nrn_sav_d;`
	`153`	`+#pragma acc parallel loop present(p, vec_d) if (_nt->compute_gpu) async(_nt->stream_id)`
`147`	`154`	`for (int i = i1; i < i3; ++i) {`
`148`	`155`	`p[i] += vec_d[i];`
`149`	`156`	`}`