This repository was archived by the owner on Mar 20, 2023. It is now read-only.
Merged
Changes from all commits (25 commits)
57f12be
Simplify unified memory logic.
olupton Nov 23, 2021
30a69d5
Basic OpenACC -> OpenMP migration.
olupton Nov 18, 2021
9f14e63
nrn_{acc,omp}_pragma -> nrn_pragma_{acc,omp}.
olupton Nov 29, 2021
4d67bf0
Add --gpu to test.
olupton Nov 29, 2021
5d9c7e7
Default (BB5-valid) CORENRN_EXTERNAL_BENCHMARK_DATA.
olupton Nov 29, 2021
385a34b
Remove cuda_add_library.
olupton Nov 29, 2021
8c6210e
Define nrn_pragma_{acc,omp} in header.
olupton Nov 29, 2021
424b14f
Update NMODL with codegen fixes.
olupton Nov 30, 2021
4c86be3
Migrate more pragmas.
olupton Nov 30, 2021
4ffb0bc
Move input data.
olupton Dec 1, 2021
388fe26
Migrate more.
olupton Dec 1, 2021
e36f6ad
Migrate more directives.
olupton Dec 1, 2021
8cad9eb
Remove more OpenACC from the main simulation section.
olupton Dec 1, 2021
5c2a98c
Use nrn_pragma_acc for cell_permute=0 on GPU.
olupton Dec 6, 2021
fb1139e
OpenMP target offload for fast_imem.
olupton Dec 6, 2021
ee3588d
Don't print number of GPUs when quiet.
olupton Dec 6, 2021
5580951
OpenMP target offload for partrans.
olupton Dec 6, 2021
a62b2ad
OpenMP target offload for finitialize.
olupton Dec 6, 2021
6d06c02
Fixup for nrnthread_v_transfer + OpenMP.
olupton Dec 6, 2021
e548dbf
Fix a weird clang-formatting issue in nrn_acc_manager.cpp
pramodk Dec 6, 2021
184cdd7
Set OMP_NUM_THREADS=1 for lfp_test.
olupton Dec 6, 2021
587ff37
Address review comments.
olupton Dec 6, 2021
c951423
Drop nowait and depend clauses.
olupton Dec 6, 2021
6cb1b26
Rename CORENRN_PREFER_OPENMP_OFFLOAD.
olupton Dec 7, 2021
aefa0d4
Update NMODL with OpenMP codegen.
olupton Dec 7, 2021
3 changes: 2 additions & 1 deletion .clang-format.changes
@@ -1,2 +1,3 @@
SortIncludes: false
IndentCaseLabels: true
SortIncludes: false
StatementMacros: [nrn_pragma_acc, nrn_pragma_omp]
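The two `StatementMacros` named here are the heart of this PR: function-like macros that expand to `_Pragma` operators, so a single annotated source line can serve either backend, and clang-format needs to be told to treat them as statements. A minimal sketch of how such macros can be defined (commit 8c6210e moves the real definitions into a header; the exact guard conditions below are an assumption, not the verbatim header):

```cpp
// Sketch only: assumed form of the dual-backend pragma macros.
#define nrn_pragma_stringify(x) #x
#if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
    defined(_OPENMP)
// OpenMP target offload preferred: OpenACC annotations compile away.
#define nrn_pragma_acc(x)
#define nrn_pragma_omp(x) _Pragma(nrn_pragma_stringify(omp x))
#elif defined(CORENEURON_ENABLE_GPU) && defined(_OPENACC)
// OpenACC backend: OpenMP annotations compile away.
#define nrn_pragma_acc(x) _Pragma(nrn_pragma_stringify(acc x))
#define nrn_pragma_omp(x)
#else
// CPU-only build: both expand to nothing.
#define nrn_pragma_acc(x)
#define nrn_pragma_omp(x)
#endif
```

With this shape, a line like `nrn_pragma_acc(update device(x))` either becomes `#pragma acc update device(x)` or disappears entirely, which is why the two spellings can sit side by side throughout the diffs below.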
5 changes: 0 additions & 5 deletions .cmake-format.changes.yaml
@@ -1,9 +1,4 @@
additional_commands:
cuda_add_library:
pargs: '*'
flags: ["STATIC", "SHARED", "MODULE", "EXCLUDE_FROM_ALL"]
kwargs:
OPTIONS: '*'
cpp_cc_build_time_copy:
flags: ['NO_TARGET']
kwargs:
8 changes: 7 additions & 1 deletion CMake/OpenAccHelper.cmake
@@ -55,14 +55,20 @@ if(CORENRN_ENABLE_GPU)
# due to e.g. __CUDACC__ being defined. See https://github.com/BlueBrain/CoreNeuron/issues/607 for
# more information about this. -gpu=cudaX.Y ensures that OpenACC code is compiled with the same
# CUDA version as is used for the explicit CUDA code.
set(NVHPC_ACC_COMP_FLAGS "-acc -gpu=cuda${CORENRN_CUDA_VERSION_SHORT}")
set(NVHPC_ACC_COMP_FLAGS "-acc -Minfo=accel -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo")
set(NVHPC_ACC_LINK_FLAGS "-acc -cuda")
# Make sure that OpenACC code is generated for the same compute capabilities as the explicit CUDA
# code. Otherwise there may be confusing linker errors. We cannot rely on nvcc and nvc++ using the
# same default compute capabilities as each other, particularly on GPU-less build machines.
foreach(compute_capability ${CMAKE_CUDA_ARCHITECTURES})
string(APPEND NVHPC_ACC_COMP_FLAGS ",cc${compute_capability}")
endforeach()
if(CORENRN_ENABLE_OPENMP AND CORENRN_ENABLE_OPENMP_OFFLOAD)
# Enable OpenMP target offload to GPU and if both OpenACC and OpenMP directives are available
# for a region then prefer OpenMP.
add_compile_definitions(CORENEURON_PREFER_OPENMP_OFFLOAD)
string(APPEND NVHPC_ACC_COMP_FLAGS " -mp=gpu -Minfo=mp")
endif()
# avoid PGI adding standard compliant "-A" flags
set(CMAKE_CXX14_STANDARD_COMPILE_OPTION --c++14)
string(APPEND CMAKE_EXE_LINKER_FLAGS " ${NVHPC_ACC_LINK_FLAGS}")
5 changes: 3 additions & 2 deletions CMakeLists.txt
@@ -85,6 +85,7 @@ add_subdirectory(${CORENEURON_PROJECT_SOURCE_DIR}/external/CLI11)
# Build options
# =============================================================================
option(CORENRN_ENABLE_OPENMP "Build the CORE NEURON with OpenMP implementation" ON)
option(CORENRN_ENABLE_OPENMP_OFFLOAD "Prefer OpenMP target offload to OpenACC" ON)
option(CORENRN_ENABLE_TIMEOUT "Enable nrn_timeout implementation" ON)
option(CORENRN_ENABLE_REPORTING "Enable use of ReportingLib for soma reports" OFF)
option(CORENRN_ENABLE_MPI "Enable MPI-based execution" ON)
@@ -104,7 +105,7 @@ option(CORENRN_ENABLE_LEGACY_UNITS "Enable legacy FARADAY, R, etc" OFF)
option(CORENRN_ENABLE_PRCELLSTATE "Enable NRN_PRCELLSTATE debug feature" OFF)

set(CORENRN_EXTERNAL_BENCHMARK_DATA
""
"/gpfs/bbp.cscs.ch/project/proj12/nersc-gpu-hackathon-dec-2021"
CACHE PATH "Path to input data files and mechanisms for benchmarks")
set(CORENRN_NMODL_DIR
""
@@ -138,7 +139,7 @@ if(CORENRN_ENABLE_GPU)

# Set some sensible default CUDA architectures.
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
set(CMAKE_CUDA_ARCHITECTURES 60 70 80)
set(CMAKE_CUDA_ARCHITECTURES 70 80)
message(STATUS "Setting default CUDA architectures to ${CMAKE_CUDA_ARCHITECTURES}")
endif()

2 changes: 1 addition & 1 deletion coreneuron/CMakeLists.txt
@@ -325,7 +325,7 @@ if(NOT ${CORENRN_EXTERNAL_BENCHMARK_DATA} STREQUAL "")
benchmark_command
"'${CMAKE_BINARY_DIR}/benchmark/${CMAKE_SYSTEM_PROCESSOR}/special-core'"
" --datpath '${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark-all-440-cells-2-ranks'"
" --tstop 1 &&"
" --tstop 1 --gpu &&"
"diff out.dat '${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark-all-440-cells-2-ranks.gpu.spikes'"
)
add_test(NAME benchmark COMMAND sh -c "${benchmark_command}")
6 changes: 2 additions & 4 deletions coreneuron/apps/main1.cpp
@@ -558,10 +558,8 @@ extern "C" int run_solve_core(int argc, char** argv) {
#endif
bool compute_gpu = corenrn_param.gpu;

// clang-format off

#pragma acc update device(celsius, secondorder, pi) if (compute_gpu)
// clang-format on
nrn_pragma_acc(update device(celsius, secondorder, pi) if(compute_gpu))
nrn_pragma_omp(target update to(celsius, secondorder, pi) if(compute_gpu))
{
double v = corenrn_param.voltage;
double dt = corenrn_param.dt;
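The hunk above collapses a clang-format-guarded `#pragma acc update` into one macro pair that pushes the global scalars `celsius`, `secondorder` and `pi` to the device under either backend. A self-contained sketch of the same pattern for a single global (macro definitions condensed from the earlier sketch; the variable values and `compute_gpu` guard are illustrative):

```cpp
#include <cstdio>

#define nrn_pragma_stringify(x) #x
#ifdef _OPENMP
#define nrn_pragma_acc(x)
#define nrn_pragma_omp(x) _Pragma(nrn_pragma_stringify(omp x))
#else
#define nrn_pragma_acc(x) _Pragma(nrn_pragma_stringify(acc x))
#define nrn_pragma_omp(x)
#endif

double celsius = 34.0;
// Give the global a device copy under either backend.
nrn_pragma_acc(declare create(celsius))
nrn_pragma_omp(declare target(celsius))

int main() {
    const bool compute_gpu = true;
    celsius = 6.3;
    // Push the new host value to the device copy; a no-op when compute_gpu is false.
    nrn_pragma_acc(update device(celsius) if(compute_gpu))
    nrn_pragma_omp(target update to(celsius) if(compute_gpu))
    std::printf("host celsius = %g\n", celsius);
}
```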
100 changes: 31 additions & 69 deletions coreneuron/gpu/nrn_acc_manager.cpp
@@ -27,6 +27,9 @@
#ifdef _OPENACC
#include <openacc.h>
#endif
#ifdef CORENEURON_PREFER_OPENMP_OFFLOAD
#include <omp.h>
#endif

#ifdef CRAYPAT
#include <pat_api.h>
@@ -605,25 +608,36 @@ void update_net_receive_buffer(NrnThread* nt) {
// instance order to avoid race. setup _displ and _nrb_index
net_receive_buffer_order(nrb);

#ifdef _OPENACC
if (nt->compute_gpu) {
Instrumentor::phase p_net_receive_buffer_order("net-receive-buf-cpu2gpu");
// note that we don't update nrb itself, otherwise we lose pointers

// clang-format off

/* update scalar elements */
acc_update_device(&nrb->_cnt, sizeof(int));
acc_update_device(&nrb->_displ_cnt, sizeof(int));

acc_update_device(nrb->_pnt_index, sizeof(int) * nrb->_cnt);
acc_update_device(nrb->_weight_index, sizeof(int) * nrb->_cnt);
acc_update_device(nrb->_nrb_t, sizeof(double) * nrb->_cnt);
acc_update_device(nrb->_nrb_flag, sizeof(double) * nrb->_cnt);
acc_update_device(nrb->_displ, sizeof(int) * (nrb->_displ_cnt + 1));
acc_update_device(nrb->_nrb_index, sizeof(int) * nrb->_cnt);
nrn_pragma_acc(update device(nrb->_cnt,
nrb->_displ_cnt,
nrb->_pnt_index[:nrb->_cnt],
nrb->_weight_index[:nrb->_cnt],
nrb->_nrb_t[:nrb->_cnt],
nrb->_nrb_flag[:nrb->_cnt],
nrb->_displ[:nrb->_displ_cnt + 1],
nrb->_nrb_index[:nrb->_cnt])
async(nt->stream_id))
nrn_pragma_omp(target update to(nrb->_cnt,
nrb->_displ_cnt,
nrb->_pnt_index[:nrb->_cnt],
nrb->_weight_index[:nrb->_cnt],
nrb->_nrb_t[:nrb->_cnt],
nrb->_nrb_flag[:nrb->_cnt],
nrb->_displ[:nrb->_displ_cnt + 1],
nrb->_nrb_index[:nrb->_cnt]))
// clang-format on
}
#endif
}
}
nrn_pragma_acc(wait(nt->stream_id))
nrn_pragma_omp(taskwait)
}

void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb) {
@@ -894,67 +908,12 @@ void update_weights_from_gpu(NrnThread* threads, int nthreads) {
size_t n_weight = nt->n_weight;
if (nt->compute_gpu && n_weight > 0) {
double* weights = nt->weights;
// clang-format off

#pragma acc update host(weights [0:n_weight])
// clang-format on
nrn_pragma_acc(update host(weights [0:n_weight]))
nrn_pragma_omp(target update from(weights [0:n_weight]))
}
}
}

void update_matrix_from_gpu(NrnThread* _nt) {
#ifdef _OPENACC
if (_nt->compute_gpu && (_nt->end > 0)) {
/* before copying, make sure all computations in the stream are completed */

// clang-format off

#pragma acc wait(_nt->stream_id)

/* openacc routine doesn't allow async, use pragma */
// acc_update_self(_nt->_actual_rhs, 2*_nt->end*sizeof(double));

/* RHS and D are contiguous, copy them in one go!
* NOTE: in pragma you have to give actual pointer like below and not nt->rhs...
*/
double* rhs = _nt->_actual_rhs;
int ne = nrn_soa_padded_size(_nt->end, 0);

#pragma acc update host(rhs[0 : 2 * ne]) async(_nt->stream_id)
#pragma acc wait(_nt->stream_id)
// clang-format on
}
#else
(void) _nt;
#endif
}

void update_matrix_to_gpu(NrnThread* _nt) {
#ifdef _OPENACC
if (_nt->compute_gpu && (_nt->end > 0)) {
/* before copying, make sure all computations in the stream are completed */

// clang-format off

#pragma acc wait(_nt->stream_id)

/* in discussions with Michael we found that RHS is also needed on
* gpu because nrn_cap_jacob uses rhs which is being updated on GPU
*/
double* v = _nt->_actual_v;
double* rhs = _nt->_actual_rhs;
int ne = nrn_soa_padded_size(_nt->end, 0);

#pragma acc update device(v[0 : ne]) async(_nt->stream_id)
#pragma acc update device(rhs[0 : ne]) async(_nt->stream_id)
#pragma acc wait(_nt->stream_id)
// clang-format on
}
#else
(void) _nt;
#endif
}

/** Cleanup device memory that is being tracked by the OpenACC runtime.
*
* This function painstakingly calls `acc_delete` in reverse order on all
@@ -1343,8 +1302,11 @@ void init_gpu() {

int device_num = local_rank % num_devices_per_node;
acc_set_device_num(device_num, device_type);
#ifdef CORENEURON_PREFER_OPENMP_OFFLOAD
omp_set_default_device(device_num);
#endif

if (nrnmpi_myid == 0) {
if (nrnmpi_myid == 0 && !corenrn_param.is_quiet()) {
std::cout << " Info : " << num_devices_per_node << " GPUs shared by " << local_size
<< " ranks per node\n";
}
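In the `init_gpu` hunk just above, each rank picks a GPU round-robin by its node-local rank, and the new `CORENEURON_PREFER_OPENMP_OFFLOAD` branch points the OpenMP runtime at the same device so the two runtimes never disagree. The mapping in isolation (a sketch; `bind_rank_to_gpu` is a hypothetical helper, not a function from this PR):

```cpp
#include <openacc.h>
#ifdef CORENEURON_PREFER_OPENMP_OFFLOAD
#include <omp.h>
#endif

// Bind the calling rank to one of the node's GPUs, round-robin by local rank.
void bind_rank_to_gpu(int local_rank) {
    acc_device_t device_type = acc_get_device_type();
    int num_devices_per_node = acc_get_num_devices(device_type);
    int device_num = local_rank % num_devices_per_node;
    acc_set_device_num(device_num, device_type);
#ifdef CORENEURON_PREFER_OPENMP_OFFLOAD
    // Keep OpenMP target regions on the same device OpenACC just selected.
    omp_set_default_device(device_num);
#endif
}
```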
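A note on the synchronization idiom visible in `update_net_receive_buffer` above: the OpenACC updates are queued with `async(nt->stream_id)` and fenced by `wait`, while the OpenMP spelling issues a synchronous `target update` and ends with `taskwait`, which only matters when deferred target tasks exist (commit c951423 dropped the `nowait` and `depend` clauses that created them). A sketch of the pairing, assuming `buf` is already mapped on the device:

```cpp
#define nrn_pragma_stringify(x) #x
#ifdef _OPENMP
#define nrn_pragma_acc(x)
#define nrn_pragma_omp(x) _Pragma(nrn_pragma_stringify(omp x))
#else
#define nrn_pragma_acc(x) _Pragma(nrn_pragma_stringify(acc x))
#define nrn_pragma_omp(x)
#endif

// Push a host buffer to the device, then synchronize, under either backend.
void push_and_sync(double* buf, int n, int stream_id) {
    // OpenACC: enqueue asynchronously on the queue identified by stream_id.
    nrn_pragma_acc(update device(buf[0:n]) async(stream_id))
    // OpenMP: a plain target update is already synchronous.
    nrn_pragma_omp(target update to(buf[0:n]))
    // OpenACC: block until that queue drains.
    nrn_pragma_acc(wait(stream_id))
    // OpenMP: fence any deferred target tasks; a no-op here, mirroring the PR.
    nrn_pragma_omp(taskwait)
}
```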
2 changes: 0 additions & 2 deletions coreneuron/gpu/nrn_acc_manager.hpp
@@ -23,8 +23,6 @@ void update_nrnthreads_on_device(NrnThread* threads, int nthreads);
void modify_data_on_device(NrnThread* threads, int nthreads);
void dump_nt_to_file(char* filename, NrnThread* threads, int nthreads);

void update_matrix_from_gpu(NrnThread* _nt);
void update_matrix_to_gpu(NrnThread* _nt);
void update_net_receive_buffer(NrnThread* _nt);
void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml);
void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb);
3 changes: 0 additions & 3 deletions coreneuron/io/lfp.cpp
@@ -7,9 +7,6 @@


namespace coreneuron {
// extern variables require acc declare
#pragma acc declare create(pi)

namespace lfputils {

double line_source_lfp_factor(const Point3D& e_pos,
40 changes: 13 additions & 27 deletions coreneuron/mechanism/capac.cpp
@@ -12,25 +12,9 @@
#include "coreneuron/coreneuron.hpp"
#include "coreneuron/permute/data_layout.hpp"

// clang-format off

#if defined(_OPENACC)
#define _PRAGMA_FOR_INIT_ACC_LOOP_ \
_Pragma("acc parallel loop present(vdata[0:_cntml_padded*nparm]) if(_nt->compute_gpu)")
#define _PRAGMA_FOR_CUR_ACC_LOOP_ \
_Pragma( \
"acc parallel loop present(vdata[0:_cntml_padded*nparm], ni[0:_cntml_actual], _vec_rhs[0:_nt->end]) if(_nt->compute_gpu) async(stream_id)")
#define _PRAGMA_FOR_JACOB_ACC_LOOP_ \
_Pragma( \
"acc parallel loop present(vdata[0:_cntml_padded*nparm], ni[0:_cntml_actual], _vec_d[0:_nt->end]) if(_nt->compute_gpu) async(stream_id)")
#else
#define _PRAGMA_FOR_INIT_ACC_LOOP_ _Pragma("")
#define _PRAGMA_FOR_CUR_ACC_LOOP_ _Pragma("")
#define _PRAGMA_FOR_JACOB_ACC_LOOP_ _Pragma("")
#endif

// clang-format on

#define _PRAGMA_FOR_INIT_ACC_LOOP_ \
nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm]) if (_nt->compute_gpu)) \
nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
#define _STRIDE _cntml_padded + _iml

namespace coreneuron {
@@ -78,15 +62,16 @@ void nrn_jacob_capacitance(NrnThread* _nt, Memb_list* ml, int /* type */) {
(void) _cntml_padded; /* unused when layout=1*/

double* _vec_d = _nt->_actual_d;
#if defined(_OPENACC)
int stream_id = _nt->stream_id;
#endif

{ /*if (use_cachevec) {*/
int* ni = ml->nodeindices;

vdata = ml->data;
_PRAGMA_FOR_JACOB_ACC_LOOP_
nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm],
ni [0:_cntml_actual],
_vec_d [0:_nt->end]) if (_nt->compute_gpu)
async(_nt->stream_id))
nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
for (_iml = 0; _iml < _cntml_actual; _iml++) {
_vec_d[ni[_iml]] += cfac * cm;
}
@@ -126,12 +111,13 @@ void nrn_cur_capacitance(NrnThread* _nt, Memb_list* ml, int /* type */) {
/* no need to distinguish secondorder */
int* ni = ml->nodeindices;
double* _vec_rhs = _nt->_actual_rhs;
#if defined(_OPENACC)
int stream_id = _nt->stream_id;
#endif

vdata = ml->data;
_PRAGMA_FOR_CUR_ACC_LOOP_
nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm],
ni [0:_cntml_actual],
_vec_rhs [0:_nt->end]) if (_nt->compute_gpu)
async(_nt->stream_id))
nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
for (int _iml = 0; _iml < _cntml_actual; _iml++) {
i_cap = cfac * cm * _vec_rhs[ni[_iml]];
}
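The capac.cpp changes show the loop-level half of the migration: each OpenACC `parallel loop present(...)` gains an OpenMP `target teams distribute parallel for simd` twin, with the `present`/`async` bookkeeping kept on the OpenACC side only. A runnable toy version of the pattern (condensed macros as before; `copy`/`map` clauses stand in for `present` since nothing is pre-mapped here, and the array and coefficients are illustrative):

```cpp
#include <cstdio>

#define nrn_pragma_stringify(x) #x
#ifdef _OPENMP
#define nrn_pragma_acc(x)
#define nrn_pragma_omp(x) _Pragma(nrn_pragma_stringify(omp x))
#else
#define nrn_pragma_acc(x) _Pragma(nrn_pragma_stringify(acc x))
#define nrn_pragma_omp(x)
#endif

int main() {
    constexpr int n = 1024;
    static double vec_d[n] = {};
    const bool compute_gpu = true;
    const double cfac = 0.001, cm = 1.0;
    // One source loop, two possible offload lowerings.
    nrn_pragma_acc(parallel loop copy(vec_d[0:n]) if(compute_gpu))
    nrn_pragma_omp(target teams distribute parallel for simd map(tofrom: vec_d[0:n]) if(compute_gpu))
    for (int i = 0; i < n; ++i) {
        vec_d[i] += cfac * cm;
    }
    std::printf("vec_d[0] = %g\n", vec_d[0]);
}
```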