diff --git a/.clang-format.changes b/.clang-format.changes
index 01b58702d..4c2b11b59 100644
--- a/.clang-format.changes
+++ b/.clang-format.changes
@@ -1,2 +1,3 @@
-SortIncludes: false
 IndentCaseLabels: true
+SortIncludes: false
+StatementMacros: [nrn_pragma_acc, nrn_pragma_omp]
diff --git a/.cmake-format.changes.yaml b/.cmake-format.changes.yaml
index 19ea9c084..2f20247f7 100644
--- a/.cmake-format.changes.yaml
+++ b/.cmake-format.changes.yaml
@@ -1,9 +1,4 @@
 additional_commands:
-  cuda_add_library:
-    pargs: '*'
-    flags: ["STATIC", "SHARED", "MODULE", "EXCLUDE_FROM_ALL"]
-    kwargs:
-      OPTIONS: '*'
   cpp_cc_build_time_copy:
     flags: ['NO_TARGET']
     kwargs:
diff --git a/CMake/OpenAccHelper.cmake b/CMake/OpenAccHelper.cmake
index 7767a3672..c7f91a7c9 100644
--- a/CMake/OpenAccHelper.cmake
+++ b/CMake/OpenAccHelper.cmake
@@ -55,7 +55,7 @@ if(CORENRN_ENABLE_GPU)
   # due to e.g. __CUDACC__ being defined. See https://github.com/BlueBrain/CoreNeuron/issues/607 for
   # more information about this. -gpu=cudaX.Y ensures that OpenACC code is compiled with the same
   # CUDA version as is used for the explicit CUDA code.
-  set(NVHPC_ACC_COMP_FLAGS "-acc -gpu=cuda${CORENRN_CUDA_VERSION_SHORT}")
+  set(NVHPC_ACC_COMP_FLAGS "-acc -Minfo=accel -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo")
   set(NVHPC_ACC_LINK_FLAGS "-acc -cuda")
   # Make sure that OpenACC code is generated for the same compute capabilities as the explicit CUDA
   # code. Otherwise there may be confusing linker errors. We cannot rely on nvcc and nvc++ using the
@@ -63,6 +63,12 @@ if(CORENRN_ENABLE_GPU)
   foreach(compute_capability ${CMAKE_CUDA_ARCHITECTURES})
     string(APPEND NVHPC_ACC_COMP_FLAGS ",cc${compute_capability}")
   endforeach()
+  if(CORENRN_ENABLE_OPENMP AND CORENRN_ENABLE_OPENMP_OFFLOAD)
+    # Enable OpenMP target offload to GPU and if both OpenACC and OpenMP directives are available
+    # for a region then prefer OpenMP.
+    add_compile_definitions(CORENEURON_PREFER_OPENMP_OFFLOAD)
+    string(APPEND NVHPC_ACC_COMP_FLAGS " -mp=gpu -Minfo=mp")
+  endif()
   # avoid PGI adding standard compliant "-A" flags
   set(CMAKE_CXX14_STANDARD_COMPILE_OPTION --c++14)
   string(APPEND CMAKE_EXE_LINKER_FLAGS " ${NVHPC_ACC_LINK_FLAGS}")
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4e53a5de6..963703975 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -85,6 +85,7 @@ add_subdirectory(${CORENEURON_PROJECT_SOURCE_DIR}/external/CLI11)
 # Build options
 # =============================================================================
 option(CORENRN_ENABLE_OPENMP "Build the CORE NEURON with OpenMP implementation" ON)
+option(CORENRN_ENABLE_OPENMP_OFFLOAD "Prefer OpenMP target offload to OpenACC" ON)
 option(CORENRN_ENABLE_TIMEOUT "Enable nrn_timeout implementation" ON)
 option(CORENRN_ENABLE_REPORTING "Enable use of ReportingLib for soma reports" OFF)
 option(CORENRN_ENABLE_MPI "Enable MPI-based execution" ON)
@@ -104,7 +105,7 @@
 option(CORENRN_ENABLE_LEGACY_UNITS "Enable legacy FARADAY, R, etc" OFF)
 option(CORENRN_ENABLE_PRCELLSTATE "Enable NRN_PRCELLSTATE debug feature" OFF)
 set(CORENRN_EXTERNAL_BENCHMARK_DATA
-    ""
+    "/gpfs/bbp.cscs.ch/project/proj12/nersc-gpu-hackathon-dec-2021"
     CACHE PATH "Path to input data files and mechanisms for benchmarks")
 set(CORENRN_NMODL_DIR
     ""
@@ -138,7 +139,7 @@ if(CORENRN_ENABLE_GPU)
   # Set some sensible default CUDA architectures.
   if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-    set(CMAKE_CUDA_ARCHITECTURES 60 70 80)
+    set(CMAKE_CUDA_ARCHITECTURES 70 80)
     message(STATUS "Setting default CUDA architectures to ${CMAKE_CUDA_ARCHITECTURES}")
   endif()
diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt
index 60bd2b370..2308ab99a 100644
--- a/coreneuron/CMakeLists.txt
+++ b/coreneuron/CMakeLists.txt
@@ -325,7 +325,7 @@ if(NOT ${CORENRN_EXTERNAL_BENCHMARK_DATA} STREQUAL "")
       benchmark_command
       "'${CMAKE_BINARY_DIR}/benchmark/${CMAKE_SYSTEM_PROCESSOR}/special-core'"
       " --datpath '${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark-all-440-cells-2-ranks'"
-      " --tstop 1 &&"
+      " --tstop 1 --gpu &&"
       "diff out.dat '${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark-all-440-cells-2-ranks.gpu.spikes'"
   )
   add_test(NAME benchmark COMMAND sh -c "${benchmark_command}")
diff --git a/coreneuron/apps/main1.cpp b/coreneuron/apps/main1.cpp
index 0fdaa509b..6a4d43bea 100644
--- a/coreneuron/apps/main1.cpp
+++ b/coreneuron/apps/main1.cpp
@@ -558,10 +558,8 @@ extern "C" int run_solve_core(int argc, char** argv) {
 #endif
         bool compute_gpu = corenrn_param.gpu;
 
-        // clang-format off
-
-        #pragma acc update device(celsius, secondorder, pi) if (compute_gpu)
-        // clang-format on
+        nrn_pragma_acc(update device(celsius, secondorder, pi) if(compute_gpu))
+        nrn_pragma_omp(target update to(celsius, secondorder, pi) if(compute_gpu))
         {
             double v = corenrn_param.voltage;
             double dt = corenrn_param.dt;
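
The two directives added to main1.cpp above illustrate the convention used for every data-movement pragma in this patch: OpenACC `update device(...)` pairs with OpenMP `target update to(...)`, and `update host(...)`/`update self(...)` pair with `target update from(...)`. Exactly one of the two macros expands in any given build (see coreneuron/utils/offload.hpp at the end of this diff). A minimal self-contained sketch of the idiom; the function and variable names here are illustrative, not from the patch:

    #include "coreneuron/utils/offload.hpp"

    // Refresh the device copy of a host array section; expands to an OpenACC
    // or OpenMP directive depending on the build, and to nothing on CPU-only
    // builds. `push_to_device`, `values` and `n` are hypothetical names.
    void push_to_device(double* values, int n, bool compute_gpu) {
        nrn_pragma_acc(update device(values[0:n]) if(compute_gpu))
        nrn_pragma_omp(target update to(values[0:n]) if(compute_gpu))
    }
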
diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
index ac98f5420..b249875dc 100644
--- a/coreneuron/gpu/nrn_acc_manager.cpp
+++ b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -27,6 +27,9 @@
 #ifdef _OPENACC
 #include <openacc.h>
 #endif
+#ifdef CORENEURON_PREFER_OPENMP_OFFLOAD
+#include <omp.h>
+#endif
 
 #ifdef CRAYPAT
 #include <pat_api.h>
@@ -605,25 +608,36 @@ void update_net_receive_buffer(NrnThread* nt) {
         // instance order to avoid race. setup _displ and _nrb_index
         net_receive_buffer_order(nrb);
 
-#ifdef _OPENACC
         if (nt->compute_gpu) {
             Instrumentor::phase p_net_receive_buffer_order("net-receive-buf-cpu2gpu");
 
             // note that dont update nrb otherwise we lose pointers
+            // clang-format off
+
             /* update scalar elements */
-            acc_update_device(&nrb->_cnt, sizeof(int));
-            acc_update_device(&nrb->_displ_cnt, sizeof(int));
-
-            acc_update_device(nrb->_pnt_index, sizeof(int) * nrb->_cnt);
-            acc_update_device(nrb->_weight_index, sizeof(int) * nrb->_cnt);
-            acc_update_device(nrb->_nrb_t, sizeof(double) * nrb->_cnt);
-            acc_update_device(nrb->_nrb_flag, sizeof(double) * nrb->_cnt);
-            acc_update_device(nrb->_displ, sizeof(int) * (nrb->_displ_cnt + 1));
-            acc_update_device(nrb->_nrb_index, sizeof(int) * nrb->_cnt);
+            nrn_pragma_acc(update device(nrb->_cnt,
+                                         nrb->_displ_cnt,
+                                         nrb->_pnt_index[:nrb->_cnt],
+                                         nrb->_weight_index[:nrb->_cnt],
+                                         nrb->_nrb_t[:nrb->_cnt],
+                                         nrb->_nrb_flag[:nrb->_cnt],
+                                         nrb->_displ[:nrb->_displ_cnt + 1],
+                                         nrb->_nrb_index[:nrb->_cnt])
+                               async(nt->stream_id))
+            nrn_pragma_omp(target update to(nrb->_cnt,
+                                            nrb->_displ_cnt,
+                                            nrb->_pnt_index[:nrb->_cnt],
+                                            nrb->_weight_index[:nrb->_cnt],
+                                            nrb->_nrb_t[:nrb->_cnt],
+                                            nrb->_nrb_flag[:nrb->_cnt],
+                                            nrb->_displ[:nrb->_displ_cnt + 1],
+                                            nrb->_nrb_index[:nrb->_cnt]))
+            // clang-format on
         }
-#endif
         }
     }
+    nrn_pragma_acc(wait(nt->stream_id))
+    nrn_pragma_omp(taskwait)
 }
 
 void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb) {
@@ -894,67 +908,12 @@ void update_weights_from_gpu(NrnThread* threads, int nthreads) {
         size_t n_weight = nt->n_weight;
         if (nt->compute_gpu && n_weight > 0) {
             double* weights = nt->weights;
-            // clang-format off
-
-            #pragma acc update host(weights [0:n_weight])
-            // clang-format on
+            nrn_pragma_acc(update host(weights [0:n_weight]))
+            nrn_pragma_omp(target update from(weights [0:n_weight]))
         }
     }
 }
 
-void update_matrix_from_gpu(NrnThread* _nt) {
-#ifdef _OPENACC
-    if (_nt->compute_gpu && (_nt->end > 0)) {
-        /* before copying, make sure all computations in the stream are completed */
-
-        // clang-format off
-
-        #pragma acc wait(_nt->stream_id)
-
-        /* openacc routine doesn't allow asyn, use pragma */
-        // acc_update_self(_nt->_actual_rhs, 2*_nt->end*sizeof(double));
-
-        /* RHS and D are contigious, copy them in one go!
-         * NOTE: in pragma you have to give actual pointer like below and not nt->rhs...
-         */
-        double* rhs = _nt->_actual_rhs;
-        int ne = nrn_soa_padded_size(_nt->end, 0);
-
-        #pragma acc update host(rhs[0 : 2 * ne]) async(_nt->stream_id)
-        #pragma acc wait(_nt->stream_id)
-        // clang-format on
-    }
-#else
-    (void) _nt;
-#endif
-}
-
-void update_matrix_to_gpu(NrnThread* _nt) {
-#ifdef _OPENACC
-    if (_nt->compute_gpu && (_nt->end > 0)) {
-        /* before copying, make sure all computations in the stream are completed */
-
-        // clang-format off
-
-        #pragma acc wait(_nt->stream_id)
-
-        /* while discussion with Michael we found that RHS is also needed on
-         * gpu because nrn_cap_jacob uses rhs which is being updated on GPU
-         */
-        double* v = _nt->_actual_v;
-        double* rhs = _nt->_actual_rhs;
-        int ne = nrn_soa_padded_size(_nt->end, 0);
-
-        #pragma acc update device(v[0 : ne]) async(_nt->stream_id)
-        #pragma acc update device(rhs[0 : ne]) async(_nt->stream_id)
-        #pragma acc wait(_nt->stream_id)
-        // clang-format on
-    }
-#else
-    (void) _nt;
-#endif
-}
-
 /** Cleanup device memory that is being tracked by the OpenACC runtime.
  *
  * This function painstakingly calls `acc_delete` in reverse order on all
@@ -1343,8 +1302,11 @@ void init_gpu() {
     int device_num = local_rank % num_devices_per_node;
     acc_set_device_num(device_num, device_type);
+#ifdef CORENEURON_PREFER_OPENMP_OFFLOAD
+    omp_set_default_device(device_num);
+#endif
 
-    if (nrnmpi_myid == 0) {
+    if (nrnmpi_myid == 0 && !corenrn_param.is_quiet()) {
         std::cout << " Info : " << num_devices_per_node << " GPUs shared by " << local_size
                   << " ranks per node\n";
     }
diff --git a/coreneuron/gpu/nrn_acc_manager.hpp b/coreneuron/gpu/nrn_acc_manager.hpp
index 67e6a058c..354bdc208 100644
--- a/coreneuron/gpu/nrn_acc_manager.hpp
+++ b/coreneuron/gpu/nrn_acc_manager.hpp
@@ -23,8 +23,6 @@
 void update_nrnthreads_on_device(NrnThread* threads, int nthreads);
 void modify_data_on_device(NrnThread* threads, int nthreads);
 void dump_nt_to_file(char* filename, NrnThread* threads, int nthreads);
 
-void update_matrix_from_gpu(NrnThread* _nt);
-void update_matrix_to_gpu(NrnThread* _nt);
 void update_net_receive_buffer(NrnThread* _nt);
 void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml);
 void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb);
diff --git a/coreneuron/io/lfp.cpp b/coreneuron/io/lfp.cpp
index 646fbf5a0..2a001b85a 100644
--- a/coreneuron/io/lfp.cpp
+++ b/coreneuron/io/lfp.cpp
@@ -7,9 +7,6 @@
 
 namespace coreneuron {
 
-// extern variables require acc declare
-#pragma acc declare create(pi)
-
 namespace lfputils {
 
 double line_source_lfp_factor(const Point3D& e_pos,
diff --git a/coreneuron/mechanism/capac.cpp b/coreneuron/mechanism/capac.cpp
index ee62f660d..42c65cb18 100644
--- a/coreneuron/mechanism/capac.cpp
+++ b/coreneuron/mechanism/capac.cpp
@@ -12,25 +12,9 @@
 #include "coreneuron/coreneuron.hpp"
 #include "coreneuron/permute/data_layout.hpp"
 
-// clang-format off
-
-#if defined(_OPENACC)
-#define _PRAGMA_FOR_INIT_ACC_LOOP_ \
-    _Pragma("acc parallel loop present(vdata[0:_cntml_padded*nparm]) if(_nt->compute_gpu)")
-#define _PRAGMA_FOR_CUR_ACC_LOOP_ \
-    _Pragma( \
-        "acc parallel loop present(vdata[0:_cntml_padded*nparm], ni[0:_cntml_actual], _vec_rhs[0:_nt->end]) if(_nt->compute_gpu) async(stream_id)")
-#define _PRAGMA_FOR_JACOB_ACC_LOOP_ \
-    _Pragma( \
-        "acc parallel loop present(vdata[0:_cntml_padded*nparm], ni[0:_cntml_actual], _vec_d[0:_nt->end]) if(_nt->compute_gpu) async(stream_id)")
-#else
-#define _PRAGMA_FOR_INIT_ACC_LOOP_ _Pragma("")
-#define _PRAGMA_FOR_CUR_ACC_LOOP_ _Pragma("")
-#define _PRAGMA_FOR_JACOB_ACC_LOOP_ _Pragma("")
-#endif
-
-// clang-format on
-
+#define _PRAGMA_FOR_INIT_ACC_LOOP_                                                               \
+    nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm]) if (_nt->compute_gpu)) \
+    nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
 #define _STRIDE _cntml_padded + _iml
 
 namespace coreneuron {
@@ -78,15 +62,16 @@ void nrn_jacob_capacitance(NrnThread* _nt, Memb_list* ml, int /* type */) {
     (void) _cntml_padded; /* unused when layout=1*/
 
     double* _vec_d = _nt->_actual_d;
-#if defined(_OPENACC)
-    int stream_id = _nt->stream_id;
-#endif
 
     {  /*if (use_cachevec) {*/
         int* ni = ml->nodeindices;
 
         vdata = ml->data;
-        _PRAGMA_FOR_JACOB_ACC_LOOP_
+        nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm],
+                                             ni [0:_cntml_actual],
+                                             _vec_d [0:_nt->end]) if (_nt->compute_gpu)
+                           async(_nt->stream_id))
+        nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
         for (_iml = 0; _iml < _cntml_actual; _iml++) {
             _vec_d[ni[_iml]] += cfac * cm;
         }
@@ -126,12 +111,13 @@ void nrn_cur_capacitance(NrnThread* _nt, Memb_list* ml, int /* type */) {
         /* no need to distinguish secondorder */
         int* ni = ml->nodeindices;
         double* _vec_rhs = _nt->_actual_rhs;
-#if defined(_OPENACC)
-        int stream_id = _nt->stream_id;
-#endif
         vdata = ml->data;
-        _PRAGMA_FOR_CUR_ACC_LOOP_
+        nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm],
+                                             ni [0:_cntml_actual],
+                                             _vec_rhs [0:_nt->end]) if (_nt->compute_gpu)
+                           async(_nt->stream_id))
+        nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
         for (int _iml = 0; _iml < _cntml_actual; _iml++) {
             i_cap = cfac * cm * _vec_rhs[ni[_iml]];
         }
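
capac.cpp shows the compute-kernel half of the convention: each OpenACC `parallel loop` gains an OpenMP `target teams distribute parallel for simd` twin. Note the asymmetry: the OpenACC directives keep their explicit `present(...)` and `async(...)` clauses, while the OpenMP translations rely on data already being mapped and run synchronously. A sketch of the pattern with illustrative names, assuming the offload.hpp macros added later in this diff:

    #include "coreneuron/utils/offload.hpp"

    // Scale one array into another on the device when compute_gpu is set;
    // in a CPU-only build both macros vanish and this is a plain host loop.
    // `scale`, `x`, `y`, `a` and `n` are hypothetical names.
    void scale(const double* x, double* y, double a, int n, bool compute_gpu) {
        nrn_pragma_acc(parallel loop present(x[0:n], y[0:n]) if(compute_gpu))
        nrn_pragma_omp(target teams distribute parallel for simd if(compute_gpu))
        for (int i = 0; i < n; ++i) {
            y[i] = a * x[i];
        }
    }
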
diff --git a/coreneuron/mechanism/eion.cpp b/coreneuron/mechanism/eion.cpp
index 76adc9045..727f30ea6 100644
--- a/coreneuron/mechanism/eion.cpp
+++ b/coreneuron/mechanism/eion.cpp
@@ -19,26 +19,6 @@
 
 #define _STRIDE _cntml_padded + _iml
 
-// clang-format off
-
-#if defined(_OPENACC)
-#define _PRAGMA_FOR_INIT_ACC_LOOP_ \
-    _Pragma( \
-        "acc parallel loop present(pd[0:_cntml_padded*5], ppd[0:1], nrn_ion_global_map[0:nrn_ion_global_map_size][0:ion_global_map_member_size]) if(nt->compute_gpu)")
-#define _PRAGMA_FOR_CUR_ACC_LOOP_ \
-    _Pragma( \
-        "acc parallel loop present(pd[0:_cntml_padded*5], nrn_ion_global_map[0:nrn_ion_global_map_size][0:ion_global_map_member_size]) if(nt->compute_gpu) async(stream_id)")
-#define _PRAGMA_FOR_SEC_ORDER_CUR_ACC_LOOP_ \
-    _Pragma( \
-        "acc parallel loop present(pd[0:_cntml_padded*5], ni[0:_cntml_actual], _vec_rhs[0:_nt->end]) if(_nt->compute_gpu) async(stream_id)")
-#else
-#define _PRAGMA_FOR_INIT_ACC_LOOP_ _Pragma("")
-#define _PRAGMA_FOR_CUR_ACC_LOOP_ _Pragma("")
-#define _PRAGMA_FOR_SEC_ORDER_CUR_ACC_LOOP_ _Pragma("")
-#endif
-
-// clang-format on
-
 namespace coreneuron {
 
 // for each ion it refers to internal concentration, external concentration, and charge,
@@ -277,14 +257,16 @@ void nrn_cur_ion(NrnThread* nt, Memb_list* ml, int type) {
     double* pd;
     Datum* ppd;
     (void) nt; /* unused */
-#if defined(_OPENACC)
-    int stream_id = nt->stream_id;
-#endif
     /*printf("ion_cur %s\n", memb_func[type].sym->name);*/
     int _cntml_padded = ml->_nodecount_padded;
     pd = ml->data;
    ppd = ml->pdata;
-    _PRAGMA_FOR_CUR_ACC_LOOP_
+    nrn_pragma_acc(parallel loop present(
+        pd [0:_cntml_padded * 5],
+        nrn_ion_global_map
+        [0:nrn_ion_global_map_size] [0:ion_global_map_member_size]) if (nt->compute_gpu)
+                       async(nt->stream_id))
+    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
     for (int _iml = 0; _iml < _cntml_actual; ++_iml) {
         dcurdv = 0.;
         cur = 0.;
@@ -312,7 +294,16 @@ void nrn_init_ion(NrnThread* nt, Memb_list* ml, int type) {
     int _cntml_padded = ml->_nodecount_padded;
     pd = ml->data;
     ppd = ml->pdata;
-    _PRAGMA_FOR_INIT_ACC_LOOP_
+    // There was no async(...) clause in the initial OpenACC implementation, so
+    // no `nowait` clause has been added to the OpenMP implementation. TODO:
+    // verify if this can be made asynchronous or if there is a strong reason it
+    // needs to be like this.
+    nrn_pragma_acc(parallel loop present(
+        pd [0:_cntml_padded * 5],
+        ppd [0:1],
+        nrn_ion_global_map
+        [0:nrn_ion_global_map_size] [0:ion_global_map_member_size]) if (nt->compute_gpu))
+    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
     for (int _iml = 0; _iml < _cntml_actual; ++_iml) {
         if (iontype & 04) {
             conci = conci0;
@@ -332,9 +323,6 @@ void second_order_cur(NrnThread* _nt, int secondorder) {
     int _cntml_padded;
     double* pd;
     (void) _nt; /* unused */
-#if defined(_OPENACC)
-    int stream_id = _nt->stream_id;
-#endif
     double* _vec_rhs = _nt->_actual_rhs;
 
     if (secondorder == 2) {
@@ -345,7 +333,11 @@ void second_order_cur(NrnThread* _nt, int secondorder) {
             int* ni = ml->nodeindices;
             _cntml_padded = ml->_nodecount_padded;
             pd = ml->data;
-            _PRAGMA_FOR_SEC_ORDER_CUR_ACC_LOOP_
+            nrn_pragma_acc(parallel loop present(pd [0:_cntml_padded * 5],
+                                                 ni [0:_cntml_actual],
+                                                 _vec_rhs [0:_nt->end]) if (_nt->compute_gpu)
+                               async(_nt->stream_id))
+            nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
             for (int _iml = 0; _iml < _cntml_actual; ++_iml) {
                 cur += dcurdv * (_vec_rhs[ni[_iml]]);
             }
diff --git a/coreneuron/mechanism/register_mech.cpp b/coreneuron/mechanism/register_mech.cpp
index 3acdff1ea..a8bff7a50 100644
--- a/coreneuron/mechanism/register_mech.cpp
+++ b/coreneuron/mechanism/register_mech.cpp
@@ -20,10 +20,6 @@
 namespace coreneuron {
 int secondorder = 0;
 double t, dt, celsius, pi;
-// declare copyin required for correct initialization
-#pragma acc declare copyin(secondorder)
-#pragma acc declare copyin(celsius)
-#pragma acc declare copyin(pi)
 int rev_dt;
 
 using Pfrv = void (*)();
diff --git a/coreneuron/network/cvodestb.cpp b/coreneuron/network/cvodestb.cpp
index 6ed52dc34..31b2fec54 100644
--- a/coreneuron/network/cvodestb.cpp
+++ b/coreneuron/network/cvodestb.cpp
@@ -61,11 +61,9 @@ void init_net_events() {
         NrnThread* nt = nrn_threads + ith;
         double* weights = nt->weights;
         int n_weight = nt->n_weight;
-        if (n_weight) {
-            // clang-format off
-
-            #pragma acc update device(weights[0 : n_weight]) if (nt->compute_gpu)
-            // clang-format on
+        if (n_weight && nt->compute_gpu) {
+            nrn_pragma_acc(update device(weights[0:n_weight]))
+            nrn_pragma_omp(target update to(weights[0:n_weight]))
         }
     }
 #endif
diff --git a/coreneuron/network/netcvode.cpp b/coreneuron/network/netcvode.cpp
index 899bc1e14..ee2e5cb3e 100644
--- a/coreneuron/network/netcvode.cpp
+++ b/coreneuron/network/netcvode.cpp
@@ -531,28 +531,13 @@ void NetCvode::check_thresh(NrnThread* nt) {  // for default method
     PreSynHelper* presyns_helper = nt->presyns_helper;
     double* actual_v = nt->_actual_v;
 
-#if defined(_OPENACC)
-    int stream_id = nt->stream_id;
-#endif
-
     if (nt->ncell == 0)
         return;
 
-    //_net_send_buffer_cnt is no longer used in openacc kernel, remove this?
-    //#ifdef _OPENACC
-    //    if(nt->compute_gpu)
-    //        acc_update_device(&(nt->_net_send_buffer_cnt), sizeof(int));
-    //#endif
-
-    // on GPU...
-    // clang-format off
-
-    #pragma acc parallel loop present( \
-        nt[0:1], presyns_helper[0:nt->n_presyn], \
-        presyns[0:nt->n_presyn], actual_v[0:nt->end]) \
-        copy(net_send_buf_count) if (nt->compute_gpu) \
-        async(stream_id)
-    // clang-format on
+    nrn_pragma_acc(parallel loop present(
+        nt [0:1], presyns_helper [0:nt->n_presyn], presyns [0:nt->n_presyn], actual_v [0:nt->end])
+                       copy(net_send_buf_count) if (nt->compute_gpu) async(nt->stream_id))
+    nrn_pragma_omp(target teams distribute parallel for simd map(tofrom: net_send_buf_count) if(nt->compute_gpu))
     for (int i = 0; i < nt->ncell; ++i) {
         PreSyn* ps = presyns + i;
         PreSynHelper* psh = presyns_helper + i;
@@ -563,7 +548,7 @@ void NetCvode::check_thresh(NrnThread* nt) {  // for default method
         int* flag = &(psh->flag_);
 
         if (pscheck(v, threshold, flag)) {
-#ifndef _OPENACC
+#ifndef CORENEURON_ENABLE_GPU
             nt->_net_send_buffer_cnt = net_send_buf_count;
             if (nt->_net_send_buffer_cnt >= nt->_net_send_buffer_size) {
                 nt->_net_send_buffer_size *= 2;
@@ -572,31 +557,23 @@ void NetCvode::check_thresh(NrnThread* nt) {  // for default method
             }
 #endif
 
-            // clang-format off
-
-            #pragma acc atomic capture
-            // clang-format on
+            nrn_pragma_acc(atomic capture)
+            nrn_pragma_omp(atomic capture)
             idx = net_send_buf_count++;
             nt->_net_send_buffer[idx] = i;
         }
     }
-
-    // clang-format off
-
-    #pragma acc wait(stream_id)
-    // clang-format on
+    nrn_pragma_acc(wait(nt->stream_id))
     nt->_net_send_buffer_cnt = net_send_buf_count;
 
-    if (nt->_net_send_buffer_cnt) {
-#ifdef _OPENACC
+    if (nt->compute_gpu && nt->_net_send_buffer_cnt) {
+#ifdef CORENEURON_ENABLE_GPU
         int* nsbuffer = nt->_net_send_buffer;
 #endif
-        // clang-format off
-
-        #pragma acc update host(nsbuffer[0:nt->_net_send_buffer_cnt]) if (nt->compute_gpu) async(stream_id)
-        #pragma acc wait(stream_id)
-        // clang-format on
+        nrn_pragma_acc(update host(nsbuffer [0:nt->_net_send_buffer_cnt]) async(nt->stream_id))
+        nrn_pragma_acc(wait(nt->stream_id))
+        nrn_pragma_omp(target update from(nsbuffer [0:nt->_net_send_buffer_cnt]))
     }
 
     // on CPU...
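
check_thresh() above relies on atomic capture to let many device threads append to a shared buffer without racing: each thread atomically reserves a unique index while incrementing the shared counter. A sketch of that idiom in isolation, with illustrative names and assuming the offload.hpp macros:

    #include "coreneuron/utils/offload.hpp"

    // Reserve a slot in `buffer` and record `cell_id`; the old counter value
    // is captured into `idx` and the increment is atomic under both backends.
    // `append_fired`, `buffer`, `count` and `cell_id` are hypothetical names.
    void append_fired(int* buffer, int& count, int cell_id) {
        int idx;
        nrn_pragma_acc(atomic capture)
        nrn_pragma_omp(atomic capture)
        idx = count++;
        buffer[idx] = cell_id;
    }
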
diff --git a/coreneuron/network/partrans.cpp b/coreneuron/network/partrans.cpp
index e74d866ce..1bd822f54 100644
--- a/coreneuron/network/partrans.cpp
+++ b/coreneuron/network/partrans.cpp
@@ -41,40 +41,39 @@ void nrnmpi_v_transfer() {
     // gather the source values. can be done in parallel
     for (int tid = 0; tid < nrn_nthread; ++tid) {
         auto& ttd = transfer_thread_data_[tid];
-        auto& nt = nrn_threads[tid];
+        auto* nt = &nrn_threads[tid];
         int n = int(ttd.outsrc_indices.size());
         if (n == 0) {
             continue;
         }
-        double* src_data = nt._data;
+        double* src_data = nt->_data;
         int* src_indices = ttd.src_indices.data();
 
         // gather sources on gpu and copy to cpu, cpu scatters to outsrc_buf
         double* src_gather = ttd.src_gather.data();
         size_t n_src_gather = ttd.src_gather.size();
 
-        // clang-format off
-        #pragma acc parallel loop present( \
-            src_indices[0:n_src_gather], src_data[0:nt._ndata], \
-            src_gather[0 : n_src_gather]) /*copyout(src_gather[0:n_src_gather])*/ \
-            if (nt.compute_gpu) async(nt.stream_id)
+        nrn_pragma_acc(parallel loop present(src_indices [0:n_src_gather],
+                                             src_data [0:nt->_ndata],
+                                             src_gather [0:n_src_gather]) if (nt->compute_gpu)
+                           async(nt->stream_id))
+        nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
         for (int i = 0; i < n_src_gather; ++i) {
             src_gather[i] = src_data[src_indices[i]];
         }
-        // do not know why the copyout above did not work
-        // and the following update is needed
-        #pragma acc update host(src_gather[0 : n_src_gather]) \
-            if (nrn_threads[0].compute_gpu) \
-            async(nt.stream_id)
-        // clang-format on
+        nrn_pragma_acc(update host(src_gather [0:n_src_gather]) if (nt->compute_gpu)
+                           async(nt->stream_id))
+        nrn_pragma_omp(target update from(src_gather [0:n_src_gather]) if (nt->compute_gpu))
     }
 
     // copy gathered source values to outsrc_buf_
+    bool compute_gpu = false;
     for (int tid = 0; tid < nrn_nthread; ++tid) {
-        // clang-format off
-
-        #pragma acc wait(nrn_threads[tid].stream_id)
-        // clang-format on
+        if (nrn_threads[tid].compute_gpu) {
+            compute_gpu = true;
+            nrn_pragma_acc(wait(nrn_threads[tid].stream_id))
+            nrn_pragma_omp(taskwait)
+        }
         TransferThreadData& ttd = transfer_thread_data_[tid];
         size_t n_outsrc_indices = ttd.outsrc_indices.size();
         int* outsrc_indices = ttd.outsrc_indices.data();
@@ -102,12 +101,8 @@ void nrnmpi_v_transfer() {
     }
 
     // insrc_buf_ will get copied to targets via nrnthread_v_transfer
-    // clang-format off
-
-    #pragma acc update device( \
-        insrc_buf_[0:n_insrc_buf]) \
-        if (nrn_threads[0].compute_gpu)
-    // clang-format on
+    nrn_pragma_acc(update device(insrc_buf_ [0:n_insrc_buf]) if (compute_gpu))
+    nrn_pragma_omp(target update to(insrc_buf_ [0:n_insrc_buf]) if (compute_gpu))
 }
 
 void nrnthread_v_transfer(NrnThread* _nt) {
@@ -119,33 +114,32 @@ void nrnthread_v_transfer(NrnThread* _nt) {
     int* insrc_indices = ttd.insrc_indices.data();
     double* tar_data = _nt->_data;
     // last element in the displacement vector gives total length
+#if defined(_OPENACC) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD)
     int n_insrc_buf = insrcdspl_[nrnmpi_numprocs];
     int ndata = _nt->_ndata;
+#endif
 
-    // clang-format off
-
-    #pragma acc parallel loop present( \
-        insrc_indices[0:ntar], \
-        tar_data[0:ndata], \
-        insrc_buf_[0:n_insrc_buf]) \
-        if (_nt->compute_gpu) \
-        async(_nt->stream_id)
-    // clang-format on
+    nrn_pragma_acc(parallel loop present(insrc_indices [0:ntar],
+                                         tar_data [0:ndata],
+                                         insrc_buf_ [0:n_insrc_buf]) if (_nt->compute_gpu)
+                       async(_nt->stream_id))
+    nrn_pragma_omp(target teams distribute parallel for simd map(to: tar_indices[0:ntar]) if(_nt->compute_gpu))
     for (size_t i = 0; i < ntar; ++i) {
         tar_data[tar_indices[i]] = insrc_buf_[insrc_indices[i]];
     }
 }
 
+/// TODO: Corresponding exit data clause for OpenACC/OpenMP is missing and hence
+/// GPU buffers are not freed.
 void nrn_partrans::gap_update_indices() {
     // Ensure index vectors, src_gather, and insrc_buf_ are on the gpu.
     if (insrcdspl_) {
         int n_insrc_buf = insrcdspl_[nrnmpi_numprocs];
+        nrn_pragma_acc(enter data create(insrc_buf_ [0:n_insrc_buf]) if (corenrn_param.gpu))
+        // clang-format off
+        nrn_pragma_omp(target enter data map(alloc: insrc_buf_[0:n_insrc_buf])
+                                         if(corenrn_param.gpu))
+        // clang-format on
-        // clang-format off
-
-        #pragma acc enter data create( \
-            insrc_buf_[0:n_insrc_buf]) \
-            if (nrn_threads[0].compute_gpu)
-        // clang-format on
     }
     for (int tid = 0; tid < nrn_nthread; ++tid) {
         TransferThreadData& ttd = transfer_thread_data_[tid];
@@ -154,21 +148,25 @@ void nrn_partrans::gap_update_indices() {
         size_t n_src_gather = ttd.src_gather.size();
         NrnThread* nt = nrn_threads + tid;
         if (n_src_indices) {
+            int* src_indices = ttd.src_indices.data();
+            double* src_gather = ttd.src_gather.data();
+            nrn_pragma_acc(enter data copyin(src_indices[0:n_src_indices]) if(nt->compute_gpu))
+            nrn_pragma_acc(enter data create(src_gather[0:n_src_gather]) if(nt->compute_gpu))
             // clang-format off
-
-            int *src_indices = ttd.src_indices.data();
-            double *src_gather = ttd.src_gather.data();
-            #pragma acc enter data copyin(src_indices[0 : n_src_indices]) if (nt->compute_gpu)
-            #pragma acc enter data create(src_gather[0 : n_src_gather]) if (nt->compute_gpu)
+            nrn_pragma_omp(target enter data map(to: src_indices [0:n_src_indices])
+                                             map(alloc: src_gather[0:n_src_gather])
+                                             if(nt->compute_gpu))
             // clang-format on
         }
 
         if (ttd.insrc_indices.size()) {
-            // clang-format off
-
-            int *insrc_indices = ttd.insrc_indices.data();
+            int* insrc_indices = ttd.insrc_indices.data();
             size_t n_insrc_indices = ttd.insrc_indices.size();
-            #pragma acc enter data copyin(insrc_indices[0 : n_insrc_indices]) if (nt->compute_gpu)
+            nrn_pragma_acc(
+                enter data copyin(insrc_indices [0:n_insrc_indices]) if (nt->compute_gpu))
+            // clang-format off
+            nrn_pragma_omp(target enter data map(to: insrc_indices[0:n_insrc_indices])
+                                             if(nt->compute_gpu))
+            // clang-format on
         }
     }
diff --git a/coreneuron/nrnconf.h b/coreneuron/nrnconf.h
index 2c7fb8bb9..225d6d2ad 100644
--- a/coreneuron/nrnconf.h
+++ b/coreneuron/nrnconf.h
@@ -9,6 +9,8 @@
 #ifndef _H_NRNCONF_
 #define _H_NRNCONF_
 
+#include "coreneuron/utils/offload.hpp"
+
 #include <cmath>
 #include <cstdio>
 #include <cstdlib>
@@ -32,14 +34,16 @@
 using Symbol = char;
 
 #define VECTORIZE 1
 
 // extern variables require acc declare
+nrn_pragma_omp(declare target)
 extern double celsius;
-#pragma acc declare create(celsius)
+nrn_pragma_acc(declare create(celsius))
 
 extern double pi;
-#pragma acc declare create(pi)
+nrn_pragma_acc(declare create(pi))
 
 extern int secondorder;
-#pragma acc declare create(secondorder)
+nrn_pragma_acc(declare create(secondorder))
+nrn_pragma_omp(end declare target)
 
 extern double t, dt;
 extern int rev_dt;
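
The nrnconf.h hunk shows how file-scope globals get a device mirror under both backends: OpenACC wants a `declare create(...)` per variable, whereas OpenMP brackets the declarations in a `declare target` region. Either way, the `update device`/`target update to` directives in main1.cpp are what keep the mirrors in sync. A reduced sketch of the same pattern with a hypothetical variable name:

    #include "coreneuron/utils/offload.hpp"

    nrn_pragma_omp(declare target)
    extern double example_constant;  // `example_constant` is illustrative only
    nrn_pragma_acc(declare create(example_constant))
    nrn_pragma_omp(end declare target)
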
diff --git a/coreneuron/permute/cellorder.cpp b/coreneuron/permute/cellorder.cpp
index 2b6167f57..fd784fe38 100644
--- a/coreneuron/permute/cellorder.cpp
+++ b/coreneuron/permute/cellorder.cpp
@@ -6,8 +6,6 @@
 # =============================================================================
 */
 
-#include <set>
-
 #include "coreneuron/nrnconf.h"
 #include "coreneuron/sim/multicore.hpp"
 #include "coreneuron/utils/nrn_assert.h"
@@ -15,6 +13,7 @@
 #include "coreneuron/network/tnode.hpp"
 #include "coreneuron/utils/lpt.hpp"
 #include "coreneuron/utils/memory.h"
+#include "coreneuron/utils/offload.hpp"
 #include "coreneuron/apps/corenrn_parameters.hpp"
 
 #include "coreneuron/permute/node_permute.h"  // for print_quality
@@ -22,6 +21,9 @@
 #ifdef _OPENACC
 #include <openacc.h>
 #endif
+
+#include <set>
+
 namespace coreneuron {
 int interleave_permute_type;
 InterleaveInfo* interleave_info;  // nrn_nthread array
@@ -488,8 +490,7 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid
     bool has_subtrees_to_compute = true;
 
     // clang-format off
-
-    #pragma acc loop seq
+    nrn_pragma_acc(loop seq)
     for (; has_subtrees_to_compute; ) {  // ncycle loop
 #if !defined(_OPENACC)
         // serial test, gpu does this in parallel
@@ -500,9 +501,11 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid
             // what is the index
             int ip = GPU_PARENT(i);
             double p = GPU_A(i) / GPU_D(i);
-            #pragma acc atomic update
+            nrn_pragma_acc(atomic update)
+            nrn_pragma_omp(atomic update)
             GPU_D(ip) -= p * GPU_B(i);
-            #pragma acc atomic update
+            nrn_pragma_acc(atomic update)
+            nrn_pragma_omp(atomic update)
             GPU_RHS(ip) -= p * GPU_RHS(i);
         }
 #if !defined(_OPENACC)
@@ -535,10 +538,7 @@ static void bksub_interleaved2(NrnThread* nt,
 #if !defined(_OPENACC)
     for (int i = root; i < lastroot; i += 1) {
 #else
-    // clang-format off
-
-    #pragma acc loop seq
-    // clang-format on
+    nrn_pragma_acc(loop seq)
     for (int i = root; i < lastroot; i += warpsize) {
 #endif
         GPU_RHS(i) /= GPU_D(i);  // the root
@@ -596,21 +596,17 @@ void solve_interleaved2(int ith) {
     int* strides = ii.stride;       // sum ncycles of these (bad since ncompart/warpsize)
     int* rootbegin = ii.firstnode;  // nwarp+1 of these
     int* nodebegin = ii.lastnode;   // nwarp+1 of these
-#ifdef _OPENACC
+#if defined(_OPENACC) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD)
     int nstride = stridedispl[nwarp];
-    int stream_id = nt->stream_id;
-#endif
-
-#ifdef _OPENACC
-    // clang-format off
-
-    #pragma acc parallel loop gang vector vector_length(warpsize) \
-        present(nt[0:1], strides[0:nstride], \
-        ncycles[0:nwarp], stridedispl[0:nwarp+1], \
-        rootbegin[0:nwarp+1], nodebegin[0:nwarp+1]) \
-        if (nt->compute_gpu) async(stream_id)
-// clang-format on
 #endif
+    nrn_pragma_acc(parallel loop gang vector vector_length(
+        warpsize) present(nt [0:1],
+                          strides [0:nstride],
+                          ncycles [0:nwarp],
+                          stridedispl [0:nwarp + 1],
+                          rootbegin [0:nwarp + 1],
+                          nodebegin [0:nwarp + 1]) if (nt->compute_gpu) async(nt->stream_id))
+    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
     for (int icore = 0; icore < ncore; ++icore) {
         int iwarp = icore / warpsize;     // figure out the >> value
         int ic = icore & (warpsize - 1);  // figure out the & mask
@@ -629,9 +625,7 @@ void solve_interleaved2(int ith) {
         }  // serial test mode
 #endif
     }
-#ifdef _OPENACC
-#pragma acc wait(nt->stream_id)
-#endif
+    nrn_pragma_acc(wait(nt->stream_id))
 #ifdef _OPENACC
     }
 #endif
@@ -656,28 +650,23 @@ void solve_interleaved1(int ith) {
     int* firstnode = ii.firstnode;
     int* lastnode = ii.lastnode;
     int* cellsize = ii.cellsize;
-#if _OPENACC
-    int stream_id = nt->stream_id;
-#endif
 
-#ifdef _OPENACC
-    // clang-format off
-
-    #pragma acc parallel loop present( \
-        nt[0:1], stride[0:nstride], \
-        firstnode[0:ncell], lastnode[0:ncell], \
-        cellsize[0:ncell]) if (nt->compute_gpu) \
-        async(stream_id)
-// clang-format on
-#endif
+    // OL211123: can we preserve the error checking behaviour of OpenACC's
+    // present clause with OpenMP? It is a bug if these data are not present,
+    // so diagnostics are helpful...
+    nrn_pragma_acc(parallel loop present(nt [0:1],
+                                         stride [0:nstride],
+                                         firstnode [0:ncell],
+                                         lastnode [0:ncell],
+                                         cellsize [0:ncell]) if (nt->compute_gpu)
+                       async(nt->stream_id))
+    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
     for (int icell = 0; icell < ncell; ++icell) {
         int icellsize = cellsize[icell];
         triang_interleaved(nt, icell, icellsize, nstride, stride, lastnode);
         bksub_interleaved(nt, icell, icellsize, nstride, stride, firstnode);
     }
-#ifdef _OPENACC
-#pragma acc wait(stream_id)
-#endif
+    nrn_pragma_acc(wait(nt->stream_id))
 }
 
 void solve_interleaved(int ith) {
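
The solver loops above also illustrate the synchronisation asymmetry of this patch: OpenACC kernels are enqueued with `async(nt->stream_id)` and drained with `wait(...)`, while the OpenMP translations are synchronous, so each `wait` either collapses to a `taskwait` or disappears entirely. A sketch, with illustrative names and assuming the offload.hpp macros:

    #include "coreneuron/utils/offload.hpp"

    // Block until work queued on this thread's device queue has finished.
    // `finish_queue` and `stream_id` are hypothetical names.
    void finish_queue(int stream_id, bool compute_gpu) {
        if (compute_gpu) {
            nrn_pragma_acc(wait(stream_id))  // drain the OpenACC async queue
            nrn_pragma_omp(taskwait)         // the OpenMP path queued nothing asynchronously
        }
    }
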
diff --git a/coreneuron/sim/fadvance_core.cpp b/coreneuron/sim/fadvance_core.cpp
index 8f4ac14cf..a46f83535 100644
--- a/coreneuron/sim/fadvance_core.cpp
+++ b/coreneuron/sim/fadvance_core.cpp
@@ -78,10 +78,11 @@ void dt2thread(double adt) { /* copied from nrnoc/fadvance.c */
         } else {
             nt->cj = 1.0 / dt;
         }
+        nrn_pragma_acc(update device(nt->_t, nt->_dt, nt->cj)
+                           async(nt->stream_id) if (nt->compute_gpu))
         // clang-format off
-
-        #pragma acc update device(nt->_t, nt->_dt, nt->cj) \
-            async(nt->stream_id) if(nt->compute_gpu)
+        nrn_pragma_omp(target update to(nt->_t, nt->_dt, nt->cj)
+                                 if(nt->compute_gpu))
         // clang-format on
     }
 }
@@ -201,35 +202,24 @@ void update(NrnThread* _nt) {
     double* vec_v = &(VEC_V(0));
     double* vec_rhs = &(VEC_RHS(0));
     int i2 = _nt->end;
-#if defined(_OPENACC)
-    int stream_id = _nt->stream_id;
-#endif
 
     /* do not need to worry about linmod or extracellular*/
     if (secondorder) {
-        // clang-format off
-
-        #pragma acc parallel loop present( \
-            vec_v[0:i2], vec_rhs[0:i2]) \
-            if (_nt->compute_gpu) async(stream_id)
-        // clang-format on
+        nrn_pragma_acc(parallel loop present(vec_v [0:i2], vec_rhs [0:i2]) if (_nt->compute_gpu)
+                           async(_nt->stream_id))
+        nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
         for (int i = 0; i < i2; ++i) {
            vec_v[i] += 2. * vec_rhs[i];
        }
    } else {
-        // clang-format off
-
-        #pragma acc parallel loop present( \
-            vec_v[0:i2], vec_rhs[0:i2]) \
-            if (_nt->compute_gpu) async(stream_id)
-        // clang-format on
+        nrn_pragma_acc(parallel loop present(vec_v [0:i2], vec_rhs [0:i2]) if (_nt->compute_gpu)
+                           async(_nt->stream_id))
+        nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
         for (int i = 0; i < i2; ++i) {
             vec_v[i] += vec_rhs[i];
         }
     }
 
-    // update_matrix_to_gpu(_nt);
-
     if (_nt->tml) {
         assert(_nt->tml->index == CAP);
         nrn_cur_capacitance(_nt, _nt->tml->ml, _nt->tml->index);
@@ -304,10 +294,9 @@ void nrncore2nrn_send_values(NrnThread* nth) {
             // make sure we do not overflow the `varrays` buffers
             assert(vs < tr->bsize);
 
-            // clang-format off
-
-            #pragma acc parallel loop present(tr[0:1]) if(nth->compute_gpu) async(nth->stream_id)
-            // clang-format on
+            nrn_pragma_acc(parallel loop present(tr [0:1]) if (nth->compute_gpu)
+                               async(nth->stream_id))
+            nrn_pragma_omp(target teams distribute parallel for simd if(nth->compute_gpu))
             for (int i = 0; i < tr->n_trajec; ++i) {
                 tr->varrays[i][vs] = *tr->gather[i];
             }
@@ -326,12 +315,12 @@ void nrncore2nrn_send_values(NrnThread* nth) {
             // https://github.com/BlueBrain/CoreNeuron/issues/611
             for (int i = 0; i < tr->n_trajec; ++i) {
                 double* gather_i = tr->gather[i];
-                // clang-format off
-
-                #pragma acc update self(gather_i[0:1]) if(nth->compute_gpu) async(nth->stream_id)
+                nrn_pragma_acc(update self(gather_i [0:1]) if (nth->compute_gpu)
+                                   async(nth->stream_id))
+                nrn_pragma_omp(target update from(gather_i [0:1]) if (nth->compute_gpu))
             }
-            #pragma acc wait(nth->stream_id)
-            // clang-format on
+            nrn_pragma_acc(wait(nth->stream_id))
+            nrn_pragma_omp(taskwait)
             for (int i = 0; i < tr->n_trajec; ++i) {
                 *(tr->scatter[i]) = *(tr->gather[i]);
             }
@@ -351,15 +340,11 @@ static void* nrn_fixed_step_thread(NrnThread* nth) {
     nth->_t += .5 * nth->_dt;
 
     if (nth->ncell) {
-#if defined(_OPENACC)
-        int stream_id = nth->stream_id;
-        /*@todo: do we need to update nth->_t on GPU: Yes (Michael, but can launch kernel) */
-        // clang-format off
-
-        #pragma acc update device(nth->_t) if (nth->compute_gpu) async(stream_id)
-        #pragma acc wait(stream_id)
-// clang-format on
-#endif
+        /*@todo: do we need to update nth->_t on GPU: Yes (Michael, but can
+           launch kernel) */
+        nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->stream_id))
+        nrn_pragma_acc(wait(nth->stream_id))
+        nrn_pragma_omp(target update to(nth->_t) if (nth->compute_gpu))
         fixed_play_continuous(nth);
 
         {
@@ -393,12 +378,9 @@ void* nrn_fixed_step_lastpart(NrnThread* nth) {
 
     if (nth->ncell) {
         /*@todo: do we need to update nth->_t on GPU */
-        // clang-format off
-
-        #pragma acc update device(nth->_t) if (nth->compute_gpu) async(nth->stream_id)
-        #pragma acc wait(nth->stream_id)
-        // clang-format on
-
+        nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->stream_id))
+        nrn_pragma_acc(wait(nth->stream_id))
+        nrn_pragma_omp(target update to(nth->_t) if (nth->compute_gpu))
         fixed_play_continuous(nth);
         nonvint(nth);
         nrncore2nrn_send_values(nth);
diff --git a/coreneuron/sim/fast_imem.cpp b/coreneuron/sim/fast_imem.cpp
index 8dfb0cd76..1218b7967 100644
--- a/coreneuron/sim/fast_imem.cpp
+++ b/coreneuron/sim/fast_imem.cpp
@@ -50,10 +50,10 @@ void nrn_calc_fast_imem(NrnThread* nt) {
     double* fast_imem_d = nt->nrn_fast_imem->nrn_sav_d;
     double* fast_imem_rhs = nt->nrn_fast_imem->nrn_sav_rhs;
 
-#pragma acc parallel loop present(vec_rhs, \
-                                  vec_area, \
-                                  fast_imem_d, \
-                                  fast_imem_rhs) if (nt->compute_gpu) async(nt->stream_id)
+    nrn_pragma_acc(
+        parallel loop present(vec_rhs, vec_area, fast_imem_d, fast_imem_rhs) if (nt->compute_gpu)
+            async(nt->stream_id))
+    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
     for (int i = i1; i < i3; ++i) {
         fast_imem_rhs[i] = (fast_imem_d[i] * vec_rhs[i] + fast_imem_rhs[i]) * vec_area[i] * 0.01;
     }
@@ -68,8 +68,9 @@ void nrn_calc_fast_imem_init(NrnThread* nt) {
     double* vec_area = nt->_actual_area;
     double* fast_imem_rhs = nt->nrn_fast_imem->nrn_sav_rhs;
 
-#pragma acc parallel loop present(vec_rhs, vec_area, fast_imem_rhs) if (nt->compute_gpu) \
-    async(nt->stream_id)
+    nrn_pragma_acc(parallel loop present(vec_rhs, vec_area, fast_imem_rhs) if (nt->compute_gpu)
+                       async(nt->stream_id))
+    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
     for (int i = i1; i < i3; ++i) {
         fast_imem_rhs[i] = (vec_rhs[i] + fast_imem_rhs[i]) * vec_area[i] * 0.01;
     }
diff --git a/coreneuron/sim/finitialize.cpp b/coreneuron/sim/finitialize.cpp
index 1ae79a92f..d711ae247 100644
--- a/coreneuron/sim/finitialize.cpp
+++ b/coreneuron/sim/finitialize.cpp
@@ -53,12 +53,9 @@ void nrn_finitialize(int setv, double v) {
     if (setv) {
         for (auto _nt = nrn_threads; _nt < nrn_threads + nrn_nthread; ++_nt) {
             double* vec_v = &(VEC_V(0));
-            // clang-format off
-
-            #pragma acc parallel loop present( \
-                _nt[0:1], vec_v[0:_nt->end]) \
-                if (_nt->compute_gpu)
-            // clang-format on
+            nrn_pragma_acc(
+                parallel loop present(_nt [0:1], vec_v [0:_nt->end]) if (_nt->compute_gpu))
+            nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
             for (int i = 0; i < _nt->end; ++i) {
                 vec_v[i] = v;
             }
diff --git a/coreneuron/sim/solve_core.cpp b/coreneuron/sim/solve_core.cpp
index a24c8360f..60ba2b660 100644
--- a/coreneuron/sim/solve_core.cpp
+++ b/coreneuron/sim/solve_core.cpp
@@ -24,7 +24,9 @@ void nrn_solve_minimal(NrnThread* _nt) {
     }
 }
 
-/** TODO loops are executed seq in OpenACC just for debugging, remove it! */
+/** @todo OpenACC GPU offload is sequential/slow. Because --cell-permute=0 and
+ *  --gpu is forbidden anyway, no OpenMP target offload equivalent is implemented.
+ */
 
 /* triangularization of the matrix equations */
 static void triang(NrnThread* _nt) {
@@ -37,17 +39,9 @@ static void triang(NrnThread* _nt) {
     double* vec_rhs = &(VEC_RHS(0));
     int* parent_index = _nt->_v_parent_index;
 
-#if defined(_OPENACC)
-    int stream_id = _nt->stream_id;
-#endif
-
-    /** @todo: just for benchmarking, otherwise produces wrong results */
-    // clang-format off
-
-    #pragma acc parallel loop seq present( \
-        vec_a[0:i3], vec_b[0:i3], vec_d[0:i3], \
-        vec_rhs[0:i3], parent_index[0:i3]) \
-        async(stream_id) if (_nt->compute_gpu)
-    // clang-format on
+    nrn_pragma_acc(parallel loop seq present(
+        vec_a [0:i3], vec_b [0:i3], vec_d [0:i3], vec_rhs [0:i3], parent_index [0:i3])
+                       async(_nt->stream_id) if (_nt->compute_gpu))
     for (int i = i3 - 1; i >= i2; --i) {
         double p = vec_a[i] / vec_d[i];
         vec_d[parent_index[i]] -= p * vec_b[i];
@@ -66,33 +60,22 @@ static void bksub(NrnThread* _nt) {
     double* vec_rhs = &(VEC_RHS(0));
     int* parent_index = _nt->_v_parent_index;
 
-#if defined(_OPENACC)
-    int stream_id = _nt->stream_id;
-#endif
-
-    /** @todo: just for benchmarking, otherwise produces wrong results */
-    // clang-format off
-
-    #pragma acc parallel loop seq present( \
-        vec_d[0:i2], vec_rhs[0:i2]) \
-        async(stream_id) if (_nt->compute_gpu)
-    // clang-format on
+    nrn_pragma_acc(parallel loop seq present(vec_d [0:i2], vec_rhs [0:i2])
+                       async(_nt->stream_id) if (_nt->compute_gpu))
     for (int i = i1; i < i2; ++i) {
         vec_rhs[i] /= vec_d[i];
     }
 
-    /** @todo: just for benchmarking, otherwise produces wrong results */
-    // clang-format off
-
-    #pragma acc parallel loop seq present( \
-        vec_b[0:i3], vec_d[0:i3], vec_rhs[0:i3], \
-        parent_index[0:i3]) async(stream_id) \
-        if (_nt->compute_gpu)
+    nrn_pragma_acc(
+        parallel loop seq present(vec_b [0:i3], vec_d [0:i3], vec_rhs [0:i3], parent_index [0:i3])
+            async(_nt->stream_id) if (_nt->compute_gpu))
     for (int i = i2; i < i3; ++i) {
         vec_rhs[i] -= vec_b[i] * vec_rhs[parent_index[i]];
         vec_rhs[i] /= vec_d[i];
     }
 
-    #pragma acc wait(stream_id)
-    // clang-format on
+    if (_nt->compute_gpu) {
+        nrn_pragma_acc(wait(_nt->stream_id))
+    }
 }
 }  // namespace coreneuron
diff --git a/coreneuron/sim/treeset_core.cpp b/coreneuron/sim/treeset_core.cpp
index 943980bcd..bb92d2ab1 100644
--- a/coreneuron/sim/treeset_core.cpp
+++ b/coreneuron/sim/treeset_core.cpp
@@ -32,12 +32,9 @@ static void nrn_rhs(NrnThread* _nt) {
     double* vec_v = &(VEC_V(0));
     int* parent_index = _nt->_v_parent_index;
 
-    // clang-format off
-
-    #pragma acc parallel loop present( \
-        vec_rhs[0:i3], vec_d[0:i3]) \
-        if (_nt->compute_gpu) async(_nt->stream_id)
-    // clang-format on
+    nrn_pragma_acc(parallel loop present(vec_rhs [0:i3], vec_d [0:i3]) if (_nt->compute_gpu)
+                       async(_nt->stream_id))
+    nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
     for (int i = i1; i < i3; ++i) {
         vec_rhs[i] = 0.;
         vec_d[i] = 0.;
@@ -46,9 +43,10 @@ static void nrn_rhs(NrnThread* _nt) {
     if (_nt->nrn_fast_imem) {
         double* fast_imem_d = _nt->nrn_fast_imem->nrn_sav_d;
         double* fast_imem_rhs = _nt->nrn_fast_imem->nrn_sav_rhs;
-#pragma acc parallel loop present(fast_imem_d [i1:i3], \
-                                  fast_imem_rhs [i1:i3]) if (_nt->compute_gpu) \
-    async(_nt->stream_id)
+        nrn_pragma_acc(
+            parallel loop present(fast_imem_d [i1:i3], fast_imem_rhs [i1:i3]) if (_nt->compute_gpu)
+                async(_nt->stream_id))
+        nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
         for (int i = i1; i < i3; ++i) {
             fast_imem_d[i] = 0.;
             fast_imem_rhs[i] = 0.;
@@ -76,7 +74,9 @@ static void nrn_rhs(NrnThread* _nt) {
     so here we transform so it only has membrane current contribution
     */
         double* p = _nt->nrn_fast_imem->nrn_sav_rhs;
-#pragma acc parallel loop present(p, vec_rhs) if (_nt->compute_gpu) async(_nt->stream_id)
+        nrn_pragma_acc(parallel loop present(p, vec_rhs) if (_nt->compute_gpu)
+                           async(_nt->stream_id))
+        nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
         for (int i = i1; i < i3; ++i) {
             p[i] -= vec_rhs[i];
         }
@@ -86,22 +86,24 @@ static void nrn_rhs(NrnThread* _nt) {
     The extracellular mechanism contribution is already done.
         rhs += ai_j*(vi_j - vi)
     */
-    // clang-format off
-
-    #pragma acc parallel loop present( \
-        vec_rhs[0:i3], vec_d[0:i3], \
-        vec_a[0:i3], vec_b[0:i3], \
-        vec_v[0:i3], parent_index[0:i3]) \
-        if (_nt->compute_gpu) async(_nt->stream_id)
+    nrn_pragma_acc(parallel loop present(vec_rhs [0:i3],
+                                         vec_d [0:i3],
+                                         vec_a [0:i3],
+                                         vec_b [0:i3],
+                                         vec_v [0:i3],
+                                         parent_index [0:i3]) if (_nt->compute_gpu)
+                       async(_nt->stream_id))
+    nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
     for (int i = i2; i < i3; ++i) {
         double dv = vec_v[parent_index[i]] - vec_v[i];
         /* our connection coefficients are negative so */
-        #pragma acc atomic update
+        nrn_pragma_acc(atomic update)
+        nrn_pragma_omp(atomic update)
         vec_rhs[i] -= vec_b[i] * dv;
-        #pragma acc atomic update
+        nrn_pragma_acc(atomic update)
+        nrn_pragma_omp(atomic update)
         vec_rhs[parent_index[i]] += vec_a[i] * dv;
     }
-    // clang-format on
 }
 
 /* calculate left hand side of
@@ -150,34 +152,32 @@ static void nrn_lhs(NrnThread* _nt) {
     so here we transform so it only has membrane current contribution
     */
         double* p = _nt->nrn_fast_imem->nrn_sav_d;
-#pragma acc parallel loop present(p, vec_d) if (_nt->compute_gpu) async(_nt->stream_id)
+        nrn_pragma_acc(parallel loop present(p, vec_d) if (_nt->compute_gpu) async(_nt->stream_id))
+        nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
         for (int i = i1; i < i3; ++i) {
             p[i] += vec_d[i];
         }
     }
 
     /* now add the axial currents */
-    // clang-format off
-
-    #pragma acc parallel loop present( \
-        vec_d[0:i3], vec_a[0:i3], \
-        vec_b[0:i3], parent_index[0:i3]) \
-        if (_nt->compute_gpu) async(_nt->stream_id)
+    nrn_pragma_acc(parallel loop present(
+        vec_d [0:i3], vec_a [0:i3], vec_b [0:i3], parent_index [0:i3]) if (_nt->compute_gpu)
+                       async(_nt->stream_id))
+    nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
     for (int i = i2; i < i3; ++i) {
-        #pragma acc atomic update
+        nrn_pragma_acc(atomic update)
+        nrn_pragma_omp(atomic update)
         vec_d[i] -= vec_b[i];
-        #pragma acc atomic update
+        nrn_pragma_acc(atomic update)
+        nrn_pragma_omp(atomic update)
         vec_d[parent_index[i]] -= vec_a[i];
     }
-    // clang-format on
 }
 
 /* for the fixed step method */
 void* setup_tree_matrix_minimal(NrnThread* _nt) {
     nrn_rhs(_nt);
     nrn_lhs(_nt);
-    // update_matrix_from_gpu(_nt);
-
     return nullptr;
 }
 }  // namespace coreneuron
diff --git a/coreneuron/utils/memory.h b/coreneuron/utils/memory.h
index 965c06e78..2f0e24458 100644
--- a/coreneuron/utils/memory.h
+++ b/coreneuron/utils/memory.h
@@ -115,8 +115,7 @@ auto allocate_unique(const Alloc& alloc, Args&&... args) {
 }  // namespace coreneuron
 
 /// for gpu builds with unified memory support
-/// OL210812: why do we include __CUDACC__ here?
-#if (defined(__CUDACC__) || defined(CORENEURON_UNIFIED_MEMORY))
+#ifdef CORENEURON_UNIFIED_MEMORY
 
 #include <cuda_runtime_api.h>
diff --git a/coreneuron/utils/offload.hpp b/coreneuron/utils/offload.hpp
new file mode 100644
index 000000000..d90cc10fd
--- /dev/null
+++ b/coreneuron/utils/offload.hpp
@@ -0,0 +1,20 @@
+/*
+# =============================================================================
+# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
+#
+# See top-level LICENSE file for details.
+# =============================================================================
+*/
+#pragma once
+#define nrn_pragma_stringify(x) #x
+#if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
+#define nrn_pragma_acc(x)
+#define nrn_pragma_omp(x) _Pragma(nrn_pragma_stringify(omp x))
+#elif defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
+    defined(_OPENACC)
+#define nrn_pragma_acc(x) _Pragma(nrn_pragma_stringify(acc x))
+#define nrn_pragma_omp(x)
+#else
+#define nrn_pragma_acc(x)
+#define nrn_pragma_omp(x)
+#endif
diff --git a/external/nmodl b/external/nmodl
index 794b419f5..a60c5e903 160000
--- a/external/nmodl
+++ b/external/nmodl
@@ -1 +1 @@
-Subproject commit 794b419f5256f40efcdca1674f712a6e544c235a
+Subproject commit a60c5e903126ad95cfe2bceb904d0efe83ba9d8a
diff --git a/tests/unit/lfp/CMakeLists.txt b/tests/unit/lfp/CMakeLists.txt
index 3e2ac8e80..ec795f178 100644
--- a/tests/unit/lfp/CMakeLists.txt
+++ b/tests/unit/lfp/CMakeLists.txt
@@ -22,3 +22,4 @@ set_target_properties(lfp_test_bin PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS OFF)
 target_compile_options(lfp_test_bin PRIVATE ${CORENEURON_BOOST_UNIT_TEST_COMPILE_FLAGS})
 add_dependencies(lfp_test_bin nrniv-core)
 add_test(NAME lfp_test COMMAND ${TEST_EXEC_PREFIX} $<TARGET_FILE:lfp_test_bin>)
+set_tests_properties(lfp_test PROPERTIES ENVIRONMENT OMP_NUM_THREADS=1)
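
Taken together, offload.hpp is the crux of the patch: every annotated region carries both directive families, and the build configuration decides which one survives preprocessing. The `CORENEURON_PREFER_OPENMP_OFFLOAD` definition added in OpenAccHelper.cmake, together with `-acc -mp=gpu`, selects the OpenMP branch; OpenACC-only GPU builds keep the `acc` pragma; CPU builds keep neither. Given the header above, a doubly-annotated loop reads like this (illustrative kernel, not from the patch):

    #include "coreneuron/utils/offload.hpp"

    // With CORENEURON_PREFER_OPENMP_OFFLOAD this compiles as an OpenMP target
    // kernel; with plain OpenACC GPU builds as an acc parallel loop; with
    // neither, as an ordinary host loop. `axpy` is a hypothetical name.
    void axpy(double a, const double* x, double* y, int n, bool compute_gpu) {
        nrn_pragma_acc(parallel loop present(x[0:n], y[0:n]) if(compute_gpu))
        nrn_pragma_omp(target teams distribute parallel for simd if(compute_gpu))
        for (int i = 0; i < n; ++i) {
            y[i] += a * x[i];
        }
    }
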