From d452e1a59e8a911ac6aa0cbd452b707f5032540d Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Tue, 23 Nov 2021 09:00:39 +0100 Subject: [PATCH 01/31] Update nmodl to hackathon_main. --- external/nmodl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/nmodl b/external/nmodl index 85dec3618..794b419f5 160000 --- a/external/nmodl +++ b/external/nmodl @@ -1 +1 @@ -Subproject commit 85dec36180cc8d012db3392c06c065d39de79960 +Subproject commit 794b419f5256f40efcdca1674f712a6e544c235a From 8ab49e9a22dc0e2c47773de55fdc20c3c95a5be8 Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Tue, 23 Nov 2021 17:53:09 +0100 Subject: [PATCH 02/31] [Hackathon] disable a lot of CI (#694) * Disable cmake-format and clang-format checks. * Disable GitLab CI except for NMODL + GPU. --- .../workflows/clang_cmake_format_check.yaml | 37 -------- .gitlab-ci.yml | 95 ------------------- 2 files changed, 132 deletions(-) delete mode 100644 .github/workflows/clang_cmake_format_check.yaml diff --git a/.github/workflows/clang_cmake_format_check.yaml b/.github/workflows/clang_cmake_format_check.yaml deleted file mode 100644 index b438a8080..000000000 --- a/.github/workflows/clang_cmake_format_check.yaml +++ /dev/null @@ -1,37 +0,0 @@ -name: clang-cmake-format-check - -concurrency: - group: ${{ github.workflow }}#${{ github.ref }} - cancel-in-progress: true - -on: - push: - -jobs: - build: - name: clang-cmake-format-check - runs-on: ubuntu-20.04 - steps: - - name: Fetch repository - uses: actions/checkout@v2 - - name: Install clang-format 11 - run: | - sudo apt-get update - sudo apt-get install clang-format-11 python3-pip libboost-all-dev libopenmpi-dev openmpi-bin - - name: Install cmake-format 0.6.13 - run: python3 -m pip install cmake-format==0.6.13 - - name: Configure - shell: bash - working-directory: ${{runner.workspace}}/CoreNeuron - run: | - export PATH=/home/runner/.local/bin:$PATH - mkdir BUILD && cd BUILD - cmake -DCORENRN_CLANG_FORMAT=ON -DCORENRN_CMAKE_FORMAT=ON -DCORENRN_ENABLE_MPI=ON -DCORENRN_ENABLE_OPENMP=OFF -DClangFormat_EXECUTABLE=$(which clang-format-11) -DCMakeFormat_EXECUTABLE=$(which cmake-format) .. 
- - name: Run clang-format - shell: bash - working-directory: ${{runner.workspace}}/CoreNeuron/BUILD - run: make check-clang-format VERBOSE=1 - - name: Run cmake-format - shell: bash - working-directory: ${{runner.workspace}}/CoreNeuron/BUILD - run: make check-cmake-format diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 5e3967f7d..1d89f8eca 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -35,9 +35,6 @@ spack_setup: - git diff - fi -.spack_intel: - variables: - SPACK_PACKAGE_COMPILER: intel .spack_nvhpc: variables: SPACK_PACKAGE_COMPILER: nvhpc @@ -50,16 +47,6 @@ spack_setup: variables: bb5_constraint: volta -build:nmodl:intel: - stage: build_nmodl - variables: - SPACK_PACKAGE: nmodl - SPACK_PACKAGE_REF: '' - SPACK_PACKAGE_SPEC: ~legacy-unit - extends: - - .spack_build - - .spack_intel - build:nmodl:gpu: stage: build_nmodl variables: @@ -71,23 +58,6 @@ build:nmodl:gpu: - .spack_build - .spack_nvhpc -build:coreneuron+nmodl:intel: - variables: - SPACK_PACKAGE: coreneuron - SPACK_PACKAGE_SPEC: +nmodl+tests~legacy-unit build_type=Debug - extends: - - .spack_build - - .spack_intel - needs: ["build:nmodl:intel"] - -build:coreneuron:intel: - variables: - SPACK_PACKAGE: coreneuron - SPACK_PACKAGE_SPEC: +tests~legacy-unit build_type=Debug - extends: - - .spack_build - - .spack_intel - build:coreneuron+nmodl:gpu: variables: SPACK_PACKAGE: coreneuron @@ -99,48 +69,10 @@ build:coreneuron+nmodl:gpu: - .spack_nvhpc needs: ["build:nmodl:gpu"] -build:coreneuron:gpu: - variables: - SPACK_PACKAGE: coreneuron - # +report pulls in a lot of dependencies and the tests fail. - # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type - SPACK_PACKAGE_SPEC: +gpu+tests~legacy-unit~report build_type=RelWithDebInfo - extends: - - .spack_build - - .spack_nvhpc - -test:coreneuron+nmodl:intel: - extends: [.ctest] - needs: ["build:coreneuron+nmodl:intel"] - -test:coreneuron:intel: - extends: [.ctest] - needs: ["build:coreneuron:intel"] - test:coreneuron+nmodl:gpu: extends: [.ctest, .gpu_node] needs: ["build:coreneuron+nmodl:gpu"] -test:coreneuron:gpu: - extends: [.ctest, .gpu_node] - needs: ["build:coreneuron:gpu"] - -build:neuron+nmodl:intel: - stage: build_neuron - extends: - - .spack_build - - .spack_neuron - - .spack_intel - needs: ["build:coreneuron+nmodl:intel"] - -build:neuron:intel: - stage: build_neuron - extends: - - .spack_build - - .spack_neuron - - .spack_intel - needs: ["build:coreneuron:intel"] - build:neuron+nmodl:gpu: stage: build_neuron extends: @@ -153,34 +85,7 @@ build:neuron+nmodl:gpu: - !reference [.spack_build, before_script] needs: ["build:coreneuron+nmodl:gpu"] -build:neuron:gpu: - stage: build_neuron - extends: - - .spack_build - - .spack_neuron - - .spack_nvhpc - before_script: - # Build py-cython and py-numpy with GCC instead of NVHPC. 
- - SPACK_PACKAGE_DEPENDENCIES="${SPACK_PACKAGE_DEPENDENCIES}^py-cython%gcc^py-numpy%gcc" - - !reference [.spack_build, before_script] - needs: ["build:coreneuron:gpu"] - -test:neuron+nmodl:intel: - stage: test_neuron - extends: [.ctest] - needs: ["build:neuron+nmodl:intel"] - -test:neuron:intel: - stage: test_neuron - extends: [.ctest] - needs: ["build:neuron:intel"] - test:neuron+nmodl:gpu: stage: test_neuron extends: [.ctest, .gpu_node] needs: ["build:neuron+nmodl:gpu"] - -test:neuron:gpu: - stage: test_neuron - extends: [.ctest, .gpu_node] - needs: ["build:neuron:gpu"] From 560cc3f2fff6b53c461022ec1f018e5d0781082e Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Thu, 25 Nov 2021 19:32:45 +0100 Subject: [PATCH 03/31] [Hackathon] Add a temporary option for benchmark data. (#695) * Add a hackathon-specific argument for benchmarks. * Add a reference comparison for channel-benchmark. --- CMakeLists.txt | 3 +++ coreneuron/CMakeLists.txt | 30 +++++++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b3edf3a5..4e53a5de6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -103,6 +103,9 @@ option(CORENRN_ENABLE_SHARED "Enable shared library build" ON) option(CORENRN_ENABLE_LEGACY_UNITS "Enable legacy FARADAY, R, etc" OFF) option(CORENRN_ENABLE_PRCELLSTATE "Enable NRN_PRCELLSTATE debug feature" OFF) +set(CORENRN_EXTERNAL_BENCHMARK_DATA + "" + CACHE PATH "Path to input data files and mechanisms for benchmarks") set(CORENRN_NMODL_DIR "" CACHE PATH "Path to nmodl source-to-source compiler installation") diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt index 6fd5c98a8..e7337331e 100644 --- a/coreneuron/CMakeLists.txt +++ b/coreneuron/CMakeLists.txt @@ -293,6 +293,33 @@ set_target_properties( # ============================================================================= # create special-core with halfgap.mod for tests # ============================================================================= +set(all_output_binaries) +if(NOT ${CORENRN_EXTERNAL_BENCHMARK_DATA} STREQUAL "") + # Hack for the december 2021 hackathon, build an extra special-core with channel-benchmark + # mechanisms. 
+ set(modfile_directory + "${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark/benchmark/channels/lib/modlib") + file(GLOB modfiles "${modfile_directory}/*.mod") + set(output_binaries "${CMAKE_BINARY_DIR}/benchmark/${CMAKE_SYSTEM_PROCESSOR}/special-core" + "${CMAKE_BINARY_DIR}/benchmark/${CMAKE_SYSTEM_PROCESSOR}/libcorenrnmech.a") + add_custom_command( + OUTPUT ${output_binaries} + DEPENDS scopmath coreneuron ${NMODL_TARGET_TO_DEPEND} ${modfiles} ${CORENEURON_BUILTIN_MODFILES} + COMMAND ${CMAKE_BINARY_DIR}/bin/nrnivmodl-core -b STATIC -m ${CORENRN_MOD2CPP_BINARY} -p 1 + "${modfile_directory}" + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/benchmark + COMMENT "Running nrnivmodl-core for channel-benchmark mechanisms") + list(APPEND all_output_binaries ${output_binaries}) + string( + CONCAT + benchmark_command + "'${CMAKE_BINARY_DIR}/benchmark/${CMAKE_SYSTEM_PROCESSOR}/special-core'" + " --datpath '${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark-all-440-cells-2-ranks'" + " --tstop 1 &&" + "diff out.dat '${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark-all-440-cells-2-ranks.gpu.spikes'" + ) + add_test(NAME benchmark COMMAND sh -c "${benchmark_command}") +endif() set(modfile_directory "${CORENEURON_PROJECT_SOURCE_DIR}/tests/integration/ring_gap/mod") file(GLOB modfiles "${modfile_directory}/*.mod") set(output_binaries "${CMAKE_BINARY_DIR}/bin/${CMAKE_SYSTEM_PROCESSOR}/special-core" @@ -304,7 +331,8 @@ add_custom_command( "${modfile_directory}" WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/bin COMMENT "Running nrnivmodl-core with halfgap.mod") -add_custom_target(nrniv-core ALL DEPENDS ${output_binaries}) +list(APPEND all_output_binaries ${output_binaries}) +add_custom_target(nrniv-core ALL DEPENDS ${all_output_binaries}) include_directories(${CORENEURON_PROJECT_SOURCE_DIR}) From de4e4337da16e380de81853b108cfacbeb2a6d8b Mon Sep 17 00:00:00 2001 From: Pramod Kumbhar Date: Fri, 26 Nov 2021 08:50:58 +0100 Subject: [PATCH 04/31] Minor changes for building on perlmutter (#697) * create build/benchmark folder before trying to use it * run nrnivmodl-core in parallel than serially (too slow) --- coreneuron/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt index e7337331e..5bea0569a 100644 --- a/coreneuron/CMakeLists.txt +++ b/coreneuron/CMakeLists.txt @@ -302,10 +302,11 @@ if(NOT ${CORENRN_EXTERNAL_BENCHMARK_DATA} STREQUAL "") file(GLOB modfiles "${modfile_directory}/*.mod") set(output_binaries "${CMAKE_BINARY_DIR}/benchmark/${CMAKE_SYSTEM_PROCESSOR}/special-core" "${CMAKE_BINARY_DIR}/benchmark/${CMAKE_SYSTEM_PROCESSOR}/libcorenrnmech.a") + file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/benchmark) add_custom_command( OUTPUT ${output_binaries} DEPENDS scopmath coreneuron ${NMODL_TARGET_TO_DEPEND} ${modfiles} ${CORENEURON_BUILTIN_MODFILES} - COMMAND ${CMAKE_BINARY_DIR}/bin/nrnivmodl-core -b STATIC -m ${CORENRN_MOD2CPP_BINARY} -p 1 + COMMAND ${CMAKE_BINARY_DIR}/bin/nrnivmodl-core -b STATIC -m ${CORENRN_MOD2CPP_BINARY} -p 6 "${modfile_directory}" WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/benchmark COMMENT "Running nrnivmodl-core for channel-benchmark mechanisms") From 81dd5ef4bbecbb3b8769d0753c6910785ca82b11 Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Mon, 29 Nov 2021 15:00:07 +0100 Subject: [PATCH 05/31] Enable OpenMP in CoreNEURON CI. 
(#698) --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 1d89f8eca..84e83c0ac 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -63,7 +63,7 @@ build:coreneuron+nmodl:gpu: SPACK_PACKAGE: coreneuron # +report pulls in a lot of dependencies and the tests fail. # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type - SPACK_PACKAGE_SPEC: +nmodl+gpu+tests~legacy-unit~report build_type=RelWithDebInfo + SPACK_PACKAGE_SPEC: +nmodl+openmp+gpu+tests~legacy-unit~report build_type=RelWithDebInfo extends: - .spack_build - .spack_nvhpc From 3e394c499e9746d3daee5793ae0816eabe39e2e1 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Mon, 29 Nov 2021 21:39:55 +0100 Subject: [PATCH 06/31] Set by default the number of warps to execute in a large reasonable number and update the related documentation (#700) --- coreneuron/apps/corenrn_parameters.cpp | 7 ++++++- coreneuron/apps/corenrn_parameters.hpp | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/coreneuron/apps/corenrn_parameters.cpp b/coreneuron/apps/corenrn_parameters.cpp index c0aa02ab0..4403f44db 100644 --- a/coreneuron/apps/corenrn_parameters.cpp +++ b/coreneuron/apps/corenrn_parameters.cpp @@ -47,7 +47,12 @@ corenrn_parameters::corenrn_parameters() { "Print number of instances of each mechanism and detailed memory stats."); auto sub_gpu = app.add_option_group("GPU", "Commands relative to GPU."); - sub_gpu->add_option("-W, --nwarp", this->nwarp, "Number of warps to balance.", true) + sub_gpu + ->add_option("-W, --nwarp", + this->nwarp, + "Number of warps to execute in parallel the Hines solver. Each warp solves a " + "group of cells. (Only used with cell permute 2)", + true) ->check(CLI::Range(0, 1'000'000)); sub_gpu ->add_option("-R, --cell-permute", diff --git a/coreneuron/apps/corenrn_parameters.hpp b/coreneuron/apps/corenrn_parameters.hpp index ea7ef8aba..21f2f7767 100644 --- a/coreneuron/apps/corenrn_parameters.hpp +++ b/coreneuron/apps/corenrn_parameters.hpp @@ -46,7 +46,7 @@ struct corenrn_parameters { unsigned ms_subint = 2; /// Number of multisend interval. 1 or 2 unsigned spkcompress = 0; /// Spike Compression unsigned cell_interleave_permute = 0; /// Cell interleaving permutation - unsigned nwarp = 0; /// Number of warps to balance for cell_interleave_permute == 2 + unsigned nwarp = 1024; /// Number of warps to balance for cell_interleave_permute == 2 unsigned num_gpus = 0; /// Number of gpus to use per node unsigned report_buff_size = report_buff_size_default; /// Size in MB of the report buffer. int seed = -1; /// Initialization seed for random number generator (int) From a8bb7164a5b3b8802c97ce11a0c083d463c7dbc5 Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Wed, 1 Dec 2021 21:05:19 +0100 Subject: [PATCH 07/31] Add memory pool for Random123 streams. (#702) * Add memory pool for Random123 streams. This speeds up initialisation when running on GPU. * Make Boost optional. 
--- coreneuron/CMakeLists.txt | 9 +++++ coreneuron/utils/randoms/nrnran123.cu | 55 ++++++++++++++++++++++++--- 2 files changed, 58 insertions(+), 6 deletions(-) diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt index 5bea0569a..60bd2b370 100644 --- a/coreneuron/CMakeLists.txt +++ b/coreneuron/CMakeLists.txt @@ -284,6 +284,15 @@ target_include_directories(coreneuron SYSTEM target_include_directories(coreneuron SYSTEM PRIVATE ${CORENEURON_PROJECT_SOURCE_DIR}/external/CLI11/include) +if(CORENRN_ENABLE_GPU) + # nrnran123.cpp possibly-temporarily uses Boost.Pool in GPU builds if it's available. + find_package(Boost QUIET) + if(Boost_FOUND) + target_include_directories(coreneuron SYSTEM PRIVATE ${Boost_INCLUDE_DIRS}) + target_compile_definitions(coreneuron PRIVATE CORENEURON_USE_BOOST_POOL) + endif() +endif() + set_target_properties( coreneuron scopmath PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib diff --git a/coreneuron/utils/randoms/nrnran123.cu b/coreneuron/utils/randoms/nrnran123.cu index b13dad7eb..9a3d205a3 100644 --- a/coreneuron/utils/randoms/nrnran123.cu +++ b/coreneuron/utils/randoms/nrnran123.cu @@ -15,6 +15,11 @@ #include #include +#ifdef CORENEURON_USE_BOOST_POOL +#include +#include +#endif + // In a GPU build this file will be compiled by NVCC as CUDA code // In a CPU build this file will be compiled by a C++ compiler as C++ code #ifdef __CUDACC__ @@ -24,6 +29,48 @@ #endif namespace { +#ifdef CORENEURON_USE_BOOST_POOL +/** Tag type for use with boost::fast_pool_allocator that forwards to + * coreneuron::[de]allocate_unified(). Using a Random123-specific type here + * makes sure that allocations do not come from the same global pool as other + * usage of boost pools for objects with sizeof == sizeof(nrnran123_State). + * + * The messy m_block_sizes map is just because `deallocate_unified` uses sized + * deallocations, but the Boost pool allocators don't. Because this is hidden + * behind the pool mechanism, these methods are not called very often and the + * overhead is minimal. + */ +struct random123_allocate_unified { + using size_type = std::size_t; + using difference_type = std::size_t; + static char* malloc(const size_type bytes) { + std::lock_guard const lock{m_mutex}; + static_cast(lock); + auto* buffer = coreneuron::allocate_unified(bytes); + m_block_sizes[buffer] = bytes; + return reinterpret_cast(buffer); + } + static void free(char* const block) { + std::lock_guard const lock{m_mutex}; + static_cast(lock); + auto const iter = m_block_sizes.find(block); + assert(iter != m_block_sizes.end()); + auto const size = iter->second; + m_block_sizes.erase(iter); + return coreneuron::deallocate_unified(block, size); + } + static std::mutex m_mutex; + static std::unordered_map m_block_sizes; +}; + +std::mutex random123_allocate_unified::m_mutex{}; +std::unordered_map random123_allocate_unified::m_block_sizes{}; + +using random123_allocator = + boost::fast_pool_allocator; +#else +using random123_allocator = coreneuron::unified_allocator; +#endif /* Global data structure per process. Using a unique_ptr here causes [minor] * problems because its destructor can be called very late during application * shutdown. 
If the destructor calls cudaFree and the CUDA runtime has already @@ -212,9 +259,7 @@ nrnran123_State* nrnran123_newstream3(uint32_t id1, #endif nrnran123_State* s{nullptr}; if (use_unified_memory) { - s = coreneuron::allocate_unique( - coreneuron::unified_allocator{}) - .release(); + s = coreneuron::allocate_unique(random123_allocator{}).release(); } else { s = new nrnran123_State{}; } @@ -244,9 +289,7 @@ void nrnran123_deletestream(nrnran123_State* s, bool use_unified_memory) { --g_instance_count; } if (use_unified_memory) { - std::unique_ptr>> - _{s}; + std::unique_ptr> _{s}; } else { delete s; } From 96498142ce60d15edbd4c5161c7153f41eda6b20 Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Thu, 2 Dec 2021 11:04:21 +0100 Subject: [PATCH 08/31] Fix Boost-free compilation. (#703) This was a silly bug in #702. --- coreneuron/utils/randoms/nrnran123.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/coreneuron/utils/randoms/nrnran123.cu b/coreneuron/utils/randoms/nrnran123.cu index 9a3d205a3..8a02c4e26 100644 --- a/coreneuron/utils/randoms/nrnran123.cu +++ b/coreneuron/utils/randoms/nrnran123.cu @@ -69,7 +69,7 @@ std::unordered_map random123_allocate_unified::m_block_sizes using random123_allocator = boost::fast_pool_allocator; #else -using random123_allocator = coreneuron::unified_allocator; +using random123_allocator = coreneuron::unified_allocator; #endif /* Global data structure per process. Using a unique_ptr here causes [minor] * problems because its destructor can be called very late during application From 21dc2c8b40bf817ccaeeab05ae91f0b4ce88145f Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Tue, 7 Dec 2021 13:13:36 +0100 Subject: [PATCH 09/31] Basic OpenACC -> OpenMP migration. (#693) * Simplify unified memory logic. * Pass -mp=gpu when we pass -acc * Pass -gpu=lineinfo for better debug information. * Pass -Minfo=accel,mp for better compile time diagnostics. * Add nrn_pragma_{acc,omp} macros for single-source Open{ACC,MP} support. * Call omp_set_default_device. * Drop cc60 because of OpenMP offload incompatibility. * Add --gpu to test. * Default (BB5-valid) CORENRN_EXTERNAL_BENCHMARK_DATA. * Remove cuda_add_library. * Don't print number of GPUs when quiet. * Set OMP_NUM_THREADS=1 for lfp_test. * Update NMODL to emit nrn_pragma{acc,omp} macros. 
Co-authored-by: Pramod Kumbhar --- .clang-format.changes | 3 +- .cmake-format.changes.yaml | 5 -- CMake/OpenAccHelper.cmake | 8 +- CMakeLists.txt | 5 +- coreneuron/CMakeLists.txt | 2 +- coreneuron/apps/main1.cpp | 6 +- coreneuron/gpu/nrn_acc_manager.cpp | 100 ++++++++----------------- coreneuron/gpu/nrn_acc_manager.hpp | 2 - coreneuron/io/lfp.cpp | 3 - coreneuron/mechanism/capac.cpp | 40 ++++------ coreneuron/mechanism/eion.cpp | 50 ++++++------- coreneuron/mechanism/register_mech.cpp | 4 - coreneuron/network/cvodestb.cpp | 8 +- coreneuron/network/netcvode.cpp | 49 ++++-------- coreneuron/network/partrans.cpp | 90 +++++++++++----------- coreneuron/nrnconf.h | 10 ++- coreneuron/permute/cellorder.cpp | 73 ++++++++---------- coreneuron/sim/fadvance_core.cpp | 70 +++++++---------- coreneuron/sim/fast_imem.cpp | 13 ++-- coreneuron/sim/finitialize.cpp | 9 +-- coreneuron/sim/solve_core.cpp | 45 ++++------- coreneuron/sim/treeset_core.cpp | 64 ++++++++-------- coreneuron/utils/memory.h | 3 +- coreneuron/utils/offload.hpp | 20 +++++ external/nmodl | 2 +- tests/unit/lfp/CMakeLists.txt | 1 + 26 files changed, 283 insertions(+), 402 deletions(-) create mode 100644 coreneuron/utils/offload.hpp diff --git a/.clang-format.changes b/.clang-format.changes index 01b58702d..4c2b11b59 100644 --- a/.clang-format.changes +++ b/.clang-format.changes @@ -1,2 +1,3 @@ -SortIncludes: false IndentCaseLabels: true +SortIncludes: false +StatementMacros: [nrn_pragma_acc, nrn_pragma_omp] diff --git a/.cmake-format.changes.yaml b/.cmake-format.changes.yaml index 19ea9c084..2f20247f7 100644 --- a/.cmake-format.changes.yaml +++ b/.cmake-format.changes.yaml @@ -1,9 +1,4 @@ additional_commands: - cuda_add_library: - pargs: '*' - flags: ["STATIC", "SHARED", "MODULE", "EXCLUDE_FROM_ALL"] - kwargs: - OPTIONS: '*' cpp_cc_build_time_copy: flags: ['NO_TARGET'] kwargs: diff --git a/CMake/OpenAccHelper.cmake b/CMake/OpenAccHelper.cmake index 7767a3672..c7f91a7c9 100644 --- a/CMake/OpenAccHelper.cmake +++ b/CMake/OpenAccHelper.cmake @@ -55,7 +55,7 @@ if(CORENRN_ENABLE_GPU) # due to e.g. __CUDACC__ being defined. See https://github.com/BlueBrain/CoreNeuron/issues/607 for # more information about this. -gpu=cudaX.Y ensures that OpenACC code is compiled with the same # CUDA version as is used for the explicit CUDA code. - set(NVHPC_ACC_COMP_FLAGS "-acc -gpu=cuda${CORENRN_CUDA_VERSION_SHORT}") + set(NVHPC_ACC_COMP_FLAGS "-acc -Minfo=accel -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo") set(NVHPC_ACC_LINK_FLAGS "-acc -cuda") # Make sure that OpenACC code is generated for the same compute capabilities as the explicit CUDA # code. Otherwise there may be confusing linker errors. We cannot rely on nvcc and nvc++ using the @@ -63,6 +63,12 @@ if(CORENRN_ENABLE_GPU) foreach(compute_capability ${CMAKE_CUDA_ARCHITECTURES}) string(APPEND NVHPC_ACC_COMP_FLAGS ",cc${compute_capability}") endforeach() + if(CORENRN_ENABLE_OPENMP AND CORENRN_ENABLE_OPENMP_OFFLOAD) + # Enable OpenMP target offload to GPU and if both OpenACC and OpenMP directives are available + # for a region then prefer OpenMP. 
+ add_compile_definitions(CORENEURON_PREFER_OPENMP_OFFLOAD) + string(APPEND NVHPC_ACC_COMP_FLAGS " -mp=gpu -Minfo=mp") + endif() # avoid PGI adding standard compliant "-A" flags set(CMAKE_CXX14_STANDARD_COMPILE_OPTION --c++14) string(APPEND CMAKE_EXE_LINKER_FLAGS " ${NVHPC_ACC_LINK_FLAGS}") diff --git a/CMakeLists.txt b/CMakeLists.txt index 4e53a5de6..963703975 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -85,6 +85,7 @@ add_subdirectory(${CORENEURON_PROJECT_SOURCE_DIR}/external/CLI11) # Build options # ============================================================================= option(CORENRN_ENABLE_OPENMP "Build the CORE NEURON with OpenMP implementation" ON) +option(CORENRN_ENABLE_OPENMP_OFFLOAD "Prefer OpenMP target offload to OpenACC" ON) option(CORENRN_ENABLE_TIMEOUT "Enable nrn_timeout implementation" ON) option(CORENRN_ENABLE_REPORTING "Enable use of ReportingLib for soma reports" OFF) option(CORENRN_ENABLE_MPI "Enable MPI-based execution" ON) @@ -104,7 +105,7 @@ option(CORENRN_ENABLE_LEGACY_UNITS "Enable legacy FARADAY, R, etc" OFF) option(CORENRN_ENABLE_PRCELLSTATE "Enable NRN_PRCELLSTATE debug feature" OFF) set(CORENRN_EXTERNAL_BENCHMARK_DATA - "" + "/gpfs/bbp.cscs.ch/project/proj12/nersc-gpu-hackathon-dec-2021" CACHE PATH "Path to input data files and mechanisms for benchmarks") set(CORENRN_NMODL_DIR "" @@ -138,7 +139,7 @@ if(CORENRN_ENABLE_GPU) # Set some sensible default CUDA architectures. if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - set(CMAKE_CUDA_ARCHITECTURES 60 70 80) + set(CMAKE_CUDA_ARCHITECTURES 70 80) message(STATUS "Setting default CUDA architectures to ${CMAKE_CUDA_ARCHITECTURES}") endif() diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt index 60bd2b370..2308ab99a 100644 --- a/coreneuron/CMakeLists.txt +++ b/coreneuron/CMakeLists.txt @@ -325,7 +325,7 @@ if(NOT ${CORENRN_EXTERNAL_BENCHMARK_DATA} STREQUAL "") benchmark_command "'${CMAKE_BINARY_DIR}/benchmark/${CMAKE_SYSTEM_PROCESSOR}/special-core'" " --datpath '${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark-all-440-cells-2-ranks'" - " --tstop 1 &&" + " --tstop 1 --gpu &&" "diff out.dat '${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark-all-440-cells-2-ranks.gpu.spikes'" ) add_test(NAME benchmark COMMAND sh -c "${benchmark_command}") diff --git a/coreneuron/apps/main1.cpp b/coreneuron/apps/main1.cpp index 0fdaa509b..6a4d43bea 100644 --- a/coreneuron/apps/main1.cpp +++ b/coreneuron/apps/main1.cpp @@ -558,10 +558,8 @@ extern "C" int run_solve_core(int argc, char** argv) { #endif bool compute_gpu = corenrn_param.gpu; - // clang-format off - - #pragma acc update device(celsius, secondorder, pi) if (compute_gpu) - // clang-format on + nrn_pragma_acc(update device(celsius, secondorder, pi) if(compute_gpu)) + nrn_pragma_omp(target update to(celsius, secondorder, pi) if(compute_gpu)) { double v = corenrn_param.voltage; double dt = corenrn_param.dt; diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp index ac98f5420..b249875dc 100644 --- a/coreneuron/gpu/nrn_acc_manager.cpp +++ b/coreneuron/gpu/nrn_acc_manager.cpp @@ -27,6 +27,9 @@ #ifdef _OPENACC #include #endif +#ifdef CORENEURON_PREFER_OPENMP_OFFLOAD +#include +#endif #ifdef CRAYPAT #include @@ -605,25 +608,36 @@ void update_net_receive_buffer(NrnThread* nt) { // instance order to avoid race. 
setup _displ and _nrb_index net_receive_buffer_order(nrb); -#ifdef _OPENACC if (nt->compute_gpu) { Instrumentor::phase p_net_receive_buffer_order("net-receive-buf-cpu2gpu"); // note that dont update nrb otherwise we lose pointers + // clang-format off + /* update scalar elements */ - acc_update_device(&nrb->_cnt, sizeof(int)); - acc_update_device(&nrb->_displ_cnt, sizeof(int)); - - acc_update_device(nrb->_pnt_index, sizeof(int) * nrb->_cnt); - acc_update_device(nrb->_weight_index, sizeof(int) * nrb->_cnt); - acc_update_device(nrb->_nrb_t, sizeof(double) * nrb->_cnt); - acc_update_device(nrb->_nrb_flag, sizeof(double) * nrb->_cnt); - acc_update_device(nrb->_displ, sizeof(int) * (nrb->_displ_cnt + 1)); - acc_update_device(nrb->_nrb_index, sizeof(int) * nrb->_cnt); + nrn_pragma_acc(update device(nrb->_cnt, + nrb->_displ_cnt, + nrb->_pnt_index[:nrb->_cnt], + nrb->_weight_index[:nrb->_cnt], + nrb->_nrb_t[:nrb->_cnt], + nrb->_nrb_flag[:nrb->_cnt], + nrb->_displ[:nrb->_displ_cnt + 1], + nrb->_nrb_index[:nrb->_cnt]) + async(nt->stream_id)) + nrn_pragma_omp(target update to(nrb->_cnt, + nrb->_displ_cnt, + nrb->_pnt_index[:nrb->_cnt], + nrb->_weight_index[:nrb->_cnt], + nrb->_nrb_t[:nrb->_cnt], + nrb->_nrb_flag[:nrb->_cnt], + nrb->_displ[:nrb->_displ_cnt + 1], + nrb->_nrb_index[:nrb->_cnt])) + // clang-format on } -#endif } } + nrn_pragma_acc(wait(nt->stream_id)) + nrn_pragma_omp(taskwait) } void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb) { @@ -894,67 +908,12 @@ void update_weights_from_gpu(NrnThread* threads, int nthreads) { size_t n_weight = nt->n_weight; if (nt->compute_gpu && n_weight > 0) { double* weights = nt->weights; - // clang-format off - - #pragma acc update host(weights [0:n_weight]) - // clang-format on + nrn_pragma_acc(update host(weights [0:n_weight])) + nrn_pragma_omp(target update from(weights [0:n_weight])) } } } -void update_matrix_from_gpu(NrnThread* _nt) { -#ifdef _OPENACC - if (_nt->compute_gpu && (_nt->end > 0)) { - /* before copying, make sure all computations in the stream are completed */ - - // clang-format off - - #pragma acc wait(_nt->stream_id) - - /* openacc routine doesn't allow asyn, use pragma */ - // acc_update_self(_nt->_actual_rhs, 2*_nt->end*sizeof(double)); - - /* RHS and D are contigious, copy them in one go! - * NOTE: in pragma you have to give actual pointer like below and not nt->rhs... - */ - double* rhs = _nt->_actual_rhs; - int ne = nrn_soa_padded_size(_nt->end, 0); - - #pragma acc update host(rhs[0 : 2 * ne]) async(_nt->stream_id) - #pragma acc wait(_nt->stream_id) - // clang-format on - } -#else - (void) _nt; -#endif -} - -void update_matrix_to_gpu(NrnThread* _nt) { -#ifdef _OPENACC - if (_nt->compute_gpu && (_nt->end > 0)) { - /* before copying, make sure all computations in the stream are completed */ - - // clang-format off - - #pragma acc wait(_nt->stream_id) - - /* while discussion with Michael we found that RHS is also needed on - * gpu because nrn_cap_jacob uses rhs which is being updated on GPU - */ - double* v = _nt->_actual_v; - double* rhs = _nt->_actual_rhs; - int ne = nrn_soa_padded_size(_nt->end, 0); - - #pragma acc update device(v[0 : ne]) async(_nt->stream_id) - #pragma acc update device(rhs[0 : ne]) async(_nt->stream_id) - #pragma acc wait(_nt->stream_id) - // clang-format on - } -#else - (void) _nt; -#endif -} - /** Cleanup device memory that is being tracked by the OpenACC runtime. 
* * This function painstakingly calls `acc_delete` in reverse order on all @@ -1343,8 +1302,11 @@ void init_gpu() { int device_num = local_rank % num_devices_per_node; acc_set_device_num(device_num, device_type); +#ifdef CORENEURON_PREFER_OPENMP_OFFLOAD + omp_set_default_device(device_num); +#endif - if (nrnmpi_myid == 0) { + if (nrnmpi_myid == 0 && !corenrn_param.is_quiet()) { std::cout << " Info : " << num_devices_per_node << " GPUs shared by " << local_size << " ranks per node\n"; } diff --git a/coreneuron/gpu/nrn_acc_manager.hpp b/coreneuron/gpu/nrn_acc_manager.hpp index 67e6a058c..354bdc208 100644 --- a/coreneuron/gpu/nrn_acc_manager.hpp +++ b/coreneuron/gpu/nrn_acc_manager.hpp @@ -23,8 +23,6 @@ void update_nrnthreads_on_device(NrnThread* threads, int nthreads); void modify_data_on_device(NrnThread* threads, int nthreads); void dump_nt_to_file(char* filename, NrnThread* threads, int nthreads); -void update_matrix_from_gpu(NrnThread* _nt); -void update_matrix_to_gpu(NrnThread* _nt); void update_net_receive_buffer(NrnThread* _nt); void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml); void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb); diff --git a/coreneuron/io/lfp.cpp b/coreneuron/io/lfp.cpp index 646fbf5a0..2a001b85a 100644 --- a/coreneuron/io/lfp.cpp +++ b/coreneuron/io/lfp.cpp @@ -7,9 +7,6 @@ namespace coreneuron { -// extern variables require acc declare -#pragma acc declare create(pi) - namespace lfputils { double line_source_lfp_factor(const Point3D& e_pos, diff --git a/coreneuron/mechanism/capac.cpp b/coreneuron/mechanism/capac.cpp index ee62f660d..42c65cb18 100644 --- a/coreneuron/mechanism/capac.cpp +++ b/coreneuron/mechanism/capac.cpp @@ -12,25 +12,9 @@ #include "coreneuron/coreneuron.hpp" #include "coreneuron/permute/data_layout.hpp" -// clang-format off - -#if defined(_OPENACC) -#define _PRAGMA_FOR_INIT_ACC_LOOP_ \ - _Pragma("acc parallel loop present(vdata[0:_cntml_padded*nparm]) if(_nt->compute_gpu)") -#define _PRAGMA_FOR_CUR_ACC_LOOP_ \ - _Pragma( \ - "acc parallel loop present(vdata[0:_cntml_padded*nparm], ni[0:_cntml_actual], _vec_rhs[0:_nt->end]) if(_nt->compute_gpu) async(stream_id)") -#define _PRAGMA_FOR_JACOB_ACC_LOOP_ \ - _Pragma( \ - "acc parallel loop present(vdata[0:_cntml_padded*nparm], ni[0:_cntml_actual], _vec_d[0:_nt->end]) if(_nt->compute_gpu) async(stream_id)") -#else -#define _PRAGMA_FOR_INIT_ACC_LOOP_ _Pragma("") -#define _PRAGMA_FOR_CUR_ACC_LOOP_ _Pragma("") -#define _PRAGMA_FOR_JACOB_ACC_LOOP_ _Pragma("") -#endif - -// clang-format on - +#define _PRAGMA_FOR_INIT_ACC_LOOP_ \ + nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm]) if (_nt->compute_gpu)) \ + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) #define _STRIDE _cntml_padded + _iml namespace coreneuron { @@ -78,15 +62,16 @@ void nrn_jacob_capacitance(NrnThread* _nt, Memb_list* ml, int /* type */) { (void) _cntml_padded; /* unused when layout=1*/ double* _vec_d = _nt->_actual_d; -#if defined(_OPENACC) - int stream_id = _nt->stream_id; -#endif { /*if (use_cachevec) {*/ int* ni = ml->nodeindices; vdata = ml->data; - _PRAGMA_FOR_JACOB_ACC_LOOP_ + nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm], + ni [0:_cntml_actual], + _vec_d [0:_nt->end]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (_iml = 0; _iml < _cntml_actual; _iml++) { _vec_d[ni[_iml]] += cfac * cm; } @@ -126,12 +111,13 @@ void nrn_cur_capacitance(NrnThread* 
_nt, Memb_list* ml, int /* type */) { /* no need to distinguish secondorder */ int* ni = ml->nodeindices; double* _vec_rhs = _nt->_actual_rhs; -#if defined(_OPENACC) - int stream_id = _nt->stream_id; -#endif vdata = ml->data; - _PRAGMA_FOR_CUR_ACC_LOOP_ + nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm], + ni [0:_cntml_actual], + _vec_rhs [0:_nt->end]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int _iml = 0; _iml < _cntml_actual; _iml++) { i_cap = cfac * cm * _vec_rhs[ni[_iml]]; } diff --git a/coreneuron/mechanism/eion.cpp b/coreneuron/mechanism/eion.cpp index 76adc9045..727f30ea6 100644 --- a/coreneuron/mechanism/eion.cpp +++ b/coreneuron/mechanism/eion.cpp @@ -19,26 +19,6 @@ #define _STRIDE _cntml_padded + _iml -// clang-format off - -#if defined(_OPENACC) -#define _PRAGMA_FOR_INIT_ACC_LOOP_ \ - _Pragma( \ - "acc parallel loop present(pd[0:_cntml_padded*5], ppd[0:1], nrn_ion_global_map[0:nrn_ion_global_map_size][0:ion_global_map_member_size]) if(nt->compute_gpu)") -#define _PRAGMA_FOR_CUR_ACC_LOOP_ \ - _Pragma( \ - "acc parallel loop present(pd[0:_cntml_padded*5], nrn_ion_global_map[0:nrn_ion_global_map_size][0:ion_global_map_member_size]) if(nt->compute_gpu) async(stream_id)") -#define _PRAGMA_FOR_SEC_ORDER_CUR_ACC_LOOP_ \ - _Pragma( \ - "acc parallel loop present(pd[0:_cntml_padded*5], ni[0:_cntml_actual], _vec_rhs[0:_nt->end]) if(_nt->compute_gpu) async(stream_id)") -#else -#define _PRAGMA_FOR_INIT_ACC_LOOP_ _Pragma("") -#define _PRAGMA_FOR_CUR_ACC_LOOP_ _Pragma("") -#define _PRAGMA_FOR_SEC_ORDER_CUR_ACC_LOOP_ _Pragma("") -#endif - -// clang-format on - namespace coreneuron { // for each ion it refers to internal concentration, external concentration, and charge, @@ -277,14 +257,16 @@ void nrn_cur_ion(NrnThread* nt, Memb_list* ml, int type) { double* pd; Datum* ppd; (void) nt; /* unused */ -#if defined(_OPENACC) - int stream_id = nt->stream_id; -#endif /*printf("ion_cur %s\n", memb_func[type].sym->name);*/ int _cntml_padded = ml->_nodecount_padded; pd = ml->data; ppd = ml->pdata; - _PRAGMA_FOR_CUR_ACC_LOOP_ + nrn_pragma_acc(parallel loop present( + pd [0:_cntml_padded * 5], + nrn_ion_global_map + [0:nrn_ion_global_map_size] [0:ion_global_map_member_size]) if (nt->compute_gpu) + async(nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) for (int _iml = 0; _iml < _cntml_actual; ++_iml) { dcurdv = 0.; cur = 0.; @@ -312,7 +294,16 @@ void nrn_init_ion(NrnThread* nt, Memb_list* ml, int type) { int _cntml_padded = ml->_nodecount_padded; pd = ml->data; ppd = ml->pdata; - _PRAGMA_FOR_INIT_ACC_LOOP_ + // There was no async(...) clause in the initial OpenACC implementation, so + // no `nowait` clause has been added to the OpenMP implementation. TODO: + // verify if this can be made asynchronous or if there is a strong reason it + // needs to be like this. 
+ nrn_pragma_acc(parallel loop present( + pd [0:_cntml_padded * 5], + ppd [0:1], + nrn_ion_global_map + [0:nrn_ion_global_map_size] [0:ion_global_map_member_size]) if (nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) for (int _iml = 0; _iml < _cntml_actual; ++_iml) { if (iontype & 04) { conci = conci0; @@ -332,9 +323,6 @@ void second_order_cur(NrnThread* _nt, int secondorder) { int _cntml_padded; double* pd; (void) _nt; /* unused */ -#if defined(_OPENACC) - int stream_id = _nt->stream_id; -#endif double* _vec_rhs = _nt->_actual_rhs; if (secondorder == 2) { @@ -345,7 +333,11 @@ void second_order_cur(NrnThread* _nt, int secondorder) { int* ni = ml->nodeindices; _cntml_padded = ml->_nodecount_padded; pd = ml->data; - _PRAGMA_FOR_SEC_ORDER_CUR_ACC_LOOP_ + nrn_pragma_acc(parallel loop present(pd [0:_cntml_padded * 5], + ni [0:_cntml_actual], + _vec_rhs [0:_nt->end]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int _iml = 0; _iml < _cntml_actual; ++_iml) { cur += dcurdv * (_vec_rhs[ni[_iml]]); } diff --git a/coreneuron/mechanism/register_mech.cpp b/coreneuron/mechanism/register_mech.cpp index 3acdff1ea..a8bff7a50 100644 --- a/coreneuron/mechanism/register_mech.cpp +++ b/coreneuron/mechanism/register_mech.cpp @@ -20,10 +20,6 @@ namespace coreneuron { int secondorder = 0; double t, dt, celsius, pi; -// declare copyin required for correct initialization -#pragma acc declare copyin(secondorder) -#pragma acc declare copyin(celsius) -#pragma acc declare copyin(pi) int rev_dt; using Pfrv = void (*)(); diff --git a/coreneuron/network/cvodestb.cpp b/coreneuron/network/cvodestb.cpp index 6ed52dc34..31b2fec54 100644 --- a/coreneuron/network/cvodestb.cpp +++ b/coreneuron/network/cvodestb.cpp @@ -61,11 +61,9 @@ void init_net_events() { NrnThread* nt = nrn_threads + ith; double* weights = nt->weights; int n_weight = nt->n_weight; - if (n_weight) { - // clang-format off - - #pragma acc update device(weights[0 : n_weight]) if (nt->compute_gpu) - // clang-format on + if (n_weight && nt->compute_gpu) { + nrn_pragma_acc(update device(weights[0:n_weight])) + nrn_pragma_omp(target update to(weights[0:n_weight])) } } #endif diff --git a/coreneuron/network/netcvode.cpp b/coreneuron/network/netcvode.cpp index 899bc1e14..ee2e5cb3e 100644 --- a/coreneuron/network/netcvode.cpp +++ b/coreneuron/network/netcvode.cpp @@ -531,28 +531,13 @@ void NetCvode::check_thresh(NrnThread* nt) { // for default method PreSynHelper* presyns_helper = nt->presyns_helper; double* actual_v = nt->_actual_v; -#if defined(_OPENACC) - int stream_id = nt->stream_id; -#endif - if (nt->ncell == 0) return; - //_net_send_buffer_cnt is no longer used in openacc kernel, remove this? - //#ifdef _OPENACC - // if(nt->compute_gpu) - // acc_update_device(&(nt->_net_send_buffer_cnt), sizeof(int)); - //#endif - - // on GPU... 
- // clang-format off - - #pragma acc parallel loop present( \ - nt[0:1], presyns_helper[0:nt->n_presyn], \ - presyns[0:nt->n_presyn], actual_v[0:nt->end]) \ - copy(net_send_buf_count) if (nt->compute_gpu) \ - async(stream_id) - // clang-format on + nrn_pragma_acc(parallel loop present( + nt [0:1], presyns_helper [0:nt->n_presyn], presyns [0:nt->n_presyn], actual_v [0:nt->end]) + copy(net_send_buf_count) if (nt->compute_gpu) async(nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd map(tofrom: net_send_buf_count) if(nt->compute_gpu)) for (int i = 0; i < nt->ncell; ++i) { PreSyn* ps = presyns + i; PreSynHelper* psh = presyns_helper + i; @@ -563,7 +548,7 @@ void NetCvode::check_thresh(NrnThread* nt) { // for default method int* flag = &(psh->flag_); if (pscheck(v, threshold, flag)) { -#ifndef _OPENACC +#ifndef CORENEURON_ENABLE_GPU nt->_net_send_buffer_cnt = net_send_buf_count; if (nt->_net_send_buffer_cnt >= nt->_net_send_buffer_size) { nt->_net_send_buffer_size *= 2; @@ -572,31 +557,23 @@ void NetCvode::check_thresh(NrnThread* nt) { // for default method } #endif - // clang-format off - - #pragma acc atomic capture - // clang-format on + nrn_pragma_acc(atomic capture) + nrn_pragma_omp(atomic capture) idx = net_send_buf_count++; nt->_net_send_buffer[idx] = i; } } - - // clang-format off - - #pragma acc wait(stream_id) - // clang-format on + nrn_pragma_acc(wait(nt->stream_id)) nt->_net_send_buffer_cnt = net_send_buf_count; - if (nt->_net_send_buffer_cnt) { -#ifdef _OPENACC + if (nt->compute_gpu && nt->_net_send_buffer_cnt) { +#ifdef CORENEURON_ENABLE_GPU int* nsbuffer = nt->_net_send_buffer; #endif - // clang-format off - - #pragma acc update host(nsbuffer[0:nt->_net_send_buffer_cnt]) if (nt->compute_gpu) async(stream_id) - #pragma acc wait(stream_id) - // clang-format on + nrn_pragma_acc(update host(nsbuffer [0:nt->_net_send_buffer_cnt]) async(nt->stream_id)) + nrn_pragma_acc(wait(nt->stream_id)) + nrn_pragma_omp(target update from(nsbuffer [0:nt->_net_send_buffer_cnt])) } // on CPU... diff --git a/coreneuron/network/partrans.cpp b/coreneuron/network/partrans.cpp index e74d866ce..1bd822f54 100644 --- a/coreneuron/network/partrans.cpp +++ b/coreneuron/network/partrans.cpp @@ -41,40 +41,39 @@ void nrnmpi_v_transfer() { // gather the source values. 
can be done in parallel for (int tid = 0; tid < nrn_nthread; ++tid) { auto& ttd = transfer_thread_data_[tid]; - auto& nt = nrn_threads[tid]; + auto* nt = &nrn_threads[tid]; int n = int(ttd.outsrc_indices.size()); if (n == 0) { continue; } - double* src_data = nt._data; + double* src_data = nt->_data; int* src_indices = ttd.src_indices.data(); // gather sources on gpu and copy to cpu, cpu scatters to outsrc_buf double* src_gather = ttd.src_gather.data(); size_t n_src_gather = ttd.src_gather.size(); - // clang-format off - #pragma acc parallel loop present( \ - src_indices[0:n_src_gather], src_data[0:nt._ndata], \ - src_gather[0 : n_src_gather]) /*copyout(src_gather[0:n_src_gather])*/ \ - if (nt.compute_gpu) async(nt.stream_id) + nrn_pragma_acc(parallel loop present(src_indices [0:n_src_gather], + src_data [0:nt->_ndata], + src_gather [0:n_src_gather]) if (nt->compute_gpu) + async(nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) for (int i = 0; i < n_src_gather; ++i) { src_gather[i] = src_data[src_indices[i]]; } - // do not know why the copyout above did not work - // and the following update is needed - #pragma acc update host(src_gather[0 : n_src_gather]) \ - if (nrn_threads[0].compute_gpu) \ - async(nt.stream_id) - // clang-format on + nrn_pragma_acc(update host(src_gather [0:n_src_gather]) if (nt->compute_gpu) + async(nt->stream_id)) + nrn_pragma_omp(target update from(src_gather [0:n_src_gather]) if (nt->compute_gpu)) } // copy gathered source values to outsrc_buf_ + bool compute_gpu = false; for (int tid = 0; tid < nrn_nthread; ++tid) { - // clang-format off - - #pragma acc wait(nrn_threads[tid].stream_id) - // clang-format on + if (nrn_threads[tid].compute_gpu) { + compute_gpu = true; + nrn_pragma_acc(wait(nrn_threads[tid].stream_id)) + nrn_pragma_omp(taskwait) + } TransferThreadData& ttd = transfer_thread_data_[tid]; size_t n_outsrc_indices = ttd.outsrc_indices.size(); int* outsrc_indices = ttd.outsrc_indices.data(); @@ -102,12 +101,8 @@ void nrnmpi_v_transfer() { } // insrc_buf_ will get copied to targets via nrnthread_v_transfer - // clang-format off - - #pragma acc update device( \ - insrc_buf_[0:n_insrc_buf]) \ - if (nrn_threads[0].compute_gpu) - // clang-format on + nrn_pragma_acc(update device(insrc_buf_ [0:n_insrc_buf]) if (compute_gpu)) + nrn_pragma_omp(target update to(insrc_buf_ [0:n_insrc_buf]) if (compute_gpu)) } void nrnthread_v_transfer(NrnThread* _nt) { @@ -119,33 +114,32 @@ void nrnthread_v_transfer(NrnThread* _nt) { int* insrc_indices = ttd.insrc_indices.data(); double* tar_data = _nt->_data; // last element in the displacement vector gives total length +#if defined(_OPENACC) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) int n_insrc_buf = insrcdspl_[nrnmpi_numprocs]; int ndata = _nt->_ndata; +#endif - // clang-format off - - #pragma acc parallel loop present( \ - insrc_indices[0:ntar], \ - tar_data[0:ndata], \ - insrc_buf_[0:n_insrc_buf]) \ - if (_nt->compute_gpu) \ - async(_nt->stream_id) - // clang-format on + nrn_pragma_acc(parallel loop present(insrc_indices [0:ntar], + tar_data [0:ndata], + insrc_buf_ [0:n_insrc_buf]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd map(to: tar_indices[0:ntar]) if(_nt->compute_gpu)) for (size_t i = 0; i < ntar; ++i) { tar_data[tar_indices[i]] = insrc_buf_[insrc_indices[i]]; } } +/// TODO: Corresponding exit data cluase for OpenACC/OpenMP is missing and hence +/// GPU buffers are not freed. 
void nrn_partrans::gap_update_indices() { // Ensure index vectors, src_gather, and insrc_buf_ are on the gpu. if (insrcdspl_) { int n_insrc_buf = insrcdspl_[nrnmpi_numprocs]; + nrn_pragma_acc(enter data create(insrc_buf_ [0:n_insrc_buf]) if (corenrn_param.gpu)) + // clang-format off + nrn_pragma_omp(target enter data map(alloc: insrc_buf_[0:n_insrc_buf]) + if(corenrn_param.gpu)) // clang-format off - - #pragma acc enter data create( \ - insrc_buf_[0:n_insrc_buf]) \ - if (nrn_threads[0].compute_gpu) - // clang-format on } for (int tid = 0; tid < nrn_nthread; ++tid) { TransferThreadData& ttd = transfer_thread_data_[tid]; @@ -154,21 +148,25 @@ void nrn_partrans::gap_update_indices() { size_t n_src_gather = ttd.src_gather.size(); NrnThread* nt = nrn_threads + tid; if (n_src_indices) { + int* src_indices = ttd.src_indices.data(); + double* src_gather = ttd.src_gather.data(); + nrn_pragma_acc(enter data copyin(src_indices[0:n_src_indices]) if(nt->compute_gpu)) + nrn_pragma_acc(enter data create(src_gather[0:n_src_gather]) if(nt->compute_gpu)) // clang-format off - - int *src_indices = ttd.src_indices.data(); - double *src_gather = ttd.src_gather.data(); - #pragma acc enter data copyin(src_indices[0 : n_src_indices]) if (nt->compute_gpu) - #pragma acc enter data create(src_gather[0 : n_src_gather]) if (nt->compute_gpu) + nrn_pragma_omp(target enter data map(to: src_indices [0:n_src_indices]) + map(alloc: src_gather[0:n_src_gather]) + if(nt->compute_gpu)) // clang-format on } if (ttd.insrc_indices.size()) { - // clang-format off - - int *insrc_indices = ttd.insrc_indices.data(); + int* insrc_indices = ttd.insrc_indices.data(); size_t n_insrc_indices = ttd.insrc_indices.size(); - #pragma acc enter data copyin(insrc_indices[0 : n_insrc_indices]) if (nt->compute_gpu) + nrn_pragma_acc( + enter data copyin(insrc_indices [0:n_insrc_indices]) if (nt->compute_gpu)) + // clang-format off + nrn_pragma_omp(target enter data map(to: insrc_indices[0:n_insrc_indices]) + if(nt->compute_gpu)) // clang-format on } } diff --git a/coreneuron/nrnconf.h b/coreneuron/nrnconf.h index 2c7fb8bb9..225d6d2ad 100644 --- a/coreneuron/nrnconf.h +++ b/coreneuron/nrnconf.h @@ -9,6 +9,8 @@ #ifndef _H_NRNCONF_ #define _H_NRNCONF_ +#include "coreneuron/utils/offload.hpp" + #include #include #include @@ -32,14 +34,16 @@ using Symbol = char; #define VECTORIZE 1 // extern variables require acc declare +nrn_pragma_omp(declare target) extern double celsius; -#pragma acc declare create(celsius) +nrn_pragma_acc(declare create(celsius)) extern double pi; -#pragma acc declare create(pi) +nrn_pragma_acc(declare create(pi)) extern int secondorder; -#pragma acc declare create(secondorder) +nrn_pragma_acc(declare create(secondorder)) +nrn_pragma_omp(end declare target) extern double t, dt; extern int rev_dt; diff --git a/coreneuron/permute/cellorder.cpp b/coreneuron/permute/cellorder.cpp index 2b6167f57..fd784fe38 100644 --- a/coreneuron/permute/cellorder.cpp +++ b/coreneuron/permute/cellorder.cpp @@ -6,8 +6,6 @@ # ============================================================================= */ -#include - #include "coreneuron/nrnconf.h" #include "coreneuron/sim/multicore.hpp" #include "coreneuron/utils/nrn_assert.h" @@ -15,6 +13,7 @@ #include "coreneuron/network/tnode.hpp" #include "coreneuron/utils/lpt.hpp" #include "coreneuron/utils/memory.h" +#include "coreneuron/utils/offload.hpp" #include "coreneuron/apps/corenrn_parameters.hpp" #include "coreneuron/permute/node_permute.h" // for print_quality @@ -22,6 +21,9 @@ #ifdef _OPENACC #include 
#endif + +#include + namespace coreneuron { int interleave_permute_type; InterleaveInfo* interleave_info; // nrn_nthread array @@ -488,8 +490,7 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid bool has_subtrees_to_compute = true; // clang-format off - - #pragma acc loop seq + nrn_pragma_acc(loop seq) for (; has_subtrees_to_compute; ) { // ncycle loop #if !defined(_OPENACC) // serial test, gpu does this in parallel @@ -500,9 +501,11 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid // what is the index int ip = GPU_PARENT(i); double p = GPU_A(i) / GPU_D(i); - #pragma acc atomic update + nrn_pragma_acc(atomic update) + nrn_pragma_omp(atomic update) GPU_D(ip) -= p * GPU_B(i); - #pragma acc atomic update + nrn_pragma_acc(atomic update) + nrn_pragma_omp(atomic update) GPU_RHS(ip) -= p * GPU_RHS(i); } #if !defined(_OPENACC) @@ -535,10 +538,7 @@ static void bksub_interleaved2(NrnThread* nt, #if !defined(_OPENACC) for (int i = root; i < lastroot; i += 1) { #else - // clang-format off - - #pragma acc loop seq - // clang-format on + nrn_pragma_acc(loop seq) for (int i = root; i < lastroot; i += warpsize) { #endif GPU_RHS(i) /= GPU_D(i); // the root @@ -596,21 +596,17 @@ void solve_interleaved2(int ith) { int* strides = ii.stride; // sum ncycles of these (bad since ncompart/warpsize) int* rootbegin = ii.firstnode; // nwarp+1 of these int* nodebegin = ii.lastnode; // nwarp+1 of these -#ifdef _OPENACC +#if defined(_OPENACC) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) int nstride = stridedispl[nwarp]; - int stream_id = nt->stream_id; -#endif - -#ifdef _OPENACC - // clang-format off - - #pragma acc parallel loop gang vector vector_length(warpsize) \ - present(nt[0:1], strides[0:nstride], \ - ncycles[0:nwarp], stridedispl[0:nwarp+1], \ - rootbegin[0:nwarp+1], nodebegin[0:nwarp+1]) \ - if (nt->compute_gpu) async(stream_id) -// clang-format on #endif + nrn_pragma_acc(parallel loop gang vector vector_length( + warpsize) present(nt [0:1], + strides [0:nstride], + ncycles [0:nwarp], + stridedispl [0:nwarp + 1], + rootbegin [0:nwarp + 1], + nodebegin [0:nwarp + 1]) if (nt->compute_gpu) async(nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) for (int icore = 0; icore < ncore; ++icore) { int iwarp = icore / warpsize; // figure out the >> value int ic = icore & (warpsize - 1); // figure out the & mask @@ -629,9 +625,7 @@ void solve_interleaved2(int ith) { } // serial test mode #endif } -#ifdef _OPENACC -#pragma acc wait(nt->stream_id) -#endif + nrn_pragma_acc(wait(nt->stream_id)) #ifdef _OPENACC } #endif @@ -656,28 +650,23 @@ void solve_interleaved1(int ith) { int* firstnode = ii.firstnode; int* lastnode = ii.lastnode; int* cellsize = ii.cellsize; -#if _OPENACC - int stream_id = nt->stream_id; -#endif -#ifdef _OPENACC - // clang-format off - - #pragma acc parallel loop present( \ - nt[0:1], stride[0:nstride], \ - firstnode[0:ncell], lastnode[0:ncell], \ - cellsize[0:ncell]) if (nt->compute_gpu) \ - async(stream_id) -// clang-format on -#endif + // OL211123: can we preserve the error checking behaviour of OpenACC's + // present clause with OpenMP? It is a bug if these data are not present, + // so diagnostics are helpful... 
+ nrn_pragma_acc(parallel loop present(nt [0:1], + stride [0:nstride], + firstnode [0:ncell], + lastnode [0:ncell], + cellsize [0:ncell]) if (nt->compute_gpu) + async(nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) for (int icell = 0; icell < ncell; ++icell) { int icellsize = cellsize[icell]; triang_interleaved(nt, icell, icellsize, nstride, stride, lastnode); bksub_interleaved(nt, icell, icellsize, nstride, stride, firstnode); } -#ifdef _OPENACC -#pragma acc wait(stream_id) -#endif + nrn_pragma_acc(wait(nt->stream_id)) } void solve_interleaved(int ith) { diff --git a/coreneuron/sim/fadvance_core.cpp b/coreneuron/sim/fadvance_core.cpp index 8f4ac14cf..a46f83535 100644 --- a/coreneuron/sim/fadvance_core.cpp +++ b/coreneuron/sim/fadvance_core.cpp @@ -78,10 +78,11 @@ void dt2thread(double adt) { /* copied from nrnoc/fadvance.c */ } else { nt->cj = 1.0 / dt; } + nrn_pragma_acc(update device(nt->_t, nt->_dt, nt->cj) + async(nt->stream_id) if (nt->compute_gpu)) // clang-format off - - #pragma acc update device(nt->_t, nt->_dt, nt->cj) \ - async(nt->stream_id) if(nt->compute_gpu) + nrn_pragma_omp(target update to(nt->_t, nt->_dt, nt->cj) + if(nt->compute_gpu)) // clang-format on } } @@ -201,35 +202,24 @@ void update(NrnThread* _nt) { double* vec_v = &(VEC_V(0)); double* vec_rhs = &(VEC_RHS(0)); int i2 = _nt->end; -#if defined(_OPENACC) - int stream_id = _nt->stream_id; -#endif /* do not need to worry about linmod or extracellular*/ if (secondorder) { - // clang-format off - - #pragma acc parallel loop present( \ - vec_v[0:i2], vec_rhs[0:i2]) \ - if (_nt->compute_gpu) async(stream_id) - // clang-format on + nrn_pragma_acc(parallel loop present(vec_v [0:i2], vec_rhs [0:i2]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int i = 0; i < i2; ++i) { vec_v[i] += 2. 
* vec_rhs[i]; } } else { - // clang-format off - - #pragma acc parallel loop present( \ - vec_v[0:i2], vec_rhs[0:i2]) \ - if (_nt->compute_gpu) async(stream_id) - // clang-format on + nrn_pragma_acc(parallel loop present(vec_v [0:i2], vec_rhs [0:i2]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int i = 0; i < i2; ++i) { vec_v[i] += vec_rhs[i]; } } - // update_matrix_to_gpu(_nt); - if (_nt->tml) { assert(_nt->tml->index == CAP); nrn_cur_capacitance(_nt, _nt->tml->ml, _nt->tml->index); @@ -304,10 +294,9 @@ void nrncore2nrn_send_values(NrnThread* nth) { // make sure we do not overflow the `varrays` buffers assert(vs < tr->bsize); - // clang-format off - - #pragma acc parallel loop present(tr[0:1]) if(nth->compute_gpu) async(nth->stream_id) - // clang-format on + nrn_pragma_acc(parallel loop present(tr [0:1]) if (nth->compute_gpu) + async(nth->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(nth->compute_gpu)) for (int i = 0; i < tr->n_trajec; ++i) { tr->varrays[i][vs] = *tr->gather[i]; } @@ -326,12 +315,12 @@ void nrncore2nrn_send_values(NrnThread* nth) { // https://github.com/BlueBrain/CoreNeuron/issues/611 for (int i = 0; i < tr->n_trajec; ++i) { double* gather_i = tr->gather[i]; - // clang-format off - - #pragma acc update self(gather_i[0:1]) if(nth->compute_gpu) async(nth->stream_id) + nrn_pragma_acc(update self(gather_i [0:1]) if (nth->compute_gpu) + async(nth->stream_id)) + nrn_pragma_omp(target update from(gather_i [0:1]) if (nth->compute_gpu)) } - #pragma acc wait(nth->stream_id) - // clang-format on + nrn_pragma_acc(wait(nth->stream_id)) + nrn_pragma_omp(taskwait) for (int i = 0; i < tr->n_trajec; ++i) { *(tr->scatter[i]) = *(tr->gather[i]); } @@ -351,15 +340,11 @@ static void* nrn_fixed_step_thread(NrnThread* nth) { nth->_t += .5 * nth->_dt; if (nth->ncell) { -#if defined(_OPENACC) - int stream_id = nth->stream_id; - /*@todo: do we need to update nth->_t on GPU: Yes (Michael, but can launch kernel) */ - // clang-format off - - #pragma acc update device(nth->_t) if (nth->compute_gpu) async(stream_id) - #pragma acc wait(stream_id) -// clang-format on -#endif + /*@todo: do we need to update nth->_t on GPU: Yes (Michael, but can + launch kernel) */ + nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->stream_id)) + nrn_pragma_acc(wait(nth->stream_id)) + nrn_pragma_omp(target update to(nth->_t) if (nth->compute_gpu)) fixed_play_continuous(nth); { @@ -393,12 +378,9 @@ void* nrn_fixed_step_lastpart(NrnThread* nth) { if (nth->ncell) { /*@todo: do we need to update nth->_t on GPU */ - // clang-format off - - #pragma acc update device(nth->_t) if (nth->compute_gpu) async(nth->stream_id) - #pragma acc wait(nth->stream_id) - // clang-format on - + nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->stream_id)) + nrn_pragma_acc(wait(nth->stream_id)) + nrn_pragma_omp(target update to(nth->_t) if (nth->compute_gpu)) fixed_play_continuous(nth); nonvint(nth); nrncore2nrn_send_values(nth); diff --git a/coreneuron/sim/fast_imem.cpp b/coreneuron/sim/fast_imem.cpp index 8dfb0cd76..1218b7967 100644 --- a/coreneuron/sim/fast_imem.cpp +++ b/coreneuron/sim/fast_imem.cpp @@ -50,10 +50,10 @@ void nrn_calc_fast_imem(NrnThread* nt) { double* fast_imem_d = nt->nrn_fast_imem->nrn_sav_d; double* fast_imem_rhs = nt->nrn_fast_imem->nrn_sav_rhs; -#pragma acc parallel loop present(vec_rhs, \ - vec_area, \ - fast_imem_d, \ - fast_imem_rhs) if (nt->compute_gpu) 
async(nt->stream_id) + nrn_pragma_acc( + parallel loop present(vec_rhs, vec_area, fast_imem_d, fast_imem_rhs) if (nt->compute_gpu) + async(nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) for (int i = i1; i < i3; ++i) { fast_imem_rhs[i] = (fast_imem_d[i] * vec_rhs[i] + fast_imem_rhs[i]) * vec_area[i] * 0.01; } @@ -68,8 +68,9 @@ void nrn_calc_fast_imem_init(NrnThread* nt) { double* vec_area = nt->_actual_area; double* fast_imem_rhs = nt->nrn_fast_imem->nrn_sav_rhs; -#pragma acc parallel loop present(vec_rhs, vec_area, fast_imem_rhs) if (nt->compute_gpu) \ - async(nt->stream_id) + nrn_pragma_acc(parallel loop present(vec_rhs, vec_area, fast_imem_rhs) if (nt->compute_gpu) + async(nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) for (int i = i1; i < i3; ++i) { fast_imem_rhs[i] = (vec_rhs[i] + fast_imem_rhs[i]) * vec_area[i] * 0.01; } diff --git a/coreneuron/sim/finitialize.cpp b/coreneuron/sim/finitialize.cpp index 1ae79a92f..d711ae247 100644 --- a/coreneuron/sim/finitialize.cpp +++ b/coreneuron/sim/finitialize.cpp @@ -53,12 +53,9 @@ void nrn_finitialize(int setv, double v) { if (setv) { for (auto _nt = nrn_threads; _nt < nrn_threads + nrn_nthread; ++_nt) { double* vec_v = &(VEC_V(0)); - // clang-format off - - #pragma acc parallel loop present( \ - _nt[0:1], vec_v[0:_nt->end]) \ - if (_nt->compute_gpu) - // clang-format on + nrn_pragma_acc( + parallel loop present(_nt [0:1], vec_v [0:_nt->end]) if (_nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int i = 0; i < _nt->end; ++i) { vec_v[i] = v; } diff --git a/coreneuron/sim/solve_core.cpp b/coreneuron/sim/solve_core.cpp index a24c8360f..60ba2b660 100644 --- a/coreneuron/sim/solve_core.cpp +++ b/coreneuron/sim/solve_core.cpp @@ -24,7 +24,9 @@ void nrn_solve_minimal(NrnThread* _nt) { } } -/** TODO loops are executed seq in OpenACC just for debugging, remove it! */ +/** @todo OpenACC GPU offload is sequential/slow. Because --cell-permute=0 and + * --gpu is forbidden anyway, no OpenMP target offload equivalent is implemented. 
+ */ /* triangularization of the matrix equations */ static void triang(NrnThread* _nt) { @@ -37,17 +39,9 @@ static void triang(NrnThread* _nt) { double* vec_rhs = &(VEC_RHS(0)); int* parent_index = _nt->_v_parent_index; -#if defined(_OPENACC) - int stream_id = _nt->stream_id; -#endif - /** @todo: just for benchmarking, otherwise produces wrong results */ - // clang-format off - - #pragma acc parallel loop seq present( \ - vec_a[0:i3], vec_b[0:i3], vec_d[0:i3], \ - vec_rhs[0:i3], parent_index[0:i3]) \ - async(stream_id) if (_nt->compute_gpu) - // clang-format on + nrn_pragma_acc(parallel loop seq present( + vec_a [0:i3], vec_b [0:i3], vec_d [0:i3], vec_rhs [0:i3], parent_index [0:i3]) + async(_nt->stream_id) if (_nt->compute_gpu)) for (int i = i3 - 1; i >= i2; --i) { double p = vec_a[i] / vec_d[i]; vec_d[parent_index[i]] -= p * vec_b[i]; @@ -66,33 +60,22 @@ static void bksub(NrnThread* _nt) { double* vec_rhs = &(VEC_RHS(0)); int* parent_index = _nt->_v_parent_index; -#if defined(_OPENACC) - int stream_id = _nt->stream_id; -#endif - /** @todo: just for benchmarking, otherwise produces wrong results */ - // clang-format off - - #pragma acc parallel loop seq present( \ - vec_d[0:i2], vec_rhs[0:i2]) \ - async(stream_id) if (_nt->compute_gpu) - // clang-format on + nrn_pragma_acc(parallel loop seq present(vec_d [0:i2], vec_rhs [0:i2]) + async(_nt->stream_id) if (_nt->compute_gpu)) for (int i = i1; i < i2; ++i) { vec_rhs[i] /= vec_d[i]; } - /** @todo: just for benchmarking, otherwise produces wrong results */ - // clang-format off - - #pragma acc parallel loop seq present( \ - vec_b[0:i3], vec_d[0:i3], vec_rhs[0:i3], \ - parent_index[0:i3]) async(stream_id) \ - if (_nt->compute_gpu) + nrn_pragma_acc( + parallel loop seq present(vec_b [0:i3], vec_d [0:i3], vec_rhs [0:i3], parent_index [0:i3]) + async(_nt->stream_id) if (_nt->compute_gpu)) for (int i = i2; i < i3; ++i) { vec_rhs[i] -= vec_b[i] * vec_rhs[parent_index[i]]; vec_rhs[i] /= vec_d[i]; } - #pragma acc wait(stream_id) - // clang-format on + if (_nt->compute_gpu) { + nrn_pragma_acc(wait(_nt->stream_id)) + } } } // namespace coreneuron diff --git a/coreneuron/sim/treeset_core.cpp b/coreneuron/sim/treeset_core.cpp index 943980bcd..bb92d2ab1 100644 --- a/coreneuron/sim/treeset_core.cpp +++ b/coreneuron/sim/treeset_core.cpp @@ -32,12 +32,9 @@ static void nrn_rhs(NrnThread* _nt) { double* vec_v = &(VEC_V(0)); int* parent_index = _nt->_v_parent_index; - // clang-format off - - #pragma acc parallel loop present( \ - vec_rhs[0:i3], vec_d[0:i3]) \ - if (_nt->compute_gpu) async(_nt->stream_id) - // clang-format on + nrn_pragma_acc(parallel loop present(vec_rhs [0:i3], vec_d [0:i3]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int i = i1; i < i3; ++i) { vec_rhs[i] = 0.; vec_d[i] = 0.; @@ -46,9 +43,10 @@ static void nrn_rhs(NrnThread* _nt) { if (_nt->nrn_fast_imem) { double* fast_imem_d = _nt->nrn_fast_imem->nrn_sav_d; double* fast_imem_rhs = _nt->nrn_fast_imem->nrn_sav_rhs; -#pragma acc parallel loop present(fast_imem_d [i1:i3], \ - fast_imem_rhs [i1:i3]) if (_nt->compute_gpu) \ - async(_nt->stream_id) + nrn_pragma_acc( + parallel loop present(fast_imem_d [i1:i3], fast_imem_rhs [i1:i3]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int i = i1; i < i3; ++i) { fast_imem_d[i] = 0.; fast_imem_rhs[i] = 0.; @@ -76,7 +74,9 @@ static void nrn_rhs(NrnThread* _nt) { so here we transform so 
it only has membrane current contribution */ double* p = _nt->nrn_fast_imem->nrn_sav_rhs; -#pragma acc parallel loop present(p, vec_rhs) if (_nt->compute_gpu) async(_nt->stream_id) + nrn_pragma_acc(parallel loop present(p, vec_rhs) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int i = i1; i < i3; ++i) { p[i] -= vec_rhs[i]; } @@ -86,22 +86,24 @@ static void nrn_rhs(NrnThread* _nt) { The extracellular mechanism contribution is already done. rhs += ai_j*(vi_j - vi) */ - // clang-format off - - #pragma acc parallel loop present( \ - vec_rhs[0:i3], vec_d[0:i3], \ - vec_a[0:i3], vec_b[0:i3], \ - vec_v[0:i3], parent_index[0:i3]) \ - if (_nt->compute_gpu) async(_nt->stream_id) + nrn_pragma_acc(parallel loop present(vec_rhs [0:i3], + vec_d [0:i3], + vec_a [0:i3], + vec_b [0:i3], + vec_v [0:i3], + parent_index [0:i3]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int i = i2; i < i3; ++i) { double dv = vec_v[parent_index[i]] - vec_v[i]; /* our connection coefficients are negative so */ - #pragma acc atomic update + nrn_pragma_acc(atomic update) + nrn_pragma_omp(atomic update) vec_rhs[i] -= vec_b[i] * dv; - #pragma acc atomic update + nrn_pragma_acc(atomic update) + nrn_pragma_omp(atomic update) vec_rhs[parent_index[i]] += vec_a[i] * dv; } - // clang-format on } /* calculate left hand side of @@ -150,34 +152,32 @@ static void nrn_lhs(NrnThread* _nt) { so here we transform so it only has membrane current contribution */ double* p = _nt->nrn_fast_imem->nrn_sav_d; -#pragma acc parallel loop present(p, vec_d) if (_nt->compute_gpu) async(_nt->stream_id) + nrn_pragma_acc(parallel loop present(p, vec_d) if (_nt->compute_gpu) async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int i = i1; i < i3; ++i) { p[i] += vec_d[i]; } } /* now add the axial currents */ - // clang-format off - - #pragma acc parallel loop present( \ - vec_d[0:i3], vec_a[0:i3], \ - vec_b[0:i3], parent_index[0:i3]) \ - if (_nt->compute_gpu) async(_nt->stream_id) + nrn_pragma_acc(parallel loop present( + vec_d [0:i3], vec_a [0:i3], vec_b [0:i3], parent_index [0:i3]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int i = i2; i < i3; ++i) { - #pragma acc atomic update + nrn_pragma_acc(atomic update) + nrn_pragma_omp(atomic update) vec_d[i] -= vec_b[i]; - #pragma acc atomic update + nrn_pragma_acc(atomic update) + nrn_pragma_omp(atomic update) vec_d[parent_index[i]] -= vec_a[i]; } - // clang-format on } /* for the fixed step method */ void* setup_tree_matrix_minimal(NrnThread* _nt) { nrn_rhs(_nt); nrn_lhs(_nt); - // update_matrix_from_gpu(_nt); - return nullptr; } } // namespace coreneuron diff --git a/coreneuron/utils/memory.h b/coreneuron/utils/memory.h index 965c06e78..2f0e24458 100644 --- a/coreneuron/utils/memory.h +++ b/coreneuron/utils/memory.h @@ -115,8 +115,7 @@ auto allocate_unique(const Alloc& alloc, Args&&... args) { } // namespace coreneuron /// for gpu builds with unified memory support -/// OL210812: why do we include __CUDACC__ here? 
-#if (defined(__CUDACC__) || defined(CORENEURON_UNIFIED_MEMORY)) +#ifdef CORENEURON_UNIFIED_MEMORY #include diff --git a/coreneuron/utils/offload.hpp b/coreneuron/utils/offload.hpp new file mode 100644 index 000000000..d90cc10fd --- /dev/null +++ b/coreneuron/utils/offload.hpp @@ -0,0 +1,20 @@ +/* +# ============================================================================= +# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL +# +# See top-level LICENSE file for details. +# ============================================================================= +*/ +#pragma once +#define nrn_pragma_stringify(x) #x +#if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) +#define nrn_pragma_acc(x) +#define nrn_pragma_omp(x) _Pragma(nrn_pragma_stringify(omp x)) +#elif defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENACC) +#define nrn_pragma_acc(x) _Pragma(nrn_pragma_stringify(acc x)) +#define nrn_pragma_omp(x) +#else +#define nrn_pragma_acc(x) +#define nrn_pragma_omp(x) +#endif diff --git a/external/nmodl b/external/nmodl index 794b419f5..a60c5e903 160000 --- a/external/nmodl +++ b/external/nmodl @@ -1 +1 @@ -Subproject commit 794b419f5256f40efcdca1674f712a6e544c235a +Subproject commit a60c5e903126ad95cfe2bceb904d0efe83ba9d8a diff --git a/tests/unit/lfp/CMakeLists.txt b/tests/unit/lfp/CMakeLists.txt index 3e2ac8e80..ec795f178 100644 --- a/tests/unit/lfp/CMakeLists.txt +++ b/tests/unit/lfp/CMakeLists.txt @@ -22,3 +22,4 @@ set_target_properties(lfp_test_bin PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS OFF) target_compile_options(lfp_test_bin PRIVATE ${CORENEURON_BOOST_UNIT_TEST_COMPILE_FLAGS}) add_dependencies(lfp_test_bin nrniv-core) add_test(NAME lfp_test COMMAND ${TEST_EXEC_PREFIX} $) +set_tests_properties(lfp_test PROPERTIES ENVIRONMENT OMP_NUM_THREADS=1) From 02abf78c1ffd57130bccccbe2c325f6bedb33a3e Mon Sep 17 00:00:00 2001 From: Pramod Kumbhar Date: Thu, 9 Dec 2021 14:13:32 +0100 Subject: [PATCH 10/31] GPU data management using OpenACC as well as OpenMP API (#704) * Add wrapper functions for using OpenMP or OpenACC API * Add -mp=gpu in order to link gpu runtime with tests as well * Avoid copying VecPlay members twice otherwise association fails with OpenMP * IvocVect members t_ and y_ were copied twice * only discon_indices_ is pointer and hence that needs to be copied --- CMake/OpenAccHelper.cmake | 1 + coreneuron/gpu/nrn_acc_manager.cpp | 908 ++++++++++++++++------------- coreneuron/utils/vrecord.cpp | 9 +- 3 files changed, 523 insertions(+), 395 deletions(-) diff --git a/CMake/OpenAccHelper.cmake b/CMake/OpenAccHelper.cmake index c7f91a7c9..e8fa6738a 100644 --- a/CMake/OpenAccHelper.cmake +++ b/CMake/OpenAccHelper.cmake @@ -68,6 +68,7 @@ if(CORENRN_ENABLE_GPU) # for a region then prefer OpenMP. 
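# Illustrative sketch of what this buys (names taken from coreneuron/utils/offload.hpp
# added above; expansion shown here is an approximation, not part of the build files):
# in a GPU build with OpenMP enabled, the nrn_pragma_* macros expand roughly as
#   nrn_pragma_omp(target teams distribute parallel for simd)
#       -> _Pragma("omp target teams distribute parallel for simd")
#   nrn_pragma_acc(parallel loop)
#       -> (nothing)
# so annotated loops use the OpenMP offload backend and the OpenACC spellings compile away.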
   add_compile_definitions(CORENEURON_PREFER_OPENMP_OFFLOAD)
   string(APPEND NVHPC_ACC_COMP_FLAGS " -mp=gpu -Minfo=mp")
+  string(APPEND NVHPC_ACC_LINK_FLAGS " -mp=gpu")
 endif()
 # avoid PGI adding standard compliant "-A" flags
 set(CMAKE_CXX14_STANDARD_COMPILE_OPTION --c++14)
diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
index b249875dc..089b90848 100644
--- a/coreneuron/gpu/nrn_acc_manager.cpp
+++ b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -36,13 +36,66 @@
 #endif
 namespace coreneuron {
 extern InterleaveInfo* interleave_info;
-void copy_ivoc_vect_to_device(const IvocVect& iv, IvocVect& div);
+void copy_ivoc_vect_to_device(const IvocVect& iv, IvocVect& div, bool vector_copy_needed = false);
 void delete_ivoc_vect_from_device(IvocVect&);
 void nrn_ion_global_map_copyto_device();
 void nrn_ion_global_map_delete_from_device();
 void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay);
 void nrn_VecPlay_delete_from_device(NrnThread* nt);
+void* cnrn_gpu_copyin(void* h_ptr, std::size_t len) {
+#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
+    return acc_copyin(h_ptr, len);
+#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
+    auto host_id = omp_get_initial_device();
+    auto device_id = omp_get_default_device();
+    auto* d_ptr = omp_target_alloc(len, device_id);
+    nrn_assert(d_ptr != nullptr);
+    nrn_assert(omp_target_memcpy(d_ptr, h_ptr, len, 0, 0, device_id, host_id) == 0);
+    nrn_assert(omp_target_associate_ptr(h_ptr, d_ptr, len, 0, device_id) == 0);
+    return d_ptr;
+#else
+    throw std::runtime_error("cnrn_gpu_copyin() not implemented without OpenACC/OpenMP and gpu build");
+#endif
+}
+
+void cnrn_memcpy_to_device(void* d_ptr, void* h_ptr, size_t len) {
+#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
+    acc_memcpy_to_device(d_ptr, h_ptr, len);
+#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
+    auto host_id = omp_get_initial_device();
+    auto device_id = omp_get_default_device();
+    omp_target_memcpy(d_ptr, h_ptr, len, 0, 0, device_id, host_id);
+#else
+    throw std::runtime_error("cnrn_memcpy_to_device() not implemented without OpenACC/OpenMP and gpu build");
+#endif
+}
+
+void cnrn_target_delete(void* h_ptr, size_t len) {
+#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
+    acc_delete(h_ptr, len);
+#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
+    (void)len;
+    auto device_id = omp_get_default_device();
+    omp_target_disassociate_ptr(h_ptr, device_id);
+    auto* d_ptr = omp_get_mapped_ptr(h_ptr, device_id);
+    omp_target_free(d_ptr, device_id);
+#else
+    throw std::runtime_error("cnrn_target_delete() not implemented without OpenACC/OpenMP and gpu build");
+#endif
+}
+
+void* cnrn_target_deviceptr(void* h_ptr) {
+#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
+    return acc_deviceptr(h_ptr);
+#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
+    auto device_id = omp_get_default_device();
+    return omp_get_mapped_ptr(h_ptr, device_id);
+#else
+    throw std::runtime_error("cnrn_target_deviceptr() not implemented without OpenACC/OpenMP and gpu build");
+#endif
+}
+
 /* note: threads here are corresponding to global nrn_threads array */
 void
setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { #ifdef _OPENACC @@ -61,13 +114,13 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { NrnThread* nt = threads + i; // NrnThread on host if (nt->n_presyn) { - PreSyn* d_presyns = (PreSyn*) acc_copyin(nt->presyns, sizeof(PreSyn) * nt->n_presyn); + PreSyn* d_presyns = (PreSyn*) cnrn_gpu_copyin(nt->presyns, sizeof(PreSyn) * nt->n_presyn); } if (nt->n_vecplay) { /* copy VecPlayContinuous instances */ /** just empty containers */ - void** d_vecplay = (void**) acc_copyin(nt->_vecplay, sizeof(void*) * nt->n_vecplay); + void** d_vecplay = (void**) cnrn_gpu_copyin(nt->_vecplay, sizeof(void*) * nt->n_vecplay); // note: we are using unified memory for NrnThread. Once VecPlay is copied to gpu, // we dont want to update nt->vecplay because it will also set gpu pointer of vecplay // inside nt on cpu (due to unified memory). @@ -85,7 +138,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { * find * corresponding NrnThread using Point_process in NET_RECEIVE block */ - NrnThread* d_threads = (NrnThread*) acc_copyin(threads, sizeof(NrnThread) * nthreads); + NrnThread* d_threads = (NrnThread*) cnrn_gpu_copyin(threads, sizeof(NrnThread) * nthreads); if (interleave_info == nullptr) { printf("\n Warning: No permutation data? Required for linear algebra!"); @@ -104,7 +157,8 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { /* -- copy _data to device -- */ /*copy all double data for thread */ - d__data = (double*) acc_copyin(nt->_data, nt->_ndata * sizeof(double)); + d__data = (double*) cnrn_gpu_copyin(nt->_data, nt->_ndata * sizeof(double)); + /* Here is the example of using OpenACC data enter/exit * Remember that we are not allowed to use nt->_data but we have to use: @@ -114,7 +168,8 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { */ /*update d_nt._data to point to device copy */ - acc_memcpy_to_device(&(d_nt->_data), &d__data, sizeof(double*)); + cnrn_memcpy_to_device(&(d_nt->_data), &d__data, sizeof(double*)); + auto host_id = omp_get_initial_device(); /* -- setup rhs, d, a, b, v, node_aread to point to device copy -- */ double* dptr; @@ -123,36 +178,36 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { int ne = nrn_soa_padded_size(nt->end, 0); dptr = d__data + 0 * ne; - acc_memcpy_to_device(&(d_nt->_actual_rhs), &(dptr), sizeof(double*)); + cnrn_memcpy_to_device(&(d_nt->_actual_rhs), &(dptr), sizeof(double*)); dptr = d__data + 1 * ne; - acc_memcpy_to_device(&(d_nt->_actual_d), &(dptr), sizeof(double*)); + cnrn_memcpy_to_device(&(d_nt->_actual_d), &(dptr), sizeof(double*)); dptr = d__data + 2 * ne; - acc_memcpy_to_device(&(d_nt->_actual_a), &(dptr), sizeof(double*)); + cnrn_memcpy_to_device(&(d_nt->_actual_a), &(dptr), sizeof(double*)); dptr = d__data + 3 * ne; - acc_memcpy_to_device(&(d_nt->_actual_b), &(dptr), sizeof(double*)); + cnrn_memcpy_to_device(&(d_nt->_actual_b), &(dptr), sizeof(double*)); dptr = d__data + 4 * ne; - acc_memcpy_to_device(&(d_nt->_actual_v), &(dptr), sizeof(double*)); + cnrn_memcpy_to_device(&(d_nt->_actual_v), &(dptr), sizeof(double*)); dptr = d__data + 5 * ne; - acc_memcpy_to_device(&(d_nt->_actual_area), &(dptr), sizeof(double*)); + cnrn_memcpy_to_device(&(d_nt->_actual_area), &(dptr), sizeof(double*)); if (nt->_actual_diam) { dptr = d__data + 6 * ne; - acc_memcpy_to_device(&(d_nt->_actual_diam), &(dptr), sizeof(double*)); + cnrn_memcpy_to_device(&(d_nt->_actual_diam), &(dptr), sizeof(double*)); } - int* 
d_v_parent_index = (int*) acc_copyin(nt->_v_parent_index, nt->end * sizeof(int)); - acc_memcpy_to_device(&(d_nt->_v_parent_index), &(d_v_parent_index), sizeof(int*)); + int* d_v_parent_index = (int*) cnrn_gpu_copyin(nt->_v_parent_index, nt->end * sizeof(int)); + cnrn_memcpy_to_device(&(d_nt->_v_parent_index), &(d_v_parent_index), sizeof(int*)); /* nt._ml_list is used in NET_RECEIVE block and should have valid membrane list id*/ - Memb_list** d_ml_list = (Memb_list**) acc_copyin(nt->_ml_list, + Memb_list** d_ml_list = (Memb_list**) cnrn_gpu_copyin(nt->_ml_list, corenrn.get_memb_funcs().size() * sizeof(Memb_list*)); - acc_memcpy_to_device(&(d_nt->_ml_list), &(d_ml_list), sizeof(Memb_list**)); + cnrn_memcpy_to_device(&(d_nt->_ml_list), &(d_ml_list), sizeof(Memb_list**)); /* -- copy NrnThreadMembList list ml to device -- */ @@ -163,26 +218,26 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { for (auto tml = nt->tml; tml; tml = tml->next) { /*copy tml to device*/ /*QUESTIONS: does tml will point to nullptr as in host ? : I assume so!*/ - auto d_tml = (NrnThreadMembList*) acc_copyin(tml, sizeof(NrnThreadMembList)); + auto d_tml = (NrnThreadMembList*) cnrn_gpu_copyin(tml, sizeof(NrnThreadMembList)); /*first tml is pointed by nt */ if (first_tml) { - acc_memcpy_to_device(&(d_nt->tml), &d_tml, sizeof(NrnThreadMembList*)); + cnrn_memcpy_to_device(&(d_nt->tml), &d_tml, sizeof(NrnThreadMembList*)); first_tml = false; } else { /*rest of tml forms linked list */ - acc_memcpy_to_device(&(d_last_tml->next), &d_tml, sizeof(NrnThreadMembList*)); + cnrn_memcpy_to_device(&(d_last_tml->next), &d_tml, sizeof(NrnThreadMembList*)); } // book keeping for linked-list d_last_tml = d_tml; /* now for every tml, there is a ml. copy that and setup pointer */ - auto d_ml = (Memb_list*) acc_copyin(tml->ml, sizeof(Memb_list)); - acc_memcpy_to_device(&(d_tml->ml), &d_ml, sizeof(Memb_list*)); + auto d_ml = (Memb_list*) cnrn_gpu_copyin(tml->ml, sizeof(Memb_list)); + cnrn_memcpy_to_device(&(d_tml->ml), &d_ml, sizeof(Memb_list*)); /* setup nt._ml_list */ - acc_memcpy_to_device(&(d_ml_list[tml->index]), &d_ml, sizeof(Memb_list*)); + cnrn_memcpy_to_device(&(d_ml_list[tml->index]), &d_ml, sizeof(Memb_list*)); int type = tml->index; int n = tml->ml->nodecount; @@ -191,26 +246,26 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { int is_art = corenrn.get_is_artificial()[type]; // get device pointer for corresponding mechanism data - dptr = (double*) acc_deviceptr(tml->ml->data); - acc_memcpy_to_device(&(d_ml->data), &(dptr), sizeof(double*)); + dptr = (double*) cnrn_target_deviceptr(tml->ml->data); + cnrn_memcpy_to_device(&(d_ml->data), &(dptr), sizeof(double*)); if (!is_art) { - int* d_nodeindices = (int*) acc_copyin(tml->ml->nodeindices, sizeof(int) * n); - acc_memcpy_to_device(&(d_ml->nodeindices), &d_nodeindices, sizeof(int*)); + int* d_nodeindices = (int*) cnrn_gpu_copyin(tml->ml->nodeindices, sizeof(int) * n); + cnrn_memcpy_to_device(&(d_ml->nodeindices), &d_nodeindices, sizeof(int*)); } if (szdp) { int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; - int* d_pdata = (int*) acc_copyin(tml->ml->pdata, sizeof(int) * pcnt); - acc_memcpy_to_device(&(d_ml->pdata), &d_pdata, sizeof(int*)); + int* d_pdata = (int*) cnrn_gpu_copyin(tml->ml->pdata, sizeof(int) * pcnt); + cnrn_memcpy_to_device(&(d_ml->pdata), &d_pdata, sizeof(int*)); } int ts = corenrn.get_memb_funcs()[type].thread_size_; if (ts) { - ThreadDatum* td = (ThreadDatum*) acc_copyin(tml->ml->_thread, + ThreadDatum* td = (ThreadDatum*) 
cnrn_gpu_copyin(tml->ml->_thread, ts * sizeof(ThreadDatum)); - acc_memcpy_to_device(&(d_ml->_thread), &td, sizeof(ThreadDatum*)); + cnrn_memcpy_to_device(&(d_ml->_thread), &td, sizeof(ThreadDatum*)); } NetReceiveBuffer_t *nrb, *d_nrb; @@ -222,28 +277,28 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { // if net receive buffer exist for mechanism if (nrb) { - d_nrb = (NetReceiveBuffer_t*) acc_copyin(nrb, sizeof(NetReceiveBuffer_t)); - acc_memcpy_to_device(&(d_ml->_net_receive_buffer), + d_nrb = (NetReceiveBuffer_t*) cnrn_gpu_copyin(nrb, sizeof(NetReceiveBuffer_t)); + cnrn_memcpy_to_device(&(d_ml->_net_receive_buffer), &d_nrb, sizeof(NetReceiveBuffer_t*)); - d_pnt_index = (int*) acc_copyin(nrb->_pnt_index, sizeof(int) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index, sizeof(int*)); + d_pnt_index = (int*) cnrn_gpu_copyin(nrb->_pnt_index, sizeof(int) * nrb->_size); + cnrn_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index, sizeof(int*)); - d_weight_index = (int*) acc_copyin(nrb->_weight_index, sizeof(int) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index, sizeof(int*)); + d_weight_index = (int*) cnrn_gpu_copyin(nrb->_weight_index, sizeof(int) * nrb->_size); + cnrn_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index, sizeof(int*)); - d_nrb_t = (double*) acc_copyin(nrb->_nrb_t, sizeof(double) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t, sizeof(double*)); + d_nrb_t = (double*) cnrn_gpu_copyin(nrb->_nrb_t, sizeof(double) * nrb->_size); + cnrn_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t, sizeof(double*)); - d_nrb_flag = (double*) acc_copyin(nrb->_nrb_flag, sizeof(double) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag, sizeof(double*)); + d_nrb_flag = (double*) cnrn_gpu_copyin(nrb->_nrb_flag, sizeof(double) * nrb->_size); + cnrn_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag, sizeof(double*)); - d_displ = (int*) acc_copyin(nrb->_displ, sizeof(int) * (nrb->_size + 1)); - acc_memcpy_to_device(&(d_nrb->_displ), &d_displ, sizeof(int*)); + d_displ = (int*) cnrn_gpu_copyin(nrb->_displ, sizeof(int) * (nrb->_size + 1)); + cnrn_memcpy_to_device(&(d_nrb->_displ), &d_displ, sizeof(int*)); - d_nrb_index = (int*) acc_copyin(nrb->_nrb_index, sizeof(int) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index, sizeof(int*)); + d_nrb_index = (int*) cnrn_gpu_copyin(nrb->_nrb_index, sizeof(int) * nrb->_size); + cnrn_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index, sizeof(int*)); } /* copy NetSendBuffer_t on to GPU */ @@ -255,26 +310,26 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { int* d_iptr; double* d_dptr; - d_nsb = (NetSendBuffer_t*) acc_copyin(nsb, sizeof(NetSendBuffer_t)); - acc_memcpy_to_device(&(d_ml->_net_send_buffer), &d_nsb, sizeof(NetSendBuffer_t*)); + d_nsb = (NetSendBuffer_t*) cnrn_gpu_copyin(nsb, sizeof(NetSendBuffer_t)); + cnrn_memcpy_to_device(&(d_ml->_net_send_buffer), &d_nsb, sizeof(NetSendBuffer_t*)); - d_iptr = (int*) acc_copyin(nsb->_sendtype, sizeof(int) * nsb->_size); - acc_memcpy_to_device(&(d_nsb->_sendtype), &d_iptr, sizeof(int*)); + d_iptr = (int*) cnrn_gpu_copyin(nsb->_sendtype, sizeof(int) * nsb->_size); + cnrn_memcpy_to_device(&(d_nsb->_sendtype), &d_iptr, sizeof(int*)); - d_iptr = (int*) acc_copyin(nsb->_vdata_index, sizeof(int) * nsb->_size); - acc_memcpy_to_device(&(d_nsb->_vdata_index), &d_iptr, sizeof(int*)); + d_iptr = (int*) cnrn_gpu_copyin(nsb->_vdata_index, sizeof(int) * nsb->_size); + 
cnrn_memcpy_to_device(&(d_nsb->_vdata_index), &d_iptr, sizeof(int*)); - d_iptr = (int*) acc_copyin(nsb->_pnt_index, sizeof(int) * nsb->_size); - acc_memcpy_to_device(&(d_nsb->_pnt_index), &d_iptr, sizeof(int*)); + d_iptr = (int*) cnrn_gpu_copyin(nsb->_pnt_index, sizeof(int) * nsb->_size); + cnrn_memcpy_to_device(&(d_nsb->_pnt_index), &d_iptr, sizeof(int*)); - d_iptr = (int*) acc_copyin(nsb->_weight_index, sizeof(int) * nsb->_size); - acc_memcpy_to_device(&(d_nsb->_weight_index), &d_iptr, sizeof(int*)); + d_iptr = (int*) cnrn_gpu_copyin(nsb->_weight_index, sizeof(int) * nsb->_size); + cnrn_memcpy_to_device(&(d_nsb->_weight_index), &d_iptr, sizeof(int*)); - d_dptr = (double*) acc_copyin(nsb->_nsb_t, sizeof(double) * nsb->_size); - acc_memcpy_to_device(&(d_nsb->_nsb_t), &d_dptr, sizeof(double*)); + d_dptr = (double*) cnrn_gpu_copyin(nsb->_nsb_t, sizeof(double) * nsb->_size); + cnrn_memcpy_to_device(&(d_nsb->_nsb_t), &d_dptr, sizeof(double*)); - d_dptr = (double*) acc_copyin(nsb->_nsb_flag, sizeof(double) * nsb->_size); - acc_memcpy_to_device(&(d_nsb->_nsb_flag), &d_dptr, sizeof(double*)); + d_dptr = (double*) cnrn_gpu_copyin(nsb->_nsb_flag, sizeof(double) * nsb->_size); + cnrn_memcpy_to_device(&(d_nsb->_nsb_flag), &d_dptr, sizeof(double*)); } } @@ -284,28 +339,28 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); /* copy shadow_rhs to device and fix-up the pointer */ - d_shadow_ptr = (double*) acc_copyin(nt->_shadow_rhs, pcnt * sizeof(double)); - acc_memcpy_to_device(&(d_nt->_shadow_rhs), &d_shadow_ptr, sizeof(double*)); + d_shadow_ptr = (double*) cnrn_gpu_copyin(nt->_shadow_rhs, pcnt * sizeof(double)); + cnrn_memcpy_to_device(&(d_nt->_shadow_rhs), &d_shadow_ptr, sizeof(double*)); /* copy shadow_d to device and fix-up the pointer */ - d_shadow_ptr = (double*) acc_copyin(nt->_shadow_d, pcnt * sizeof(double)); - acc_memcpy_to_device(&(d_nt->_shadow_d), &d_shadow_ptr, sizeof(double*)); + d_shadow_ptr = (double*) cnrn_gpu_copyin(nt->_shadow_d, pcnt * sizeof(double)); + cnrn_memcpy_to_device(&(d_nt->_shadow_d), &d_shadow_ptr, sizeof(double*)); } /* Fast membrane current calculation struct */ if (nt->nrn_fast_imem) { auto* d_fast_imem = reinterpret_cast( - acc_copyin(nt->nrn_fast_imem, sizeof(NrnFastImem))); - acc_memcpy_to_device(&(d_nt->nrn_fast_imem), &d_fast_imem, sizeof(NrnFastImem*)); + cnrn_gpu_copyin(nt->nrn_fast_imem, sizeof(NrnFastImem))); + cnrn_memcpy_to_device(&(d_nt->nrn_fast_imem), &d_fast_imem, sizeof(NrnFastImem*)); { auto* d_ptr = reinterpret_cast( - acc_copyin(nt->nrn_fast_imem->nrn_sav_rhs, nt->end * sizeof(double))); - acc_memcpy_to_device(&(d_fast_imem->nrn_sav_rhs), &d_ptr, sizeof(double*)); + cnrn_gpu_copyin(nt->nrn_fast_imem->nrn_sav_rhs, nt->end * sizeof(double))); + cnrn_memcpy_to_device(&(d_fast_imem->nrn_sav_rhs), &d_ptr, sizeof(double*)); } { auto* d_ptr = reinterpret_cast( - acc_copyin(nt->nrn_fast_imem->nrn_sav_d, nt->end * sizeof(double))); - acc_memcpy_to_device(&(d_fast_imem->nrn_sav_d), &d_ptr, sizeof(double*)); + cnrn_gpu_copyin(nt->nrn_fast_imem->nrn_sav_d, nt->end * sizeof(double))); + cnrn_memcpy_to_device(&(d_fast_imem->nrn_sav_d), &d_ptr, sizeof(double*)); } } @@ -313,21 +368,21 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { /* copy Point_processes array and fix the pointer to execute net_receive blocks on GPU */ Point_process* pntptr = - (Point_process*) acc_copyin(nt->pntprocs, nt->n_pntproc * sizeof(Point_process)); - acc_memcpy_to_device(&(d_nt->pntprocs), 
&pntptr, sizeof(Point_process*)); + (Point_process*) cnrn_gpu_copyin(nt->pntprocs, nt->n_pntproc * sizeof(Point_process)); + cnrn_memcpy_to_device(&(d_nt->pntprocs), &pntptr, sizeof(Point_process*)); } if (nt->n_weight) { /* copy weight vector used in NET_RECEIVE which is pointed by netcon.weight */ - double* d_weights = (double*) acc_copyin(nt->weights, sizeof(double) * nt->n_weight); - acc_memcpy_to_device(&(d_nt->weights), &d_weights, sizeof(double*)); + double* d_weights = (double*) cnrn_gpu_copyin(nt->weights, sizeof(double) * nt->n_weight); + cnrn_memcpy_to_device(&(d_nt->weights), &d_weights, sizeof(double*)); } if (nt->_nvdata) { /* copy vdata which is setup in bbcore_read. This contains cuda allocated * nrnran123_State * */ - void** d_vdata = (void**) acc_copyin(nt->_vdata, sizeof(void*) * nt->_nvdata); - acc_memcpy_to_device(&(d_nt->_vdata), &d_vdata, sizeof(void**)); + void** d_vdata = (void**) cnrn_gpu_copyin(nt->_vdata, sizeof(void*) * nt->_nvdata); + cnrn_memcpy_to_device(&(d_nt->_vdata), &d_vdata, sizeof(void**)); } if (nt->n_presyn) { @@ -337,24 +392,24 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { * to * VTable and alignment */ PreSynHelper* d_presyns_helper = - (PreSynHelper*) acc_copyin(nt->presyns_helper, sizeof(PreSynHelper) * nt->n_presyn); - acc_memcpy_to_device(&(d_nt->presyns_helper), &d_presyns_helper, sizeof(PreSynHelper*)); - PreSyn* d_presyns = (PreSyn*) acc_copyin(nt->presyns, sizeof(PreSyn) * nt->n_presyn); - acc_memcpy_to_device(&(d_nt->presyns), &d_presyns, sizeof(PreSyn*)); + (PreSynHelper*) cnrn_gpu_copyin(nt->presyns_helper, sizeof(PreSynHelper) * nt->n_presyn); + cnrn_memcpy_to_device(&(d_nt->presyns_helper), &d_presyns_helper, sizeof(PreSynHelper*)); + PreSyn* d_presyns = (PreSyn*) cnrn_gpu_copyin(nt->presyns, sizeof(PreSyn) * nt->n_presyn); + cnrn_memcpy_to_device(&(d_nt->presyns), &d_presyns, sizeof(PreSyn*)); } if (nt->_net_send_buffer_size) { /* copy send_receive buffer */ - int* d_net_send_buffer = (int*) acc_copyin(nt->_net_send_buffer, + int* d_net_send_buffer = (int*) cnrn_gpu_copyin(nt->_net_send_buffer, sizeof(int) * nt->_net_send_buffer_size); - acc_memcpy_to_device(&(d_nt->_net_send_buffer), &d_net_send_buffer, sizeof(int*)); + cnrn_memcpy_to_device(&(d_nt->_net_send_buffer), &d_net_send_buffer, sizeof(int*)); } if (nt->n_vecplay) { /* copy VecPlayContinuous instances */ /** just empty containers */ - void** d_vecplay = (void**) acc_copyin(nt->_vecplay, sizeof(void*) * nt->n_vecplay); - acc_memcpy_to_device(&(d_nt->_vecplay), &d_vecplay, sizeof(void**)); + void** d_vecplay = (void**) cnrn_gpu_copyin(nt->_vecplay, sizeof(void*) * nt->n_vecplay); + cnrn_memcpy_to_device(&(d_nt->_vecplay), &d_vecplay, sizeof(void**)); nrn_VecPlay_copyto_device(nt, d_vecplay); } @@ -363,41 +418,41 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { if (interleave_permute_type == 1) { /* todo: not necessary to setup pointers, just copy it */ InterleaveInfo* info = interleave_info + i; - InterleaveInfo* d_info = (InterleaveInfo*) acc_copyin(info, sizeof(InterleaveInfo)); + InterleaveInfo* d_info = (InterleaveInfo*) cnrn_gpu_copyin(info, sizeof(InterleaveInfo)); int* d_ptr = nullptr; - d_ptr = (int*) acc_copyin(info->stride, sizeof(int) * (info->nstride + 1)); - acc_memcpy_to_device(&(d_info->stride), &d_ptr, sizeof(int*)); + d_ptr = (int*) cnrn_gpu_copyin(info->stride, sizeof(int) * (info->nstride + 1)); + cnrn_memcpy_to_device(&(d_info->stride), &d_ptr, sizeof(int*)); - d_ptr = (int*) acc_copyin(info->firstnode, 
sizeof(int) * nt->ncell); - acc_memcpy_to_device(&(d_info->firstnode), &d_ptr, sizeof(int*)); + d_ptr = (int*) cnrn_gpu_copyin(info->firstnode, sizeof(int) * nt->ncell); + cnrn_memcpy_to_device(&(d_info->firstnode), &d_ptr, sizeof(int*)); - d_ptr = (int*) acc_copyin(info->lastnode, sizeof(int) * nt->ncell); - acc_memcpy_to_device(&(d_info->lastnode), &d_ptr, sizeof(int*)); + d_ptr = (int*) cnrn_gpu_copyin(info->lastnode, sizeof(int) * nt->ncell); + cnrn_memcpy_to_device(&(d_info->lastnode), &d_ptr, sizeof(int*)); - d_ptr = (int*) acc_copyin(info->cellsize, sizeof(int) * nt->ncell); - acc_memcpy_to_device(&(d_info->cellsize), &d_ptr, sizeof(int*)); + d_ptr = (int*) cnrn_gpu_copyin(info->cellsize, sizeof(int) * nt->ncell); + cnrn_memcpy_to_device(&(d_info->cellsize), &d_ptr, sizeof(int*)); } else if (interleave_permute_type == 2) { /* todo: not necessary to setup pointers, just copy it */ InterleaveInfo* info = interleave_info + i; - InterleaveInfo* d_info = (InterleaveInfo*) acc_copyin(info, sizeof(InterleaveInfo)); + InterleaveInfo* d_info = (InterleaveInfo*) cnrn_gpu_copyin(info, sizeof(InterleaveInfo)); int* d_ptr = nullptr; - d_ptr = (int*) acc_copyin(info->stride, sizeof(int) * info->nstride); - acc_memcpy_to_device(&(d_info->stride), &d_ptr, sizeof(int*)); + d_ptr = (int*) cnrn_gpu_copyin(info->stride, sizeof(int) * info->nstride); + cnrn_memcpy_to_device(&(d_info->stride), &d_ptr, sizeof(int*)); - d_ptr = (int*) acc_copyin(info->firstnode, sizeof(int) * (info->nwarp + 1)); - acc_memcpy_to_device(&(d_info->firstnode), &d_ptr, sizeof(int*)); + d_ptr = (int*) cnrn_gpu_copyin(info->firstnode, sizeof(int) * (info->nwarp + 1)); + cnrn_memcpy_to_device(&(d_info->firstnode), &d_ptr, sizeof(int*)); - d_ptr = (int*) acc_copyin(info->lastnode, sizeof(int) * (info->nwarp + 1)); - acc_memcpy_to_device(&(d_info->lastnode), &d_ptr, sizeof(int*)); + d_ptr = (int*) cnrn_gpu_copyin(info->lastnode, sizeof(int) * (info->nwarp + 1)); + cnrn_memcpy_to_device(&(d_info->lastnode), &d_ptr, sizeof(int*)); - d_ptr = (int*) acc_copyin(info->stridedispl, sizeof(int) * (info->nwarp + 1)); - acc_memcpy_to_device(&(d_info->stridedispl), &d_ptr, sizeof(int*)); + d_ptr = (int*) cnrn_gpu_copyin(info->stridedispl, sizeof(int) * (info->nwarp + 1)); + cnrn_memcpy_to_device(&(d_info->stridedispl), &d_ptr, sizeof(int*)); - d_ptr = (int*) acc_copyin(info->cellsize, sizeof(int) * info->nwarp); - acc_memcpy_to_device(&(d_info->cellsize), &d_ptr, sizeof(int*)); + d_ptr = (int*) cnrn_gpu_copyin(info->cellsize, sizeof(int) * info->nwarp); + cnrn_memcpy_to_device(&(d_info->cellsize), &d_ptr, sizeof(int*)); } else { printf("\n ERROR: only --cell_permute = [12] implemented"); abort(); @@ -412,21 +467,21 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { // Create a device-side copy of the `trajec_requests` struct and // make sure the device-side NrnThread object knows about it. auto* d_trajec_requests = reinterpret_cast( - acc_copyin(tr, sizeof(TrajectoryRequests))); - acc_memcpy_to_device(&(d_nt->trajec_requests), + cnrn_gpu_copyin(tr, sizeof(TrajectoryRequests))); + cnrn_memcpy_to_device(&(d_nt->trajec_requests), &d_trajec_requests, sizeof(TrajectoryRequests*)); // Initialise the double** gather member of the struct. 
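            // The lines below repeat the same deep-copy idiom used throughout this
            // function: copy the container to the device, then patch each pointer
            // member of the device copy with the matching device address. A minimal
            // sketch with a hypothetical struct (`Foo`, `foo`, `n` are illustrative
            // placeholders, not names from this file):
            // @code
            //   struct Foo { double* data; };  // host object `foo` with `n` doubles
            //   Foo* d_foo = (Foo*) cnrn_gpu_copyin(&foo, sizeof(Foo));
            //   double* d_data = (double*) cnrn_gpu_copyin(foo.data, n * sizeof(double));
            //   cnrn_memcpy_to_device(&(d_foo->data), &d_data, sizeof(double*));
            // @endcode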
             auto* d_tr_gather = reinterpret_cast<double**>(
-                acc_copyin(tr->gather, sizeof(double*) * tr->n_trajec));
-            acc_memcpy_to_device(&(d_trajec_requests->gather), &d_tr_gather, sizeof(double**));
+                cnrn_gpu_copyin(tr->gather, sizeof(double*) * tr->n_trajec));
+            cnrn_memcpy_to_device(&(d_trajec_requests->gather), &d_tr_gather, sizeof(double**));
             // Initialise the double** varrays member of the struct if it's
             // set.
             double** d_tr_varrays{nullptr};
             if (tr->varrays) {
                 d_tr_varrays = reinterpret_cast<double**>(
-                    acc_copyin(tr->varrays, sizeof(double*) * tr->n_trajec));
-                acc_memcpy_to_device(&(d_trajec_requests->varrays),
+                    cnrn_gpu_copyin(tr->varrays, sizeof(double*) * tr->n_trajec));
+                cnrn_memcpy_to_device(&(d_trajec_requests->varrays),
                                      &d_tr_varrays,
                                      sizeof(double**));
             }
@@ -436,13 +491,13 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
                     // make a device-side copy of it and store a pointer to it in
                     // the device-side version of tr->varrays.
                     auto* d_buf_traj_i = reinterpret_cast<double*>(
-                        acc_copyin(tr->varrays[i], tr->bsize * sizeof(double)));
-                    acc_memcpy_to_device(&(d_tr_varrays[i]), &d_buf_traj_i, sizeof(double*));
+                        cnrn_gpu_copyin(tr->varrays[i], tr->bsize * sizeof(double)));
+                    cnrn_memcpy_to_device(&(d_tr_varrays[i]), &d_buf_traj_i, sizeof(double*));
                 }
                 // tr->gather[i] is a double* referring to (host) data in the
                 // (host) _data block
-                auto* d_gather_i = acc_deviceptr(tr->gather[i]);
-                acc_memcpy_to_device(&(d_tr_gather[i]), &d_gather_i, sizeof(double*));
+                auto* d_gather_i = cnrn_target_deviceptr(tr->gather[i]);
+                cnrn_memcpy_to_device(&(d_tr_gather[i]), &d_gather_i, sizeof(double*));
             }
             // TODO: other `double** scatter` and `void** vpr` members of
             // the TrajectoryRequests struct are not copied to the device.
@@ -459,15 +514,21 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
 #endif
 }
-void copy_ivoc_vect_to_device(const IvocVect& from, IvocVect& to) {
+void copy_ivoc_vect_to_device(const IvocVect& from, IvocVect& to, bool vector_copy_needed) {
 #ifdef _OPENACC
-    IvocVect* d_iv = (IvocVect*) acc_copyin((void*) &from, sizeof(IvocVect));
-    acc_memcpy_to_device(&to, &d_iv, sizeof(IvocVect*));
-
+    /// by default `to` is the destination pointer on the device
+    IvocVect* d_iv = &to;
+
+    /// if we need to copy the IvocVect itself then the newly allocated vector
+    /// on the device is the new destination pointer
+    if(vector_copy_needed) {
+        d_iv = (IvocVect*) cnrn_gpu_copyin((void*) &from, sizeof(IvocVect));
+        cnrn_memcpy_to_device(&to, &d_iv, sizeof(IvocVect*));
+    }
     size_t n = from.size();
     if (n) {
-        double* d_data = (double*) acc_copyin((void*) from.data(), sizeof(double) * n);
-        acc_memcpy_to_device(&(d_iv->data_), &d_data, sizeof(double*));
+        double* d_data = (double*) cnrn_gpu_copyin((void*) from.data(), sizeof(double) * n);
+        cnrn_memcpy_to_device(&(d_iv->data_), &d_data, sizeof(double*));
     }
 #else
     (void) from;
@@ -479,9 +540,9 @@ void delete_ivoc_vect_from_device(IvocVect& vec) {
 #ifdef _OPENACC
     auto const n = vec.size();
     if (n) {
-        acc_delete(vec.data(), sizeof(double) * n);
+        cnrn_target_delete(vec.data(), sizeof(double) * n);
     }
-    acc_delete(&vec, sizeof(IvocVect));
+    cnrn_target_delete(&vec, sizeof(IvocVect));
 #else
     (void) vec;
 #endif
@@ -496,12 +557,12 @@ void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml) {
 #ifdef _OPENACC
     if (nt->compute_gpu) {
         // free existing vectors in buffers on gpu
-        acc_delete(nrb->_pnt_index, nrb->_size * sizeof(int));
-        acc_delete(nrb->_weight_index, nrb->_size * sizeof(int));
-        acc_delete(nrb->_nrb_t, nrb->_size * sizeof(double));
-
acc_delete(nrb->_nrb_flag, nrb->_size * sizeof(double)); - acc_delete(nrb->_displ, (nrb->_size + 1) * sizeof(int)); - acc_delete(nrb->_nrb_index, nrb->_size * sizeof(int)); + cnrn_target_delete(nrb->_pnt_index, nrb->_size * sizeof(int)); + cnrn_target_delete(nrb->_weight_index, nrb->_size * sizeof(int)); + cnrn_target_delete(nrb->_nrb_t, nrb->_size * sizeof(double)); + cnrn_target_delete(nrb->_nrb_flag, nrb->_size * sizeof(double)); + cnrn_target_delete(nrb->_displ, (nrb->_size + 1) * sizeof(int)); + cnrn_target_delete(nrb->_nrb_index, nrb->_size * sizeof(int)); } #endif @@ -520,28 +581,29 @@ void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml) { double *d_nrb_t, *d_nrb_flag; // update device copy - acc_update_device(nrb, sizeof(NetReceiveBuffer_t)); + nrn_pragma_acc(update device(nrb)); + nrn_pragma_omp(target update to(nrb)); - NetReceiveBuffer_t* d_nrb = (NetReceiveBuffer_t*) acc_deviceptr(nrb); + NetReceiveBuffer_t* d_nrb = (NetReceiveBuffer_t*) cnrn_target_deviceptr(nrb); // recopy the vectors in the buffer - d_pnt_index = (int*) acc_copyin(nrb->_pnt_index, sizeof(int) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index, sizeof(int*)); + d_pnt_index = (int*) cnrn_gpu_copyin(nrb->_pnt_index, sizeof(int) * nrb->_size); + cnrn_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index, sizeof(int*)); - d_weight_index = (int*) acc_copyin(nrb->_weight_index, sizeof(int) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index, sizeof(int*)); + d_weight_index = (int*) cnrn_gpu_copyin(nrb->_weight_index, sizeof(int) * nrb->_size); + cnrn_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index, sizeof(int*)); - d_nrb_t = (double*) acc_copyin(nrb->_nrb_t, sizeof(double) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t, sizeof(double*)); + d_nrb_t = (double*) cnrn_gpu_copyin(nrb->_nrb_t, sizeof(double) * nrb->_size); + cnrn_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t, sizeof(double*)); - d_nrb_flag = (double*) acc_copyin(nrb->_nrb_flag, sizeof(double) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag, sizeof(double*)); + d_nrb_flag = (double*) cnrn_gpu_copyin(nrb->_nrb_flag, sizeof(double) * nrb->_size); + cnrn_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag, sizeof(double*)); - d_displ = (int*) acc_copyin(nrb->_displ, sizeof(int) * (nrb->_size + 1)); - acc_memcpy_to_device(&(d_nrb->_displ), &d_displ, sizeof(int*)); + d_displ = (int*) cnrn_gpu_copyin(nrb->_displ, sizeof(int) * (nrb->_size + 1)); + cnrn_memcpy_to_device(&(d_nrb->_displ), &d_displ, sizeof(int*)); - d_nrb_index = (int*) acc_copyin(nrb->_nrb_index, sizeof(int) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index, sizeof(int*)); + d_nrb_index = (int*) cnrn_gpu_copyin(nrb->_nrb_index, sizeof(int) * nrb->_size); + cnrn_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index, sizeof(int*)); } #endif } @@ -655,13 +717,23 @@ void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb) { if (nsb->_cnt) { Instrumentor::phase p_net_receive_buffer_order("net-send-buf-gpu2cpu"); - acc_update_self(nsb->_sendtype, sizeof(int) * nsb->_cnt); - acc_update_self(nsb->_vdata_index, sizeof(int) * nsb->_cnt); - acc_update_self(nsb->_pnt_index, sizeof(int) * nsb->_cnt); - acc_update_self(nsb->_weight_index, sizeof(int) * nsb->_cnt); - acc_update_self(nsb->_nsb_t, sizeof(double) * nsb->_cnt); - acc_update_self(nsb->_nsb_flag, sizeof(double) * nsb->_cnt); } + nrn_pragma_acc(update self( + nsb->_sendtype[:nsb->_cnt], + 
nsb->_vdata_index[:nsb->_cnt], + nsb->_pnt_index[:nsb->_cnt], + nsb->_weight_index[:nsb->_cnt], + nsb->_nsb_t[:nsb->_cnt], + nsb->_nsb_flag[:nsb->_cnt]) + if nsb->_cnt) + nrn_pragma_omp(target update from( + nsb->_sendtype[:nsb->_cnt], + nsb->_vdata_index[:nsb->_cnt], + nsb->_pnt_index[:nsb->_cnt], + nsb->_weight_index[:nsb->_cnt], + nsb->_nsb_t[:nsb->_cnt], + nsb->_nsb_flag[:nsb->_cnt]) + if (nsb->_cnt)) #else (void) nt; (void) nsb; @@ -679,15 +751,23 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { int ne = nrn_soa_padded_size(nt->end, 0); - acc_update_self(nt->_actual_rhs, ne * sizeof(double)); - acc_update_self(nt->_actual_d, ne * sizeof(double)); - acc_update_self(nt->_actual_a, ne * sizeof(double)); - acc_update_self(nt->_actual_b, ne * sizeof(double)); - acc_update_self(nt->_actual_v, ne * sizeof(double)); - acc_update_self(nt->_actual_area, ne * sizeof(double)); - if (nt->_actual_diam) { - acc_update_self(nt->_actual_diam, ne * sizeof(double)); - } + nrn_pragma_acc(update self( + nt->_actual_rhs[:ne], + nt->_actual_d[:ne], + nt->_actual_a[:ne], + nt->_actual_b[:ne], + nt->_actual_v[:ne], + nt->_actual_area[:ne])) + nrn_pragma_omp(target update from( + nt->_actual_rhs[:ne], + nt->_actual_d[:ne], + nt->_actual_a[:ne], + nt->_actual_b[:ne], + nt->_actual_v[:ne], + nt->_actual_area[:ne])) + + nrn_pragma_acc(update self(nt->_actual_diam[:ne]) if nt->_actual_diam) + nrn_pragma_omp(target update from(nt->_actual_diam[:ne]) if (nt->_actual_diam != nullptr)) /* @todo: nt._ml_list[tml->index] = tml->ml; */ @@ -695,8 +775,10 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { for (auto tml = nt->tml; tml; tml = tml->next) { Memb_list* ml = tml->ml; - acc_update_self(&tml->index, sizeof(int)); - acc_update_self(&ml->nodecount, sizeof(int)); + nrn_pragma_acc(update self(&tml->index, + &ml->nodecount)) + nrn_pragma_omp(target update from(tml->index, + ml->nodecount)) int type = tml->index; int n = ml->nodecount; @@ -713,54 +795,72 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szp; - acc_update_self(ml->data, pcnt * sizeof(double)); - acc_update_self(ml->nodeindices, n * sizeof(int)); + nrn_pragma_acc(update self(ml->data[:pcnt], + ml->nodeindices[:n])) + nrn_pragma_omp(target update from(ml->data[:pcnt], + ml->nodeindices[:n])) - if (szdp) { - int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; - acc_update_self(ml->pdata, pcnt * sizeof(int)); - } + int dpcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; + nrn_pragma_acc(update self(ml->pdata[:dpcnt]) if szdp) + nrn_pragma_omp(target update from(ml->pdata[:dpcnt]) if (szdp)) auto nrb = tml->ml->_net_receive_buffer; - if (nrb) { - acc_update_self(&nrb->_cnt, sizeof(int)); - acc_update_self(&nrb->_size, sizeof(int)); - acc_update_self(&nrb->_pnt_offset, sizeof(int)); - acc_update_self(&nrb->_displ_cnt, sizeof(int)); - - acc_update_self(nrb->_pnt_index, sizeof(int) * nrb->_size); - acc_update_self(nrb->_weight_index, sizeof(int) * nrb->_size); - acc_update_self(nrb->_displ, sizeof(int) * (nrb->_size + 1)); - acc_update_self(nrb->_nrb_index, sizeof(int) * nrb->_size); - } - } - - if (nt->shadow_rhs_cnt) { - int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); - /* copy shadow_rhs to host */ - acc_update_self(nt->_shadow_rhs, pcnt * sizeof(double)); - /* copy shadow_d to host */ - acc_update_self(nt->_shadow_d, pcnt * sizeof(double)); - } - - if (nt->nrn_fast_imem) { - acc_update_self(nt->nrn_fast_imem->nrn_sav_rhs, nt->end * 
sizeof(double)); - acc_update_self(nt->nrn_fast_imem->nrn_sav_d, nt->end * sizeof(double)); - } - - if (nt->n_pntproc) { - acc_update_self(nt->pntprocs, nt->n_pntproc * sizeof(Point_process)); + nrn_pragma_acc(update self( + &nrb->_cnt, + &nrb->_size, + &nrb->_pnt_offset, + &nrb->_displ_cnt, + + nrb->_pnt_index[:nrb->_size], + nrb->_weight_index[:nrb->_size], + nrb->_displ[:nrb->_size + 1], + nrb->_nrb_index[:nrb->_size]) + if nrb) + nrn_pragma_omp(target update from( + nrb->_cnt, + nrb->_size, + nrb->_pnt_offset, + nrb->_displ_cnt, + + nrb->_pnt_index[:nrb->_size], + nrb->_weight_index[:nrb->_size], + nrb->_displ[:nrb->_size + 1], + nrb->_nrb_index[:nrb->_size]) + if (nrb != nullptr)) } - if (nt->n_weight) { - acc_update_self(nt->weights, sizeof(double) * nt->n_weight); - } - - if (nt->n_presyn) { - acc_update_self(nt->presyns_helper, sizeof(PreSynHelper) * nt->n_presyn); - acc_update_self(nt->presyns, sizeof(PreSyn) * nt->n_presyn); - } + int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); + /* copy shadow_rhs to host */ + /* copy shadow_d to host */ + nrn_pragma_acc(update self(nt->_shadow_rhs[:pcnt], + nt->_shadow_d[:pcnt]) + if nt->shadow_rhs_cnt) + nrn_pragma_omp(target update from(nt->_shadow_rhs[:pcnt], + nt->_shadow_d[:pcnt]) + if (nt->shadow_rhs_cnt)) + + nrn_pragma_acc(update self(nt->nrn_fast_imem->nrn_sav_rhs[:nt->end], + nt->nrn_fast_imem->nrn_sav_d[:nt->end]) + if nt->nrn_fast_imem) + nrn_pragma_omp(target update from(nt->nrn_fast_imem->nrn_sav_rhs[:nt->end], + nt->nrn_fast_imem->nrn_sav_d[:nt->end]) + if (nt->nrn_fast_imem != nullptr)) + + nrn_pragma_acc(update self(nt->pntprocs[:nt->n_pntproc]) if nt->n_pntproc) + nrn_pragma_omp(target update from(nt->pntprocs[:nt->n_pntproc]) if (nt->n_pntproc)) + + nrn_pragma_acc(update self(nt->weights[:nt->n_weight]) if nt->n_weight) + nrn_pragma_omp(target update from(nt->weights[:nt->n_weight]) if (nt->n_weight)) + + nrn_pragma_acc(update self( + nt->presyns_helper[:nt->n_presyn], + nt->presyns[:nt->n_presyn]) + if nt->n_presyn) + nrn_pragma_omp(target update from( + nt->presyns_helper[:nt->n_presyn], + nt->presyns[:nt->n_presyn]) + if (nt->n_presyn)) { TrajectoryRequests* tr = nt->trajec_requests; @@ -768,15 +868,17 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { // The full buffers have `bsize` entries, but only `vsize` // of them are valid. 
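                // The paired directives below express the same device-to-host copy
                // in both programming models; a minimal sketch (with placeholder
                // names `buf` and `count`, not taken from this file):
                // @code
                //   nrn_pragma_acc(update self(buf[:count]))           // OpenACC
                //   nrn_pragma_omp(target update from(buf[:count]))    // OpenMP
                // @endcode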
for (int i = 0; i < tr->n_trajec; ++i) { - acc_update_self(tr->varrays[i], tr->vsize * sizeof(double)); + nrn_pragma_acc(update self( + tr->varrays[i][:tr->vsize])) + nrn_pragma_omp(target update from( + tr->varrays[i][:tr->vsize])) } } } /* dont update vdata, its pointer array - if(nt->_nvdata) { - acc_update_self(nt->_vdata, sizeof(double)*nt->_nvdata); - } + nrn_pragma_acc(update self(nt->_vdata[:nt->_nvdata) if nt->_nvdata) + nrn_pragma_omp(target update from(nt->_vdata[:nt->_nvdata) if (nt->_nvdata)) */ } } @@ -797,15 +899,23 @@ void update_nrnthreads_on_device(NrnThread* threads, int nthreads) { int ne = nrn_soa_padded_size(nt->end, 0); - acc_update_device(nt->_actual_rhs, ne * sizeof(double)); - acc_update_device(nt->_actual_d, ne * sizeof(double)); - acc_update_device(nt->_actual_a, ne * sizeof(double)); - acc_update_device(nt->_actual_b, ne * sizeof(double)); - acc_update_device(nt->_actual_v, ne * sizeof(double)); - acc_update_device(nt->_actual_area, ne * sizeof(double)); - if (nt->_actual_diam) { - acc_update_device(nt->_actual_diam, ne * sizeof(double)); - } + nrn_pragma_acc(update device( + nt->_actual_rhs[:ne], + nt->_actual_d[:ne], + nt->_actual_a[:ne], + nt->_actual_b[:ne], + nt->_actual_v[:ne], + nt->_actual_area[:ne])) + nrn_pragma_omp(target update to( + nt->_actual_rhs[:ne], + nt->_actual_d[:ne], + nt->_actual_a[:ne], + nt->_actual_b[:ne], + nt->_actual_v[:ne], + nt->_actual_area[:ne])) + + nrn_pragma_acc(update device(nt->_actual_diam[:ne]) if nt->_actual_diam) + nrn_pragma_omp(target update to(nt->_actual_diam[:ne]) if (nt->_actual_diam != nullptr)) /* @todo: nt._ml_list[tml->index] = tml->ml; */ @@ -819,57 +929,70 @@ void update_nrnthreads_on_device(NrnThread* threads, int nthreads) { int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szp; - acc_update_device(ml->data, pcnt * sizeof(double)); - - if (!corenrn.get_is_artificial()[type]) { - acc_update_device(ml->nodeindices, n * sizeof(int)); - } + nrn_pragma_acc(update device(ml->data[:pcnt])) + nrn_pragma_omp(target update to(ml->data[:pcnt])) - if (szdp) { - int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; - acc_update_device(ml->pdata, pcnt * sizeof(int)); - } + nrn_pragma_acc(update device(ml->nodeindices[:n]) + if (!corenrn.get_is_artificial()[type])) + nrn_pragma_omp(target update to(ml->nodeindices[:n]) + if (!corenrn.get_is_artificial()[type])) + int dpcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; + nrn_pragma_acc(update device(ml->pdata[:dpcnt]) if szdp) + nrn_pragma_omp(target update to(ml->pdata[:dpcnt]) if (szdp)) auto nrb = tml->ml->_net_receive_buffer; - - if (nrb) { - acc_update_device(&nrb->_cnt, sizeof(int)); - acc_update_device(&nrb->_size, sizeof(int)); - acc_update_device(&nrb->_pnt_offset, sizeof(int)); - acc_update_device(&nrb->_displ_cnt, sizeof(int)); - - acc_update_device(nrb->_pnt_index, sizeof(int) * nrb->_size); - acc_update_device(nrb->_weight_index, sizeof(int) * nrb->_size); - acc_update_device(nrb->_displ, sizeof(int) * (nrb->_size + 1)); - acc_update_device(nrb->_nrb_index, sizeof(int) * nrb->_size); - } - } - - if (nt->shadow_rhs_cnt) { - int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); - /* copy shadow_rhs to host */ - acc_update_device(nt->_shadow_rhs, pcnt * sizeof(double)); - /* copy shadow_d to host */ - acc_update_device(nt->_shadow_d, pcnt * sizeof(double)); - } - - if (nt->nrn_fast_imem) { - acc_update_device(nt->nrn_fast_imem->nrn_sav_rhs, nt->end * sizeof(double)); - acc_update_device(nt->nrn_fast_imem->nrn_sav_d, nt->end * sizeof(double)); - } - - if 
(nt->n_pntproc) { - acc_update_device(nt->pntprocs, nt->n_pntproc * sizeof(Point_process)); - } - - if (nt->n_weight) { - acc_update_device(nt->weights, sizeof(double) * nt->n_weight); - } - - if (nt->n_presyn) { - acc_update_device(nt->presyns_helper, sizeof(PreSynHelper) * nt->n_presyn); - acc_update_device(nt->presyns, sizeof(PreSyn) * nt->n_presyn); + nrn_pragma_acc(update device(&nrb->_cnt, + &nrb->_size, + &nrb->_pnt_offset, + &nrb->_displ_cnt, + nrb->_pnt_index[:nrb->_size], + nrb->_weight_index[:nrb->_size], + nrb->_displ[:nrb->_size], + nrb->_nrb_index[:nrb->_size]) + if nrb) + nrn_pragma_omp(target update to(nrb->_cnt, + nrb->_size, + nrb->_pnt_offset, + nrb->_displ_cnt, + nrb->_pnt_index[:nrb->_size], + nrb->_weight_index[:nrb->_size], + nrb->_displ[:nrb->_size], + nrb->_nrb_index[:nrb->_size]) + if (nrb != nullptr)) } + int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); + /* copy shadow_rhs to host */ + nrn_pragma_acc(update device(nt->_shadow_rhs[:pcnt], + /* copy shadow_d to host */ + nt->_shadow_d[:pcnt]) + if nt->shadow_rhs_cnt) + nrn_pragma_omp(target update to(nt->_shadow_rhs[:pcnt], + /* copy shadow_d to host */ + nt->_shadow_d[:pcnt]) + if (nt->shadow_rhs_cnt)) + + + nrn_pragma_acc(update device(nt->nrn_fast_imem->nrn_sav_rhs[:nt->end], + nt->nrn_fast_imem->nrn_sav_d[:nt->end]) + if nt->nrn_fast_imem) + nrn_pragma_omp(target update to(nt->nrn_fast_imem->nrn_sav_rhs[:nt->end], + nt->nrn_fast_imem->nrn_sav_d[:nt->end]) + if (nt->nrn_fast_imem != nullptr)) + + nrn_pragma_acc(update device(nt->pntprocs[:nt->n_pntproc]) + if nt->n_pntproc) + nrn_pragma_omp(target update to(nt->pntprocs[:nt->n_pntproc]) + if (nt->n_pntproc)) + + nrn_pragma_acc(update device(nt->weights[:nt->n_weight]) if nt->n_weight) + nrn_pragma_omp(target update to(nt->weights[:nt->n_weight]) if (nt->n_weight)) + + nrn_pragma_acc(update device(nt->presyns_helper[:nt->n_presyn], + nt->presyns[:nt->n_presyn]) + if nt->n_presyn) + nrn_pragma_omp(target update to(nt->presyns_helper[:nt->n_presyn], + nt->presyns[:nt->n_presyn]) + if (nt->n_presyn)) { TrajectoryRequests* tr = nt->trajec_requests; @@ -877,15 +1000,15 @@ void update_nrnthreads_on_device(NrnThread* threads, int nthreads) { // The full buffers have `bsize` entries, but only `vsize` // of them are valid. for (int i = 0; i < tr->n_trajec; ++i) { - acc_update_device(tr->varrays[i], tr->vsize * sizeof(double)); + nrn_pragma_acc(update device(tr->varrays[i][:tr->vsize])) + nrn_pragma_omp(target update to(tr->varrays[i][:tr->vsize])) } } } /* don't and don't update vdata, its pointer array - if(nt->_nvdata) { - acc_update_device(nt->_vdata, sizeof(double)*nt->_nvdata); - } + nrn_pragma_acc(update device(nt->_vdata[:nt->_nvdata) if nt->_nvdata) + nrn_pragma_omp(target update tp(nt->_vdata[:nt->_nvdata) if (nt->_nvdata)) */ } } @@ -916,22 +1039,22 @@ void update_weights_from_gpu(NrnThread* threads, int nthreads) { /** Cleanup device memory that is being tracked by the OpenACC runtime. * - * This function painstakingly calls `acc_delete` in reverse order on all - * pointers that were passed to `acc_copyin` in `setup_nrnthreads_on_device`. + * This function painstakingly calls `cnrn_target_delete` in reverse order on all + * pointers that were passed to `cnrn_gpu_copyin` in `setup_nrnthreads_on_device`. * This cleanup ensures that if the GPU is initialised multiple times from the * same process then the OpenACC runtime will not be polluted with old * pointers, which can cause errors. In particular if we do: * @code * { * // ... 
some_ptr is dynamically allocated ... - * acc_copyin(some_ptr, some_size); + * cnrn_gpu_copyin(some_ptr, some_size); * // ... do some work ... - * // acc_delete(some_ptr); + * // cnrn_target_delete(some_ptr); * free(some_ptr); * } * { * // ... same_ptr_again is dynamically allocated at the same address ... - * acc_copyin(same_ptr_again, some_other_size); // ERROR + * cnrn_gpu_copyin(same_ptr_again, some_other_size); // ERROR * } * @endcode * the application will/may abort with an error such as: @@ -948,73 +1071,73 @@ void delete_nrnthreads_on_device(NrnThread* threads, int nthreads) { if (tr) { if (tr->varrays) { for (int i = 0; i < tr->n_trajec; ++i) { - acc_delete(tr->varrays[i], tr->bsize * sizeof(double)); + cnrn_target_delete(tr->varrays[i], tr->bsize * sizeof(double)); } - acc_delete(tr->varrays, sizeof(double*) * tr->n_trajec); + cnrn_target_delete(tr->varrays, sizeof(double*) * tr->n_trajec); } - acc_delete(tr->gather, sizeof(double*) * tr->n_trajec); - acc_delete(tr, sizeof(TrajectoryRequests)); + cnrn_target_delete(tr->gather, sizeof(double*) * tr->n_trajec); + cnrn_target_delete(tr, sizeof(TrajectoryRequests)); } } if (nt->_permute) { if (interleave_permute_type == 1) { InterleaveInfo* info = interleave_info + i; - acc_delete(info->cellsize, sizeof(int) * nt->ncell); - acc_delete(info->lastnode, sizeof(int) * nt->ncell); - acc_delete(info->firstnode, sizeof(int) * nt->ncell); - acc_delete(info->stride, sizeof(int) * (info->nstride + 1)); - acc_delete(info, sizeof(InterleaveInfo)); + cnrn_target_delete(info->cellsize, sizeof(int) * nt->ncell); + cnrn_target_delete(info->lastnode, sizeof(int) * nt->ncell); + cnrn_target_delete(info->firstnode, sizeof(int) * nt->ncell); + cnrn_target_delete(info->stride, sizeof(int) * (info->nstride + 1)); + cnrn_target_delete(info, sizeof(InterleaveInfo)); } else if (interleave_permute_type == 2) { InterleaveInfo* info = interleave_info + i; - acc_delete(info->cellsize, sizeof(int) * info->nwarp); - acc_delete(info->stridedispl, sizeof(int) * (info->nwarp + 1)); - acc_delete(info->lastnode, sizeof(int) * (info->nwarp + 1)); - acc_delete(info->firstnode, sizeof(int) * (info->nwarp + 1)); - acc_delete(info->stride, sizeof(int) * info->nstride); - acc_delete(info, sizeof(InterleaveInfo)); + cnrn_target_delete(info->cellsize, sizeof(int) * info->nwarp); + cnrn_target_delete(info->stridedispl, sizeof(int) * (info->nwarp + 1)); + cnrn_target_delete(info->lastnode, sizeof(int) * (info->nwarp + 1)); + cnrn_target_delete(info->firstnode, sizeof(int) * (info->nwarp + 1)); + cnrn_target_delete(info->stride, sizeof(int) * info->nstride); + cnrn_target_delete(info, sizeof(InterleaveInfo)); } } if (nt->n_vecplay) { nrn_VecPlay_delete_from_device(nt); - acc_delete(nt->_vecplay, sizeof(void*) * nt->n_vecplay); + cnrn_target_delete(nt->_vecplay, sizeof(void*) * nt->n_vecplay); } // Cleanup send_receive buffer. if (nt->_net_send_buffer_size) { - acc_delete(nt->_net_send_buffer, sizeof(int) * nt->_net_send_buffer_size); + cnrn_target_delete(nt->_net_send_buffer, sizeof(int) * nt->_net_send_buffer_size); } if (nt->n_presyn) { - acc_delete(nt->presyns, sizeof(PreSyn) * nt->n_presyn); - acc_delete(nt->presyns_helper, sizeof(PreSynHelper) * nt->n_presyn); + cnrn_target_delete(nt->presyns, sizeof(PreSyn) * nt->n_presyn); + cnrn_target_delete(nt->presyns_helper, sizeof(PreSynHelper) * nt->n_presyn); } // Cleanup data that's setup in bbcore_read. 
if (nt->_nvdata) { - acc_delete(nt->_vdata, sizeof(void*) * nt->_nvdata); + cnrn_target_delete(nt->_vdata, sizeof(void*) * nt->_nvdata); } // Cleanup weight vector used in NET_RECEIVE if (nt->n_weight) { - acc_delete(nt->weights, sizeof(double) * nt->n_weight); + cnrn_target_delete(nt->weights, sizeof(double) * nt->n_weight); } // Cleanup point processes if (nt->n_pntproc) { - acc_delete(nt->pntprocs, nt->n_pntproc * sizeof(Point_process)); + cnrn_target_delete(nt->pntprocs, nt->n_pntproc * sizeof(Point_process)); } if (nt->nrn_fast_imem) { - acc_delete(nt->nrn_fast_imem->nrn_sav_d, nt->end * sizeof(double)); - acc_delete(nt->nrn_fast_imem->nrn_sav_rhs, nt->end * sizeof(double)); - acc_delete(nt->nrn_fast_imem, sizeof(NrnFastImem)); + cnrn_target_delete(nt->nrn_fast_imem->nrn_sav_d, nt->end * sizeof(double)); + cnrn_target_delete(nt->nrn_fast_imem->nrn_sav_rhs, nt->end * sizeof(double)); + cnrn_target_delete(nt->nrn_fast_imem, sizeof(NrnFastImem)); } if (nt->shadow_rhs_cnt) { int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); - acc_delete(nt->_shadow_d, pcnt * sizeof(double)); - acc_delete(nt->_shadow_rhs, pcnt * sizeof(double)); + cnrn_target_delete(nt->_shadow_d, pcnt * sizeof(double)); + cnrn_target_delete(nt->_shadow_rhs, pcnt * sizeof(double)); } for (auto tml = nt->tml; tml; tml = tml->next) { @@ -1022,26 +1145,26 @@ void delete_nrnthreads_on_device(NrnThread* threads, int nthreads) { { NetSendBuffer_t* nsb{tml->ml->_net_send_buffer}; if (nsb) { - acc_delete(nsb->_nsb_flag, sizeof(double) * nsb->_size); - acc_delete(nsb->_nsb_t, sizeof(double) * nsb->_size); - acc_delete(nsb->_weight_index, sizeof(int) * nsb->_size); - acc_delete(nsb->_pnt_index, sizeof(int) * nsb->_size); - acc_delete(nsb->_vdata_index, sizeof(int) * nsb->_size); - acc_delete(nsb->_sendtype, sizeof(int) * nsb->_size); - acc_delete(nsb, sizeof(NetSendBuffer_t)); + cnrn_target_delete(nsb->_nsb_flag, sizeof(double) * nsb->_size); + cnrn_target_delete(nsb->_nsb_t, sizeof(double) * nsb->_size); + cnrn_target_delete(nsb->_weight_index, sizeof(int) * nsb->_size); + cnrn_target_delete(nsb->_pnt_index, sizeof(int) * nsb->_size); + cnrn_target_delete(nsb->_vdata_index, sizeof(int) * nsb->_size); + cnrn_target_delete(nsb->_sendtype, sizeof(int) * nsb->_size); + cnrn_target_delete(nsb, sizeof(NetSendBuffer_t)); } } // Cleanup the net receive buffer if it exists. 
{ NetReceiveBuffer_t* nrb{tml->ml->_net_receive_buffer}; if (nrb) { - acc_delete(nrb->_nrb_index, sizeof(int) * nrb->_size); - acc_delete(nrb->_displ, sizeof(int) * (nrb->_size + 1)); - acc_delete(nrb->_nrb_flag, sizeof(double) * nrb->_size); - acc_delete(nrb->_nrb_t, sizeof(double) * nrb->_size); - acc_delete(nrb->_weight_index, sizeof(int) * nrb->_size); - acc_delete(nrb->_pnt_index, sizeof(int) * nrb->_size); - acc_delete(nrb, sizeof(NetReceiveBuffer_t)); + cnrn_target_delete(nrb->_nrb_index, sizeof(int) * nrb->_size); + cnrn_target_delete(nrb->_displ, sizeof(int) * (nrb->_size + 1)); + cnrn_target_delete(nrb->_nrb_flag, sizeof(double) * nrb->_size); + cnrn_target_delete(nrb->_nrb_t, sizeof(double) * nrb->_size); + cnrn_target_delete(nrb->_weight_index, sizeof(int) * nrb->_size); + cnrn_target_delete(nrb->_pnt_index, sizeof(int) * nrb->_size); + cnrn_target_delete(nrb, sizeof(NetReceiveBuffer_t)); } } int type = tml->index; @@ -1050,23 +1173,23 @@ void delete_nrnthreads_on_device(NrnThread* threads, int nthreads) { int is_art = corenrn.get_is_artificial()[type]; int ts = corenrn.get_memb_funcs()[type].thread_size_; if (ts) { - acc_delete(tml->ml->_thread, ts * sizeof(ThreadDatum)); + cnrn_target_delete(tml->ml->_thread, ts * sizeof(ThreadDatum)); } if (szdp) { int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; - acc_delete(tml->ml->pdata, sizeof(int) * pcnt); + cnrn_target_delete(tml->ml->pdata, sizeof(int) * pcnt); } if (!is_art) { - acc_delete(tml->ml->nodeindices, sizeof(int) * n); + cnrn_target_delete(tml->ml->nodeindices, sizeof(int) * n); } - acc_delete(tml->ml, sizeof(Memb_list)); - acc_delete(tml, sizeof(NrnThreadMembList)); + cnrn_target_delete(tml->ml, sizeof(Memb_list)); + cnrn_target_delete(tml, sizeof(NrnThreadMembList)); } - acc_delete(nt->_ml_list, corenrn.get_memb_funcs().size() * sizeof(Memb_list*)); - acc_delete(nt->_v_parent_index, nt->end * sizeof(int)); - acc_delete(nt->_data, nt->_ndata * sizeof(double)); + cnrn_target_delete(nt->_ml_list, corenrn.get_memb_funcs().size() * sizeof(Memb_list*)); + cnrn_target_delete(nt->_v_parent_index, nt->end * sizeof(int)); + cnrn_target_delete(nt->_data, nt->_ndata * sizeof(double)); } - acc_delete(threads, sizeof(NrnThread) * nthreads); + cnrn_target_delete(threads, sizeof(NrnThread) * nthreads); nrn_ion_global_map_delete_from_device(); #endif } @@ -1082,34 +1205,34 @@ void nrn_newtonspace_copyto_device(NewtonSpace* ns) { int n = ns->n * ns->n_instance; // actually, the values of double do not matter, only the pointers. 
- NewtonSpace* d_ns = (NewtonSpace*) acc_copyin(ns, sizeof(NewtonSpace)); + NewtonSpace* d_ns = (NewtonSpace*) cnrn_gpu_copyin(ns, sizeof(NewtonSpace)); double* pd; - pd = (double*) acc_copyin(ns->delta_x, n * sizeof(double)); - acc_memcpy_to_device(&(d_ns->delta_x), &pd, sizeof(double*)); + pd = (double*) cnrn_gpu_copyin(ns->delta_x, n * sizeof(double)); + cnrn_memcpy_to_device(&(d_ns->delta_x), &pd, sizeof(double*)); - pd = (double*) acc_copyin(ns->high_value, n * sizeof(double)); - acc_memcpy_to_device(&(d_ns->high_value), &pd, sizeof(double*)); + pd = (double*) cnrn_gpu_copyin(ns->high_value, n * sizeof(double)); + cnrn_memcpy_to_device(&(d_ns->high_value), &pd, sizeof(double*)); - pd = (double*) acc_copyin(ns->low_value, n * sizeof(double)); - acc_memcpy_to_device(&(d_ns->low_value), &pd, sizeof(double*)); + pd = (double*) cnrn_gpu_copyin(ns->low_value, n * sizeof(double)); + cnrn_memcpy_to_device(&(d_ns->low_value), &pd, sizeof(double*)); - pd = (double*) acc_copyin(ns->rowmax, n * sizeof(double)); - acc_memcpy_to_device(&(d_ns->rowmax), &pd, sizeof(double*)); + pd = (double*) cnrn_gpu_copyin(ns->rowmax, n * sizeof(double)); + cnrn_memcpy_to_device(&(d_ns->rowmax), &pd, sizeof(double*)); - auto pint = (int*) acc_copyin(ns->perm, n * sizeof(int)); - acc_memcpy_to_device(&(d_ns->perm), &pint, sizeof(int*)); + auto pint = (int*) cnrn_gpu_copyin(ns->perm, n * sizeof(int)); + cnrn_memcpy_to_device(&(d_ns->perm), &pint, sizeof(int*)); - auto ppd = (double**) acc_copyin(ns->jacobian, ns->n * sizeof(double*)); - acc_memcpy_to_device(&(d_ns->jacobian), &ppd, sizeof(double**)); + auto ppd = (double**) cnrn_gpu_copyin(ns->jacobian, ns->n * sizeof(double*)); + cnrn_memcpy_to_device(&(d_ns->jacobian), &ppd, sizeof(double**)); // the actual jacobian doubles were allocated as a single array - double* d_jacdat = (double*) acc_copyin(ns->jacobian[0], ns->n * n * sizeof(double)); + double* d_jacdat = (double*) cnrn_gpu_copyin(ns->jacobian[0], ns->n * n * sizeof(double)); for (int i = 0; i < ns->n; ++i) { pd = d_jacdat + i * n; - acc_memcpy_to_device(&(ppd[i]), &pd, sizeof(double*)); + cnrn_memcpy_to_device(&(ppd[i]), &pd, sizeof(double*)); } #endif } @@ -1122,14 +1245,14 @@ void nrn_newtonspace_delete_from_device(NewtonSpace* ns) { return; } int n = ns->n * ns->n_instance; - acc_delete(ns->jacobian[0], ns->n * n * sizeof(double)); - acc_delete(ns->jacobian, ns->n * sizeof(double*)); - acc_delete(ns->perm, n * sizeof(int)); - acc_delete(ns->rowmax, n * sizeof(double)); - acc_delete(ns->low_value, n * sizeof(double)); - acc_delete(ns->high_value, n * sizeof(double)); - acc_delete(ns->delta_x, n * sizeof(double)); - acc_delete(ns, sizeof(NewtonSpace)); + cnrn_target_delete(ns->jacobian[0], ns->n * n * sizeof(double)); + cnrn_target_delete(ns->jacobian, ns->n * sizeof(double*)); + cnrn_target_delete(ns->perm, n * sizeof(int)); + cnrn_target_delete(ns->rowmax, n * sizeof(double)); + cnrn_target_delete(ns->low_value, n * sizeof(double)); + cnrn_target_delete(ns->high_value, n * sizeof(double)); + cnrn_target_delete(ns->delta_x, n * sizeof(double)); + cnrn_target_delete(ns, sizeof(NewtonSpace)); #endif } @@ -1142,76 +1265,76 @@ void nrn_sparseobj_copyto_device(SparseObj* so) { } unsigned n1 = so->neqn + 1; - SparseObj* d_so = (SparseObj*) acc_copyin(so, sizeof(SparseObj)); + SparseObj* d_so = (SparseObj*) cnrn_gpu_copyin(so, sizeof(SparseObj)); // only pointer fields in SparseObj that need setting up are // rowst, diag, rhs, ngetcall, coef_list // only pointer fields in Elm that need setting up are // 
r_down, c_right, value // do not care about the Elm* ptr value, just the space. - Elm** d_rowst = (Elm**) acc_copyin(so->rowst, n1 * sizeof(Elm*)); - acc_memcpy_to_device(&(d_so->rowst), &d_rowst, sizeof(Elm**)); + Elm** d_rowst = (Elm**) cnrn_gpu_copyin(so->rowst, n1 * sizeof(Elm*)); + cnrn_memcpy_to_device(&(d_so->rowst), &d_rowst, sizeof(Elm**)); - Elm** d_diag = (Elm**) acc_copyin(so->diag, n1 * sizeof(Elm*)); - acc_memcpy_to_device(&(d_so->diag), &d_diag, sizeof(Elm**)); + Elm** d_diag = (Elm**) cnrn_gpu_copyin(so->diag, n1 * sizeof(Elm*)); + cnrn_memcpy_to_device(&(d_so->diag), &d_diag, sizeof(Elm**)); - auto pu = (unsigned*) acc_copyin(so->ngetcall, so->_cntml_padded * sizeof(unsigned)); - acc_memcpy_to_device(&(d_so->ngetcall), &pu, sizeof(Elm**)); + auto pu = (unsigned*) cnrn_gpu_copyin(so->ngetcall, so->_cntml_padded * sizeof(unsigned)); + cnrn_memcpy_to_device(&(d_so->ngetcall), &pu, sizeof(Elm**)); - auto pd = (double*) acc_copyin(so->rhs, n1 * so->_cntml_padded * sizeof(double)); - acc_memcpy_to_device(&(d_so->rhs), &pd, sizeof(double*)); + auto pd = (double*) cnrn_gpu_copyin(so->rhs, n1 * so->_cntml_padded * sizeof(double)); + cnrn_memcpy_to_device(&(d_so->rhs), &pd, sizeof(double*)); - auto d_coef_list = (double**) acc_copyin(so->coef_list, so->coef_list_size * sizeof(double*)); - acc_memcpy_to_device(&(d_so->coef_list), &d_coef_list, sizeof(double**)); + auto d_coef_list = (double**) cnrn_gpu_copyin(so->coef_list, so->coef_list_size * sizeof(double*)); + cnrn_memcpy_to_device(&(d_so->coef_list), &d_coef_list, sizeof(double**)); // Fill in relevant Elm pointer values for (unsigned irow = 1; irow < n1; ++irow) { for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) { - Elm* pelm = (Elm*) acc_copyin(elm, sizeof(Elm)); + Elm* pelm = (Elm*) cnrn_gpu_copyin(elm, sizeof(Elm)); if (elm == so->rowst[irow]) { - acc_memcpy_to_device(&(d_rowst[irow]), &pelm, sizeof(Elm*)); + cnrn_memcpy_to_device(&(d_rowst[irow]), &pelm, sizeof(Elm*)); } else { - Elm* d_e = (Elm*) acc_deviceptr(elm->c_left); - acc_memcpy_to_device(&(pelm->c_left), &d_e, sizeof(Elm*)); + Elm* d_e = (Elm*) cnrn_target_deviceptr(elm->c_left); + cnrn_memcpy_to_device(&(pelm->c_left), &d_e, sizeof(Elm*)); } if (elm->col == elm->row) { - acc_memcpy_to_device(&(d_diag[irow]), &pelm, sizeof(Elm*)); + cnrn_memcpy_to_device(&(d_diag[irow]), &pelm, sizeof(Elm*)); } if (irow > 1) { if (elm->r_up) { - Elm* d_e = (Elm*) acc_deviceptr(elm->r_up); - acc_memcpy_to_device(&(pelm->r_up), &d_e, sizeof(Elm*)); + Elm* d_e = (Elm*) cnrn_target_deviceptr(elm->r_up); + cnrn_memcpy_to_device(&(pelm->r_up), &d_e, sizeof(Elm*)); } } - pd = (double*) acc_copyin(elm->value, so->_cntml_padded * sizeof(double)); - acc_memcpy_to_device(&(pelm->value), &pd, sizeof(double*)); + pd = (double*) cnrn_gpu_copyin(elm->value, so->_cntml_padded * sizeof(double)); + cnrn_memcpy_to_device(&(pelm->value), &pd, sizeof(double*)); } } // visit all the Elm again and fill in pelm->r_down and pelm->c_left for (unsigned irow = 1; irow < n1; ++irow) { for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) { - auto pelm = (Elm*) acc_deviceptr(elm); + auto pelm = (Elm*) cnrn_target_deviceptr(elm); if (elm->r_down) { - auto d_e = (Elm*) acc_deviceptr(elm->r_down); - acc_memcpy_to_device(&(pelm->r_down), &d_e, sizeof(Elm*)); + auto d_e = (Elm*) cnrn_target_deviceptr(elm->r_down); + cnrn_memcpy_to_device(&(pelm->r_down), &d_e, sizeof(Elm*)); } if (elm->c_right) { - auto d_e = (Elm*) acc_deviceptr(elm->c_right); - acc_memcpy_to_device(&(pelm->c_right), &d_e, 
sizeof(Elm*)); + auto d_e = (Elm*) cnrn_target_deviceptr(elm->c_right); + cnrn_memcpy_to_device(&(pelm->c_right), &d_e, sizeof(Elm*)); } } } // Fill in the d_so->coef_list for (unsigned i = 0; i < so->coef_list_size; ++i) { - pd = (double*) acc_deviceptr(so->coef_list[i]); - acc_memcpy_to_device(&(d_coef_list[i]), &pd, sizeof(double*)); + pd = (double*) cnrn_target_deviceptr(so->coef_list[i]); + cnrn_memcpy_to_device(&(d_coef_list[i]), &pd, sizeof(double*)); } #endif } @@ -1226,16 +1349,16 @@ void nrn_sparseobj_delete_from_device(SparseObj* so) { unsigned n1 = so->neqn + 1; for (unsigned irow = 1; irow < n1; ++irow) { for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) { - acc_delete(elm->value, so->_cntml_padded * sizeof(double)); - acc_delete(elm, sizeof(Elm)); + cnrn_target_delete(elm->value, so->_cntml_padded * sizeof(double)); + cnrn_target_delete(elm, sizeof(Elm)); } } - acc_delete(so->coef_list, so->coef_list_size * sizeof(double*)); - acc_delete(so->rhs, n1 * so->_cntml_padded * sizeof(double)); - acc_delete(so->ngetcall, so->_cntml_padded * sizeof(unsigned)); - acc_delete(so->diag, n1 * sizeof(Elm*)); - acc_delete(so->rowst, n1 * sizeof(Elm*)); - acc_delete(so, sizeof(SparseObj)); + cnrn_target_delete(so->coef_list, so->coef_list_size * sizeof(double*)); + cnrn_target_delete(so->rhs, n1 * so->_cntml_padded * sizeof(double)); + cnrn_target_delete(so->ngetcall, so->_cntml_padded * sizeof(unsigned)); + cnrn_target_delete(so->diag, n1 * sizeof(Elm*)); + cnrn_target_delete(so->rowst, n1 * sizeof(Elm*)); + cnrn_target_delete(so, sizeof(SparseObj)); #endif } @@ -1243,14 +1366,14 @@ void nrn_sparseobj_delete_from_device(SparseObj* so) { void nrn_ion_global_map_copyto_device() { if (nrn_ion_global_map_size) { - double** d_data = (double**) acc_copyin(nrn_ion_global_map, + double** d_data = (double**) cnrn_gpu_copyin(nrn_ion_global_map, sizeof(double*) * nrn_ion_global_map_size); for (int j = 0; j < nrn_ion_global_map_size; j++) { if (nrn_ion_global_map[j]) { - double* d_mechmap = (double*) acc_copyin(nrn_ion_global_map[j], + double* d_mechmap = (double*) cnrn_gpu_copyin(nrn_ion_global_map[j], ion_global_map_member_size * sizeof(double)); - acc_memcpy_to_device(&(d_data[j]), &d_mechmap, sizeof(double*)); + cnrn_memcpy_to_device(&(d_data[j]), &d_mechmap, sizeof(double*)); } } } @@ -1259,11 +1382,11 @@ void nrn_ion_global_map_copyto_device() { void nrn_ion_global_map_delete_from_device() { for (int j = 0; j < nrn_ion_global_map_size; j++) { if (nrn_ion_global_map[j]) { - acc_delete(nrn_ion_global_map[j], ion_global_map_member_size * sizeof(double)); + cnrn_target_delete(nrn_ion_global_map[j], ion_global_map_member_size * sizeof(double)); } } if (nrn_ion_global_map_size) { - acc_delete(nrn_ion_global_map, sizeof(double*) * nrn_ion_global_map_size); + cnrn_target_delete(nrn_ion_global_map, sizeof(double*) * nrn_ion_global_map_size); } } @@ -1317,8 +1440,8 @@ void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay) { VecPlayContinuous* vecplay_instance = (VecPlayContinuous*) nt->_vecplay[i]; /** just VecPlayContinuous object */ - void* d_p = (void*) acc_copyin(vecplay_instance, sizeof(VecPlayContinuous)); - acc_memcpy_to_device(&(d_vecplay[i]), &d_p, sizeof(void*)); + void* d_p = (void*) cnrn_gpu_copyin(vecplay_instance, sizeof(VecPlayContinuous)); + cnrn_memcpy_to_device(&(d_vecplay[i]), &d_p, sizeof(void*)); VecPlayContinuous* d_vecplay_instance = (VecPlayContinuous*) d_p; @@ -1327,32 +1450,33 @@ void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay) { 
copy_ivoc_vect_to_device(vecplay_instance->t_, d_vecplay_instance->t_); if (vecplay_instance->discon_indices_) { copy_ivoc_vect_to_device(*(vecplay_instance->discon_indices_), - *(d_vecplay_instance->discon_indices_)); + *(d_vecplay_instance->discon_indices_), + true); } /** copy PlayRecordEvent : todo: verify this */ - PlayRecordEvent* d_e_ = (PlayRecordEvent*) acc_copyin(vecplay_instance->e_, + PlayRecordEvent* d_e_ = (PlayRecordEvent*) cnrn_gpu_copyin(vecplay_instance->e_, sizeof(PlayRecordEvent)); - acc_memcpy_to_device(&(d_e_->plr_), &d_vecplay_instance, sizeof(VecPlayContinuous*)); - acc_memcpy_to_device(&(d_vecplay_instance->e_), &d_e_, sizeof(PlayRecordEvent*)); + cnrn_memcpy_to_device(&(d_e_->plr_), &d_vecplay_instance, sizeof(VecPlayContinuous*)); + cnrn_memcpy_to_device(&(d_vecplay_instance->e_), &d_e_, sizeof(PlayRecordEvent*)); /** copy pd_ : note that it's pointer inside ml->data and hence data itself is * already on GPU */ - double* d_pd_ = (double*) acc_deviceptr(vecplay_instance->pd_); - acc_memcpy_to_device(&(d_vecplay_instance->pd_), &d_pd_, sizeof(double*)); + double* d_pd_ = (double*) cnrn_target_deviceptr(vecplay_instance->pd_); + cnrn_memcpy_to_device(&(d_vecplay_instance->pd_), &d_pd_, sizeof(double*)); } } void nrn_VecPlay_delete_from_device(NrnThread* nt) { for (int i = 0; i < nt->n_vecplay; i++) { auto* vecplay_instance = reinterpret_cast(nt->_vecplay[i]); - acc_delete(vecplay_instance->e_, sizeof(PlayRecordEvent)); + cnrn_target_delete(vecplay_instance->e_, sizeof(PlayRecordEvent)); if (vecplay_instance->discon_indices_) { delete_ivoc_vect_from_device(*(vecplay_instance->discon_indices_)); } delete_ivoc_vect_from_device(vecplay_instance->t_); delete_ivoc_vect_from_device(vecplay_instance->y_); - acc_delete(vecplay_instance, sizeof(VecPlayContinuous)); + cnrn_target_delete(vecplay_instance, sizeof(VecPlayContinuous)); } } diff --git a/coreneuron/utils/vrecord.cpp b/coreneuron/utils/vrecord.cpp index 8af2b028e..a972e754a 100644 --- a/coreneuron/utils/vrecord.cpp +++ b/coreneuron/utils/vrecord.cpp @@ -78,7 +78,8 @@ void VecPlayContinuous::deliver(double tt, NetCvode* ns) { last_index_ = ubound_index_; // clang-format off - #pragma acc update device(last_index_) if (nt->compute_gpu) + nrn_pragma_acc(update device(last_index_) if (nt->compute_gpu)) + nrn_pragma_omp(target update to(last_index_) if (nt->compute_gpu)) // clang-format on if (discon_indices_) { if (discon_index_ < discon_indices_->size()) { @@ -96,7 +97,8 @@ void VecPlayContinuous::deliver(double tt, NetCvode* ns) { } // clang-format off - #pragma acc update device(ubound_index_) if (nt->compute_gpu) + nrn_pragma_acc(update device(ubound_index_) if (nt->compute_gpu)) + nrn_pragma_omp(target update to(ubound_index_) if (nt->compute_gpu)) // clang-format on continuous(tt); } @@ -105,7 +107,8 @@ void VecPlayContinuous::continuous(double tt) { NrnThread* nt = nrn_threads + ith_; // clang-format off - #pragma acc kernels present(this) if(nt->compute_gpu) + nrn_pragma_acc(kernels present(this) if(nt->compute_gpu)) + nrn_pragma_omp(target if(nt->compute_gpu)) { *pd_ = interpolate(tt); } From 57f77244fd91b7d05313588a38587c7b75327efa Mon Sep 17 00:00:00 2001 From: Christos Kotsalos Date: Fri, 10 Dec 2021 16:22:55 +0100 Subject: [PATCH 11/31] small openacc fixes (#707) --- coreneuron/gpu/nrn_acc_manager.cpp | 55 +++++++++++++++--------------- 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp index 089b90848..373fcdbc3 100644 --- 
a/coreneuron/gpu/nrn_acc_manager.cpp +++ b/coreneuron/gpu/nrn_acc_manager.cpp @@ -169,7 +169,6 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { /*update d_nt._data to point to device copy */ cnrn_memcpy_to_device(&(d_nt->_data), &d__data, sizeof(double*)); - auto host_id = omp_get_initial_device(); /* -- setup rhs, d, a, b, v, node_aread to point to device copy -- */ double* dptr; @@ -725,7 +724,7 @@ void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb) { nsb->_weight_index[:nsb->_cnt], nsb->_nsb_t[:nsb->_cnt], nsb->_nsb_flag[:nsb->_cnt]) - if nsb->_cnt) + if (nsb->_cnt)) nrn_pragma_omp(target update from( nsb->_sendtype[:nsb->_cnt], nsb->_vdata_index[:nsb->_cnt], @@ -766,7 +765,7 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { nt->_actual_v[:ne], nt->_actual_area[:ne])) - nrn_pragma_acc(update self(nt->_actual_diam[:ne]) if nt->_actual_diam) + nrn_pragma_acc(update self(nt->_actual_diam[:ne]) if (nt->_actual_diam != nullptr)) nrn_pragma_omp(target update from(nt->_actual_diam[:ne]) if (nt->_actual_diam != nullptr)) /* @todo: nt._ml_list[tml->index] = tml->ml; */ @@ -775,8 +774,8 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { for (auto tml = nt->tml; tml; tml = tml->next) { Memb_list* ml = tml->ml; - nrn_pragma_acc(update self(&tml->index, - &ml->nodecount)) + nrn_pragma_acc(update self(tml->index, + ml->nodecount)) nrn_pragma_omp(target update from(tml->index, ml->nodecount)) @@ -801,22 +800,22 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { ml->nodeindices[:n])) int dpcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; - nrn_pragma_acc(update self(ml->pdata[:dpcnt]) if szdp) + nrn_pragma_acc(update self(ml->pdata[:dpcnt]) if (szdp)) nrn_pragma_omp(target update from(ml->pdata[:dpcnt]) if (szdp)) auto nrb = tml->ml->_net_receive_buffer; nrn_pragma_acc(update self( - &nrb->_cnt, - &nrb->_size, - &nrb->_pnt_offset, - &nrb->_displ_cnt, + nrb->_cnt, + nrb->_size, + nrb->_pnt_offset, + nrb->_displ_cnt, nrb->_pnt_index[:nrb->_size], nrb->_weight_index[:nrb->_size], nrb->_displ[:nrb->_size + 1], nrb->_nrb_index[:nrb->_size]) - if nrb) + if (nrb != nullptr)) nrn_pragma_omp(target update from( nrb->_cnt, nrb->_size, @@ -835,28 +834,28 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { /* copy shadow_d to host */ nrn_pragma_acc(update self(nt->_shadow_rhs[:pcnt], nt->_shadow_d[:pcnt]) - if nt->shadow_rhs_cnt) + if (nt->shadow_rhs_cnt)) nrn_pragma_omp(target update from(nt->_shadow_rhs[:pcnt], nt->_shadow_d[:pcnt]) if (nt->shadow_rhs_cnt)) nrn_pragma_acc(update self(nt->nrn_fast_imem->nrn_sav_rhs[:nt->end], nt->nrn_fast_imem->nrn_sav_d[:nt->end]) - if nt->nrn_fast_imem) + if (nt->nrn_fast_imem != nullptr)) nrn_pragma_omp(target update from(nt->nrn_fast_imem->nrn_sav_rhs[:nt->end], nt->nrn_fast_imem->nrn_sav_d[:nt->end]) if (nt->nrn_fast_imem != nullptr)) - nrn_pragma_acc(update self(nt->pntprocs[:nt->n_pntproc]) if nt->n_pntproc) + nrn_pragma_acc(update self(nt->pntprocs[:nt->n_pntproc]) if (nt->n_pntproc)) nrn_pragma_omp(target update from(nt->pntprocs[:nt->n_pntproc]) if (nt->n_pntproc)) - nrn_pragma_acc(update self(nt->weights[:nt->n_weight]) if nt->n_weight) + nrn_pragma_acc(update self(nt->weights[:nt->n_weight]) if (nt->n_weight)) nrn_pragma_omp(target update from(nt->weights[:nt->n_weight]) if (nt->n_weight)) nrn_pragma_acc(update self( nt->presyns_helper[:nt->n_presyn], nt->presyns[:nt->n_presyn]) - if nt->n_presyn) + if (nt->n_presyn)) nrn_pragma_omp(target update 
from( nt->presyns_helper[:nt->n_presyn], nt->presyns[:nt->n_presyn]) @@ -914,7 +913,7 @@ void update_nrnthreads_on_device(NrnThread* threads, int nthreads) { nt->_actual_v[:ne], nt->_actual_area[:ne])) - nrn_pragma_acc(update device(nt->_actual_diam[:ne]) if nt->_actual_diam) + nrn_pragma_acc(update device(nt->_actual_diam[:ne]) if (nt->_actual_diam != nullptr)) nrn_pragma_omp(target update to(nt->_actual_diam[:ne]) if (nt->_actual_diam != nullptr)) /* @todo: nt._ml_list[tml->index] = tml->ml; */ @@ -937,19 +936,19 @@ void update_nrnthreads_on_device(NrnThread* threads, int nthreads) { nrn_pragma_omp(target update to(ml->nodeindices[:n]) if (!corenrn.get_is_artificial()[type])) int dpcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; - nrn_pragma_acc(update device(ml->pdata[:dpcnt]) if szdp) + nrn_pragma_acc(update device(ml->pdata[:dpcnt]) if (szdp)) nrn_pragma_omp(target update to(ml->pdata[:dpcnt]) if (szdp)) auto nrb = tml->ml->_net_receive_buffer; - nrn_pragma_acc(update device(&nrb->_cnt, - &nrb->_size, - &nrb->_pnt_offset, - &nrb->_displ_cnt, + nrn_pragma_acc(update device(nrb->_cnt, + nrb->_size, + nrb->_pnt_offset, + nrb->_displ_cnt, nrb->_pnt_index[:nrb->_size], nrb->_weight_index[:nrb->_size], nrb->_displ[:nrb->_size], nrb->_nrb_index[:nrb->_size]) - if nrb) + if (nrb != nullptr)) nrn_pragma_omp(target update to(nrb->_cnt, nrb->_size, nrb->_pnt_offset, @@ -965,7 +964,7 @@ void update_nrnthreads_on_device(NrnThread* threads, int nthreads) { nrn_pragma_acc(update device(nt->_shadow_rhs[:pcnt], /* copy shadow_d to host */ nt->_shadow_d[:pcnt]) - if nt->shadow_rhs_cnt) + if (nt->shadow_rhs_cnt)) nrn_pragma_omp(target update to(nt->_shadow_rhs[:pcnt], /* copy shadow_d to host */ nt->_shadow_d[:pcnt]) @@ -974,22 +973,22 @@ void update_nrnthreads_on_device(NrnThread* threads, int nthreads) { nrn_pragma_acc(update device(nt->nrn_fast_imem->nrn_sav_rhs[:nt->end], nt->nrn_fast_imem->nrn_sav_d[:nt->end]) - if nt->nrn_fast_imem) + if (nt->nrn_fast_imem != nullptr)) nrn_pragma_omp(target update to(nt->nrn_fast_imem->nrn_sav_rhs[:nt->end], nt->nrn_fast_imem->nrn_sav_d[:nt->end]) if (nt->nrn_fast_imem != nullptr)) nrn_pragma_acc(update device(nt->pntprocs[:nt->n_pntproc]) - if nt->n_pntproc) + if (nt->n_pntproc)) nrn_pragma_omp(target update to(nt->pntprocs[:nt->n_pntproc]) if (nt->n_pntproc)) - nrn_pragma_acc(update device(nt->weights[:nt->n_weight]) if nt->n_weight) + nrn_pragma_acc(update device(nt->weights[:nt->n_weight]) if (nt->n_weight)) nrn_pragma_omp(target update to(nt->weights[:nt->n_weight]) if (nt->n_weight)) nrn_pragma_acc(update device(nt->presyns_helper[:nt->n_presyn], nt->presyns[:nt->n_presyn]) - if nt->n_presyn) + if (nt->n_presyn)) nrn_pragma_omp(target update to(nt->presyns_helper[:nt->n_presyn], nt->presyns[:nt->n_presyn]) if (nt->n_presyn)) From 56889cccaafedfffe3948cb9e721a2b66a1bd14f Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Mon, 13 Dec 2021 11:33:51 +0100 Subject: [PATCH 12/31] Fixup to make the CI work better while finalising hackathon changes. 
---
 coreneuron/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt
index 2308ab99a..437eb8ea7 100644
--- a/coreneuron/CMakeLists.txt
+++ b/coreneuron/CMakeLists.txt
@@ -303,7 +303,7 @@ set_target_properties(
 # create special-core with halfgap.mod for tests
 # =============================================================================
 set(all_output_binaries)
-if(NOT ${CORENRN_EXTERNAL_BENCHMARK_DATA} STREQUAL "")
+if(EXISTS "${CORENRN_EXTERNAL_BENCHMARK_DATA}")
   # Hack for the december 2021 hackathon, build an extra special-core with channel-benchmark
   # mechanisms.
   set(modfile_directory

From 01a39d7d12f14a737b1e37cdecca1deadd21b102 Mon Sep 17 00:00:00 2001
From: Christos Kotsalos
Date: Mon, 13 Dec 2021 13:45:59 +0100
Subject: [PATCH 13/31] solve_interleaved2_launcher (CUDA interface) : fixing size of blocksPerGrid & threadsPerBlock (#710)

---
 coreneuron/permute/cellorder.cu | 35 ++++++++++++++++++++-------------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/coreneuron/permute/cellorder.cu b/coreneuron/permute/cellorder.cu
index 82198410f..1226b4bf7 100644
--- a/coreneuron/permute/cellorder.cu
+++ b/coreneuron/permute/cellorder.cu
@@ -72,25 +72,32 @@ __global__ void solve_interleaved2_kernel(NrnThread* nt, InterleaveInfo* ii, int
     int* rootbegin = ii->firstnode;  // nwarp+1 of these
     int* nodebegin = ii->lastnode;   // nwarp+1 of these
-    int iwarp = icore / warpsize;     // figure out the >> value
-    int ic = icore & (warpsize - 1);  // figure out the & mask
-    int ncycle = ncycles[iwarp];
-    int* stride = strides + stridedispl[iwarp];
-    int root = rootbegin[iwarp];
-    int lastroot = rootbegin[iwarp + 1];
-    int firstnode = nodebegin[iwarp];
-    int lastnode = nodebegin[iwarp + 1];
-
-    triang_interleaved2_device(nt, ic, ncycle, stride, lastnode);
-    bksub_interleaved2_device(nt, root + ic, lastroot, ic, ncycle, stride, firstnode);
+    while (icore < ncore) {
+        int iwarp = icore / warpsize;     // figure out the >> value
+        int ic = icore & (warpsize - 1);  // figure out the & mask
+        int ncycle = ncycles[iwarp];
+        int* stride = strides + stridedispl[iwarp];
+        int root = rootbegin[iwarp];
+        int lastroot = rootbegin[iwarp + 1];
+        int firstnode = nodebegin[iwarp];
+        int lastnode = nodebegin[iwarp + 1];
+
+        triang_interleaved2_device(nt, ic, ncycle, stride, lastnode);
+        bksub_interleaved2_device(nt, root + ic, lastroot, ic, ncycle, stride, firstnode);
+
+        icore += blockDim.x * gridDim.x;
+    }
 }

 void solve_interleaved2_launcher(NrnThread* nt, InterleaveInfo* info, int ncore, void* stream) {
     auto cuda_stream = static_cast<cudaStream_t>(stream);

-    int threadsPerBlock = warpsize;
-    // TODO: Should blocksPerGrid be a fixed number and have a while block inside the kernel?
-    int blocksPerGrid = (ncore + threadsPerBlock - 1) / threadsPerBlock;
+    // the selection of these parameters has been done after running the channel-benchmark for typical production runs, i.e.
+    // 1 MPI task with 1440 cells & 6 MPI tasks with 8800 cells.
+    // The main idea is to have multiple warps per SM and sufficient blocks to fill the GPU.
+    // In our case, given that multiple threads share the available GPUs, we "guarantee" a sufficient occupancy of the GPUs.
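    // For a sense of scale (illustrative arithmetic only, not figures taken from the
    // benchmark mentioned above): 512 blocks x 128 threads = 65,536 resident CUDA threads,
    // and the grid-stride loop added to the kernel makes each thread process icore,
    // icore + 65536, icore + 2 * 65536, ... A hypothetical ncore of 200,000 therefore
    // costs at most ceil(200000 / 65536) = 4 iterations per thread, while smaller inputs
    // simply leave the surplus threads idle after their first bounds check.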
+    int threadsPerBlock = 128;
+    int blocksPerGrid = 512;

     solve_interleaved2_kernel<<<blocksPerGrid, threadsPerBlock, 0, cuda_stream>>>(nt, info, ncore);

From 0fe815e525d00de617b20e6a6cc9e3213ddbe0b8 Mon Sep 17 00:00:00 2001
From: Nicolas Cornu
Date: Mon, 13 Dec 2021 16:08:08 +0100
Subject: [PATCH 14/31] OpenMP offload: use #pragma instead of runtime API (#708)

* Use #pragma omp instead of runtime API in `cnrn_target_{copyin,delete}`
* Fix `VecPlayContinuous::discon_indices_` device transfer.
* Name `cnrn_target_` wrappers more consistently.

Co-authored-by: Olli Lupton
---
 coreneuron/gpu/nrn_acc_manager.cpp | 611 ++++++++++++++---------------
 1 file changed, 291 insertions(+), 320 deletions(-)

diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
index 373fcdbc3..4fe0004fd 100644
--- a/coreneuron/gpu/nrn_acc_manager.cpp
+++ b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -36,63 +36,55 @@
 #endif
 namespace coreneuron {
 extern InterleaveInfo* interleave_info;
-void copy_ivoc_vect_to_device(const IvocVect& iv, IvocVect& div, bool vector_copy_needed = false);
+void copy_ivoc_vect_to_device(const IvocVect& iv, IvocVect& div);
 void delete_ivoc_vect_from_device(IvocVect&);
 void nrn_ion_global_map_copyto_device();
 void nrn_ion_global_map_delete_from_device();
 void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay);
 void nrn_VecPlay_delete_from_device(NrnThread* nt);

-void* cnrn_gpu_copyin(void* h_ptr, std::size_t len) {
+template <typename T>
+T* cnrn_target_deviceptr(const T* h_ptr) {
 #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
-    return acc_copyin(h_ptr, len);
+    return static_cast<T*>(acc_deviceptr(const_cast<T*>(h_ptr)));
 #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
-    auto host_id = omp_get_initial_device();
-    auto device_id = omp_get_default_device();
-    auto* d_ptr = omp_target_alloc(len, device_id);
-    nrn_assert(d_ptr != nullptr);
-    nrn_assert(omp_target_memcpy(d_ptr, h_ptr, len, 0, 0, device_id, host_id) == 0);
-    nrn_assert(omp_target_associate_ptr(h_ptr, d_ptr, len, 0, device_id) == 0);
-    return d_ptr;
+    return static_cast<T*>(omp_get_mapped_ptr(const_cast<T*>(h_ptr), omp_get_default_device()));
 #else
-    throw std::runtime_error("cnrn_gpu_copyin() not implemented without OpenACC/OpenMP and gpu build");
+    throw std::runtime_error("cnrn_target_deviceptr() not implemented without OpenACC/OpenMP and gpu build");
 #endif
 }

-void cnrn_memcpy_to_device(void* d_ptr, void* h_ptr, size_t len) {
+template <typename T>
+T* cnrn_target_copyin(const T* h_ptr, std::size_t len = 1) {
 #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
-    acc_memcpy_to_device(d_ptr, h_ptr, len);
+    return static_cast<T*>(acc_copyin(const_cast<T*>(h_ptr), len * sizeof(T)));
 #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
-    auto host_id = omp_get_initial_device();
-    auto device_id = omp_get_default_device();
-    omp_target_memcpy(d_ptr, h_ptr, len, 0, 0, device_id, host_id);
+    #pragma omp target enter data map(to:h_ptr[:len])
+    return cnrn_target_deviceptr(const_cast<T*>(h_ptr));
 #else
-    throw std::runtime_error("cnrn_memcpy_to_device() not implemented without OpenACC/OpenMP and gpu build");
+    throw std::runtime_error("cnrn_target_copyin() not implemented without OpenACC/OpenMP and gpu build");
 #endif
 }

-void cnrn_target_delete(void* h_ptr, size_t len) {
+template <typename T>
+void cnrn_target_delete(T* h_ptr, std::size_t len = 1) {
 #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
-    acc_delete(h_ptr, len);
+    acc_delete(h_ptr, len * sizeof(T));
 #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
-    (void)len;
-    auto device_id = omp_get_default_device();
-    omp_target_disassociate_ptr(h_ptr, device_id);
-    auto* d_ptr = omp_get_mapped_ptr(h_ptr, device_id);
-    omp_target_free(d_ptr, device_id);
+    #pragma omp target exit data map(delete: h_ptr[:len])
 #else
     throw std::runtime_error("cnrn_target_delete() not implemented without OpenACC/OpenMP and gpu build");
 #endif
 }

-void* cnrn_target_deviceptr(void* h_ptr) {
+template <typename T>
+void cnrn_target_memcpy_to_device(T* d_ptr, const T* h_ptr, std::size_t len = 1) {
 #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
-    return acc_deviceptr(h_ptr);
+    acc_memcpy_to_device(d_ptr, const_cast<T*>(h_ptr), len * sizeof(T));
 #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
-    auto device_id = omp_get_default_device();
-    return omp_get_mapped_ptr(h_ptr, device_id);
+    omp_target_memcpy(d_ptr, const_cast<T*>(h_ptr), len * sizeof(T), 0, 0, omp_get_default_device(), omp_get_initial_device());
 #else
-    throw std::runtime_error("cnrn_target_delete() not implemented without OpenACC/OpenMP and gpu build");
+    throw std::runtime_error("cnrn_target_memcpy_to_device() not implemented without OpenACC/OpenMP and gpu build");
 #endif
 }

@@ -114,13 +106,13 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
         NrnThread* nt = threads + i;  // NrnThread on host

         if (nt->n_presyn) {
-            PreSyn* d_presyns = (PreSyn*) cnrn_gpu_copyin(nt->presyns, sizeof(PreSyn) * nt->n_presyn);
+            PreSyn* d_presyns = cnrn_target_copyin(nt->presyns, nt->n_presyn);
         }

         if (nt->n_vecplay) {
             /* copy VecPlayContinuous instances */
             /** just empty containers */
-            void** d_vecplay = (void**) cnrn_gpu_copyin(nt->_vecplay, sizeof(void*) * nt->n_vecplay);
+            void** d_vecplay = cnrn_target_copyin(nt->_vecplay, nt->n_vecplay);
             // note: we are using unified memory for NrnThread. Once VecPlay is copied to gpu,
             // we dont want to update nt->vecplay because it will also set gpu pointer of vecplay
             // inside nt on cpu (due to unified memory).
@@ -138,7 +130,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
      * find
      * corresponding NrnThread using Point_process in NET_RECEIVE block
      */
-    NrnThread* d_threads = (NrnThread*) cnrn_gpu_copyin(threads, sizeof(NrnThread) * nthreads);
+    NrnThread* d_threads = cnrn_target_copyin(threads, nthreads);

     if (interleave_info == nullptr) {
         printf("\n Warning: No permutation data? 
Required for linear algebra!"); @@ -157,7 +149,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { /* -- copy _data to device -- */ /*copy all double data for thread */ - d__data = (double*) cnrn_gpu_copyin(nt->_data, nt->_ndata * sizeof(double)); + d__data = cnrn_target_copyin(nt->_data, nt->_ndata); /* Here is the example of using OpenACC data enter/exit @@ -168,7 +160,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { */ /*update d_nt._data to point to device copy */ - cnrn_memcpy_to_device(&(d_nt->_data), &d__data, sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_data), &d__data); /* -- setup rhs, d, a, b, v, node_aread to point to device copy -- */ double* dptr; @@ -177,36 +169,35 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { int ne = nrn_soa_padded_size(nt->end, 0); dptr = d__data + 0 * ne; - cnrn_memcpy_to_device(&(d_nt->_actual_rhs), &(dptr), sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_actual_rhs), &(dptr)); dptr = d__data + 1 * ne; - cnrn_memcpy_to_device(&(d_nt->_actual_d), &(dptr), sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_actual_d), &(dptr)); dptr = d__data + 2 * ne; - cnrn_memcpy_to_device(&(d_nt->_actual_a), &(dptr), sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_actual_a), &(dptr)); dptr = d__data + 3 * ne; - cnrn_memcpy_to_device(&(d_nt->_actual_b), &(dptr), sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_actual_b), &(dptr)); dptr = d__data + 4 * ne; - cnrn_memcpy_to_device(&(d_nt->_actual_v), &(dptr), sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_actual_v), &(dptr)); dptr = d__data + 5 * ne; - cnrn_memcpy_to_device(&(d_nt->_actual_area), &(dptr), sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_actual_area), &(dptr)); if (nt->_actual_diam) { dptr = d__data + 6 * ne; - cnrn_memcpy_to_device(&(d_nt->_actual_diam), &(dptr), sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_actual_diam), &(dptr)); } - int* d_v_parent_index = (int*) cnrn_gpu_copyin(nt->_v_parent_index, nt->end * sizeof(int)); - cnrn_memcpy_to_device(&(d_nt->_v_parent_index), &(d_v_parent_index), sizeof(int*)); + int* d_v_parent_index = cnrn_target_copyin(nt->_v_parent_index, nt->end); + cnrn_target_memcpy_to_device(&(d_nt->_v_parent_index), &(d_v_parent_index)); /* nt._ml_list is used in NET_RECEIVE block and should have valid membrane list id*/ - Memb_list** d_ml_list = (Memb_list**) cnrn_gpu_copyin(nt->_ml_list, - corenrn.get_memb_funcs().size() * - sizeof(Memb_list*)); - cnrn_memcpy_to_device(&(d_nt->_ml_list), &(d_ml_list), sizeof(Memb_list**)); + Memb_list** d_ml_list = cnrn_target_copyin(nt->_ml_list, + corenrn.get_memb_funcs().size()); + cnrn_target_memcpy_to_device(&(d_nt->_ml_list), &(d_ml_list)); /* -- copy NrnThreadMembList list ml to device -- */ @@ -217,26 +208,26 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { for (auto tml = nt->tml; tml; tml = tml->next) { /*copy tml to device*/ /*QUESTIONS: does tml will point to nullptr as in host ? 
: I assume so!*/ - auto d_tml = (NrnThreadMembList*) cnrn_gpu_copyin(tml, sizeof(NrnThreadMembList)); + auto d_tml = cnrn_target_copyin(tml); /*first tml is pointed by nt */ if (first_tml) { - cnrn_memcpy_to_device(&(d_nt->tml), &d_tml, sizeof(NrnThreadMembList*)); + cnrn_target_memcpy_to_device(&(d_nt->tml), &d_tml); first_tml = false; } else { /*rest of tml forms linked list */ - cnrn_memcpy_to_device(&(d_last_tml->next), &d_tml, sizeof(NrnThreadMembList*)); + cnrn_target_memcpy_to_device(&(d_last_tml->next), &d_tml); } // book keeping for linked-list d_last_tml = d_tml; /* now for every tml, there is a ml. copy that and setup pointer */ - auto d_ml = (Memb_list*) cnrn_gpu_copyin(tml->ml, sizeof(Memb_list)); - cnrn_memcpy_to_device(&(d_tml->ml), &d_ml, sizeof(Memb_list*)); + auto d_ml = cnrn_target_copyin(tml->ml); + cnrn_target_memcpy_to_device(&(d_tml->ml), &d_ml); /* setup nt._ml_list */ - cnrn_memcpy_to_device(&(d_ml_list[tml->index]), &d_ml, sizeof(Memb_list*)); + cnrn_target_memcpy_to_device(&(d_ml_list[tml->index]), &d_ml); int type = tml->index; int n = tml->ml->nodecount; @@ -245,26 +236,25 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { int is_art = corenrn.get_is_artificial()[type]; // get device pointer for corresponding mechanism data - dptr = (double*) cnrn_target_deviceptr(tml->ml->data); - cnrn_memcpy_to_device(&(d_ml->data), &(dptr), sizeof(double*)); + dptr = cnrn_target_deviceptr(tml->ml->data); + cnrn_target_memcpy_to_device(&(d_ml->data), &(dptr)); if (!is_art) { - int* d_nodeindices = (int*) cnrn_gpu_copyin(tml->ml->nodeindices, sizeof(int) * n); - cnrn_memcpy_to_device(&(d_ml->nodeindices), &d_nodeindices, sizeof(int*)); + int* d_nodeindices = cnrn_target_copyin(tml->ml->nodeindices, n); + cnrn_target_memcpy_to_device(&(d_ml->nodeindices), &d_nodeindices); } if (szdp) { int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; - int* d_pdata = (int*) cnrn_gpu_copyin(tml->ml->pdata, sizeof(int) * pcnt); - cnrn_memcpy_to_device(&(d_ml->pdata), &d_pdata, sizeof(int*)); + int* d_pdata = cnrn_target_copyin(tml->ml->pdata, pcnt); + cnrn_target_memcpy_to_device(&(d_ml->pdata), &d_pdata); } int ts = corenrn.get_memb_funcs()[type].thread_size_; if (ts) { - ThreadDatum* td = (ThreadDatum*) cnrn_gpu_copyin(tml->ml->_thread, - ts * sizeof(ThreadDatum)); - cnrn_memcpy_to_device(&(d_ml->_thread), &td, sizeof(ThreadDatum*)); + ThreadDatum* td = cnrn_target_copyin(tml->ml->_thread, ts); + cnrn_target_memcpy_to_device(&(d_ml->_thread), &td); } NetReceiveBuffer_t *nrb, *d_nrb; @@ -276,28 +266,26 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { // if net receive buffer exist for mechanism if (nrb) { - d_nrb = (NetReceiveBuffer_t*) cnrn_gpu_copyin(nrb, sizeof(NetReceiveBuffer_t)); - cnrn_memcpy_to_device(&(d_ml->_net_receive_buffer), - &d_nrb, - sizeof(NetReceiveBuffer_t*)); + d_nrb = cnrn_target_copyin(nrb); + cnrn_target_memcpy_to_device(&(d_ml->_net_receive_buffer), &d_nrb); - d_pnt_index = (int*) cnrn_gpu_copyin(nrb->_pnt_index, sizeof(int) * nrb->_size); - cnrn_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index, sizeof(int*)); + d_pnt_index = cnrn_target_copyin(nrb->_pnt_index, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index); - d_weight_index = (int*) cnrn_gpu_copyin(nrb->_weight_index, sizeof(int) * nrb->_size); - cnrn_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index, sizeof(int*)); + d_weight_index = cnrn_target_copyin(nrb->_weight_index, nrb->_size); + 
cnrn_target_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index); - d_nrb_t = (double*) cnrn_gpu_copyin(nrb->_nrb_t, sizeof(double) * nrb->_size); - cnrn_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t, sizeof(double*)); + d_nrb_t = cnrn_target_copyin(nrb->_nrb_t, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t); - d_nrb_flag = (double*) cnrn_gpu_copyin(nrb->_nrb_flag, sizeof(double) * nrb->_size); - cnrn_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag, sizeof(double*)); + d_nrb_flag = cnrn_target_copyin(nrb->_nrb_flag, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag); - d_displ = (int*) cnrn_gpu_copyin(nrb->_displ, sizeof(int) * (nrb->_size + 1)); - cnrn_memcpy_to_device(&(d_nrb->_displ), &d_displ, sizeof(int*)); + d_displ = cnrn_target_copyin(nrb->_displ, nrb->_size + 1); + cnrn_target_memcpy_to_device(&(d_nrb->_displ), &d_displ); - d_nrb_index = (int*) cnrn_gpu_copyin(nrb->_nrb_index, sizeof(int) * nrb->_size); - cnrn_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index, sizeof(int*)); + d_nrb_index = cnrn_target_copyin(nrb->_nrb_index, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index); } /* copy NetSendBuffer_t on to GPU */ @@ -309,26 +297,26 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { int* d_iptr; double* d_dptr; - d_nsb = (NetSendBuffer_t*) cnrn_gpu_copyin(nsb, sizeof(NetSendBuffer_t)); - cnrn_memcpy_to_device(&(d_ml->_net_send_buffer), &d_nsb, sizeof(NetSendBuffer_t*)); + d_nsb = cnrn_target_copyin(nsb); + cnrn_target_memcpy_to_device(&(d_ml->_net_send_buffer), &d_nsb); - d_iptr = (int*) cnrn_gpu_copyin(nsb->_sendtype, sizeof(int) * nsb->_size); - cnrn_memcpy_to_device(&(d_nsb->_sendtype), &d_iptr, sizeof(int*)); + d_iptr = cnrn_target_copyin(nsb->_sendtype, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_sendtype), &d_iptr); - d_iptr = (int*) cnrn_gpu_copyin(nsb->_vdata_index, sizeof(int) * nsb->_size); - cnrn_memcpy_to_device(&(d_nsb->_vdata_index), &d_iptr, sizeof(int*)); + d_iptr = cnrn_target_copyin(nsb->_vdata_index, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_vdata_index), &d_iptr); - d_iptr = (int*) cnrn_gpu_copyin(nsb->_pnt_index, sizeof(int) * nsb->_size); - cnrn_memcpy_to_device(&(d_nsb->_pnt_index), &d_iptr, sizeof(int*)); + d_iptr = cnrn_target_copyin(nsb->_pnt_index, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_pnt_index), &d_iptr); - d_iptr = (int*) cnrn_gpu_copyin(nsb->_weight_index, sizeof(int) * nsb->_size); - cnrn_memcpy_to_device(&(d_nsb->_weight_index), &d_iptr, sizeof(int*)); + d_iptr = cnrn_target_copyin(nsb->_weight_index, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_weight_index), &d_iptr); - d_dptr = (double*) cnrn_gpu_copyin(nsb->_nsb_t, sizeof(double) * nsb->_size); - cnrn_memcpy_to_device(&(d_nsb->_nsb_t), &d_dptr, sizeof(double*)); + d_dptr = cnrn_target_copyin(nsb->_nsb_t, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_nsb_t), &d_dptr); - d_dptr = (double*) cnrn_gpu_copyin(nsb->_nsb_flag, sizeof(double) * nsb->_size); - cnrn_memcpy_to_device(&(d_nsb->_nsb_flag), &d_dptr, sizeof(double*)); + d_dptr = cnrn_target_copyin(nsb->_nsb_flag, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_nsb_flag), &d_dptr); } } @@ -338,28 +326,25 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); /* copy shadow_rhs to device and fix-up the pointer */ - d_shadow_ptr = (double*) cnrn_gpu_copyin(nt->_shadow_rhs, pcnt * sizeof(double)); - 
cnrn_memcpy_to_device(&(d_nt->_shadow_rhs), &d_shadow_ptr, sizeof(double*)); + d_shadow_ptr = cnrn_target_copyin(nt->_shadow_rhs, pcnt); + cnrn_target_memcpy_to_device(&(d_nt->_shadow_rhs), &d_shadow_ptr); /* copy shadow_d to device and fix-up the pointer */ - d_shadow_ptr = (double*) cnrn_gpu_copyin(nt->_shadow_d, pcnt * sizeof(double)); - cnrn_memcpy_to_device(&(d_nt->_shadow_d), &d_shadow_ptr, sizeof(double*)); + d_shadow_ptr = cnrn_target_copyin(nt->_shadow_d, pcnt); + cnrn_target_memcpy_to_device(&(d_nt->_shadow_d), &d_shadow_ptr); } /* Fast membrane current calculation struct */ if (nt->nrn_fast_imem) { - auto* d_fast_imem = reinterpret_cast( - cnrn_gpu_copyin(nt->nrn_fast_imem, sizeof(NrnFastImem))); - cnrn_memcpy_to_device(&(d_nt->nrn_fast_imem), &d_fast_imem, sizeof(NrnFastImem*)); + NrnFastImem* d_fast_imem = cnrn_target_copyin(nt->nrn_fast_imem); + cnrn_target_memcpy_to_device(&(d_nt->nrn_fast_imem), &d_fast_imem); { - auto* d_ptr = reinterpret_cast( - cnrn_gpu_copyin(nt->nrn_fast_imem->nrn_sav_rhs, nt->end * sizeof(double))); - cnrn_memcpy_to_device(&(d_fast_imem->nrn_sav_rhs), &d_ptr, sizeof(double*)); + double* d_ptr = cnrn_target_copyin(nt->nrn_fast_imem->nrn_sav_rhs, nt->end); + cnrn_target_memcpy_to_device(&(d_fast_imem->nrn_sav_rhs), &d_ptr); } { - auto* d_ptr = reinterpret_cast( - cnrn_gpu_copyin(nt->nrn_fast_imem->nrn_sav_d, nt->end * sizeof(double))); - cnrn_memcpy_to_device(&(d_fast_imem->nrn_sav_d), &d_ptr, sizeof(double*)); + double* d_ptr = cnrn_target_copyin(nt->nrn_fast_imem->nrn_sav_d, nt->end); + cnrn_target_memcpy_to_device(&(d_fast_imem->nrn_sav_d), &d_ptr); } } @@ -367,21 +352,21 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { /* copy Point_processes array and fix the pointer to execute net_receive blocks on GPU */ Point_process* pntptr = - (Point_process*) cnrn_gpu_copyin(nt->pntprocs, nt->n_pntproc * sizeof(Point_process)); - cnrn_memcpy_to_device(&(d_nt->pntprocs), &pntptr, sizeof(Point_process*)); + cnrn_target_copyin(nt->pntprocs, nt->n_pntproc); + cnrn_target_memcpy_to_device(&(d_nt->pntprocs), &pntptr); } if (nt->n_weight) { /* copy weight vector used in NET_RECEIVE which is pointed by netcon.weight */ - double* d_weights = (double*) cnrn_gpu_copyin(nt->weights, sizeof(double) * nt->n_weight); - cnrn_memcpy_to_device(&(d_nt->weights), &d_weights, sizeof(double*)); + double* d_weights = cnrn_target_copyin(nt->weights, nt->n_weight); + cnrn_target_memcpy_to_device(&(d_nt->weights), &d_weights); } if (nt->_nvdata) { /* copy vdata which is setup in bbcore_read. 
This contains cuda allocated * nrnran123_State * */ - void** d_vdata = (void**) cnrn_gpu_copyin(nt->_vdata, sizeof(void*) * nt->_nvdata); - cnrn_memcpy_to_device(&(d_nt->_vdata), &d_vdata, sizeof(void**)); + void** d_vdata = cnrn_target_copyin(nt->_vdata, nt->_nvdata); + cnrn_target_memcpy_to_device(&(d_nt->_vdata), &d_vdata); } if (nt->n_presyn) { @@ -391,24 +376,24 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { * to * VTable and alignment */ PreSynHelper* d_presyns_helper = - (PreSynHelper*) cnrn_gpu_copyin(nt->presyns_helper, sizeof(PreSynHelper) * nt->n_presyn); - cnrn_memcpy_to_device(&(d_nt->presyns_helper), &d_presyns_helper, sizeof(PreSynHelper*)); - PreSyn* d_presyns = (PreSyn*) cnrn_gpu_copyin(nt->presyns, sizeof(PreSyn) * nt->n_presyn); - cnrn_memcpy_to_device(&(d_nt->presyns), &d_presyns, sizeof(PreSyn*)); + cnrn_target_copyin(nt->presyns_helper, nt->n_presyn); + cnrn_target_memcpy_to_device(&(d_nt->presyns_helper), &d_presyns_helper); + PreSyn* d_presyns = cnrn_target_copyin(nt->presyns, nt->n_presyn); + cnrn_target_memcpy_to_device(&(d_nt->presyns), &d_presyns); } if (nt->_net_send_buffer_size) { /* copy send_receive buffer */ - int* d_net_send_buffer = (int*) cnrn_gpu_copyin(nt->_net_send_buffer, - sizeof(int) * nt->_net_send_buffer_size); - cnrn_memcpy_to_device(&(d_nt->_net_send_buffer), &d_net_send_buffer, sizeof(int*)); + int* d_net_send_buffer = cnrn_target_copyin(nt->_net_send_buffer, + nt->_net_send_buffer_size); + cnrn_target_memcpy_to_device(&(d_nt->_net_send_buffer), &d_net_send_buffer); } if (nt->n_vecplay) { /* copy VecPlayContinuous instances */ /** just empty containers */ - void** d_vecplay = (void**) cnrn_gpu_copyin(nt->_vecplay, sizeof(void*) * nt->n_vecplay); - cnrn_memcpy_to_device(&(d_nt->_vecplay), &d_vecplay, sizeof(void**)); + void** d_vecplay = cnrn_target_copyin(nt->_vecplay, nt->n_vecplay); + cnrn_target_memcpy_to_device(&(d_nt->_vecplay), &d_vecplay); nrn_VecPlay_copyto_device(nt, d_vecplay); } @@ -417,41 +402,41 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { if (interleave_permute_type == 1) { /* todo: not necessary to setup pointers, just copy it */ InterleaveInfo* info = interleave_info + i; - InterleaveInfo* d_info = (InterleaveInfo*) cnrn_gpu_copyin(info, sizeof(InterleaveInfo)); int* d_ptr = nullptr; + InterleaveInfo* d_info = cnrn_target_copyin(info); - d_ptr = (int*) cnrn_gpu_copyin(info->stride, sizeof(int) * (info->nstride + 1)); - cnrn_memcpy_to_device(&(d_info->stride), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->stride, info->nstride + 1); + cnrn_target_memcpy_to_device(&(d_info->stride), &d_ptr); - d_ptr = (int*) cnrn_gpu_copyin(info->firstnode, sizeof(int) * nt->ncell); - cnrn_memcpy_to_device(&(d_info->firstnode), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->firstnode, nt->ncell); + cnrn_target_memcpy_to_device(&(d_info->firstnode), &d_ptr); - d_ptr = (int*) cnrn_gpu_copyin(info->lastnode, sizeof(int) * nt->ncell); - cnrn_memcpy_to_device(&(d_info->lastnode), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->lastnode, nt->ncell); + cnrn_target_memcpy_to_device(&(d_info->lastnode), &d_ptr); - d_ptr = (int*) cnrn_gpu_copyin(info->cellsize, sizeof(int) * nt->ncell); - cnrn_memcpy_to_device(&(d_info->cellsize), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->cellsize, nt->ncell); + cnrn_target_memcpy_to_device(&(d_info->cellsize), &d_ptr); } else if (interleave_permute_type == 2) { /* todo: not necessary to setup pointers, just copy it */ 
InterleaveInfo* info = interleave_info + i; - InterleaveInfo* d_info = (InterleaveInfo*) cnrn_gpu_copyin(info, sizeof(InterleaveInfo)); + InterleaveInfo* d_info = cnrn_target_copyin(info); int* d_ptr = nullptr; - d_ptr = (int*) cnrn_gpu_copyin(info->stride, sizeof(int) * info->nstride); - cnrn_memcpy_to_device(&(d_info->stride), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->stride, info->nstride); + cnrn_target_memcpy_to_device(&(d_info->stride), &d_ptr); - d_ptr = (int*) cnrn_gpu_copyin(info->firstnode, sizeof(int) * (info->nwarp + 1)); - cnrn_memcpy_to_device(&(d_info->firstnode), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->firstnode, info->nwarp + 1); + cnrn_target_memcpy_to_device(&(d_info->firstnode), &d_ptr); - d_ptr = (int*) cnrn_gpu_copyin(info->lastnode, sizeof(int) * (info->nwarp + 1)); - cnrn_memcpy_to_device(&(d_info->lastnode), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->lastnode, info->nwarp + 1); + cnrn_target_memcpy_to_device(&(d_info->lastnode), &d_ptr); - d_ptr = (int*) cnrn_gpu_copyin(info->stridedispl, sizeof(int) * (info->nwarp + 1)); - cnrn_memcpy_to_device(&(d_info->stridedispl), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->stridedispl, info->nwarp + 1); + cnrn_target_memcpy_to_device(&(d_info->stridedispl), &d_ptr); - d_ptr = (int*) cnrn_gpu_copyin(info->cellsize, sizeof(int) * info->nwarp); - cnrn_memcpy_to_device(&(d_info->cellsize), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->cellsize, info->nwarp); + cnrn_target_memcpy_to_device(&(d_info->cellsize), &d_ptr); } else { printf("\n ERROR: only --cell_permute = [12] implemented"); abort(); @@ -465,38 +450,30 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { if (tr) { // Create a device-side copy of the `trajec_requests` struct and // make sure the device-side NrnThread object knows about it. - auto* d_trajec_requests = reinterpret_cast( - cnrn_gpu_copyin(tr, sizeof(TrajectoryRequests))); - cnrn_memcpy_to_device(&(d_nt->trajec_requests), - &d_trajec_requests, - sizeof(TrajectoryRequests*)); + TrajectoryRequests* d_trajec_requests = cnrn_target_copyin(tr); + cnrn_target_memcpy_to_device(&(d_nt->trajec_requests), &d_trajec_requests); // Initialise the double** gather member of the struct. - auto* d_tr_gather = reinterpret_cast( - cnrn_gpu_copyin(tr->gather, sizeof(double*) * tr->n_trajec)); - cnrn_memcpy_to_device(&(d_trajec_requests->gather), &d_tr_gather, sizeof(double**)); + double** d_tr_gather = cnrn_target_copyin(tr->gather, tr->n_trajec); + cnrn_target_memcpy_to_device(&(d_trajec_requests->gather), &d_tr_gather); // Initialise the double** varrays member of the struct if it's // set. double** d_tr_varrays{nullptr}; if (tr->varrays) { - d_tr_varrays = reinterpret_cast( - cnrn_gpu_copyin(tr->varrays, sizeof(double*) * tr->n_trajec)); - cnrn_memcpy_to_device(&(d_trajec_requests->varrays), - &d_tr_varrays, - sizeof(double**)); + d_tr_varrays = cnrn_target_copyin(tr->varrays, tr->n_trajec); + cnrn_target_memcpy_to_device(&(d_trajec_requests->varrays), &d_tr_varrays); } for (int i = 0; i < tr->n_trajec; ++i) { if (tr->varrays) { // tr->varrays[i] is a buffer of tr->bsize doubles on the host, // make a device-side copy of it and store a pointer to it in // the device-side version of tr->varrays. 
- auto* d_buf_traj_i = reinterpret_cast( - cnrn_gpu_copyin(tr->varrays[i], tr->bsize * sizeof(double))); - cnrn_memcpy_to_device(&(d_tr_varrays[i]), &d_buf_traj_i, sizeof(double*)); + double* d_buf_traj_i = cnrn_target_copyin(tr->varrays[i], tr->bsize); + cnrn_target_memcpy_to_device(&(d_tr_varrays[i]), &d_buf_traj_i); } // tr->gather[i] is a double* referring to (host) data in the // (host) _data block auto* d_gather_i = cnrn_target_deviceptr(tr->gather[i]); - cnrn_memcpy_to_device(&(d_tr_gather[i]), &d_gather_i, sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_tr_gather[i]), &d_gather_i); } // TODO: other `double** scatter` and `void** vpr` members of // the TrajectoryRequests struct are not copied to the device. @@ -513,21 +490,15 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { #endif } -void copy_ivoc_vect_to_device(const IvocVect& from, IvocVect& to, bool vector_copy_needed) { +void copy_ivoc_vect_to_device(const IvocVect& from, IvocVect& to) { #ifdef _OPENACC /// by default `to` is desitionation pointer on a device IvocVect* d_iv = &to; - /// if we need to copy IvocVect vector then newly alloated vector - /// on the device is a new destination pointer - if(vector_copy_needed) { - d_iv = (IvocVect*) cnrn_gpu_copyin((void*) &from, sizeof(IvocVect)); - cnrn_memcpy_to_device(&to, &d_iv, sizeof(IvocVect*)); - } size_t n = from.size(); if (n) { - double* d_data = (double*) cnrn_gpu_copyin((void*) from.data(), sizeof(double) * n); - cnrn_memcpy_to_device(&(d_iv->data_), &d_data, sizeof(double*)); + double* d_data = cnrn_target_copyin(from.data(), n); + cnrn_target_memcpy_to_device(&(d_iv->data_), &d_data); } #else (void) from; @@ -539,9 +510,9 @@ void delete_ivoc_vect_from_device(IvocVect& vec) { #ifdef _OPENACC auto const n = vec.size(); if (n) { - cnrn_target_delete(vec.data(), sizeof(double) * n); + cnrn_target_delete(vec.data(), n); } - cnrn_target_delete(&vec, sizeof(IvocVect)); + cnrn_target_delete(&vec); #else (void) vec; #endif @@ -556,12 +527,12 @@ void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml) { #ifdef _OPENACC if (nt->compute_gpu) { // free existing vectors in buffers on gpu - cnrn_target_delete(nrb->_pnt_index, nrb->_size * sizeof(int)); - cnrn_target_delete(nrb->_weight_index, nrb->_size * sizeof(int)); - cnrn_target_delete(nrb->_nrb_t, nrb->_size * sizeof(double)); - cnrn_target_delete(nrb->_nrb_flag, nrb->_size * sizeof(double)); - cnrn_target_delete(nrb->_displ, (nrb->_size + 1) * sizeof(int)); - cnrn_target_delete(nrb->_nrb_index, nrb->_size * sizeof(int)); + cnrn_target_delete(nrb->_pnt_index, nrb->_size); + cnrn_target_delete(nrb->_weight_index, nrb->_size); + cnrn_target_delete(nrb->_nrb_t, nrb->_size); + cnrn_target_delete(nrb->_nrb_flag, nrb->_size); + cnrn_target_delete(nrb->_displ, nrb->_size + 1); + cnrn_target_delete(nrb->_nrb_index, nrb->_size); } #endif @@ -583,26 +554,26 @@ void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml) { nrn_pragma_acc(update device(nrb)); nrn_pragma_omp(target update to(nrb)); - NetReceiveBuffer_t* d_nrb = (NetReceiveBuffer_t*) cnrn_target_deviceptr(nrb); + NetReceiveBuffer_t* d_nrb = cnrn_target_deviceptr(nrb); // recopy the vectors in the buffer - d_pnt_index = (int*) cnrn_gpu_copyin(nrb->_pnt_index, sizeof(int) * nrb->_size); - cnrn_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index, sizeof(int*)); + d_pnt_index = cnrn_target_copyin(nrb->_pnt_index, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index); - d_weight_index = (int*) 
cnrn_gpu_copyin(nrb->_weight_index, sizeof(int) * nrb->_size); - cnrn_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index, sizeof(int*)); + d_weight_index = cnrn_target_copyin(nrb->_weight_index, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index); - d_nrb_t = (double*) cnrn_gpu_copyin(nrb->_nrb_t, sizeof(double) * nrb->_size); - cnrn_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t, sizeof(double*)); + d_nrb_t = cnrn_target_copyin(nrb->_nrb_t, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t); - d_nrb_flag = (double*) cnrn_gpu_copyin(nrb->_nrb_flag, sizeof(double) * nrb->_size); - cnrn_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag, sizeof(double*)); + d_nrb_flag = cnrn_target_copyin(nrb->_nrb_flag, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag); - d_displ = (int*) cnrn_gpu_copyin(nrb->_displ, sizeof(int) * (nrb->_size + 1)); - cnrn_memcpy_to_device(&(d_nrb->_displ), &d_displ, sizeof(int*)); + d_displ = cnrn_target_copyin(nrb->_displ, nrb->_size + 1); + cnrn_target_memcpy_to_device(&(d_nrb->_displ), &d_displ); - d_nrb_index = (int*) cnrn_gpu_copyin(nrb->_nrb_index, sizeof(int) * nrb->_size); - cnrn_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index, sizeof(int*)); + d_nrb_index = cnrn_target_copyin(nrb->_nrb_index, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index); } #endif } @@ -1039,21 +1010,21 @@ void update_weights_from_gpu(NrnThread* threads, int nthreads) { /** Cleanup device memory that is being tracked by the OpenACC runtime. * * This function painstakingly calls `cnrn_target_delete` in reverse order on all - * pointers that were passed to `cnrn_gpu_copyin` in `setup_nrnthreads_on_device`. + * pointers that were passed to `cnrn_target_copyin` in `setup_nrnthreads_on_device`. * This cleanup ensures that if the GPU is initialised multiple times from the * same process then the OpenACC runtime will not be polluted with old * pointers, which can cause errors. In particular if we do: * @code * { * // ... some_ptr is dynamically allocated ... - * cnrn_gpu_copyin(some_ptr, some_size); + * cnrn_target_copyin(some_ptr, some_size); * // ... do some work ... * // cnrn_target_delete(some_ptr); * free(some_ptr); * } * { * // ... same_ptr_again is dynamically allocated at the same address ... 
- * cnrn_gpu_copyin(same_ptr_again, some_other_size); // ERROR + * cnrn_target_copyin(same_ptr_again, some_other_size); // ERROR * } * @endcode * the application will/may abort with an error such as: @@ -1070,73 +1041,73 @@ void delete_nrnthreads_on_device(NrnThread* threads, int nthreads) { if (tr) { if (tr->varrays) { for (int i = 0; i < tr->n_trajec; ++i) { - cnrn_target_delete(tr->varrays[i], tr->bsize * sizeof(double)); + cnrn_target_delete(tr->varrays[i], tr->bsize); } - cnrn_target_delete(tr->varrays, sizeof(double*) * tr->n_trajec); + cnrn_target_delete(tr->varrays, tr->n_trajec); } - cnrn_target_delete(tr->gather, sizeof(double*) * tr->n_trajec); - cnrn_target_delete(tr, sizeof(TrajectoryRequests)); + cnrn_target_delete(tr->gather, tr->n_trajec); + cnrn_target_delete(tr); } } if (nt->_permute) { if (interleave_permute_type == 1) { InterleaveInfo* info = interleave_info + i; - cnrn_target_delete(info->cellsize, sizeof(int) * nt->ncell); - cnrn_target_delete(info->lastnode, sizeof(int) * nt->ncell); - cnrn_target_delete(info->firstnode, sizeof(int) * nt->ncell); - cnrn_target_delete(info->stride, sizeof(int) * (info->nstride + 1)); - cnrn_target_delete(info, sizeof(InterleaveInfo)); + cnrn_target_delete(info->cellsize, nt->ncell); + cnrn_target_delete(info->lastnode, nt->ncell); + cnrn_target_delete(info->firstnode, nt->ncell); + cnrn_target_delete(info->stride, info->nstride + 1); + cnrn_target_delete(info); } else if (interleave_permute_type == 2) { InterleaveInfo* info = interleave_info + i; - cnrn_target_delete(info->cellsize, sizeof(int) * info->nwarp); - cnrn_target_delete(info->stridedispl, sizeof(int) * (info->nwarp + 1)); - cnrn_target_delete(info->lastnode, sizeof(int) * (info->nwarp + 1)); - cnrn_target_delete(info->firstnode, sizeof(int) * (info->nwarp + 1)); - cnrn_target_delete(info->stride, sizeof(int) * info->nstride); - cnrn_target_delete(info, sizeof(InterleaveInfo)); + cnrn_target_delete(info->cellsize, info->nwarp); + cnrn_target_delete(info->stridedispl, info->nwarp + 1); + cnrn_target_delete(info->lastnode, info->nwarp + 1); + cnrn_target_delete(info->firstnode, info->nwarp + 1); + cnrn_target_delete(info->stride, info->nstride); + cnrn_target_delete(info); } } if (nt->n_vecplay) { nrn_VecPlay_delete_from_device(nt); - cnrn_target_delete(nt->_vecplay, sizeof(void*) * nt->n_vecplay); + cnrn_target_delete(nt->_vecplay, nt->n_vecplay); } // Cleanup send_receive buffer. if (nt->_net_send_buffer_size) { - cnrn_target_delete(nt->_net_send_buffer, sizeof(int) * nt->_net_send_buffer_size); + cnrn_target_delete(nt->_net_send_buffer, nt->_net_send_buffer_size); } if (nt->n_presyn) { - cnrn_target_delete(nt->presyns, sizeof(PreSyn) * nt->n_presyn); - cnrn_target_delete(nt->presyns_helper, sizeof(PreSynHelper) * nt->n_presyn); + cnrn_target_delete(nt->presyns, nt->n_presyn); + cnrn_target_delete(nt->presyns_helper, nt->n_presyn); } // Cleanup data that's setup in bbcore_read. 
if (nt->_nvdata) { - cnrn_target_delete(nt->_vdata, sizeof(void*) * nt->_nvdata); + cnrn_target_delete(nt->_vdata, nt->_nvdata); } // Cleanup weight vector used in NET_RECEIVE if (nt->n_weight) { - cnrn_target_delete(nt->weights, sizeof(double) * nt->n_weight); + cnrn_target_delete(nt->weights, nt->n_weight); } // Cleanup point processes if (nt->n_pntproc) { - cnrn_target_delete(nt->pntprocs, nt->n_pntproc * sizeof(Point_process)); + cnrn_target_delete(nt->pntprocs, nt->n_pntproc); } if (nt->nrn_fast_imem) { - cnrn_target_delete(nt->nrn_fast_imem->nrn_sav_d, nt->end * sizeof(double)); - cnrn_target_delete(nt->nrn_fast_imem->nrn_sav_rhs, nt->end * sizeof(double)); - cnrn_target_delete(nt->nrn_fast_imem, sizeof(NrnFastImem)); + cnrn_target_delete(nt->nrn_fast_imem->nrn_sav_d, nt->end); + cnrn_target_delete(nt->nrn_fast_imem->nrn_sav_rhs, nt->end); + cnrn_target_delete(nt->nrn_fast_imem); } if (nt->shadow_rhs_cnt) { int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); - cnrn_target_delete(nt->_shadow_d, pcnt * sizeof(double)); - cnrn_target_delete(nt->_shadow_rhs, pcnt * sizeof(double)); + cnrn_target_delete(nt->_shadow_d, pcnt); + cnrn_target_delete(nt->_shadow_rhs, pcnt); } for (auto tml = nt->tml; tml; tml = tml->next) { @@ -1144,26 +1115,26 @@ void delete_nrnthreads_on_device(NrnThread* threads, int nthreads) { { NetSendBuffer_t* nsb{tml->ml->_net_send_buffer}; if (nsb) { - cnrn_target_delete(nsb->_nsb_flag, sizeof(double) * nsb->_size); - cnrn_target_delete(nsb->_nsb_t, sizeof(double) * nsb->_size); - cnrn_target_delete(nsb->_weight_index, sizeof(int) * nsb->_size); - cnrn_target_delete(nsb->_pnt_index, sizeof(int) * nsb->_size); - cnrn_target_delete(nsb->_vdata_index, sizeof(int) * nsb->_size); - cnrn_target_delete(nsb->_sendtype, sizeof(int) * nsb->_size); - cnrn_target_delete(nsb, sizeof(NetSendBuffer_t)); + cnrn_target_delete(nsb->_nsb_flag, nsb->_size); + cnrn_target_delete(nsb->_nsb_t, nsb->_size); + cnrn_target_delete(nsb->_weight_index, nsb->_size); + cnrn_target_delete(nsb->_pnt_index, nsb->_size); + cnrn_target_delete(nsb->_vdata_index, nsb->_size); + cnrn_target_delete(nsb->_sendtype, nsb->_size); + cnrn_target_delete(nsb); } } // Cleanup the net receive buffer if it exists. 
{ NetReceiveBuffer_t* nrb{tml->ml->_net_receive_buffer}; if (nrb) { - cnrn_target_delete(nrb->_nrb_index, sizeof(int) * nrb->_size); - cnrn_target_delete(nrb->_displ, sizeof(int) * (nrb->_size + 1)); - cnrn_target_delete(nrb->_nrb_flag, sizeof(double) * nrb->_size); - cnrn_target_delete(nrb->_nrb_t, sizeof(double) * nrb->_size); - cnrn_target_delete(nrb->_weight_index, sizeof(int) * nrb->_size); - cnrn_target_delete(nrb->_pnt_index, sizeof(int) * nrb->_size); - cnrn_target_delete(nrb, sizeof(NetReceiveBuffer_t)); + cnrn_target_delete(nrb->_nrb_index, nrb->_size); + cnrn_target_delete(nrb->_displ, nrb->_size + 1); + cnrn_target_delete(nrb->_nrb_flag, nrb->_size); + cnrn_target_delete(nrb->_nrb_t, nrb->_size); + cnrn_target_delete(nrb->_weight_index, nrb->_size); + cnrn_target_delete(nrb->_pnt_index, nrb->_size); + cnrn_target_delete(nrb); } } int type = tml->index; @@ -1172,23 +1143,23 @@ void delete_nrnthreads_on_device(NrnThread* threads, int nthreads) { int is_art = corenrn.get_is_artificial()[type]; int ts = corenrn.get_memb_funcs()[type].thread_size_; if (ts) { - cnrn_target_delete(tml->ml->_thread, ts * sizeof(ThreadDatum)); + cnrn_target_delete(tml->ml->_thread, ts); } if (szdp) { int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; - cnrn_target_delete(tml->ml->pdata, sizeof(int) * pcnt); + cnrn_target_delete(tml->ml->pdata, pcnt); } if (!is_art) { - cnrn_target_delete(tml->ml->nodeindices, sizeof(int) * n); + cnrn_target_delete(tml->ml->nodeindices, n); } - cnrn_target_delete(tml->ml, sizeof(Memb_list)); - cnrn_target_delete(tml, sizeof(NrnThreadMembList)); + cnrn_target_delete(tml->ml); + cnrn_target_delete(tml); } - cnrn_target_delete(nt->_ml_list, corenrn.get_memb_funcs().size() * sizeof(Memb_list*)); - cnrn_target_delete(nt->_v_parent_index, nt->end * sizeof(int)); - cnrn_target_delete(nt->_data, nt->_ndata * sizeof(double)); + cnrn_target_delete(nt->_ml_list, corenrn.get_memb_funcs().size()); + cnrn_target_delete(nt->_v_parent_index, nt->end); + cnrn_target_delete(nt->_data, nt->_ndata); } - cnrn_target_delete(threads, sizeof(NrnThread) * nthreads); + cnrn_target_delete(threads, nthreads); nrn_ion_global_map_delete_from_device(); #endif } @@ -1204,34 +1175,34 @@ void nrn_newtonspace_copyto_device(NewtonSpace* ns) { int n = ns->n * ns->n_instance; // actually, the values of double do not matter, only the pointers. 
- NewtonSpace* d_ns = (NewtonSpace*) cnrn_gpu_copyin(ns, sizeof(NewtonSpace)); + NewtonSpace* d_ns = cnrn_target_copyin(ns); double* pd; - pd = (double*) cnrn_gpu_copyin(ns->delta_x, n * sizeof(double)); - cnrn_memcpy_to_device(&(d_ns->delta_x), &pd, sizeof(double*)); + pd = cnrn_target_copyin(ns->delta_x, n); + cnrn_target_memcpy_to_device(&(d_ns->delta_x), &pd); - pd = (double*) cnrn_gpu_copyin(ns->high_value, n * sizeof(double)); - cnrn_memcpy_to_device(&(d_ns->high_value), &pd, sizeof(double*)); + pd = cnrn_target_copyin(ns->high_value, n); + cnrn_target_memcpy_to_device(&(d_ns->high_value), &pd); - pd = (double*) cnrn_gpu_copyin(ns->low_value, n * sizeof(double)); - cnrn_memcpy_to_device(&(d_ns->low_value), &pd, sizeof(double*)); + pd = cnrn_target_copyin(ns->low_value, n); + cnrn_target_memcpy_to_device(&(d_ns->low_value), &pd); - pd = (double*) cnrn_gpu_copyin(ns->rowmax, n * sizeof(double)); - cnrn_memcpy_to_device(&(d_ns->rowmax), &pd, sizeof(double*)); + pd = cnrn_target_copyin(ns->rowmax, n); + cnrn_target_memcpy_to_device(&(d_ns->rowmax), &pd); - auto pint = (int*) cnrn_gpu_copyin(ns->perm, n * sizeof(int)); - cnrn_memcpy_to_device(&(d_ns->perm), &pint, sizeof(int*)); + auto pint = cnrn_target_copyin(ns->perm, n); + cnrn_target_memcpy_to_device(&(d_ns->perm), &pint); - auto ppd = (double**) cnrn_gpu_copyin(ns->jacobian, ns->n * sizeof(double*)); - cnrn_memcpy_to_device(&(d_ns->jacobian), &ppd, sizeof(double**)); + auto ppd = cnrn_target_copyin(ns->jacobian, ns->n); + cnrn_target_memcpy_to_device(&(d_ns->jacobian), &ppd); // the actual jacobian doubles were allocated as a single array - double* d_jacdat = (double*) cnrn_gpu_copyin(ns->jacobian[0], ns->n * n * sizeof(double)); + double* d_jacdat = cnrn_target_copyin(ns->jacobian[0], ns->n * n); for (int i = 0; i < ns->n; ++i) { pd = d_jacdat + i * n; - cnrn_memcpy_to_device(&(ppd[i]), &pd, sizeof(double*)); + cnrn_target_memcpy_to_device(&(ppd[i]), &pd); } #endif } @@ -1244,14 +1215,14 @@ void nrn_newtonspace_delete_from_device(NewtonSpace* ns) { return; } int n = ns->n * ns->n_instance; - cnrn_target_delete(ns->jacobian[0], ns->n * n * sizeof(double)); - cnrn_target_delete(ns->jacobian, ns->n * sizeof(double*)); - cnrn_target_delete(ns->perm, n * sizeof(int)); - cnrn_target_delete(ns->rowmax, n * sizeof(double)); - cnrn_target_delete(ns->low_value, n * sizeof(double)); - cnrn_target_delete(ns->high_value, n * sizeof(double)); - cnrn_target_delete(ns->delta_x, n * sizeof(double)); - cnrn_target_delete(ns, sizeof(NewtonSpace)); + cnrn_target_delete(ns->jacobian[0], ns->n * n); + cnrn_target_delete(ns->jacobian, ns->n); + cnrn_target_delete(ns->perm, n); + cnrn_target_delete(ns->rowmax, n); + cnrn_target_delete(ns->low_value, n); + cnrn_target_delete(ns->high_value, n); + cnrn_target_delete(ns->delta_x, n); + cnrn_target_delete(ns); #endif } @@ -1264,76 +1235,76 @@ void nrn_sparseobj_copyto_device(SparseObj* so) { } unsigned n1 = so->neqn + 1; - SparseObj* d_so = (SparseObj*) cnrn_gpu_copyin(so, sizeof(SparseObj)); + SparseObj* d_so = cnrn_target_copyin(so); // only pointer fields in SparseObj that need setting up are // rowst, diag, rhs, ngetcall, coef_list // only pointer fields in Elm that need setting up are // r_down, c_right, value // do not care about the Elm* ptr value, just the space. 
- Elm** d_rowst = (Elm**) cnrn_gpu_copyin(so->rowst, n1 * sizeof(Elm*)); - cnrn_memcpy_to_device(&(d_so->rowst), &d_rowst, sizeof(Elm**)); + Elm** d_rowst = cnrn_target_copyin(so->rowst, n1); + cnrn_target_memcpy_to_device(&(d_so->rowst), &d_rowst); - Elm** d_diag = (Elm**) cnrn_gpu_copyin(so->diag, n1 * sizeof(Elm*)); - cnrn_memcpy_to_device(&(d_so->diag), &d_diag, sizeof(Elm**)); + Elm** d_diag = cnrn_target_copyin(so->diag, n1); + cnrn_target_memcpy_to_device(&(d_so->diag), &d_diag); - auto pu = (unsigned*) cnrn_gpu_copyin(so->ngetcall, so->_cntml_padded * sizeof(unsigned)); - cnrn_memcpy_to_device(&(d_so->ngetcall), &pu, sizeof(Elm**)); + unsigned* pu = cnrn_target_copyin(so->ngetcall, so->_cntml_padded); + cnrn_target_memcpy_to_device(&(d_so->ngetcall), &pu); - auto pd = (double*) cnrn_gpu_copyin(so->rhs, n1 * so->_cntml_padded * sizeof(double)); - cnrn_memcpy_to_device(&(d_so->rhs), &pd, sizeof(double*)); + double* pd = cnrn_target_copyin(so->rhs, n1 * so->_cntml_padded); + cnrn_target_memcpy_to_device(&(d_so->rhs), &pd); - auto d_coef_list = (double**) cnrn_gpu_copyin(so->coef_list, so->coef_list_size * sizeof(double*)); - cnrn_memcpy_to_device(&(d_so->coef_list), &d_coef_list, sizeof(double**)); + double** d_coef_list = cnrn_target_copyin(so->coef_list, so->coef_list_size); + cnrn_target_memcpy_to_device(&(d_so->coef_list), &d_coef_list); // Fill in relevant Elm pointer values for (unsigned irow = 1; irow < n1; ++irow) { for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) { - Elm* pelm = (Elm*) cnrn_gpu_copyin(elm, sizeof(Elm)); + Elm* pelm = cnrn_target_copyin(elm); if (elm == so->rowst[irow]) { - cnrn_memcpy_to_device(&(d_rowst[irow]), &pelm, sizeof(Elm*)); + cnrn_target_memcpy_to_device(&(d_rowst[irow]), &pelm); } else { - Elm* d_e = (Elm*) cnrn_target_deviceptr(elm->c_left); - cnrn_memcpy_to_device(&(pelm->c_left), &d_e, sizeof(Elm*)); + Elm* d_e = cnrn_target_deviceptr(elm->c_left); + cnrn_target_memcpy_to_device(&(pelm->c_left), &d_e); } if (elm->col == elm->row) { - cnrn_memcpy_to_device(&(d_diag[irow]), &pelm, sizeof(Elm*)); + cnrn_target_memcpy_to_device(&(d_diag[irow]), &pelm); } if (irow > 1) { if (elm->r_up) { - Elm* d_e = (Elm*) cnrn_target_deviceptr(elm->r_up); - cnrn_memcpy_to_device(&(pelm->r_up), &d_e, sizeof(Elm*)); + Elm* d_e = cnrn_target_deviceptr(elm->r_up); + cnrn_target_memcpy_to_device(&(pelm->r_up), &d_e); } } - pd = (double*) cnrn_gpu_copyin(elm->value, so->_cntml_padded * sizeof(double)); - cnrn_memcpy_to_device(&(pelm->value), &pd, sizeof(double*)); + pd = cnrn_target_copyin(elm->value, so->_cntml_padded); + cnrn_target_memcpy_to_device(&(pelm->value), &pd); } } // visit all the Elm again and fill in pelm->r_down and pelm->c_left for (unsigned irow = 1; irow < n1; ++irow) { for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) { - auto pelm = (Elm*) cnrn_target_deviceptr(elm); + auto pelm = cnrn_target_deviceptr(elm); if (elm->r_down) { - auto d_e = (Elm*) cnrn_target_deviceptr(elm->r_down); - cnrn_memcpy_to_device(&(pelm->r_down), &d_e, sizeof(Elm*)); + auto d_e = cnrn_target_deviceptr(elm->r_down); + cnrn_target_memcpy_to_device(&(pelm->r_down), &d_e); } if (elm->c_right) { - auto d_e = (Elm*) cnrn_target_deviceptr(elm->c_right); - cnrn_memcpy_to_device(&(pelm->c_right), &d_e, sizeof(Elm*)); + auto d_e = cnrn_target_deviceptr(elm->c_right); + cnrn_target_memcpy_to_device(&(pelm->c_right), &d_e); } } } // Fill in the d_so->coef_list for (unsigned i = 0; i < so->coef_list_size; ++i) { - pd = (double*) cnrn_target_deviceptr(so->coef_list[i]); - 
cnrn_memcpy_to_device(&(d_coef_list[i]), &pd, sizeof(double*)); + pd = cnrn_target_deviceptr(so->coef_list[i]); + cnrn_target_memcpy_to_device(&(d_coef_list[i]), &pd); } #endif } @@ -1348,16 +1319,16 @@ void nrn_sparseobj_delete_from_device(SparseObj* so) { unsigned n1 = so->neqn + 1; for (unsigned irow = 1; irow < n1; ++irow) { for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) { - cnrn_target_delete(elm->value, so->_cntml_padded * sizeof(double)); - cnrn_target_delete(elm, sizeof(Elm)); + cnrn_target_delete(elm->value, so->_cntml_padded); + cnrn_target_delete(elm); } } - cnrn_target_delete(so->coef_list, so->coef_list_size * sizeof(double*)); - cnrn_target_delete(so->rhs, n1 * so->_cntml_padded * sizeof(double)); - cnrn_target_delete(so->ngetcall, so->_cntml_padded * sizeof(unsigned)); - cnrn_target_delete(so->diag, n1 * sizeof(Elm*)); - cnrn_target_delete(so->rowst, n1 * sizeof(Elm*)); - cnrn_target_delete(so, sizeof(SparseObj)); + cnrn_target_delete(so->coef_list, so->coef_list_size); + cnrn_target_delete(so->rhs, n1 * so->_cntml_padded); + cnrn_target_delete(so->ngetcall, so->_cntml_padded); + cnrn_target_delete(so->diag, n1); + cnrn_target_delete(so->rowst, n1); + cnrn_target_delete(so); #endif } @@ -1365,14 +1336,13 @@ void nrn_sparseobj_delete_from_device(SparseObj* so) { void nrn_ion_global_map_copyto_device() { if (nrn_ion_global_map_size) { - double** d_data = (double**) cnrn_gpu_copyin(nrn_ion_global_map, - sizeof(double*) * nrn_ion_global_map_size); + double** d_data = cnrn_target_copyin(nrn_ion_global_map, + nrn_ion_global_map_size); for (int j = 0; j < nrn_ion_global_map_size; j++) { if (nrn_ion_global_map[j]) { - double* d_mechmap = (double*) cnrn_gpu_copyin(nrn_ion_global_map[j], - ion_global_map_member_size * - sizeof(double)); - cnrn_memcpy_to_device(&(d_data[j]), &d_mechmap, sizeof(double*)); + double* d_mechmap = cnrn_target_copyin(nrn_ion_global_map[j], + ion_global_map_member_size); + cnrn_target_memcpy_to_device(&(d_data[j]), &d_mechmap); } } } @@ -1381,11 +1351,11 @@ void nrn_ion_global_map_copyto_device() { void nrn_ion_global_map_delete_from_device() { for (int j = 0; j < nrn_ion_global_map_size; j++) { if (nrn_ion_global_map[j]) { - cnrn_target_delete(nrn_ion_global_map[j], ion_global_map_member_size * sizeof(double)); + cnrn_target_delete(nrn_ion_global_map[j], ion_global_map_member_size); } } if (nrn_ion_global_map_size) { - cnrn_target_delete(nrn_ion_global_map, sizeof(double*) * nrn_ion_global_map_size); + cnrn_target_delete(nrn_ion_global_map, nrn_ion_global_map_size); } } @@ -1439,43 +1409,44 @@ void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay) { VecPlayContinuous* vecplay_instance = (VecPlayContinuous*) nt->_vecplay[i]; /** just VecPlayContinuous object */ - void* d_p = (void*) cnrn_gpu_copyin(vecplay_instance, sizeof(VecPlayContinuous)); - cnrn_memcpy_to_device(&(d_vecplay[i]), &d_p, sizeof(void*)); - - VecPlayContinuous* d_vecplay_instance = (VecPlayContinuous*) d_p; + VecPlayContinuous* d_vecplay_instance = cnrn_target_copyin(vecplay_instance); + cnrn_target_memcpy_to_device((VecPlayContinuous**) (&(d_vecplay[i])), &d_vecplay_instance); /** copy y_, t_ and discon_indices_ */ copy_ivoc_vect_to_device(vecplay_instance->y_, d_vecplay_instance->y_); copy_ivoc_vect_to_device(vecplay_instance->t_, d_vecplay_instance->t_); + // OL211213: beware, the test suite does not currently include anything + // with a non-null discon_indices_. 
if (vecplay_instance->discon_indices_) { + IvocVect* d_discon_indices = cnrn_target_copyin(vecplay_instance->discon_indices_); + cnrn_target_memcpy_to_device(&(d_vecplay_instance->discon_indices_), &d_discon_indices); copy_ivoc_vect_to_device(*(vecplay_instance->discon_indices_), - *(d_vecplay_instance->discon_indices_), - true); + *(d_vecplay_instance->discon_indices_)); } /** copy PlayRecordEvent : todo: verify this */ - PlayRecordEvent* d_e_ = (PlayRecordEvent*) cnrn_gpu_copyin(vecplay_instance->e_, - sizeof(PlayRecordEvent)); - cnrn_memcpy_to_device(&(d_e_->plr_), &d_vecplay_instance, sizeof(VecPlayContinuous*)); - cnrn_memcpy_to_device(&(d_vecplay_instance->e_), &d_e_, sizeof(PlayRecordEvent*)); + PlayRecordEvent* d_e_ = cnrn_target_copyin(vecplay_instance->e_); + + cnrn_target_memcpy_to_device(&(d_e_->plr_), (PlayRecord**) (&d_vecplay_instance)); + cnrn_target_memcpy_to_device(&(d_vecplay_instance->e_), &d_e_); /** copy pd_ : note that it's pointer inside ml->data and hence data itself is * already on GPU */ - double* d_pd_ = (double*) cnrn_target_deviceptr(vecplay_instance->pd_); - cnrn_memcpy_to_device(&(d_vecplay_instance->pd_), &d_pd_, sizeof(double*)); + double* d_pd_ = cnrn_target_deviceptr(vecplay_instance->pd_); + cnrn_target_memcpy_to_device(&(d_vecplay_instance->pd_), &d_pd_); } } void nrn_VecPlay_delete_from_device(NrnThread* nt) { for (int i = 0; i < nt->n_vecplay; i++) { auto* vecplay_instance = reinterpret_cast(nt->_vecplay[i]); - cnrn_target_delete(vecplay_instance->e_, sizeof(PlayRecordEvent)); + cnrn_target_delete(vecplay_instance->e_); if (vecplay_instance->discon_indices_) { delete_ivoc_vect_from_device(*(vecplay_instance->discon_indices_)); } delete_ivoc_vect_from_device(vecplay_instance->t_); delete_ivoc_vect_from_device(vecplay_instance->y_); - cnrn_target_delete(vecplay_instance, sizeof(VecPlayContinuous)); + cnrn_target_delete(vecplay_instance); } } From 78081b435ba165fb5e8ed58adaa2bc30d65a334b Mon Sep 17 00:00:00 2001 From: Nicolas Cornu Date: Tue, 14 Dec 2021 08:48:12 +0100 Subject: [PATCH 15/31] Remove unused GPU code (#711) We prefer selective host-to-device updates. 
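
Concretely, the selective path relies on the offload macros already in the tree; the sketch below is illustrative only (the wrapper function name and its placement are not part of this patch), assuming coreneuron/utils/offload.hpp and the NrnThread definition are in scope:

    // Hypothetical helper: push one host array back to the device on demand,
    // instead of re-copying every NrnThread member in bulk as the removed
    // update_nrnthreads_on_device() did.
    void update_weights_on_device(NrnThread* nt) {
        if (nt->compute_gpu && nt->n_weight) {
            nrn_pragma_acc(update device(nt->weights[:nt->n_weight]))
            nrn_pragma_omp(target update to(nt->weights[:nt->n_weight]))
        }
    }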
--- coreneuron/gpu/nrn_acc_manager.cpp | 130 ----------------------------- coreneuron/gpu/nrn_acc_manager.hpp | 6 +- 2 files changed, 3 insertions(+), 133 deletions(-) diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp index 4fe0004fd..e7bd09817 100644 --- a/coreneuron/gpu/nrn_acc_manager.cpp +++ b/coreneuron/gpu/nrn_acc_manager.cpp @@ -858,136 +858,6 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { #endif } -void update_nrnthreads_on_device(NrnThread* threads, int nthreads) { -#ifdef _OPENACC - - for (int i = 0; i < nthreads; i++) { - NrnThread* nt = threads + i; - - if (nt->compute_gpu && (nt->end > 0)) { - /* -- copy data to device -- */ - - int ne = nrn_soa_padded_size(nt->end, 0); - - nrn_pragma_acc(update device( - nt->_actual_rhs[:ne], - nt->_actual_d[:ne], - nt->_actual_a[:ne], - nt->_actual_b[:ne], - nt->_actual_v[:ne], - nt->_actual_area[:ne])) - nrn_pragma_omp(target update to( - nt->_actual_rhs[:ne], - nt->_actual_d[:ne], - nt->_actual_a[:ne], - nt->_actual_b[:ne], - nt->_actual_v[:ne], - nt->_actual_area[:ne])) - - nrn_pragma_acc(update device(nt->_actual_diam[:ne]) if (nt->_actual_diam != nullptr)) - nrn_pragma_omp(target update to(nt->_actual_diam[:ne]) if (nt->_actual_diam != nullptr)) - - /* @todo: nt._ml_list[tml->index] = tml->ml; */ - - /* -- copy NrnThreadMembList list ml to host -- */ - for (auto tml = nt->tml; tml; tml = tml->next) { - Memb_list* ml = tml->ml; - int type = tml->index; - int n = ml->nodecount; - int szp = corenrn.get_prop_param_size()[type]; - int szdp = corenrn.get_prop_dparam_size()[type]; - - int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szp; - - nrn_pragma_acc(update device(ml->data[:pcnt])) - nrn_pragma_omp(target update to(ml->data[:pcnt])) - - nrn_pragma_acc(update device(ml->nodeindices[:n]) - if (!corenrn.get_is_artificial()[type])) - nrn_pragma_omp(target update to(ml->nodeindices[:n]) - if (!corenrn.get_is_artificial()[type])) - int dpcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; - nrn_pragma_acc(update device(ml->pdata[:dpcnt]) if (szdp)) - nrn_pragma_omp(target update to(ml->pdata[:dpcnt]) if (szdp)) - - auto nrb = tml->ml->_net_receive_buffer; - nrn_pragma_acc(update device(nrb->_cnt, - nrb->_size, - nrb->_pnt_offset, - nrb->_displ_cnt, - nrb->_pnt_index[:nrb->_size], - nrb->_weight_index[:nrb->_size], - nrb->_displ[:nrb->_size], - nrb->_nrb_index[:nrb->_size]) - if (nrb != nullptr)) - nrn_pragma_omp(target update to(nrb->_cnt, - nrb->_size, - nrb->_pnt_offset, - nrb->_displ_cnt, - nrb->_pnt_index[:nrb->_size], - nrb->_weight_index[:nrb->_size], - nrb->_displ[:nrb->_size], - nrb->_nrb_index[:nrb->_size]) - if (nrb != nullptr)) - } - int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); - /* copy shadow_rhs to host */ - nrn_pragma_acc(update device(nt->_shadow_rhs[:pcnt], - /* copy shadow_d to host */ - nt->_shadow_d[:pcnt]) - if (nt->shadow_rhs_cnt)) - nrn_pragma_omp(target update to(nt->_shadow_rhs[:pcnt], - /* copy shadow_d to host */ - nt->_shadow_d[:pcnt]) - if (nt->shadow_rhs_cnt)) - - - nrn_pragma_acc(update device(nt->nrn_fast_imem->nrn_sav_rhs[:nt->end], - nt->nrn_fast_imem->nrn_sav_d[:nt->end]) - if (nt->nrn_fast_imem != nullptr)) - nrn_pragma_omp(target update to(nt->nrn_fast_imem->nrn_sav_rhs[:nt->end], - nt->nrn_fast_imem->nrn_sav_d[:nt->end]) - if (nt->nrn_fast_imem != nullptr)) - - nrn_pragma_acc(update device(nt->pntprocs[:nt->n_pntproc]) - if (nt->n_pntproc)) - nrn_pragma_omp(target update to(nt->pntprocs[:nt->n_pntproc]) - if (nt->n_pntproc)) - - 
nrn_pragma_acc(update device(nt->weights[:nt->n_weight]) if (nt->n_weight)) - nrn_pragma_omp(target update to(nt->weights[:nt->n_weight]) if (nt->n_weight)) - - nrn_pragma_acc(update device(nt->presyns_helper[:nt->n_presyn], - nt->presyns[:nt->n_presyn]) - if (nt->n_presyn)) - nrn_pragma_omp(target update to(nt->presyns_helper[:nt->n_presyn], - nt->presyns[:nt->n_presyn]) - if (nt->n_presyn)) - - { - TrajectoryRequests* tr = nt->trajec_requests; - if (tr && tr->varrays) { - // The full buffers have `bsize` entries, but only `vsize` - // of them are valid. - for (int i = 0; i < tr->n_trajec; ++i) { - nrn_pragma_acc(update device(tr->varrays[i][:tr->vsize])) - nrn_pragma_omp(target update to(tr->varrays[i][:tr->vsize])) - } - } - } - - /* don't and don't update vdata, its pointer array - nrn_pragma_acc(update device(nt->_vdata[:nt->_nvdata) if nt->_nvdata) - nrn_pragma_omp(target update tp(nt->_vdata[:nt->_nvdata) if (nt->_nvdata)) - */ - } - } -#else - (void) threads; - (void) nthreads; -#endif -} - /** * Copy weights from GPU to CPU * diff --git a/coreneuron/gpu/nrn_acc_manager.hpp b/coreneuron/gpu/nrn_acc_manager.hpp index 354bdc208..1334369e7 100644 --- a/coreneuron/gpu/nrn_acc_manager.hpp +++ b/coreneuron/gpu/nrn_acc_manager.hpp @@ -19,13 +19,13 @@ namespace coreneuron { void setup_nrnthreads_on_device(NrnThread* threads, int nthreads); void delete_nrnthreads_on_device(NrnThread* threads, int nthreads); void update_nrnthreads_on_host(NrnThread* threads, int nthreads); -void update_nrnthreads_on_device(NrnThread* threads, int nthreads); -void modify_data_on_device(NrnThread* threads, int nthreads); -void dump_nt_to_file(char* filename, NrnThread* threads, int nthreads); void update_net_receive_buffer(NrnThread* _nt); + +// Called by NModl void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml); void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb); + void update_weights_from_gpu(NrnThread* threads, int nthreads); void init_gpu(); From 781d34f615c2ac9cbc5f1bc05b87db2e334bb78f Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Tue, 14 Dec 2021 10:31:57 +0100 Subject: [PATCH 16/31] Fixes and improvements from LLVM/XLC work. (#716) Code fixes for XLC and Clang execution without build system changes. This mainly adds missing OpenMP pragmas and makes cnrn_target_ wrappers visible to NMODL. 
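
The recurring change follows one pattern: declarations that NMODL-generated device code must be able to call are bracketed with the offload macros so that both the OpenACC and the OpenMP offload builds emit device-side versions. A minimal sketch, with a hypothetical function name standing in for the generated kernels:

    nrn_pragma_omp(declare target)
    #pragma acc routine seq
    extern int example_deriv_kernel(_threadargsproto_); // hypothetical; real names are emitted by kinderiv.py
    nrn_pragma_omp(end declare target)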
--- CMake/MakefileBuildOptions.cmake | 1 + coreneuron/gpu/nrn_acc_manager.cpp | 55 ++--------------------- coreneuron/kinderiv.py | 6 +++ coreneuron/mechanism/eion.cpp | 2 + coreneuron/mechanism/mech/dimplic.cpp | 2 + coreneuron/mechanism/register_mech.cpp | 2 + coreneuron/network/cvodestb.cpp | 2 + coreneuron/network/netcvode.cpp | 2 +- coreneuron/sim/scopmath/crout_thread.cpp | 2 + coreneuron/sim/scopmath/newton_thread.cpp | 2 + coreneuron/sim/treeset_core.cpp | 12 ++--- coreneuron/utils/offload.hpp | 53 ++++++++++++++++++++++ extra/nrnivmodl_core_makefile.in | 4 +- 13 files changed, 85 insertions(+), 60 deletions(-) diff --git a/CMake/MakefileBuildOptions.cmake b/CMake/MakefileBuildOptions.cmake index fc0b0b551..009dd3215 100644 --- a/CMake/MakefileBuildOptions.cmake +++ b/CMake/MakefileBuildOptions.cmake @@ -75,6 +75,7 @@ string(TOUPPER "${CMAKE_BUILD_TYPE}" _BUILD_TYPE) set(CORENRN_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${_BUILD_TYPE}} ${CXX14_STD_FLAGS} ${NVHPC_ACC_COMP_FLAGS} ${NVHPC_CXX_INLINE_FLAGS}" ) +set(CORENRN_LD_FLAGS "${NVHPC_ACC_LINK_FLAGS}") # ============================================================================= # nmodl/mod2c related options : TODO diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp index e7bd09817..9bd635d77 100644 --- a/coreneuron/gpu/nrn_acc_manager.cpp +++ b/coreneuron/gpu/nrn_acc_manager.cpp @@ -24,16 +24,14 @@ #include "coreneuron/mpi/nrnmpidec.h" #include "coreneuron/utils/utils.hpp" +#ifdef CRAYPAT +#include +#endif + #ifdef _OPENACC #include #endif -#ifdef CORENEURON_PREFER_OPENMP_OFFLOAD -#include -#endif -#ifdef CRAYPAT -#include -#endif namespace coreneuron { extern InterleaveInfo* interleave_info; void copy_ivoc_vect_to_device(const IvocVect& iv, IvocVect& div); @@ -43,51 +41,6 @@ void nrn_ion_global_map_delete_from_device(); void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay); void nrn_VecPlay_delete_from_device(NrnThread* nt); -template -T* cnrn_target_deviceptr(const T* h_ptr) { -#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC) - return static_cast(acc_deviceptr(const_cast(h_ptr))); -#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) - return static_cast(omp_get_mapped_ptr(const_cast(h_ptr), omp_get_default_device())); -#else - throw std::runtime_error("cnrn_target_deviceptr() not implemented without OpenACC/OpenMP and gpu build"); -#endif -} - -template -T* cnrn_target_copyin(const T* h_ptr, std::size_t len = 1) { -#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC) - return static_cast(acc_copyin(const_cast(h_ptr), len * sizeof(T))); -#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) - #pragma omp target enter data map(to:h_ptr[:len]) - return cnrn_target_deviceptr(const_cast(h_ptr)); -#else - throw std::runtime_error("cnrn_target_copyin() not implemented without OpenACC/OpenMP and gpu build"); -#endif -} - -template -void cnrn_target_delete(T* h_ptr, std::size_t len = 1) { -#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC) - acc_delete(h_ptr, len * sizeof(T)); -#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) - #pragma omp target exit data map(delete: h_ptr[:len]) -#else - throw std::runtime_error("cnrn_target_delete() not implemented without OpenACC/OpenMP and gpu 
build"); -#endif -} - -template -void cnrn_target_memcpy_to_device(T* d_ptr, const T* h_ptr, std::size_t len = 1) { -#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC) - acc_memcpy_to_device(d_ptr, const_cast(h_ptr), len * sizeof(T)); -#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) - omp_target_memcpy(d_ptr, const_cast(h_ptr), len* sizeof(T), 0, 0, omp_get_default_device(), omp_get_initial_device()); -#else - throw std::runtime_error("cnrn_target_memcpy_to_device() not implemented without OpenACC/OpenMP and gpu build"); -#endif -} - /* note: threads here are corresponding to global nrn_threads array */ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { #ifdef _OPENACC diff --git a/coreneuron/kinderiv.py b/coreneuron/kinderiv.py index 35158908c..9b143c0cf 100644 --- a/coreneuron/kinderiv.py +++ b/coreneuron/kinderiv.py @@ -59,6 +59,9 @@ def write_out_kinderiv(fout): fout.write("\n/* declarations */\n") fout.write("\nnamespace coreneuron {\n") + if deriv or kin or euler: + fout.write('nrn_pragma_omp(declare target)\n') + for item in deriv: fout.write('#pragma acc routine seq\n') fout.write('extern int %s%s(_threadargsproto_);\n' % (item[0], item[1])) @@ -73,6 +76,9 @@ def write_out_kinderiv(fout): fout.write('#pragma acc routine seq\n') fout.write('extern int %s%s(_threadargsproto_);\n' % (item[0], item[1])) + if deriv or kin or euler: + fout.write('nrn_pragma_omp(end declare target)\n') + fout.write("\n/* callback indices */\n") derivoffset = 1 kinoffset = 1 diff --git a/coreneuron/mechanism/eion.cpp b/coreneuron/mechanism/eion.cpp index 727f30ea6..6cb3cf83d 100644 --- a/coreneuron/mechanism/eion.cpp +++ b/coreneuron/mechanism/eion.cpp @@ -177,6 +177,7 @@ double nrn_nernst(double ci, double co, double z, double celsius) { } } +nrn_pragma_omp(declare target) void nrn_wrote_conc(int type, double* p1, int p2, @@ -193,6 +194,7 @@ void nrn_wrote_conc(int type, pe[0] = nrn_nernst(pe[1 * _STRIDE], pe[2 * _STRIDE], gimap[type][2], celsius); } } +nrn_pragma_omp(end declare target) static double efun(double x) { if (fabs(x) < 1e-4) { diff --git a/coreneuron/mechanism/mech/dimplic.cpp b/coreneuron/mechanism/mech/dimplic.cpp index e3b08207e..de8970560 100644 --- a/coreneuron/mechanism/mech/dimplic.cpp +++ b/coreneuron/mechanism/mech/dimplic.cpp @@ -24,6 +24,7 @@ #include "coreneuron/mechanism/mech/mod2c_core_thread.hpp" #include "_kinderiv.h" namespace coreneuron { +nrn_pragma_omp(declare target) int derivimplicit_thread(int n, int* slist, int* dlist, DIFUN fun, _threadargsproto_) { difun(fun); return 0; @@ -48,5 +49,6 @@ int nrn_kinetic_steer(int fun, SparseObj* so, double* rhs, _threadargsproto_) { switch (fun) { _NRN_KINETIC_CASES } return 0; } +nrn_pragma_omp(end declare target) } // namespace coreneuron diff --git a/coreneuron/mechanism/register_mech.cpp b/coreneuron/mechanism/register_mech.cpp index a8bff7a50..433140b82 100644 --- a/coreneuron/mechanism/register_mech.cpp +++ b/coreneuron/mechanism/register_mech.cpp @@ -19,7 +19,9 @@ namespace coreneuron { int secondorder = 0; +nrn_pragma_omp(declare target) double t, dt, celsius, pi; +nrn_pragma_omp(end declare target) int rev_dt; using Pfrv = void (*)(); diff --git a/coreneuron/network/cvodestb.cpp b/coreneuron/network/cvodestb.cpp index 31b2fec54..97c70950e 100644 --- a/coreneuron/network/cvodestb.cpp +++ b/coreneuron/network/cvodestb.cpp @@ -86,6 +86,7 @@ void fixed_play_continuous(NrnThread* nt) { // NOTE : this 
implementation is duplicated in "coreneuron/mechanism/nrnoc_ml.ispc" // for the ISPC backend. If changes are required, make sure to change ISPC as well. +nrn_pragma_omp(declare target) int at_time(NrnThread* nt, double te) { double x = te - 1e-11; if (x <= nt->_t && x > (nt->_t - nt->_dt)) { @@ -93,5 +94,6 @@ int at_time(NrnThread* nt, double te) { } return 0; } +nrn_pragma_omp(end declare target) } // namespace coreneuron diff --git a/coreneuron/network/netcvode.cpp b/coreneuron/network/netcvode.cpp index ee2e5cb3e..dd521afde 100644 --- a/coreneuron/network/netcvode.cpp +++ b/coreneuron/network/netcvode.cpp @@ -537,7 +537,7 @@ void NetCvode::check_thresh(NrnThread* nt) { // for default method nrn_pragma_acc(parallel loop present( nt [0:1], presyns_helper [0:nt->n_presyn], presyns [0:nt->n_presyn], actual_v [0:nt->end]) copy(net_send_buf_count) if (nt->compute_gpu) async(nt->stream_id)) - nrn_pragma_omp(target teams distribute parallel for simd map(tofrom: net_send_buf_count) if(nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for map(tofrom: net_send_buf_count) if(nt->compute_gpu)) for (int i = 0; i < nt->ncell; ++i) { PreSyn* ps = presyns + i; PreSynHelper* psh = presyns_helper + i; diff --git a/coreneuron/sim/scopmath/crout_thread.cpp b/coreneuron/sim/scopmath/crout_thread.cpp index b180ea107..72a5c017f 100644 --- a/coreneuron/sim/scopmath/crout_thread.cpp +++ b/coreneuron/sim/scopmath/crout_thread.cpp @@ -50,6 +50,7 @@ namespace coreneuron { #define ix(arg) ((arg) *_STRIDE) /* having a differnt permutation per instance may not be a good idea */ +nrn_pragma_omp(declare target) int nrn_crout_thread(NewtonSpace* ns, int n, double** a, int* perm, _threadargsproto_) { int save_i = 0; @@ -224,4 +225,5 @@ void nrn_scopmath_solve_thread(int n, } } } +nrn_pragma_omp(end declare target) } // namespace coreneuron diff --git a/coreneuron/sim/scopmath/newton_thread.cpp b/coreneuron/sim/scopmath/newton_thread.cpp index 6c0f303ce..dc08ca04b 100644 --- a/coreneuron/sim/scopmath/newton_thread.cpp +++ b/coreneuron/sim/scopmath/newton_thread.cpp @@ -59,6 +59,7 @@ namespace coreneuron { #define ix(arg) ((arg) *_STRIDE) #define s_(arg) _p[s[arg] * _STRIDE] +nrn_pragma_omp(declare target) int nrn_newton_thread(NewtonSpace* ns, int n, int* s, @@ -136,6 +137,7 @@ int nrn_newton_thread(NewtonSpace* ns, return (error); } +nrn_pragma_omp(end declare target) /*------------------------------------------------------------*/ /* */ diff --git a/coreneuron/sim/treeset_core.cpp b/coreneuron/sim/treeset_core.cpp index bb92d2ab1..208058fe1 100644 --- a/coreneuron/sim/treeset_core.cpp +++ b/coreneuron/sim/treeset_core.cpp @@ -34,7 +34,7 @@ static void nrn_rhs(NrnThread* _nt) { nrn_pragma_acc(parallel loop present(vec_rhs [0:i3], vec_d [0:i3]) if (_nt->compute_gpu) async(_nt->stream_id)) - nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) for (int i = i1; i < i3; ++i) { vec_rhs[i] = 0.; vec_d[i] = 0.; @@ -46,7 +46,7 @@ static void nrn_rhs(NrnThread* _nt) { nrn_pragma_acc( parallel loop present(fast_imem_d [i1:i3], fast_imem_rhs [i1:i3]) if (_nt->compute_gpu) async(_nt->stream_id)) - nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) for (int i = i1; i < i3; ++i) { fast_imem_d[i] = 0.; fast_imem_rhs[i] = 0.; @@ -76,7 +76,7 @@ static void nrn_rhs(NrnThread* _nt) { double* p = 
_nt->nrn_fast_imem->nrn_sav_rhs; nrn_pragma_acc(parallel loop present(p, vec_rhs) if (_nt->compute_gpu) async(_nt->stream_id)) - nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) for (int i = i1; i < i3; ++i) { p[i] -= vec_rhs[i]; } @@ -93,7 +93,7 @@ static void nrn_rhs(NrnThread* _nt) { vec_v [0:i3], parent_index [0:i3]) if (_nt->compute_gpu) async(_nt->stream_id)) - nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) for (int i = i2; i < i3; ++i) { double dv = vec_v[parent_index[i]] - vec_v[i]; /* our connection coefficients are negative so */ @@ -153,7 +153,7 @@ static void nrn_lhs(NrnThread* _nt) { */ double* p = _nt->nrn_fast_imem->nrn_sav_d; nrn_pragma_acc(parallel loop present(p, vec_d) if (_nt->compute_gpu) async(_nt->stream_id)) - nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) for (int i = i1; i < i3; ++i) { p[i] += vec_d[i]; } @@ -163,7 +163,7 @@ static void nrn_lhs(NrnThread* _nt) { nrn_pragma_acc(parallel loop present( vec_d [0:i3], vec_a [0:i3], vec_b [0:i3], parent_index [0:i3]) if (_nt->compute_gpu) async(_nt->stream_id)) - nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) for (int i = i2; i < i3; ++i) { nrn_pragma_acc(atomic update) nrn_pragma_omp(atomic update) diff --git a/coreneuron/utils/offload.hpp b/coreneuron/utils/offload.hpp index d90cc10fd..7ec41f4f4 100644 --- a/coreneuron/utils/offload.hpp +++ b/coreneuron/utils/offload.hpp @@ -10,11 +10,64 @@ #if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) #define nrn_pragma_acc(x) #define nrn_pragma_omp(x) _Pragma(nrn_pragma_stringify(omp x)) +#include #elif defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ defined(_OPENACC) #define nrn_pragma_acc(x) _Pragma(nrn_pragma_stringify(acc x)) #define nrn_pragma_omp(x) +#include #else #define nrn_pragma_acc(x) #define nrn_pragma_omp(x) +#include #endif + +#include + +namespace coreneuron { +template +T* cnrn_target_deviceptr(const T* h_ptr) { +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC) + return static_cast(acc_deviceptr(const_cast(h_ptr))); +#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) + return static_cast(omp_get_mapped_ptr(const_cast(h_ptr), omp_get_default_device())); +#else + throw std::runtime_error("cnrn_target_deviceptr() not implemented without OpenACC/OpenMP and gpu build"); +#endif +} + +template +T* cnrn_target_copyin(const T* h_ptr, std::size_t len = 1) { +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC) + return static_cast(acc_copyin(const_cast(h_ptr), len * sizeof(T))); +#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) + #pragma omp target enter data map(to:h_ptr[:len]) + return cnrn_target_deviceptr(const_cast(h_ptr)); +#else + throw std::runtime_error("cnrn_target_copyin() not implemented without OpenACC/OpenMP and gpu build"); +#endif +} + +template +void cnrn_target_delete(T* h_ptr, std::size_t len = 1) { +#if defined(CORENEURON_ENABLE_GPU) && 
!defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC) + acc_delete(h_ptr, len * sizeof(T)); +#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) + #pragma omp target exit data map(delete: h_ptr[:len]) +#else + throw std::runtime_error("cnrn_target_delete() not implemented without OpenACC/OpenMP and gpu build"); +#endif +} + +template +void cnrn_target_memcpy_to_device(T* d_ptr, const T* h_ptr, std::size_t len = 1) { +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC) + acc_memcpy_to_device(d_ptr, const_cast(h_ptr), len * sizeof(T)); +#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) + omp_target_memcpy(d_ptr, const_cast(h_ptr), len* sizeof(T), 0, 0, omp_get_default_device(), omp_get_initial_device()); +#else + throw std::runtime_error("cnrn_target_memcpy_to_device() not implemented without OpenACC/OpenMP and gpu build"); +#endif +} + +} diff --git a/extra/nrnivmodl_core_makefile.in b/extra/nrnivmodl_core_makefile.in index 5bd424865..f51571ae8 100644 --- a/extra/nrnivmodl_core_makefile.in +++ b/extra/nrnivmodl_core_makefile.in @@ -73,8 +73,8 @@ endif CXXFLAGS = @CORENRN_CXX_FLAGS@ CXX_COMPILE_CMD = $(CXX) $(CXXFLAGS) @CMAKE_CXX_COMPILE_OPTIONS_PIC@ @CORENRN_COMMON_COMPILE_DEFS@ $(INCLUDES) -CXX_LINK_EXE_CMD = $(CXX) $(CXXFLAGS) @CMAKE_EXE_LINKER_FLAGS@ -CXX_SHARED_LIB_CMD = $(CXX) $(CXXFLAGS) @CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS@ @CMAKE_SHARED_LIBRARY_CXX_FLAGS@ @CMAKE_SHARED_LINKER_FLAGS@ +CXX_LINK_EXE_CMD = $(CXX) $(CXXFLAGS) @CORENRN_LD_FLAGS@ @CMAKE_EXE_LINKER_FLAGS@ +CXX_SHARED_LIB_CMD = $(CXX) $(CXXFLAGS) @CORENRN_LD_FLAGS@ @CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS@ @CMAKE_SHARED_LIBRARY_CXX_FLAGS@ @CMAKE_SHARED_LINKER_FLAGS@ # ISPC compilation and link commands ISPC = @CMAKE_ISPC_COMPILER@ From 1f01552833472d0ff0cf5ddc1b024d79bde55bb1 Mon Sep 17 00:00:00 2001 From: Nicolas Cornu Date: Thu, 16 Dec 2021 09:35:16 +0100 Subject: [PATCH 17/31] Use pragmas instead of omp_get_mapped_ptr (#705) omp_get_mapped_ptr was added in OpenMP 5.1 and is not widely supported. With this change then calling cnrn_target_deviceptr on a pointer that is not present on the device is a hard error instead of returning nullptr, so avoid calling it for artificial cells. --- coreneuron/gpu/nrn_acc_manager.cpp | 6 ++++-- coreneuron/utils/offload.hpp | 12 ++++++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp index 9bd635d77..2c18f22d9 100644 --- a/coreneuron/gpu/nrn_acc_manager.cpp +++ b/coreneuron/gpu/nrn_acc_manager.cpp @@ -188,8 +188,10 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { int szdp = corenrn.get_prop_dparam_size()[type]; int is_art = corenrn.get_is_artificial()[type]; - // get device pointer for corresponding mechanism data - dptr = cnrn_target_deviceptr(tml->ml->data); + // If the mechanism is artificial data are not inside nt->_data but in a newly + // allocated block. As we never run code for artificial cell inside GPU + // we don't copy it. + dptr = is_art ? 
nullptr : cnrn_target_deviceptr(tml->ml->data); cnrn_target_memcpy_to_device(&(d_ml->data), &(dptr)); diff --git a/coreneuron/utils/offload.hpp b/coreneuron/utils/offload.hpp index 7ec41f4f4..ad4189ec1 100644 --- a/coreneuron/utils/offload.hpp +++ b/coreneuron/utils/offload.hpp @@ -30,7 +30,15 @@ T* cnrn_target_deviceptr(const T* h_ptr) { #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC) return static_cast(acc_deviceptr(const_cast(h_ptr))); #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) - return static_cast(omp_get_mapped_ptr(const_cast(h_ptr), omp_get_default_device())); + T *d_ptr = nullptr; + T *_h_ptr = const_cast(h_ptr); + + nrn_pragma_omp(target data use_device_ptr(_h_ptr)) + { + d_ptr = _h_ptr; + } + + return d_ptr; #else throw std::runtime_error("cnrn_target_deviceptr() not implemented without OpenACC/OpenMP and gpu build"); #endif @@ -42,7 +50,7 @@ T* cnrn_target_copyin(const T* h_ptr, std::size_t len = 1) { return static_cast(acc_copyin(const_cast(h_ptr), len * sizeof(T))); #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) #pragma omp target enter data map(to:h_ptr[:len]) - return cnrn_target_deviceptr(const_cast(h_ptr)); + return cnrn_target_deviceptr(h_ptr); #else throw std::runtime_error("cnrn_target_copyin() not implemented without OpenACC/OpenMP and gpu build"); #endif From d03c45f85f39985b318180603fc958f6edf1d401 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Fri, 17 Dec 2021 12:13:21 +0200 Subject: [PATCH 18/31] GPU implementation improvements (#718) * Set nwarp to very big number for optimal parallelization and improve a bit grid config of CUDA solve_interleaved2 --- coreneuron/apps/corenrn_parameters.hpp | 4 ++-- coreneuron/permute/cellorder.cu | 17 +++++++++++------ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/coreneuron/apps/corenrn_parameters.hpp b/coreneuron/apps/corenrn_parameters.hpp index 21f2f7767..e22cf348d 100644 --- a/coreneuron/apps/corenrn_parameters.hpp +++ b/coreneuron/apps/corenrn_parameters.hpp @@ -46,8 +46,8 @@ struct corenrn_parameters { unsigned ms_subint = 2; /// Number of multisend interval. 1 or 2 unsigned spkcompress = 0; /// Spike Compression unsigned cell_interleave_permute = 0; /// Cell interleaving permutation - unsigned nwarp = 1024; /// Number of warps to balance for cell_interleave_permute == 2 - unsigned num_gpus = 0; /// Number of gpus to use per node + unsigned nwarp = 65536; /// Number of warps to balance for cell_interleave_permute == 2 + unsigned num_gpus = 0; /// Number of gpus to use per node unsigned report_buff_size = report_buff_size_default; /// Size in MB of the report buffer. int seed = -1; /// Initialization seed for random number generator (int) diff --git a/coreneuron/permute/cellorder.cu b/coreneuron/permute/cellorder.cu index 1226b4bf7..0c1b5af2e 100644 --- a/coreneuron/permute/cellorder.cu +++ b/coreneuron/permute/cellorder.cu @@ -92,12 +92,17 @@ __global__ void solve_interleaved2_kernel(NrnThread* nt, InterleaveInfo* ii, int void solve_interleaved2_launcher(NrnThread* nt, InterleaveInfo* info, int ncore, void* stream) { auto cuda_stream = static_cast(stream); - // the selection of these parameters has been done after running the channel-benchmark for typical production runs, i.e. - // 1 MPI task with 1440 cells & 6 MPI tasks with 8800 cells. - // The main idea is to have multiple warps per SM and sufficient blocks to fill the GPU. 
- // In our case, given that multiple threads share the available GPUs, we "guarantee" a sufficient occupancy of the GPUs. - int threadsPerBlock = 128; - int blocksPerGrid = 512; + /// the selection of these parameters has been done after running the channel-benchmark for + /// typical production runs, i.e. 1 MPI task with 1440 cells & 6 MPI tasks with 8800 cells. + /// In the OpenACC/OpenMP implementations threadsPerBlock is set to 32. From profiling the + /// channel-benchmark circuits mentioned above we figured out that the best performance was + /// achieved with this configuration + int threadsPerBlock = warpsize; + /// Max number of blocksPerGrid for NVIDIA GPUs is 65535, so we need to make sure that the + /// blocksPerGrid we launch the CUDA kernel with doesn't exceed this number + const auto maxBlocksPerGrid = 65535; + int provisionalBlocksPerGrid = (ncore + threadsPerBlock - 1) / threadsPerBlock; + int blocksPerGrid = provisionalBlocksPerGrid <= maxBlocksPerGrid ? provisionalBlocksPerGrid : maxBlocksPerGrid; solve_interleaved2_kernel<<>>(nt, info, ncore); From 3fc7037842c3d80372d6ea1f3860643585d0f46b Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Fri, 17 Dec 2021 14:53:39 +0100 Subject: [PATCH 19/31] More CI + disable OpenACC in OpenMP builds (#717) * Re-enable GitLab CI. * Add NMODL + OpenACC test. * Restore {clang,cmake}-format checks. * Prefer OpenACC with MOD2C. * Do not enable OpenACC in NMODL + OpenMP mode. * Convert more #pragma acc to nrn_pragma_acc(...). * Call cudaSetDevice in OpenMP mode. Co-authored-by: Ioannis Magkanaris --- .../workflows/clang_cmake_format_check.yaml | 37 +++ .gitlab-ci.yml | 132 +++++++++- CMake/OpenAccHelper.cmake | 12 +- CMakeLists.txt | 14 + coreneuron/CMakeLists.txt | 16 +- coreneuron/apps/main1.cpp | 11 +- coreneuron/gpu/nrn_acc_manager.cpp | 243 +++++++++--------- coreneuron/gpu/nrn_acc_manager.hpp | 4 - coreneuron/kinderiv.py | 8 +- .../mechanism/mech/mod2c_core_thread.hpp | 31 ++- coreneuron/mechanism/mechanism.hpp | 3 +- coreneuron/mechanism/membfunc.hpp | 13 +- coreneuron/network/cvodestb.cpp | 6 +- coreneuron/network/netcvode.cpp | 3 - coreneuron/network/partrans.cpp | 3 +- coreneuron/permute/cellorder.cpp | 27 +- coreneuron/sim/scopmath/newton_struct.h | 10 +- coreneuron/sim/scopmath/sparse_thread.cpp | 2 +- coreneuron/sim/scopmath/ssimplic_thread.cpp | 5 +- coreneuron/utils/ivocvect.cpp | 5 +- coreneuron/utils/ivocvect.hpp | 12 +- coreneuron/utils/offload.hpp | 61 +++-- coreneuron/utils/profile/profiler_interface.h | 6 +- coreneuron/utils/randoms/nrnran123.h | 13 +- external/nmodl | 2 +- 25 files changed, 456 insertions(+), 223 deletions(-) create mode 100644 .github/workflows/clang_cmake_format_check.yaml diff --git a/.github/workflows/clang_cmake_format_check.yaml b/.github/workflows/clang_cmake_format_check.yaml new file mode 100644 index 000000000..b438a8080 --- /dev/null +++ b/.github/workflows/clang_cmake_format_check.yaml @@ -0,0 +1,37 @@ +name: clang-cmake-format-check + +concurrency: + group: ${{ github.workflow }}#${{ github.ref }} + cancel-in-progress: true + +on: + push: + +jobs: + build: + name: clang-cmake-format-check + runs-on: ubuntu-20.04 + steps: + - name: Fetch repository + uses: actions/checkout@v2 + - name: Install clang-format 11 + run: | + sudo apt-get update + sudo apt-get install clang-format-11 python3-pip libboost-all-dev libopenmpi-dev openmpi-bin + - name: Install cmake-format 0.6.13 + run: python3 -m pip install cmake-format==0.6.13 + - name: Configure + shell: bash + working-directory: 
${{runner.workspace}}/CoreNeuron + run: | + export PATH=/home/runner/.local/bin:$PATH + mkdir BUILD && cd BUILD + cmake -DCORENRN_CLANG_FORMAT=ON -DCORENRN_CMAKE_FORMAT=ON -DCORENRN_ENABLE_MPI=ON -DCORENRN_ENABLE_OPENMP=OFF -DClangFormat_EXECUTABLE=$(which clang-format-11) -DCMakeFormat_EXECUTABLE=$(which cmake-format) .. + - name: Run clang-format + shell: bash + working-directory: ${{runner.workspace}}/CoreNeuron/BUILD + run: make check-clang-format VERBOSE=1 + - name: Run cmake-format + shell: bash + working-directory: ${{runner.workspace}}/CoreNeuron/BUILD + run: make check-cmake-format diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 84e83c0ac..8b434bf81 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -35,6 +35,9 @@ spack_setup: - git diff - fi +.spack_intel: + variables: + SPACK_PACKAGE_COMPILER: intel .spack_nvhpc: variables: SPACK_PACKAGE_COMPILER: nvhpc @@ -42,11 +45,21 @@ spack_setup: variables: SPACK_PACKAGE: neuron SPACK_PACKAGE_REF: '' - SPACK_PACKAGE_SPEC: +coreneuron+debug+tests~legacy-unit + SPACK_PACKAGE_SPEC: +coreneuron+debug+tests~legacy-unit model_tests=channel-benchmark,olfactory .gpu_node: variables: bb5_constraint: volta +build:nmodl:intel: + stage: build_nmodl + variables: + SPACK_PACKAGE: nmodl + SPACK_PACKAGE_REF: '' + SPACK_PACKAGE_SPEC: ~legacy-unit + extends: + - .spack_build + - .spack_intel + build:nmodl:gpu: stage: build_nmodl variables: @@ -58,21 +71,92 @@ build:nmodl:gpu: - .spack_build - .spack_nvhpc +build:coreneuron+nmodl:intel: + variables: + SPACK_PACKAGE: coreneuron + SPACK_PACKAGE_SPEC: +nmodl+tests~legacy-unit build_type=Debug + extends: + - .spack_build + - .spack_intel + needs: ["build:nmodl:intel"] + +build:coreneuron:intel: + variables: + SPACK_PACKAGE: coreneuron + SPACK_PACKAGE_SPEC: +tests~legacy-unit build_type=Debug + extends: + - .spack_build + - .spack_intel + build:coreneuron+nmodl:gpu: variables: SPACK_PACKAGE: coreneuron # +report pulls in a lot of dependencies and the tests fail. # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type - SPACK_PACKAGE_SPEC: +nmodl+openmp+gpu+tests~legacy-unit~report build_type=RelWithDebInfo + SPACK_PACKAGE_SPEC: +nmodl+openmp+gpu+tests~legacy-unit~report~sympy build_type=RelWithDebInfo extends: - .spack_build - .spack_nvhpc needs: ["build:nmodl:gpu"] +build:coreneuron+nmodl~openmp:gpu: + variables: + SPACK_PACKAGE: coreneuron + # +report pulls in a lot of dependencies and the tests fail. + # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type + # Sympy + OpenMP target offload does not currently work with NVHPC + SPACK_PACKAGE_SPEC: +nmodl~openmp+gpu+tests~legacy-unit~report+sympy build_type=RelWithDebInfo + extends: + - .spack_build + - .spack_nvhpc + needs: ["build:nmodl:gpu"] + +build:coreneuron:gpu: + variables: + SPACK_PACKAGE: coreneuron + # +report pulls in a lot of dependencies and the tests fail. 
+ # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type + SPACK_PACKAGE_SPEC: +gpu+openmp+tests~legacy-unit~report build_type=RelWithDebInfo + extends: + - .spack_build + - .spack_nvhpc + +test:coreneuron+nmodl:intel: + extends: [.ctest] + needs: ["build:coreneuron+nmodl:intel"] + +test:coreneuron:intel: + extends: [.ctest] + needs: ["build:coreneuron:intel"] + test:coreneuron+nmodl:gpu: extends: [.ctest, .gpu_node] needs: ["build:coreneuron+nmodl:gpu"] +test:coreneuron+nmodl~openmp:gpu: + extends: [.ctest, .gpu_node] + needs: ["build:coreneuron+nmodl~openmp:gpu"] + +test:coreneuron:gpu: + extends: [.ctest, .gpu_node] + needs: ["build:coreneuron:gpu"] + +build:neuron+nmodl:intel: + stage: build_neuron + extends: + - .spack_build + - .spack_neuron + - .spack_intel + needs: ["build:coreneuron+nmodl:intel"] + +build:neuron:intel: + stage: build_neuron + extends: + - .spack_build + - .spack_neuron + - .spack_intel + needs: ["build:coreneuron:intel"] + build:neuron+nmodl:gpu: stage: build_neuron extends: @@ -85,7 +169,51 @@ build:neuron+nmodl:gpu: - !reference [.spack_build, before_script] needs: ["build:coreneuron+nmodl:gpu"] +build:neuron+nmodl~openmp:gpu: + stage: build_neuron + extends: + - .spack_build + - .spack_neuron + - .spack_nvhpc + before_script: + # Build py-cython and py-numpy with GCC instead of NVHPC. + - SPACK_PACKAGE_DEPENDENCIES="${SPACK_PACKAGE_DEPENDENCIES}^py-cython%gcc^py-numpy%gcc" + - !reference [.spack_build, before_script] + needs: ["build:coreneuron+nmodl~openmp:gpu"] + +build:neuron:gpu: + stage: build_neuron + extends: + - .spack_build + - .spack_neuron + - .spack_nvhpc + before_script: + # Build py-cython and py-numpy with GCC instead of NVHPC. + - SPACK_PACKAGE_DEPENDENCIES="${SPACK_PACKAGE_DEPENDENCIES}^py-cython%gcc^py-numpy%gcc" + - !reference [.spack_build, before_script] + needs: ["build:coreneuron:gpu"] + +test:neuron+nmodl:intel: + stage: test_neuron + extends: [.ctest] + needs: ["build:neuron+nmodl:intel"] + +test:neuron:intel: + stage: test_neuron + extends: [.ctest] + needs: ["build:neuron:intel"] + test:neuron+nmodl:gpu: stage: test_neuron extends: [.ctest, .gpu_node] needs: ["build:neuron+nmodl:gpu"] + +test:neuron+nmodl~openmp:gpu: + stage: test_neuron + extends: [.ctest, .gpu_node] + needs: ["build:neuron+nmodl~openmp:gpu"] + +test:neuron:gpu: + stage: test_neuron + extends: [.ctest, .gpu_node] + needs: ["build:neuron:gpu"] diff --git a/CMake/OpenAccHelper.cmake b/CMake/OpenAccHelper.cmake index e8fa6738a..063b32003 100644 --- a/CMake/OpenAccHelper.cmake +++ b/CMake/OpenAccHelper.cmake @@ -55,20 +55,26 @@ if(CORENRN_ENABLE_GPU) # due to e.g. __CUDACC__ being defined. See https://github.com/BlueBrain/CoreNeuron/issues/607 for # more information about this. -gpu=cudaX.Y ensures that OpenACC code is compiled with the same # CUDA version as is used for the explicit CUDA code. - set(NVHPC_ACC_COMP_FLAGS "-acc -Minfo=accel -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo") - set(NVHPC_ACC_LINK_FLAGS "-acc -cuda") + set(NVHPC_ACC_COMP_FLAGS "-Minfo=accel -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo") + set(NVHPC_ACC_LINK_FLAGS "-cuda") # Make sure that OpenACC code is generated for the same compute capabilities as the explicit CUDA # code. Otherwise there may be confusing linker errors. We cannot rely on nvcc and nvc++ using the # same default compute capabilities as each other, particularly on GPU-less build machines. 
foreach(compute_capability ${CMAKE_CUDA_ARCHITECTURES}) string(APPEND NVHPC_ACC_COMP_FLAGS ",cc${compute_capability}") endforeach() - if(CORENRN_ENABLE_OPENMP AND CORENRN_ENABLE_OPENMP_OFFLOAD) + if(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenMP") # Enable OpenMP target offload to GPU and if both OpenACC and OpenMP directives are available # for a region then prefer OpenMP. add_compile_definitions(CORENEURON_PREFER_OPENMP_OFFLOAD) string(APPEND NVHPC_ACC_COMP_FLAGS " -mp=gpu -Minfo=mp") string(APPEND NVHPC_ACC_LINK_FLAGS " -mp=gpu") + elseif(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenACC") + # Only enable OpenACC offload for GPU + string(APPEND NVHPC_ACC_COMP_FLAGS " -acc") + string(APPEND NVHPC_ACC_LINK_FLAGS " -acc") + else() + message(FATAL_ERROR "${CORENRN_ACCELERATOR_OFFLOAD} not supported with NVHPC compilers") endif() # avoid PGI adding standard compliant "-A" flags set(CMAKE_CXX14_STANDARD_COMPILE_OPTION --c++14) diff --git a/CMakeLists.txt b/CMakeLists.txt index 963703975..df528a965 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -121,6 +121,7 @@ else() set(CORENRN_HAVE_NVHPC_COMPILER OFF) endif() +set(CORENRN_ACCELERATOR_OFFLOAD "Disabled") if(CORENRN_ENABLE_GPU) # Older CMake versions than 3.15 have not been tested for GPU/CUDA/OpenACC support after # https://github.com/BlueBrain/CoreNeuron/pull/609. @@ -189,6 +190,18 @@ if(CORENRN_ENABLE_GPU) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr -Xcudafe --diag_suppress=3057,--diag_suppress=3085" ) + + if(CORENRN_ENABLE_NMODL) + # NMODL supports both OpenACC and OpenMP target offload + if(CORENRN_ENABLE_OPENMP AND CORENRN_ENABLE_OPENMP_OFFLOAD) + set(CORENRN_ACCELERATOR_OFFLOAD "OpenMP") + else() + set(CORENRN_ACCELERATOR_OFFLOAD "OpenACC") + endif() + else() + # MOD2C only supports OpenACC offload + set(CORENRN_ACCELERATOR_OFFLOAD "OpenACC") + endif() endif() # ============================================================================= @@ -530,6 +543,7 @@ message(STATUS "MOD2CPP PATH | ${CORENRN_MOD2CPP_BINARY}") message(STATUS "GPU Support | ${CORENRN_ENABLE_GPU}") if(CORENRN_ENABLE_GPU) message(STATUS " CUDA | ${CUDAToolkit_LIBRARY_DIR}") + message(STATUS " Offload | ${CORENRN_ACCELERATOR_OFFLOAD}") message(STATUS " Unified Memory | ${CORENRN_ENABLE_CUDA_UNIFIED_MEMORY}") endif() message(STATUS "Auto Timeout | ${CORENRN_ENABLE_TIMEOUT}") diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt index 437eb8ea7..f42568a27 100644 --- a/coreneuron/CMakeLists.txt +++ b/coreneuron/CMakeLists.txt @@ -321,13 +321,15 @@ if(EXISTS "${CORENRN_EXTERNAL_BENCHMARK_DATA}") COMMENT "Running nrnivmodl-core for channel-benchmark mechanisms") list(APPEND all_output_binaries ${output_binaries}) string( - CONCAT - benchmark_command - "'${CMAKE_BINARY_DIR}/benchmark/${CMAKE_SYSTEM_PROCESSOR}/special-core'" - " --datpath '${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark-all-440-cells-2-ranks'" - " --tstop 1 --gpu &&" - "diff out.dat '${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark-all-440-cells-2-ranks.gpu.spikes'" - ) + CONCAT benchmark_command + "'${CMAKE_BINARY_DIR}/benchmark/${CMAKE_SYSTEM_PROCESSOR}/special-core'" + " --datpath '${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark-all-440-cells-2-ranks'" + " --tstop 1 --mpi") + if(CORENRN_ENABLE_GPU) + string(APPEND benchmark_command " --gpu") + endif() + string(APPEND benchmark_command " && diff out.dat " + "'${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark-all-440-cells-2-ranks.gpu.spikes'") add_test(NAME benchmark COMMAND sh -c "${benchmark_command}") 
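For orientation (not part of the patch): the nrn_pragma_acc()/nrn_pragma_omp() macros used throughout the C++ hunks below are steered by the CORENEURON_PREFER_OPENMP_OFFLOAD compile definition that the CMake logic above adds; they presumably live in coreneuron/utils/offload.hpp, of which only a part appears in this series. A minimal sketch of how such dual-backend macros can be defined and used, with the caveat that the real definitions may differ in detail:

// Sketch only: route one directive spelling to whichever offload backend is active.
#define nrn_pragma_stringify(x) #x
#if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
#define nrn_pragma_acc(x)
#define nrn_pragma_omp(x) _Pragma(nrn_pragma_stringify(omp x))
#elif defined(CORENEURON_ENABLE_GPU) && defined(_OPENACC)
#define nrn_pragma_acc(x) _Pragma(nrn_pragma_stringify(acc x))
#define nrn_pragma_omp(x)
#else
#define nrn_pragma_acc(x)
#define nrn_pragma_omp(x)
#endif

// Usage sketch: declare a helper as callable from device code under either backend.
nrn_pragma_omp(declare target)
nrn_pragma_acc(routine seq)
extern int example_device_helper(int);  // hypothetical function, for illustration only
nrn_pragma_omp(end declare target)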
endif() set(modfile_directory "${CORENEURON_PROJECT_SOURCE_DIR}/tests/integration/ring_gap/mod") diff --git a/coreneuron/apps/main1.cpp b/coreneuron/apps/main1.cpp index 6a4d43bea..5bfda9421 100644 --- a/coreneuron/apps/main1.cpp +++ b/coreneuron/apps/main1.cpp @@ -193,10 +193,11 @@ void nrn_init_and_load_data(int argc, // precedence is: set by user, globals.dat, 34.0 celsius = corenrn_param.celsius; -#if _OPENACC +#if CORENEURON_ENABLE_GPU if (!corenrn_param.gpu && corenrn_param.cell_interleave_permute == 2) { fprintf(stderr, - "compiled with _OPENACC does not allow the combination of --cell-permute=2 and " + "compiled with CORENEURON_ENABLE_GPU does not allow the combination of " + "--cell-permute=2 and " "missing --gpu\n"); exit(1); } @@ -497,7 +498,7 @@ extern "C" void mk_mech_init(int argc, char** argv) { } #endif -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU if (corenrn_param.gpu) { init_gpu(); } @@ -558,8 +559,8 @@ extern "C" int run_solve_core(int argc, char** argv) { #endif bool compute_gpu = corenrn_param.gpu; - nrn_pragma_acc(update device(celsius, secondorder, pi) if(compute_gpu)) - nrn_pragma_omp(target update to(celsius, secondorder, pi) if(compute_gpu)) + nrn_pragma_acc(update device(celsius, secondorder, pi) if (compute_gpu)) + nrn_pragma_omp(target update to(celsius, secondorder, pi) if (compute_gpu)) { double v = corenrn_param.voltage; double dt = corenrn_param.dt; diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp index 2c18f22d9..edf9b6d63 100644 --- a/coreneuron/gpu/nrn_acc_manager.cpp +++ b/coreneuron/gpu/nrn_acc_manager.cpp @@ -28,8 +28,8 @@ #include #endif -#ifdef _OPENACC -#include +#if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) +#include #endif namespace coreneuron { @@ -41,9 +41,44 @@ void nrn_ion_global_map_delete_from_device(); void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay); void nrn_VecPlay_delete_from_device(NrnThread* nt); +int cnrn_target_get_num_devices() { +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENACC) + // choose nvidia GPU by default + acc_device_t device_type = acc_device_nvidia; + // check how many gpu devices available per node + return acc_get_num_devices(device_type); +#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENMP) + return omp_get_num_devices(); +#else + throw std::runtime_error( + "cnrn_target_get_num_devices() not implemented without OpenACC/OpenMP and gpu build"); +#endif +} + +void cnrn_target_set_default_device(int device_num) { +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENACC) + acc_set_device_num(device_num, acc_device_nvidia); +#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENMP) + omp_set_default_device(device_num); + // It seems that with NVHPC 21.9 then only setting the default OpenMP device + // is not enough: there were errors on some nodes when not-the-0th GPU was + // used. These seemed to be related to the NMODL instance structs, which are + // allocated using cudaMallocManaged. 
+ auto const cuda_code = cudaSetDevice(device_num); + assert(cuda_code == cudaSuccess); +#else + throw std::runtime_error( + "cnrn_target_set_default_device() not implemented without OpenACC/OpenMP and gpu build"); +#endif +} + /* note: threads here are corresponding to global nrn_threads array */ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU // initialize NrnThreads for gpu execution // empty thread or only artificial cells should be on cpu for (int i = 0; i < nthreads; i++) { @@ -148,8 +183,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { cnrn_target_memcpy_to_device(&(d_nt->_v_parent_index), &(d_v_parent_index)); /* nt._ml_list is used in NET_RECEIVE block and should have valid membrane list id*/ - Memb_list** d_ml_list = cnrn_target_copyin(nt->_ml_list, - corenrn.get_memb_funcs().size()); + Memb_list** d_ml_list = cnrn_target_copyin(nt->_ml_list, corenrn.get_memb_funcs().size()); cnrn_target_memcpy_to_device(&(d_nt->_ml_list), &(d_ml_list)); /* -- copy NrnThreadMembList list ml to device -- */ @@ -306,8 +340,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { if (nt->n_pntproc) { /* copy Point_processes array and fix the pointer to execute net_receive blocks on GPU */ - Point_process* pntptr = - cnrn_target_copyin(nt->pntprocs, nt->n_pntproc); + Point_process* pntptr = cnrn_target_copyin(nt->pntprocs, nt->n_pntproc); cnrn_target_memcpy_to_device(&(d_nt->pntprocs), &pntptr); } @@ -330,8 +363,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { * while updating PreSyn objects which has virtual base class. May be this is issue due * to * VTable and alignment */ - PreSynHelper* d_presyns_helper = - cnrn_target_copyin(nt->presyns_helper, nt->n_presyn); + PreSynHelper* d_presyns_helper = cnrn_target_copyin(nt->presyns_helper, nt->n_presyn); cnrn_target_memcpy_to_device(&(d_nt->presyns_helper), &d_presyns_helper); PreSyn* d_presyns = cnrn_target_copyin(nt->presyns, nt->n_presyn); cnrn_target_memcpy_to_device(&(d_nt->presyns), &d_presyns); @@ -340,7 +372,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { if (nt->_net_send_buffer_size) { /* copy send_receive buffer */ int* d_net_send_buffer = cnrn_target_copyin(nt->_net_send_buffer, - nt->_net_send_buffer_size); + nt->_net_send_buffer_size); cnrn_target_memcpy_to_device(&(d_nt->_net_send_buffer), &d_net_send_buffer); } @@ -446,13 +478,13 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { } void copy_ivoc_vect_to_device(const IvocVect& from, IvocVect& to) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU /// by default `to` is desitionation pointer on a device IvocVect* d_iv = &to; size_t n = from.size(); if (n) { - double* d_data = cnrn_target_copyin(from.data(), n); + double* d_data = cnrn_target_copyin(from.data(), n); cnrn_target_memcpy_to_device(&(d_iv->data_), &d_data); } #else @@ -462,7 +494,7 @@ void copy_ivoc_vect_to_device(const IvocVect& from, IvocVect& to) { } void delete_ivoc_vect_from_device(IvocVect& vec) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU auto const n = vec.size(); if (n) { cnrn_target_delete(vec.data(), n); @@ -479,7 +511,7 @@ void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml) { return; } -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU if (nt->compute_gpu) { // free existing vectors in buffers on gpu cnrn_target_delete(nrb->_pnt_index, nrb->_size); @@ -500,7 +532,7 @@ void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml) { 
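As a usage illustration of the two wrappers introduced at the top of nrn_acc_manager.cpp (a sketch that mirrors the init_gpu() change later in this series rather than adding anything new): one GPU is chosen per rank, round-robin across the ranks sharing a node. local_rank is an assumed name for the rank's index on the node, and <stdexcept> is assumed to be available.

// Sketch: round-robin GPU assignment across the ranks that share a node.
int num_devices_per_node = cnrn_target_get_num_devices();
if (num_devices_per_node == 0) {
    // the real code reports the error through CoreNeuron's own abort path
    throw std::runtime_error("GPU execution requested but no GPU device was found");
}
cnrn_target_set_default_device(local_rank % num_devices_per_node);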
nrb->_displ = (int*) erealloc(nrb->_displ, (nrb->_size + 1) * sizeof(int)); nrb->_nrb_index = (int*) erealloc(nrb->_nrb_index, nrb->_size * sizeof(int)); -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU if (nt->compute_gpu) { int *d_weight_index, *d_pnt_index, *d_displ, *d_nrb_index; double *d_nrb_t, *d_nrb_flag; @@ -628,7 +660,7 @@ void update_net_receive_buffer(NrnThread* nt) { } void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU if (!nt->compute_gpu) return; @@ -643,22 +675,22 @@ void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb) { if (nsb->_cnt) { Instrumentor::phase p_net_receive_buffer_order("net-send-buf-gpu2cpu"); } - nrn_pragma_acc(update self( - nsb->_sendtype[:nsb->_cnt], - nsb->_vdata_index[:nsb->_cnt], - nsb->_pnt_index[:nsb->_cnt], - nsb->_weight_index[:nsb->_cnt], - nsb->_nsb_t[:nsb->_cnt], - nsb->_nsb_flag[:nsb->_cnt]) - if (nsb->_cnt)) - nrn_pragma_omp(target update from( - nsb->_sendtype[:nsb->_cnt], - nsb->_vdata_index[:nsb->_cnt], - nsb->_pnt_index[:nsb->_cnt], - nsb->_weight_index[:nsb->_cnt], - nsb->_nsb_t[:nsb->_cnt], - nsb->_nsb_flag[:nsb->_cnt]) - if (nsb->_cnt)) + // clang-format off + nrn_pragma_acc(update self(nsb->_sendtype[:nsb->_cnt], + nsb->_vdata_index[:nsb->_cnt], + nsb->_pnt_index[:nsb->_cnt], + nsb->_weight_index[:nsb->_cnt], + nsb->_nsb_t[:nsb->_cnt], + nsb->_nsb_flag[:nsb->_cnt]) + if (nsb->_cnt)) + nrn_pragma_omp(target update from(nsb->_sendtype[:nsb->_cnt], + nsb->_vdata_index[:nsb->_cnt], + nsb->_pnt_index[:nsb->_cnt], + nsb->_weight_index[:nsb->_cnt], + nsb->_nsb_t[:nsb->_cnt], + nsb->_nsb_flag[:nsb->_cnt]) + if (nsb->_cnt)) + // clang-format on #else (void) nt; (void) nsb; @@ -666,7 +698,7 @@ void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb) { } void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU for (int i = 0; i < nthreads; i++) { NrnThread* nt = threads + i; @@ -676,23 +708,24 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { int ne = nrn_soa_padded_size(nt->end, 0); - nrn_pragma_acc(update self( - nt->_actual_rhs[:ne], - nt->_actual_d[:ne], - nt->_actual_a[:ne], - nt->_actual_b[:ne], - nt->_actual_v[:ne], - nt->_actual_area[:ne])) - nrn_pragma_omp(target update from( - nt->_actual_rhs[:ne], - nt->_actual_d[:ne], - nt->_actual_a[:ne], - nt->_actual_b[:ne], - nt->_actual_v[:ne], - nt->_actual_area[:ne])) + // clang-format off + nrn_pragma_acc(update self(nt->_actual_rhs[:ne], + nt->_actual_d[:ne], + nt->_actual_a[:ne], + nt->_actual_b[:ne], + nt->_actual_v[:ne], + nt->_actual_area[:ne])) + nrn_pragma_omp(target update from(nt->_actual_rhs[:ne], + nt->_actual_d[:ne], + nt->_actual_a[:ne], + nt->_actual_b[:ne], + nt->_actual_v[:ne], + nt->_actual_area[:ne])) + // clang-format on nrn_pragma_acc(update self(nt->_actual_diam[:ne]) if (nt->_actual_diam != nullptr)) - nrn_pragma_omp(target update from(nt->_actual_diam[:ne]) if (nt->_actual_diam != nullptr)) + nrn_pragma_omp( + target update from(nt->_actual_diam[:ne]) if (nt->_actual_diam != nullptr)) /* @todo: nt._ml_list[tml->index] = tml->ml; */ @@ -700,10 +733,8 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { for (auto tml = nt->tml; tml; tml = tml->next) { Memb_list* ml = tml->ml; - nrn_pragma_acc(update self(tml->index, - ml->nodecount)) - nrn_pragma_omp(target update from(tml->index, - ml->nodecount)) + nrn_pragma_acc(update self(tml->index, ml->nodecount)) + nrn_pragma_omp(target 
update from(tml->index, ml->nodecount)) int type = tml->index; int n = ml->nodecount; @@ -720,10 +751,8 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szp; - nrn_pragma_acc(update self(ml->data[:pcnt], - ml->nodeindices[:n])) - nrn_pragma_omp(target update from(ml->data[:pcnt], - ml->nodeindices[:n])) + nrn_pragma_acc(update self(ml->data[:pcnt], ml->nodeindices[:n])) + nrn_pragma_omp(target update from(ml->data[:pcnt], ml->nodeindices[:n])) int dpcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; nrn_pragma_acc(update self(ml->pdata[:dpcnt]) if (szdp)) @@ -731,46 +760,44 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { auto nrb = tml->ml->_net_receive_buffer; - nrn_pragma_acc(update self( - nrb->_cnt, - nrb->_size, - nrb->_pnt_offset, - nrb->_displ_cnt, - - nrb->_pnt_index[:nrb->_size], - nrb->_weight_index[:nrb->_size], - nrb->_displ[:nrb->_size + 1], - nrb->_nrb_index[:nrb->_size]) - if (nrb != nullptr)) - nrn_pragma_omp(target update from( - nrb->_cnt, - nrb->_size, - nrb->_pnt_offset, - nrb->_displ_cnt, - - nrb->_pnt_index[:nrb->_size], - nrb->_weight_index[:nrb->_size], - nrb->_displ[:nrb->_size + 1], - nrb->_nrb_index[:nrb->_size]) - if (nrb != nullptr)) + // clang-format off + nrn_pragma_acc(update self(nrb->_cnt, + nrb->_size, + nrb->_pnt_offset, + nrb->_displ_cnt, + nrb->_pnt_index[:nrb->_size], + nrb->_weight_index[:nrb->_size], + nrb->_displ[:nrb->_size + 1], + nrb->_nrb_index[:nrb->_size]) + if (nrb != nullptr)) + nrn_pragma_omp(target update from(nrb->_cnt, + nrb->_size, + nrb->_pnt_offset, + nrb->_displ_cnt, + nrb->_pnt_index[:nrb->_size], + nrb->_weight_index[:nrb->_size], + nrb->_displ[:nrb->_size + 1], + nrb->_nrb_index[:nrb->_size]) + if (nrb != nullptr)) + // clang-format on } int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); /* copy shadow_rhs to host */ /* copy shadow_d to host */ - nrn_pragma_acc(update self(nt->_shadow_rhs[:pcnt], - nt->_shadow_d[:pcnt]) - if (nt->shadow_rhs_cnt)) - nrn_pragma_omp(target update from(nt->_shadow_rhs[:pcnt], - nt->_shadow_d[:pcnt]) - if (nt->shadow_rhs_cnt)) + nrn_pragma_acc( + update self(nt->_shadow_rhs[:pcnt], nt->_shadow_d[:pcnt]) if (nt->shadow_rhs_cnt)) + nrn_pragma_omp(target update from( + nt->_shadow_rhs[:pcnt], nt->_shadow_d[:pcnt]) if (nt->shadow_rhs_cnt)) + // clang-format off nrn_pragma_acc(update self(nt->nrn_fast_imem->nrn_sav_rhs[:nt->end], - nt->nrn_fast_imem->nrn_sav_d[:nt->end]) - if (nt->nrn_fast_imem != nullptr)) + nt->nrn_fast_imem->nrn_sav_d[:nt->end]) + if (nt->nrn_fast_imem != nullptr)) nrn_pragma_omp(target update from(nt->nrn_fast_imem->nrn_sav_rhs[:nt->end], - nt->nrn_fast_imem->nrn_sav_d[:nt->end]) - if (nt->nrn_fast_imem != nullptr)) + nt->nrn_fast_imem->nrn_sav_d[:nt->end]) + if (nt->nrn_fast_imem != nullptr)) + // clang-format on nrn_pragma_acc(update self(nt->pntprocs[:nt->n_pntproc]) if (nt->n_pntproc)) nrn_pragma_omp(target update from(nt->pntprocs[:nt->n_pntproc]) if (nt->n_pntproc)) @@ -779,13 +806,9 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { nrn_pragma_omp(target update from(nt->weights[:nt->n_weight]) if (nt->n_weight)) nrn_pragma_acc(update self( - nt->presyns_helper[:nt->n_presyn], - nt->presyns[:nt->n_presyn]) - if (nt->n_presyn)) + nt->presyns_helper[:nt->n_presyn], nt->presyns[:nt->n_presyn]) if (nt->n_presyn)) nrn_pragma_omp(target update from( - nt->presyns_helper[:nt->n_presyn], - nt->presyns[:nt->n_presyn]) - if (nt->n_presyn)) + nt->presyns_helper[:nt->n_presyn], 
nt->presyns[:nt->n_presyn]) if (nt->n_presyn)) { TrajectoryRequests* tr = nt->trajec_requests; @@ -793,10 +816,8 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { // The full buffers have `bsize` entries, but only `vsize` // of them are valid. for (int i = 0; i < tr->n_trajec; ++i) { - nrn_pragma_acc(update self( - tr->varrays[i][:tr->vsize])) - nrn_pragma_omp(target update from( - tr->varrays[i][:tr->vsize])) + nrn_pragma_acc(update self(tr->varrays[i][:tr->vsize])) + nrn_pragma_omp(target update from(tr->varrays[i][:tr->vsize])) } } } @@ -858,7 +879,7 @@ void update_weights_from_gpu(NrnThread* threads, int nthreads) { * the same process. */ void delete_nrnthreads_on_device(NrnThread* threads, int nthreads) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU for (int i = 0; i < nthreads; i++) { NrnThread* nt = threads + i; { @@ -991,7 +1012,7 @@ void delete_nrnthreads_on_device(NrnThread* threads, int nthreads) { void nrn_newtonspace_copyto_device(NewtonSpace* ns) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU // FIXME this check needs to be tweaked if we ever want to run with a mix // of CPU and GPU threads. if (nrn_threads[0].compute_gpu == 0) { @@ -1033,7 +1054,7 @@ void nrn_newtonspace_copyto_device(NewtonSpace* ns) { } void nrn_newtonspace_delete_from_device(NewtonSpace* ns) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU // FIXME this check needs to be tweaked if we ever want to run with a mix // of CPU and GPU threads. if (nrn_threads[0].compute_gpu == 0) { @@ -1052,7 +1073,7 @@ void nrn_newtonspace_delete_from_device(NewtonSpace* ns) { } void nrn_sparseobj_copyto_device(SparseObj* so) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU // FIXME this check needs to be tweaked if we ever want to run with a mix // of CPU and GPU threads. if (nrn_threads[0].compute_gpu == 0) { @@ -1135,7 +1156,7 @@ void nrn_sparseobj_copyto_device(SparseObj* so) { } void nrn_sparseobj_delete_from_device(SparseObj* so) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU // FIXME this check needs to be tweaked if we ever want to run with a mix // of CPU and GPU threads. 
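The update self(...) / target update from(...) pairs reformatted in the hunks above all share one shape. A minimal hedged sketch with hypothetical names (buf, cnt), assuming the nrn_pragma_* macros from coreneuron/utils/offload.hpp are in scope:

// Sketch: copy a device-resident array back to the host under whichever
// backend is active, and only when there is something to copy.
void update_buffer_on_host(double* buf, int cnt) {
    nrn_pragma_acc(update self(buf[:cnt]) if (cnt))
    nrn_pragma_omp(target update from(buf[:cnt]) if (cnt))
}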
if (nrn_threads[0].compute_gpu == 0) { @@ -1157,12 +1178,11 @@ void nrn_sparseobj_delete_from_device(SparseObj* so) { #endif } -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU void nrn_ion_global_map_copyto_device() { if (nrn_ion_global_map_size) { - double** d_data = cnrn_target_copyin(nrn_ion_global_map, - nrn_ion_global_map_size); + double** d_data = cnrn_target_copyin(nrn_ion_global_map, nrn_ion_global_map_size); for (int j = 0; j < nrn_ion_global_map_size; j++) { if (nrn_ion_global_map[j]) { double* d_mechmap = cnrn_target_copyin(nrn_ion_global_map[j], @@ -1185,11 +1205,8 @@ void nrn_ion_global_map_delete_from_device() { } void init_gpu() { - // choose nvidia GPU by default - acc_device_t device_type = acc_device_nvidia; - // check how many gpu devices available per node - int num_devices_per_node = acc_get_num_devices(device_type); + int num_devices_per_node = cnrn_target_get_num_devices(); // if no gpu found, can't run on GPU if (num_devices_per_node == 0) { @@ -1217,11 +1234,7 @@ void init_gpu() { } #endif - int device_num = local_rank % num_devices_per_node; - acc_set_device_num(device_num, device_type); -#ifdef CORENEURON_PREFER_OPENMP_OFFLOAD - omp_set_default_device(device_num); -#endif + cnrn_target_set_default_device(local_rank % num_devices_per_node); if (nrnmpi_myid == 0 && !corenrn_param.is_quiet()) { std::cout << " Info : " << num_devices_per_node << " GPUs shared by " << local_size diff --git a/coreneuron/gpu/nrn_acc_manager.hpp b/coreneuron/gpu/nrn_acc_manager.hpp index 1334369e7..72d222cdd 100644 --- a/coreneuron/gpu/nrn_acc_manager.hpp +++ b/coreneuron/gpu/nrn_acc_manager.hpp @@ -9,10 +9,6 @@ #ifndef _nrn_device_manager_ #define _nrn_device_manager_ -#if defined(_OPENACC) -#include -#endif - #include "coreneuron/sim/multicore.hpp" namespace coreneuron { diff --git a/coreneuron/kinderiv.py b/coreneuron/kinderiv.py index 9b143c0cf..67cd93ebb 100644 --- a/coreneuron/kinderiv.py +++ b/coreneuron/kinderiv.py @@ -63,17 +63,17 @@ def write_out_kinderiv(fout): fout.write('nrn_pragma_omp(declare target)\n') for item in deriv: - fout.write('#pragma acc routine seq\n') + fout.write('nrn_pragma_acc(routine seq)\n') fout.write('extern int %s%s(_threadargsproto_);\n' % (item[0], item[1])) - fout.write('#pragma acc routine seq\n') + fout.write('nrn_pragma_acc(routine seq)\n') fout.write('extern int _newton_%s%s(_threadargsproto_);\n' % (item[0], item[1])) for item in kin: - fout.write('#pragma acc routine seq\n') + fout.write('nrn_pragma_acc(routine seq)\n') fout.write('extern int %s%s(void*, double*, _threadargsproto_);\n' % (item[0], item[1])) for item in euler: - fout.write('#pragma acc routine seq\n') + fout.write('nrn_pragma_acc(routine seq)\n') fout.write('extern int %s%s(_threadargsproto_);\n' % (item[0], item[1])) if deriv or kin or euler: diff --git a/coreneuron/mechanism/mech/mod2c_core_thread.hpp b/coreneuron/mechanism/mech/mod2c_core_thread.hpp index e4dee09ac..4c572dd18 100644 --- a/coreneuron/mechanism/mech/mod2c_core_thread.hpp +++ b/coreneuron/mechanism/mech/mod2c_core_thread.hpp @@ -11,6 +11,7 @@ #include "coreneuron/sim/multicore.hpp" #include "coreneuron/mechanism/mechanism.hpp" +#include "coreneuron/utils/offload.hpp" namespace coreneuron { @@ -35,15 +36,17 @@ using DIFUN = int; using NEWTFUN = int; using SPFUN = int; using EULFUN = int; -#pragma acc routine seq +nrn_pragma_omp(declare target) +nrn_pragma_acc(routine seq) extern int nrn_derivimplicit_steer(int, _threadargsproto_); #define difun(arg) nrn_derivimplicit_steer(arg, _threadargs_); -#pragma acc routine 
seq +nrn_pragma_acc(routine seq) extern int nrn_newton_steer(int, _threadargsproto_); #define newtfun(arg) nrn_newton_steer(arg, _threadargs_); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern int nrn_euler_steer(int, _threadargsproto_); #define eulerfun(arg) nrn_euler_steer(arg, _threadargs_); +nrn_pragma_omp(end declare target) struct Elm { unsigned row; /* Row location */ @@ -89,15 +92,19 @@ struct SparseObj { /* all the state information */ int do_flag; }; -#pragma acc routine seq +nrn_pragma_acc(routine seq) +nrn_pragma_omp(declare target) extern double* _nrn_thread_getelm(SparseObj* so, int row, int col, int _iml); +nrn_pragma_omp(end declare target) extern void* nrn_cons_sparseobj(SPFUN, int, Memb_list*, _threadargsproto_); extern void _nrn_destroy_sparseobj_thread(SparseObj* so); -#pragma acc routine seq +nrn_pragma_acc(routine seq) +nrn_pragma_omp(declare target) extern int nrn_kinetic_steer(int, SparseObj*, double*, _threadargsproto_); +nrn_pragma_omp(end declare target) #define spfun(arg1, arg2, arg3) nrn_kinetic_steer(arg1, arg2, arg3, _threadargs_); // derived from nrn/src/scopmath/euler.c @@ -116,14 +123,15 @@ static inline int euler_thread(int neqn, int* var, int* der, DIFUN fun, _threada return 0; } -#pragma acc routine seq +nrn_pragma_omp(declare target) +nrn_pragma_acc(routine seq) extern int derivimplicit_thread(int, int*, int*, DIFUN, _threadargsproto_); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern int _ss_derivimplicit_thread(int n, int* slist, int* dlist, DIFUN fun, _threadargsproto_); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern int sparse_thread(SparseObj*, int, int*, int*, double*, double, SPFUN, int, _threadargsproto_); -#pragma acc routine seq +nrn_pragma_acc(routine seq) int _ss_sparse_thread(SparseObj*, int n, int* s, @@ -134,10 +142,11 @@ int _ss_sparse_thread(SparseObj*, int linflag, _threadargsproto_); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern double _modl_get_dt_thread(NrnThread*); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern void _modl_set_dt_thread(double, NrnThread*); +nrn_pragma_omp(end declare target) void nrn_sparseobj_copyto_device(SparseObj* so); void nrn_sparseobj_delete_from_device(SparseObj* so); diff --git a/coreneuron/mechanism/mechanism.hpp b/coreneuron/mechanism/mechanism.hpp index 3e7046e4e..62be093a3 100644 --- a/coreneuron/mechanism/mechanism.hpp +++ b/coreneuron/mechanism/mechanism.hpp @@ -16,6 +16,7 @@ namespace coreneuron { // OpenACC with PGI compiler has issue when union is used and hence use struct // \todo check if newer PGI versions has resolved this issue +// OL211214: bump #if defined(_OPENACC) struct ThreadDatum { int i; @@ -88,7 +89,7 @@ struct NetSendBuffer_t: MemoryManaged { } void grow() { -#if defined(_OPENACC) +#ifdef CORENEURON_ENABLE_GPU int cannot_reallocate_on_device = 0; assert(cannot_reallocate_on_device); #else diff --git a/coreneuron/mechanism/membfunc.hpp b/coreneuron/mechanism/membfunc.hpp index 7598edf50..ba7bf9281 100644 --- a/coreneuron/mechanism/membfunc.hpp +++ b/coreneuron/mechanism/membfunc.hpp @@ -11,6 +11,7 @@ #include #include "coreneuron/mechanism/mechanism.hpp" +#include "coreneuron/utils/offload.hpp" namespace coreneuron { using Pfrpdat = Datum* (*) (void); @@ -109,12 +110,14 @@ extern void hoc_register_watch_check(nrn_watch_check_t, int); extern void nrn_jacob_capacitance(NrnThread*, Memb_list*, int); extern void nrn_writes_conc(int, int); -#pragma acc routine seq +nrn_pragma_omp(declare target) +nrn_pragma_acc(routine 
seq) extern void nrn_wrote_conc(int, double*, int, int, double**, double, int); -#pragma acc routine seq +nrn_pragma_acc(routine seq) double nrn_nernst(double ci, double co, double z, double celsius); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern double nrn_ghk(double v, double ci, double co, double z); +nrn_pragma_omp(end declare target) extern void hoc_register_prop_size(int, int, int); extern void hoc_register_dparam_semantics(int type, int, const char* name); @@ -175,8 +178,10 @@ extern void artcell_net_move(void**, Point_process*, double); extern void nrn2ncs_outputevent(int netcon_output_index, double firetime); extern bool nrn_use_localgid_; extern void net_sem_from_gpu(int sendtype, int i_vdata, int, int ith, int ipnt, double, double); -#pragma acc routine seq +nrn_pragma_acc(routine seq) +nrn_pragma_omp(declare target) extern int at_time(NrnThread*, double); +nrn_pragma_omp(end declare target) // _OPENACC and/or NET_RECEIVE_BUFFERING extern void net_sem_from_gpu(int, int, int, int, int, double, double); diff --git a/coreneuron/network/cvodestb.cpp b/coreneuron/network/cvodestb.cpp index 97c70950e..31c18807e 100644 --- a/coreneuron/network/cvodestb.cpp +++ b/coreneuron/network/cvodestb.cpp @@ -55,15 +55,15 @@ void init_net_events() { net_cvode_instance->init_events(); } -#if defined(_OPENACC) +#ifdef CORENEURON_ENABLE_GPU /* weight vectors could be updated (from INITIAL block of NET_RECEIVE, update those on GPU's */ for (int ith = 0; ith < nrn_nthread; ++ith) { NrnThread* nt = nrn_threads + ith; double* weights = nt->weights; int n_weight = nt->n_weight; if (n_weight && nt->compute_gpu) { - nrn_pragma_acc(update device(weights[0:n_weight])) - nrn_pragma_omp(target update to(weights[0:n_weight])) + nrn_pragma_acc(update device(weights [0:n_weight])) + nrn_pragma_omp(target update to(weights [0:n_weight])) } } #endif diff --git a/coreneuron/network/netcvode.cpp b/coreneuron/network/netcvode.cpp index dd521afde..4fb1d165f 100644 --- a/coreneuron/network/netcvode.cpp +++ b/coreneuron/network/netcvode.cpp @@ -26,9 +26,6 @@ #include "coreneuron/coreneuron.hpp" #include "coreneuron/utils/nrnoc_aux.hpp" -#ifdef _OPENACC -#include -#endif namespace coreneuron { #define PP2NT(pp) (nrn_threads + (pp)->_tid) #define PP2t(pp) (PP2NT(pp)->_t) diff --git a/coreneuron/network/partrans.cpp b/coreneuron/network/partrans.cpp index 1bd822f54..abc3a5a03 100644 --- a/coreneuron/network/partrans.cpp +++ b/coreneuron/network/partrans.cpp @@ -114,7 +114,8 @@ void nrnthread_v_transfer(NrnThread* _nt) { int* insrc_indices = ttd.insrc_indices.data(); double* tar_data = _nt->_data; // last element in the displacement vector gives total length -#if defined(_OPENACC) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENACC) int n_insrc_buf = insrcdspl_[nrnmpi_numprocs]; int ndata = _nt->_ndata; #endif diff --git a/coreneuron/permute/cellorder.cpp b/coreneuron/permute/cellorder.cpp index fd784fe38..6b4014a64 100644 --- a/coreneuron/permute/cellorder.cpp +++ b/coreneuron/permute/cellorder.cpp @@ -446,7 +446,7 @@ static void triang_interleaved(NrnThread* nt, if (istride < icellsize) { // only first icellsize strides matter // what is the index int ip = GPU_PARENT(i); -#ifndef _OPENACC +#ifndef CORENEURON_ENABLE_GPU nrn_assert(ip >= 0); // if (ip < 0) return; #endif double p = GPU_A(i) / GPU_D(i); @@ -468,7 +468,7 @@ static void bksub_interleaved(NrnThread* nt, GPU_RHS(icell) /= GPU_D(icell); // the root for 
(int istride = 0; istride < icellsize; ++istride) { int ip = GPU_PARENT(i); -#ifndef _OPENACC +#ifndef CORENEURON_ENABLE_GPU nrn_assert(ip >= 0); #endif GPU_RHS(i) -= GPU_B(i) * GPU_RHS(ip); @@ -482,7 +482,7 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid int icycle = ncycle - 1; int istride = stride[icycle]; int i = lastnode - istride + icore; -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU int ii = i; #endif @@ -492,7 +492,7 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid // clang-format off nrn_pragma_acc(loop seq) for (; has_subtrees_to_compute; ) { // ncycle loop -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU // serial test, gpu does this in parallel for (int icore = 0; icore < warpsize; ++icore) { int i = ii + icore; @@ -508,7 +508,7 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid nrn_pragma_omp(atomic update) GPU_RHS(ip) -= p * GPU_RHS(i); } -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU } #endif // if finished with all tree depths then ready to break @@ -520,7 +520,7 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid --icycle; istride = stride[icycle]; i -= istride; -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU ii -= istride; #endif } @@ -535,7 +535,7 @@ static void bksub_interleaved2(NrnThread* nt, int ncycle, int* stride, int firstnode) { -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU for (int i = root; i < lastroot; i += 1) { #else nrn_pragma_acc(loop seq) @@ -545,12 +545,12 @@ static void bksub_interleaved2(NrnThread* nt, } int i = firstnode + icore; -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU int ii = i; #endif for (int icycle = 0; icycle < ncycle; ++icycle) { int istride = stride[icycle]; -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU // serial test, gpu does this in parallel for (int icore = 0; icore < warpsize; ++icore) { int i = ii + icore; @@ -561,7 +561,7 @@ static void bksub_interleaved2(NrnThread* nt, GPU_RHS(i) /= GPU_D(i); } i += istride; -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU } ii += istride; #endif @@ -596,7 +596,8 @@ void solve_interleaved2(int ith) { int* strides = ii.stride; // sum ncycles of these (bad since ncompart/warpsize) int* rootbegin = ii.firstnode; // nwarp+1 of these int* nodebegin = ii.lastnode; // nwarp+1 of these -#if defined(_OPENACC) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENACC) int nstride = stridedispl[nwarp]; #endif nrn_pragma_acc(parallel loop gang vector vector_length( @@ -616,12 +617,12 @@ void solve_interleaved2(int ith) { int lastroot = rootbegin[iwarp + 1]; int firstnode = nodebegin[iwarp]; int lastnode = nodebegin[iwarp + 1]; -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU if (ic == 0) { // serial test mode. 
triang and bksub do all cores in warp #endif triang_interleaved2(nt, ic, ncycle, stride, lastnode); bksub_interleaved2(nt, root + ic, lastroot, ic, ncycle, stride, firstnode); -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU } // serial test mode #endif } diff --git a/coreneuron/sim/scopmath/newton_struct.h b/coreneuron/sim/scopmath/newton_struct.h index 8cd52732c..d01bfb822 100644 --- a/coreneuron/sim/scopmath/newton_struct.h +++ b/coreneuron/sim/scopmath/newton_struct.h @@ -25,10 +25,11 @@ struct NewtonSpace { double* rowmax; }; -#pragma acc routine seq +nrn_pragma_omp(declare target) +nrn_pragma_acc(routine seq) extern int nrn_crout_thread(NewtonSpace* ns, int n, double** a, int* perm, _threadargsproto_); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern void nrn_scopmath_solve_thread(int n, double** a, double* value, @@ -37,7 +38,7 @@ extern void nrn_scopmath_solve_thread(int n, int* s, _threadargsproto_); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern int nrn_newton_thread(NewtonSpace* ns, int n, int* s, @@ -45,7 +46,7 @@ extern int nrn_newton_thread(NewtonSpace* ns, double* value, _threadargsproto_); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern void nrn_buildjacobian_thread(NewtonSpace* ns, int n, int* s, @@ -53,6 +54,7 @@ extern void nrn_buildjacobian_thread(NewtonSpace* ns, double* value, double** jacobian, _threadargsproto_); +nrn_pragma_omp(end declare target) extern NewtonSpace* nrn_cons_newtonspace(int n, int n_instance); extern void nrn_destroy_newtonspace(NewtonSpace* ns); diff --git a/coreneuron/sim/scopmath/sparse_thread.cpp b/coreneuron/sim/scopmath/sparse_thread.cpp index d936e269a..71643430a 100644 --- a/coreneuron/sim/scopmath/sparse_thread.cpp +++ b/coreneuron/sim/scopmath/sparse_thread.cpp @@ -105,7 +105,7 @@ static void check_assert(SparseObj* so); static void re_link(SparseObj* so, unsigned i); static SparseObj* create_sparseobj(); -#if defined(_OPENACC) +#ifdef CORENEURON_ENABLE_GPU #undef emalloc #undef ecalloc #define emalloc(arg) malloc(arg) diff --git a/coreneuron/sim/scopmath/ssimplic_thread.cpp b/coreneuron/sim/scopmath/ssimplic_thread.cpp index fe11411d0..511e45d2b 100644 --- a/coreneuron/sim/scopmath/ssimplic_thread.cpp +++ b/coreneuron/sim/scopmath/ssimplic_thread.cpp @@ -9,12 +9,15 @@ #include "coreneuron/mechanism/mech/cfile/scoplib.h" #include "coreneuron/mechanism/mech/mod2c_core_thread.hpp" #include "coreneuron/sim/scopmath/errcodes.h" +#include "coreneuron/utils/offload.hpp" namespace coreneuron { #define s_(arg) _p[s[arg] * _STRIDE] -#pragma acc routine seq +nrn_pragma_acc(routine seq) +nrn_pragma_omp(declare target) static int check_state(int, int*, _threadargsproto_); +nrn_pragma_omp(end declare target) int _ss_sparse_thread(SparseObj* v, int n, diff --git a/coreneuron/utils/ivocvect.cpp b/coreneuron/utils/ivocvect.cpp index 1315d409f..b51a96ab8 100644 --- a/coreneuron/utils/ivocvect.cpp +++ b/coreneuron/utils/ivocvect.cpp @@ -7,6 +7,7 @@ */ #include "coreneuron/utils/ivocvect.hpp" +#include "coreneuron/utils/offload.hpp" namespace coreneuron { IvocVect* vector_new(int n) { @@ -26,12 +27,12 @@ void* vector_new1(int n) { return (void*) (new IvocVect(n)); } -#pragma acc routine seq +nrn_pragma_acc(routine seq) int vector_capacity(void* v) { return ((IvocVect*) v)->size(); } -#pragma acc routine seq +nrn_pragma_acc(routine seq) double* vector_vec(void* v) { return ((IvocVect*) v)->data(); } diff --git a/coreneuron/utils/ivocvect.hpp b/coreneuron/utils/ivocvect.hpp index af4286e09..80440c74d 100644 --- 
a/coreneuron/utils/ivocvect.hpp +++ b/coreneuron/utils/ivocvect.hpp @@ -9,6 +9,8 @@ #ifndef ivoc_vector_h #define ivoc_vector_h +#include "coreneuron/utils/offload.hpp" + #include #include @@ -52,17 +54,17 @@ class fixed_vector { return data_[i]; } -#pragma acc routine seq + nrn_pragma_acc(routine seq) const T* data(void) const { return data_; } -#pragma acc routine seq + nrn_pragma_acc(routine seq) T* data(void) { return data_; } -#pragma acc routine seq + nrn_pragma_acc(routine seq) size_t size() const { return n_; } @@ -76,9 +78,9 @@ extern double* vector_vec(IvocVect* v); // retro-compatibility API extern void* vector_new1(int n); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern int vector_capacity(void* v); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern double* vector_vec(void* v); } // namespace coreneuron diff --git a/coreneuron/utils/offload.hpp b/coreneuron/utils/offload.hpp index ad4189ec1..078990107 100644 --- a/coreneuron/utils/offload.hpp +++ b/coreneuron/utils/offload.hpp @@ -27,55 +27,70 @@ namespace coreneuron { template T* cnrn_target_deviceptr(const T* h_ptr) { -#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC) +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENACC) return static_cast(acc_deviceptr(const_cast(h_ptr))); -#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) - T *d_ptr = nullptr; - T *_h_ptr = const_cast(h_ptr); +#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENMP) + T const* d_ptr{}; - nrn_pragma_omp(target data use_device_ptr(_h_ptr)) - { - d_ptr = _h_ptr; - } + nrn_pragma_omp(target data use_device_ptr(h_ptr)) + { d_ptr = h_ptr; } - return d_ptr; + return const_cast(d_ptr); #else - throw std::runtime_error("cnrn_target_deviceptr() not implemented without OpenACC/OpenMP and gpu build"); + throw std::runtime_error( + "cnrn_target_deviceptr() not implemented without OpenACC/OpenMP and gpu build"); #endif } template T* cnrn_target_copyin(const T* h_ptr, std::size_t len = 1) { -#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC) +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENACC) return static_cast(acc_copyin(const_cast(h_ptr), len * sizeof(T))); -#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) - #pragma omp target enter data map(to:h_ptr[:len]) +#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENMP) + nrn_pragma_omp(target enter data map(to : h_ptr[:len])) return cnrn_target_deviceptr(h_ptr); #else - throw std::runtime_error("cnrn_target_copyin() not implemented without OpenACC/OpenMP and gpu build"); + throw std::runtime_error( + "cnrn_target_copyin() not implemented without OpenACC/OpenMP and gpu build"); #endif } template void cnrn_target_delete(T* h_ptr, std::size_t len = 1) { -#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC) +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENACC) acc_delete(h_ptr, len * sizeof(T)); -#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) - #pragma omp target exit data map(delete: h_ptr[:len]) +#elif defined(CORENEURON_ENABLE_GPU) && 
defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENMP) + nrn_pragma_omp(target exit data map(delete : h_ptr[:len])) #else - throw std::runtime_error("cnrn_target_delete() not implemented without OpenACC/OpenMP and gpu build"); + throw std::runtime_error( + "cnrn_target_delete() not implemented without OpenACC/OpenMP and gpu build"); #endif } template void cnrn_target_memcpy_to_device(T* d_ptr, const T* h_ptr, std::size_t len = 1) { -#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC) +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENACC) acc_memcpy_to_device(d_ptr, const_cast(h_ptr), len * sizeof(T)); -#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) - omp_target_memcpy(d_ptr, const_cast(h_ptr), len* sizeof(T), 0, 0, omp_get_default_device(), omp_get_initial_device()); +#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENMP) + omp_target_memcpy(d_ptr, + const_cast(h_ptr), + len * sizeof(T), + 0, + 0, + omp_get_default_device(), + omp_get_initial_device()); #else - throw std::runtime_error("cnrn_target_memcpy_to_device() not implemented without OpenACC/OpenMP and gpu build"); + throw std::runtime_error( + "cnrn_target_memcpy_to_device() not implemented without OpenACC/OpenMP and gpu build"); #endif } -} +} // namespace coreneuron diff --git a/coreneuron/utils/profile/profiler_interface.h b/coreneuron/utils/profile/profiler_interface.h index f6a24eb2e..2c68a0ae1 100644 --- a/coreneuron/utils/profile/profiler_interface.h +++ b/coreneuron/utils/profile/profiler_interface.h @@ -15,7 +15,7 @@ #include #endif -#if defined(CORENEURON_CUDA_PROFILING) && (defined(__CUDACC__) || defined(_OPENACC)) +#ifdef CORENEURON_CUDA_PROFILING #include #endif @@ -163,7 +163,7 @@ struct Caliper { #endif -#if defined(CORENEURON_CUDA_PROFILING) && (defined(__CUDACC__) || defined(_OPENACC)) +#ifdef CORENEURON_CUDA_PROFILING struct CudaProfiling { inline static void phase_begin(const char* name){}; @@ -270,7 +270,7 @@ using InstrumentorImpl = detail::Instrumentor< #if defined CORENEURON_CALIPER detail::Caliper, #endif -#if defined(CORENEURON_CUDA_PROFILING) && (defined(__CUDACC__) || defined(_OPENACC)) +#ifdef CORENEURON_CUDA_PROFILING detail::CudaProfiling, #endif #if defined(CRAYPAT) diff --git a/coreneuron/utils/randoms/nrnran123.h b/coreneuron/utils/randoms/nrnran123.h index ab432f89c..c97592161 100644 --- a/coreneuron/utils/randoms/nrnran123.h +++ b/coreneuron/utils/randoms/nrnran123.h @@ -37,6 +37,8 @@ of the full distribution available from #define R123_USE_GNU_UINT128 1 #endif +#include "coreneuron/utils/offload.hpp" + #include #include @@ -46,17 +48,12 @@ of the full distribution available from #define CORENRN_HOST_DEVICE #endif -// Is there actually any harm leaving the pragma in when DISABLE_OPENACC is true? -#if defined(_OPENACC) && !defined(DISABLE_OPENACC) -#define CORENRN_HOST_DEVICE_ACC CORENRN_HOST_DEVICE _Pragma("acc routine seq") -#else -#define CORENRN_HOST_DEVICE_ACC CORENRN_HOST_DEVICE -#endif +#define CORENRN_HOST_DEVICE_ACC CORENRN_HOST_DEVICE nrn_pragma_acc(routine seq) // Some files are compiled with DISABLE_OPENACC, and some builds have no GPU // support at all. In these two cases, request that the random123 state is // allocated using new/delete instead of CUDA unified memory. 
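To make the intent of the cnrn_target_* wrappers above concrete, here is a hedged, self-contained sketch of the deep-copy idiom they support throughout nrn_acc_manager.cpp. The struct and function names are illustrative only, and a GPU-enabled build with coreneuron/utils/offload.hpp included is assumed.

#include <cstddef>

// Sketch: copy a struct with an interior pointer to the device, then tear it down.
struct Example {
    double* data;
    std::size_t n;
};

Example* copy_example_to_device(const Example* h) {
    Example* d = cnrn_target_copyin(h);                  // shallow copy of the struct
    double* d_data = cnrn_target_copyin(h->data, h->n);  // copy the payload array
    cnrn_target_memcpy_to_device(&d->data, &d_data);     // patch the device-side pointer
    return d;
}

void delete_example_from_device(Example* h) {
    // Teardown mirrors the copy, children first, passing the original host pointers.
    cnrn_target_delete(h->data, h->n);
    cnrn_target_delete(h);
}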
-#if (defined(__CUDACC__) || defined(_OPENACC)) && !defined(DISABLE_OPENACC) +#if defined(CORENEURON_ENABLE_GPU) && !defined(DISABLE_OPENACC) #define CORENRN_RAN123_USE_UNIFIED_MEMORY true #else #define CORENRN_RAN123_USE_UNIFIED_MEMORY false @@ -100,6 +97,7 @@ void nrnran123_deletestream(nrnran123_State* s, bool use_unified_memory = CORENRN_RAN123_USE_UNIFIED_MEMORY); /* minimal data stream */ +nrn_pragma_omp(declare target) CORENRN_HOST_DEVICE_ACC void nrnran123_getseq(nrnran123_State*, uint32_t* seq, char* which); CORENRN_HOST_DEVICE_ACC void nrnran123_getids(nrnran123_State*, uint32_t* id1, uint32_t* id2); CORENRN_HOST_DEVICE_ACC void nrnran123_getids3(nrnran123_State*, @@ -128,6 +126,7 @@ CORENRN_HOST_DEVICE_ACC nrnran123_array4x32 nrnran123_iran(uint32_t seq, uint32_t id1, uint32_t id2); CORENRN_HOST_DEVICE_ACC double nrnran123_uint2dbl(uint32_t); +nrn_pragma_omp(end declare target) } // namespace coreneuron #endif diff --git a/external/nmodl b/external/nmodl index a60c5e903..fc85090f3 160000 --- a/external/nmodl +++ b/external/nmodl @@ -1 +1 @@ -Subproject commit a60c5e903126ad95cfe2bceb904d0efe83ba9d8a +Subproject commit fc85090f3fbb5736f8647170d1151af85f891467 From 9a98f73117e43688e2f0963d4451b2043ae4241d Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Fri, 17 Dec 2021 15:16:34 +0100 Subject: [PATCH 20/31] NMODL -> hackathon_main. --- external/nmodl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/nmodl b/external/nmodl index fc85090f3..ddb0c518c 160000 --- a/external/nmodl +++ b/external/nmodl @@ -1 +1 @@ -Subproject commit fc85090f3fbb5736f8647170d1151af85f891467 +Subproject commit ddb0c518c1c227eb6df80dc8ddcc7598cde9e3ee From 5ce52d5569f9311ecea6c871136725d52d59ec93 Mon Sep 17 00:00:00 2001 From: Nicolas Cornu Date: Tue, 21 Dec 2021 14:05:15 +0100 Subject: [PATCH 21/31] Separate handling of ml inside nrn_acc_manager (#719) --- coreneuron/gpu/nrn_acc_manager.cpp | 377 +++++++++++++++-------------- external/nmodl | 2 +- 2 files changed, 194 insertions(+), 185 deletions(-) diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp index edf9b6d63..bafb17346 100644 --- a/coreneuron/gpu/nrn_acc_manager.cpp +++ b/coreneuron/gpu/nrn_acc_manager.cpp @@ -76,6 +76,189 @@ void cnrn_target_set_default_device(int device_num) { #endif } +static Memb_list* copy_ml_to_device(const Memb_list* ml, int type) { + // As we never run code for artificial cell inside GPU we don't copy it. 
+ int is_art = corenrn.get_is_artificial()[type]; + if (is_art) { + return nullptr; + } + + auto d_ml = cnrn_target_copyin(ml); + + int n = ml->nodecount; + int szp = corenrn.get_prop_param_size()[type]; + int szdp = corenrn.get_prop_dparam_size()[type]; + + double* dptr = cnrn_target_deviceptr(ml->data); + cnrn_target_memcpy_to_device(&(d_ml->data), &(dptr)); + + + int* d_nodeindices = cnrn_target_copyin(ml->nodeindices, n); + cnrn_target_memcpy_to_device(&(d_ml->nodeindices), &d_nodeindices); + + if (szdp) { + int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; + int* d_pdata = cnrn_target_copyin(ml->pdata, pcnt); + cnrn_target_memcpy_to_device(&(d_ml->pdata), &d_pdata); + } + + int ts = corenrn.get_memb_funcs()[type].thread_size_; + if (ts) { + ThreadDatum* td = cnrn_target_copyin(ml->_thread, ts); + cnrn_target_memcpy_to_device(&(d_ml->_thread), &td); + } + + // net_receive buffer associated with mechanism + NetReceiveBuffer_t* nrb = ml->_net_receive_buffer; + + // if net receive buffer exist for mechanism + if (nrb) { + NetReceiveBuffer_t* d_nrb = cnrn_target_copyin(nrb); + cnrn_target_memcpy_to_device(&(d_ml->_net_receive_buffer), &d_nrb); + + int* d_pnt_index = cnrn_target_copyin(nrb->_pnt_index, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index); + + int* d_weight_index = cnrn_target_copyin(nrb->_weight_index, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index); + + double* d_nrb_t = cnrn_target_copyin(nrb->_nrb_t, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t); + + double* d_nrb_flag = cnrn_target_copyin(nrb->_nrb_flag, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag); + + int* d_displ = cnrn_target_copyin(nrb->_displ, nrb->_size + 1); + cnrn_target_memcpy_to_device(&(d_nrb->_displ), &d_displ); + + int* d_nrb_index = cnrn_target_copyin(nrb->_nrb_index, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index); + } + + /* copy NetSendBuffer_t on to GPU */ + NetSendBuffer_t* nsb = ml->_net_send_buffer; + + if (nsb) { + NetSendBuffer_t* d_nsb; + int* d_iptr; + double* d_dptr; + + d_nsb = cnrn_target_copyin(nsb); + cnrn_target_memcpy_to_device(&(d_ml->_net_send_buffer), &d_nsb); + + d_iptr = cnrn_target_copyin(nsb->_sendtype, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_sendtype), &d_iptr); + + d_iptr = cnrn_target_copyin(nsb->_vdata_index, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_vdata_index), &d_iptr); + + d_iptr = cnrn_target_copyin(nsb->_pnt_index, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_pnt_index), &d_iptr); + + d_iptr = cnrn_target_copyin(nsb->_weight_index, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_weight_index), &d_iptr); + + d_dptr = cnrn_target_copyin(nsb->_nsb_t, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_nsb_t), &d_dptr); + + d_dptr = cnrn_target_copyin(nsb->_nsb_flag, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_nsb_flag), &d_dptr); + } + + return d_ml; +} + +static void update_ml_on_host(const Memb_list* ml, int type) { + int is_art = corenrn.get_is_artificial()[type]; + if (is_art) { + // Artificial mechanisms such as PatternStim and IntervalFire + // are not copied onto the GPU. They should not, therefore, be + // updated from the GPU. 
+ return; + } + + int n = ml->nodecount; + int szp = corenrn.get_prop_param_size()[type]; + int szdp = corenrn.get_prop_dparam_size()[type]; + + int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szp; + + nrn_pragma_acc(update self(ml->data[:pcnt], ml->nodeindices[:n])) + nrn_pragma_omp(target update from(ml->data[:pcnt], ml->nodeindices[:n])) + + int dpcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; + nrn_pragma_acc(update self(ml->pdata[:dpcnt]) if (szdp)) + nrn_pragma_omp(target update from(ml->pdata[:dpcnt]) if (szdp)) + + auto nrb = ml->_net_receive_buffer; + + // clang-format off + nrn_pragma_acc(update self(nrb->_cnt, + nrb->_size, + nrb->_pnt_offset, + nrb->_displ_cnt, + nrb->_pnt_index[:nrb->_size], + nrb->_weight_index[:nrb->_size], + nrb->_displ[:nrb->_size + 1], + nrb->_nrb_index[:nrb->_size]) + if (nrb != nullptr)) + nrn_pragma_omp(target update from(nrb->_cnt, + nrb->_size, + nrb->_pnt_offset, + nrb->_displ_cnt, + nrb->_pnt_index[:nrb->_size], + nrb->_weight_index[:nrb->_size], + nrb->_displ[:nrb->_size + 1], + nrb->_nrb_index[:nrb->_size]) + if (nrb != nullptr)) + // clang-format on +} + +static void delete_ml_from_device(Memb_list* ml, int type) { + int is_art = corenrn.get_is_artificial()[type]; + if (is_art) { + return; + } + // Cleanup the net send buffer if it exists + { + NetSendBuffer_t* nsb{ml->_net_send_buffer}; + if (nsb) { + cnrn_target_delete(nsb->_nsb_flag, nsb->_size); + cnrn_target_delete(nsb->_nsb_t, nsb->_size); + cnrn_target_delete(nsb->_weight_index, nsb->_size); + cnrn_target_delete(nsb->_pnt_index, nsb->_size); + cnrn_target_delete(nsb->_vdata_index, nsb->_size); + cnrn_target_delete(nsb->_sendtype, nsb->_size); + cnrn_target_delete(nsb); + } + } + // Cleanup the net receive buffer if it exists. + { + NetReceiveBuffer_t* nrb{ml->_net_receive_buffer}; + if (nrb) { + cnrn_target_delete(nrb->_nrb_index, nrb->_size); + cnrn_target_delete(nrb->_displ, nrb->_size + 1); + cnrn_target_delete(nrb->_nrb_flag, nrb->_size); + cnrn_target_delete(nrb->_nrb_t, nrb->_size); + cnrn_target_delete(nrb->_weight_index, nrb->_size); + cnrn_target_delete(nrb->_pnt_index, nrb->_size); + cnrn_target_delete(nrb); + } + } + int n = ml->nodecount; + int szdp = corenrn.get_prop_dparam_size()[type]; + int ts = corenrn.get_memb_funcs()[type].thread_size_; + if (ts) { + cnrn_target_delete(ml->_thread, ts); + } + if (szdp) { + int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; + cnrn_target_delete(ml->pdata, pcnt); + } + cnrn_target_delete(ml->nodeindices, n); + cnrn_target_delete(ml); +} + /* note: threads here are corresponding to global nrn_threads array */ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { #ifdef CORENEURON_ENABLE_GPU @@ -210,103 +393,10 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { d_last_tml = d_tml; /* now for every tml, there is a ml. copy that and setup pointer */ - auto d_ml = cnrn_target_copyin(tml->ml); + Memb_list* d_ml = copy_ml_to_device(tml->ml, tml->index); cnrn_target_memcpy_to_device(&(d_tml->ml), &d_ml); - /* setup nt._ml_list */ cnrn_target_memcpy_to_device(&(d_ml_list[tml->index]), &d_ml); - - int type = tml->index; - int n = tml->ml->nodecount; - int szp = corenrn.get_prop_param_size()[type]; - int szdp = corenrn.get_prop_dparam_size()[type]; - int is_art = corenrn.get_is_artificial()[type]; - - // If the mechanism is artificial data are not inside nt->_data but in a newly - // allocated block. As we never run code for artificial cell inside GPU - // we don't copy it. - dptr = is_art ? 
nullptr : cnrn_target_deviceptr(tml->ml->data); - cnrn_target_memcpy_to_device(&(d_ml->data), &(dptr)); - - - if (!is_art) { - int* d_nodeindices = cnrn_target_copyin(tml->ml->nodeindices, n); - cnrn_target_memcpy_to_device(&(d_ml->nodeindices), &d_nodeindices); - } - - if (szdp) { - int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; - int* d_pdata = cnrn_target_copyin(tml->ml->pdata, pcnt); - cnrn_target_memcpy_to_device(&(d_ml->pdata), &d_pdata); - } - - int ts = corenrn.get_memb_funcs()[type].thread_size_; - if (ts) { - ThreadDatum* td = cnrn_target_copyin(tml->ml->_thread, ts); - cnrn_target_memcpy_to_device(&(d_ml->_thread), &td); - } - - NetReceiveBuffer_t *nrb, *d_nrb; - int *d_weight_index, *d_pnt_index, *d_displ, *d_nrb_index; - double *d_nrb_t, *d_nrb_flag; - - // net_receive buffer associated with mechanism - nrb = tml->ml->_net_receive_buffer; - - // if net receive buffer exist for mechanism - if (nrb) { - d_nrb = cnrn_target_copyin(nrb); - cnrn_target_memcpy_to_device(&(d_ml->_net_receive_buffer), &d_nrb); - - d_pnt_index = cnrn_target_copyin(nrb->_pnt_index, nrb->_size); - cnrn_target_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index); - - d_weight_index = cnrn_target_copyin(nrb->_weight_index, nrb->_size); - cnrn_target_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index); - - d_nrb_t = cnrn_target_copyin(nrb->_nrb_t, nrb->_size); - cnrn_target_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t); - - d_nrb_flag = cnrn_target_copyin(nrb->_nrb_flag, nrb->_size); - cnrn_target_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag); - - d_displ = cnrn_target_copyin(nrb->_displ, nrb->_size + 1); - cnrn_target_memcpy_to_device(&(d_nrb->_displ), &d_displ); - - d_nrb_index = cnrn_target_copyin(nrb->_nrb_index, nrb->_size); - cnrn_target_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index); - } - - /* copy NetSendBuffer_t on to GPU */ - NetSendBuffer_t* nsb; - nsb = tml->ml->_net_send_buffer; - - if (nsb) { - NetSendBuffer_t* d_nsb; - int* d_iptr; - double* d_dptr; - - d_nsb = cnrn_target_copyin(nsb); - cnrn_target_memcpy_to_device(&(d_ml->_net_send_buffer), &d_nsb); - - d_iptr = cnrn_target_copyin(nsb->_sendtype, nsb->_size); - cnrn_target_memcpy_to_device(&(d_nsb->_sendtype), &d_iptr); - - d_iptr = cnrn_target_copyin(nsb->_vdata_index, nsb->_size); - cnrn_target_memcpy_to_device(&(d_nsb->_vdata_index), &d_iptr); - - d_iptr = cnrn_target_copyin(nsb->_pnt_index, nsb->_size); - cnrn_target_memcpy_to_device(&(d_nsb->_pnt_index), &d_iptr); - - d_iptr = cnrn_target_copyin(nsb->_weight_index, nsb->_size); - cnrn_target_memcpy_to_device(&(d_nsb->_weight_index), &d_iptr); - - d_dptr = cnrn_target_copyin(nsb->_nsb_t, nsb->_size); - cnrn_target_memcpy_to_device(&(d_nsb->_nsb_t), &d_dptr); - - d_dptr = cnrn_target_copyin(nsb->_nsb_flag, nsb->_size); - cnrn_target_memcpy_to_device(&(d_nsb->_nsb_flag), &d_dptr); - } } if (nt->shadow_rhs_cnt) { @@ -619,6 +709,10 @@ static void net_receive_buffer_order(NetReceiveBuffer_t* nrb) { void update_net_receive_buffer(NrnThread* nt) { Instrumentor::phase p_update_net_receive_buffer("update-net-receive-buf"); for (auto tml = nt->tml; tml; tml = tml->next) { + int is_art = corenrn.get_is_artificial()[tml->index]; + if (is_art) { + continue; + } // net_receive buffer to copy NetReceiveBuffer_t* nrb = tml->ml->_net_receive_buffer; @@ -731,55 +825,11 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { /* -- copy NrnThreadMembList list ml to host -- */ for (auto tml = nt->tml; tml; tml = tml->next) { - Memb_list* ml = tml->ml; - - 
nrn_pragma_acc(update self(tml->index, ml->nodecount)) - nrn_pragma_omp(target update from(tml->index, ml->nodecount)) - - int type = tml->index; - int n = ml->nodecount; - int szp = corenrn.get_prop_param_size()[type]; - int szdp = corenrn.get_prop_dparam_size()[type]; - int is_art = corenrn.get_is_artificial()[type]; - - // Artificial mechanisms such as PatternStim and IntervalFire - // are not copied onto the GPU. They should not, therefore, be - // updated from the GPU. - if (is_art) { - continue; + if (!corenrn.get_is_artificial()[tml->index]) { + nrn_pragma_acc(update self(tml->index, tml->ml->nodecount)) + nrn_pragma_omp(target update from(tml->index, tml->ml->nodecount)) } - - int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szp; - - nrn_pragma_acc(update self(ml->data[:pcnt], ml->nodeindices[:n])) - nrn_pragma_omp(target update from(ml->data[:pcnt], ml->nodeindices[:n])) - - int dpcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; - nrn_pragma_acc(update self(ml->pdata[:dpcnt]) if (szdp)) - nrn_pragma_omp(target update from(ml->pdata[:dpcnt]) if (szdp)) - - auto nrb = tml->ml->_net_receive_buffer; - - // clang-format off - nrn_pragma_acc(update self(nrb->_cnt, - nrb->_size, - nrb->_pnt_offset, - nrb->_displ_cnt, - nrb->_pnt_index[:nrb->_size], - nrb->_weight_index[:nrb->_size], - nrb->_displ[:nrb->_size + 1], - nrb->_nrb_index[:nrb->_size]) - if (nrb != nullptr)) - nrn_pragma_omp(target update from(nrb->_cnt, - nrb->_size, - nrb->_pnt_offset, - nrb->_displ_cnt, - nrb->_pnt_index[:nrb->_size], - nrb->_weight_index[:nrb->_size], - nrb->_displ[:nrb->_size + 1], - nrb->_nrb_index[:nrb->_size]) - if (nrb != nullptr)) - // clang-format on + update_ml_on_host(tml->ml, tml->index); } int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); @@ -957,48 +1007,7 @@ void delete_nrnthreads_on_device(NrnThread* threads, int nthreads) { } for (auto tml = nt->tml; tml; tml = tml->next) { - // Cleanup the net send buffer if it exists - { - NetSendBuffer_t* nsb{tml->ml->_net_send_buffer}; - if (nsb) { - cnrn_target_delete(nsb->_nsb_flag, nsb->_size); - cnrn_target_delete(nsb->_nsb_t, nsb->_size); - cnrn_target_delete(nsb->_weight_index, nsb->_size); - cnrn_target_delete(nsb->_pnt_index, nsb->_size); - cnrn_target_delete(nsb->_vdata_index, nsb->_size); - cnrn_target_delete(nsb->_sendtype, nsb->_size); - cnrn_target_delete(nsb); - } - } - // Cleanup the net receive buffer if it exists. 
- { - NetReceiveBuffer_t* nrb{tml->ml->_net_receive_buffer}; - if (nrb) { - cnrn_target_delete(nrb->_nrb_index, nrb->_size); - cnrn_target_delete(nrb->_displ, nrb->_size + 1); - cnrn_target_delete(nrb->_nrb_flag, nrb->_size); - cnrn_target_delete(nrb->_nrb_t, nrb->_size); - cnrn_target_delete(nrb->_weight_index, nrb->_size); - cnrn_target_delete(nrb->_pnt_index, nrb->_size); - cnrn_target_delete(nrb); - } - } - int type = tml->index; - int n = tml->ml->nodecount; - int szdp = corenrn.get_prop_dparam_size()[type]; - int is_art = corenrn.get_is_artificial()[type]; - int ts = corenrn.get_memb_funcs()[type].thread_size_; - if (ts) { - cnrn_target_delete(tml->ml->_thread, ts); - } - if (szdp) { - int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; - cnrn_target_delete(tml->ml->pdata, pcnt); - } - if (!is_art) { - cnrn_target_delete(tml->ml->nodeindices, n); - } - cnrn_target_delete(tml->ml); + delete_ml_from_device(tml->ml, tml->index); cnrn_target_delete(tml); } cnrn_target_delete(nt->_ml_list, corenrn.get_memb_funcs().size()); diff --git a/external/nmodl b/external/nmodl index ddb0c518c..8535e828a 160000 --- a/external/nmodl +++ b/external/nmodl @@ -1 +1 @@ -Subproject commit ddb0c518c1c227eb6df80dc8ddcc7598cde9e3ee +Subproject commit 8535e828a7f1a4e12ffabd59c90233efc2993608 From a6c70784fe9a9961d0bf8e179cc62d50628f49c9 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 21 Dec 2021 17:19:29 +0100 Subject: [PATCH 22/31] Fixing jenkins tests --- coreneuron/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt index f42568a27..55d2baa3c 100644 --- a/coreneuron/CMakeLists.txt +++ b/coreneuron/CMakeLists.txt @@ -320,13 +320,15 @@ if(EXISTS "${CORENRN_EXTERNAL_BENCHMARK_DATA}") WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/benchmark COMMENT "Running nrnivmodl-core for channel-benchmark mechanisms") list(APPEND all_output_binaries ${output_binaries}) + list(JOIN TEST_EXEC_PREFIX " " BENCHMARK_SRUN_COMAND) string( CONCAT benchmark_command + "OMP_NUM_THREADS=1 ${BENCHMARK_SRUN_COMAND} " "'${CMAKE_BINARY_DIR}/benchmark/${CMAKE_SYSTEM_PROCESSOR}/special-core'" " --datpath '${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark-all-440-cells-2-ranks'" " --tstop 1 --mpi") if(CORENRN_ENABLE_GPU) - string(APPEND benchmark_command " --gpu") + string(APPEND benchmark_command " --gpu --cell-permute=2") endif() string(APPEND benchmark_command " && diff out.dat " "'${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark-all-440-cells-2-ranks.gpu.spikes'") From 6b8b6c3afc394029714c0886b28dceec860a1ead Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Tue, 21 Dec 2021 19:20:03 +0100 Subject: [PATCH 23/31] Address review comments. 
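Note on the eion.cpp hunks in this patch: they only reformat the paired offload directives, wrapping them in clang-format off/on so the hand-aligned argument lists survive the formatter; the pragmas themselves are unchanged. For context, a minimal sketch of the nrn_pragma_acc / nrn_pragma_omp idiom, assuming simplified macro definitions (the real definitions live in CoreNEURON's offload header and differ in detail):

    // Sketch only: exactly one of the two macros expands to a real _Pragma per build,
    // so each annotated loop carries both an OpenACC and an OpenMP spelling in the source.
    #define nrn_pragma_stringify(x) #x
    #if defined(USE_OPENMP_OFFLOAD)  /* placeholder for the real backend-selection macro */
    #define nrn_pragma_acc(x)
    #define nrn_pragma_omp(x) _Pragma(nrn_pragma_stringify(omp x))
    #else
    #define nrn_pragma_acc(x) _Pragma(nrn_pragma_stringify(acc x))
    #define nrn_pragma_omp(x)
    #endif

    void scale(double* pd, int n, bool compute_gpu) {
        // The inactive annotation disappears at preprocessing time.
        nrn_pragma_acc(parallel loop present(pd[0:n]) if(compute_gpu))
        nrn_pragma_omp(target teams distribute parallel for simd if(compute_gpu))
        for (int i = 0; i < n; ++i) {
            pd[i] *= 2.0;
        }
    }

Commas inside nested parentheses (e.g. present(a, b)) do not split the single macro argument, which is why the one-argument form of the macros is sufficient.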
--- CMake/OpenAccHelper.cmake | 2 +- CMakeLists.txt | 3 --- coreneuron/CMakeLists.txt | 36 ++---------------------------- coreneuron/mechanism/eion.cpp | 24 +++++++++++--------- coreneuron/mechanism/mechanism.hpp | 1 - coreneuron/sim/fadvance_core.cpp | 1 - 6 files changed, 17 insertions(+), 50 deletions(-) diff --git a/CMake/OpenAccHelper.cmake b/CMake/OpenAccHelper.cmake index 063b32003..2d8158be9 100644 --- a/CMake/OpenAccHelper.cmake +++ b/CMake/OpenAccHelper.cmake @@ -94,7 +94,7 @@ if(CORENRN_ENABLE_GPU) GLOBAL PROPERTY CORENEURON_LIB_LINK_FLAGS - "${NVHPC_ACC_COMP_FLAGS} ${NVHPC_ACC_LINK_FLAGS} -rdynamic -lrt -Wl,--whole-archive -L${CMAKE_HOST_SYSTEM_PROCESSOR} -lcorenrnmech -L${CMAKE_INSTALL_PREFIX}/lib -lcoreneuron -Wl,--no-whole-archive" + "${NVHPC_ACC_LINK_FLAGS} -rdynamic -lrt -Wl,--whole-archive -L${CMAKE_HOST_SYSTEM_PROCESSOR} -lcorenrnmech -L${CMAKE_INSTALL_PREFIX}/lib -lcoreneuron -Wl,--no-whole-archive" ) else() set_property(GLOBAL PROPERTY CORENEURON_LIB_LINK_FLAGS diff --git a/CMakeLists.txt b/CMakeLists.txt index df528a965..d3e1950d0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -104,9 +104,6 @@ option(CORENRN_ENABLE_SHARED "Enable shared library build" ON) option(CORENRN_ENABLE_LEGACY_UNITS "Enable legacy FARADAY, R, etc" OFF) option(CORENRN_ENABLE_PRCELLSTATE "Enable NRN_PRCELLSTATE debug feature" OFF) -set(CORENRN_EXTERNAL_BENCHMARK_DATA - "/gpfs/bbp.cscs.ch/project/proj12/nersc-gpu-hackathon-dec-2021" - CACHE PATH "Path to input data files and mechanisms for benchmarks") set(CORENRN_NMODL_DIR "" CACHE PATH "Path to nmodl source-to-source compiler installation") diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt index 55d2baa3c..d370df1df 100644 --- a/coreneuron/CMakeLists.txt +++ b/coreneuron/CMakeLists.txt @@ -288,6 +288,7 @@ if(CORENRN_ENABLE_GPU) # nrnran123.cpp possibly-temporarily uses Boost.Pool in GPU builds if it's available. find_package(Boost QUIET) if(Boost_FOUND) + message(STATUS "Boost found, enabling use of memory pools for Random123...") target_include_directories(coreneuron SYSTEM PRIVATE ${Boost_INCLUDE_DIRS}) target_compile_definitions(coreneuron PRIVATE CORENEURON_USE_BOOST_POOL) endif() @@ -302,38 +303,6 @@ set_target_properties( # ============================================================================= # create special-core with halfgap.mod for tests # ============================================================================= -set(all_output_binaries) -if(EXISTS "${CORENRN_EXTERNAL_BENCHMARK_DATA}") - # Hack for the december 2021 hackathon, build an extra special-core with channel-benchmark - # mechanisms. 
- set(modfile_directory - "${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark/benchmark/channels/lib/modlib") - file(GLOB modfiles "${modfile_directory}/*.mod") - set(output_binaries "${CMAKE_BINARY_DIR}/benchmark/${CMAKE_SYSTEM_PROCESSOR}/special-core" - "${CMAKE_BINARY_DIR}/benchmark/${CMAKE_SYSTEM_PROCESSOR}/libcorenrnmech.a") - file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/benchmark) - add_custom_command( - OUTPUT ${output_binaries} - DEPENDS scopmath coreneuron ${NMODL_TARGET_TO_DEPEND} ${modfiles} ${CORENEURON_BUILTIN_MODFILES} - COMMAND ${CMAKE_BINARY_DIR}/bin/nrnivmodl-core -b STATIC -m ${CORENRN_MOD2CPP_BINARY} -p 6 - "${modfile_directory}" - WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/benchmark - COMMENT "Running nrnivmodl-core for channel-benchmark mechanisms") - list(APPEND all_output_binaries ${output_binaries}) - list(JOIN TEST_EXEC_PREFIX " " BENCHMARK_SRUN_COMAND) - string( - CONCAT benchmark_command - "OMP_NUM_THREADS=1 ${BENCHMARK_SRUN_COMAND} " - "'${CMAKE_BINARY_DIR}/benchmark/${CMAKE_SYSTEM_PROCESSOR}/special-core'" - " --datpath '${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark-all-440-cells-2-ranks'" - " --tstop 1 --mpi") - if(CORENRN_ENABLE_GPU) - string(APPEND benchmark_command " --gpu --cell-permute=2") - endif() - string(APPEND benchmark_command " && diff out.dat " - "'${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark-all-440-cells-2-ranks.gpu.spikes'") - add_test(NAME benchmark COMMAND sh -c "${benchmark_command}") -endif() set(modfile_directory "${CORENEURON_PROJECT_SOURCE_DIR}/tests/integration/ring_gap/mod") file(GLOB modfiles "${modfile_directory}/*.mod") set(output_binaries "${CMAKE_BINARY_DIR}/bin/${CMAKE_SYSTEM_PROCESSOR}/special-core" @@ -345,8 +314,7 @@ add_custom_command( "${modfile_directory}" WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/bin COMMENT "Running nrnivmodl-core with halfgap.mod") -list(APPEND all_output_binaries ${output_binaries}) -add_custom_target(nrniv-core ALL DEPENDS ${all_output_binaries}) +add_custom_target(nrniv-core ALL DEPENDS ${output_binaries}) include_directories(${CORENEURON_PROJECT_SOURCE_DIR}) diff --git a/coreneuron/mechanism/eion.cpp b/coreneuron/mechanism/eion.cpp index 6cb3cf83d..8b58e858d 100644 --- a/coreneuron/mechanism/eion.cpp +++ b/coreneuron/mechanism/eion.cpp @@ -263,11 +263,13 @@ void nrn_cur_ion(NrnThread* nt, Memb_list* ml, int type) { int _cntml_padded = ml->_nodecount_padded; pd = ml->data; ppd = ml->pdata; - nrn_pragma_acc(parallel loop present( - pd [0:_cntml_padded * 5], - nrn_ion_global_map - [0:nrn_ion_global_map_size] [0:ion_global_map_member_size]) if (nt->compute_gpu) - async(nt->stream_id)) + // clang-format off + nrn_pragma_acc(parallel loop present(pd[0:_cntml_padded * 5], + nrn_ion_global_map[0:nrn_ion_global_map_size] + [0:ion_global_map_member_size]) + if (nt->compute_gpu) + async(nt->stream_id)) + // clang-format on nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) for (int _iml = 0; _iml < _cntml_actual; ++_iml) { dcurdv = 0.; @@ -300,11 +302,13 @@ void nrn_init_ion(NrnThread* nt, Memb_list* ml, int type) { // no `nowait` clause has been added to the OpenMP implementation. TODO: // verify if this can be made asynchronous or if there is a strong reason it // needs to be like this. 
- nrn_pragma_acc(parallel loop present( - pd [0:_cntml_padded * 5], - ppd [0:1], - nrn_ion_global_map - [0:nrn_ion_global_map_size] [0:ion_global_map_member_size]) if (nt->compute_gpu)) + // clang-format off + nrn_pragma_acc(parallel loop present(pd[0:_cntml_padded * 5], + ppd[0:1], + nrn_ion_global_map[0:nrn_ion_global_map_size] + [0:ion_global_map_member_size]) + if (nt->compute_gpu)) + // clang-format on nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) for (int _iml = 0; _iml < _cntml_actual; ++_iml) { if (iontype & 04) { diff --git a/coreneuron/mechanism/mechanism.hpp b/coreneuron/mechanism/mechanism.hpp index 62be093a3..65d7b29ce 100644 --- a/coreneuron/mechanism/mechanism.hpp +++ b/coreneuron/mechanism/mechanism.hpp @@ -16,7 +16,6 @@ namespace coreneuron { // OpenACC with PGI compiler has issue when union is used and hence use struct // \todo check if newer PGI versions has resolved this issue -// OL211214: bump #if defined(_OPENACC) struct ThreadDatum { int i; diff --git a/coreneuron/sim/fadvance_core.cpp b/coreneuron/sim/fadvance_core.cpp index a46f83535..ab6fc4bfb 100644 --- a/coreneuron/sim/fadvance_core.cpp +++ b/coreneuron/sim/fadvance_core.cpp @@ -320,7 +320,6 @@ void nrncore2nrn_send_values(NrnThread* nth) { nrn_pragma_omp(target update from(gather_i [0:1]) if (nth->compute_gpu)) } nrn_pragma_acc(wait(nth->stream_id)) - nrn_pragma_omp(taskwait) for (int i = 0; i < tr->n_trajec; ++i) { *(tr->scatter[i]) = *(tr->gather[i]); } From 531c4fe7bf6ca81a91e7dcf3523d23ab9a4ca298 Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Tue, 21 Dec 2021 19:47:44 +0100 Subject: [PATCH 24/31] Add CUDA toolkit includes. Presumably this was working before because our nvhpc localrc files accidentally included CUDA include directories before https://github.com/BlueBrain/spack/pull/1392. --- CMake/OpenAccHelper.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CMake/OpenAccHelper.cmake b/CMake/OpenAccHelper.cmake index 2d8158be9..78d02777c 100644 --- a/CMake/OpenAccHelper.cmake +++ b/CMake/OpenAccHelper.cmake @@ -10,6 +10,10 @@ if(CORENRN_ENABLE_GPU) # Enable cudaProfiler{Start,Stop}() behind the Instrumentor::phase... APIs add_compile_definitions(CORENEURON_CUDA_PROFILING CORENEURON_ENABLE_GPU) + # Plain C++ code in CoreNEURON may need to use CUDA runtime APIs for, for + # example, starting and stopping profiling. This makes sure those headers can + # be found. + include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) # cuda unified memory support if(CORENRN_ENABLE_CUDA_UNIFIED_MEMORY) add_compile_definitions(CORENEURON_UNIFIED_MEMORY) From e3aeafc93f0c7a83817501511bc3a7fe168ba52c Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Tue, 21 Dec 2021 19:56:33 +0100 Subject: [PATCH 25/31] Fixup cmake-format. --- CMake/OpenAccHelper.cmake | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/CMake/OpenAccHelper.cmake b/CMake/OpenAccHelper.cmake index 78d02777c..99469f0cc 100644 --- a/CMake/OpenAccHelper.cmake +++ b/CMake/OpenAccHelper.cmake @@ -10,9 +10,8 @@ if(CORENRN_ENABLE_GPU) # Enable cudaProfiler{Start,Stop}() behind the Instrumentor::phase... APIs add_compile_definitions(CORENEURON_CUDA_PROFILING CORENEURON_ENABLE_GPU) - # Plain C++ code in CoreNEURON may need to use CUDA runtime APIs for, for - # example, starting and stopping profiling. This makes sure those headers can - # be found. + # Plain C++ code in CoreNEURON may need to use CUDA runtime APIs for, for example, starting and + # stopping profiling. 
This makes sure those headers can be found. include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) # cuda unified memory support if(CORENRN_ENABLE_CUDA_UNIFIED_MEMORY) From 9fddc7de7319127e08dc28722b692f610a0cac44 Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Wed, 22 Dec 2021 10:23:17 +0100 Subject: [PATCH 26/31] Compile with -cuda. (#721) * Compile NVHPC+Open{ACC,MP} with -cuda. * Pull in NMODL+Eigen fixes to make this work. --- CMake/OpenAccHelper.cmake | 2 +- external/nmodl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMake/OpenAccHelper.cmake b/CMake/OpenAccHelper.cmake index 99469f0cc..225b5ff45 100644 --- a/CMake/OpenAccHelper.cmake +++ b/CMake/OpenAccHelper.cmake @@ -58,7 +58,7 @@ if(CORENRN_ENABLE_GPU) # due to e.g. __CUDACC__ being defined. See https://github.com/BlueBrain/CoreNeuron/issues/607 for # more information about this. -gpu=cudaX.Y ensures that OpenACC code is compiled with the same # CUDA version as is used for the explicit CUDA code. - set(NVHPC_ACC_COMP_FLAGS "-Minfo=accel -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo") + set(NVHPC_ACC_COMP_FLAGS "-cuda -Minfo=accel -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo") set(NVHPC_ACC_LINK_FLAGS "-cuda") # Make sure that OpenACC code is generated for the same compute capabilities as the explicit CUDA # code. Otherwise there may be confusing linker errors. We cannot rely on nvcc and nvc++ using the diff --git a/external/nmodl b/external/nmodl index 8535e828a..5ebca71ff 160000 --- a/external/nmodl +++ b/external/nmodl @@ -1 +1 @@ -Subproject commit 8535e828a7f1a4e12ffabd59c90233efc2993608 +Subproject commit 5ebca71ffc43e8cfa9ebbee5b15628bf81a546ce From 1fbba172ff017d6fc5a68441b64f08d766cd0831 Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Wed, 22 Dec 2021 10:38:14 +0100 Subject: [PATCH 27/31] Cleanup CMake for GPU offload. --- CMake/MakefileBuildOptions.cmake | 1 - CMake/OpenAccHelper.cmake | 24 +++++++++--------------- extra/nrnivmodl_core_makefile.in | 4 ++-- 3 files changed, 11 insertions(+), 18 deletions(-) diff --git a/CMake/MakefileBuildOptions.cmake b/CMake/MakefileBuildOptions.cmake index 009dd3215..fc0b0b551 100644 --- a/CMake/MakefileBuildOptions.cmake +++ b/CMake/MakefileBuildOptions.cmake @@ -75,7 +75,6 @@ string(TOUPPER "${CMAKE_BUILD_TYPE}" _BUILD_TYPE) set(CORENRN_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${_BUILD_TYPE}} ${CXX14_STD_FLAGS} ${NVHPC_ACC_COMP_FLAGS} ${NVHPC_CXX_INLINE_FLAGS}" ) -set(CORENRN_LD_FLAGS "${NVHPC_ACC_LINK_FLAGS}") # ============================================================================= # nmodl/mod2c related options : TODO diff --git a/CMake/OpenAccHelper.cmake b/CMake/OpenAccHelper.cmake index 225b5ff45..5838742f8 100644 --- a/CMake/OpenAccHelper.cmake +++ b/CMake/OpenAccHelper.cmake @@ -50,16 +50,12 @@ if(CORENRN_ENABLE_GPU) endif() set(CORENRN_CUDA_VERSION_SHORT "${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}") endif() - # -acc enables OpenACC support, -cuda links CUDA libraries and (very importantly!) seems to be - # required to make the NVHPC compiler do the device code linking. Otherwise the explicit CUDA - # device code (.cu files in libcoreneuron) has to be linked in a separate, earlier, step, which - # apparently causes problems with interoperability with OpenACC. Passing -cuda to nvc++ when - # compiling (as opposed to linking) seems to enable CUDA C++ support, which has other consequences - # due to e.g. __CUDACC__ being defined. 
See https://github.com/BlueBrain/CoreNeuron/issues/607 for - # more information about this. -gpu=cudaX.Y ensures that OpenACC code is compiled with the same - # CUDA version as is used for the explicit CUDA code. - set(NVHPC_ACC_COMP_FLAGS "-cuda -Minfo=accel -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo") - set(NVHPC_ACC_LINK_FLAGS "-cuda") + # -cuda links CUDA libraries and also seems to be important to make the NVHPC do the device code + # linking. Without this, we had problems with linking between the explicit CUDA (.cu) device code + # and offloaded OpenACC/OpenMP code. Using -cuda when compiling seems to improve error messages in + # some cases, and to be recommended by NVIDIA. We pass -gpu=cudaX.Y to ensure that OpenACC/OpenMP + # code is compiled with the same CUDA version as the explicit CUDA code. + set(NVHPC_ACC_COMP_FLAGS "-cuda -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo") # Make sure that OpenACC code is generated for the same compute capabilities as the explicit CUDA # code. Otherwise there may be confusing linker errors. We cannot rely on nvcc and nvc++ using the # same default compute capabilities as each other, particularly on GPU-less build machines. @@ -70,18 +66,16 @@ if(CORENRN_ENABLE_GPU) # Enable OpenMP target offload to GPU and if both OpenACC and OpenMP directives are available # for a region then prefer OpenMP. add_compile_definitions(CORENEURON_PREFER_OPENMP_OFFLOAD) - string(APPEND NVHPC_ACC_COMP_FLAGS " -mp=gpu -Minfo=mp") - string(APPEND NVHPC_ACC_LINK_FLAGS " -mp=gpu") + string(APPEND NVHPC_ACC_COMP_FLAGS " -mp=gpu") elseif(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenACC") # Only enable OpenACC offload for GPU string(APPEND NVHPC_ACC_COMP_FLAGS " -acc") - string(APPEND NVHPC_ACC_LINK_FLAGS " -acc") else() message(FATAL_ERROR "${CORENRN_ACCELERATOR_OFFLOAD} not supported with NVHPC compilers") endif() # avoid PGI adding standard compliant "-A" flags set(CMAKE_CXX14_STANDARD_COMPILE_OPTION --c++14) - string(APPEND CMAKE_EXE_LINKER_FLAGS " ${NVHPC_ACC_LINK_FLAGS}") + string(APPEND CMAKE_EXE_LINKER_FLAGS " ${NVHPC_ACC_COMP_FLAGS}") # Use `-Mautoinline` option to compile .cpp files generated from .mod files only. This is # especially needed when we compile with -O0 or -O1 optimisation level where we get link errors. 
# Use of `-Mautoinline` ensure that the necessary functions like `net_receive_kernel` are inlined @@ -97,7 +91,7 @@ if(CORENRN_ENABLE_GPU) GLOBAL PROPERTY CORENEURON_LIB_LINK_FLAGS - "${NVHPC_ACC_LINK_FLAGS} -rdynamic -lrt -Wl,--whole-archive -L${CMAKE_HOST_SYSTEM_PROCESSOR} -lcorenrnmech -L${CMAKE_INSTALL_PREFIX}/lib -lcoreneuron -Wl,--no-whole-archive" + "${NVHPC_ACC_COMP_FLAGS} -rdynamic -lrt -Wl,--whole-archive -L${CMAKE_HOST_SYSTEM_PROCESSOR} -lcorenrnmech -L${CMAKE_INSTALL_PREFIX}/lib -lcoreneuron -Wl,--no-whole-archive" ) else() set_property(GLOBAL PROPERTY CORENEURON_LIB_LINK_FLAGS diff --git a/extra/nrnivmodl_core_makefile.in b/extra/nrnivmodl_core_makefile.in index f51571ae8..fc339fb04 100644 --- a/extra/nrnivmodl_core_makefile.in +++ b/extra/nrnivmodl_core_makefile.in @@ -73,8 +73,8 @@ endif CXXFLAGS = @CORENRN_CXX_FLAGS@ CXX_COMPILE_CMD = $(CXX) $(CXXFLAGS) @CMAKE_CXX_COMPILE_OPTIONS_PIC@ @CORENRN_COMMON_COMPILE_DEFS@ $(INCLUDES) -CXX_LINK_EXE_CMD = $(CXX) $(CXXFLAGS) @CORENRN_LD_FLAGS@ @CMAKE_EXE_LINKER_FLAGS@ -CXX_SHARED_LIB_CMD = $(CXX) $(CXXFLAGS) @CORENRN_LD_FLAGS@ @CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS@ @CMAKE_SHARED_LIBRARY_CXX_FLAGS@ @CMAKE_SHARED_LINKER_FLAGS@ +CXX_LINK_EXE_CMD = $(CXX) $(CXXFLAGS) @CMAKE_EXE_LINKER_FLAGS@ +CXX_SHARED_LIB_CMD = $(CXX) $(CXXFLAGS) CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS@ @CMAKE_SHARED_LIBRARY_CXX_FLAGS@ @CMAKE_SHARED_LINKER_FLAGS@ # ISPC compilation and link commands ISPC = @CMAKE_ISPC_COMPILER@ From 847d415d0a4e5b626a342ae405e05abd03f2e5c1 Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Wed, 22 Dec 2021 11:36:14 +0100 Subject: [PATCH 28/31] fixup --- coreneuron/gpu/nrn_acc_manager.cpp | 4 ++-- extra/nrnivmodl_core_makefile.in | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp index bafb17346..33b526676 100644 --- a/coreneuron/gpu/nrn_acc_manager.cpp +++ b/coreneuron/gpu/nrn_acc_manager.cpp @@ -68,8 +68,8 @@ void cnrn_target_set_default_device(int device_num) { // is not enough: there were errors on some nodes when not-the-0th GPU was // used. These seemed to be related to the NMODL instance structs, which are // allocated using cudaMallocManaged. 
- auto const cuda_code = cudaSetDevice(device_num); - assert(cuda_code == cudaSuccess); + //auto const cuda_code = cudaSetDevice(device_num); + //assert(cuda_code == cudaSuccess); #else throw std::runtime_error( "cnrn_target_set_default_device() not implemented without OpenACC/OpenMP and gpu build"); diff --git a/extra/nrnivmodl_core_makefile.in b/extra/nrnivmodl_core_makefile.in index fc339fb04..5bd424865 100644 --- a/extra/nrnivmodl_core_makefile.in +++ b/extra/nrnivmodl_core_makefile.in @@ -74,7 +74,7 @@ endif CXXFLAGS = @CORENRN_CXX_FLAGS@ CXX_COMPILE_CMD = $(CXX) $(CXXFLAGS) @CMAKE_CXX_COMPILE_OPTIONS_PIC@ @CORENRN_COMMON_COMPILE_DEFS@ $(INCLUDES) CXX_LINK_EXE_CMD = $(CXX) $(CXXFLAGS) @CMAKE_EXE_LINKER_FLAGS@ -CXX_SHARED_LIB_CMD = $(CXX) $(CXXFLAGS) CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS@ @CMAKE_SHARED_LIBRARY_CXX_FLAGS@ @CMAKE_SHARED_LINKER_FLAGS@ +CXX_SHARED_LIB_CMD = $(CXX) $(CXXFLAGS) @CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS@ @CMAKE_SHARED_LIBRARY_CXX_FLAGS@ @CMAKE_SHARED_LINKER_FLAGS@ # ISPC compilation and link commands ISPC = @CMAKE_ISPC_COMPILER@ From 53b0c5fda91e98c24000238285f7ef1330fc06a4 Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Wed, 22 Dec 2021 11:38:54 +0100 Subject: [PATCH 29/31] fixup the fixup :facepalm: --- coreneuron/gpu/nrn_acc_manager.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp index 33b526676..bafb17346 100644 --- a/coreneuron/gpu/nrn_acc_manager.cpp +++ b/coreneuron/gpu/nrn_acc_manager.cpp @@ -68,8 +68,8 @@ void cnrn_target_set_default_device(int device_num) { // is not enough: there were errors on some nodes when not-the-0th GPU was // used. These seemed to be related to the NMODL instance structs, which are // allocated using cudaMallocManaged. - //auto const cuda_code = cudaSetDevice(device_num); - //assert(cuda_code == cudaSuccess); + auto const cuda_code = cudaSetDevice(device_num); + assert(cuda_code == cudaSuccess); #else throw std::runtime_error( "cnrn_target_set_default_device() not implemented without OpenACC/OpenMP and gpu build"); From 2c7377c40d885d86ddac419c07c0be6dbd0b7ed9 Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Wed, 22 Dec 2021 18:00:56 +0100 Subject: [PATCH 30/31] NMODL -> master after #783. --- external/nmodl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/nmodl b/external/nmodl index 5ebca71ff..46f8baf2b 160000 --- a/external/nmodl +++ b/external/nmodl @@ -1 +1 @@ -Subproject commit 5ebca71ffc43e8cfa9ebbee5b15628bf81a546ce +Subproject commit 46f8baf2bbeaa0d21559d6306ec37b94c601f1ee From 5c5b8a37c8bc659a2cecaaf54b16775851939d44 Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Wed, 22 Dec 2021 18:02:05 +0100 Subject: [PATCH 31/31] Drop two OpenMP taskwait directives. 
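Presumably these taskwait directives were redundant: on the affected code paths the OpenMP target constructs carry no nowait clause and are therefore already synchronous, while the OpenACC backend still needs the explicit wait on its async queue. A minimal sketch of the resulting pattern, assuming the same nrn_pragma_* macros as sketched earlier (illustrative, not the exact CoreNEURON code):

    void flush_to_host(double* buf, int n, int stream_id, bool compute_gpu) {
        // OpenACC backend: asynchronous update on a named queue ...
        nrn_pragma_acc(update self(buf[0:n]) if(compute_gpu) async(stream_id))
        // ... OpenMP backend: no nowait clause, so the transfer is complete here
        // and no trailing taskwait is required.
        nrn_pragma_omp(target update from(buf[0:n]) if(compute_gpu))
        // Only the OpenACC queue needs an explicit synchronisation point.
        nrn_pragma_acc(wait(stream_id))
    }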
--- coreneuron/gpu/nrn_acc_manager.cpp | 1 - coreneuron/network/partrans.cpp | 1 - 2 files changed, 2 deletions(-) diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp index bafb17346..d5e723527 100644 --- a/coreneuron/gpu/nrn_acc_manager.cpp +++ b/coreneuron/gpu/nrn_acc_manager.cpp @@ -750,7 +750,6 @@ void update_net_receive_buffer(NrnThread* nt) { } } nrn_pragma_acc(wait(nt->stream_id)) - nrn_pragma_omp(taskwait) } void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb) { diff --git a/coreneuron/network/partrans.cpp b/coreneuron/network/partrans.cpp index abc3a5a03..4c517e999 100644 --- a/coreneuron/network/partrans.cpp +++ b/coreneuron/network/partrans.cpp @@ -72,7 +72,6 @@ void nrnmpi_v_transfer() { if (nrn_threads[tid].compute_gpu) { compute_gpu = true; nrn_pragma_acc(wait(nrn_threads[tid].stream_id)) - nrn_pragma_omp(taskwait) } TransferThreadData& ttd = transfer_thread_data_[tid]; size_t n_outsrc_indices = ttd.outsrc_indices.size();
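
A closing note on the device-mirroring idiom used throughout the nrn_acc_manager.cpp hunks in this series: structures such as Memb_list, NetReceiveBuffer_t and NetSendBuffer_t are copied to the GPU with cnrn_target_copyin, their pointer members are then patched with cnrn_target_memcpy_to_device so that the device copy points at device arrays rather than at host memory, and cnrn_target_delete releases the allocations in reverse order. A minimal sketch of that idiom written directly against the OpenACC runtime API (the assumption that the cnrn_target_* wrappers reduce to these calls, and the Buffer type itself, are illustrative; the real wrappers also support the OpenMP offload backend):

    #include <openacc.h>
    #include <cstddef>

    struct Buffer {
        std::size_t size;
        double* data;  // host pointer; the device copy must be re-pointed
    };

    Buffer* mirror_to_device(Buffer* h_buf) {
        // 1. Copy the struct; the device copy's data member still holds the host address.
        auto* d_buf = static_cast<Buffer*>(acc_copyin(h_buf, sizeof(Buffer)));
        // 2. Copy the array the struct points to.
        auto* d_data = static_cast<double*>(acc_copyin(h_buf->data, h_buf->size * sizeof(double)));
        // 3. Overwrite the device-side pointer member with the device address.
        acc_memcpy_to_device(&d_buf->data, &d_data, sizeof(double*));
        return d_buf;
    }

    void delete_from_device(Buffer* h_buf) {
        // Release in reverse order of creation: the array first, then the struct.
        acc_delete(h_buf->data, h_buf->size * sizeof(double));
        acc_delete(h_buf, sizeof(Buffer));
    }

Patching the pointer on the device (step 3) is what lets kernels traverse the mirrored structure without unified memory; it is also why the deletion code walks the same members in reverse before deleting the enclosing struct.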