diff --git a/.github/workflows/sycl-linux-build.yml b/.github/workflows/sycl-linux-build.yml index 33c2269fb360b..fb593ee00b3eb 100644 --- a/.github/workflows/sycl-linux-build.yml +++ b/.github/workflows/sycl-linux-build.yml @@ -202,8 +202,7 @@ jobs: --ci-defaults ${{ inputs.build_configure_extra_args }} \ -DCMAKE_C_COMPILER_LAUNCHER=ccache \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ - -DLLVM_INSTALL_UTILS=ON \ - -DNATIVECPU_USE_OCK=Off + -DLLVM_INSTALL_UTILS=ON - name: Compile id: build # Emulate default value for manual dispatch as we've run out of available arguments. diff --git a/sycl/doc/design/SYCLNativeCPU.md b/sycl/doc/design/SYCLNativeCPU.md index b7fbb47d1064c..2ace9543328b4 100644 --- a/sycl/doc/design/SYCLNativeCPU.md +++ b/sycl/doc/design/SYCLNativeCPU.md @@ -62,6 +62,20 @@ in order to use a local checkout of the oneAPI Construction Kit. The CMake varia The SYCL Native CPU device needs to be selected at runtime by setting the environment variable `ONEAPI_DEVICE_SELECTOR=native_cpu:cpu`. +### oneTBB integration + +SYCL Native CPU can use oneTBB as an optional backend for task scheduling. oneTBB with SYCL Native CPU is enabled by setting `NATIVECPU_WITH_ONETBB=On` at configure time: + +``` +python3 buildbot/configure.py \ + --native_cpu \ + --cmake-opt=-DNATIVECPU_WITH_ONETBB=On +``` + +This will pull oneTBB into SYCL Native CPU via CMake `FetchContent` and DPC++ can be built as usual. + +By default SYCL Native CPU implements its own scheduler whose only dependency is standard C++. + # Supported features and current limitations The SYCL Native CPU flow is still WIP, not optimized and several core SYCL features are currently unsupported. Currently `barriers` are supported only when the oneAPI Construction Kit integration is enabled, several math builtins are not supported and attempting to use those will most likely fail with an `undefined reference` error at link time. 
Examples of supported applications can be found in the [runtime tests](https://github.com/intel/llvm/blob/sycl/sycl/test/native_cpu). diff --git a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt index a5af02213f0c0..8cd0ed5649e8f 100644 --- a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt +++ b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt @@ -37,6 +37,7 @@ add_ur_adapter(${TARGET_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/queue.cpp ${CMAKE_CURRENT_SOURCE_DIR}/queue.hpp ${CMAKE_CURRENT_SOURCE_DIR}/sampler.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/threadpool.hpp ${CMAKE_CURRENT_SOURCE_DIR}/ur_interface_loader.cpp ${CMAKE_CURRENT_SOURCE_DIR}/usm_p2p.cpp ${CMAKE_CURRENT_SOURCE_DIR}/virtual_mem.cpp @@ -51,6 +52,34 @@ set_target_properties(${TARGET_NAME} PROPERTIES SOVERSION "${PROJECT_VERSION_MAJOR}" ) +# oneTBB is used as an optional NativeCPU backend and disabled by default. +option(NATIVECPU_WITH_ONETBB "Use oneTBB as backend for Native CPU" OFF) +if(NATIVECPU_WITH_ONETBB) + message(STATUS "Configuring Native CPU adapter with oneTBB backend.") + + include(FetchContent) + FetchContent_Declare( + tbb + GIT_REPOSITORY https://github.com/uxlfoundation/oneTBB.git +# commit 4e4fffed4fb86ae0960a3364700f549b539c777e (HEAD -> master, origin/master, origin/HEAD) +# Author: Ilya Isaev +# Date: Mon Aug 18 10:35:26 2025 +0200 +# Improve task_arena interoperability with task_groups (#1784) + GIT_TAG 4e4fffed4fb86ae0960a3364700f549b539c777e + CMAKE_ARGS "-DTBB_TEST:BOOL=OFF -DTBB_EXAMPLES:BOOL=OFF -DTBB_BENCH:BOOL=OFF" + OVERRIDE_FIND_PACKAGE + ) + set(TBB_TEST OFF CACHE INTERNAL "" FORCE) + set(TBB_EXAMPLES OFF CACHE INTERNAL "" FORCE) + set(TBB_BENCH OFF CACHE INTERNAL "" FORCE) + set(TBB_BUILD ON CACHE INTERNAL "" FORCE) + set(TBB_FIND_PACKAGE OFF CACHE INTERNAL "" FORCE) + set(TBB_FUZZ_TESTING OFF CACHE INTERNAL "" FORCE) + set(TBB_INSTALL ON CACHE INTERNAL "" FORCE) + set (CMAKE_INCLUDE_CURRENT_DIR 
OFF) + FetchContent_MakeAvailable(tbb) +endif() + find_package(Threads REQUIRED) target_link_libraries(${TARGET_NAME} PRIVATE @@ -63,3 +92,23 @@ target_link_libraries(${TARGET_NAME} PRIVATE target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../../" ) + +if(NATIVECPU_WITH_ONETBB) + target_link_libraries(${TARGET_NAME} PRIVATE + TBB::tbb + ) + if (NOT MSVC) + # oneTBB currently casts away some const qualifiers + # todo: check if compiler actually supports these options + target_compile_options(tbb PRIVATE -Wno-cast-qual -Wno-stringop-overflow -Wno-unknown-warning-option) + target_compile_options(tbbmalloc PRIVATE -Wno-cast-qual) + endif() + + # Undefine _DEBUG option in release builds to find + # release tbbbind + if (NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG") + target_compile_options(tbb PRIVATE -U_DEBUG) + endif() + + target_compile_definitions(${TARGET_NAME} PRIVATE NATIVECPU_WITH_ONETBB) +endif() diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp old mode 100644 new mode 100755 index 86da10bbffef7..472f933104c9d --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -70,8 +70,14 @@ class WaitInfo { } }; +template inline static WaitInfo getWaitInfo(uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList) { + const ur_event_handle_t *phEventWaitList, + const T &scheduler) { + if (numEventsInWaitList && !scheduler.CanWaitInThread()) { + urEventWait(numEventsInWaitList, phEventWaitList); + numEventsInWaitList = 0; + } return native_cpu::WaitInfo(numEventsInWaitList, phEventWaitList); } @@ -151,7 +157,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( auto &tp = hQueue->getDevice()->tp; const size_t numParallelThreads = tp.num_threads(); - std::vector> futures; + auto Tasks = native_cpu::getScheduler(tp); auto numWG0 = ndr.GlobalSize[0] / ndr.LocalSize[0]; auto 
numWG1 = ndr.GlobalSize[1] / ndr.LocalSize[1]; auto numWG2 = ndr.GlobalSize[2] / ndr.LocalSize[2]; @@ -162,7 +168,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( auto kernel = std::make_unique(*hKernel); kernel->updateMemPool(numParallelThreads); - auto InEvents = native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList); + auto InEvents = + native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList, Tasks); const size_t numWG = numWG0 * numWG1 * numWG2; const size_t numWGPerThread = numWG / numParallelThreads; @@ -177,42 +184,41 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( rangeEnd[0] = rangeEnd[3] % numWG0; rangeEnd[1] = (rangeEnd[3] / numWG0) % numWG1; rangeEnd[2] = rangeEnd[3] / (numWG0 * numWG1); - futures.emplace_back(tp.schedule_task( - [ndr, InEvents, &kernel = *kernel, rangeStart, rangeEnd = rangeEnd[3], - numWG0, numWG1, numParallelThreads](size_t threadId) { - auto state = getState(ndr); - InEvents.wait(); - for (size_t g0 = rangeStart[0], g1 = rangeStart[1], - g2 = rangeStart[2], g3 = rangeStart[3]; - g3 < rangeEnd; ++g3) { + Tasks.schedule([ndr, InEvents, &kernel = *kernel, rangeStart, + rangeEnd = rangeEnd[3], numWG0, numWG1, + numParallelThreads](size_t threadId) { + auto state = getState(ndr); + InEvents.wait(); + for (size_t g0 = rangeStart[0], g1 = rangeStart[1], g2 = rangeStart[2], + g3 = rangeStart[3]; + g3 < rangeEnd; ++g3) { #ifdef NATIVECPU_USE_OCK - state.update(g0, g1, g2); - kernel._subhandler( - kernel.getArgs(numParallelThreads, threadId).data(), &state); + state.update(g0, g1, g2); + kernel._subhandler(kernel.getArgs(numParallelThreads, threadId).data(), + &state); #else - for (size_t local2 = 0; local2 < ndr.LocalSize[2]; ++local2) { - for (size_t local1 = 0; local1 < ndr.LocalSize[1]; ++local1) { - for (size_t local0 = 0; local0 < ndr.LocalSize[0]; ++local0) { - state.update(g0, g1, g2, local0, local1, local2); - kernel._subhandler( - kernel.getArgs(numParallelThreads, threadId).data(), - 
&state); - } - } + for (size_t local2 = 0; local2 < ndr.LocalSize[2]; ++local2) { + for (size_t local1 = 0; local1 < ndr.LocalSize[1]; ++local1) { + for (size_t local0 = 0; local0 < ndr.LocalSize[0]; ++local0) { + state.update(g0, g1, g2, local0, local1, local2); + kernel._subhandler( + kernel.getArgs(numParallelThreads, threadId).data(), &state); } + } + } #endif - if (++g0 == numWG0) { - g0 = 0; - if (++g1 == numWG1) { - g1 = 0; - ++g2; - } - } + if (++g0 == numWG0) { + g0 = 0; + if (++g1 == numWG1) { + g1 = 0; + ++g2; } - })); + } + } + }); rangeStart = rangeEnd; } - event->set_futures(futures); + event->set_futures(Tasks.getTaskInfo()); if (phEvent) { *phEvent = event; @@ -248,14 +254,14 @@ withTimingEvent(ur_command_t command_type, ur_queue_handle_t hQueue, return result; } auto &tp = hQueue->getDevice()->tp; - std::vector> futures; + auto Tasks = native_cpu::getScheduler(tp); auto InEvents = - native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList); - futures.emplace_back(tp.schedule_task([f, InEvents](size_t) { + native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList, Tasks); + Tasks.schedule([f, InEvents](size_t) { InEvents.wait(); f(); - })); - event->set_futures(futures); + }); + event->set_futures(Tasks.getTaskInfo()); event->set_callback( [event, InEvents = InEvents.getUniquePtr()]() { event->tick_end(); }); return UR_RESULT_SUCCESS; @@ -466,7 +472,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( // TODO: error checking // TODO: handle async void *startingPtr = hBuffer->_mem + offset; - unsigned steps = size / patternSize; + size_t steps = size / patternSize; for (unsigned i = 0; i < steps; i++) { memcpy(static_cast(startingPtr) + i * patternSize, pPattern, patternSize); @@ -576,7 +582,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( break; } default: { - for (unsigned int step{0}; step < size; step += patternSize) { + for (size_t step{0}; step < size; step += patternSize) { auto *dest = reinterpret_cast( 
reinterpret_cast(ptr) + step); memcpy(dest, pPattern, patternSize); diff --git a/unified-runtime/source/adapters/native_cpu/event.cpp b/unified-runtime/source/adapters/native_cpu/event.cpp index 91b8fb302eb18..7af1e3cd65860 100644 --- a/unified-runtime/source/adapters/native_cpu/event.cpp +++ b/unified-runtime/source/adapters/native_cpu/event.cpp @@ -11,6 +11,7 @@ #include "ur_api.h" #include "common.hpp" +#include "device.hpp" #include "event.hpp" #include "queue.hpp" #include @@ -111,7 +112,7 @@ urEnqueueTimestampRecordingExp(ur_queue_handle_t /*hQueue*/, bool /*blocking*/, ur_event_handle_t_::ur_event_handle_t_(ur_queue_handle_t queue, ur_command_t command_type) : queue(queue), context(queue->getContext()), command_type(command_type), - done(false) { + done(false), futures(queue->getDevice()->tp) { this->queue->addEvent(this); } @@ -126,9 +127,7 @@ void ur_event_handle_t_::wait() { if (done) { return; } - for (auto &f : futures) { - f.wait(); - } + this->futures.wait_all(); queue->removeEvent(this); done = true; // The callback may need to acquire the lock, so we unlock it here diff --git a/unified-runtime/source/adapters/native_cpu/event.hpp b/unified-runtime/source/adapters/native_cpu/event.hpp index 479c671b38cd1..c71e7593686cd 100644 --- a/unified-runtime/source/adapters/native_cpu/event.hpp +++ b/unified-runtime/source/adapters/native_cpu/event.hpp @@ -9,6 +9,7 @@ //===----------------------------------------------------------------------===// #pragma once #include "common.hpp" +#include "threadpool.hpp" #include "ur_api.h" #include #include @@ -42,7 +43,7 @@ struct ur_event_handle_t_ : RefCounted { ur_command_t getCommandType() const { return command_type; } - void set_futures(std::vector> &fs) { + void set_futures(native_cpu::tasksinfo_t &&fs) { std::lock_guard lock(mutex); futures = std::move(fs); } @@ -61,7 +62,7 @@ struct ur_event_handle_t_ : RefCounted { ur_command_t command_type; bool done; std::mutex mutex; - std::vector> futures; + 
native_cpu::tasksinfo_t futures; std::packaged_task callback; uint64_t timestamp_start = 0; uint64_t timestamp_end = 0; diff --git a/unified-runtime/source/adapters/native_cpu/threadpool.hpp b/unified-runtime/source/adapters/native_cpu/threadpool.hpp index ea64acf1f227c..3010b60238092 100644 --- a/unified-runtime/source/adapters/native_cpu/threadpool.hpp +++ b/unified-runtime/source/adapters/native_cpu/threadpool.hpp @@ -207,7 +207,90 @@ template class threadpool_interface { return ret; } }; +using simple_threadpool_t = threadpool_interface; -using threadpool_t = threadpool_interface; +class TasksInfo_TP { + using FType = std::future; + std::vector futures; +public: + void schedule(FType &&f) { futures.emplace_back(std::move(f)); } + void wait_all() { + for (auto &f : futures) + f.wait(); + } + TasksInfo_TP(simple_threadpool_t &) {} +}; + +template struct Scheduler_base { + TP &ref; + TaskInfo ti; + Scheduler_base(TP &ref_) : ref(ref_), ti(ref_) {} + TaskInfo getTaskInfo() { return std::move(ti); } + static constexpr bool CanWaitInThread() { return true; } +}; + +template struct Scheduler : Scheduler_base { + using Scheduler_base::Scheduler_base; + + template void schedule(T &&task) { + this->ti.schedule(this->ref.schedule_task(std::forward(task))); + } +}; + +template inline Scheduler getScheduler(TPType &tp) { + return Scheduler(tp); +} + +} // namespace native_cpu + +#ifdef NATIVECPU_WITH_ONETBB +// Simple TBB backend +#include "oneapi/tbb.h" +namespace native_cpu { + +class TBB_threadpool { + oneapi::tbb::task_group tasks; + +public: + void wait_all() { tasks.wait(); } + oneapi::tbb::task_group &Tasks() { return tasks; } + size_t num_threads() const noexcept { + return oneapi::tbb::info::default_concurrency(); + } +}; + +class TBB_TasksInfo { + TBB_threadpool *tp; + +public: + void wait_all() { tp->wait_all(); } + TBB_TasksInfo(TBB_threadpool &t) : tp(&t) {} +}; + +template <> +struct Scheduler + : Scheduler_base { + using Scheduler_base::Scheduler_base; + 
template void schedule(T &&task_) { + ref.Tasks().run([task = std::move(task_)]() { + auto thread_id = tbb::this_task_arena::current_thread_index(); + assert(thread_id >= 0 && + thread_id < oneapi::tbb::info::default_concurrency()); + task(thread_id); + }); + } + static constexpr bool CanWaitInThread() { return false; } +}; + +using tasksinfo_t = TBB_TasksInfo; +using threadpool_t = TBB_threadpool; +} // namespace native_cpu + +#else +// The default backend +namespace native_cpu { +using tasksinfo_t = TasksInfo_TP; +using threadpool_t = simple_threadpool_t; } // namespace native_cpu +#endif