From cd2497c0806bb45caf2c2ee5aba2a46166f0bb44 Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Mon, 9 Jan 2023 13:00:37 +0000 Subject: [PATCH 01/16] [SYCL][Fusion] Embed LLVM IR for SYCL for Nvidia Signed-off-by: Lukas Sommer --- clang/include/clang/Driver/Action.h | 7 ++++++- clang/include/clang/Driver/Options.td | 2 ++ clang/lib/Driver/Action.cpp | 6 +++--- clang/lib/Driver/Driver.cpp | 10 ++++++++++ clang/lib/Driver/ToolChains/Clang.cpp | 10 +++++++++- 5 files changed, 30 insertions(+), 5 deletions(-) diff --git a/clang/include/clang/Driver/Action.h b/clang/include/clang/Driver/Action.h index 2c9f59cdde9b0..9dab2e32b2ffc 100644 --- a/clang/include/clang/Driver/Action.h +++ b/clang/include/clang/Driver/Action.h @@ -660,9 +660,14 @@ class OffloadUnbundlingJobAction final : public JobAction { class OffloadWrapperJobAction : public JobAction { void anchor() override; + bool EmbedIR; + public: OffloadWrapperJobAction(ActionList &Inputs, types::ID Type); - OffloadWrapperJobAction(Action *Input, types::ID OutputType); + OffloadWrapperJobAction(Action *Input, types::ID OutputType, + bool IsEmbeddedIR = false); + + bool isEmbeddedIR() const { return EmbedIR; } static bool classof(const Action *A) { return A->getKind() == OffloadWrapperJobClass; diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 73b5e0ae1c905..7eddf97fa527f 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2955,6 +2955,8 @@ def fintelfpga : Flag<["-"], "fintelfpga">, Group, HelpText<"Perform ahead-of-time compilation for FPGA">; def fsycl_device_only : Flag<["-"], "fsycl-device-only">, Flags<[CoreOption]>, HelpText<"Compile SYCL kernels for device">; +def fsycl_embed_ir : Flag<["-"], "fsycl-embed-ir">, Flags<[CoreOption]>, + HelpText<"Embed LLVM IR for runtime kernel fusion">; defm sycl_esimd_force_stateless_mem : BoolFOption<"sycl-esimd-force-stateless-mem", LangOpts<"SYCLESIMDForceStatelessMem">, DefaultFalse, PosFlag( + PostLinkAction, types::TY_Object, true); + DA.add(*WrapBitcodeAction, *TC, BoundArch, Action::OFK_SYCL); + } bool NoRDCFatStaticArchive = !IsRDC && FullDeviceLinkAction->getType() == types::TY_Tempfilelist; diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index f0851ea74c0d2..10c8e42d38ba3 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -9274,6 +9274,14 @@ void OffloadWrapper::ConstructJob(Compilation &C, const JobAction &JA, createArgString("-link-opts="); } + bool IsEmbeddedIR = cast(JA).isEmbeddedIR(); + if (IsEmbeddedIR) { + // When the offload-wrapper is called to embed LLVM IR, add a prefix to + // the target triple to distinguish the LLVM IR from the actual device + // binary for that target. + TargetTripleOpt = ("llvm_" + TargetTripleOpt).str(); + } + WrapperArgs.push_back( C.getArgs().MakeArgString(Twine("-target=") + TargetTripleOpt)); @@ -9295,7 +9303,7 @@ void OffloadWrapper::ConstructJob(Compilation &C, const JobAction &JA, assert(I.isFilename() && "Invalid input."); if (I.getType() == types::TY_Tempfiletable || - I.getType() == types::TY_Tempfilelist) + I.getType() == types::TY_Tempfilelist || IsEmbeddedIR) // wrapper actual input files are passed via the batch job file table: WrapperArgs.push_back(C.getArgs().MakeArgString("-batch")); WrapperArgs.push_back(C.getArgs().MakeArgString(I.getFilename())); From 57bd8f656b4ec3a16f12acaa2615ee81e7d3c007 Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Tue, 10 Jan 2023 13:29:22 +0000 Subject: [PATCH 02/16] Enable LLVM IR as alternative fusion input format; Signed-off-by: Lukas Sommer --- sycl-fusion/jit-compiler/CMakeLists.txt | 4 +- sycl-fusion/jit-compiler/lib/KernelFusion.cpp | 5 +- .../lib/translation/LoadKernels.cpp | 116 ++++++++++++++++++ .../lib/translation/LoadKernels.h | 36 ++++++ .../lib/translation/SPIRVLLVMTranslation.cpp | 94 +++----------- .../lib/translation/SPIRVLLVMTranslation.h | 13 +- sycl/source/detail/jit_compiler.cpp | 97 ++++++++++----- 7 files changed, 245 insertions(+), 120 deletions(-) create mode 100644 sycl-fusion/jit-compiler/lib/translation/LoadKernels.cpp create mode 100644 sycl-fusion/jit-compiler/lib/translation/LoadKernels.h diff --git a/sycl-fusion/jit-compiler/CMakeLists.txt b/sycl-fusion/jit-compiler/CMakeLists.txt index de6a73b1eab3d..92f0cefd68634 100644 --- a/sycl-fusion/jit-compiler/CMakeLists.txt +++ b/sycl-fusion/jit-compiler/CMakeLists.txt @@ -2,13 +2,15 @@ add_llvm_library(sycl-fusion lib/KernelFusion.cpp lib/JITContext.cpp + lib/translation/LoadKernels.cpp lib/translation/SPIRVLLVMTranslation.cpp lib/fusion/FusionPipeline.cpp lib/fusion/FusionHelper.cpp lib/fusion/ModuleHelper.cpp lib/helper/ConfigHelper.cpp - LINK_COMPONENTS + LINK_COMPONENTS + BitReader Core Support Analysis diff --git a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp index 56ea9401c465b..fc168587738b5 100644 --- a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp +++ b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp @@ -15,6 +15,7 @@ #include "fusion/FusionPipeline.h" #include "helper/ConfigHelper.h" #include "helper/ErrorHandling.h" +#include "translation/LoadKernels.h" #include "translation/SPIRVLLVMTranslation.h" #include #include @@ -97,8 +98,8 @@ FusionResult KernelFusion::fuseKernels( // Load all input kernels from their respective SPIR-V modules into a single // LLVM IR module. llvm::Expected> ModOrError = - translation::SPIRVLLVMTranslator::loadSPIRVKernels( - *JITCtx.getLLVMContext(), ModuleInfo.kernels()); + translation::KernelLoader::loadKernels(*JITCtx.getLLVMContext(), + ModuleInfo.kernels()); if (auto Error = ModOrError.takeError()) { return errorToFusionResult(std::move(Error), "SPIR-V translation failed"); } diff --git a/sycl-fusion/jit-compiler/lib/translation/LoadKernels.cpp b/sycl-fusion/jit-compiler/lib/translation/LoadKernels.cpp new file mode 100644 index 0000000000000..cabf1f6c44e9f --- /dev/null +++ b/sycl-fusion/jit-compiler/lib/translation/LoadKernels.cpp @@ -0,0 +1,116 @@ +//==-------------------------- LoadKernels.cpp ----------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "LoadKernels.h" +#include "SPIRVLLVMTranslation.h" +#include "llvm/Bitcode/BitcodeReader.h" +#include "llvm/Linker/Linker.h" +#include "llvm/Support/MemoryBuffer.h" + +using namespace jit_compiler; +using namespace jit_compiler::translation; +using namespace llvm; + +llvm::Expected> +KernelLoader::loadKernels(llvm::LLVMContext &LLVMCtx, + std::vector &Kernels) { + std::unique_ptr Result{nullptr}; + bool First = true; + DenseSet ParsedBinaries; + size_t AddressBits = 0; + for (auto &Kernel : Kernels) { + // FIXME: Currently, we use the front of the list. + // Do we need to iterate to find the most suitable + // SPIR-V module? + SYCLKernelBinaryInfo &BinInfo = Kernel.BinaryInfo; + + const unsigned char *ModulePtr = BinInfo.BinaryStart; + size_t ModuleSize = BinInfo.BinarySize; + BinaryBlob BinBlob{ModulePtr, ModuleSize}; + if (ParsedBinaries.contains(BinBlob)) { + // Multiple kernels can be stored in the same SPIR-V or LLVM IR module. + // If we encountered the same binary module before, skip. + // NOTE: We compare the pointer as well as the size, in case + // a previous kernel only referenced part of the SPIR-V/LLVM IR module. + // Not sure this can actually happen, but better safe than sorry. + continue; + } + // Simply load and translate the SPIR-V into the currently still empty + // module. + std::unique_ptr NewMod; + + switch (BinInfo.Format) { + case BinaryFormat::LLVM: { + auto ModOrError = loadLLVMKernel(LLVMCtx, Kernel); + if (auto Err = ModOrError.takeError()) { + return std::move(Err); + } + NewMod = std::move(*ModOrError); + break; + } + case BinaryFormat::SPIRV: { + auto ModOrError = loadSPIRVKernel(LLVMCtx, Kernel); + if (auto Err = ModOrError.takeError()) { + return std::move(Err); + } + NewMod = std::move(*ModOrError); + break; + } + default: { + return createStringError( + inconvertibleErrorCode(), + "Failed to load kernel from unsupported input format"); + } + } + + // We do not assume that the input binary information has the address bits + // set, but rather retrieve this information from the SPIR-V/LLVM module's + // data-layout. + BinInfo.AddressBits = NewMod->getDataLayout().getPointerSizeInBits(); + + if (First) { + // We can simply assign the module we just loaded from SPIR-V to the + // empty pointer on the first iteration. + Result = std::move(NewMod); + // The first module will dictate the address bits for the remaining. + AddressBits = BinInfo.AddressBits; + First = false; + } else { + // We have already loaded some module, so now we need to + // link the module we just loaded with the result so far. + // FIXME: We allow duplicates to be overridden by the module + // read last. This could cause problems if different modules contain + // definitions with the same name, but different body/content. + // Check that this is not problematic. + Linker::linkModules(*Result, std::move(NewMod), + Linker::Flags::OverrideFromSrc); + if (AddressBits != BinInfo.AddressBits) { + return createStringError( + inconvertibleErrorCode(), + "Number of address bits between SPIR-V modules does not match"); + } + } + } + return std::move(Result); +} + +llvm::Expected> +KernelLoader::loadLLVMKernel(llvm::LLVMContext &LLVMCtx, + SYCLKernelInfo &Kernel) { + auto &BinInfo = Kernel.BinaryInfo; + llvm::StringRef RawData(reinterpret_cast(BinInfo.BinaryStart), + BinInfo.BinarySize); + return llvm::parseBitcodeFile( + MemoryBuffer::getMemBuffer(RawData)->getMemBufferRef(), LLVMCtx); +} + +llvm::Expected> +KernelLoader::loadSPIRVKernel(llvm::LLVMContext &LLVMCtx, + SYCLKernelInfo &Kernel) { + return SPIRVLLVMTranslator::loadSPIRVKernel(LLVMCtx, Kernel); +} diff --git a/sycl-fusion/jit-compiler/lib/translation/LoadKernels.h b/sycl-fusion/jit-compiler/lib/translation/LoadKernels.h new file mode 100644 index 0000000000000..5720abf09bfd7 --- /dev/null +++ b/sycl-fusion/jit-compiler/lib/translation/LoadKernels.h @@ -0,0 +1,36 @@ +//==-- LoadKernels.h - Load LLVM IR for SYCL kernels in different formats -==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "Kernel.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include +#include + +namespace jit_compiler { +namespace translation { + +class KernelLoader { + +public: + static llvm::Expected> + loadKernels(llvm::LLVMContext &LLVMCtx, std::vector &Kernels); + +private: + /// + /// Pair of address and size to represent a binary blob. + using BinaryBlob = std::pair; + + static llvm::Expected> + loadLLVMKernel(llvm::LLVMContext &LLVMCtx, SYCLKernelInfo &Kernel); + + static llvm::Expected> + loadSPIRVKernel(llvm::LLVMContext &LLVMCtx, SYCLKernelInfo &Kernel); +}; +} // namespace translation +} // namespace jit_compiler diff --git a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp index 77217419e98bd..fc481f23b141d 100644 --- a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp +++ b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp @@ -16,7 +16,6 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" -#include "llvm/Linker/Linker.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -86,12 +85,19 @@ SPIRV::TranslatorOpts &SPIRVLLVMTranslator::translatorOpts() { return Opts; } -Expected> -SPIRVLLVMTranslator::readAndTranslateSPIRV(LLVMContext &LLVMCtx, - BinaryBlob Input) { - // Create an input stream for the binary blob. +Expected> +SPIRVLLVMTranslator::loadSPIRVKernel(llvm::LLVMContext &LLVMCtx, + SYCLKernelInfo &Kernel) { + std::unique_ptr Result{nullptr}; + + SYCLKernelBinaryInfo &BinInfo = Kernel.BinaryInfo; + assert(BinInfo.Format == BinaryFormat::SPIRV && + "Only SPIR-V supported as input"); + + // Create an input stream for the SPIR-V binary. std::stringstream SPIRStream( - std::string(reinterpret_cast(Input.first), Input.second), + std::string(reinterpret_cast(BinInfo.BinaryStart), + BinInfo.BinarySize), std::ios_base::in | std::ios_base::binary); std::string ErrMsg; // Create a raw pointer. readSpirv accepts a reference to a pointer, @@ -105,77 +111,13 @@ SPIRVLLVMTranslator::readAndTranslateSPIRV(LLVMContext &LLVMCtx, "Failed to load and translate SPIR-V module with error %s", ErrMsg.c_str()); } - return std::unique_ptr(LLVMMod); -} + std::unique_ptr NewMod{LLVMMod}; -Expected> -SPIRVLLVMTranslator::loadSPIRVKernels(llvm::LLVMContext &LLVMCtx, - std::vector &Kernels) { - std::unique_ptr Result{nullptr}; - bool First = true; - DenseSet ParsedSPIRVModules; - size_t AddressBits = 0; - for (auto &Kernel : Kernels) { - // FIXME: Currently, we use the front of the list. - // Do we need to iterate to find the most suitable - // SPIR-V module? - SYCLKernelBinaryInfo &BinInfo = Kernel.BinaryInfo; - // TODO(Lukas, ONNX-399): Also support LLVM IR as input but simply skipping - // the translation from SPIR-V to LLVM. - assert(BinInfo.Format == BinaryFormat::SPIRV && - "Only SPIR-V supported as input"); - const unsigned char *SPRModulePtr = BinInfo.BinaryStart; - size_t SPRModuleSize = BinInfo.BinarySize; - BinaryBlob BinBlob{SPRModulePtr, SPRModuleSize}; - if (ParsedSPIRVModules.contains(BinBlob)) { - // Multiple kernels can be stored in the same SPIR-V module. - // If we encountered the same SPIR-V module before, skip. - // NOTE: We compare the pointer as well as the size, in case - // a previous kernel only referenced part of the SPIR-V module. - // Not sure this can actually happen, but better safe than sorry. - continue; - } - // Simply load and translate the SPIR-V into the currently still empty - // module. - PROPAGATE_ERROR(NewMod, readAndTranslateSPIRV(LLVMCtx, BinBlob)); - - // We do not assume that the input binary information has the address bits - // set, but rather retrieve this information from the SPIR-V/LLVM module's - // data-layout. - BinInfo.AddressBits = NewMod->getDataLayout().getPointerSizeInBits(); - assert((First || BinInfo.AddressBits == AddressBits) && - "Address bits do not match"); - // Restore SYCL/OpenCL kernel attributes such as 'reqd_work_group_size' or - // 'work_group_size_hint' from metadata attached to the kernel function and - // store it in the SYCLKernelInfo. - // TODO(Lukas, ONNX-399): Validate that DPC++ used metadata to represent - // that information. - restoreKernelAttributes(NewMod.get(), Kernel); - - if (First) { - // We can simply assign the module we just loaded from SPIR-V to the - // empty pointer on the first iteration. - Result = std::move(NewMod); - // The first module will dictate the address bits for the remaining. - AddressBits = BinInfo.AddressBits; - First = false; - } else { - // We have already loaded some module, so now we need to - // link the module we just loaded with the result so far. - // FIXME: We allow duplicates to be overridden by the module - // read last. This could cause problems if different modules contain - // definitions with the same name, but different body/content. - // Check that this is not problematic. - Linker::linkModules(*Result, std::move(NewMod), - Linker::Flags::OverrideFromSrc); - if (AddressBits != BinInfo.AddressBits) { - return createStringError( - inconvertibleErrorCode(), - "Number of address bits between SPIR-V modules does not match"); - } - } - } - return std::move(Result); + // Restore SYCL/OpenCL kernel attributes such as 'reqd_work_group_size' or + // 'work_group_size_hint' from metadata attached to the kernel function and + // store it in the SYCLKernelInfo. + restoreKernelAttributes(NewMod.get(), Kernel); + return std::move(NewMod); } Expected diff --git a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h index 5f1d416e45150..d82a9cc82466c 100644 --- a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h +++ b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h @@ -25,8 +25,7 @@ class SPIRVLLVMTranslator { /// /// Load a list of SPIR-V kernels into a single LLVM module. static llvm::Expected> - loadSPIRVKernels(llvm::LLVMContext &LLVMCtx, - std::vector &Kernels); + loadSPIRVKernel(llvm::LLVMContext &LLVMCtx, SYCLKernelInfo &Kernel); /// /// Translate the LLVM IR module Mod to SPIR-V, store it in the JITContext and @@ -35,10 +34,6 @@ class SPIRVLLVMTranslator { JITContext &JITCtx); private: - /// - /// Pair of address and size to represent a binary blob. - using BinaryBlob = std::pair; - /// /// Get an attribute value consisting of NumValues scalar constant integers /// from the MDNode. @@ -53,12 +48,6 @@ class SPIRVLLVMTranslator { /// - work_group_size_hint static void restoreKernelAttributes(llvm::Module *Mod, SYCLKernelInfo &Info); - /// - /// Read the given SPIR-V binary and translate it to a new LLVM module - /// associated with the given context. - static llvm::Expected> - readAndTranslateSPIRV(llvm::LLVMContext &LLVMCtx, BinaryBlob Input); - /// /// Default settings for the SPIRV translation options. static SPIRV::TranslatorOpts &translatorOpts(); diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index cc39eabc294c8..ad93d93dc70d5 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -39,6 +39,70 @@ translateBinaryImageFormat(pi::PiDeviceBinaryType Type) { } } +std::pair +retrieveKernelBinary(QueueImplPtr &Queue, CGExecKernel *KernelCG) { + auto KernelName = KernelCG->getKernelName(); + + bool isNvidia = Queue->getDeviceImplPtr()->getPlugin().getBackend() == + backend::ext_oneapi_cuda; + if (isNvidia) { + auto KernelID = ProgramManager::getInstance().getSYCLKernelID(KernelName); + std::vector KernelIds; + KernelIds.push_back(KernelID); + auto DeviceImages = + ProgramManager::getInstance().getRawDeviceImages(KernelIds); + const RTDeviceBinaryImage *DeviceImage = nullptr; + for (auto *DI : DeviceImages) { + // We are looking for a device image with LLVM IR format and target spec + // "llvm_nvptx64", which has been set by the offload-wrapper action. + if (DI->getFormat() == PI_DEVICE_BINARY_TYPE_LLVMIR_BITCODE && + DI->getRawData().DeviceTargetSpec == std::string("llvm_nvptx64")) { + DeviceImage = DI; + break; + } + } + if (!DeviceImage) { + return {nullptr, nullptr}; + } + auto ContextImpl = Queue->getContextImplPtr(); + auto Context = detail::createSyclObjFromImpl(ContextImpl); + auto DeviceImpl = Queue->getDeviceImplPtr(); + auto Device = detail::createSyclObjFromImpl(DeviceImpl); + RT::PiProgram Program = + detail::ProgramManager::getInstance().createPIProgram(*DeviceImage, + Context, Device); + return {DeviceImage, Program}; + } + + const RTDeviceBinaryImage *DeviceImage = nullptr; + RT::PiProgram Program = nullptr; + if (KernelCG->getKernelBundle() != nullptr) { + // Retrieve the device image from the kernel bundle. + auto KernelBundle = KernelCG->getKernelBundle(); + kernel_id KernelID = + detail::ProgramManager::getInstance().getSYCLKernelID(KernelName); + + auto SyclKernel = detail::getSyclObjImpl( + KernelBundle->get_kernel(KernelID, KernelBundle)); + + DeviceImage = SyclKernel->getDeviceImage()->get_bin_image_ref(); + Program = SyclKernel->getDeviceImage()->get_program_ref(); + } else if (KernelCG->MSyclKernel != nullptr) { + DeviceImage = KernelCG->MSyclKernel->getDeviceImage()->get_bin_image_ref(); + Program = KernelCG->MSyclKernel->getDeviceImage()->get_program_ref(); + } else { + auto ContextImpl = Queue->getContextImplPtr(); + auto Context = detail::createSyclObjFromImpl(ContextImpl); + auto DeviceImpl = Queue->getDeviceImplPtr(); + auto Device = detail::createSyclObjFromImpl(DeviceImpl); + DeviceImage = &detail::ProgramManager::getInstance().getDeviceImage( + KernelCG->MOSModuleHandle, KernelName, Context, Device); + Program = detail::ProgramManager::getInstance().createPIProgram( + *DeviceImage, Context, Device); + } + return {DeviceImage, Program}; +} + static ::jit_compiler::ParameterKind translateArgType(kernel_param_kind_t Kind) { using PK = ::jit_compiler::ParameterKind; @@ -574,33 +638,9 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, "Cannot fuse kernel with invalid kernel function name"); return nullptr; } - const RTDeviceBinaryImage *DeviceImage = nullptr; - RT::PiProgram Program = nullptr; - if (KernelCG->getKernelBundle() != nullptr) { - // Retrieve the device image from the kernel bundle. - auto KernelBundle = KernelCG->getKernelBundle(); - kernel_id KernelID = - detail::ProgramManager::getInstance().getSYCLKernelID(KernelName); - - auto SyclKernel = detail::getSyclObjImpl( - KernelBundle->get_kernel(KernelID, KernelBundle)); - - DeviceImage = SyclKernel->getDeviceImage()->get_bin_image_ref(); - Program = SyclKernel->getDeviceImage()->get_program_ref(); - } else if (KernelCG->MSyclKernel != nullptr) { - DeviceImage = - KernelCG->MSyclKernel->getDeviceImage()->get_bin_image_ref(); - Program = KernelCG->MSyclKernel->getDeviceImage()->get_program_ref(); - } else { - auto ContextImpl = Queue->getContextImplPtr(); - auto Context = detail::createSyclObjFromImpl(ContextImpl); - auto DeviceImpl = Queue->getDeviceImplPtr(); - auto Device = detail::createSyclObjFromImpl(DeviceImpl); - DeviceImage = &detail::ProgramManager::getInstance().getDeviceImage( - KernelCG->MOSModuleHandle, KernelName, Context, Device); - Program = detail::ProgramManager::getInstance().createPIProgram( - *DeviceImage, Context, Device); - } + + auto [DeviceImage, Program] = retrieveKernelBinary(Queue, KernelCG); + if (!DeviceImage || !Program) { printPerformanceWarning("No suitable IR available for fusion"); return nullptr; @@ -664,8 +704,7 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, return nullptr; } ::jit_compiler::SYCLKernelBinaryInfo BinInfo{ - translateBinaryImageFormat(DeviceImage->getFormat()), 0, - RawDeviceImage.BinaryStart, DeviceImageSize}; + BinaryImageFormat, 0, RawDeviceImage.BinaryStart, DeviceImageSize}; constexpr auto SYCLTypeToIndices = [](auto Val) -> ::jit_compiler::Indices { return {Val.get(0), Val.get(1), Val.get(2)}; From 0244fc050022a159dec12c62246687343dec7b9d Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Wed, 11 Jan 2023 14:02:59 +0000 Subject: [PATCH 03/16] [SYCL][Fusion] Add result translation to PTX Signed-off-by: Lukas Sommer --- sycl-fusion/common/include/Kernel.h | 2 +- sycl-fusion/common/lib/KernelIO.h | 1 + sycl-fusion/jit-compiler/CMakeLists.txt | 6 +- sycl-fusion/jit-compiler/include/JITContext.h | 16 ++- sycl-fusion/jit-compiler/include/Options.h | 6 +- sycl-fusion/jit-compiler/lib/JITContext.cpp | 14 +- sycl-fusion/jit-compiler/lib/KernelFusion.cpp | 18 +-- ...{LoadKernels.cpp => KernelTranslation.cpp} | 121 ++++++++++++++++-- .../{LoadKernels.h => KernelTranslation.h} | 14 +- .../lib/translation/SPIRVLLVMTranslation.cpp | 4 +- .../lib/translation/SPIRVLLVMTranslation.h | 2 +- sycl/source/detail/jit_compiler.cpp | 17 +++ 12 files changed, 186 insertions(+), 35 deletions(-) rename sycl-fusion/jit-compiler/lib/translation/{LoadKernels.cpp => KernelTranslation.cpp} (50%) rename sycl-fusion/jit-compiler/lib/translation/{LoadKernels.h => KernelTranslation.h} (64%) diff --git a/sycl-fusion/common/include/Kernel.h b/sycl-fusion/common/include/Kernel.h index 1962dd042ffbe..87726b3368d63 100644 --- a/sycl-fusion/common/include/Kernel.h +++ b/sycl-fusion/common/include/Kernel.h @@ -34,7 +34,7 @@ enum class ParameterKind : uint32_t { }; /// Different binary formats supported as input to the JIT compiler. -enum class BinaryFormat : uint32_t { INVALID, LLVM, SPIRV }; +enum class BinaryFormat : uint32_t { INVALID, LLVM, SPIRV, PTX }; /// Information about a device intermediate representation module (e.g., SPIR-V, /// LLVM IR) from DPC++. diff --git a/sycl-fusion/common/lib/KernelIO.h b/sycl-fusion/common/lib/KernelIO.h index 12c194f8b4dd4..09058d61e9981 100644 --- a/sycl-fusion/common/lib/KernelIO.h +++ b/sycl-fusion/common/lib/KernelIO.h @@ -47,6 +47,7 @@ template <> struct ScalarEnumerationTraits { static void enumeration(IO &IO, jit_compiler::BinaryFormat &BF) { IO.enumCase(BF, "LLVM", jit_compiler::BinaryFormat::LLVM); IO.enumCase(BF, "SPIRV", jit_compiler::BinaryFormat::SPIRV); + IO.enumCase(BF, "PTX", jit_compiler::BinaryFormat::PTX); IO.enumCase(BF, "INVALID", jit_compiler::BinaryFormat::INVALID); } }; diff --git a/sycl-fusion/jit-compiler/CMakeLists.txt b/sycl-fusion/jit-compiler/CMakeLists.txt index 92f0cefd68634..f67eed5fc6c76 100644 --- a/sycl-fusion/jit-compiler/CMakeLists.txt +++ b/sycl-fusion/jit-compiler/CMakeLists.txt @@ -2,7 +2,7 @@ add_llvm_library(sycl-fusion lib/KernelFusion.cpp lib/JITContext.cpp - lib/translation/LoadKernels.cpp + lib/translation/KernelTranslation.cpp lib/translation/SPIRVLLVMTranslation.cpp lib/fusion/FusionPipeline.cpp lib/fusion/FusionHelper.cpp @@ -20,6 +20,10 @@ add_llvm_library(sycl-fusion Linker ScalarOpts InstCombine + Target + NVPTX + X86 + MC ) target_include_directories(sycl-fusion diff --git a/sycl-fusion/jit-compiler/include/JITContext.h b/sycl-fusion/jit-compiler/include/JITContext.h index 4c0616e267941..d4654f19820ce 100644 --- a/sycl-fusion/jit-compiler/include/JITContext.h +++ b/sycl-fusion/jit-compiler/include/JITContext.h @@ -36,17 +36,21 @@ using CacheKeyT = std::optional>>; /// -/// Wrapper around a SPIR-V binary. -class SPIRVBinary { +/// Wrapper around a kernel binary. +class KernelBinary { public: - explicit SPIRVBinary(std::string Binary); + explicit KernelBinary(std::string Binary, BinaryFormat Format); jit_compiler::BinaryAddress address() const; size_t size() const; + BinaryFormat format() const; + private: std::string Blob; + + BinaryFormat Format; }; /// @@ -61,7 +65,8 @@ class JITContext { llvm::LLVMContext *getLLVMContext(); - SPIRVBinary &emplaceSPIRVBinary(std::string Binary); + KernelBinary &emplaceSPIRVBinary(std::string Binary, + BinaryFormat Format); std::optional getCacheEntry(CacheKeyT &Identifier) const; @@ -79,11 +84,12 @@ class JITContext { MutexT BinariesMutex; - std::vector Binaries; + std::vector Binaries; mutable MutexT CacheMutex; std::unordered_map Cache; + }; } // namespace jit_compiler diff --git a/sycl-fusion/jit-compiler/include/Options.h b/sycl-fusion/jit-compiler/include/Options.h index 335f58fb64cf7..4fe7787df00db 100644 --- a/sycl-fusion/jit-compiler/include/Options.h +++ b/sycl-fusion/jit-compiler/include/Options.h @@ -9,12 +9,13 @@ #ifndef SYCL_FUSION_JIT_COMPILER_OPTIONS_H #define SYCL_FUSION_JIT_COMPILER_OPTIONS_H +#include "Kernel.h" #include #include namespace jit_compiler { -enum OptionID { VerboseOutput, EnableCaching }; +enum OptionID { VerboseOutput, EnableCaching, TargetFormat }; class OptionPtrBase {}; @@ -78,6 +79,9 @@ struct JITEnableVerbose : public OptionBase {}; struct JITEnableCaching : public OptionBase {}; +struct JITTargetFormat + : public OptionBase {}; + } // namespace option } // namespace jit_compiler diff --git a/sycl-fusion/jit-compiler/lib/JITContext.cpp b/sycl-fusion/jit-compiler/lib/JITContext.cpp index 68c7031b9d8a9..e1dda8b928c45 100644 --- a/sycl-fusion/jit-compiler/lib/JITContext.cpp +++ b/sycl-fusion/jit-compiler/lib/JITContext.cpp @@ -11,14 +11,17 @@ using namespace jit_compiler; -SPIRVBinary::SPIRVBinary(std::string Binary) : Blob{std::move(Binary)} {} +KernelBinary::KernelBinary(std::string Binary, BinaryFormat Fmt) + : Blob{std::move(Binary)}, Format{Fmt} {} -jit_compiler::BinaryAddress SPIRVBinary::address() const { +jit_compiler::BinaryAddress KernelBinary::address() const { // FIXME: Verify it's a good idea to perform this reinterpret_cast here. return reinterpret_cast(Blob.c_str()); } -size_t SPIRVBinary::size() const { return Blob.size(); } +size_t KernelBinary::size() const { return Blob.size(); } + +BinaryFormat KernelBinary::format() const { return Format; } JITContext::JITContext() : LLVMCtx{new llvm::LLVMContext}, Binaries{} {} @@ -26,11 +29,12 @@ JITContext::~JITContext() = default; llvm::LLVMContext *JITContext::getLLVMContext() { return LLVMCtx.get(); } -SPIRVBinary &JITContext::emplaceSPIRVBinary(std::string Binary) { +KernelBinary &JITContext::emplaceSPIRVBinary(std::string Binary, + BinaryFormat Format) { WriteLockT WriteLock{BinariesMutex}; // NOTE: With C++17, which returns a reference from emplace_back, the // following code would be even simpler. - Binaries.emplace_back(std::move(Binary)); + Binaries.emplace_back(std::move(Binary), Format); return Binaries.back(); } diff --git a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp index fc168587738b5..eeffb0af6ced8 100644 --- a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp +++ b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp @@ -15,7 +15,7 @@ #include "fusion/FusionPipeline.h" #include "helper/ConfigHelper.h" #include "helper/ErrorHandling.h" -#include "translation/LoadKernels.h" +#include "translation/KernelTranslation.h" #include "translation/SPIRVLLVMTranslation.h" #include #include @@ -98,8 +98,8 @@ FusionResult KernelFusion::fuseKernels( // Load all input kernels from their respective SPIR-V modules into a single // LLVM IR module. llvm::Expected> ModOrError = - translation::KernelLoader::loadKernels(*JITCtx.getLLVMContext(), - ModuleInfo.kernels()); + translation::KernelTranslator::loadKernels(*JITCtx.getLLVMContext(), + ModuleInfo.kernels()); if (auto Error = ModOrError.takeError()) { return errorToFusionResult(std::move(Error), "SPIR-V translation failed"); } @@ -137,14 +137,14 @@ FusionResult KernelFusion::fuseKernels( SYCLKernelInfo &FusedKernelInfo = *NewModInfo->getKernelFor(FusedKernelName); - // Translate the LLVM IR module resulting from the fusion pass into SPIR-V. - llvm::Expected BinaryOrError = - translation::SPIRVLLVMTranslator::translateLLVMtoSPIRV(*NewMod, JITCtx); - if (auto Error = BinaryOrError.takeError()) { + // TODO + BinaryFormat TargetFormat = ConfigHelper::get(); + + if (auto Error = translation::KernelTranslator::translateKernel( + FusedKernelInfo, *NewMod, JITCtx, TargetFormat)) { return errorToFusionResult(std::move(Error), - "Translation to SPIR-V failed"); + "Translation to output format failed"); } - jit_compiler::SPIRVBinary *SPIRVBin = *BinaryOrError; FusedKernelInfo.NDR = FusedKernel.FusedNDRange; diff --git a/sycl-fusion/jit-compiler/lib/translation/LoadKernels.cpp b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp similarity index 50% rename from sycl-fusion/jit-compiler/lib/translation/LoadKernels.cpp rename to sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp index cabf1f6c44e9f..31fa850d58d63 100644 --- a/sycl-fusion/jit-compiler/lib/translation/LoadKernels.cpp +++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp @@ -1,4 +1,4 @@ -//==-------------------------- LoadKernels.cpp ----------------------------==// +//==----------------------- KernelTranslation.cpp -------------------------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,19 +6,24 @@ // //===----------------------------------------------------------------------===// -#include "LoadKernels.h" +#include "KernelTranslation.h" #include "SPIRVLLVMTranslation.h" #include "llvm/Bitcode/BitcodeReader.h" +#include "llvm/IR/LegacyPassManager.h" #include "llvm/Linker/Linker.h" +#include "llvm/MC/TargetRegistry.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" using namespace jit_compiler; using namespace jit_compiler::translation; using namespace llvm; llvm::Expected> -KernelLoader::loadKernels(llvm::LLVMContext &LLVMCtx, - std::vector &Kernels) { +KernelTranslator::loadKernels(llvm::LLVMContext &LLVMCtx, + std::vector &Kernels) { std::unique_ptr Result{nullptr}; bool First = true; DenseSet ParsedBinaries; @@ -100,8 +105,8 @@ KernelLoader::loadKernels(llvm::LLVMContext &LLVMCtx, } llvm::Expected> -KernelLoader::loadLLVMKernel(llvm::LLVMContext &LLVMCtx, - SYCLKernelInfo &Kernel) { +KernelTranslator::loadLLVMKernel(llvm::LLVMContext &LLVMCtx, + SYCLKernelInfo &Kernel) { auto &BinInfo = Kernel.BinaryInfo; llvm::StringRef RawData(reinterpret_cast(BinInfo.BinaryStart), BinInfo.BinarySize); @@ -110,7 +115,107 @@ KernelLoader::loadLLVMKernel(llvm::LLVMContext &LLVMCtx, } llvm::Expected> -KernelLoader::loadSPIRVKernel(llvm::LLVMContext &LLVMCtx, - SYCLKernelInfo &Kernel) { +KernelTranslator::loadSPIRVKernel(llvm::LLVMContext &LLVMCtx, + SYCLKernelInfo &Kernel) { return SPIRVLLVMTranslator::loadSPIRVKernel(LLVMCtx, Kernel); } + +llvm::Error KernelTranslator::translateKernel(SYCLKernelInfo &Kernel, + llvm::Module &Mod, + JITContext &JITCtx, + BinaryFormat Format) { + + KernelBinary *KernelBin = nullptr; + switch (Format) { + case BinaryFormat::SPIRV: { + llvm::Expected BinaryOrError = + translateToSPIRV(Mod, JITCtx); + if (auto Error = BinaryOrError.takeError()) { + return Error; + } + KernelBin = *BinaryOrError; + break; + } + case BinaryFormat::PTX: { + llvm::Expected BinaryOrError = translateToPTX(Mod, JITCtx); + if (auto Error = BinaryOrError.takeError()) { + return Error; + } + KernelBin = *BinaryOrError; + break; + } + default: { + return createStringError( + inconvertibleErrorCode(), + "Failed to translate kernel to unsupported output format"); + } + } + + // Update the KernelInfo for the fused kernel with the address and size of the + // SPIR-V binary resulting from translation. + SYCLKernelBinaryInfo &FusedBinaryInfo = Kernel.BinaryInfo; + FusedBinaryInfo.Format = Format; + // Output SPIR-V should use the same number of address bits as the input + // SPIR-V. SPIR-V translation requires all modules to use the same number of + // address bits, so it's safe to take the value from the first one. + FusedBinaryInfo.AddressBits = Mod.getDataLayout().getPointerSizeInBits(); + FusedBinaryInfo.BinaryStart = KernelBin->address(); + FusedBinaryInfo.BinarySize = KernelBin->size(); + return Error::success(); +} + +llvm::Expected +KernelTranslator::translateToSPIRV(llvm::Module &Mod, JITContext &JITCtx) { + return SPIRVLLVMTranslator::translateLLVMtoSPIRV(Mod, JITCtx); +} + +llvm::Expected +KernelTranslator::translateToPTX(llvm::Module &Mod, JITContext &JITCtx) { + // FIXME: Can we limit this to the NVPTX specific target? + llvm::InitializeAllTargets(); + llvm::InitializeAllAsmParsers(); + llvm::InitializeAllAsmPrinters(); + llvm::InitializeAllTargetMCs(); + + std::string TargetTriple{"nvptx64-nvidia-cuda"}; + + std::string ErrorMessage; + const auto *Target = + llvm::TargetRegistry::lookupTarget(TargetTriple, ErrorMessage); + + if (!Target) { + return createStringError( + inconvertibleErrorCode(), + "Failed to load and translate SPIR-V module with error %s", + ErrorMessage.c_str()); + } + + // FIXME: Check whether we can provide more accurate target information here + auto *TargetMachine = Target->createTargetMachine( + TargetTriple, "sm_50", "+sm_50,+ptx76", {}, llvm::Reloc::PIC_, + std::nullopt, llvm::CodeGenOpt::Default); + + llvm::legacy::PassManager PM; + + std::string PTXASM; + + { + llvm::raw_string_ostream ASMStream{PTXASM}; + llvm::buffer_ostream BufferedASM{ASMStream}; + + if (TargetMachine->addPassesToEmitFile(PM, BufferedASM, nullptr, + llvm::CGFT_AssemblyFile)) { + return createStringError( + inconvertibleErrorCode(), + "Failed to construct pass pipeline to emit output"); + } + + PM.run(Mod); + ASMStream.flush(); + } + + llvm::dbgs() << "PTX size: " << PTXASM.size() << "\n"; + llvm::dbgs() << "PTX:\n" << PTXASM << "\n"; + + return &JITCtx.emplaceSPIRVBinary(PTXASM, BinaryFormat::PTX); +} diff --git a/sycl-fusion/jit-compiler/lib/translation/LoadKernels.h b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h similarity index 64% rename from sycl-fusion/jit-compiler/lib/translation/LoadKernels.h rename to sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h index 5720abf09bfd7..3d6824edbdd37 100644 --- a/sycl-fusion/jit-compiler/lib/translation/LoadKernels.h +++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h @@ -1,4 +1,4 @@ -//==-- LoadKernels.h - Load LLVM IR for SYCL kernels in different formats -==// +//==- KernelTranslation - Translate SYCL kernels between different formats -==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "JITContext.h" #include "Kernel.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" @@ -15,12 +16,15 @@ namespace jit_compiler { namespace translation { -class KernelLoader { +class KernelTranslator { public: static llvm::Expected> loadKernels(llvm::LLVMContext &LLVMCtx, std::vector &Kernels); + static llvm::Error translateKernel(SYCLKernelInfo &Kernel, llvm::Module &Mod, + JITContext &JITCtx, BinaryFormat Format); + private: /// /// Pair of address and size to represent a binary blob. @@ -31,6 +35,12 @@ class KernelLoader { static llvm::Expected> loadSPIRVKernel(llvm::LLVMContext &LLVMCtx, SYCLKernelInfo &Kernel); + + static llvm::Expected translateToSPIRV(llvm::Module &Mod, + JITContext &JITCtx); + + static llvm::Expected translateToPTX(llvm::Module &Mod, + JITContext &JITCtx); }; } // namespace translation } // namespace jit_compiler diff --git a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp index fc481f23b141d..1046fd6dc2907 100644 --- a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp +++ b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp @@ -120,7 +120,7 @@ SPIRVLLVMTranslator::loadSPIRVKernel(llvm::LLVMContext &LLVMCtx, return std::move(NewMod); } -Expected +Expected SPIRVLLVMTranslator::translateLLVMtoSPIRV(Module &Mod, JITContext &JITCtx) { std::ostringstream BinaryStream; std::string ErrMsg; @@ -131,5 +131,5 @@ SPIRVLLVMTranslator::translateLLVMtoSPIRV(Module &Mod, JITContext &JITCtx) { "Translation of LLVM IR to SPIR-V failed with error %s", ErrMsg.c_str()); } - return &JITCtx.emplaceSPIRVBinary(BinaryStream.str()); + return &JITCtx.emplaceSPIRVBinary(BinaryStream.str(), BinaryFormat::SPIRV); } diff --git a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h index d82a9cc82466c..440c00103b0d5 100644 --- a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h +++ b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h @@ -30,7 +30,7 @@ class SPIRVLLVMTranslator { /// /// Translate the LLVM IR module Mod to SPIR-V, store it in the JITContext and /// return a pointer to its container. - static llvm::Expected translateLLVMtoSPIRV(llvm::Module &Mod, + static llvm::Expected translateLLVMtoSPIRV(llvm::Module &Mod, JITContext &JITCtx); private: diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index ad93d93dc70d5..e081a61a99b4b 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -39,6 +39,21 @@ translateBinaryImageFormat(pi::PiDeviceBinaryType Type) { } } +::jit_compiler::BinaryFormat getTargetFormat(QueueImplPtr &Queue) { + auto Backend = Queue->getDeviceImplPtr()->getPlugin().getBackend(); + switch (Backend) { + case backend::ext_oneapi_level_zero: + case backend::opencl: + return ::jit_compiler::BinaryFormat::SPIRV; + case backend::ext_oneapi_cuda: + return ::jit_compiler::BinaryFormat::PTX; + default: + throw sycl::exception( + sycl::make_error_code(sycl::errc::feature_not_supported), + "Backend unsupported by kernel fusion"); + } +} + std::pair retrieveKernelBinary(QueueImplPtr &Queue, CGExecKernel *KernelCG) { auto KernelName = KernelCG->getKernelName(); @@ -784,6 +799,8 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, JITConfig.set<::jit_compiler::option::JITEnableVerbose>(DebugEnabled); JITConfig.set<::jit_compiler::option::JITEnableCaching>( detail::SYCLConfig::get()); + JITConfig.set<::jit_compiler::option::JITTargetFormat>( + getTargetFormat(Queue)); auto FusionResult = ::jit_compiler::KernelFusion::fuseKernels( *MJITContext, std::move(JITConfig), InputKernelInfo, InputKernelNames, From 70980941c46313b490713e8045ca8e85a006a509 Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Thu, 12 Jan 2023 08:56:43 +0000 Subject: [PATCH 04/16] [SYCL][Fusion] Provide correct target spec; Signed-off-by: Lukas Sommer --- sycl-fusion/jit-compiler/lib/KernelFusion.cpp | 12 ------ sycl/source/detail/jit_compiler.cpp | 38 ++++++++++++++----- sycl/source/detail/jit_compiler.hpp | 3 +- sycl/source/detail/jit_device_binaries.cpp | 10 ++--- sycl/source/detail/jit_device_binaries.hpp | 4 +- 5 files changed, 37 insertions(+), 30 deletions(-) diff --git a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp index eeffb0af6ced8..6c14ac182514a 100644 --- a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp +++ b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp @@ -148,18 +148,6 @@ FusionResult KernelFusion::fuseKernels( FusedKernelInfo.NDR = FusedKernel.FusedNDRange; - // Update the KernelInfo for the fused kernel with the address and size of the - // SPIR-V binary resulting from translation. - SYCLKernelBinaryInfo &FusedBinaryInfo = FusedKernelInfo.BinaryInfo; - FusedBinaryInfo.Format = BinaryFormat::SPIRV; - // Output SPIR-V should use the same number of address bits as the input - // SPIR-V. SPIR-V translation requires all modules to use the same number of - // address bits, so it's safe to take the value from the first one. - FusedBinaryInfo.AddressBits = - ModuleInfo.kernels().front().BinaryInfo.AddressBits; - FusedBinaryInfo.BinaryStart = SPIRVBin->address(); - FusedBinaryInfo.BinarySize = SPIRVBin->size(); - if (CachingEnabled) { JITCtx.addCacheEntry(CacheKey, FusedKernelInfo); } diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index e081a61a99b4b..b2802c57e3589 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -799,8 +799,9 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, JITConfig.set<::jit_compiler::option::JITEnableVerbose>(DebugEnabled); JITConfig.set<::jit_compiler::option::JITEnableCaching>( detail::SYCLConfig::get()); - JITConfig.set<::jit_compiler::option::JITTargetFormat>( - getTargetFormat(Queue)); + + ::jit_compiler::BinaryFormat TargetFormat = getTargetFormat(Queue); + JITConfig.set<::jit_compiler::option::JITTargetFormat>(TargetFormat); auto FusionResult = ::jit_compiler::KernelFusion::fuseKernels( *MJITContext, std::move(JITConfig), InputKernelInfo, InputKernelNames, @@ -842,7 +843,7 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, updatePromotedArgs(FusedKernelInfo, NDRDesc, FusedArgs, ArgsStorage); if (!FusionResult.cached()) { - auto PIDeviceBinaries = createPIDeviceBinary(FusedKernelInfo); + auto PIDeviceBinaries = createPIDeviceBinary(FusedKernelInfo, TargetFormat); detail::ProgramManager::getInstance().addImages(PIDeviceBinaries); } else if (DebugEnabled) { std::cerr << "INFO: Re-using existing device binary for fused kernel\n"; @@ -854,9 +855,11 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, FusedKernelInfo.Name); std::vector> RawExtendedMembers; - std::shared_ptr KernelBundleImplPtr = - detail::getSyclObjImpl(get_kernel_bundle( - Queue->get_context(), {Queue->get_device()}, {FusedKernelId})); + std::shared_ptr KernelBundleImplPtr; + if (TargetFormat == ::jit_compiler::BinaryFormat::SPIRV) { + detail::getSyclObjImpl(get_kernel_bundle( + Queue->get_context(), {Queue->get_device()}, {FusedKernelId})); + } std::unique_ptr FusedCG; FusedCG.reset(new detail::CGExecKernel( @@ -869,7 +872,25 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, } pi_device_binaries jit_compiler::createPIDeviceBinary( - const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo) { + const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo, + ::jit_compiler::BinaryFormat Format) { + + const char *TargetSpec = nullptr; + switch (Format) { + case ::jit_compiler::BinaryFormat::PTX: { + TargetSpec = __SYCL_PI_DEVICE_BINARY_TARGET_NVPTX64; + break; + } + case ::jit_compiler::BinaryFormat::SPIRV: { + TargetSpec = (FusedKernelInfo.BinaryInfo.AddressBits == 64) + ? __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64 + : __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV32; + break; + } + default: + sycl::exception(sycl::make_error_code(sycl::errc::invalid), + "Invalid output format"); + } DeviceBinaryContainer Binary; @@ -897,8 +918,7 @@ pi_device_binaries jit_compiler::createPIDeviceBinary( DeviceBinariesCollection Collection; Collection.addDeviceBinary(std::move(Binary), FusedKernelInfo.BinaryInfo.BinaryStart, - FusedKernelInfo.BinaryInfo.BinarySize, - FusedKernelInfo.BinaryInfo.AddressBits); + FusedKernelInfo.BinaryInfo.BinarySize, TargetSpec); JITDeviceBinaries.push_back(std::move(Collection)); return JITDeviceBinaries.back().getPIDeviceStruct(); diff --git a/sycl/source/detail/jit_compiler.hpp b/sycl/source/detail/jit_compiler.hpp index 522c0749ef75b..e02be562de3ee 100644 --- a/sycl/source/detail/jit_compiler.hpp +++ b/sycl/source/detail/jit_compiler.hpp @@ -13,6 +13,7 @@ #include namespace jit_compiler { +enum class BinaryFormat; class JITContext; struct SYCLKernelInfo; using ArgUsageMask = std::vector; @@ -46,7 +47,7 @@ class jit_compiler { jit_compiler &operator=(const jit_compiler &&) = delete; pi_device_binaries - createPIDeviceBinary(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo); + createPIDeviceBinary(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo, ::jit_compiler::BinaryFormat Format); std::vector encodeArgUsageMask(const ::jit_compiler::ArgUsageMask &Mask) const; diff --git a/sycl/source/detail/jit_device_binaries.cpp b/sycl/source/detail/jit_device_binaries.cpp index 0aa778da14240..eadd71d021ee8 100644 --- a/sycl/source/detail/jit_device_binaries.cpp +++ b/sycl/source/detail/jit_device_binaries.cpp @@ -81,7 +81,7 @@ void DeviceBinaryContainer::addProperty(PropertySetContainer &&Cont) { } pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary( - const unsigned char *BinaryStart, size_t BinarySize, size_t AddressBits) { + const unsigned char *BinaryStart, size_t BinarySize, const char* TargetSpec) { pi_device_binary_struct DeviceBinary; DeviceBinary.Version = PI_DEVICE_BINARY_VERSION; DeviceBinary.Kind = PI_DEVICE_BINARY_OFFLOAD_KIND_SYCL; @@ -94,9 +94,7 @@ pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary( DeviceBinary.BinaryStart = BinaryStart; DeviceBinary.BinaryEnd = BinaryStart + BinarySize; DeviceBinary.Format = PI_DEVICE_BINARY_TYPE_SPIRV; - DeviceBinary.DeviceTargetSpec = (AddressBits == 32) - ? __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV32 - : __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64; + DeviceBinary.DeviceTargetSpec = TargetSpec; DeviceBinary.EntriesBegin = PIOffloadEntries.data(); DeviceBinary.EntriesEnd = PIOffloadEntries.data() + PIOffloadEntries.size(); DeviceBinary.PropertySetsBegin = PIPropertySets.data(); @@ -108,14 +106,14 @@ pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary( void DeviceBinariesCollection::addDeviceBinary(DeviceBinaryContainer &&Cont, const unsigned char *BinaryStart, size_t BinarySize, - size_t AddressBits) { + const char* TargetSpec) { // Adding to the vectors might trigger reallocation, which would invalidate // the pointers used for PI structs if a PI struct has already been created // via getPIDeviceStruct(). Forbid calls to this method after the first PI // struct has been created. assert(Fused && "Adding to container would invalidate existing PI structs"); PIBinaries.push_back( - Cont.getPIDeviceBinary(BinaryStart, BinarySize, AddressBits)); + Cont.getPIDeviceBinary(BinaryStart, BinarySize, TargetSpec)); Binaries.push_back(std::move(Cont)); } diff --git a/sycl/source/detail/jit_device_binaries.hpp b/sycl/source/detail/jit_device_binaries.hpp index 7bf2c7d9fe07b..6fa142543b61f 100644 --- a/sycl/source/detail/jit_device_binaries.hpp +++ b/sycl/source/detail/jit_device_binaries.hpp @@ -111,7 +111,7 @@ class DeviceBinaryContainer { pi_device_binary_struct getPIDeviceBinary(const unsigned char *BinaryStart, size_t BinarySize, - size_t AddressBits); + const char* TargetSpec); private: bool Fused = true; @@ -138,7 +138,7 @@ class DeviceBinariesCollection { void addDeviceBinary(DeviceBinaryContainer &&Cont, const unsigned char *BinaryStart, size_t BinarySize, - size_t AddressBits); + const char* TargetSpec); pi_device_binaries getPIDeviceStruct(); private: From 2b8e92d86cdee9975100cbdd236dfc34405d75c4 Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Fri, 13 Jan 2023 13:16:28 +0000 Subject: [PATCH 05/16] [SYCL][Fusion] Avoid removing dependencies Avoid the dependencies of the dependencies to be removed when cleaning up the input commands from the graph without executing them. Signed-off-by: Lukas Sommer --- sycl/source/detail/scheduler/commands.hpp | 8 ++++++++ sycl/source/detail/scheduler/graph_builder.cpp | 4 ++++ 2 files changed, 12 insertions(+) diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index d4219770abbae..482c7a1d7d309 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -289,6 +289,14 @@ class Command { // XPTI instrumentation. Copy code location details to the internal struct. // Memory is allocated in this method and released in destructor. void copySubmissionCodeLocation(); + + /// Clear all dependency events for device and host dependencies. This should + /// only be used if a command is about to be deleted without being executed + /// before that. + void clearAllDependencies() { + MPreparedDepsEvents.clear(); + MPreparedHostDepsEvents.clear(); + } /// Contains list of dependencies(edges) std::vector MDeps; diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 1132d32b2f605..53cd1a7da6d13 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -1385,6 +1385,10 @@ void Scheduler::GraphBuilder::removeNodeFromGraph( } Node->MDeps.clear(); + // Clear all the dependencies to avoid cleanDepEventsThroughOneLevel, called + // from the destructor of the command to delete the dependencies of the + // command this command depends on. + Node->clearAllDependencies(); } void Scheduler::GraphBuilder::cancelFusion(QueueImplPtr Queue, From fc43285af93e83862935889191452586d7ff26b9 Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Fri, 13 Jan 2023 13:23:14 +0000 Subject: [PATCH 06/16] [SYCL][Fusion] Set device binary image format Signed-off-by: Lukas Sommer --- sycl/source/detail/jit_compiler.cpp | 12 +++++++++--- sycl/source/detail/jit_device_binaries.cpp | 8 ++++---- sycl/source/detail/jit_device_binaries.hpp | 4 ++-- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index b2802c57e3589..fe645e43ab28a 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -842,10 +842,13 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, }(FusedKernelInfo.NDR); updatePromotedArgs(FusedKernelInfo, NDRDesc, FusedArgs, ArgsStorage); + OSModuleHandle Handle = OSUtil::DummyModuleHandle; if (!FusionResult.cached()) { auto PIDeviceBinaries = createPIDeviceBinary(FusedKernelInfo, TargetFormat); detail::ProgramManager::getInstance().addImages(PIDeviceBinaries); + Handle = OSUtil::getOSModuleHandle(PIDeviceBinaries->DeviceBinaries); } else if (DebugEnabled) { + // TODO(Lukas): Create correct OSModuleHandle when using a cached binary. std::cerr << "INFO: Re-using existing device binary for fused kernel\n"; } @@ -866,8 +869,8 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, NDRDesc, nullptr, nullptr, std::move(KernelBundleImplPtr), std::move(ArgsStorage), std::move(AccStorage), std::move(RawExtendedMembers), std::move(Requirements), std::move(Events), - std::move(FusedArgs), FusedKernelInfo.Name, OSUtil::DummyModuleHandle, {}, - {}, CG::CGTYPE::Kernel)); + std::move(FusedArgs), FusedKernelInfo.Name, Handle, {}, {}, + CG::CGTYPE::Kernel)); return FusedCG; } @@ -876,15 +879,18 @@ pi_device_binaries jit_compiler::createPIDeviceBinary( ::jit_compiler::BinaryFormat Format) { const char *TargetSpec = nullptr; + pi_device_binary_type BinFormat = PI_DEVICE_BINARY_TYPE_NATIVE; switch (Format) { case ::jit_compiler::BinaryFormat::PTX: { TargetSpec = __SYCL_PI_DEVICE_BINARY_TARGET_NVPTX64; + BinFormat = PI_DEVICE_BINARY_TYPE_NONE; break; } case ::jit_compiler::BinaryFormat::SPIRV: { TargetSpec = (FusedKernelInfo.BinaryInfo.AddressBits == 64) ? __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64 : __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV32; + BinFormat = PI_DEVICE_BINARY_TYPE_SPIRV; break; } default: @@ -918,7 +924,7 @@ pi_device_binaries jit_compiler::createPIDeviceBinary( DeviceBinariesCollection Collection; Collection.addDeviceBinary(std::move(Binary), FusedKernelInfo.BinaryInfo.BinaryStart, - FusedKernelInfo.BinaryInfo.BinarySize, TargetSpec); + FusedKernelInfo.BinaryInfo.BinarySize, TargetSpec, BinFormat); JITDeviceBinaries.push_back(std::move(Collection)); return JITDeviceBinaries.back().getPIDeviceStruct(); diff --git a/sycl/source/detail/jit_device_binaries.cpp b/sycl/source/detail/jit_device_binaries.cpp index eadd71d021ee8..59530a0e691ce 100644 --- a/sycl/source/detail/jit_device_binaries.cpp +++ b/sycl/source/detail/jit_device_binaries.cpp @@ -81,10 +81,11 @@ void DeviceBinaryContainer::addProperty(PropertySetContainer &&Cont) { } pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary( - const unsigned char *BinaryStart, size_t BinarySize, const char* TargetSpec) { + const unsigned char *BinaryStart, size_t BinarySize, const char* TargetSpec, pi_device_binary_type Format) { pi_device_binary_struct DeviceBinary; DeviceBinary.Version = PI_DEVICE_BINARY_VERSION; DeviceBinary.Kind = PI_DEVICE_BINARY_OFFLOAD_KIND_SYCL; + DeviceBinary.Format = Format; DeviceBinary.CompileOptions = ""; DeviceBinary.LinkOptions = ""; DeviceBinary.ManifestStart = nullptr; @@ -93,7 +94,6 @@ pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary( // the JITContext. DeviceBinary.BinaryStart = BinaryStart; DeviceBinary.BinaryEnd = BinaryStart + BinarySize; - DeviceBinary.Format = PI_DEVICE_BINARY_TYPE_SPIRV; DeviceBinary.DeviceTargetSpec = TargetSpec; DeviceBinary.EntriesBegin = PIOffloadEntries.data(); DeviceBinary.EntriesEnd = PIOffloadEntries.data() + PIOffloadEntries.size(); @@ -106,14 +106,14 @@ pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary( void DeviceBinariesCollection::addDeviceBinary(DeviceBinaryContainer &&Cont, const unsigned char *BinaryStart, size_t BinarySize, - const char* TargetSpec) { + const char* TargetSpec, pi_device_binary_type Format) { // Adding to the vectors might trigger reallocation, which would invalidate // the pointers used for PI structs if a PI struct has already been created // via getPIDeviceStruct(). Forbid calls to this method after the first PI // struct has been created. assert(Fused && "Adding to container would invalidate existing PI structs"); PIBinaries.push_back( - Cont.getPIDeviceBinary(BinaryStart, BinarySize, TargetSpec)); + Cont.getPIDeviceBinary(BinaryStart, BinarySize, TargetSpec, Format)); Binaries.push_back(std::move(Cont)); } diff --git a/sycl/source/detail/jit_device_binaries.hpp b/sycl/source/detail/jit_device_binaries.hpp index 6fa142543b61f..96079d3a25a19 100644 --- a/sycl/source/detail/jit_device_binaries.hpp +++ b/sycl/source/detail/jit_device_binaries.hpp @@ -111,7 +111,7 @@ class DeviceBinaryContainer { pi_device_binary_struct getPIDeviceBinary(const unsigned char *BinaryStart, size_t BinarySize, - const char* TargetSpec); + const char* TargetSpec, pi_device_binary_type Format); private: bool Fused = true; @@ -138,7 +138,7 @@ class DeviceBinariesCollection { void addDeviceBinary(DeviceBinaryContainer &&Cont, const unsigned char *BinaryStart, size_t BinarySize, - const char* TargetSpec); + const char* TargetSpec, pi_device_binary_type Format); pi_device_binaries getPIDeviceStruct(); private: From 6fad170cdcec62dd4745ade23e86f9f5fbeb735f Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Mon, 16 Jan 2023 14:28:36 +0000 Subject: [PATCH 07/16] [SYCL][Fusion] Refactor target-specific processing Signed-off-by: Lukas Sommer --- sycl-fusion/passes/CMakeLists.txt | 1 + .../passes/kernel-fusion/SYCLKernelFusion.cpp | 133 +++-------- .../passes/kernel-fusion/SYCLKernelFusion.h | 9 +- .../passes/target/TargetFusionInfo.cpp | 225 ++++++++++++++++++ sycl-fusion/passes/target/TargetFusionInfo.h | 137 +++++++++++ 5 files changed, 395 insertions(+), 110 deletions(-) create mode 100644 sycl-fusion/passes/target/TargetFusionInfo.cpp create mode 100644 sycl-fusion/passes/target/TargetFusionInfo.h diff --git a/sycl-fusion/passes/CMakeLists.txt b/sycl-fusion/passes/CMakeLists.txt index fe81b76b5bbcb..41398f6ee9e99 100644 --- a/sycl-fusion/passes/CMakeLists.txt +++ b/sycl-fusion/passes/CMakeLists.txt @@ -35,6 +35,7 @@ add_llvm_library(SYCLKernelFusionPasses syclcp/SYCLCP.cpp cleanup/Cleanup.cpp debug/PassDebug.cpp + target/TargetFusionInfo.cpp DEPENDS intrinsics_gen diff --git a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp index 241496ac9044f..d3dec7f16a81f 100644 --- a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp +++ b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp @@ -31,12 +31,6 @@ using namespace llvm; -constexpr static StringLiteral KernelArgAddrSpace{"kernel_arg_addr_space"}; -constexpr static StringLiteral KernelArgAccessQual{"kernel_arg_access_qual"}; -constexpr static StringLiteral KernelArgType{"kernel_arg_type"}; -constexpr static StringLiteral KernelArgBaseType{"kernel_arg_base_type"}; -constexpr static StringLiteral KernelArgTypeQual{"kernel_arg_type_qual"}; - constexpr StringLiteral SYCLKernelFusion::NDRangeMDKey; constexpr StringLiteral SYCLKernelFusion::NDRangesMDKey; @@ -144,6 +138,8 @@ PreservedAnalyses SYCLKernelFusion::run(Module &M, ModuleAnalysisManager &AM) { AM.getResult(M).ModuleInfo; assert(ModuleInfo && "No module information available"); + auto TFI = TargetFusionInfo::getTargetFusionInfo(&M); + // Iterate over the functions in the module and locate all // stub functions identified by metadata. SmallPtrSet ToCleanUp; @@ -156,7 +152,7 @@ PreservedAnalyses SYCLKernelFusion::run(Module &M, ModuleAnalysisManager &AM) { // attached to this stub function. // The newly created function will carry the name also specified // in the metadata. - if (auto Err = fuseKernel(M, F, ModuleInfo, ToCleanUp)) { + if (auto Err = fuseKernel(M, F, ModuleInfo, TFI, ToCleanUp)) { DeferredErrs = joinErrors(std::move(DeferredErrs), std::move(Err)); } // Rembember the stub for deletion, as it is not required anymore after @@ -164,6 +160,10 @@ PreservedAnalyses SYCLKernelFusion::run(Module &M, ModuleAnalysisManager &AM) { ToCleanUp.insert(&F); } } + // Notify the target-specific logic that some functions will be erased + // shortly. + SmallVector NotifyDelete{ToCleanUp.begin(), ToCleanUp.end()}; + TFI.notifyFunctionsDelete(NotifyDelete); // Delete all the stub functions for (Function *SF : ToCleanUp) { SF->eraseFromParent(); @@ -230,11 +230,13 @@ static FusionInsertPoints addGuard(IRBuilderBase &Builder, return {Entry, CallInsertion, Exit}; } -static Expected createFusionCall( - IRBuilderBase &Builder, Function *F, ArrayRef CallArgs, - const jit_compiler::NDRange &SrcNDRange, - const jit_compiler::NDRange &FusedNDRange, bool IsLast, int BarriersFlags, - jit_compiler::Remapper &Remapper, bool ShouldRemap) { +static Expected +createFusionCall(IRBuilderBase &Builder, Function *F, + ArrayRef CallArgs, + const jit_compiler::NDRange &SrcNDRange, + const jit_compiler::NDRange &FusedNDRange, bool IsLast, + int BarriersFlags, jit_compiler::Remapper &Remapper, + bool ShouldRemap, TargetFusionInfo &TargetInfo) { const auto IPs = addGuard(Builder, SrcNDRange, FusedNDRange, IsLast); if (ShouldRemap) { @@ -260,7 +262,7 @@ static Expected createFusionCall( // Insert barrier if needed if (!IsLast && BarriersFlags > 0) { - jit_compiler::barrierCall(Builder, BarriersFlags); + TargetInfo.createBarrierCall(Builder, BarriersFlags); } // Set insert point for future insertions @@ -271,6 +273,7 @@ static Expected createFusionCall( Error SYCLKernelFusion::fuseKernel( Module &M, Function &StubFunction, jit_compiler::SYCLModuleInfo *ModInfo, + TargetFusionInfo &TargetInfo, SmallPtrSetImpl &ToCleanUp) const { // Retrieve the metadata from the stub function. // The first operand of the tuple is the name that the newly created, @@ -343,12 +346,9 @@ Error SYCLKernelFusion::fuseKernel( SmallVector FusedArgNames; SmallVector FusedParamAttributes; // We must keep track of some metadata attached to each parameter. - // Collect it in lists, so it can be attached to the fused function later on. - MDList KernelArgAddressSpaces; - MDList KernelArgAccessQualifiers; - MDList KernelArgTypes; - MDList KernelArgBaseTypes; - MDList KernelArgTypeQualifiers; + // Collect it, so it can be attached to the fused function later on. + MetadataCollection MDCollection{TargetInfo.getKernelMetadataKeys()}; + // Add the information about the new kernel to the SYCLModuleInfo. // Initialize the jit_compiler::SYCLKernelInfo with the name. The remaining // information for functor & argument layout and attributes will be filled in @@ -425,14 +425,7 @@ Error SYCLKernelFusion::fuseKernel( // Add the metadata corresponding to the used arguments to the different // lists. NOTE: We do not collect the "kernel_arg_name" metadata, because // the kernel arguments receive new names in the fused kernel. - addToFusedMetadata(FF, KernelArgAddrSpace, UsedArgsMask, - KernelArgAddressSpaces); - addToFusedMetadata(FF, KernelArgAccessQual, UsedArgsMask, - KernelArgAccessQualifiers); - addToFusedMetadata(FF, KernelArgType, UsedArgsMask, KernelArgTypes); - addToFusedMetadata(FF, KernelArgBaseType, UsedArgsMask, KernelArgBaseTypes); - addToFusedMetadata(FF, KernelArgTypeQual, UsedArgsMask, - KernelArgTypeQualifiers); + MDCollection.collectFromFunction(FF, UsedArgsMask); // Update the fused kernel's KernelInfo with information from this input // kernel. @@ -502,26 +495,15 @@ Error SYCLKernelFusion::fuseKernel( AI.value().setName(ArgName); KernelArgNames.push_back(MDString::get(LLVMCtx, ArgName)); } - // Attach the fused kernel_arg_* metadata collected from the different input + // Attach the fused metadata collected from the different input // kernels to the fused function. - attachFusedMetadata(FusedFunction, "kernel_arg_addr_space", - KernelArgAddressSpaces); - attachFusedMetadata(FusedFunction, "kernel_arg_access_qual", - KernelArgAccessQualifiers); - attachFusedMetadata(FusedFunction, "kernel_arg_type", KernelArgTypes); - attachFusedMetadata(FusedFunction, "kernel_arg_base_type", - KernelArgBaseTypes); - attachFusedMetadata(FusedFunction, "kernel_arg_type_qual", - KernelArgTypeQualifiers); - attachFusedMetadata(FusedFunction, "kernel_arg_name", KernelArgNames); + MDCollection.attachToFunction(FusedFunction); // Add metadata for reqd_work_group_size and work_group_size_hint attachKernelAttributeMD(LLVMCtx, FusedFunction, FusedKernelInfo); - // The fused kernel should be a SPIR-V kernel again. - // NOTE: If this pass is used in a scenario where input and output - // of the compilation are not SPIR-V, care must be taken of other - // potential calling conventions here (e.g., nvptx). - FusedFunction->setCallingConv(CallingConv::SPIR_KERNEL); + // Mark the fused function as a kernel by calling TargetFusionInfo, because + // this is target-specific. + TargetInfo.addKernelFunction(FusedFunction); // Fusion is implemented as a two step process: In the first step, we // simply create calls to the functions that should be fused into this @@ -557,9 +539,9 @@ Error SYCLKernelFusion::fuseKernel( unsigned ParamIdx = ParamMapping[{FuncIndex, I}]; CallArgs.push_back(FusedFunction->getArg(ParamIdx)); } - auto CallOrErr = createFusionCall(Builder, IF, CallArgs, KF.ND, NDRange, - FuncIndex == BarriersEnd, BarriersFlags, - Remapper, IsHeterogeneousNDRangesList); + auto CallOrErr = createFusionCall( + Builder, IF, CallArgs, KF.ND, NDRange, FuncIndex == BarriersEnd, + BarriersFlags, Remapper, IsHeterogeneousNDRangesList, TargetInfo); // Add to the set of original kernel functions that can be deleted after // fusion is complete. ToCleanUp.insert(IF); @@ -602,46 +584,8 @@ Error SYCLKernelFusion::fuseKernel( } } - // Remove all existing calls of the ITT instrumentation functions. Insert new - // ones in the entry block of the fused kernel and every exit block if the - // functions are present in the module. - // We cannot use the existing SPIRITTAnnotations pass, because that pass might - // insert calls to functions not present in the module (e.g., ITT - // instrumentations for barriers). As the JITed module is not linked with - // libdevice anymore, the functions would remain unresolved and cause the - // driver to fail. - Function *StartWrapperFunc = M.getFunction(ITTStartWrapper); - Function *FinishWrapperFunc = M.getFunction(ITTFinishWrapper); - bool InsertWrappers = - ((StartWrapperFunc && !StartWrapperFunc->isDeclaration()) && - (FinishWrapperFunc && !FinishWrapperFunc->isDeclaration())); - auto *WrapperFuncTy = - FunctionType::get(Type::getVoidTy(M.getContext()), /*isVarArg*/ false); - for (auto &BB : *FusedFunction) { - for (auto Inst = BB.begin(); Inst != BB.end();) { - if (auto *CB = dyn_cast(Inst)) { - if (CB->getCalledFunction()->getName().starts_with("__itt_offload")) { - Inst = Inst->eraseFromParent(); - continue; - } - } - ++Inst; - } - if (InsertWrappers) { - if (ReturnInst *RI = dyn_cast(BB.getTerminator())) { - auto *WrapperCall = - CallInst::Create(WrapperFuncTy, FinishWrapperFunc, "", RI); - WrapperCall->setCallingConv(CallingConv::SPIR_FUNC); - } - } - } - if (InsertWrappers) { - FusedFunction->getEntryBlock().getFirstInsertionPt(); - auto *WrapperCall = CallInst::Create( - WrapperFuncTy, StartWrapperFunc, "", - &*FusedFunction->getEntryBlock().getFirstInsertionPt()); - WrapperCall->setCallingConv(CallingConv::SPIR_FUNC); - } + // Perform target-specific post-processing of the new fused kernel. + TargetInfo.postProcessKernel(FusedFunction); return Error::success(); } @@ -717,23 +661,6 @@ static unsigned getUnsignedFromMD(Metadata *MD) { return ConstInt->getZExtValue(); } -void SYCLKernelFusion::addToFusedMetadata( - Function *InputFunction, const StringRef &Kind, - const ArrayRef IsArgPresentMask, - SmallVectorImpl &FusedMDList) const { - // Retrieve metadata from one of the input kernels and add it to the list - // of fused metadata. - assert(InputFunction->hasMetadata(Kind) && - "Required Metadata not present on input kernel"); - if (auto *MD = InputFunction->getMetadata(Kind)) { - for (auto MaskedOps : llvm::zip(IsArgPresentMask, MD->operands())) { - if (std::get<0>(MaskedOps)) { - FusedMDList.emplace_back(std::get<1>(MaskedOps).get()); - } - } - } -} - void SYCLKernelFusion::attachFusedMetadata( Function *FusedFunction, const StringRef &Kind, const ArrayRef FusedMetadata) const { diff --git a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.h b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.h index 5e26595d2d343..5f52f0a317d14 100644 --- a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.h +++ b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.h @@ -10,6 +10,7 @@ #define SYCL_FUSION_PASSES_SYCLKERNELFUSION_H #include "Kernel.h" +#include "target/TargetFusionInfo.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" @@ -52,8 +53,6 @@ class SYCLKernelFusion : public llvm::PassInfoMixin { // locate our own metadata again. static constexpr auto MetadataKind = "sycl.kernel.fused"; static constexpr auto ParameterMDKind = "sycl.kernel.param"; - static constexpr auto ITTStartWrapper = "__itt_offload_wi_start_wrapper"; - static constexpr auto ITTFinishWrapper = "__itt_offload_wi_finish_wrapper"; using MDList = llvm::SmallVector; @@ -112,6 +111,7 @@ class SYCLKernelFusion : public llvm::PassInfoMixin { llvm::Error fuseKernel(llvm::Module &M, llvm::Function &StubFunction, jit_compiler::SYCLModuleInfo *ModInfo, + llvm::TargetFusionInfo &TargetInfo, llvm::SmallPtrSetImpl &ToCleanUp) const; void canonicalizeParameters( @@ -119,11 +119,6 @@ class SYCLKernelFusion : public llvm::PassInfoMixin { Parameter getParamFromMD(llvm::Metadata *MD) const; - void addToFusedMetadata( - llvm::Function *InputFunction, const llvm::StringRef &Kind, - const llvm::ArrayRef IsArgPresentMask, - llvm::SmallVectorImpl &FusedMDList) const; - void attachFusedMetadata( llvm::Function *FusedFunction, const llvm::StringRef &Kind, const llvm::ArrayRef FusedMetadata) const; diff --git a/sycl-fusion/passes/target/TargetFusionInfo.cpp b/sycl-fusion/passes/target/TargetFusionInfo.cpp new file mode 100644 index 0000000000000..191ab676613bb --- /dev/null +++ b/sycl-fusion/passes/target/TargetFusionInfo.cpp @@ -0,0 +1,225 @@ +//==---------------------- TargetFusionInfo.cpp ----------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "TargetFusionInfo.h" +#include "llvm/ADT/Triple.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicsNVPTX.h" + +using namespace llvm; + +// +// TargetFusionInfo +// + +TargetFusionInfo TargetFusionInfo::getTargetFusionInfo(llvm::Module *Mod) { + llvm::Triple Tri(Mod->getTargetTriple()); + if (Tri.isNVPTX()) { + return TargetFusionInfo( + std::shared_ptr(new NVPTXTargetFusionInfo(Mod))); + } + if (Tri.isSPIRV()) { + return TargetFusionInfo( + std::shared_ptr(new SPIRVTargetFusionInfo(Mod))); + } + assert(false && "Unsupported target for fusion"); +} + +// +// SPIRVTargetFusionInfo +// + +void SPIRVTargetFusionInfo::addKernelFunction(Function *KernelFunc) { + KernelFunc->setCallingConv(CallingConv::SPIR_KERNEL); +} + +ArrayRef SPIRVTargetFusionInfo::getKernelMetadataKeys() { + static SmallVector Keys{ + {"kernel_arg_addr_space", "kernel_arg_access_qual", "kernel_arg_type", + "kernel_arg_base_type", "kernel_arg_type_qual"}}; + return Keys; +} + +void SPIRVTargetFusionInfo::postProcessKernel(Function *KernelFunc) { + static constexpr auto ITTStartWrapper = "__itt_offload_wi_start_wrapper"; + static constexpr auto ITTFinishWrapper = "__itt_offload_wi_finish_wrapper"; + // Remove all existing calls of the ITT instrumentation functions. Insert new + // ones in the entry block of the fused kernel and every exit block if the + // functions are present in the module. + // We cannot use the existing SPIRITTAnnotations pass, because that pass might + // insert calls to functions not present in the module (e.g., ITT + // instrumentations for barriers). As the JITed module is not linked with + // libdevice anymore, the functions would remain unresolved and cause the + // driver to fail. + Function *StartWrapperFunc = LLVMMod->getFunction(ITTStartWrapper); + Function *FinishWrapperFunc = LLVMMod->getFunction(ITTFinishWrapper); + bool InsertWrappers = + ((StartWrapperFunc && !StartWrapperFunc->isDeclaration()) && + (FinishWrapperFunc && !FinishWrapperFunc->isDeclaration())); + auto *WrapperFuncTy = FunctionType::get( + Type::getVoidTy(LLVMMod->getContext()), /*isVarArg*/ false); + for (auto &BB : *KernelFunc) { + for (auto Inst = BB.begin(); Inst != BB.end();) { + if (auto *CB = dyn_cast(Inst)) { + if (CB->getCalledFunction()->getName().starts_with("__itt_offload")) { + Inst = Inst->eraseFromParent(); + continue; + } + } + ++Inst; + } + if (InsertWrappers) { + if (ReturnInst *RI = dyn_cast(BB.getTerminator())) { + auto *WrapperCall = + CallInst::Create(WrapperFuncTy, FinishWrapperFunc, "", RI); + WrapperCall->setCallingConv(CallingConv::SPIR_FUNC); + } + } + } + if (InsertWrappers) { + KernelFunc->getEntryBlock().getFirstInsertionPt(); + auto *WrapperCall = + CallInst::Create(WrapperFuncTy, StartWrapperFunc, "", + &*KernelFunc->getEntryBlock().getFirstInsertionPt()); + WrapperCall->setCallingConv(CallingConv::SPIR_FUNC); + } +} + +void SPIRVTargetFusionInfo::createBarrierCall(IRBuilderBase &Builder, + int BarrierFlags) { + if (BarrierFlags == -1) { + return; + } + assert((BarrierFlags == 1 || BarrierFlags == 2 || BarrierFlags == 3) && + "Invalid barrier flags"); + + static const auto FnAttrs = AttributeSet::get( + LLVMMod->getContext(), + {Attribute::get(LLVMMod->getContext(), Attribute::AttrKind::Convergent), + Attribute::get(LLVMMod->getContext(), Attribute::AttrKind::NoUnwind)}); + + static constexpr StringLiteral N{"_Z22__spirv_ControlBarrierjjj"}; + + Function *F = LLVMMod->getFunction(N); + if (!F) { + constexpr auto Linkage = GlobalValue::LinkageTypes::ExternalLinkage; + + auto *Ty = FunctionType::get( + Builder.getVoidTy(), + {Builder.getInt32Ty(), Builder.getInt32Ty(), Builder.getInt32Ty()}, + false /* isVarArg*/); + + F = Function::Create(Ty, Linkage, N, *LLVMMod); + + F->setAttributes( + AttributeList::get(LLVMMod->getContext(), FnAttrs, {}, {})); + F->setCallingConv(CallingConv::SPIR_FUNC); + } + + // See + // https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#Memory_Semantics_-id- + SmallVector Args{ + Builder.getInt32(/*Exec Scope : Workgroup = */ 2), + Builder.getInt32(/*Exec Scope : Workgroup = */ 2), + Builder.getInt32(0x10 | (BarrierFlags % 2 == 1 ? 0x100 : 0x0) | + ((BarrierFlags >> 1 == 1 ? 0x200 : 0x0)))}; + + auto *BarrierCallInst = Builder.CreateCall(F, Args); + BarrierCallInst->setAttributes( + AttributeList::get(LLVMMod->getContext(), FnAttrs, {}, {})); + BarrierCallInst->setCallingConv(CallingConv::SPIR_FUNC); +} + +// +// NVPTXTargetFusionInfo +// + +void NVPTXTargetFusionInfo::notifyFunctionsDelete( + llvm::ArrayRef Funcs) { + SmallPtrSet DeletedFuncs{Funcs.begin(), Funcs.end()}; + SmallVector ValidKernels; + auto *OldAnnotations = LLVMMod->getNamedMetadata("nvvm.annotations"); + for (auto *Op : OldAnnotations->operands()) { + if (auto *TOp = dyn_cast(Op)) { + if (auto *COp = dyn_cast_if_present( + TOp->getOperand(0).get())) { + if (!DeletedFuncs.contains(COp->getValue())) { + ValidKernels.push_back(Op); + // Add to the set to also remove duplicate entries. + DeletedFuncs.insert(COp->getValue()); + } + } + } + } + LLVMMod->eraseNamedMetadata(OldAnnotations); + auto *NewAnnotations = LLVMMod->getOrInsertNamedMetadata("nvvm.annotations"); + for (auto *Kernel : ValidKernels) { + NewAnnotations->addOperand(Kernel); + } +} + +void NVPTXTargetFusionInfo::addKernelFunction(Function *KernelFunc) { + auto *NVVMAnnotations = LLVMMod->getOrInsertNamedMetadata("nvvm.annotations"); + auto *MDOne = ConstantAsMetadata::get( + ConstantInt::get(Type::getInt32Ty(LLVMMod->getContext()), 1)); + auto *MDKernelString = MDString::get(LLVMMod->getContext(), "kernel"); + auto *MDFunc = ConstantAsMetadata::get(KernelFunc); + SmallVector KernelMD({MDFunc, MDKernelString, MDOne}); + auto *Tuple = MDTuple::get(LLVMMod->getContext(), KernelMD); + NVVMAnnotations->addOperand(Tuple); +} + +ArrayRef NVPTXTargetFusionInfo::getKernelMetadataKeys() { + // FIXME: Check whether we need to take care of sycl_fixed_targets. + static SmallVector Keys{{"kernel_arg_buffer_location", + "kernel_arg_runtime_aligned", + "kernel_arg_exclusive_ptr"}}; + return Keys; +} + +void NVPTXTargetFusionInfo::createBarrierCall(IRBuilderBase &Builder, + int BarrierFlags) { + if (BarrierFlags == -1) { + return; + } + // Emit a call to llvm.nvvm.barrier0. From the user manual of the NVPTX + // backend: "The ‘@llvm.nvvm.barrier0()’ intrinsic emits a PTX bar.sync 0 + // instruction, equivalent to the __syncthreads() call in CUDA." + Builder.CreateIntrinsic(Intrinsic::NVVMIntrinsics::nvvm_barrier0, {}, {}); +} + +// +// MetadataCollection +// + +MetadataCollection::MetadataCollection(ArrayRef MDKeys) + : Keys{MDKeys}, Collection(MDKeys.size()) {} + +void MetadataCollection::collectFromFunction( + llvm::Function *Func, const ArrayRef IsArgPresentMask) { + for (auto &Key : Keys) { + // TODO: Do we want to assert for the presence of the metadata here? + if (auto *MD = Func->getMetadata(Key)) { + for (auto MaskedOps : llvm::zip(IsArgPresentMask, MD->operands())) { + if (std::get<0>(MaskedOps)) { + Collection[Key].emplace_back(std::get<1>(MaskedOps).get()); + } + } + } + } +} + +void MetadataCollection::attachToFunction(llvm::Function *Func) { + for (auto &Key : Keys) { + // Attach a list of fused metadata for a kind to the fused function. + auto *MDEntries = MDNode::get(Func->getContext(), Collection[Key]); + Func->setMetadata(Key, MDEntries); + } +} diff --git a/sycl-fusion/passes/target/TargetFusionInfo.h b/sycl-fusion/passes/target/TargetFusionInfo.h new file mode 100644 index 0000000000000..653af4904e2ae --- /dev/null +++ b/sycl-fusion/passes/target/TargetFusionInfo.h @@ -0,0 +1,137 @@ +//==-- TargetFusionInfo.h - Encapsule target-specific fusion functionality -==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" + +namespace llvm { + +class TargetFusionInfoImpl { + +public: + virtual ~TargetFusionInfoImpl() = default; + + virtual void notifyFunctionsDelete(llvm::ArrayRef Funcs) { + (void)Funcs; + } + + virtual void addKernelFunction(Function *KernelFunc) { (void)KernelFunc; } + + virtual void postProcessKernel(Function *KernelFunc) { (void)KernelFunc; } + + virtual ArrayRef getKernelMetadataKeys() { return {}; } + + virtual void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) = 0; + +protected: + explicit TargetFusionInfoImpl(llvm::Module *Mod) : LLVMMod{Mod} {}; + + llvm::Module *LLVMMod; + + friend class TargetFusionInfo; +}; + +class SPIRVTargetFusionInfo : public TargetFusionInfoImpl { +public: + void addKernelFunction(Function *KernelFunc) override; + + ArrayRef getKernelMetadataKeys() override; + + void postProcessKernel(Function *KernelFunc) override; + + void createBarrierCall(IRBuilderBase& Builder, int BarrierFlags) override; + +private: + using TargetFusionInfoImpl::TargetFusionInfoImpl; +}; + +class NVPTXTargetFusionInfo : public TargetFusionInfoImpl { +public: + void notifyFunctionsDelete(llvm::ArrayRef Funcs) override; + + void addKernelFunction(Function *KernelFunc) override; + + ArrayRef getKernelMetadataKeys() override; + + void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) override; + +private: + using TargetFusionInfoImpl::TargetFusionInfoImpl; +}; + +/// +/// Common interface to target-specific logic around handling of kernel +/// functions. +class TargetFusionInfo { +public: + /// + /// Create the correct target-specific implementation based on the target + /// triple of \p Module. + static TargetFusionInfo getTargetFusionInfo(llvm::Module *Module); + + /// + /// Notify the target-specific implementation that set of functions \p Funcs + /// is about to be erased from the module. This should be called BEFORE + /// erasing the functions. + void notifyFunctionsDelete(llvm::ArrayRef Funcs) { + Impl->notifyFunctionsDelete(Funcs); + } + + /// + /// Notify the target-specific implementation that the function \p KernelFunc + /// was added as a new kernel. This should be called AFTER the function has + /// been added. + void addKernelFunction(llvm::Function *KernelFunc) { + Impl->addKernelFunction(KernelFunc); + } + + /// + /// Target-specific post-processing of the new kernel function \p KernelFunc. + /// This should be called AFTER the function has been added and defined. + void postProcessKernel(Function *KernelFunc) { + Impl->postProcessKernel(KernelFunc); + } + + /// + /// Get the target-specific list of argument metadata attached to each + /// function that should be collected and attached to the fused kernel. + llvm::ArrayRef getKernelMetadataKeys() { + return Impl->getKernelMetadataKeys(); + } + + void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) { + Impl->createBarrierCall(Builder, BarrierFlags); + } + +private: + using ImplPtr = std::shared_ptr; + + TargetFusionInfo(ImplPtr &&I) : Impl{I} {} + + ImplPtr Impl; +}; + +/// +/// Simple helper to collect a target-specific set of kernel argument metadata +/// from input functions and attach it to a fused kernel. +class MetadataCollection { +public: + explicit MetadataCollection(llvm::ArrayRef MDKeys); + + void collectFromFunction(llvm::Function *Func, + const ArrayRef IsArgPresentMask); + + void attachToFunction(llvm::Function *Func); + +private: + llvm::SmallVector Keys; + + llvm::StringMap> Collection; +}; +} // namespace llvm From ce6855a23a4da643c58cd1b0947d5f6044ea1f92 Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Tue, 17 Jan 2023 08:19:21 +0000 Subject: [PATCH 08/16] [SYCL][Fusion] Do not require null terminator Signed-off-by: Lukas Sommer --- .../jit-compiler/lib/translation/KernelTranslation.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp index 31fa850d58d63..7e1318c70a4a9 100644 --- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp +++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp @@ -111,7 +111,10 @@ KernelTranslator::loadLLVMKernel(llvm::LLVMContext &LLVMCtx, llvm::StringRef RawData(reinterpret_cast(BinInfo.BinaryStart), BinInfo.BinarySize); return llvm::parseBitcodeFile( - MemoryBuffer::getMemBuffer(RawData)->getMemBufferRef(), LLVMCtx); + MemoryBuffer::getMemBuffer(RawData, Kernel.Name, + /* RequiresNullTermnator*/ false) + ->getMemBufferRef(), + LLVMCtx); } llvm::Expected> From e89cb781a8ae68fa4542cec0a81c840297e2ba58 Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Tue, 17 Jan 2023 14:42:40 +0000 Subject: [PATCH 09/16] [SYCL][Fusion] Refactor more target-specific code Signed-off-by: Lukas Sommer --- sycl-fusion/jit-compiler/lib/KernelFusion.cpp | 1 - .../lib/fusion/FusionPipeline.cpp | 19 ++++- .../lib/translation/KernelTranslation.cpp | 3 - sycl-fusion/passes/CMakeLists.txt | 1 + sycl-fusion/passes/cleanup/Cleanup.cpp | 17 ++-- sycl-fusion/passes/cleanup/Cleanup.h | 4 +- .../internalization/Internalization.cpp | 61 ++++--------- .../passes/kernel-fusion/SYCLKernelFusion.cpp | 13 +-- sycl-fusion/passes/syclcp/SYCLCP.cpp | 12 +-- .../passes/target/TargetFusionInfo.cpp | 60 +++++++++++-- sycl-fusion/passes/target/TargetFusionInfo.h | 85 +++++++++++++++---- 11 files changed, 177 insertions(+), 99 deletions(-) diff --git a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp index 6c14ac182514a..821ad4cd36369 100644 --- a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp +++ b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp @@ -137,7 +137,6 @@ FusionResult KernelFusion::fuseKernels( SYCLKernelInfo &FusedKernelInfo = *NewModInfo->getKernelFor(FusedKernelName); - // TODO BinaryFormat TargetFormat = ConfigHelper::get(); if (auto Error = translation::KernelTranslator::translateKernel( diff --git a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp index 3ede007aa69a4..fb2e2b15200a7 100644 --- a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp +++ b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp @@ -23,6 +23,7 @@ #ifndef NDEBUG #include "llvm/IR/Verifier.h" #endif // NDEBUG +#include "llvm/ADT/Triple.h" #include "llvm/Passes/PassBuilder.h" #include "llvm/Transforms/InstCombine/InstCombine.h" #include "llvm/Transforms/Scalar/ADCE.h" @@ -35,6 +36,21 @@ using namespace llvm; using namespace jit_compiler; using namespace jit_compiler::fusion; +static unsigned getFlatAddressSpace(Module &Mod) { + // Ideally, we could get this information from the TargetTransformInfo, but + // the SPIR-V backend does not yet seem to have an implementation for that. + llvm::Triple Tri(Mod.getTargetTriple()); + if (Tri.isNVPTX()) { + return 0; + } + if (Tri.isSPIRV() || Tri.isSPIR()) { + return 4; + } + // Identical to the definition of "UninitializedAddressSpace" in + // "InferAddressSpaces.cpp". + return std::numeric_limits::max(); +} + std::unique_ptr FusionPipeline::runFusionPasses(Module &Mod, SYCLModuleInfo &InputInfo, int BarriersFlags) { @@ -86,9 +102,8 @@ FusionPipeline::runFusionPasses(Module &Mod, SYCLModuleInfo &InputInfo, // Run the InferAddressSpace pass to remove as many address-space casts // to/from generic address-space as possible, because these hinder // internalization. - // FIXME: TTI should tell the pass which address space to use. // Ideally, the static compiler should have performed that job. - constexpr unsigned FlatAddressSpace = 4; + unsigned FlatAddressSpace = getFlatAddressSpace(Mod); FPM.addPass(InferAddressSpacesPass(FlatAddressSpace)); MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); } diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp index 7e1318c70a4a9..3db02dbfd717c 100644 --- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp +++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp @@ -217,8 +217,5 @@ KernelTranslator::translateToPTX(llvm::Module &Mod, JITContext &JITCtx) { ASMStream.flush(); } - llvm::dbgs() << "PTX size: " << PTXASM.size() << "\n"; - llvm::dbgs() << "PTX:\n" << PTXASM << "\n"; - return &JITCtx.emplaceSPIRVBinary(PTXASM, BinaryFormat::PTX); } diff --git a/sycl-fusion/passes/CMakeLists.txt b/sycl-fusion/passes/CMakeLists.txt index 41398f6ee9e99..c74c3c40e34e0 100644 --- a/sycl-fusion/passes/CMakeLists.txt +++ b/sycl-fusion/passes/CMakeLists.txt @@ -8,6 +8,7 @@ add_llvm_library(SYCLKernelFusion MODULE syclcp/SYCLCP.cpp cleanup/Cleanup.cpp debug/PassDebug.cpp + target/TargetFusionInfo.cpp DEPENDS intrinsics_gen diff --git a/sycl-fusion/passes/cleanup/Cleanup.cpp b/sycl-fusion/passes/cleanup/Cleanup.cpp index 07fc86d9d0dd3..a5bc3e634c527 100644 --- a/sycl-fusion/passes/cleanup/Cleanup.cpp +++ b/sycl-fusion/passes/cleanup/Cleanup.cpp @@ -45,7 +45,8 @@ static void copyAttributesFrom(const BitVector &Mask, Function *NF, PAL.getRetAttrs(), Attributes)); } -static Function *createMaskedFunction(const BitVector &Mask, Function *F) { +static Function *createMaskedFunction(const BitVector &Mask, Function *F, + TargetFusionInfo &TFI) { // Declare FunctionType *NFTy = createMaskedFunctionType(Mask, F->getFunctionType()); Function *NF = Function::Create(NFTy, F->getLinkage(), F->getAddressSpace(), @@ -78,7 +79,9 @@ static Function *createMaskedFunction(const BitVector &Mask, Function *F) { } // Erase old function + TFI.notifyFunctionsDelete(F); F->eraseFromParent(); + TFI.addKernelFunction(NF); return NF; } @@ -104,9 +107,9 @@ static void updateArgUsageMask(jit_compiler::SYCLKernelInfo *Info, static void applyArgMask(const jit_compiler::ArgUsageMask &NewArgInfo, const BitVector &Mask, Function *F, - ModuleAnalysisManager &AM) { + ModuleAnalysisManager &AM, TargetFusionInfo &TFI) { // Create the function without the masked-out args. - Function *NF = createMaskedFunction(Mask, F); + Function *NF = createMaskedFunction(Mask, F, TFI); // Update the unused args mask. jit_compiler::SYCLModuleInfo *ModuleInfo = AM.getResult(*NF->getParent()).ModuleInfo; @@ -125,9 +128,7 @@ static void maskMD(const BitVector &Mask, Function *F) { SmallVector> MD; F->getAllMetadata(MD); for (const auto &Entry : MD) { - auto MDKind = Entry.first; - if (MDKind == F->getContext().getMDKindID("reqd_work_group_size") || - MDKind == F->getContext().getMDKindID("work_group_size_hint")) { + if (Entry.second->getNumOperands() != Mask.size()) { // Some metadata, e.g., the metadata for reqd_work_group_size and // work_group_size_hint is independent from the number of arguments // and must not be filtered by the argument usage mask. @@ -144,7 +145,7 @@ static void maskMD(const BitVector &Mask, Function *F) { void llvm::fullCleanup(const jit_compiler::ArgUsageMask &ArgUsageInfo, Function *F, ModuleAnalysisManager &AM, - ArrayRef MDToErase) { + TargetFusionInfo &TFI, ArrayRef MDToErase) { // Erase metadata. for (auto Key : MDToErase) { F->setMetadata(Key, nullptr); @@ -158,5 +159,5 @@ void llvm::fullCleanup(const jit_compiler::ArgUsageMask &ArgUsageInfo, // Update metadata. maskMD(CleanupMask, F); // Remove arguments. - applyArgMask(ArgUsageInfo, CleanupMask, F, AM); + applyArgMask(ArgUsageInfo, CleanupMask, F, AM, TFI); } diff --git a/sycl-fusion/passes/cleanup/Cleanup.h b/sycl-fusion/passes/cleanup/Cleanup.h index 49619e4b9af07..491d96f46a886 100644 --- a/sycl-fusion/passes/cleanup/Cleanup.h +++ b/sycl-fusion/passes/cleanup/Cleanup.h @@ -10,6 +10,7 @@ #define SYCL_FUSION_PASSES_CLEANUP_H #include "Kernel.h" +#include "target/TargetFusionInfo.h" #include #include #include @@ -25,7 +26,8 @@ namespace llvm { /// @param[in] AM Module analysis manager. /// @param[in] EraseMD Keys of metadata to remove. void fullCleanup(const jit_compiler::ArgUsageMask &ArgUsageInfo, Function *F, - ModuleAnalysisManager &AM, ArrayRef EraseMD); + ModuleAnalysisManager &AM, TargetFusionInfo &TFI, + ArrayRef EraseMD); } // namespace llvm #endif // SYCL_FUSION_PASSES_CLEANUP_H diff --git a/sycl-fusion/passes/internalization/Internalization.cpp b/sycl-fusion/passes/internalization/Internalization.cpp index 61f3a0738921b..bca46ba43acb3 100644 --- a/sycl-fusion/passes/internalization/Internalization.cpp +++ b/sycl-fusion/passes/internalization/Internalization.cpp @@ -19,16 +19,12 @@ #include "cleanup/Cleanup.h" #include "debug/PassDebug.h" #include "metadata/MDParsing.h" +#include "target/TargetFusionInfo.h" #define DEBUG_TYPE "sycl-fusion" using namespace llvm; -// Corresponds to definition of spir_private and spir_local in -// "clang/lib/Basic/Target/SPIR.h", "SPIRDefIsGenMap". -constexpr static unsigned PrivateAS{0}; -constexpr static unsigned LocalAS{3}; - constexpr static StringLiteral PrivatePromotion{"private"}; constexpr static StringLiteral LocalPromotion{"local"}; constexpr static StringLiteral NoPromotion{"none"}; @@ -44,6 +40,8 @@ struct SYCLInternalizerImpl { StringRef Kind; /// Whether or not to create allocas. bool CreateAllocas; + /// Interface to target-specific information. + TargetFusionInfo TargetInfo; /// Implements internalization the pass run. PreservedAnalyses operator()(Module &M, ModuleAnalysisManager &AM) const; @@ -338,11 +336,14 @@ Error SYCLInternalizerImpl::checkArgsPromotable( /// /// Function to perform the required cleaning actions. -static void cleanup(Function *OldF, Function *NewF, bool KeepOriginal) { +static void cleanup(Function *OldF, Function *NewF, bool KeepOriginal, + const TargetFusionInfo &TFI) { if (!KeepOriginal) { NewF->takeName(OldF); + TFI.notifyFunctionsDelete(OldF); OldF->eraseFromParent(); } + TFI.addKernelFunction(NewF); } void SYCLInternalizerImpl::promoteCall(CallBase *C, const Value *Val, @@ -499,11 +500,6 @@ Value *replaceByNewAlloca(Argument *Arg, unsigned AS, std::size_t LocalSize) { Function *SYCLInternalizerImpl::promoteFunctionArgs( Function *OldF, ArrayRef PromoteToLocal, bool CreateAllocas, bool KeepOriginal) const { - constexpr unsigned AddressSpaceBitWidth{32}; - - auto *NewAddrspace = ConstantAsMetadata::get(ConstantInt::get( - IntegerType::get(OldF->getContext(), AddressSpaceBitWidth), AS)); - // We first declare the promoted function with the new signature. Function *NewF = getPromotedFunctionDeclaration(OldF, PromoteToLocal, AS, @@ -542,32 +538,9 @@ Function *SYCLInternalizerImpl::promoteFunctionArgs( promoteValue(Arg, LocalSize); } - { - constexpr StringLiteral KernelArgAddrSpaceMD{"kernel_arg_addr_space"}; - if (auto *AddrspaceMD = - dyn_cast_or_null(NewF->getMetadata(KernelArgAddrSpaceMD))) { - // If we have kernel_arg_addr_space metadata in the original function, - // we should update it in the new one. - SmallVector NewInfo{AddrspaceMD->op_begin(), - AddrspaceMD->op_end()}; - for (auto I : enumerate(PromoteToLocal)) { - if (I.value() == 0) { - continue; - } - const auto Index = I.index(); - if (const auto *PtrTy = - dyn_cast(NewF->getArg(Index)->getType())) { - if (PtrTy->getAddressSpace() == LocalAS) { - NewInfo[Index] = NewAddrspace; - } - } - } - NewF->setMetadata(KernelArgAddrSpaceMD, - MDNode::get(NewF->getContext(), NewInfo)); - } - } + TargetInfo.updateAddressSpaceMetadata(NewF, PromoteToLocal, AS); - cleanup(OldF, NewF, KeepOriginal); + cleanup(OldF, NewF, KeepOriginal, TargetInfo); return NewF; } @@ -625,7 +598,8 @@ SYCLInternalizerImpl::operator()(Module &M, ModuleAnalysisManager &AM) const { return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); } -static void moduleCleanup(Module &M, ModuleAnalysisManager &AM) { +static void moduleCleanup(Module &M, ModuleAnalysisManager &AM, + TargetFusionInfo &TFI) { SmallVector ToProcess; for (auto &F : M) { if (F.hasMetadata(SYCLInternalizer::Key)) { @@ -650,24 +624,25 @@ static void moduleCleanup(Module &M, ModuleAnalysisManager &AM) { NewArgInfo.push_back(jit_compiler::ArgUsage::Used); } } - fullCleanup(NewArgInfo, F, AM, + fullCleanup(NewArgInfo, F, AM, TFI, {SYCLInternalizer::Key, SYCLInternalizer::LocalSizeKey}); } } PreservedAnalyses llvm::SYCLInternalizer::run(Module &M, ModuleAnalysisManager &AM) { + auto TFI = TargetFusionInfo::getTargetFusionInfo(&M); // Private promotion - const PreservedAnalyses Tmp = - SYCLInternalizerImpl{PrivateAS, PrivatePromotion, true}(M, AM); + const PreservedAnalyses Tmp = SYCLInternalizerImpl{ + TFI.getPrivateAddressSpace(), PrivatePromotion, true, TFI}(M, AM); // Local promotion - PreservedAnalyses Res = - SYCLInternalizerImpl{LocalAS, LocalPromotion, false}(M, AM); + PreservedAnalyses Res = SYCLInternalizerImpl{ + TFI.getLocalAddressSpace(), LocalPromotion, false, TFI}(M, AM); Res.intersect(Tmp); if (!Res.areAllPreserved()) { - moduleCleanup(M, AM); + moduleCleanup(M, AM, TFI); } return Res; } diff --git a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp index d3dec7f16a81f..a0079affbaa06 100644 --- a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp +++ b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp @@ -423,8 +423,7 @@ Error SYCLKernelFusion::fuseKernel( ++ParamIndex; } // Add the metadata corresponding to the used arguments to the different - // lists. NOTE: We do not collect the "kernel_arg_name" metadata, because - // the kernel arguments receive new names in the fused kernel. + // lists. MDCollection.collectFromFunction(FF, UsedArgsMask); // Update the fused kernel's KernelInfo with information from this input @@ -484,16 +483,12 @@ Error SYCLKernelFusion::fuseKernel( } // Attach names to the arguments. The name includes a prefix for the kernel - // from which this argument came. The names are also attached as metadata - // with kind "kernel_arg_name". - // NOTE: While the kernel_arg_name metadata is required, naming the - // parameters themselves is not necessary for functionality, it just improves - // readibility for debugging purposes. - SmallVector KernelArgNames; + // from which this argument came. Naming the parameters themselves is not + // necessary for functionality, it just improves readibility for debugging + // purposes. for (const auto &AI : llvm::enumerate(FusedFunction->args())) { auto &ArgName = FusedArgNames[AI.index()]; AI.value().setName(ArgName); - KernelArgNames.push_back(MDString::get(LLVMCtx, ArgName)); } // Attach the fused metadata collected from the different input // kernels to the fused function. diff --git a/sycl-fusion/passes/syclcp/SYCLCP.cpp b/sycl-fusion/passes/syclcp/SYCLCP.cpp index b520620c232d1..5cb9b00433ef7 100644 --- a/sycl-fusion/passes/syclcp/SYCLCP.cpp +++ b/sycl-fusion/passes/syclcp/SYCLCP.cpp @@ -41,7 +41,7 @@ static Expected> getCPFromMD(Function *F) { MDNode *MD = F->getMetadata(SYCLCP::Key); if (!MD) { return createStringError(inconvertibleErrorCode(), - "Private promotion metadata not available"); + "Constant progagation metadata not available"); } for (auto I : enumerate(MD->operands())) { Expected> Val = @@ -205,7 +205,8 @@ static bool propagateConstants(Function *F, ArrayRef Constants) { return Changed; } -static void moduleCleanup(Module &M, ModuleAnalysisManager &AM) { +static void moduleCleanup(Module &M, ModuleAnalysisManager &AM, + TargetFusionInfo &TFI) { SmallVector ToProcess; for (auto &F : M) { if (F.hasMetadata(SYCLCP::Key)) { @@ -219,14 +220,13 @@ static void moduleCleanup(Module &M, ModuleAnalysisManager &AM) { if (const auto *MDS = dyn_cast(I.value().get())) { // A value is masked-out if it has a non-empty MDString if (MDS->getLength() > 0) { - // And is either an integer or a FP number. NewArgInfo.push_back(jit_compiler::ArgUsage::Unused); continue; } } NewArgInfo.push_back(jit_compiler::ArgUsage::Used); } - fullCleanup(NewArgInfo, F, AM, {SYCLCP::Key}); + fullCleanup(NewArgInfo, F, AM, TFI, {SYCLCP::Key}); } } @@ -249,8 +249,10 @@ PreservedAnalyses SYCLCP::run(Module &M, ModuleAnalysisManager &AM) { Changed = propagateConstants(F, *ConstantsOrErr) || Changed; } + auto TFI = TargetFusionInfo::getTargetFusionInfo(&M); + if (Changed) { - moduleCleanup(M, AM); + moduleCleanup(M, AM, TFI); } return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); diff --git a/sycl-fusion/passes/target/TargetFusionInfo.cpp b/sycl-fusion/passes/target/TargetFusionInfo.cpp index 191ab676613bb..c17e2464ac323 100644 --- a/sycl-fusion/passes/target/TargetFusionInfo.cpp +++ b/sycl-fusion/passes/target/TargetFusionInfo.cpp @@ -25,7 +25,7 @@ TargetFusionInfo TargetFusionInfo::getTargetFusionInfo(llvm::Module *Mod) { return TargetFusionInfo( std::shared_ptr(new NVPTXTargetFusionInfo(Mod))); } - if (Tri.isSPIRV()) { + if (Tri.isSPIRV() || Tri.isSPIR()) { return TargetFusionInfo( std::shared_ptr(new SPIRVTargetFusionInfo(Mod))); } @@ -36,18 +36,28 @@ TargetFusionInfo TargetFusionInfo::getTargetFusionInfo(llvm::Module *Mod) { // SPIRVTargetFusionInfo // -void SPIRVTargetFusionInfo::addKernelFunction(Function *KernelFunc) { +void SPIRVTargetFusionInfo::addKernelFunction(Function *KernelFunc) const { KernelFunc->setCallingConv(CallingConv::SPIR_KERNEL); } -ArrayRef SPIRVTargetFusionInfo::getKernelMetadataKeys() { +ArrayRef SPIRVTargetFusionInfo::getKernelMetadataKeys() const { + // NOTE: We do not collect the "kernel_arg_name" metadata, because + // the kernel arguments receive new names in the fused kernel. static SmallVector Keys{ {"kernel_arg_addr_space", "kernel_arg_access_qual", "kernel_arg_type", "kernel_arg_base_type", "kernel_arg_type_qual"}}; return Keys; } -void SPIRVTargetFusionInfo::postProcessKernel(Function *KernelFunc) { +void SPIRVTargetFusionInfo::postProcessKernel(Function *KernelFunc) const { + // Attach the kernel_arg_name metadata. + SmallVector KernelArgNames; + for (auto &P : KernelFunc->args()) { + KernelArgNames.push_back(MDString::get(LLVMMod->getContext(), P.getName())); + } + auto *ArgNameMD = MDTuple::get(LLVMMod->getContext(), KernelArgNames); + KernelFunc->setMetadata("kernel_arg_name", ArgNameMD); + static constexpr auto ITTStartWrapper = "__itt_offload_wi_start_wrapper"; static constexpr auto ITTFinishWrapper = "__itt_offload_wi_finish_wrapper"; // Remove all existing calls of the ITT instrumentation functions. Insert new @@ -93,7 +103,7 @@ void SPIRVTargetFusionInfo::postProcessKernel(Function *KernelFunc) { } void SPIRVTargetFusionInfo::createBarrierCall(IRBuilderBase &Builder, - int BarrierFlags) { + int BarrierFlags) const { if (BarrierFlags == -1) { return; } @@ -137,12 +147,44 @@ void SPIRVTargetFusionInfo::createBarrierCall(IRBuilderBase &Builder, BarrierCallInst->setCallingConv(CallingConv::SPIR_FUNC); } +void SPIRVTargetFusionInfo::updateAddressSpaceMetadata( + Function *KernelFunc, ArrayRef LocalSize, + unsigned AddressSpace) const { + static constexpr unsigned AddressSpaceBitWidth{32}; + static constexpr StringLiteral KernelArgAddrSpaceMD{"kernel_arg_addr_space"}; + + auto *NewAddrspace = ConstantAsMetadata::get(ConstantInt::get( + IntegerType::get(LLVMMod->getContext(), AddressSpaceBitWidth), + AddressSpace)); + if (auto *AddrspaceMD = dyn_cast_or_null( + KernelFunc->getMetadata(KernelArgAddrSpaceMD))) { + // If we have kernel_arg_addr_space metadata in the original function, + // we should update it in the new one. + SmallVector NewInfo{AddrspaceMD->op_begin(), + AddrspaceMD->op_end()}; + for (auto I : enumerate(LocalSize)) { + if (I.value() == 0) { + continue; + } + const auto Index = I.index(); + if (const auto *PtrTy = + dyn_cast(KernelFunc->getArg(Index)->getType())) { + if (PtrTy->getAddressSpace() == getLocalAddressSpace()) { + NewInfo[Index] = NewAddrspace; + } + } + } + KernelFunc->setMetadata(KernelArgAddrSpaceMD, + MDNode::get(KernelFunc->getContext(), NewInfo)); + } +} + // // NVPTXTargetFusionInfo // void NVPTXTargetFusionInfo::notifyFunctionsDelete( - llvm::ArrayRef Funcs) { + llvm::ArrayRef Funcs) const { SmallPtrSet DeletedFuncs{Funcs.begin(), Funcs.end()}; SmallVector ValidKernels; auto *OldAnnotations = LLVMMod->getNamedMetadata("nvvm.annotations"); @@ -165,7 +207,7 @@ void NVPTXTargetFusionInfo::notifyFunctionsDelete( } } -void NVPTXTargetFusionInfo::addKernelFunction(Function *KernelFunc) { +void NVPTXTargetFusionInfo::addKernelFunction(Function *KernelFunc) const { auto *NVVMAnnotations = LLVMMod->getOrInsertNamedMetadata("nvvm.annotations"); auto *MDOne = ConstantAsMetadata::get( ConstantInt::get(Type::getInt32Ty(LLVMMod->getContext()), 1)); @@ -176,7 +218,7 @@ void NVPTXTargetFusionInfo::addKernelFunction(Function *KernelFunc) { NVVMAnnotations->addOperand(Tuple); } -ArrayRef NVPTXTargetFusionInfo::getKernelMetadataKeys() { +ArrayRef NVPTXTargetFusionInfo::getKernelMetadataKeys() const { // FIXME: Check whether we need to take care of sycl_fixed_targets. static SmallVector Keys{{"kernel_arg_buffer_location", "kernel_arg_runtime_aligned", @@ -185,7 +227,7 @@ ArrayRef NVPTXTargetFusionInfo::getKernelMetadataKeys() { } void NVPTXTargetFusionInfo::createBarrierCall(IRBuilderBase &Builder, - int BarrierFlags) { + int BarrierFlags) const { if (BarrierFlags == -1) { return; } diff --git a/sycl-fusion/passes/target/TargetFusionInfo.h b/sycl-fusion/passes/target/TargetFusionInfo.h index 653af4904e2ae..c307de2dd764e 100644 --- a/sycl-fusion/passes/target/TargetFusionInfo.h +++ b/sycl-fusion/passes/target/TargetFusionInfo.h @@ -6,6 +6,9 @@ // //===----------------------------------------------------------------------===// +#ifndef SYCL_FUSION_PASSES_TARGETFUSIONINFO_H +#define SYCL_FUSION_PASSES_TARGETFUSIONINFO_H + #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Module.h" @@ -17,17 +20,33 @@ class TargetFusionInfoImpl { public: virtual ~TargetFusionInfoImpl() = default; - virtual void notifyFunctionsDelete(llvm::ArrayRef Funcs) { + virtual void notifyFunctionsDelete(llvm::ArrayRef Funcs) const { (void)Funcs; } - virtual void addKernelFunction(Function *KernelFunc) { (void)KernelFunc; } + virtual void addKernelFunction(Function *KernelFunc) const { + (void)KernelFunc; + } + + virtual void postProcessKernel(Function *KernelFunc) const { + (void)KernelFunc; + } + + virtual ArrayRef getKernelMetadataKeys() const { return {}; } + + virtual void createBarrierCall(IRBuilderBase &Builder, + int BarrierFlags) const = 0; - virtual void postProcessKernel(Function *KernelFunc) { (void)KernelFunc; } + virtual unsigned getPrivateAddressSpace() const = 0; - virtual ArrayRef getKernelMetadataKeys() { return {}; } + virtual unsigned getLocalAddressSpace() const = 0; - virtual void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) = 0; + virtual void updateAddressSpaceMetadata(Function *KernelFunc, + ArrayRef LocalSize, + unsigned AddressSpace) const { + (void)KernelFunc; + (void)LocalSize; + } protected: explicit TargetFusionInfoImpl(llvm::Module *Mod) : LLVMMod{Mod} {}; @@ -39,13 +58,23 @@ class TargetFusionInfoImpl { class SPIRVTargetFusionInfo : public TargetFusionInfoImpl { public: - void addKernelFunction(Function *KernelFunc) override; + void addKernelFunction(Function *KernelFunc) const override; + + ArrayRef getKernelMetadataKeys() const override; - ArrayRef getKernelMetadataKeys() override; + void postProcessKernel(Function *KernelFunc) const override; - void postProcessKernel(Function *KernelFunc) override; + void createBarrierCall(IRBuilderBase &Builder, + int BarrierFlags) const override; - void createBarrierCall(IRBuilderBase& Builder, int BarrierFlags) override; + // Corresponds to definition of spir_private and spir_local in + // "clang/lib/Basic/Target/SPIR.h", "SPIRDefIsGenMap". + unsigned getPrivateAddressSpace() const override { return 0; } + unsigned getLocalAddressSpace() const override { return 3; } + + void updateAddressSpaceMetadata(Function *KernelFunc, + ArrayRef LocalSize, + unsigned AddressSpace) const override; private: using TargetFusionInfoImpl::TargetFusionInfoImpl; @@ -53,13 +82,19 @@ class SPIRVTargetFusionInfo : public TargetFusionInfoImpl { class NVPTXTargetFusionInfo : public TargetFusionInfoImpl { public: - void notifyFunctionsDelete(llvm::ArrayRef Funcs) override; + void notifyFunctionsDelete(llvm::ArrayRef Funcs) const override; + + void addKernelFunction(Function *KernelFunc) const override; - void addKernelFunction(Function *KernelFunc) override; + ArrayRef getKernelMetadataKeys() const override; - ArrayRef getKernelMetadataKeys() override; + void createBarrierCall(IRBuilderBase &Builder, + int BarrierFlags) const override; - void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) override; + // Corresponds to the definitions in the LLVM NVPTX backend user guide: + // https://llvm.org/docs/NVPTXUsage.html#address-spaces + unsigned getPrivateAddressSpace() const override { return 0; } + unsigned getLocalAddressSpace() const override { return 3; } private: using TargetFusionInfoImpl::TargetFusionInfoImpl; @@ -79,7 +114,7 @@ class TargetFusionInfo { /// Notify the target-specific implementation that set of functions \p Funcs /// is about to be erased from the module. This should be called BEFORE /// erasing the functions. - void notifyFunctionsDelete(llvm::ArrayRef Funcs) { + void notifyFunctionsDelete(llvm::ArrayRef Funcs) const { Impl->notifyFunctionsDelete(Funcs); } @@ -87,28 +122,40 @@ class TargetFusionInfo { /// Notify the target-specific implementation that the function \p KernelFunc /// was added as a new kernel. This should be called AFTER the function has /// been added. - void addKernelFunction(llvm::Function *KernelFunc) { + void addKernelFunction(llvm::Function *KernelFunc) const { Impl->addKernelFunction(KernelFunc); } /// /// Target-specific post-processing of the new kernel function \p KernelFunc. /// This should be called AFTER the function has been added and defined. - void postProcessKernel(Function *KernelFunc) { + void postProcessKernel(Function *KernelFunc) const { Impl->postProcessKernel(KernelFunc); } /// /// Get the target-specific list of argument metadata attached to each /// function that should be collected and attached to the fused kernel. - llvm::ArrayRef getKernelMetadataKeys() { + llvm::ArrayRef getKernelMetadataKeys() const { return Impl->getKernelMetadataKeys(); } - void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) { + void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) const { Impl->createBarrierCall(Builder, BarrierFlags); } + unsigned getPrivateAddressSpace() const { + return Impl->getPrivateAddressSpace(); + } + + unsigned getLocalAddressSpace() const { return Impl->getLocalAddressSpace(); } + + void updateAddressSpaceMetadata(Function *KernelFunc, + ArrayRef LocalSize, + unsigned AddressSpace) const { + Impl->updateAddressSpaceMetadata(KernelFunc, LocalSize, AddressSpace); + } + private: using ImplPtr = std::shared_ptr; @@ -135,3 +182,5 @@ class MetadataCollection { llvm::StringMap> Collection; }; } // namespace llvm + +#endif // SYCL_FUSION_PASSES_TARGETFUSIONINFO_H From da50ca721d796d5e94a4b2782bafdd5d3c0e944a Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Tue, 14 Feb 2023 13:20:48 +0000 Subject: [PATCH 10/16] [SYCL][Fusion] Handle attributes for CUDA fusion Handle reqd_work_group_size and work_group_size_hint attributes. Signed-off-by: Lukas Sommer --- sycl-fusion/jit-compiler/include/JITContext.h | 3 +- .../lib/translation/KernelTranslation.cpp | 42 +++++++++++++++++ .../lib/translation/KernelTranslation.h | 2 +- .../lib/translation/SPIRVLLVMTranslation.cpp | 37 --------------- .../lib/translation/SPIRVLLVMTranslation.h | 18 +------- sycl/source/detail/jit_compiler.cpp | 46 +++++++++++++++++-- sycl/source/detail/jit_compiler.hpp | 7 ++- sycl/source/detail/jit_device_binaries.cpp | 6 ++- sycl/source/detail/jit_device_binaries.hpp | 5 +- 9 files changed, 102 insertions(+), 64 deletions(-) diff --git a/sycl-fusion/jit-compiler/include/JITContext.h b/sycl-fusion/jit-compiler/include/JITContext.h index d4654f19820ce..c26312ceb70e6 100644 --- a/sycl-fusion/jit-compiler/include/JITContext.h +++ b/sycl-fusion/jit-compiler/include/JITContext.h @@ -65,8 +65,7 @@ class JITContext { llvm::LLVMContext *getLLVMContext(); - KernelBinary &emplaceSPIRVBinary(std::string Binary, - BinaryFormat Format); + KernelBinary &emplaceSPIRVBinary(std::string Binary, BinaryFormat Format); std::optional getCacheEntry(CacheKeyT &Identifier) const; diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp index 3db02dbfd717c..1436f51f40aaa 100644 --- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp +++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp @@ -9,6 +9,7 @@ #include "KernelTranslation.h" #include "SPIRVLLVMTranslation.h" #include "llvm/Bitcode/BitcodeReader.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/Linker/Linker.h" #include "llvm/MC/TargetRegistry.h" @@ -21,6 +22,43 @@ using namespace jit_compiler; using namespace jit_compiler::translation; using namespace llvm; +/// +/// Get an attribute value consisting of NumValues scalar constant integers +/// from the MDNode. +static void getAttributeValues(std::vector &Values, MDNode *MD) { + for (const auto &MDOp : MD->operands()) { + auto *ConstantMD = cast(MDOp); + auto *ConstInt = cast(ConstantMD->getValue()); + Values.push_back(std::to_string(ConstInt->getZExtValue())); + } +} + +// NOLINTNEXTLINE(readability-identifier-naming) +static const char *REQD_WORK_GROUP_SIZE_ATTR = "reqd_work_group_size"; +// NOLINTNEXTLINE(readability-identifier-naming) +static const char *WORK_GROUP_SIZE_HINT_ATTR = "work_group_size_hint"; + +/// +/// Restore kernel attributes for the kernel in Info from the metadata +/// attached to its kernel function in the LLVM module Mod. +/// Currently supported attributes: +/// - reqd_work_group_size +/// - work_group_size_hint +static void restoreKernelAttributes(Module *Mod, SYCLKernelInfo &Info) { + auto *KernelFunction = Mod->getFunction(Info.Name); + assert(KernelFunction && "Kernel function not present in module"); + if (auto *MD = KernelFunction->getMetadata(REQD_WORK_GROUP_SIZE_ATTR)) { + SYCLKernelAttribute ReqdAttr{REQD_WORK_GROUP_SIZE_ATTR}; + getAttributeValues(ReqdAttr.Values, MD); + Info.Attributes.push_back(ReqdAttr); + } + if (auto *MD = KernelFunction->getMetadata(WORK_GROUP_SIZE_HINT_ATTR)) { + SYCLKernelAttribute HintAttr{WORK_GROUP_SIZE_HINT_ATTR}; + getAttributeValues(HintAttr.Values, MD); + Info.Attributes.push_back(HintAttr); + } +} + llvm::Expected> KernelTranslator::loadKernels(llvm::LLVMContext &LLVMCtx, std::vector &Kernels) { @@ -100,6 +138,10 @@ KernelTranslator::loadKernels(llvm::LLVMContext &LLVMCtx, "Number of address bits between SPIR-V modules does not match"); } } + // Restore SYCL/OpenCL kernel attributes such as 'reqd_work_group_size' or + // 'work_group_size_hint' from metadata attached to the kernel function and + // store it in the SYCLKernelInfo. + restoreKernelAttributes(Result.get(), Kernel); } return std::move(Result); } diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h index 3d6824edbdd37..276b6e536d269 100644 --- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h +++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h @@ -23,7 +23,7 @@ class KernelTranslator { loadKernels(llvm::LLVMContext &LLVMCtx, std::vector &Kernels); static llvm::Error translateKernel(SYCLKernelInfo &Kernel, llvm::Module &Mod, - JITContext &JITCtx, BinaryFormat Format); + JITContext &JITCtx, BinaryFormat Format); private: /// diff --git a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp index 1046fd6dc2907..a5effb33a85e1 100644 --- a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp +++ b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp @@ -11,9 +11,7 @@ #include "Kernel.h" #include "LLVMSPIRVLib.h" #include "helper/ErrorHandling.h" -#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/StringRef.h" -#include "llvm/IR/Constants.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/Support/raw_ostream.h" @@ -24,37 +22,6 @@ using namespace jit_compiler; using namespace jit_compiler::translation; using namespace llvm; -void SPIRVLLVMTranslator::getAttributeValues(std::vector &Values, - MDNode *MD, size_t NumValues) { - assert(MD->getNumOperands() == NumValues && "Incorrect number of values"); - for (const auto &MDOp : MD->operands()) { - auto *ConstantMD = cast(MDOp); - auto *ConstInt = cast(ConstantMD->getValue()); - Values.push_back(std::to_string(ConstInt->getZExtValue())); - } -} - -// NOLINTNEXTLINE(readability-identifier-naming) -static const char *REQD_WORK_GROUP_SIZE_ATTR = "reqd_work_group_size"; -// NOLINTNEXTLINE(readability-identifier-naming) -static const char *WORK_GROUP_SIZE_HINT_ATTR = "work_group_size_hint"; - -void SPIRVLLVMTranslator::restoreKernelAttributes(Module *Mod, - SYCLKernelInfo &Info) { - auto *KernelFunction = Mod->getFunction(Info.Name); - assert(KernelFunction && "Kernel function not present in module"); - if (auto *MD = KernelFunction->getMetadata(REQD_WORK_GROUP_SIZE_ATTR)) { - SYCLKernelAttribute ReqdAttr{REQD_WORK_GROUP_SIZE_ATTR}; - getAttributeValues(ReqdAttr.Values, MD, 3); - Info.Attributes.push_back(ReqdAttr); - } - if (auto *MD = KernelFunction->getMetadata(WORK_GROUP_SIZE_HINT_ATTR)) { - SYCLKernelAttribute HintAttr{WORK_GROUP_SIZE_HINT_ATTR}; - getAttributeValues(HintAttr.Values, MD, 3); - Info.Attributes.push_back(HintAttr); - } -} - SPIRV::TranslatorOpts &SPIRVLLVMTranslator::translatorOpts() { static auto Opts = []() -> SPIRV::TranslatorOpts { // Options for translation between SPIR-V and LLVM IR. @@ -113,10 +80,6 @@ SPIRVLLVMTranslator::loadSPIRVKernel(llvm::LLVMContext &LLVMCtx, } std::unique_ptr NewMod{LLVMMod}; - // Restore SYCL/OpenCL kernel attributes such as 'reqd_work_group_size' or - // 'work_group_size_hint' from metadata attached to the kernel function and - // store it in the SYCLKernelInfo. - restoreKernelAttributes(NewMod.get(), Kernel); return std::move(NewMod); } diff --git a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h index 440c00103b0d5..c8cdf2bf90ca0 100644 --- a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h +++ b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h @@ -30,24 +30,10 @@ class SPIRVLLVMTranslator { /// /// Translate the LLVM IR module Mod to SPIR-V, store it in the JITContext and /// return a pointer to its container. - static llvm::Expected translateLLVMtoSPIRV(llvm::Module &Mod, - JITContext &JITCtx); + static llvm::Expected + translateLLVMtoSPIRV(llvm::Module &Mod, JITContext &JITCtx); private: - /// - /// Get an attribute value consisting of NumValues scalar constant integers - /// from the MDNode. - static void getAttributeValues(std::vector &Values, - llvm::MDNode *MD, size_t NumValues); - - /// - /// Restore kernel attributes for the kernel in Info from the metadata - /// attached to its kernel function in the LLVM module Mod. - /// Currently supported attributes: - /// - reqd_work_group_size - /// - work_group_size_hint - static void restoreKernelAttributes(llvm::Module *Mod, SYCLKernelInfo &Info); - /// /// Default settings for the SPIRV translation options. static SPIRV::TranslatorOpts &translatorOpts(); diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index fe645e43ab28a..8770f58172548 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -921,10 +921,33 @@ pi_device_binaries jit_compiler::createPIDeviceBinary( Binary.addProperty(std::move(ArgMaskPropSet)); + if (Format == ::jit_compiler::BinaryFormat::PTX) { + // Add a program metadata property with the reqd_work_group_size attribute. + // See CUDA PI (pi_cuda.cpp) _pi_program::set_metadata for reference. + auto ReqdWGS = std::find_if( + FusedKernelInfo.Attributes.begin(), FusedKernelInfo.Attributes.end(), + [](const ::jit_compiler::SYCLKernelAttribute &Attr) { + return Attr.AttributeName == "reqd_work_group_size"; + }); + if (ReqdWGS != FusedKernelInfo.Attributes.end()) { + auto Encoded = encodeReqdWorkGroupSize(*ReqdWGS); + std::stringstream PropName; + PropName << FusedKernelInfo.Name; + PropName << __SYCL_PI_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE; + PropertyContainer ReqdWorkGroupSizeProp{ + PropName.str(), Encoded.data(), Encoded.size(), + pi_property_type::PI_PROPERTY_TYPE_BYTE_ARRAY}; + PropertySetContainer ProgramMetadata{ + __SYCL_PI_PROPERTY_SET_PROGRAM_METADATA}; + ProgramMetadata.addProperty(std::move(ReqdWorkGroupSizeProp)); + Binary.addProperty(std::move(ProgramMetadata)); + } + } + DeviceBinariesCollection Collection; - Collection.addDeviceBinary(std::move(Binary), - FusedKernelInfo.BinaryInfo.BinaryStart, - FusedKernelInfo.BinaryInfo.BinarySize, TargetSpec, BinFormat); + Collection.addDeviceBinary( + std::move(Binary), FusedKernelInfo.BinaryInfo.BinaryStart, + FusedKernelInfo.BinaryInfo.BinarySize, TargetSpec, BinFormat); JITDeviceBinaries.push_back(std::move(Collection)); return JITDeviceBinaries.back().getPIDeviceStruct(); @@ -960,6 +983,23 @@ std::vector jit_compiler::encodeArgUsageMask( return Encoded; } +std::vector jit_compiler::encodeReqdWorkGroupSize( + const ::jit_compiler::SYCLKernelAttribute &Attr) const { + assert(Attr.AttributeName == "reqd_work_group_size"); + size_t NumBytes = sizeof(uint64_t) + (Attr.Values.size() * sizeof(uint32_t)); + std::vector Encoded(NumBytes, 0u); + uint8_t *Ptr = Encoded.data(); + // Skip 64-bit wide size argument with value 0 at the start of the data. + // See CUDA PI (pi_cuda.cpp) _pi_program::set_metadata for reference. + Ptr += sizeof(uint64_t); + for (const auto &Val : Attr.Values) { + uint32_t UVal = std::stoul(Val); + std::memcpy(Ptr, &UVal, sizeof(uint32_t)); + Ptr += sizeof(uint32_t); + } + return Encoded; +} + } // namespace detail } // __SYCL_INLINE_VER_NAMESPACE(_V1) } // namespace sycl diff --git a/sycl/source/detail/jit_compiler.hpp b/sycl/source/detail/jit_compiler.hpp index e02be562de3ee..4b299572bacbc 100644 --- a/sycl/source/detail/jit_compiler.hpp +++ b/sycl/source/detail/jit_compiler.hpp @@ -16,6 +16,7 @@ namespace jit_compiler { enum class BinaryFormat; class JITContext; struct SYCLKernelInfo; +struct SYCLKernelAttribute; using ArgUsageMask = std::vector; } // namespace jit_compiler @@ -47,11 +48,15 @@ class jit_compiler { jit_compiler &operator=(const jit_compiler &&) = delete; pi_device_binaries - createPIDeviceBinary(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo, ::jit_compiler::BinaryFormat Format); + createPIDeviceBinary(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo, + ::jit_compiler::BinaryFormat Format); std::vector encodeArgUsageMask(const ::jit_compiler::ArgUsageMask &Mask) const; + std::vector encodeReqdWorkGroupSize( + const ::jit_compiler::SYCLKernelAttribute &Attr) const; + // Manages the lifetime of the PI structs for device binaries. std::vector JITDeviceBinaries; diff --git a/sycl/source/detail/jit_device_binaries.cpp b/sycl/source/detail/jit_device_binaries.cpp index 59530a0e691ce..d0cc9e824bc07 100644 --- a/sycl/source/detail/jit_device_binaries.cpp +++ b/sycl/source/detail/jit_device_binaries.cpp @@ -81,7 +81,8 @@ void DeviceBinaryContainer::addProperty(PropertySetContainer &&Cont) { } pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary( - const unsigned char *BinaryStart, size_t BinarySize, const char* TargetSpec, pi_device_binary_type Format) { + const unsigned char *BinaryStart, size_t BinarySize, const char *TargetSpec, + pi_device_binary_type Format) { pi_device_binary_struct DeviceBinary; DeviceBinary.Version = PI_DEVICE_BINARY_VERSION; DeviceBinary.Kind = PI_DEVICE_BINARY_OFFLOAD_KIND_SYCL; @@ -106,7 +107,8 @@ pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary( void DeviceBinariesCollection::addDeviceBinary(DeviceBinaryContainer &&Cont, const unsigned char *BinaryStart, size_t BinarySize, - const char* TargetSpec, pi_device_binary_type Format) { + const char *TargetSpec, + pi_device_binary_type Format) { // Adding to the vectors might trigger reallocation, which would invalidate // the pointers used for PI structs if a PI struct has already been created // via getPIDeviceStruct(). Forbid calls to this method after the first PI diff --git a/sycl/source/detail/jit_device_binaries.hpp b/sycl/source/detail/jit_device_binaries.hpp index 96079d3a25a19..cecab17870650 100644 --- a/sycl/source/detail/jit_device_binaries.hpp +++ b/sycl/source/detail/jit_device_binaries.hpp @@ -111,7 +111,8 @@ class DeviceBinaryContainer { pi_device_binary_struct getPIDeviceBinary(const unsigned char *BinaryStart, size_t BinarySize, - const char* TargetSpec, pi_device_binary_type Format); + const char *TargetSpec, + pi_device_binary_type Format); private: bool Fused = true; @@ -138,7 +139,7 @@ class DeviceBinariesCollection { void addDeviceBinary(DeviceBinaryContainer &&Cont, const unsigned char *BinaryStart, size_t BinarySize, - const char* TargetSpec, pi_device_binary_type Format); + const char *TargetSpec, pi_device_binary_type Format); pi_device_binaries getPIDeviceStruct(); private: From 57e02a664d84a20cf89daff2a2876d89d41e2b26 Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Tue, 14 Feb 2023 13:39:10 +0000 Subject: [PATCH 11/16] [SYCL][Fusion] Cache and groom input binaries Parse each input binary only once. Groom the nvvm annotations for functions deleted before fusion. Signed-off-by: Lukas Sommer --- .../jit-compiler/lib/fusion/ModuleHelper.cpp | 8 ++ .../lib/translation/KernelTranslation.cpp | 104 +++++++++--------- 2 files changed, 60 insertions(+), 52 deletions(-) diff --git a/sycl-fusion/jit-compiler/lib/fusion/ModuleHelper.cpp b/sycl-fusion/jit-compiler/lib/fusion/ModuleHelper.cpp index c197fedf47e69..0a90600530f74 100644 --- a/sycl-fusion/jit-compiler/lib/fusion/ModuleHelper.cpp +++ b/sycl-fusion/jit-compiler/lib/fusion/ModuleHelper.cpp @@ -8,6 +8,7 @@ #include "ModuleHelper.h" +#include "target/TargetFusionInfo.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/IR/Function.h" #include "llvm/Transforms/Utils/Cloning.h" @@ -22,6 +23,13 @@ helper::ModuleHelper::cloneAndPruneModule(Module *Mod, SmallPtrSet UnusedFunctions; identifyUnusedFunctions(Mod, CGRoots, UnusedFunctions); + { + auto TFI = llvm::TargetFusionInfo::getTargetFusionInfo(Mod); + SmallVector Unused{UnusedFunctions.begin(), + UnusedFunctions.end()}; + TFI.notifyFunctionsDelete(Unused); + } + // Clone the module, but use an external reference in place of the global // definition for unused functions. auto FunctionCloneMask = [&](const GlobalValue *GV) -> bool { diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp index 1436f51f40aaa..b877829782658 100644 --- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp +++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp @@ -75,68 +75,68 @@ KernelTranslator::loadKernels(llvm::LLVMContext &LLVMCtx, const unsigned char *ModulePtr = BinInfo.BinaryStart; size_t ModuleSize = BinInfo.BinarySize; BinaryBlob BinBlob{ModulePtr, ModuleSize}; - if (ParsedBinaries.contains(BinBlob)) { + if (!ParsedBinaries.contains(BinBlob)) { // Multiple kernels can be stored in the same SPIR-V or LLVM IR module. - // If we encountered the same binary module before, skip. + // We only load if we did not encounter the same binary module before. // NOTE: We compare the pointer as well as the size, in case // a previous kernel only referenced part of the SPIR-V/LLVM IR module. // Not sure this can actually happen, but better safe than sorry. - continue; - } - // Simply load and translate the SPIR-V into the currently still empty - // module. - std::unique_ptr NewMod; + // Simply load and translate the SPIR-V into the currently still empty + // module. + std::unique_ptr NewMod; - switch (BinInfo.Format) { - case BinaryFormat::LLVM: { - auto ModOrError = loadLLVMKernel(LLVMCtx, Kernel); - if (auto Err = ModOrError.takeError()) { - return std::move(Err); + switch (BinInfo.Format) { + case BinaryFormat::LLVM: { + auto ModOrError = loadLLVMKernel(LLVMCtx, Kernel); + if (auto Err = ModOrError.takeError()) { + return std::move(Err); + } + NewMod = std::move(*ModOrError); + break; } - NewMod = std::move(*ModOrError); - break; - } - case BinaryFormat::SPIRV: { - auto ModOrError = loadSPIRVKernel(LLVMCtx, Kernel); - if (auto Err = ModOrError.takeError()) { - return std::move(Err); + case BinaryFormat::SPIRV: { + auto ModOrError = loadSPIRVKernel(LLVMCtx, Kernel); + if (auto Err = ModOrError.takeError()) { + return std::move(Err); + } + NewMod = std::move(*ModOrError); + break; } - NewMod = std::move(*ModOrError); - break; - } - default: { - return createStringError( - inconvertibleErrorCode(), - "Failed to load kernel from unsupported input format"); - } - } - - // We do not assume that the input binary information has the address bits - // set, but rather retrieve this information from the SPIR-V/LLVM module's - // data-layout. - BinInfo.AddressBits = NewMod->getDataLayout().getPointerSizeInBits(); - - if (First) { - // We can simply assign the module we just loaded from SPIR-V to the - // empty pointer on the first iteration. - Result = std::move(NewMod); - // The first module will dictate the address bits for the remaining. - AddressBits = BinInfo.AddressBits; - First = false; - } else { - // We have already loaded some module, so now we need to - // link the module we just loaded with the result so far. - // FIXME: We allow duplicates to be overridden by the module - // read last. This could cause problems if different modules contain - // definitions with the same name, but different body/content. - // Check that this is not problematic. - Linker::linkModules(*Result, std::move(NewMod), - Linker::Flags::OverrideFromSrc); - if (AddressBits != BinInfo.AddressBits) { + default: { return createStringError( inconvertibleErrorCode(), - "Number of address bits between SPIR-V modules does not match"); + "Failed to load kernel from unsupported input format"); + } + } + + // We do not assume that the input binary information has the address bits + // set, but rather retrieve this information from the SPIR-V/LLVM module's + // data-layout. + BinInfo.AddressBits = NewMod->getDataLayout().getPointerSizeInBits(); + + if (First) { + // We can simply assign the module we just loaded from SPIR-V to the + // empty pointer on the first iteration. + Result = std::move(NewMod); + // The first module will dictate the address bits for the remaining. + AddressBits = BinInfo.AddressBits; + First = false; + } else { + // We have already loaded some module, so now we need to + // link the module we just loaded with the result so far. + // FIXME: We allow duplicates to be overridden by the module + // read last. This could cause problems if different modules contain + // definitions with the same name, but different body/content. + // Check that this is not problematic. + Linker::linkModules(*Result, std::move(NewMod), + Linker::Flags::OverrideFromSrc); + if (AddressBits != BinInfo.AddressBits) { + return createStringError( + inconvertibleErrorCode(), + "Number of address bits between SPIR-V modules does not match"); + } } + ParsedBinaries.insert(BinBlob); } // Restore SYCL/OpenCL kernel attributes such as 'reqd_work_group_size' or // 'work_group_size_hint' from metadata attached to the kernel function and From 1d8bab8cd6c4eda25d06a5a59ea12f91a95035fa Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Tue, 14 Feb 2023 15:44:25 +0000 Subject: [PATCH 12/16] [SYCL][Fusion] Disable heterogeneous ND ranges on CUDA Signed-off-by: Lukas Sommer --- sycl-fusion/jit-compiler/CMakeLists.txt | 1 + sycl-fusion/jit-compiler/include/JITContext.h | 1 - sycl-fusion/jit-compiler/lib/KernelFusion.cpp | 15 ++++++++++----- sycl-fusion/passes/CMakeLists.txt | 1 + sycl-fusion/passes/kernel-fusion/Builtins.cpp | 12 ------------ sycl-fusion/passes/kernel-fusion/Builtins.h | 4 ---- sycl/source/detail/jit_compiler.cpp | 2 +- sycl/source/detail/jit_compiler.hpp | 2 +- 8 files changed, 14 insertions(+), 24 deletions(-) diff --git a/sycl-fusion/jit-compiler/CMakeLists.txt b/sycl-fusion/jit-compiler/CMakeLists.txt index f67eed5fc6c76..777f06b303e0c 100644 --- a/sycl-fusion/jit-compiler/CMakeLists.txt +++ b/sycl-fusion/jit-compiler/CMakeLists.txt @@ -21,6 +21,7 @@ add_llvm_library(sycl-fusion ScalarOpts InstCombine Target + TargetParser NVPTX X86 MC diff --git a/sycl-fusion/jit-compiler/include/JITContext.h b/sycl-fusion/jit-compiler/include/JITContext.h index c26312ceb70e6..eda8c83d8b21c 100644 --- a/sycl-fusion/jit-compiler/include/JITContext.h +++ b/sycl-fusion/jit-compiler/include/JITContext.h @@ -88,7 +88,6 @@ class JITContext { mutable MutexT CacheMutex; std::unordered_map Cache; - }; } // namespace jit_compiler diff --git a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp index 821ad4cd36369..e2f321bbcb81b 100644 --- a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp +++ b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp @@ -56,6 +56,10 @@ FusionResult KernelFusion::fuseKernels( int BarriersFlags, const std::vector &Internalization, const std::vector &Constants) { + // Initialize the configuration helper to make the options for this invocation + // available (on a per-thread basis). + ConfigHelper::setConfig(std::move(JITConfig)); + const auto NDRanges = gatherNDRanges(KernelInformation); if (!isValidCombination(NDRanges)) { @@ -64,9 +68,12 @@ FusionResult KernelFusion::fuseKernels( "different global sizes in dimensions [2, N) and non-zero offsets"}; } - // Initialize the configuration helper to make the options for this invocation - // available (on a per-thread basis). - ConfigHelper::setConfig(std::move(JITConfig)); + bool IsHeterogeneousList = jit_compiler::isHeterogeneousList(NDRanges); + + BinaryFormat TargetFormat = ConfigHelper::get(); + if (TargetFormat == BinaryFormat::PTX && IsHeterogeneousList) { + return FusionResult{"Heterogeneous ND ranges not supported for CUDA"}; + } bool CachingEnabled = ConfigHelper::get(); CacheKeyT CacheKey{KernelsToFuse, @@ -137,8 +144,6 @@ FusionResult KernelFusion::fuseKernels( SYCLKernelInfo &FusedKernelInfo = *NewModInfo->getKernelFor(FusedKernelName); - BinaryFormat TargetFormat = ConfigHelper::get(); - if (auto Error = translation::KernelTranslator::translateKernel( FusedKernelInfo, *NewMod, JITCtx, TargetFormat)) { return errorToFusionResult(std::move(Error), diff --git a/sycl-fusion/passes/CMakeLists.txt b/sycl-fusion/passes/CMakeLists.txt index c74c3c40e34e0..95f504091dc30 100644 --- a/sycl-fusion/passes/CMakeLists.txt +++ b/sycl-fusion/passes/CMakeLists.txt @@ -46,6 +46,7 @@ add_llvm_library(SYCLKernelFusionPasses Support TransformUtils Passes + TargetParser ) target_include_directories(SYCLKernelFusionPasses diff --git a/sycl-fusion/passes/kernel-fusion/Builtins.cpp b/sycl-fusion/passes/kernel-fusion/Builtins.cpp index 6de1c40e8f4cd..9cff120cd5888 100644 --- a/sycl-fusion/passes/kernel-fusion/Builtins.cpp +++ b/sycl-fusion/passes/kernel-fusion/Builtins.cpp @@ -595,18 +595,6 @@ jit_compiler::Remapper::remapBuiltins(Function *F, const NDRange &SrcNDRange, return Clone; } -void jit_compiler::barrierCall(IRBuilderBase &Builder, int Flags) { - assert((Flags == 1 || Flags == 2 || Flags == 3) && "Invalid barrier flags"); - - // See - // https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#Memory_Semantics_-id- - createSPIRVCall(Builder, BarrierName, - {Builder.getInt32(/*Exec Scope : Workgroup = */ 2), - Builder.getInt32(/*Exec Scope : Workgroup = */ 2), - Builder.getInt32(0x10 | (Flags % 2 == 1 ? 0x100 : 0x0) | - ((Flags >> 1 == 1 ? 0x200 : 0x0)))}); -} - Value *jit_compiler::createSPIRVCall(IRBuilderBase &Builder, StringRef FunctionName, ArrayRef Args) { diff --git a/sycl-fusion/passes/kernel-fusion/Builtins.h b/sycl-fusion/passes/kernel-fusion/Builtins.h index bc8e186f14188..a55b6efab39f9 100644 --- a/sycl-fusion/passes/kernel-fusion/Builtins.h +++ b/sycl-fusion/passes/kernel-fusion/Builtins.h @@ -60,10 +60,6 @@ constexpr llvm::StringLiteral OffloadStartWrapperName{ llvm::Value *getGlobalLinearID(llvm::IRBuilderBase &Builder, const NDRange &FusedNDRange); -/// -/// Creates a call to a barrier function. -void barrierCall(llvm::IRBuilderBase &Builder, int Flags); - /// /// @return A call to a SPIRV function, which will be declared if not already in /// the module. diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index 8770f58172548..e4b16db373df8 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -799,7 +799,7 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, JITConfig.set<::jit_compiler::option::JITEnableVerbose>(DebugEnabled); JITConfig.set<::jit_compiler::option::JITEnableCaching>( detail::SYCLConfig::get()); - + ::jit_compiler::BinaryFormat TargetFormat = getTargetFormat(Queue); JITConfig.set<::jit_compiler::option::JITTargetFormat>(TargetFormat); diff --git a/sycl/source/detail/jit_compiler.hpp b/sycl/source/detail/jit_compiler.hpp index 4b299572bacbc..71a57723ea6bb 100644 --- a/sycl/source/detail/jit_compiler.hpp +++ b/sycl/source/detail/jit_compiler.hpp @@ -13,7 +13,7 @@ #include namespace jit_compiler { -enum class BinaryFormat; +enum class BinaryFormat : uint32_t; class JITContext; struct SYCLKernelInfo; struct SYCLKernelAttribute; From 4506db6bc0232ef4e7d0292813987ff4a0527002 Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Tue, 14 Feb 2023 16:14:50 +0000 Subject: [PATCH 13/16] [SYCL][Fusion] Enable JIT caching for CUDA fusion Signed-off-by: Lukas Sommer --- sycl/source/detail/jit_compiler.cpp | 11 ++++++++--- sycl/source/detail/jit_compiler.hpp | 4 ++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index e4b16db373df8..d5742a3580445 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -847,9 +847,14 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, auto PIDeviceBinaries = createPIDeviceBinary(FusedKernelInfo, TargetFormat); detail::ProgramManager::getInstance().addImages(PIDeviceBinaries); Handle = OSUtil::getOSModuleHandle(PIDeviceBinaries->DeviceBinaries); - } else if (DebugEnabled) { - // TODO(Lukas): Create correct OSModuleHandle when using a cached binary. - std::cerr << "INFO: Re-using existing device binary for fused kernel\n"; + CachedModules.emplace(FusedKernelInfo.Name, Handle); + } else { + if (DebugEnabled) { + std::cerr << "INFO: Re-using existing device binary for fused kernel\n"; + } + // Retrieve an OSModuleHandle for the cached binary. + assert(CachedModules.count(FusedKernelInfo.Name) && "No cached binary"); + Handle = CachedModules.at(FusedKernelInfo.Name); } // Create a kernel bundle for the fused kernel. diff --git a/sycl/source/detail/jit_compiler.hpp b/sycl/source/detail/jit_compiler.hpp index 71a57723ea6bb..fae774cadd09a 100644 --- a/sycl/source/detail/jit_compiler.hpp +++ b/sycl/source/detail/jit_compiler.hpp @@ -12,6 +12,8 @@ #include #include +#include + namespace jit_compiler { enum class BinaryFormat : uint32_t; class JITContext; @@ -60,6 +62,8 @@ class jit_compiler { // Manages the lifetime of the PI structs for device binaries. std::vector JITDeviceBinaries; + std::unordered_map CachedModules; + std::unique_ptr<::jit_compiler::JITContext> MJITContext; }; From 26978b2586cf5f096a8cbd15aae245110fe08bad Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Wed, 15 Feb 2023 12:56:58 +0000 Subject: [PATCH 14/16] [SYCL][Fusion] Catch empty standard arguments Signed-off-by: Lukas Sommer --- sycl/source/detail/jit_compiler.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index d5742a3580445..de07a86623219 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -457,7 +457,10 @@ static ParamIterator preProcessArguments( // which will go out-of-scope before we execute the fused kernel. Therefore, // we need to copy the argument to a permant location and update the // argument. - Arg->Arg.MPtr = storePlainArgRaw(ArgStorage, Arg->Arg.MPtr, Arg->Arg.MSize); + if (Arg->Arg.MPtr) { + Arg->Arg.MPtr = + storePlainArgRaw(ArgStorage, Arg->Arg.MPtr, Arg->Arg.MSize); + } // Standard layout arguments do not participate in identical argument // detection, but we still add it to the list here. As the SYCL runtime can // only check the raw bytes for identical content, but is unaware of the @@ -474,6 +477,7 @@ static ParamIterator preProcessArguments( ::jit_compiler::Parameter{Arg->KernelIndex, Arg->ArgIndex}, Arg->Arg.MPtr, Arg->Arg.MSize); return ++Arg; + } // First check if there's already another parameter with identical // value. From aa2b629f4f98d1d16d7fbe4c8bce7b9785a177f3 Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Tue, 7 Mar 2023 14:03:28 +0000 Subject: [PATCH 15/16] [SYCL][Fusion] Rebase and address feedback Signed-off-by: Lukas Sommer --- clang/include/clang/Driver/Action.h | 2 +- clang/lib/Driver/Action.cpp | 2 +- sycl-fusion/jit-compiler/CMakeLists.txt | 7 +- sycl-fusion/jit-compiler/include/JITContext.h | 7 +- sycl-fusion/jit-compiler/include/Options.h | 1 + sycl-fusion/jit-compiler/lib/JITContext.cpp | 11 +- sycl-fusion/jit-compiler/lib/KernelFusion.cpp | 22 + .../lib/fusion/FusionPipeline.cpp | 4 +- .../jit-compiler/lib/fusion/ModuleHelper.cpp | 2 +- .../lib/translation/KernelTranslation.cpp | 43 +- .../lib/translation/KernelTranslation.h | 10 +- .../lib/translation/SPIRVLLVMTranslation.cpp | 2 +- .../internalization/Internalization.cpp | 2 +- .../passes/kernel-fusion/SYCLKernelFusion.cpp | 13 +- sycl-fusion/passes/syclcp/SYCLCP.cpp | 2 +- .../passes/target/TargetFusionInfo.cpp | 458 +++++++++++------- sycl-fusion/passes/target/TargetFusionInfo.h | 131 +---- sycl/source/detail/device_info.hpp | 3 +- sycl/source/detail/jit_compiler.cpp | 38 +- sycl/source/detail/scheduler/commands.hpp | 2 +- 20 files changed, 412 insertions(+), 350 deletions(-) diff --git a/clang/include/clang/Driver/Action.h b/clang/include/clang/Driver/Action.h index 9dab2e32b2ffc..45e9133b7ed13 100644 --- a/clang/include/clang/Driver/Action.h +++ b/clang/include/clang/Driver/Action.h @@ -665,7 +665,7 @@ class OffloadWrapperJobAction : public JobAction { public: OffloadWrapperJobAction(ActionList &Inputs, types::ID Type); OffloadWrapperJobAction(Action *Input, types::ID OutputType, - bool IsEmbeddedIR = false); + bool EmbedIR = false); bool isEmbeddedIR() const { return EmbedIR; } diff --git a/clang/lib/Driver/Action.cpp b/clang/lib/Driver/Action.cpp index f55233c59a6db..4cb0225cad293 100644 --- a/clang/lib/Driver/Action.cpp +++ b/clang/lib/Driver/Action.cpp @@ -478,7 +478,7 @@ void OffloadWrapperJobAction::anchor() {} OffloadWrapperJobAction::OffloadWrapperJobAction(ActionList &Inputs, types::ID Type) - : JobAction(OffloadWrapperJobClass, Inputs, Type) {} + : JobAction(OffloadWrapperJobClass, Inputs, Type), EmbedIR(false) {} OffloadWrapperJobAction::OffloadWrapperJobAction(Action *Input, types::ID Type, bool IsEmbeddedIR) diff --git a/sycl-fusion/jit-compiler/CMakeLists.txt b/sycl-fusion/jit-compiler/CMakeLists.txt index 777f06b303e0c..bf323239679b4 100644 --- a/sycl-fusion/jit-compiler/CMakeLists.txt +++ b/sycl-fusion/jit-compiler/CMakeLists.txt @@ -22,9 +22,8 @@ add_llvm_library(sycl-fusion InstCombine Target TargetParser - NVPTX - X86 MC + ${LLVM_TARGETS_TO_BUILD} ) target_include_directories(sycl-fusion @@ -47,6 +46,10 @@ target_link_libraries(sycl-fusion ${CMAKE_THREAD_LIBS_INIT} ) +if("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD) + target_compile_definitions(sycl-fusion PRIVATE FUSION_JIT_SUPPORT_PTX) +endif() + if (BUILD_SHARED_LIBS) if(NOT MSVC AND NOT APPLE) # Manage symbol visibility through the linker to make sure no LLVM symbols diff --git a/sycl-fusion/jit-compiler/include/JITContext.h b/sycl-fusion/jit-compiler/include/JITContext.h index eda8c83d8b21c..69465a74d8371 100644 --- a/sycl-fusion/jit-compiler/include/JITContext.h +++ b/sycl-fusion/jit-compiler/include/JITContext.h @@ -39,7 +39,7 @@ using CacheKeyT = /// Wrapper around a kernel binary. class KernelBinary { public: - explicit KernelBinary(std::string Binary, BinaryFormat Format); + explicit KernelBinary(std::string &&Binary, BinaryFormat Format); jit_compiler::BinaryAddress address() const; @@ -65,7 +65,10 @@ class JITContext { llvm::LLVMContext *getLLVMContext(); - KernelBinary &emplaceSPIRVBinary(std::string Binary, BinaryFormat Format); + template KernelBinary &emplaceKernelBinary(Ts &&...Args) { + WriteLockT WriteLock{BinariesMutex}; + return Binaries.emplace_back(std::forward(Args)...); + } std::optional getCacheEntry(CacheKeyT &Identifier) const; diff --git a/sycl-fusion/jit-compiler/include/Options.h b/sycl-fusion/jit-compiler/include/Options.h index 4fe7787df00db..841a229adb7a3 100644 --- a/sycl-fusion/jit-compiler/include/Options.h +++ b/sycl-fusion/jit-compiler/include/Options.h @@ -10,6 +10,7 @@ #define SYCL_FUSION_JIT_COMPILER_OPTIONS_H #include "Kernel.h" + #include #include diff --git a/sycl-fusion/jit-compiler/lib/JITContext.cpp b/sycl-fusion/jit-compiler/lib/JITContext.cpp index e1dda8b928c45..4499dd93f4d76 100644 --- a/sycl-fusion/jit-compiler/lib/JITContext.cpp +++ b/sycl-fusion/jit-compiler/lib/JITContext.cpp @@ -11,7 +11,7 @@ using namespace jit_compiler; -KernelBinary::KernelBinary(std::string Binary, BinaryFormat Fmt) +KernelBinary::KernelBinary(std::string &&Binary, BinaryFormat Fmt) : Blob{std::move(Binary)}, Format{Fmt} {} jit_compiler::BinaryAddress KernelBinary::address() const { @@ -29,15 +29,6 @@ JITContext::~JITContext() = default; llvm::LLVMContext *JITContext::getLLVMContext() { return LLVMCtx.get(); } -KernelBinary &JITContext::emplaceSPIRVBinary(std::string Binary, - BinaryFormat Format) { - WriteLockT WriteLock{BinariesMutex}; - // NOTE: With C++17, which returns a reference from emplace_back, the - // following code would be even simpler. - Binaries.emplace_back(std::move(Binary), Format); - return Binaries.back(); -} - std::optional JITContext::getCacheEntry(CacheKeyT &Identifier) const { ReadLockT ReadLock{CacheMutex}; diff --git a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp index e2f321bbcb81b..be7515d935247 100644 --- a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp +++ b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp @@ -48,6 +48,22 @@ gatherNDRanges(llvm::ArrayRef KernelInformation) { return NDRanges; } +static bool isTargetFormatSupported(BinaryFormat TargetFormat) { + switch (TargetFormat) { + case BinaryFormat::SPIRV: + return true; + case BinaryFormat::PTX: { +#ifdef FUSION_JIT_SUPPORT_PTX + return true; +#else // FUSION_JIT_SUPPORT_PTX + return false; +#endif // FUSION_JIT_SUPPORT_PTX + } + default: + return false; + } +} + FusionResult KernelFusion::fuseKernels( JITContext &JITCtx, Config &&JITConfig, const std::vector &KernelInformation, @@ -71,6 +87,12 @@ FusionResult KernelFusion::fuseKernels( bool IsHeterogeneousList = jit_compiler::isHeterogeneousList(NDRanges); BinaryFormat TargetFormat = ConfigHelper::get(); + + if (!isTargetFormatSupported(TargetFormat)) { + return FusionResult( + "Fusion output target format not supported by this build"); + } + if (TargetFormat == BinaryFormat::PTX && IsHeterogeneousList) { return FusionResult{"Heterogeneous ND ranges not supported for CUDA"}; } diff --git a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp index fb2e2b15200a7..965202fe5bbfa 100644 --- a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp +++ b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp @@ -23,8 +23,8 @@ #ifndef NDEBUG #include "llvm/IR/Verifier.h" #endif // NDEBUG -#include "llvm/ADT/Triple.h" #include "llvm/Passes/PassBuilder.h" +#include "llvm/TargetParser/Triple.h" #include "llvm/Transforms/InstCombine/InstCombine.h" #include "llvm/Transforms/Scalar/ADCE.h" #include "llvm/Transforms/Scalar/EarlyCSE.h" @@ -103,7 +103,7 @@ FusionPipeline::runFusionPasses(Module &Mod, SYCLModuleInfo &InputInfo, // to/from generic address-space as possible, because these hinder // internalization. // Ideally, the static compiler should have performed that job. - unsigned FlatAddressSpace = getFlatAddressSpace(Mod); + const unsigned FlatAddressSpace = getFlatAddressSpace(Mod); FPM.addPass(InferAddressSpacesPass(FlatAddressSpace)); MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); } diff --git a/sycl-fusion/jit-compiler/lib/fusion/ModuleHelper.cpp b/sycl-fusion/jit-compiler/lib/fusion/ModuleHelper.cpp index 0a90600530f74..0d6ac7f48fbbe 100644 --- a/sycl-fusion/jit-compiler/lib/fusion/ModuleHelper.cpp +++ b/sycl-fusion/jit-compiler/lib/fusion/ModuleHelper.cpp @@ -24,7 +24,7 @@ helper::ModuleHelper::cloneAndPruneModule(Module *Mod, identifyUnusedFunctions(Mod, CGRoots, UnusedFunctions); { - auto TFI = llvm::TargetFusionInfo::getTargetFusionInfo(Mod); + TargetFusionInfo TFI{Mod}; SmallVector Unused{UnusedFunctions.begin(), UnusedFunctions.end()}; TFI.notifyFunctionsDelete(Unused); diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp index b877829782658..8b1f3d913026d 100644 --- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp +++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "KernelTranslation.h" + #include "SPIRVLLVMTranslation.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/IR/Constants.h" @@ -182,7 +183,8 @@ llvm::Error KernelTranslator::translateKernel(SYCLKernelInfo &Kernel, break; } case BinaryFormat::PTX: { - llvm::Expected BinaryOrError = translateToPTX(Mod, JITCtx); + llvm::Expected BinaryOrError = + translateToPTX(Kernel, Mod, JITCtx); if (auto Error = BinaryOrError.takeError()) { return Error; } @@ -215,12 +217,20 @@ KernelTranslator::translateToSPIRV(llvm::Module &Mod, JITContext &JITCtx) { } llvm::Expected -KernelTranslator::translateToPTX(llvm::Module &Mod, JITContext &JITCtx) { - // FIXME: Can we limit this to the NVPTX specific target? - llvm::InitializeAllTargets(); - llvm::InitializeAllAsmParsers(); - llvm::InitializeAllAsmPrinters(); - llvm::InitializeAllTargetMCs(); +KernelTranslator::translateToPTX(SYCLKernelInfo &KernelInfo, llvm::Module &Mod, + JITContext &JITCtx) { +#ifndef FUSION_JIT_SUPPORT_PTX + return createStringError(inconvertibleErrorCode(), + "PTX translation not supported in this build"); +#else // FUSION_JIT_SUPPORT_PTX + LLVMInitializeNVPTXTargetInfo(); + LLVMInitializeNVPTXTarget(); + LLVMInitializeNVPTXAsmPrinter(); + LLVMInitializeNVPTXTargetMC(); +#endif // FUSION_JIT_SUPPORT_PTX + + static const char *TARGET_CPU_ATTRIBUTE = "target-cpu"; + static const char *TARGET_FEATURE_ATTRIBUTE = "target-features"; std::string TargetTriple{"nvptx64-nvidia-cuda"}; @@ -231,13 +241,26 @@ KernelTranslator::translateToPTX(llvm::Module &Mod, JITContext &JITCtx) { if (!Target) { return createStringError( inconvertibleErrorCode(), - "Failed to load and translate SPIR-V module with error %s", + "Failed to load and translate PTX LLVM IR module with error %s", ErrorMessage.c_str()); } + llvm::StringRef TargetCPU{"sm_50"}; + llvm::StringRef TargetFeatures{"+sm_50,+ptx76"}; + if (auto *KernelFunc = Mod.getFunction(KernelInfo.Name)) { + if (KernelFunc->hasFnAttribute(TARGET_CPU_ATTRIBUTE)) { + TargetCPU = + KernelFunc->getFnAttribute(TARGET_CPU_ATTRIBUTE).getValueAsString(); + } + if (KernelFunc->hasFnAttribute(TARGET_FEATURE_ATTRIBUTE)) { + TargetFeatures = KernelFunc->getFnAttribute(TARGET_FEATURE_ATTRIBUTE) + .getValueAsString(); + } + } + // FIXME: Check whether we can provide more accurate target information here auto *TargetMachine = Target->createTargetMachine( - TargetTriple, "sm_50", "+sm_50,+ptx76", {}, llvm::Reloc::PIC_, + TargetTriple, TargetCPU, TargetFeatures, {}, llvm::Reloc::PIC_, std::nullopt, llvm::CodeGenOpt::Default); llvm::legacy::PassManager PM; @@ -259,5 +282,5 @@ KernelTranslator::translateToPTX(llvm::Module &Mod, JITContext &JITCtx) { ASMStream.flush(); } - return &JITCtx.emplaceSPIRVBinary(PTXASM, BinaryFormat::PTX); + return &JITCtx.emplaceKernelBinary(std::move(PTXASM), BinaryFormat::PTX); } diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h index 276b6e536d269..7e4816df9bf94 100644 --- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h +++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h @@ -5,12 +5,14 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +#ifndef SYCL_FUSION_JIT_COMPILER_TRANSLATION_KERNELTRANSLATION_H +#define SYCL_FUSION_JIT_COMPILER_TRANSLATION_KERNELTRANSLATION_H #include "JITContext.h" #include "Kernel.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" -#include +#include "llvm/Support/Error.h" #include namespace jit_compiler { @@ -39,8 +41,10 @@ class KernelTranslator { static llvm::Expected translateToSPIRV(llvm::Module &Mod, JITContext &JITCtx); - static llvm::Expected translateToPTX(llvm::Module &Mod, - JITContext &JITCtx); + static llvm::Expected + translateToPTX(SYCLKernelInfo &Kernel, llvm::Module &Mod, JITContext &JITCtx); }; } // namespace translation } // namespace jit_compiler + +#endif // SYCL_FUSION_JIT_COMPILER_TRANSLATION_KERNELTRANSLATION_H diff --git a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp index a5effb33a85e1..4092f9dd96fc8 100644 --- a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp +++ b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp @@ -94,5 +94,5 @@ SPIRVLLVMTranslator::translateLLVMtoSPIRV(Module &Mod, JITContext &JITCtx) { "Translation of LLVM IR to SPIR-V failed with error %s", ErrMsg.c_str()); } - return &JITCtx.emplaceSPIRVBinary(BinaryStream.str(), BinaryFormat::SPIRV); + return &JITCtx.emplaceKernelBinary(BinaryStream.str(), BinaryFormat::SPIRV); } diff --git a/sycl-fusion/passes/internalization/Internalization.cpp b/sycl-fusion/passes/internalization/Internalization.cpp index bca46ba43acb3..3d5c38c799e8f 100644 --- a/sycl-fusion/passes/internalization/Internalization.cpp +++ b/sycl-fusion/passes/internalization/Internalization.cpp @@ -631,7 +631,7 @@ static void moduleCleanup(Module &M, ModuleAnalysisManager &AM, PreservedAnalyses llvm::SYCLInternalizer::run(Module &M, ModuleAnalysisManager &AM) { - auto TFI = TargetFusionInfo::getTargetFusionInfo(&M); + TargetFusionInfo TFI{&M}; // Private promotion const PreservedAnalyses Tmp = SYCLInternalizerImpl{ TFI.getPrivateAddressSpace(), PrivatePromotion, true, TFI}(M, AM); diff --git a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp index a0079affbaa06..b61a0936eb32e 100644 --- a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp +++ b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp @@ -138,7 +138,7 @@ PreservedAnalyses SYCLKernelFusion::run(Module &M, ModuleAnalysisManager &AM) { AM.getResult(M).ModuleInfo; assert(ModuleInfo && "No module information available"); - auto TFI = TargetFusionInfo::getTargetFusionInfo(&M); + TargetFusionInfo TFI{&M}; // Iterate over the functions in the module and locate all // stub functions identified by metadata. @@ -456,11 +456,20 @@ Error SYCLKernelFusion::fuseKernel( FT, GlobalValue::LinkageTypes::ExternalLinkage, M.getDataLayout().getProgramAddressSpace(), KernelName->getString(), &M); { + auto DefaultAttr = FusedFunction->getAttributes(); + // Add uniform function attributes, i.e., attributes with identical value on + // each input function, to the fused function. + auto *FirstFunction = InputFunctions.front().F; + for (const auto &UniformKey : TargetInfo.getUniformKernelAttributes()) { + if (FirstFunction->hasFnAttribute(UniformKey)) { + DefaultAttr = DefaultAttr.addFnAttribute( + LLVMCtx, FirstFunction->getFnAttribute(UniformKey)); + } + } // Add the collected parameter attributes to the fused function. // Copying the parameter attributes from their original definition in the // input kernels should be safe and they most likely can't be deducted later // on, as no caller is present in the module. - auto DefaultAttr = FusedFunction->getAttributes(); auto FusedFnAttrs = AttributeList::get(LLVMCtx, DefaultAttr.getFnAttrs(), DefaultAttr.getRetAttrs(), FusedParamAttributes); diff --git a/sycl-fusion/passes/syclcp/SYCLCP.cpp b/sycl-fusion/passes/syclcp/SYCLCP.cpp index 5cb9b00433ef7..b928a33760da0 100644 --- a/sycl-fusion/passes/syclcp/SYCLCP.cpp +++ b/sycl-fusion/passes/syclcp/SYCLCP.cpp @@ -249,7 +249,7 @@ PreservedAnalyses SYCLCP::run(Module &M, ModuleAnalysisManager &AM) { Changed = propagateConstants(F, *ConstantsOrErr) || Changed; } - auto TFI = TargetFusionInfo::getTargetFusionInfo(&M); + TargetFusionInfo TFI{&M}; if (Changed) { moduleCleanup(M, AM, TFI); diff --git a/sycl-fusion/passes/target/TargetFusionInfo.cpp b/sycl-fusion/passes/target/TargetFusionInfo.cpp index c17e2464ac323..27514bbb80269 100644 --- a/sycl-fusion/passes/target/TargetFusionInfo.cpp +++ b/sycl-fusion/passes/target/TargetFusionInfo.cpp @@ -7,234 +7,336 @@ //===----------------------------------------------------------------------===// #include "TargetFusionInfo.h" -#include "llvm/ADT/Triple.h" + #include "llvm/IR/Constants.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsNVPTX.h" +#include "llvm/TargetParser/Triple.h" -using namespace llvm; +namespace llvm { +class TargetFusionInfoImpl { -// -// TargetFusionInfo -// +public: + explicit TargetFusionInfoImpl(llvm::Module *Mod) : LLVMMod{Mod} {}; -TargetFusionInfo TargetFusionInfo::getTargetFusionInfo(llvm::Module *Mod) { - llvm::Triple Tri(Mod->getTargetTriple()); - if (Tri.isNVPTX()) { - return TargetFusionInfo( - std::shared_ptr(new NVPTXTargetFusionInfo(Mod))); - } - if (Tri.isSPIRV() || Tri.isSPIR()) { - return TargetFusionInfo( - std::shared_ptr(new SPIRVTargetFusionInfo(Mod))); - } - assert(false && "Unsupported target for fusion"); -} + virtual ~TargetFusionInfoImpl() = default; + + virtual void notifyFunctionsDelete( + [[maybe_unused]] llvm::ArrayRef Funcs) const {} + + virtual void addKernelFunction([[maybe_unused]] Function *KernelFunc) const {} + + virtual void postProcessKernel([[maybe_unused]] Function *KernelFunc) const {} + + virtual ArrayRef getKernelMetadataKeys() const { return {}; } + + virtual ArrayRef getUniformKernelAttributes() const { return {}; } + + virtual void createBarrierCall(IRBuilderBase &Builder, + int BarrierFlags) const = 0; + + virtual unsigned getPrivateAddressSpace() const = 0; + + virtual unsigned getLocalAddressSpace() const = 0; + + virtual void + updateAddressSpaceMetadata([[maybe_unused]] Function *KernelFunc, + [[maybe_unused]] ArrayRef LocalSize, + [[maybe_unused]] unsigned AddressSpace) const {} + +protected: + llvm::Module *LLVMMod; +}; // // SPIRVTargetFusionInfo // +class SPIRVTargetFusionInfo : public TargetFusionInfoImpl { +public: + using TargetFusionInfoImpl::TargetFusionInfoImpl; -void SPIRVTargetFusionInfo::addKernelFunction(Function *KernelFunc) const { - KernelFunc->setCallingConv(CallingConv::SPIR_KERNEL); -} - -ArrayRef SPIRVTargetFusionInfo::getKernelMetadataKeys() const { - // NOTE: We do not collect the "kernel_arg_name" metadata, because - // the kernel arguments receive new names in the fused kernel. - static SmallVector Keys{ - {"kernel_arg_addr_space", "kernel_arg_access_qual", "kernel_arg_type", - "kernel_arg_base_type", "kernel_arg_type_qual"}}; - return Keys; -} + void addKernelFunction(Function *KernelFunc) const override { + KernelFunc->setCallingConv(CallingConv::SPIR_KERNEL); + } -void SPIRVTargetFusionInfo::postProcessKernel(Function *KernelFunc) const { - // Attach the kernel_arg_name metadata. - SmallVector KernelArgNames; - for (auto &P : KernelFunc->args()) { - KernelArgNames.push_back(MDString::get(LLVMMod->getContext(), P.getName())); + ArrayRef getKernelMetadataKeys() const override { + // NOTE: We do not collect the "kernel_arg_name" metadata, because + // the kernel arguments receive new names in the fused kernel. + static SmallVector Keys{ + {"kernel_arg_addr_space", "kernel_arg_access_qual", "kernel_arg_type", + "kernel_arg_base_type", "kernel_arg_type_qual"}}; + return Keys; } - auto *ArgNameMD = MDTuple::get(LLVMMod->getContext(), KernelArgNames); - KernelFunc->setMetadata("kernel_arg_name", ArgNameMD); - - static constexpr auto ITTStartWrapper = "__itt_offload_wi_start_wrapper"; - static constexpr auto ITTFinishWrapper = "__itt_offload_wi_finish_wrapper"; - // Remove all existing calls of the ITT instrumentation functions. Insert new - // ones in the entry block of the fused kernel and every exit block if the - // functions are present in the module. - // We cannot use the existing SPIRITTAnnotations pass, because that pass might - // insert calls to functions not present in the module (e.g., ITT - // instrumentations for barriers). As the JITed module is not linked with - // libdevice anymore, the functions would remain unresolved and cause the - // driver to fail. - Function *StartWrapperFunc = LLVMMod->getFunction(ITTStartWrapper); - Function *FinishWrapperFunc = LLVMMod->getFunction(ITTFinishWrapper); - bool InsertWrappers = - ((StartWrapperFunc && !StartWrapperFunc->isDeclaration()) && - (FinishWrapperFunc && !FinishWrapperFunc->isDeclaration())); - auto *WrapperFuncTy = FunctionType::get( - Type::getVoidTy(LLVMMod->getContext()), /*isVarArg*/ false); - for (auto &BB : *KernelFunc) { - for (auto Inst = BB.begin(); Inst != BB.end();) { - if (auto *CB = dyn_cast(Inst)) { - if (CB->getCalledFunction()->getName().starts_with("__itt_offload")) { - Inst = Inst->eraseFromParent(); - continue; + + void postProcessKernel(Function *KernelFunc) const override { + // Attach the kernel_arg_name metadata. + SmallVector KernelArgNames; + for (auto &P : KernelFunc->args()) { + KernelArgNames.push_back( + MDString::get(LLVMMod->getContext(), P.getName())); + } + auto *ArgNameMD = MDTuple::get(LLVMMod->getContext(), KernelArgNames); + KernelFunc->setMetadata("kernel_arg_name", ArgNameMD); + + static constexpr auto ITTStartWrapper = "__itt_offload_wi_start_wrapper"; + static constexpr auto ITTFinishWrapper = "__itt_offload_wi_finish_wrapper"; + // Remove all existing calls of the ITT instrumentation functions. Insert + // new ones in the entry block of the fused kernel and every exit block if + // the functions are present in the module. We cannot use the existing + // SPIRITTAnnotations pass, because that pass might insert calls to + // functions not present in the module (e.g., ITT instrumentations for + // barriers). As the JITed module is not linked with libdevice anymore, the + // functions would remain unresolved and cause the driver to fail. + Function *StartWrapperFunc = LLVMMod->getFunction(ITTStartWrapper); + Function *FinishWrapperFunc = LLVMMod->getFunction(ITTFinishWrapper); + bool InsertWrappers = + ((StartWrapperFunc && !StartWrapperFunc->isDeclaration()) && + (FinishWrapperFunc && !FinishWrapperFunc->isDeclaration())); + auto *WrapperFuncTy = FunctionType::get( + Type::getVoidTy(LLVMMod->getContext()), /*isVarArg*/ false); + for (auto &BB : *KernelFunc) { + for (auto Inst = BB.begin(); Inst != BB.end();) { + if (auto *CB = dyn_cast(Inst)) { + if (CB->getCalledFunction()->getName().starts_with("__itt_offload")) { + Inst = Inst->eraseFromParent(); + continue; + } + } + ++Inst; + } + if (InsertWrappers) { + if (ReturnInst *RI = dyn_cast(BB.getTerminator())) { + auto *WrapperCall = + CallInst::Create(WrapperFuncTy, FinishWrapperFunc, "", RI); + WrapperCall->setCallingConv(CallingConv::SPIR_FUNC); } } - ++Inst; } if (InsertWrappers) { - if (ReturnInst *RI = dyn_cast(BB.getTerminator())) { - auto *WrapperCall = - CallInst::Create(WrapperFuncTy, FinishWrapperFunc, "", RI); - WrapperCall->setCallingConv(CallingConv::SPIR_FUNC); - } + KernelFunc->getEntryBlock().getFirstInsertionPt(); + auto *WrapperCall = + CallInst::Create(WrapperFuncTy, StartWrapperFunc, "", + &*KernelFunc->getEntryBlock().getFirstInsertionPt()); + WrapperCall->setCallingConv(CallingConv::SPIR_FUNC); } } - if (InsertWrappers) { - KernelFunc->getEntryBlock().getFirstInsertionPt(); - auto *WrapperCall = - CallInst::Create(WrapperFuncTy, StartWrapperFunc, "", - &*KernelFunc->getEntryBlock().getFirstInsertionPt()); - WrapperCall->setCallingConv(CallingConv::SPIR_FUNC); - } -} -void SPIRVTargetFusionInfo::createBarrierCall(IRBuilderBase &Builder, - int BarrierFlags) const { - if (BarrierFlags == -1) { - return; - } - assert((BarrierFlags == 1 || BarrierFlags == 2 || BarrierFlags == 3) && - "Invalid barrier flags"); + void createBarrierCall(IRBuilderBase &Builder, + int BarrierFlags) const override { + if (BarrierFlags == -1) { + return; + } + assert((BarrierFlags == 1 || BarrierFlags == 2 || BarrierFlags == 3) && + "Invalid barrier flags"); + + static const auto FnAttrs = AttributeSet::get( + LLVMMod->getContext(), + {Attribute::get(LLVMMod->getContext(), Attribute::AttrKind::Convergent), + Attribute::get(LLVMMod->getContext(), Attribute::AttrKind::NoUnwind)}); - static const auto FnAttrs = AttributeSet::get( - LLVMMod->getContext(), - {Attribute::get(LLVMMod->getContext(), Attribute::AttrKind::Convergent), - Attribute::get(LLVMMod->getContext(), Attribute::AttrKind::NoUnwind)}); + static constexpr StringLiteral N{"_Z22__spirv_ControlBarrierjjj"}; - static constexpr StringLiteral N{"_Z22__spirv_ControlBarrierjjj"}; + Function *F = LLVMMod->getFunction(N); + if (!F) { + constexpr auto Linkage = GlobalValue::LinkageTypes::ExternalLinkage; - Function *F = LLVMMod->getFunction(N); - if (!F) { - constexpr auto Linkage = GlobalValue::LinkageTypes::ExternalLinkage; + auto *Ty = FunctionType::get( + Builder.getVoidTy(), + {Builder.getInt32Ty(), Builder.getInt32Ty(), Builder.getInt32Ty()}, + false /* isVarArg*/); - auto *Ty = FunctionType::get( - Builder.getVoidTy(), - {Builder.getInt32Ty(), Builder.getInt32Ty(), Builder.getInt32Ty()}, - false /* isVarArg*/); + F = Function::Create(Ty, Linkage, N, *LLVMMod); - F = Function::Create(Ty, Linkage, N, *LLVMMod); + F->setAttributes( + AttributeList::get(LLVMMod->getContext(), FnAttrs, {}, {})); + F->setCallingConv(CallingConv::SPIR_FUNC); + } + + // See + // https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#Memory_Semantics_-id- + SmallVector Args{ + Builder.getInt32(/*Exec Scope : Workgroup = */ 2), + Builder.getInt32(/*Exec Scope : Workgroup = */ 2), + Builder.getInt32(0x10 | (BarrierFlags % 2 == 1 ? 0x100 : 0x0) | + ((BarrierFlags >> 1 == 1 ? 0x200 : 0x0)))}; - F->setAttributes( + auto *BarrierCallInst = Builder.CreateCall(F, Args); + BarrierCallInst->setAttributes( AttributeList::get(LLVMMod->getContext(), FnAttrs, {}, {})); - F->setCallingConv(CallingConv::SPIR_FUNC); + BarrierCallInst->setCallingConv(CallingConv::SPIR_FUNC); } - // See - // https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#Memory_Semantics_-id- - SmallVector Args{ - Builder.getInt32(/*Exec Scope : Workgroup = */ 2), - Builder.getInt32(/*Exec Scope : Workgroup = */ 2), - Builder.getInt32(0x10 | (BarrierFlags % 2 == 1 ? 0x100 : 0x0) | - ((BarrierFlags >> 1 == 1 ? 0x200 : 0x0)))}; - - auto *BarrierCallInst = Builder.CreateCall(F, Args); - BarrierCallInst->setAttributes( - AttributeList::get(LLVMMod->getContext(), FnAttrs, {}, {})); - BarrierCallInst->setCallingConv(CallingConv::SPIR_FUNC); -} + // Corresponds to definition of spir_private and spir_local in + // "clang/lib/Basic/Target/SPIR.h", "SPIRDefIsGenMap". + unsigned getPrivateAddressSpace() const override { return 0; } + unsigned getLocalAddressSpace() const override { return 3; } -void SPIRVTargetFusionInfo::updateAddressSpaceMetadata( - Function *KernelFunc, ArrayRef LocalSize, - unsigned AddressSpace) const { - static constexpr unsigned AddressSpaceBitWidth{32}; - static constexpr StringLiteral KernelArgAddrSpaceMD{"kernel_arg_addr_space"}; - - auto *NewAddrspace = ConstantAsMetadata::get(ConstantInt::get( - IntegerType::get(LLVMMod->getContext(), AddressSpaceBitWidth), - AddressSpace)); - if (auto *AddrspaceMD = dyn_cast_or_null( - KernelFunc->getMetadata(KernelArgAddrSpaceMD))) { - // If we have kernel_arg_addr_space metadata in the original function, - // we should update it in the new one. - SmallVector NewInfo{AddrspaceMD->op_begin(), - AddrspaceMD->op_end()}; - for (auto I : enumerate(LocalSize)) { - if (I.value() == 0) { - continue; - } - const auto Index = I.index(); - if (const auto *PtrTy = - dyn_cast(KernelFunc->getArg(Index)->getType())) { - if (PtrTy->getAddressSpace() == getLocalAddressSpace()) { - NewInfo[Index] = NewAddrspace; + void updateAddressSpaceMetadata(Function *KernelFunc, + ArrayRef LocalSize, + unsigned AddressSpace) const override { + static constexpr unsigned AddressSpaceBitWidth{32}; + static constexpr StringLiteral KernelArgAddrSpaceMD{ + "kernel_arg_addr_space"}; + + auto *NewAddrspace = ConstantAsMetadata::get(ConstantInt::get( + IntegerType::get(LLVMMod->getContext(), AddressSpaceBitWidth), + AddressSpace)); + if (auto *AddrspaceMD = dyn_cast_or_null( + KernelFunc->getMetadata(KernelArgAddrSpaceMD))) { + // If we have kernel_arg_addr_space metadata in the original function, + // we should update it in the new one. + SmallVector NewInfo{AddrspaceMD->op_begin(), + AddrspaceMD->op_end()}; + for (auto I : enumerate(LocalSize)) { + if (I.value() == 0) { + continue; + } + const auto Index = I.index(); + if (const auto *PtrTy = + dyn_cast(KernelFunc->getArg(Index)->getType())) { + if (PtrTy->getAddressSpace() == getLocalAddressSpace()) { + NewInfo[Index] = NewAddrspace; + } } } + KernelFunc->setMetadata(KernelArgAddrSpaceMD, + MDNode::get(KernelFunc->getContext(), NewInfo)); } - KernelFunc->setMetadata(KernelArgAddrSpaceMD, - MDNode::get(KernelFunc->getContext(), NewInfo)); } -} +}; // // NVPTXTargetFusionInfo // +class NVPTXTargetFusionInfo : public TargetFusionInfoImpl { +public: + using TargetFusionInfoImpl::TargetFusionInfoImpl; -void NVPTXTargetFusionInfo::notifyFunctionsDelete( - llvm::ArrayRef Funcs) const { - SmallPtrSet DeletedFuncs{Funcs.begin(), Funcs.end()}; - SmallVector ValidKernels; - auto *OldAnnotations = LLVMMod->getNamedMetadata("nvvm.annotations"); - for (auto *Op : OldAnnotations->operands()) { - if (auto *TOp = dyn_cast(Op)) { - if (auto *COp = dyn_cast_if_present( - TOp->getOperand(0).get())) { - if (!DeletedFuncs.contains(COp->getValue())) { - ValidKernels.push_back(Op); - // Add to the set to also remove duplicate entries. - DeletedFuncs.insert(COp->getValue()); + void notifyFunctionsDelete(llvm::ArrayRef Funcs) const override { + SmallPtrSet DeletedFuncs{Funcs.begin(), Funcs.end()}; + SmallVector ValidKernels; + auto *OldAnnotations = LLVMMod->getNamedMetadata("nvvm.annotations"); + for (auto *Op : OldAnnotations->operands()) { + if (auto *TOp = dyn_cast(Op)) { + if (auto *COp = dyn_cast_if_present( + TOp->getOperand(0).get())) { + if (!DeletedFuncs.contains(COp->getValue())) { + ValidKernels.push_back(Op); + // Add to the set to also remove duplicate entries. + DeletedFuncs.insert(COp->getValue()); + } } } } + LLVMMod->eraseNamedMetadata(OldAnnotations); + auto *NewAnnotations = + LLVMMod->getOrInsertNamedMetadata("nvvm.annotations"); + for (auto *Kernel : ValidKernels) { + NewAnnotations->addOperand(Kernel); + } + } + + void addKernelFunction(Function *KernelFunc) const override { + auto *NVVMAnnotations = + LLVMMod->getOrInsertNamedMetadata("nvvm.annotations"); + auto *MDOne = ConstantAsMetadata::get( + ConstantInt::get(Type::getInt32Ty(LLVMMod->getContext()), 1)); + auto *MDKernelString = MDString::get(LLVMMod->getContext(), "kernel"); + auto *MDFunc = ConstantAsMetadata::get(KernelFunc); + SmallVector KernelMD({MDFunc, MDKernelString, MDOne}); + auto *Tuple = MDTuple::get(LLVMMod->getContext(), KernelMD); + NVVMAnnotations->addOperand(Tuple); + } + + ArrayRef getKernelMetadataKeys() const override { + // FIXME: Check whether we need to take care of sycl_fixed_targets. + static SmallVector Keys{{"kernel_arg_buffer_location", + "kernel_arg_runtime_aligned", + "kernel_arg_exclusive_ptr"}}; + return Keys; + } + + ArrayRef getUniformKernelAttributes() const override { + static SmallVector Keys{ + {"target-cpu", "target-features", "uniform-work-group-size"}}; + return Keys; + } + + void createBarrierCall(IRBuilderBase &Builder, + int BarrierFlags) const override { + if (BarrierFlags == -1) { + return; + } + // Emit a call to llvm.nvvm.barrier0. From the user manual of the NVPTX + // backend: "The ‘@llvm.nvvm.barrier0()’ intrinsic emits a PTX bar.sync 0 + // instruction, equivalent to the __syncthreads() call in CUDA." + Builder.CreateIntrinsic(Intrinsic::NVVMIntrinsics::nvvm_barrier0, {}, {}); + } + + // Corresponds to the definitions in the LLVM NVPTX backend user guide: + // https://llvm.org/docs/NVPTXUsage.html#address-spaces + unsigned getPrivateAddressSpace() const override { return 0; } + unsigned getLocalAddressSpace() const override { return 3; } +}; + +// +// TargetFusionInfo +// + +TargetFusionInfo::TargetFusionInfo(llvm::Module *Mod) { + llvm::Triple Tri(Mod->getTargetTriple()); + if (Tri.isNVPTX()) { + Impl = std::make_shared(Mod); + return; } - LLVMMod->eraseNamedMetadata(OldAnnotations); - auto *NewAnnotations = LLVMMod->getOrInsertNamedMetadata("nvvm.annotations"); - for (auto *Kernel : ValidKernels) { - NewAnnotations->addOperand(Kernel); + if (Tri.isSPIRV() || Tri.isSPIR()) { + Impl = std::make_shared(Mod); + return; } + llvm_unreachable("Unsupported target for fusion"); } -void NVPTXTargetFusionInfo::addKernelFunction(Function *KernelFunc) const { - auto *NVVMAnnotations = LLVMMod->getOrInsertNamedMetadata("nvvm.annotations"); - auto *MDOne = ConstantAsMetadata::get( - ConstantInt::get(Type::getInt32Ty(LLVMMod->getContext()), 1)); - auto *MDKernelString = MDString::get(LLVMMod->getContext(), "kernel"); - auto *MDFunc = ConstantAsMetadata::get(KernelFunc); - SmallVector KernelMD({MDFunc, MDKernelString, MDOne}); - auto *Tuple = MDTuple::get(LLVMMod->getContext(), KernelMD); - NVVMAnnotations->addOperand(Tuple); +void TargetFusionInfo::notifyFunctionsDelete( + llvm::ArrayRef Funcs) const { + Impl->notifyFunctionsDelete(Funcs); } -ArrayRef NVPTXTargetFusionInfo::getKernelMetadataKeys() const { - // FIXME: Check whether we need to take care of sycl_fixed_targets. - static SmallVector Keys{{"kernel_arg_buffer_location", - "kernel_arg_runtime_aligned", - "kernel_arg_exclusive_ptr"}}; - return Keys; +void TargetFusionInfo::addKernelFunction(llvm::Function *KernelFunc) const { + Impl->addKernelFunction(KernelFunc); } -void NVPTXTargetFusionInfo::createBarrierCall(IRBuilderBase &Builder, - int BarrierFlags) const { - if (BarrierFlags == -1) { - return; - } - // Emit a call to llvm.nvvm.barrier0. From the user manual of the NVPTX - // backend: "The ‘@llvm.nvvm.barrier0()’ intrinsic emits a PTX bar.sync 0 - // instruction, equivalent to the __syncthreads() call in CUDA." - Builder.CreateIntrinsic(Intrinsic::NVVMIntrinsics::nvvm_barrier0, {}, {}); +void TargetFusionInfo::postProcessKernel(Function *KernelFunc) const { + Impl->postProcessKernel(KernelFunc); +} + +llvm::ArrayRef +TargetFusionInfo::getKernelMetadataKeys() const { + return Impl->getKernelMetadataKeys(); +} + +void TargetFusionInfo::createBarrierCall(IRBuilderBase &Builder, + int BarrierFlags) const { + Impl->createBarrierCall(Builder, BarrierFlags); +} + +unsigned TargetFusionInfo::getPrivateAddressSpace() const { + return Impl->getPrivateAddressSpace(); +} + +unsigned TargetFusionInfo::getLocalAddressSpace() const { + return Impl->getLocalAddressSpace(); +} + +void TargetFusionInfo::updateAddressSpaceMetadata(Function *KernelFunc, + ArrayRef LocalSize, + unsigned AddressSpace) const { + Impl->updateAddressSpaceMetadata(KernelFunc, LocalSize, AddressSpace); +} + +llvm::ArrayRef +TargetFusionInfo::getUniformKernelAttributes() const { + return Impl->getUniformKernelAttributes(); } // @@ -265,3 +367,5 @@ void MetadataCollection::attachToFunction(llvm::Function *Func) { Func->setMetadata(Key, MDEntries); } } + +} // namespace llvm diff --git a/sycl-fusion/passes/target/TargetFusionInfo.h b/sycl-fusion/passes/target/TargetFusionInfo.h index c307de2dd764e..f88476c01ebc3 100644 --- a/sycl-fusion/passes/target/TargetFusionInfo.h +++ b/sycl-fusion/passes/target/TargetFusionInfo.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef SYCL_FUSION_PASSES_TARGETFUSIONINFO_H -#define SYCL_FUSION_PASSES_TARGETFUSIONINFO_H +#ifndef SYCL_FUSION_PASSES_TARGET_TARGETFUSIONINFO_H +#define SYCL_FUSION_PASSES_TARGET_TARGETFUSIONINFO_H #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" @@ -15,90 +15,7 @@ namespace llvm { -class TargetFusionInfoImpl { - -public: - virtual ~TargetFusionInfoImpl() = default; - - virtual void notifyFunctionsDelete(llvm::ArrayRef Funcs) const { - (void)Funcs; - } - - virtual void addKernelFunction(Function *KernelFunc) const { - (void)KernelFunc; - } - - virtual void postProcessKernel(Function *KernelFunc) const { - (void)KernelFunc; - } - - virtual ArrayRef getKernelMetadataKeys() const { return {}; } - - virtual void createBarrierCall(IRBuilderBase &Builder, - int BarrierFlags) const = 0; - - virtual unsigned getPrivateAddressSpace() const = 0; - - virtual unsigned getLocalAddressSpace() const = 0; - - virtual void updateAddressSpaceMetadata(Function *KernelFunc, - ArrayRef LocalSize, - unsigned AddressSpace) const { - (void)KernelFunc; - (void)LocalSize; - } - -protected: - explicit TargetFusionInfoImpl(llvm::Module *Mod) : LLVMMod{Mod} {}; - - llvm::Module *LLVMMod; - - friend class TargetFusionInfo; -}; - -class SPIRVTargetFusionInfo : public TargetFusionInfoImpl { -public: - void addKernelFunction(Function *KernelFunc) const override; - - ArrayRef getKernelMetadataKeys() const override; - - void postProcessKernel(Function *KernelFunc) const override; - - void createBarrierCall(IRBuilderBase &Builder, - int BarrierFlags) const override; - - // Corresponds to definition of spir_private and spir_local in - // "clang/lib/Basic/Target/SPIR.h", "SPIRDefIsGenMap". - unsigned getPrivateAddressSpace() const override { return 0; } - unsigned getLocalAddressSpace() const override { return 3; } - - void updateAddressSpaceMetadata(Function *KernelFunc, - ArrayRef LocalSize, - unsigned AddressSpace) const override; - -private: - using TargetFusionInfoImpl::TargetFusionInfoImpl; -}; - -class NVPTXTargetFusionInfo : public TargetFusionInfoImpl { -public: - void notifyFunctionsDelete(llvm::ArrayRef Funcs) const override; - - void addKernelFunction(Function *KernelFunc) const override; - - ArrayRef getKernelMetadataKeys() const override; - - void createBarrierCall(IRBuilderBase &Builder, - int BarrierFlags) const override; - - // Corresponds to the definitions in the LLVM NVPTX backend user guide: - // https://llvm.org/docs/NVPTXUsage.html#address-spaces - unsigned getPrivateAddressSpace() const override { return 0; } - unsigned getLocalAddressSpace() const override { return 3; } - -private: - using TargetFusionInfoImpl::TargetFusionInfoImpl; -}; +class TargetFusionInfoImpl; /// /// Common interface to target-specific logic around handling of kernel @@ -108,59 +25,49 @@ class TargetFusionInfo { /// /// Create the correct target-specific implementation based on the target /// triple of \p Module. - static TargetFusionInfo getTargetFusionInfo(llvm::Module *Module); + explicit TargetFusionInfo(llvm::Module *Module); /// /// Notify the target-specific implementation that set of functions \p Funcs /// is about to be erased from the module. This should be called BEFORE /// erasing the functions. - void notifyFunctionsDelete(llvm::ArrayRef Funcs) const { - Impl->notifyFunctionsDelete(Funcs); - } + void notifyFunctionsDelete(llvm::ArrayRef Funcs) const; /// /// Notify the target-specific implementation that the function \p KernelFunc /// was added as a new kernel. This should be called AFTER the function has /// been added. - void addKernelFunction(llvm::Function *KernelFunc) const { - Impl->addKernelFunction(KernelFunc); - } + void addKernelFunction(llvm::Function *KernelFunc) const; /// /// Target-specific post-processing of the new kernel function \p KernelFunc. /// This should be called AFTER the function has been added and defined. - void postProcessKernel(Function *KernelFunc) const { - Impl->postProcessKernel(KernelFunc); - } + void postProcessKernel(Function *KernelFunc) const; /// /// Get the target-specific list of argument metadata attached to each /// function that should be collected and attached to the fused kernel. - llvm::ArrayRef getKernelMetadataKeys() const { - return Impl->getKernelMetadataKeys(); - } + llvm::ArrayRef getKernelMetadataKeys() const; + + /// + /// Get the target-specific list of kernel function attributes that are + /// uniform across all input kernels and should be attached to the fused + /// kernel. + llvm::ArrayRef getUniformKernelAttributes() const; - void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) const { - Impl->createBarrierCall(Builder, BarrierFlags); - } + void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) const; - unsigned getPrivateAddressSpace() const { - return Impl->getPrivateAddressSpace(); - } + unsigned getPrivateAddressSpace() const; - unsigned getLocalAddressSpace() const { return Impl->getLocalAddressSpace(); } + unsigned getLocalAddressSpace() const; void updateAddressSpaceMetadata(Function *KernelFunc, ArrayRef LocalSize, - unsigned AddressSpace) const { - Impl->updateAddressSpaceMetadata(KernelFunc, LocalSize, AddressSpace); - } + unsigned AddressSpace) const; private: using ImplPtr = std::shared_ptr; - TargetFusionInfo(ImplPtr &&I) : Impl{I} {} - ImplPtr Impl; }; @@ -183,4 +90,4 @@ class MetadataCollection { }; } // namespace llvm -#endif // SYCL_FUSION_PASSES_TARGETFUSIONINFO_H +#endif // SYCL_FUSION_PASSES_TARGET_TARGETFUSIONINFO_H diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp index 4a0d004ff8a11..7c96bef0ec5b8 100644 --- a/sycl/source/detail/device_info.hpp +++ b/sycl/source/detail/device_info.hpp @@ -762,7 +762,8 @@ struct get_device_info_impl< // and LevelZero. (void)dev; return (Plugin.getBackend() == backend::ext_oneapi_level_zero) || - (Plugin.getBackend() == backend::opencl); + (Plugin.getBackend() == backend::opencl) || + (Plugin.getBackend() == backend::ext_oneapi_cuda); #else // SYCL_EXT_CODEPLAY_KERNEL_FUSION (void)dev; (void)Plugin; diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index de07a86623219..ae172158e5e67 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -62,21 +62,16 @@ retrieveKernelBinary(QueueImplPtr &Queue, CGExecKernel *KernelCG) { backend::ext_oneapi_cuda; if (isNvidia) { auto KernelID = ProgramManager::getInstance().getSYCLKernelID(KernelName); - std::vector KernelIds; - KernelIds.push_back(KernelID); + std::vector KernelIds{KernelID}; auto DeviceImages = ProgramManager::getInstance().getRawDeviceImages(KernelIds); - const RTDeviceBinaryImage *DeviceImage = nullptr; - for (auto *DI : DeviceImages) { - // We are looking for a device image with LLVM IR format and target spec - // "llvm_nvptx64", which has been set by the offload-wrapper action. - if (DI->getFormat() == PI_DEVICE_BINARY_TYPE_LLVMIR_BITCODE && - DI->getRawData().DeviceTargetSpec == std::string("llvm_nvptx64")) { - DeviceImage = DI; - break; - } - } - if (!DeviceImage) { + auto DeviceImage = std::find_if( + DeviceImages.begin(), DeviceImages.end(), [](RTDeviceBinaryImage *DI) { + return DI->getFormat() == PI_DEVICE_BINARY_TYPE_LLVMIR_BITCODE && + DI->getRawData().DeviceTargetSpec == + std::string("llvm_nvptx64"); + }); + if (DeviceImage == DeviceImages.end()) { return {nullptr, nullptr}; } auto ContextImpl = Queue->getContextImplPtr(); @@ -84,9 +79,9 @@ retrieveKernelBinary(QueueImplPtr &Queue, CGExecKernel *KernelCG) { auto DeviceImpl = Queue->getDeviceImplPtr(); auto Device = detail::createSyclObjFromImpl(DeviceImpl); RT::PiProgram Program = - detail::ProgramManager::getInstance().createPIProgram(*DeviceImage, + detail::ProgramManager::getInstance().createPIProgram(**DeviceImage, Context, Device); - return {DeviceImage, Program}; + return {*DeviceImage, Program}; } const RTDeviceBinaryImage *DeviceImage = nullptr; @@ -460,6 +455,11 @@ static ParamIterator preProcessArguments( if (Arg->Arg.MPtr) { Arg->Arg.MPtr = storePlainArgRaw(ArgStorage, Arg->Arg.MPtr, Arg->Arg.MSize); + // Propagate values of scalar parameters as constants to the JIT + // compiler. + JITConstants.emplace_back( + ::jit_compiler::Parameter{Arg->KernelIndex, Arg->ArgIndex}, + Arg->Arg.MPtr, Arg->Arg.MSize); } // Standard layout arguments do not participate in identical argument // detection, but we still add it to the list here. As the SYCL runtime can @@ -468,16 +468,10 @@ static ParamIterator preProcessArguments( // not be materialized by the JIT compiler. Instead of removing some // standard layout arguments due to identity and missing some in case the // materialization is not possible, we rely on constant propagation to - // replace standard layout arguments by constants (see below). + // replace standard layout arguments by constants. NonIdenticalParams.emplace_back(Arg->Arg, Arg->KernelIndex, Arg->ArgIndex, true); - // Propagate values of scalar parameters as constants to the JIT - // compiler. - JITConstants.emplace_back( - ::jit_compiler::Parameter{Arg->KernelIndex, Arg->ArgIndex}, - Arg->Arg.MPtr, Arg->Arg.MSize); return ++Arg; - } // First check if there's already another parameter with identical // value. diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 482c7a1d7d309..cdabd6f1f0c68 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -289,7 +289,7 @@ class Command { // XPTI instrumentation. Copy code location details to the internal struct. // Memory is allocated in this method and released in destructor. void copySubmissionCodeLocation(); - + /// Clear all dependency events for device and host dependencies. This should /// only be used if a command is about to be deleted without being executed /// before that. From d323beb24f82f4a1de7b8549df19c7f5a5cbada9 Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Wed, 22 Mar 2023 16:40:10 +0000 Subject: [PATCH 16/16] [SYCL][Fusion] Update linkage graph diagram Signed-off-by: Lukas Sommer --- clang/lib/Driver/Driver.cpp | 106 ++++++++++++++++++------------------ 1 file changed, 54 insertions(+), 52 deletions(-) diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 5c60b7b33fbd8..afb7103e2c1ed 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -5506,6 +5506,8 @@ class OffloadingActionBuilder final { // s - device code split requested // r - relocatable device code is requested // f - link object output type is TY_Tempfilelist (fat archive) + // e - Embedded IR for fusion (-fsycl-embed-ir) was requested + // and target is NVPTX. // * - "all other cases" // - no condition means output/input is "always" present // First symbol indicates output/input type @@ -5525,58 +5527,58 @@ class OffloadingActionBuilder final { // | | // | | // .---------------------------------------. - // | PostLink | - // .---------------------------------------. - // [+*] [+] - // | | - // | | - // |--------- | - // | | | - // | | | - // | [+!rf] | - // | .-------------. | - // | | llvm-foreach| | - // | .-------------. | - // | | | - // [+*] [+!rf] | - // .-----------------. | - // | FileTableTform | | - // | (extract "Code")| | - // .-----------------. | - // [-] |----------- - // --------------------| | - // | | | - // | |----------------- | - // | | | | - // | | [-!rf] | - // | | .--------------. | - // | | |FileTableTform| | - // | | | (merge) | | - // | | .--------------. | - // | | [-] |------- - // | | | | | - // | | | ------| | - // | | --------| | | - // [.] [-*] [-!rf] [+!rf] | - // .---------------. .-------------------. .--------------. | - // | finalizeNVPTX | | SPIRVTranslator | |FileTableTform| | - // | finalizeAMDGCN | | | | (merge) | | - // .---------------. .-------------------. . -------------. | - // [.] [-as] [-!a] | | - // | | | | | - // | [-s] | | | - // | .----------------. | | | - // | | BackendCompile | | | | - // | .----------------. | ------| | - // | [-s] | | | - // | | | | | - // | [-a] [-!a] [-!rf] | - // | .--------------------. | - // -----------[-n]| FileTableTform |[+*]--------------| - // | (replace "Code") | - // .--------------------. - // | - // [+*] + // | PostLink |[+e]---------------- + // .---------------------------------------. | + // [+*] [+] | + // | | | + // | | | + // |--------- | | + // | | | | + // | | | | + // | [+!rf] | | + // | .-------------. | | + // | | llvm-foreach| | | + // | .-------------. | | + // | | | | + // [+*] [+!rf] | | + // .-----------------. | | + // | FileTableTform | | | + // | (extract "Code")| | | + // .-----------------. | | + // [-] |----------- | + // --------------------| | | + // | | | | + // | |----------------- | | + // | | | | | + // | | [-!rf] | | + // | | .--------------. | | + // | | |FileTableTform| | | + // | | | (merge) | | | + // | | .--------------. | | + // | | [-] |------- | + // | | | | | | + // | | | ------| | | + // | | --------| | | | + // [.] [-*] [-!rf] [+!rf] | | + // .---------------. .-------------------. .--------------. | | + // | finalizeNVPTX | | SPIRVTranslator | |FileTableTform| | | + // | finalizeAMDGCN | | | | (merge) | | | + // .---------------. .-------------------. . -------------. | | + // [.] [-as] [-!a] | | | + // | | | | | | + // | [-s] | | | | + // | .----------------. | | | | + // | | BackendCompile | | | | | + // | .----------------. | ------| | | + // | [-s] | | | | + // | | | | | | + // | [-a] [-!a] [-!rf] | | + // | .--------------------. | | + // -----------[-n]| FileTableTform |[+*]--------------| | + // | (replace "Code") | | + // .--------------------. | + // | ------------------------- + // [+*] | [+e] // .--------------------------------------. // | OffloadWrapper | // .--------------------------------------.