From cd2497c0806bb45caf2c2ee5aba2a46166f0bb44 Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Mon, 9 Jan 2023 13:00:37 +0000
Subject: [PATCH 01/16] [SYCL][Fusion] Embed LLVM IR for SYCL for Nvidia

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 clang/include/clang/Driver/Action.h   |  7 ++++++-
 clang/include/clang/Driver/Options.td |  2 ++
 clang/lib/Driver/Action.cpp           |  6 +++---
 clang/lib/Driver/Driver.cpp           | 10 ++++++++++
 clang/lib/Driver/ToolChains/Clang.cpp | 10 +++++++++-
 5 files changed, 30 insertions(+), 5 deletions(-)

diff --git a/clang/include/clang/Driver/Action.h b/clang/include/clang/Driver/Action.h
index 2c9f59cdde9b0..9dab2e32b2ffc 100644
--- a/clang/include/clang/Driver/Action.h
+++ b/clang/include/clang/Driver/Action.h
@@ -660,9 +660,14 @@ class OffloadUnbundlingJobAction final : public JobAction {
 class OffloadWrapperJobAction : public JobAction {
   void anchor() override;
 
+  bool EmbedIR;
+
 public:
   OffloadWrapperJobAction(ActionList &Inputs, types::ID Type);
-  OffloadWrapperJobAction(Action *Input, types::ID OutputType);
+  OffloadWrapperJobAction(Action *Input, types::ID OutputType,
+                          bool IsEmbeddedIR = false);
+
+  bool isEmbeddedIR() const { return EmbedIR; }
 
   static bool classof(const Action *A) {
     return A->getKind() == OffloadWrapperJobClass;
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 73b5e0ae1c905..7eddf97fa527f 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -2955,6 +2955,8 @@ def fintelfpga : Flag<["-"], "fintelfpga">, Group<f_Group>,
   HelpText<"Perform ahead-of-time compilation for FPGA">;
 def fsycl_device_only : Flag<["-"], "fsycl-device-only">, Flags<[CoreOption]>,
   HelpText<"Compile SYCL kernels for device">;
+def fsycl_embed_ir : Flag<["-"], "fsycl-embed-ir">, Flags<[CoreOption]>,
+  HelpText<"Embed LLVM IR for runtime kernel fusion">;
 defm sycl_esimd_force_stateless_mem : BoolFOption<"sycl-esimd-force-stateless-mem",
     LangOpts<"SYCLESIMDForceStatelessMem">, DefaultFalse,
     PosFlag<SetTrue, [], "Enforce using stateless memory accesses. "
diff --git a/clang/lib/Driver/Action.cpp b/clang/lib/Driver/Action.cpp
index e8c29e7a2f147..f55233c59a6db 100644
--- a/clang/lib/Driver/Action.cpp
+++ b/clang/lib/Driver/Action.cpp
@@ -480,9 +480,9 @@ OffloadWrapperJobAction::OffloadWrapperJobAction(ActionList &Inputs,
                                                  types::ID Type)
   : JobAction(OffloadWrapperJobClass, Inputs, Type) {}
 
-OffloadWrapperJobAction::OffloadWrapperJobAction(Action *Input,
-                                                 types::ID Type)
-    : JobAction(OffloadWrapperJobClass, Input, Type) {}
+OffloadWrapperJobAction::OffloadWrapperJobAction(Action *Input, types::ID Type,
+                                                 bool IsEmbeddedIR)
+    : JobAction(OffloadWrapperJobClass, Input, Type), EmbedIR(IsEmbeddedIR) {}
 
 void OffloadPackagerJobAction::anchor() {}
 
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 9185f617dbb0e..5c60b7b33fbd8 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -5683,6 +5683,16 @@ class OffloadingActionBuilder final {
             return TypedPostLinkAction;
           };
           Action *PostLinkAction = createPostLinkAction();
+          if (isNVPTX && Args.hasArg(options::OPT_fsycl_embed_ir)) {
+            // When compiling for Nvidia/CUDA devices and the user requested the
+            // IR to be embedded in the application (via option), run the output
+            // of sycl-post-link (filetable referencing LLVM Bitcode + symbols)
+            // through the offload wrapper and link the resulting object to the
+            // application.
+            auto *WrapBitcodeAction = C.MakeAction<OffloadWrapperJobAction>(
+                PostLinkAction, types::TY_Object, true);
+            DA.add(*WrapBitcodeAction, *TC, BoundArch, Action::OFK_SYCL);
+          }
           bool NoRDCFatStaticArchive =
               !IsRDC &&
               FullDeviceLinkAction->getType() == types::TY_Tempfilelist;
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index f0851ea74c0d2..10c8e42d38ba3 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -9274,6 +9274,14 @@ void OffloadWrapper::ConstructJob(Compilation &C, const JobAction &JA,
       createArgString("-link-opts=");
     }
 
+    bool IsEmbeddedIR = cast<OffloadWrapperJobAction>(JA).isEmbeddedIR();
+    if (IsEmbeddedIR) {
+      // When the offload-wrapper is called to embed LLVM IR, add a prefix to
+      // the target triple to distinguish the LLVM IR from the actual device
+      // binary for that target.
+      TargetTripleOpt = ("llvm_" + TargetTripleOpt).str();
+    }
+
     WrapperArgs.push_back(
         C.getArgs().MakeArgString(Twine("-target=") + TargetTripleOpt));
 
@@ -9295,7 +9303,7 @@ void OffloadWrapper::ConstructJob(Compilation &C, const JobAction &JA,
     assert(I.isFilename() && "Invalid input.");
 
     if (I.getType() == types::TY_Tempfiletable ||
-        I.getType() == types::TY_Tempfilelist)
+        I.getType() == types::TY_Tempfilelist || IsEmbeddedIR)
       // wrapper actual input files are passed via the batch job file table:
       WrapperArgs.push_back(C.getArgs().MakeArgString("-batch"));
     WrapperArgs.push_back(C.getArgs().MakeArgString(I.getFilename()));

From 57bd8f656b4ec3a16f12acaa2615ee81e7d3c007 Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Tue, 10 Jan 2023 13:29:22 +0000
Subject: [PATCH 02/16] Enable LLVM IR as alternative fusion input format;

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 sycl-fusion/jit-compiler/CMakeLists.txt       |   4 +-
 sycl-fusion/jit-compiler/lib/KernelFusion.cpp |   5 +-
 .../lib/translation/LoadKernels.cpp           | 116 ++++++++++++++++++
 .../lib/translation/LoadKernels.h             |  36 ++++++
 .../lib/translation/SPIRVLLVMTranslation.cpp  |  94 +++-----------
 .../lib/translation/SPIRVLLVMTranslation.h    |  13 +-
 sycl/source/detail/jit_compiler.cpp           |  97 ++++++++++-----
 7 files changed, 245 insertions(+), 120 deletions(-)
 create mode 100644 sycl-fusion/jit-compiler/lib/translation/LoadKernels.cpp
 create mode 100644 sycl-fusion/jit-compiler/lib/translation/LoadKernels.h

diff --git a/sycl-fusion/jit-compiler/CMakeLists.txt b/sycl-fusion/jit-compiler/CMakeLists.txt
index de6a73b1eab3d..92f0cefd68634 100644
--- a/sycl-fusion/jit-compiler/CMakeLists.txt
+++ b/sycl-fusion/jit-compiler/CMakeLists.txt
@@ -2,13 +2,15 @@
 add_llvm_library(sycl-fusion
    lib/KernelFusion.cpp
    lib/JITContext.cpp
+   lib/translation/LoadKernels.cpp
    lib/translation/SPIRVLLVMTranslation.cpp
    lib/fusion/FusionPipeline.cpp
    lib/fusion/FusionHelper.cpp
    lib/fusion/ModuleHelper.cpp
    lib/helper/ConfigHelper.cpp
 
-  LINK_COMPONENTS
+   LINK_COMPONENTS
+   BitReader
    Core
    Support
    Analysis
diff --git a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
index 56ea9401c465b..fc168587738b5 100644
--- a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
+++ b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
@@ -15,6 +15,7 @@
 #include "fusion/FusionPipeline.h"
 #include "helper/ConfigHelper.h"
 #include "helper/ErrorHandling.h"
+#include "translation/LoadKernels.h"
 #include "translation/SPIRVLLVMTranslation.h"
 #include <llvm/Support/Error.h>
 #include <sstream>
@@ -97,8 +98,8 @@ FusionResult KernelFusion::fuseKernels(
   // Load all input kernels from their respective SPIR-V modules into a single
   // LLVM IR module.
   llvm::Expected<std::unique_ptr<llvm::Module>> ModOrError =
-      translation::SPIRVLLVMTranslator::loadSPIRVKernels(
-          *JITCtx.getLLVMContext(), ModuleInfo.kernels());
+      translation::KernelLoader::loadKernels(*JITCtx.getLLVMContext(),
+                                             ModuleInfo.kernels());
   if (auto Error = ModOrError.takeError()) {
     return errorToFusionResult(std::move(Error), "SPIR-V translation failed");
   }
diff --git a/sycl-fusion/jit-compiler/lib/translation/LoadKernels.cpp b/sycl-fusion/jit-compiler/lib/translation/LoadKernels.cpp
new file mode 100644
index 0000000000000..cabf1f6c44e9f
--- /dev/null
+++ b/sycl-fusion/jit-compiler/lib/translation/LoadKernels.cpp
@@ -0,0 +1,116 @@
+//==-------------------------- LoadKernels.cpp  ----------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "LoadKernels.h"
+#include "SPIRVLLVMTranslation.h"
+#include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/Support/MemoryBuffer.h"
+
+using namespace jit_compiler;
+using namespace jit_compiler::translation;
+using namespace llvm;
+
+llvm::Expected<std::unique_ptr<llvm::Module>>
+KernelLoader::loadKernels(llvm::LLVMContext &LLVMCtx,
+                          std::vector<SYCLKernelInfo> &Kernels) {
+  std::unique_ptr<Module> Result{nullptr};
+  bool First = true;
+  DenseSet<BinaryBlob> ParsedBinaries;
+  size_t AddressBits = 0;
+  for (auto &Kernel : Kernels) {
+    // FIXME: Currently, we use the front of the list.
+    // Do we need to iterate to find the most suitable
+    // SPIR-V module?
+    SYCLKernelBinaryInfo &BinInfo = Kernel.BinaryInfo;
+
+    const unsigned char *ModulePtr = BinInfo.BinaryStart;
+    size_t ModuleSize = BinInfo.BinarySize;
+    BinaryBlob BinBlob{ModulePtr, ModuleSize};
+    if (ParsedBinaries.contains(BinBlob)) {
+      // Multiple kernels can be stored in the same SPIR-V or LLVM IR module.
+      // If we encountered the same binary module before, skip.
+      // NOTE: We compare the pointer as well as the size, in case
+      // a previous kernel only referenced part of the SPIR-V/LLVM IR module.
+      // Not sure this can actually happen, but better safe than sorry.
+      continue;
+    }
+    // Simply load and translate the SPIR-V into the currently still empty
+    // module.
+    std::unique_ptr<llvm::Module> NewMod;
+
+    switch (BinInfo.Format) {
+    case BinaryFormat::LLVM: {
+      auto ModOrError = loadLLVMKernel(LLVMCtx, Kernel);
+      if (auto Err = ModOrError.takeError()) {
+        return std::move(Err);
+      }
+      NewMod = std::move(*ModOrError);
+      break;
+    }
+    case BinaryFormat::SPIRV: {
+      auto ModOrError = loadSPIRVKernel(LLVMCtx, Kernel);
+      if (auto Err = ModOrError.takeError()) {
+        return std::move(Err);
+      }
+      NewMod = std::move(*ModOrError);
+      break;
+    }
+    default: {
+      return createStringError(
+          inconvertibleErrorCode(),
+          "Failed to load kernel from unsupported input format");
+    }
+    }
+
+    // We do not assume that the input binary information has the address bits
+    // set, but rather retrieve this information from the SPIR-V/LLVM module's
+    // data-layout.
+    BinInfo.AddressBits = NewMod->getDataLayout().getPointerSizeInBits();
+
+    if (First) {
+      // We can simply assign the module we just loaded from SPIR-V to the
+      // empty pointer on the first iteration.
+      Result = std::move(NewMod);
+      // The first module will dictate the address bits for the remaining.
+      AddressBits = BinInfo.AddressBits;
+      First = false;
+    } else {
+      // We have already loaded some module, so now we need to
+      // link the module we just loaded with the result so far.
+      // FIXME: We allow duplicates to be overridden by the module
+      // read last. This could cause problems if different modules contain
+      // definitions with the same name, but different body/content.
+      // Check that this is not problematic.
+      Linker::linkModules(*Result, std::move(NewMod),
+                          Linker::Flags::OverrideFromSrc);
+      if (AddressBits != BinInfo.AddressBits) {
+        return createStringError(
+            inconvertibleErrorCode(),
+            "Number of address bits between SPIR-V modules does not match");
+      }
+    }
+  }
+  return std::move(Result);
+}
+
+llvm::Expected<std::unique_ptr<llvm::Module>>
+KernelLoader::loadLLVMKernel(llvm::LLVMContext &LLVMCtx,
+                             SYCLKernelInfo &Kernel) {
+  auto &BinInfo = Kernel.BinaryInfo;
+  llvm::StringRef RawData(reinterpret_cast<const char *>(BinInfo.BinaryStart),
+                          BinInfo.BinarySize);
+  return llvm::parseBitcodeFile(
+      MemoryBuffer::getMemBuffer(RawData)->getMemBufferRef(), LLVMCtx);
+}
+
+llvm::Expected<std::unique_ptr<llvm::Module>>
+KernelLoader::loadSPIRVKernel(llvm::LLVMContext &LLVMCtx,
+                              SYCLKernelInfo &Kernel) {
+  return SPIRVLLVMTranslator::loadSPIRVKernel(LLVMCtx, Kernel);
+}
diff --git a/sycl-fusion/jit-compiler/lib/translation/LoadKernels.h b/sycl-fusion/jit-compiler/lib/translation/LoadKernels.h
new file mode 100644
index 0000000000000..5720abf09bfd7
--- /dev/null
+++ b/sycl-fusion/jit-compiler/lib/translation/LoadKernels.h
@@ -0,0 +1,36 @@
+//==-- LoadKernels.h - Load LLVM IR for SYCL kernels in different formats  -==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Kernel.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include <llvm/Support/Error.h>
+#include <vector>
+
+namespace jit_compiler {
+namespace translation {
+
+class KernelLoader {
+
+public:
+  static llvm::Expected<std::unique_ptr<llvm::Module>>
+  loadKernels(llvm::LLVMContext &LLVMCtx, std::vector<SYCLKernelInfo> &Kernels);
+
+private:
+  ///
+  /// Pair of address and size to represent a binary blob.
+  using BinaryBlob = std::pair<BinaryAddress, size_t>;
+
+  static llvm::Expected<std::unique_ptr<llvm::Module>>
+  loadLLVMKernel(llvm::LLVMContext &LLVMCtx, SYCLKernelInfo &Kernel);
+
+  static llvm::Expected<std::unique_ptr<llvm::Module>>
+  loadSPIRVKernel(llvm::LLVMContext &LLVMCtx, SYCLKernelInfo &Kernel);
+};
+} // namespace translation
+} // namespace jit_compiler
diff --git a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp
index 77217419e98bd..fc481f23b141d 100644
--- a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp
+++ b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp
@@ -16,7 +16,6 @@
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
-#include "llvm/Linker/Linker.h"
 #include "llvm/Support/raw_ostream.h"
 #include <iostream>
 #include <sstream>
@@ -86,12 +85,19 @@ SPIRV::TranslatorOpts &SPIRVLLVMTranslator::translatorOpts() {
   return Opts;
 }
 
-Expected<std::unique_ptr<Module>>
-SPIRVLLVMTranslator::readAndTranslateSPIRV(LLVMContext &LLVMCtx,
-                                           BinaryBlob Input) {
-  // Create an input stream for the binary blob.
+Expected<std::unique_ptr<llvm::Module>>
+SPIRVLLVMTranslator::loadSPIRVKernel(llvm::LLVMContext &LLVMCtx,
+                                     SYCLKernelInfo &Kernel) {
+  std::unique_ptr<Module> Result{nullptr};
+
+  SYCLKernelBinaryInfo &BinInfo = Kernel.BinaryInfo;
+  assert(BinInfo.Format == BinaryFormat::SPIRV &&
+         "Only SPIR-V supported as input");
+
+  // Create an input stream for the SPIR-V binary.
   std::stringstream SPIRStream(
-      std::string(reinterpret_cast<const char *>(Input.first), Input.second),
+      std::string(reinterpret_cast<const char *>(BinInfo.BinaryStart),
+                  BinInfo.BinarySize),
       std::ios_base::in | std::ios_base::binary);
   std::string ErrMsg;
   // Create a raw pointer. readSpirv accepts a reference to a pointer,
@@ -105,77 +111,13 @@ SPIRVLLVMTranslator::readAndTranslateSPIRV(LLVMContext &LLVMCtx,
         "Failed to load and translate SPIR-V module with error %s",
         ErrMsg.c_str());
   }
-  return std::unique_ptr<Module>(LLVMMod);
-}
+  std::unique_ptr<Module> NewMod{LLVMMod};
 
-Expected<std::unique_ptr<llvm::Module>>
-SPIRVLLVMTranslator::loadSPIRVKernels(llvm::LLVMContext &LLVMCtx,
-                                      std::vector<SYCLKernelInfo> &Kernels) {
-  std::unique_ptr<Module> Result{nullptr};
-  bool First = true;
-  DenseSet<BinaryBlob> ParsedSPIRVModules;
-  size_t AddressBits = 0;
-  for (auto &Kernel : Kernels) {
-    // FIXME: Currently, we use the front of the list.
-    // Do we need to iterate to find the most suitable
-    // SPIR-V module?
-    SYCLKernelBinaryInfo &BinInfo = Kernel.BinaryInfo;
-    // TODO(Lukas, ONNX-399): Also support LLVM IR as input but simply skipping
-    // the translation from SPIR-V to LLVM.
-    assert(BinInfo.Format == BinaryFormat::SPIRV &&
-           "Only SPIR-V supported as input");
-    const unsigned char *SPRModulePtr = BinInfo.BinaryStart;
-    size_t SPRModuleSize = BinInfo.BinarySize;
-    BinaryBlob BinBlob{SPRModulePtr, SPRModuleSize};
-    if (ParsedSPIRVModules.contains(BinBlob)) {
-      // Multiple kernels can be stored in the same SPIR-V module.
-      // If we encountered the same SPIR-V module before, skip.
-      // NOTE: We compare the pointer as well as the size, in case
-      // a previous kernel only referenced part of the SPIR-V module.
-      // Not sure this can actually happen, but better safe than sorry.
-      continue;
-    }
-    // Simply load and translate the SPIR-V into the currently still empty
-    // module.
-    PROPAGATE_ERROR(NewMod, readAndTranslateSPIRV(LLVMCtx, BinBlob));
-
-    // We do not assume that the input binary information has the address bits
-    // set, but rather retrieve this information from the SPIR-V/LLVM module's
-    // data-layout.
-    BinInfo.AddressBits = NewMod->getDataLayout().getPointerSizeInBits();
-    assert((First || BinInfo.AddressBits == AddressBits) &&
-           "Address bits do not match");
-    // Restore SYCL/OpenCL kernel attributes such as 'reqd_work_group_size' or
-    // 'work_group_size_hint' from metadata attached to the kernel function and
-    // store it in the SYCLKernelInfo.
-    // TODO(Lukas, ONNX-399): Validate that DPC++ used metadata to represent
-    // that information.
-    restoreKernelAttributes(NewMod.get(), Kernel);
-
-    if (First) {
-      // We can simply assign the module we just loaded from SPIR-V to the
-      // empty pointer on the first iteration.
-      Result = std::move(NewMod);
-      // The first module will dictate the address bits for the remaining.
-      AddressBits = BinInfo.AddressBits;
-      First = false;
-    } else {
-      // We have already loaded some module, so now we need to
-      // link the module we just loaded with the result so far.
-      // FIXME: We allow duplicates to be overridden by the module
-      // read last. This could cause problems if different modules contain
-      // definitions with the same name, but different body/content.
-      // Check that this is not problematic.
-      Linker::linkModules(*Result, std::move(NewMod),
-                          Linker::Flags::OverrideFromSrc);
-      if (AddressBits != BinInfo.AddressBits) {
-        return createStringError(
-            inconvertibleErrorCode(),
-            "Number of address bits between SPIR-V modules does not match");
-      }
-    }
-  }
-  return std::move(Result);
+  // Restore SYCL/OpenCL kernel attributes such as 'reqd_work_group_size' or
+  // 'work_group_size_hint' from metadata attached to the kernel function and
+  // store it in the SYCLKernelInfo.
+  restoreKernelAttributes(NewMod.get(), Kernel);
+  return std::move(NewMod);
 }
 
 Expected<jit_compiler::SPIRVBinary *>
diff --git a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h
index 5f1d416e45150..d82a9cc82466c 100644
--- a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h
+++ b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h
@@ -25,8 +25,7 @@ class SPIRVLLVMTranslator {
   ///
   /// Load a list of SPIR-V kernels into a single LLVM module.
   static llvm::Expected<std::unique_ptr<llvm::Module>>
-  loadSPIRVKernels(llvm::LLVMContext &LLVMCtx,
-                   std::vector<SYCLKernelInfo> &Kernels);
+  loadSPIRVKernel(llvm::LLVMContext &LLVMCtx, SYCLKernelInfo &Kernel);
 
   ///
   /// Translate the LLVM IR module Mod to SPIR-V, store it in the JITContext and
@@ -35,10 +34,6 @@ class SPIRVLLVMTranslator {
                                                             JITContext &JITCtx);
 
 private:
-  ///
-  /// Pair of address and size to represent a binary blob.
-  using BinaryBlob = std::pair<BinaryAddress, size_t>;
-
   ///
   /// Get an attribute value consisting of NumValues scalar constant integers
   /// from the MDNode.
@@ -53,12 +48,6 @@ class SPIRVLLVMTranslator {
   ///   - work_group_size_hint
   static void restoreKernelAttributes(llvm::Module *Mod, SYCLKernelInfo &Info);
 
-  ///
-  /// Read the given SPIR-V binary and translate it to a new LLVM module
-  /// associated with the given context.
-  static llvm::Expected<std::unique_ptr<llvm::Module>>
-  readAndTranslateSPIRV(llvm::LLVMContext &LLVMCtx, BinaryBlob Input);
-
   ///
   /// Default settings for the SPIRV translation options.
   static SPIRV::TranslatorOpts &translatorOpts();
diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
index cc39eabc294c8..ad93d93dc70d5 100644
--- a/sycl/source/detail/jit_compiler.cpp
+++ b/sycl/source/detail/jit_compiler.cpp
@@ -39,6 +39,70 @@ translateBinaryImageFormat(pi::PiDeviceBinaryType Type) {
   }
 }
 
+std::pair<const RTDeviceBinaryImage *, RT::PiProgram>
+retrieveKernelBinary(QueueImplPtr &Queue, CGExecKernel *KernelCG) {
+  auto KernelName = KernelCG->getKernelName();
+
+  bool isNvidia = Queue->getDeviceImplPtr()->getPlugin().getBackend() ==
+                  backend::ext_oneapi_cuda;
+  if (isNvidia) {
+    auto KernelID = ProgramManager::getInstance().getSYCLKernelID(KernelName);
+    std::vector<kernel_id> KernelIds;
+    KernelIds.push_back(KernelID);
+    auto DeviceImages =
+        ProgramManager::getInstance().getRawDeviceImages(KernelIds);
+    const RTDeviceBinaryImage *DeviceImage = nullptr;
+    for (auto *DI : DeviceImages) {
+      // We are looking for a device image with LLVM IR format and target spec
+      // "llvm_nvptx64", which has been set by the offload-wrapper action.
+      if (DI->getFormat() == PI_DEVICE_BINARY_TYPE_LLVMIR_BITCODE &&
+          DI->getRawData().DeviceTargetSpec == std::string("llvm_nvptx64")) {
+        DeviceImage = DI;
+        break;
+      }
+    }
+    if (!DeviceImage) {
+      return {nullptr, nullptr};
+    }
+    auto ContextImpl = Queue->getContextImplPtr();
+    auto Context = detail::createSyclObjFromImpl<context>(ContextImpl);
+    auto DeviceImpl = Queue->getDeviceImplPtr();
+    auto Device = detail::createSyclObjFromImpl<device>(DeviceImpl);
+    RT::PiProgram Program =
+        detail::ProgramManager::getInstance().createPIProgram(*DeviceImage,
+                                                              Context, Device);
+    return {DeviceImage, Program};
+  }
+
+  const RTDeviceBinaryImage *DeviceImage = nullptr;
+  RT::PiProgram Program = nullptr;
+  if (KernelCG->getKernelBundle() != nullptr) {
+    // Retrieve the device image from the kernel bundle.
+    auto KernelBundle = KernelCG->getKernelBundle();
+    kernel_id KernelID =
+        detail::ProgramManager::getInstance().getSYCLKernelID(KernelName);
+
+    auto SyclKernel = detail::getSyclObjImpl(
+        KernelBundle->get_kernel(KernelID, KernelBundle));
+
+    DeviceImage = SyclKernel->getDeviceImage()->get_bin_image_ref();
+    Program = SyclKernel->getDeviceImage()->get_program_ref();
+  } else if (KernelCG->MSyclKernel != nullptr) {
+    DeviceImage = KernelCG->MSyclKernel->getDeviceImage()->get_bin_image_ref();
+    Program = KernelCG->MSyclKernel->getDeviceImage()->get_program_ref();
+  } else {
+    auto ContextImpl = Queue->getContextImplPtr();
+    auto Context = detail::createSyclObjFromImpl<context>(ContextImpl);
+    auto DeviceImpl = Queue->getDeviceImplPtr();
+    auto Device = detail::createSyclObjFromImpl<device>(DeviceImpl);
+    DeviceImage = &detail::ProgramManager::getInstance().getDeviceImage(
+        KernelCG->MOSModuleHandle, KernelName, Context, Device);
+    Program = detail::ProgramManager::getInstance().createPIProgram(
+        *DeviceImage, Context, Device);
+  }
+  return {DeviceImage, Program};
+}
+
 static ::jit_compiler::ParameterKind
 translateArgType(kernel_param_kind_t Kind) {
   using PK = ::jit_compiler::ParameterKind;
@@ -574,33 +638,9 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
           "Cannot fuse kernel with invalid kernel function name");
       return nullptr;
     }
-    const RTDeviceBinaryImage *DeviceImage = nullptr;
-    RT::PiProgram Program = nullptr;
-    if (KernelCG->getKernelBundle() != nullptr) {
-      // Retrieve the device image from the kernel bundle.
-      auto KernelBundle = KernelCG->getKernelBundle();
-      kernel_id KernelID =
-          detail::ProgramManager::getInstance().getSYCLKernelID(KernelName);
-
-      auto SyclKernel = detail::getSyclObjImpl(
-          KernelBundle->get_kernel(KernelID, KernelBundle));
-
-      DeviceImage = SyclKernel->getDeviceImage()->get_bin_image_ref();
-      Program = SyclKernel->getDeviceImage()->get_program_ref();
-    } else if (KernelCG->MSyclKernel != nullptr) {
-      DeviceImage =
-          KernelCG->MSyclKernel->getDeviceImage()->get_bin_image_ref();
-      Program = KernelCG->MSyclKernel->getDeviceImage()->get_program_ref();
-    } else {
-      auto ContextImpl = Queue->getContextImplPtr();
-      auto Context = detail::createSyclObjFromImpl<context>(ContextImpl);
-      auto DeviceImpl = Queue->getDeviceImplPtr();
-      auto Device = detail::createSyclObjFromImpl<device>(DeviceImpl);
-      DeviceImage = &detail::ProgramManager::getInstance().getDeviceImage(
-          KernelCG->MOSModuleHandle, KernelName, Context, Device);
-      Program = detail::ProgramManager::getInstance().createPIProgram(
-          *DeviceImage, Context, Device);
-    }
+
+    auto [DeviceImage, Program] = retrieveKernelBinary(Queue, KernelCG);
+
     if (!DeviceImage || !Program) {
       printPerformanceWarning("No suitable IR available for fusion");
       return nullptr;
@@ -664,8 +704,7 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
       return nullptr;
     }
     ::jit_compiler::SYCLKernelBinaryInfo BinInfo{
-        translateBinaryImageFormat(DeviceImage->getFormat()), 0,
-        RawDeviceImage.BinaryStart, DeviceImageSize};
+        BinaryImageFormat, 0, RawDeviceImage.BinaryStart, DeviceImageSize};
 
     constexpr auto SYCLTypeToIndices = [](auto Val) -> ::jit_compiler::Indices {
       return {Val.get(0), Val.get(1), Val.get(2)};

From 0244fc050022a159dec12c62246687343dec7b9d Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Wed, 11 Jan 2023 14:02:59 +0000
Subject: [PATCH 03/16] [SYCL][Fusion] Add result translation to PTX

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 sycl-fusion/common/include/Kernel.h           |   2 +-
 sycl-fusion/common/lib/KernelIO.h             |   1 +
 sycl-fusion/jit-compiler/CMakeLists.txt       |   6 +-
 sycl-fusion/jit-compiler/include/JITContext.h |  16 ++-
 sycl-fusion/jit-compiler/include/Options.h    |   6 +-
 sycl-fusion/jit-compiler/lib/JITContext.cpp   |  14 +-
 sycl-fusion/jit-compiler/lib/KernelFusion.cpp |  18 +--
 ...{LoadKernels.cpp => KernelTranslation.cpp} | 121 ++++++++++++++++--
 .../{LoadKernels.h => KernelTranslation.h}    |  14 +-
 .../lib/translation/SPIRVLLVMTranslation.cpp  |   4 +-
 .../lib/translation/SPIRVLLVMTranslation.h    |   2 +-
 sycl/source/detail/jit_compiler.cpp           |  17 +++
 12 files changed, 186 insertions(+), 35 deletions(-)
 rename sycl-fusion/jit-compiler/lib/translation/{LoadKernels.cpp => KernelTranslation.cpp} (50%)
 rename sycl-fusion/jit-compiler/lib/translation/{LoadKernels.h => KernelTranslation.h} (64%)

diff --git a/sycl-fusion/common/include/Kernel.h b/sycl-fusion/common/include/Kernel.h
index 1962dd042ffbe..87726b3368d63 100644
--- a/sycl-fusion/common/include/Kernel.h
+++ b/sycl-fusion/common/include/Kernel.h
@@ -34,7 +34,7 @@ enum class ParameterKind : uint32_t {
 };
 
 /// Different binary formats supported as input to the JIT compiler.
-enum class BinaryFormat : uint32_t { INVALID, LLVM, SPIRV };
+enum class BinaryFormat : uint32_t { INVALID, LLVM, SPIRV, PTX };
 
 /// Information about a device intermediate representation module (e.g., SPIR-V,
 /// LLVM IR) from DPC++.
diff --git a/sycl-fusion/common/lib/KernelIO.h b/sycl-fusion/common/lib/KernelIO.h
index 12c194f8b4dd4..09058d61e9981 100644
--- a/sycl-fusion/common/lib/KernelIO.h
+++ b/sycl-fusion/common/lib/KernelIO.h
@@ -47,6 +47,7 @@ template <> struct ScalarEnumerationTraits<jit_compiler::BinaryFormat> {
   static void enumeration(IO &IO, jit_compiler::BinaryFormat &BF) {
     IO.enumCase(BF, "LLVM", jit_compiler::BinaryFormat::LLVM);
     IO.enumCase(BF, "SPIRV", jit_compiler::BinaryFormat::SPIRV);
+    IO.enumCase(BF, "PTX", jit_compiler::BinaryFormat::PTX);
     IO.enumCase(BF, "INVALID", jit_compiler::BinaryFormat::INVALID);
   }
 };
diff --git a/sycl-fusion/jit-compiler/CMakeLists.txt b/sycl-fusion/jit-compiler/CMakeLists.txt
index 92f0cefd68634..f67eed5fc6c76 100644
--- a/sycl-fusion/jit-compiler/CMakeLists.txt
+++ b/sycl-fusion/jit-compiler/CMakeLists.txt
@@ -2,7 +2,7 @@
 add_llvm_library(sycl-fusion
    lib/KernelFusion.cpp
    lib/JITContext.cpp
-   lib/translation/LoadKernels.cpp
+   lib/translation/KernelTranslation.cpp
    lib/translation/SPIRVLLVMTranslation.cpp
    lib/fusion/FusionPipeline.cpp
    lib/fusion/FusionHelper.cpp
@@ -20,6 +20,10 @@ add_llvm_library(sycl-fusion
    Linker
    ScalarOpts
    InstCombine
+   Target
+   NVPTX
+   X86
+   MC
 )
 
 target_include_directories(sycl-fusion
diff --git a/sycl-fusion/jit-compiler/include/JITContext.h b/sycl-fusion/jit-compiler/include/JITContext.h
index 4c0616e267941..d4654f19820ce 100644
--- a/sycl-fusion/jit-compiler/include/JITContext.h
+++ b/sycl-fusion/jit-compiler/include/JITContext.h
@@ -36,17 +36,21 @@ using CacheKeyT =
                std::optional<std::vector<NDRange>>>;
 
 ///
-/// Wrapper around a SPIR-V binary.
-class SPIRVBinary {
+/// Wrapper around a kernel binary.
+class KernelBinary {
 public:
-  explicit SPIRVBinary(std::string Binary);
+  explicit KernelBinary(std::string Binary, BinaryFormat Format);
 
   jit_compiler::BinaryAddress address() const;
 
   size_t size() const;
 
+  BinaryFormat format() const;
+
 private:
   std::string Blob;
+
+  BinaryFormat Format;
 };
 
 ///
@@ -61,7 +65,8 @@ class JITContext {
 
   llvm::LLVMContext *getLLVMContext();
 
-  SPIRVBinary &emplaceSPIRVBinary(std::string Binary);
+  KernelBinary &emplaceSPIRVBinary(std::string Binary,
+                                   BinaryFormat Format);
 
   std::optional<SYCLKernelInfo> getCacheEntry(CacheKeyT &Identifier) const;
 
@@ -79,11 +84,12 @@ class JITContext {
 
   MutexT BinariesMutex;
 
-  std::vector<SPIRVBinary> Binaries;
+  std::vector<KernelBinary> Binaries;
 
   mutable MutexT CacheMutex;
 
   std::unordered_map<CacheKeyT, SYCLKernelInfo> Cache;
+  
 };
 } // namespace jit_compiler
 
diff --git a/sycl-fusion/jit-compiler/include/Options.h b/sycl-fusion/jit-compiler/include/Options.h
index 335f58fb64cf7..4fe7787df00db 100644
--- a/sycl-fusion/jit-compiler/include/Options.h
+++ b/sycl-fusion/jit-compiler/include/Options.h
@@ -9,12 +9,13 @@
 #ifndef SYCL_FUSION_JIT_COMPILER_OPTIONS_H
 #define SYCL_FUSION_JIT_COMPILER_OPTIONS_H
 
+#include "Kernel.h"
 #include <memory>
 #include <unordered_map>
 
 namespace jit_compiler {
 
-enum OptionID { VerboseOutput, EnableCaching };
+enum OptionID { VerboseOutput, EnableCaching, TargetFormat };
 
 class OptionPtrBase {};
 
@@ -78,6 +79,9 @@ struct JITEnableVerbose : public OptionBase<OptionID::VerboseOutput, bool> {};
 
 struct JITEnableCaching : public OptionBase<OptionID::EnableCaching, bool> {};
 
+struct JITTargetFormat
+    : public OptionBase<OptionID::TargetFormat, BinaryFormat> {};
+
 } // namespace option
 } // namespace jit_compiler
 
diff --git a/sycl-fusion/jit-compiler/lib/JITContext.cpp b/sycl-fusion/jit-compiler/lib/JITContext.cpp
index 68c7031b9d8a9..e1dda8b928c45 100644
--- a/sycl-fusion/jit-compiler/lib/JITContext.cpp
+++ b/sycl-fusion/jit-compiler/lib/JITContext.cpp
@@ -11,14 +11,17 @@
 
 using namespace jit_compiler;
 
-SPIRVBinary::SPIRVBinary(std::string Binary) : Blob{std::move(Binary)} {}
+KernelBinary::KernelBinary(std::string Binary, BinaryFormat Fmt)
+    : Blob{std::move(Binary)}, Format{Fmt} {}
 
-jit_compiler::BinaryAddress SPIRVBinary::address() const {
+jit_compiler::BinaryAddress KernelBinary::address() const {
   // FIXME: Verify it's a good idea to perform this reinterpret_cast here.
   return reinterpret_cast<jit_compiler::BinaryAddress>(Blob.c_str());
 }
 
-size_t SPIRVBinary::size() const { return Blob.size(); }
+size_t KernelBinary::size() const { return Blob.size(); }
+
+BinaryFormat KernelBinary::format() const { return Format; }
 
 JITContext::JITContext() : LLVMCtx{new llvm::LLVMContext}, Binaries{} {}
 
@@ -26,11 +29,12 @@ JITContext::~JITContext() = default;
 
 llvm::LLVMContext *JITContext::getLLVMContext() { return LLVMCtx.get(); }
 
-SPIRVBinary &JITContext::emplaceSPIRVBinary(std::string Binary) {
+KernelBinary &JITContext::emplaceSPIRVBinary(std::string Binary,
+                                             BinaryFormat Format) {
   WriteLockT WriteLock{BinariesMutex};
   // NOTE: With C++17, which returns a reference from emplace_back, the
   // following code would be even simpler.
-  Binaries.emplace_back(std::move(Binary));
+  Binaries.emplace_back(std::move(Binary), Format);
   return Binaries.back();
 }
 
diff --git a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
index fc168587738b5..eeffb0af6ced8 100644
--- a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
+++ b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
@@ -15,7 +15,7 @@
 #include "fusion/FusionPipeline.h"
 #include "helper/ConfigHelper.h"
 #include "helper/ErrorHandling.h"
-#include "translation/LoadKernels.h"
+#include "translation/KernelTranslation.h"
 #include "translation/SPIRVLLVMTranslation.h"
 #include <llvm/Support/Error.h>
 #include <sstream>
@@ -98,8 +98,8 @@ FusionResult KernelFusion::fuseKernels(
   // Load all input kernels from their respective SPIR-V modules into a single
   // LLVM IR module.
   llvm::Expected<std::unique_ptr<llvm::Module>> ModOrError =
-      translation::KernelLoader::loadKernels(*JITCtx.getLLVMContext(),
-                                             ModuleInfo.kernels());
+      translation::KernelTranslator::loadKernels(*JITCtx.getLLVMContext(),
+                                                 ModuleInfo.kernels());
   if (auto Error = ModOrError.takeError()) {
     return errorToFusionResult(std::move(Error), "SPIR-V translation failed");
   }
@@ -137,14 +137,14 @@ FusionResult KernelFusion::fuseKernels(
 
   SYCLKernelInfo &FusedKernelInfo = *NewModInfo->getKernelFor(FusedKernelName);
 
-  // Translate the LLVM IR module resulting from the fusion pass into SPIR-V.
-  llvm::Expected<jit_compiler::SPIRVBinary *> BinaryOrError =
-      translation::SPIRVLLVMTranslator::translateLLVMtoSPIRV(*NewMod, JITCtx);
-  if (auto Error = BinaryOrError.takeError()) {
+  // TODO
+  BinaryFormat TargetFormat = ConfigHelper::get<option::JITTargetFormat>();
+
+  if (auto Error = translation::KernelTranslator::translateKernel(
+          FusedKernelInfo, *NewMod, JITCtx, TargetFormat)) {
     return errorToFusionResult(std::move(Error),
-                               "Translation to SPIR-V failed");
+                               "Translation to output format failed");
   }
-  jit_compiler::SPIRVBinary *SPIRVBin = *BinaryOrError;
 
   FusedKernelInfo.NDR = FusedKernel.FusedNDRange;
 
diff --git a/sycl-fusion/jit-compiler/lib/translation/LoadKernels.cpp b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
similarity index 50%
rename from sycl-fusion/jit-compiler/lib/translation/LoadKernels.cpp
rename to sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
index cabf1f6c44e9f..31fa850d58d63 100644
--- a/sycl-fusion/jit-compiler/lib/translation/LoadKernels.cpp
+++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
@@ -1,4 +1,4 @@
-//==-------------------------- LoadKernels.cpp  ----------------------------==//
+//==----------------------- KernelTranslation.cpp  -------------------------==//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,19 +6,24 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "LoadKernels.h"
+#include "KernelTranslation.h"
 #include "SPIRVLLVMTranslation.h"
 #include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/Linker/Linker.h"
+#include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
 
 using namespace jit_compiler;
 using namespace jit_compiler::translation;
 using namespace llvm;
 
 llvm::Expected<std::unique_ptr<llvm::Module>>
-KernelLoader::loadKernels(llvm::LLVMContext &LLVMCtx,
-                          std::vector<SYCLKernelInfo> &Kernels) {
+KernelTranslator::loadKernels(llvm::LLVMContext &LLVMCtx,
+                              std::vector<SYCLKernelInfo> &Kernels) {
   std::unique_ptr<Module> Result{nullptr};
   bool First = true;
   DenseSet<BinaryBlob> ParsedBinaries;
@@ -100,8 +105,8 @@ KernelLoader::loadKernels(llvm::LLVMContext &LLVMCtx,
 }
 
 llvm::Expected<std::unique_ptr<llvm::Module>>
-KernelLoader::loadLLVMKernel(llvm::LLVMContext &LLVMCtx,
-                             SYCLKernelInfo &Kernel) {
+KernelTranslator::loadLLVMKernel(llvm::LLVMContext &LLVMCtx,
+                                 SYCLKernelInfo &Kernel) {
   auto &BinInfo = Kernel.BinaryInfo;
   llvm::StringRef RawData(reinterpret_cast<const char *>(BinInfo.BinaryStart),
                           BinInfo.BinarySize);
@@ -110,7 +115,107 @@ KernelLoader::loadLLVMKernel(llvm::LLVMContext &LLVMCtx,
 }
 
 llvm::Expected<std::unique_ptr<llvm::Module>>
-KernelLoader::loadSPIRVKernel(llvm::LLVMContext &LLVMCtx,
-                              SYCLKernelInfo &Kernel) {
+KernelTranslator::loadSPIRVKernel(llvm::LLVMContext &LLVMCtx,
+                                  SYCLKernelInfo &Kernel) {
   return SPIRVLLVMTranslator::loadSPIRVKernel(LLVMCtx, Kernel);
 }
+
+llvm::Error KernelTranslator::translateKernel(SYCLKernelInfo &Kernel,
+                                              llvm::Module &Mod,
+                                              JITContext &JITCtx,
+                                              BinaryFormat Format) {
+
+  KernelBinary *KernelBin = nullptr;
+  switch (Format) {
+  case BinaryFormat::SPIRV: {
+    llvm::Expected<KernelBinary *> BinaryOrError =
+        translateToSPIRV(Mod, JITCtx);
+    if (auto Error = BinaryOrError.takeError()) {
+      return Error;
+    }
+    KernelBin = *BinaryOrError;
+    break;
+  }
+  case BinaryFormat::PTX: {
+    llvm::Expected<KernelBinary *> BinaryOrError = translateToPTX(Mod, JITCtx);
+    if (auto Error = BinaryOrError.takeError()) {
+      return Error;
+    }
+    KernelBin = *BinaryOrError;
+    break;
+  }
+  default: {
+    return createStringError(
+        inconvertibleErrorCode(),
+        "Failed to translate kernel to unsupported output format");
+  }
+  }
+
+  // Update the KernelInfo for the fused kernel with the address and size of the
+  // SPIR-V binary resulting from translation.
+  SYCLKernelBinaryInfo &FusedBinaryInfo = Kernel.BinaryInfo;
+  FusedBinaryInfo.Format = Format;
+  // Output SPIR-V should use the same number of address bits as the input
+  // SPIR-V. SPIR-V translation requires all modules to use the same number of
+  // address bits, so it's safe to take the value from the first one.
+  FusedBinaryInfo.AddressBits = Mod.getDataLayout().getPointerSizeInBits();
+  FusedBinaryInfo.BinaryStart = KernelBin->address();
+  FusedBinaryInfo.BinarySize = KernelBin->size();
+  return Error::success();
+}
+
+llvm::Expected<KernelBinary *>
+KernelTranslator::translateToSPIRV(llvm::Module &Mod, JITContext &JITCtx) {
+  return SPIRVLLVMTranslator::translateLLVMtoSPIRV(Mod, JITCtx);
+}
+
+llvm::Expected<KernelBinary *>
+KernelTranslator::translateToPTX(llvm::Module &Mod, JITContext &JITCtx) {
+  // FIXME: Can we limit this to the NVPTX specific target?
+  llvm::InitializeAllTargets();
+  llvm::InitializeAllAsmParsers();
+  llvm::InitializeAllAsmPrinters();
+  llvm::InitializeAllTargetMCs();
+
+  std::string TargetTriple{"nvptx64-nvidia-cuda"};
+
+  std::string ErrorMessage;
+  const auto *Target =
+      llvm::TargetRegistry::lookupTarget(TargetTriple, ErrorMessage);
+
+  if (!Target) {
+    return createStringError(
+        inconvertibleErrorCode(),
+        "Failed to load and translate SPIR-V module with error %s",
+        ErrorMessage.c_str());
+  }
+
+  // FIXME: Check whether we can provide more accurate target information here
+  auto *TargetMachine = Target->createTargetMachine(
+      TargetTriple, "sm_50", "+sm_50,+ptx76", {}, llvm::Reloc::PIC_,
+      std::nullopt, llvm::CodeGenOpt::Default);
+
+  llvm::legacy::PassManager PM;
+
+  std::string PTXASM;
+
+  {
+    llvm::raw_string_ostream ASMStream{PTXASM};
+    llvm::buffer_ostream BufferedASM{ASMStream};
+
+    if (TargetMachine->addPassesToEmitFile(PM, BufferedASM, nullptr,
+                                           llvm::CGFT_AssemblyFile)) {
+      return createStringError(
+          inconvertibleErrorCode(),
+          "Failed to construct pass pipeline to emit output");
+    }
+
+    PM.run(Mod);
+    ASMStream.flush();
+  }
+
+  llvm::dbgs() << "PTX size: " << PTXASM.size() << "\n";
+  llvm::dbgs() << "PTX:\n" << PTXASM << "\n";
+
+  return &JITCtx.emplaceSPIRVBinary(PTXASM, BinaryFormat::PTX);
+}
diff --git a/sycl-fusion/jit-compiler/lib/translation/LoadKernels.h b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h
similarity index 64%
rename from sycl-fusion/jit-compiler/lib/translation/LoadKernels.h
rename to sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h
index 5720abf09bfd7..3d6824edbdd37 100644
--- a/sycl-fusion/jit-compiler/lib/translation/LoadKernels.h
+++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h
@@ -1,4 +1,4 @@
-//==-- LoadKernels.h - Load LLVM IR for SYCL kernels in different formats  -==//
+//==- KernelTranslation - Translate SYCL kernels between different formats -==//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "JITContext.h"
 #include "Kernel.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
@@ -15,12 +16,15 @@
 namespace jit_compiler {
 namespace translation {
 
-class KernelLoader {
+class KernelTranslator {
 
 public:
   static llvm::Expected<std::unique_ptr<llvm::Module>>
   loadKernels(llvm::LLVMContext &LLVMCtx, std::vector<SYCLKernelInfo> &Kernels);
 
+  static llvm::Error translateKernel(SYCLKernelInfo &Kernel, llvm::Module &Mod,
+                              JITContext &JITCtx, BinaryFormat Format);
+
 private:
   ///
   /// Pair of address and size to represent a binary blob.
@@ -31,6 +35,12 @@ class KernelLoader {
 
   static llvm::Expected<std::unique_ptr<llvm::Module>>
   loadSPIRVKernel(llvm::LLVMContext &LLVMCtx, SYCLKernelInfo &Kernel);
+
+  static llvm::Expected<KernelBinary *> translateToSPIRV(llvm::Module &Mod,
+                                                         JITContext &JITCtx);
+
+  static llvm::Expected<KernelBinary *> translateToPTX(llvm::Module &Mod,
+                                                       JITContext &JITCtx);
 };
 } // namespace translation
 } // namespace jit_compiler
diff --git a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp
index fc481f23b141d..1046fd6dc2907 100644
--- a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp
+++ b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp
@@ -120,7 +120,7 @@ SPIRVLLVMTranslator::loadSPIRVKernel(llvm::LLVMContext &LLVMCtx,
   return std::move(NewMod);
 }
 
-Expected<jit_compiler::SPIRVBinary *>
+Expected<jit_compiler::KernelBinary *>
 SPIRVLLVMTranslator::translateLLVMtoSPIRV(Module &Mod, JITContext &JITCtx) {
   std::ostringstream BinaryStream;
   std::string ErrMsg;
@@ -131,5 +131,5 @@ SPIRVLLVMTranslator::translateLLVMtoSPIRV(Module &Mod, JITContext &JITCtx) {
         "Translation of LLVM IR to SPIR-V failed with error %s",
         ErrMsg.c_str());
   }
-  return &JITCtx.emplaceSPIRVBinary(BinaryStream.str());
+  return &JITCtx.emplaceSPIRVBinary(BinaryStream.str(), BinaryFormat::SPIRV);
 }
diff --git a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h
index d82a9cc82466c..440c00103b0d5 100644
--- a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h
+++ b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h
@@ -30,7 +30,7 @@ class SPIRVLLVMTranslator {
   ///
   /// Translate the LLVM IR module Mod to SPIR-V, store it in the JITContext and
   /// return a pointer to its container.
-  static llvm::Expected<SPIRVBinary *> translateLLVMtoSPIRV(llvm::Module &Mod,
+  static llvm::Expected<KernelBinary *> translateLLVMtoSPIRV(llvm::Module &Mod,
                                                             JITContext &JITCtx);
 
 private:
diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
index ad93d93dc70d5..e081a61a99b4b 100644
--- a/sycl/source/detail/jit_compiler.cpp
+++ b/sycl/source/detail/jit_compiler.cpp
@@ -39,6 +39,21 @@ translateBinaryImageFormat(pi::PiDeviceBinaryType Type) {
   }
 }
 
+::jit_compiler::BinaryFormat getTargetFormat(QueueImplPtr &Queue) {
+  auto Backend = Queue->getDeviceImplPtr()->getPlugin().getBackend();
+  switch (Backend) {
+  case backend::ext_oneapi_level_zero:
+  case backend::opencl:
+    return ::jit_compiler::BinaryFormat::SPIRV;
+  case backend::ext_oneapi_cuda:
+    return ::jit_compiler::BinaryFormat::PTX;
+  default:
+    throw sycl::exception(
+        sycl::make_error_code(sycl::errc::feature_not_supported),
+        "Backend unsupported by kernel fusion");
+  }
+}
+
 std::pair<const RTDeviceBinaryImage *, RT::PiProgram>
 retrieveKernelBinary(QueueImplPtr &Queue, CGExecKernel *KernelCG) {
   auto KernelName = KernelCG->getKernelName();
@@ -784,6 +799,8 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
   JITConfig.set<::jit_compiler::option::JITEnableVerbose>(DebugEnabled);
   JITConfig.set<::jit_compiler::option::JITEnableCaching>(
       detail::SYCLConfig<detail::SYCL_ENABLE_FUSION_CACHING>::get());
+  JITConfig.set<::jit_compiler::option::JITTargetFormat>(
+      getTargetFormat(Queue));
 
   auto FusionResult = ::jit_compiler::KernelFusion::fuseKernels(
       *MJITContext, std::move(JITConfig), InputKernelInfo, InputKernelNames,

From 70980941c46313b490713e8045ca8e85a006a509 Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Thu, 12 Jan 2023 08:56:43 +0000
Subject: [PATCH 04/16] [SYCL][Fusion] Provide correct target spec;

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 sycl-fusion/jit-compiler/lib/KernelFusion.cpp | 12 ------
 sycl/source/detail/jit_compiler.cpp           | 38 ++++++++++++++-----
 sycl/source/detail/jit_compiler.hpp           |  3 +-
 sycl/source/detail/jit_device_binaries.cpp    | 10 ++---
 sycl/source/detail/jit_device_binaries.hpp    |  4 +-
 5 files changed, 37 insertions(+), 30 deletions(-)

diff --git a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
index eeffb0af6ced8..6c14ac182514a 100644
--- a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
+++ b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
@@ -148,18 +148,6 @@ FusionResult KernelFusion::fuseKernels(
 
   FusedKernelInfo.NDR = FusedKernel.FusedNDRange;
 
-  // Update the KernelInfo for the fused kernel with the address and size of the
-  // SPIR-V binary resulting from translation.
-  SYCLKernelBinaryInfo &FusedBinaryInfo = FusedKernelInfo.BinaryInfo;
-  FusedBinaryInfo.Format = BinaryFormat::SPIRV;
-  // Output SPIR-V should use the same number of address bits as the input
-  // SPIR-V. SPIR-V translation requires all modules to use the same number of
-  // address bits, so it's safe to take the value from the first one.
-  FusedBinaryInfo.AddressBits =
-      ModuleInfo.kernels().front().BinaryInfo.AddressBits;
-  FusedBinaryInfo.BinaryStart = SPIRVBin->address();
-  FusedBinaryInfo.BinarySize = SPIRVBin->size();
-
   if (CachingEnabled) {
     JITCtx.addCacheEntry(CacheKey, FusedKernelInfo);
   }
diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
index e081a61a99b4b..b2802c57e3589 100644
--- a/sycl/source/detail/jit_compiler.cpp
+++ b/sycl/source/detail/jit_compiler.cpp
@@ -799,8 +799,9 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
   JITConfig.set<::jit_compiler::option::JITEnableVerbose>(DebugEnabled);
   JITConfig.set<::jit_compiler::option::JITEnableCaching>(
       detail::SYCLConfig<detail::SYCL_ENABLE_FUSION_CACHING>::get());
-  JITConfig.set<::jit_compiler::option::JITTargetFormat>(
-      getTargetFormat(Queue));
+      
+  ::jit_compiler::BinaryFormat TargetFormat = getTargetFormat(Queue);
+  JITConfig.set<::jit_compiler::option::JITTargetFormat>(TargetFormat);
 
   auto FusionResult = ::jit_compiler::KernelFusion::fuseKernels(
       *MJITContext, std::move(JITConfig), InputKernelInfo, InputKernelNames,
@@ -842,7 +843,7 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
   updatePromotedArgs(FusedKernelInfo, NDRDesc, FusedArgs, ArgsStorage);
 
   if (!FusionResult.cached()) {
-    auto PIDeviceBinaries = createPIDeviceBinary(FusedKernelInfo);
+    auto PIDeviceBinaries = createPIDeviceBinary(FusedKernelInfo, TargetFormat);
     detail::ProgramManager::getInstance().addImages(PIDeviceBinaries);
   } else if (DebugEnabled) {
     std::cerr << "INFO: Re-using existing device binary for fused kernel\n";
@@ -854,9 +855,11 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
       FusedKernelInfo.Name);
   std::vector<std::shared_ptr<const void>> RawExtendedMembers;
 
-  std::shared_ptr<detail::kernel_bundle_impl> KernelBundleImplPtr =
-      detail::getSyclObjImpl(get_kernel_bundle<bundle_state::executable>(
-          Queue->get_context(), {Queue->get_device()}, {FusedKernelId}));
+  std::shared_ptr<detail::kernel_bundle_impl> KernelBundleImplPtr;
+  if (TargetFormat == ::jit_compiler::BinaryFormat::SPIRV) {
+    detail::getSyclObjImpl(get_kernel_bundle<bundle_state::executable>(
+        Queue->get_context(), {Queue->get_device()}, {FusedKernelId}));
+  }
 
   std::unique_ptr<detail::CG> FusedCG;
   FusedCG.reset(new detail::CGExecKernel(
@@ -869,7 +872,25 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
 }
 
 pi_device_binaries jit_compiler::createPIDeviceBinary(
-    const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo) {
+    const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo,
+    ::jit_compiler::BinaryFormat Format) {
+
+  const char *TargetSpec = nullptr;
+  switch (Format) {
+  case ::jit_compiler::BinaryFormat::PTX: {
+    TargetSpec = __SYCL_PI_DEVICE_BINARY_TARGET_NVPTX64;
+    break;
+  }
+  case ::jit_compiler::BinaryFormat::SPIRV: {
+    TargetSpec = (FusedKernelInfo.BinaryInfo.AddressBits == 64)
+                     ? __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64
+                     : __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV32;
+    break;
+  }
+  default:
+    sycl::exception(sycl::make_error_code(sycl::errc::invalid),
+                    "Invalid output format");
+  }
 
   DeviceBinaryContainer Binary;
 
@@ -897,8 +918,7 @@ pi_device_binaries jit_compiler::createPIDeviceBinary(
   DeviceBinariesCollection Collection;
   Collection.addDeviceBinary(std::move(Binary),
                              FusedKernelInfo.BinaryInfo.BinaryStart,
-                             FusedKernelInfo.BinaryInfo.BinarySize,
-                             FusedKernelInfo.BinaryInfo.AddressBits);
+                             FusedKernelInfo.BinaryInfo.BinarySize, TargetSpec);
 
   JITDeviceBinaries.push_back(std::move(Collection));
   return JITDeviceBinaries.back().getPIDeviceStruct();
diff --git a/sycl/source/detail/jit_compiler.hpp b/sycl/source/detail/jit_compiler.hpp
index 522c0749ef75b..e02be562de3ee 100644
--- a/sycl/source/detail/jit_compiler.hpp
+++ b/sycl/source/detail/jit_compiler.hpp
@@ -13,6 +13,7 @@
 #include <detail/scheduler/scheduler.hpp>
 
 namespace jit_compiler {
+enum class BinaryFormat;
 class JITContext;
 struct SYCLKernelInfo;
 using ArgUsageMask = std::vector<unsigned char>;
@@ -46,7 +47,7 @@ class jit_compiler {
   jit_compiler &operator=(const jit_compiler &&) = delete;
 
   pi_device_binaries
-  createPIDeviceBinary(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo);
+  createPIDeviceBinary(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo, ::jit_compiler::BinaryFormat Format);
 
   std::vector<uint8_t>
   encodeArgUsageMask(const ::jit_compiler::ArgUsageMask &Mask) const;
diff --git a/sycl/source/detail/jit_device_binaries.cpp b/sycl/source/detail/jit_device_binaries.cpp
index 0aa778da14240..eadd71d021ee8 100644
--- a/sycl/source/detail/jit_device_binaries.cpp
+++ b/sycl/source/detail/jit_device_binaries.cpp
@@ -81,7 +81,7 @@ void DeviceBinaryContainer::addProperty(PropertySetContainer &&Cont) {
 }
 
 pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary(
-    const unsigned char *BinaryStart, size_t BinarySize, size_t AddressBits) {
+    const unsigned char *BinaryStart, size_t BinarySize, const char* TargetSpec) {
   pi_device_binary_struct DeviceBinary;
   DeviceBinary.Version = PI_DEVICE_BINARY_VERSION;
   DeviceBinary.Kind = PI_DEVICE_BINARY_OFFLOAD_KIND_SYCL;
@@ -94,9 +94,7 @@ pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary(
   DeviceBinary.BinaryStart = BinaryStart;
   DeviceBinary.BinaryEnd = BinaryStart + BinarySize;
   DeviceBinary.Format = PI_DEVICE_BINARY_TYPE_SPIRV;
-  DeviceBinary.DeviceTargetSpec = (AddressBits == 32)
-                                      ? __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV32
-                                      : __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64;
+  DeviceBinary.DeviceTargetSpec = TargetSpec;
   DeviceBinary.EntriesBegin = PIOffloadEntries.data();
   DeviceBinary.EntriesEnd = PIOffloadEntries.data() + PIOffloadEntries.size();
   DeviceBinary.PropertySetsBegin = PIPropertySets.data();
@@ -108,14 +106,14 @@ pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary(
 void DeviceBinariesCollection::addDeviceBinary(DeviceBinaryContainer &&Cont,
                                                const unsigned char *BinaryStart,
                                                size_t BinarySize,
-                                               size_t AddressBits) {
+                                               const char* TargetSpec) {
   // Adding to the vectors might trigger reallocation, which would invalidate
   // the pointers used for PI structs if a PI struct has already been created
   // via getPIDeviceStruct(). Forbid calls to this method after the first PI
   // struct has been created.
   assert(Fused && "Adding to container would invalidate existing PI structs");
   PIBinaries.push_back(
-      Cont.getPIDeviceBinary(BinaryStart, BinarySize, AddressBits));
+      Cont.getPIDeviceBinary(BinaryStart, BinarySize, TargetSpec));
   Binaries.push_back(std::move(Cont));
 }
 
diff --git a/sycl/source/detail/jit_device_binaries.hpp b/sycl/source/detail/jit_device_binaries.hpp
index 7bf2c7d9fe07b..6fa142543b61f 100644
--- a/sycl/source/detail/jit_device_binaries.hpp
+++ b/sycl/source/detail/jit_device_binaries.hpp
@@ -111,7 +111,7 @@ class DeviceBinaryContainer {
 
   pi_device_binary_struct getPIDeviceBinary(const unsigned char *BinaryStart,
                                             size_t BinarySize,
-                                            size_t AddressBits);
+                                            const char* TargetSpec);
 
 private:
   bool Fused = true;
@@ -138,7 +138,7 @@ class DeviceBinariesCollection {
 
   void addDeviceBinary(DeviceBinaryContainer &&Cont,
                        const unsigned char *BinaryStart, size_t BinarySize,
-                       size_t AddressBits);
+                       const char* TargetSpec);
   pi_device_binaries getPIDeviceStruct();
 
 private:

From 2b8e92d86cdee9975100cbdd236dfc34405d75c4 Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Fri, 13 Jan 2023 13:16:28 +0000
Subject: [PATCH 05/16] [SYCL][Fusion] Avoid removing dependencies

Avoid the dependencies of the dependencies to be removed when cleaning
up the input commands from the graph without executing them.

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 sycl/source/detail/scheduler/commands.hpp      | 8 ++++++++
 sycl/source/detail/scheduler/graph_builder.cpp | 4 ++++
 2 files changed, 12 insertions(+)

diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp
index d4219770abbae..482c7a1d7d309 100644
--- a/sycl/source/detail/scheduler/commands.hpp
+++ b/sycl/source/detail/scheduler/commands.hpp
@@ -289,6 +289,14 @@ class Command {
   // XPTI instrumentation. Copy code location details to the internal struct.
   // Memory is allocated in this method and released in destructor.
   void copySubmissionCodeLocation();
+  
+  /// Clear all dependency events for device and host dependencies. This should
+  /// only be used if a command is about to be deleted without being executed
+  /// before that.
+  void clearAllDependencies() {
+    MPreparedDepsEvents.clear();
+    MPreparedHostDepsEvents.clear();
+  }
 
   /// Contains list of dependencies(edges)
   std::vector<DepDesc> MDeps;
diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp
index 1132d32b2f605..53cd1a7da6d13 100644
--- a/sycl/source/detail/scheduler/graph_builder.cpp
+++ b/sycl/source/detail/scheduler/graph_builder.cpp
@@ -1385,6 +1385,10 @@ void Scheduler::GraphBuilder::removeNodeFromGraph(
   }
 
   Node->MDeps.clear();
+  // Clear all the dependencies to avoid cleanDepEventsThroughOneLevel, called
+  // from the destructor of the command to delete the dependencies of the
+  // command this command depends on.
+  Node->clearAllDependencies();
 }
 
 void Scheduler::GraphBuilder::cancelFusion(QueueImplPtr Queue,

From fc43285af93e83862935889191452586d7ff26b9 Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Fri, 13 Jan 2023 13:23:14 +0000
Subject: [PATCH 06/16] [SYCL][Fusion] Set device binary image format

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 sycl/source/detail/jit_compiler.cpp        | 12 +++++++++---
 sycl/source/detail/jit_device_binaries.cpp |  8 ++++----
 sycl/source/detail/jit_device_binaries.hpp |  4 ++--
 3 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
index b2802c57e3589..fe645e43ab28a 100644
--- a/sycl/source/detail/jit_compiler.cpp
+++ b/sycl/source/detail/jit_compiler.cpp
@@ -842,10 +842,13 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
   }(FusedKernelInfo.NDR);
   updatePromotedArgs(FusedKernelInfo, NDRDesc, FusedArgs, ArgsStorage);
 
+  OSModuleHandle Handle = OSUtil::DummyModuleHandle;
   if (!FusionResult.cached()) {
     auto PIDeviceBinaries = createPIDeviceBinary(FusedKernelInfo, TargetFormat);
     detail::ProgramManager::getInstance().addImages(PIDeviceBinaries);
+    Handle = OSUtil::getOSModuleHandle(PIDeviceBinaries->DeviceBinaries);
   } else if (DebugEnabled) {
+    // TODO(Lukas): Create correct OSModuleHandle when using a cached binary.
     std::cerr << "INFO: Re-using existing device binary for fused kernel\n";
   }
 
@@ -866,8 +869,8 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
       NDRDesc, nullptr, nullptr, std::move(KernelBundleImplPtr),
       std::move(ArgsStorage), std::move(AccStorage),
       std::move(RawExtendedMembers), std::move(Requirements), std::move(Events),
-      std::move(FusedArgs), FusedKernelInfo.Name, OSUtil::DummyModuleHandle, {},
-      {}, CG::CGTYPE::Kernel));
+      std::move(FusedArgs), FusedKernelInfo.Name, Handle, {}, {},
+      CG::CGTYPE::Kernel));
   return FusedCG;
 }
 
@@ -876,15 +879,18 @@ pi_device_binaries jit_compiler::createPIDeviceBinary(
     ::jit_compiler::BinaryFormat Format) {
 
   const char *TargetSpec = nullptr;
+  pi_device_binary_type BinFormat = PI_DEVICE_BINARY_TYPE_NATIVE;
   switch (Format) {
   case ::jit_compiler::BinaryFormat::PTX: {
     TargetSpec = __SYCL_PI_DEVICE_BINARY_TARGET_NVPTX64;
+    BinFormat = PI_DEVICE_BINARY_TYPE_NONE;
     break;
   }
   case ::jit_compiler::BinaryFormat::SPIRV: {
     TargetSpec = (FusedKernelInfo.BinaryInfo.AddressBits == 64)
                      ? __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64
                      : __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV32;
+    BinFormat = PI_DEVICE_BINARY_TYPE_SPIRV;
     break;
   }
   default:
@@ -918,7 +924,7 @@ pi_device_binaries jit_compiler::createPIDeviceBinary(
   DeviceBinariesCollection Collection;
   Collection.addDeviceBinary(std::move(Binary),
                              FusedKernelInfo.BinaryInfo.BinaryStart,
-                             FusedKernelInfo.BinaryInfo.BinarySize, TargetSpec);
+                             FusedKernelInfo.BinaryInfo.BinarySize, TargetSpec, BinFormat);
 
   JITDeviceBinaries.push_back(std::move(Collection));
   return JITDeviceBinaries.back().getPIDeviceStruct();
diff --git a/sycl/source/detail/jit_device_binaries.cpp b/sycl/source/detail/jit_device_binaries.cpp
index eadd71d021ee8..59530a0e691ce 100644
--- a/sycl/source/detail/jit_device_binaries.cpp
+++ b/sycl/source/detail/jit_device_binaries.cpp
@@ -81,10 +81,11 @@ void DeviceBinaryContainer::addProperty(PropertySetContainer &&Cont) {
 }
 
 pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary(
-    const unsigned char *BinaryStart, size_t BinarySize, const char* TargetSpec) {
+    const unsigned char *BinaryStart, size_t BinarySize, const char* TargetSpec, pi_device_binary_type Format) {
   pi_device_binary_struct DeviceBinary;
   DeviceBinary.Version = PI_DEVICE_BINARY_VERSION;
   DeviceBinary.Kind = PI_DEVICE_BINARY_OFFLOAD_KIND_SYCL;
+  DeviceBinary.Format = Format;
   DeviceBinary.CompileOptions = "";
   DeviceBinary.LinkOptions = "";
   DeviceBinary.ManifestStart = nullptr;
@@ -93,7 +94,6 @@ pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary(
   // the JITContext.
   DeviceBinary.BinaryStart = BinaryStart;
   DeviceBinary.BinaryEnd = BinaryStart + BinarySize;
-  DeviceBinary.Format = PI_DEVICE_BINARY_TYPE_SPIRV;
   DeviceBinary.DeviceTargetSpec = TargetSpec;
   DeviceBinary.EntriesBegin = PIOffloadEntries.data();
   DeviceBinary.EntriesEnd = PIOffloadEntries.data() + PIOffloadEntries.size();
@@ -106,14 +106,14 @@ pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary(
 void DeviceBinariesCollection::addDeviceBinary(DeviceBinaryContainer &&Cont,
                                                const unsigned char *BinaryStart,
                                                size_t BinarySize,
-                                               const char* TargetSpec) {
+                                               const char* TargetSpec, pi_device_binary_type Format) {
   // Adding to the vectors might trigger reallocation, which would invalidate
   // the pointers used for PI structs if a PI struct has already been created
   // via getPIDeviceStruct(). Forbid calls to this method after the first PI
   // struct has been created.
   assert(Fused && "Adding to container would invalidate existing PI structs");
   PIBinaries.push_back(
-      Cont.getPIDeviceBinary(BinaryStart, BinarySize, TargetSpec));
+      Cont.getPIDeviceBinary(BinaryStart, BinarySize, TargetSpec, Format));
   Binaries.push_back(std::move(Cont));
 }
 
diff --git a/sycl/source/detail/jit_device_binaries.hpp b/sycl/source/detail/jit_device_binaries.hpp
index 6fa142543b61f..96079d3a25a19 100644
--- a/sycl/source/detail/jit_device_binaries.hpp
+++ b/sycl/source/detail/jit_device_binaries.hpp
@@ -111,7 +111,7 @@ class DeviceBinaryContainer {
 
   pi_device_binary_struct getPIDeviceBinary(const unsigned char *BinaryStart,
                                             size_t BinarySize,
-                                            const char* TargetSpec);
+                                            const char* TargetSpec, pi_device_binary_type Format);
 
 private:
   bool Fused = true;
@@ -138,7 +138,7 @@ class DeviceBinariesCollection {
 
   void addDeviceBinary(DeviceBinaryContainer &&Cont,
                        const unsigned char *BinaryStart, size_t BinarySize,
-                       const char* TargetSpec);
+                       const char* TargetSpec, pi_device_binary_type Format);
   pi_device_binaries getPIDeviceStruct();
 
 private:

From 6fad170cdcec62dd4745ade23e86f9f5fbeb735f Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Mon, 16 Jan 2023 14:28:36 +0000
Subject: [PATCH 07/16] [SYCL][Fusion] Refactor target-specific processing

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 sycl-fusion/passes/CMakeLists.txt             |   1 +
 .../passes/kernel-fusion/SYCLKernelFusion.cpp | 133 +++--------
 .../passes/kernel-fusion/SYCLKernelFusion.h   |   9 +-
 .../passes/target/TargetFusionInfo.cpp        | 225 ++++++++++++++++++
 sycl-fusion/passes/target/TargetFusionInfo.h  | 137 +++++++++++
 5 files changed, 395 insertions(+), 110 deletions(-)
 create mode 100644 sycl-fusion/passes/target/TargetFusionInfo.cpp
 create mode 100644 sycl-fusion/passes/target/TargetFusionInfo.h

diff --git a/sycl-fusion/passes/CMakeLists.txt b/sycl-fusion/passes/CMakeLists.txt
index fe81b76b5bbcb..41398f6ee9e99 100644
--- a/sycl-fusion/passes/CMakeLists.txt
+++ b/sycl-fusion/passes/CMakeLists.txt
@@ -35,6 +35,7 @@ add_llvm_library(SYCLKernelFusionPasses
   syclcp/SYCLCP.cpp
   cleanup/Cleanup.cpp
   debug/PassDebug.cpp
+  target/TargetFusionInfo.cpp
 
   DEPENDS
   intrinsics_gen
diff --git a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp
index 241496ac9044f..d3dec7f16a81f 100644
--- a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp
+++ b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp
@@ -31,12 +31,6 @@
 
 using namespace llvm;
 
-constexpr static StringLiteral KernelArgAddrSpace{"kernel_arg_addr_space"};
-constexpr static StringLiteral KernelArgAccessQual{"kernel_arg_access_qual"};
-constexpr static StringLiteral KernelArgType{"kernel_arg_type"};
-constexpr static StringLiteral KernelArgBaseType{"kernel_arg_base_type"};
-constexpr static StringLiteral KernelArgTypeQual{"kernel_arg_type_qual"};
-
 constexpr StringLiteral SYCLKernelFusion::NDRangeMDKey;
 constexpr StringLiteral SYCLKernelFusion::NDRangesMDKey;
 
@@ -144,6 +138,8 @@ PreservedAnalyses SYCLKernelFusion::run(Module &M, ModuleAnalysisManager &AM) {
       AM.getResult<SYCLModuleInfoAnalysis>(M).ModuleInfo;
   assert(ModuleInfo && "No module information available");
 
+  auto TFI = TargetFusionInfo::getTargetFusionInfo(&M);
+
   // Iterate over the functions in the module and locate all
   // stub functions identified by metadata.
   SmallPtrSet<Function *, 8> ToCleanUp;
@@ -156,7 +152,7 @@ PreservedAnalyses SYCLKernelFusion::run(Module &M, ModuleAnalysisManager &AM) {
       // attached to this stub function.
       // The newly created function will carry the name also specified
       // in the metadata.
-      if (auto Err = fuseKernel(M, F, ModuleInfo, ToCleanUp)) {
+      if (auto Err = fuseKernel(M, F, ModuleInfo, TFI, ToCleanUp)) {
         DeferredErrs = joinErrors(std::move(DeferredErrs), std::move(Err));
       }
       // Rembember the stub for deletion, as it is not required anymore after
@@ -164,6 +160,10 @@ PreservedAnalyses SYCLKernelFusion::run(Module &M, ModuleAnalysisManager &AM) {
       ToCleanUp.insert(&F);
     }
   }
+  // Notify the target-specific logic that some functions will be erased
+  // shortly.
+  SmallVector<Function *> NotifyDelete{ToCleanUp.begin(), ToCleanUp.end()};
+  TFI.notifyFunctionsDelete(NotifyDelete);
   // Delete all the stub functions
   for (Function *SF : ToCleanUp) {
     SF->eraseFromParent();
@@ -230,11 +230,13 @@ static FusionInsertPoints addGuard(IRBuilderBase &Builder,
   return {Entry, CallInsertion, Exit};
 }
 
-static Expected<CallInst *> createFusionCall(
-    IRBuilderBase &Builder, Function *F, ArrayRef<Value *> CallArgs,
-    const jit_compiler::NDRange &SrcNDRange,
-    const jit_compiler::NDRange &FusedNDRange, bool IsLast, int BarriersFlags,
-    jit_compiler::Remapper &Remapper, bool ShouldRemap) {
+static Expected<CallInst *>
+createFusionCall(IRBuilderBase &Builder, Function *F,
+                 ArrayRef<Value *> CallArgs,
+                 const jit_compiler::NDRange &SrcNDRange,
+                 const jit_compiler::NDRange &FusedNDRange, bool IsLast,
+                 int BarriersFlags, jit_compiler::Remapper &Remapper,
+                 bool ShouldRemap, TargetFusionInfo &TargetInfo) {
   const auto IPs = addGuard(Builder, SrcNDRange, FusedNDRange, IsLast);
 
   if (ShouldRemap) {
@@ -260,7 +262,7 @@ static Expected<CallInst *> createFusionCall(
 
   // Insert barrier if needed
   if (!IsLast && BarriersFlags > 0) {
-    jit_compiler::barrierCall(Builder, BarriersFlags);
+    TargetInfo.createBarrierCall(Builder, BarriersFlags);
   }
 
   // Set insert point for future insertions
@@ -271,6 +273,7 @@ static Expected<CallInst *> createFusionCall(
 
 Error SYCLKernelFusion::fuseKernel(
     Module &M, Function &StubFunction, jit_compiler::SYCLModuleInfo *ModInfo,
+    TargetFusionInfo &TargetInfo,
     SmallPtrSetImpl<Function *> &ToCleanUp) const {
   // Retrieve the metadata from the stub function.
   // The first operand of the tuple is the name that the newly created,
@@ -343,12 +346,9 @@ Error SYCLKernelFusion::fuseKernel(
   SmallVector<std::string> FusedArgNames;
   SmallVector<AttributeSet> FusedParamAttributes;
   // We must keep track of some metadata attached to each parameter.
-  // Collect it in lists, so it can be attached to the fused function later on.
-  MDList KernelArgAddressSpaces;
-  MDList KernelArgAccessQualifiers;
-  MDList KernelArgTypes;
-  MDList KernelArgBaseTypes;
-  MDList KernelArgTypeQualifiers;
+  // Collect it, so it can be attached to the fused function later on.
+  MetadataCollection MDCollection{TargetInfo.getKernelMetadataKeys()};
+
   // Add the information about the new kernel to the SYCLModuleInfo.
   // Initialize the jit_compiler::SYCLKernelInfo with the name. The remaining
   // information for functor & argument layout and attributes will be filled in
@@ -425,14 +425,7 @@ Error SYCLKernelFusion::fuseKernel(
     // Add the metadata corresponding to the used arguments to the different
     // lists. NOTE: We do not collect the "kernel_arg_name" metadata, because
     // the kernel arguments receive new names in the fused kernel.
-    addToFusedMetadata(FF, KernelArgAddrSpace, UsedArgsMask,
-                       KernelArgAddressSpaces);
-    addToFusedMetadata(FF, KernelArgAccessQual, UsedArgsMask,
-                       KernelArgAccessQualifiers);
-    addToFusedMetadata(FF, KernelArgType, UsedArgsMask, KernelArgTypes);
-    addToFusedMetadata(FF, KernelArgBaseType, UsedArgsMask, KernelArgBaseTypes);
-    addToFusedMetadata(FF, KernelArgTypeQual, UsedArgsMask,
-                       KernelArgTypeQualifiers);
+    MDCollection.collectFromFunction(FF, UsedArgsMask);
 
     // Update the fused kernel's KernelInfo with information from this input
     // kernel.
@@ -502,26 +495,15 @@ Error SYCLKernelFusion::fuseKernel(
     AI.value().setName(ArgName);
     KernelArgNames.push_back(MDString::get(LLVMCtx, ArgName));
   }
-  // Attach the fused kernel_arg_* metadata collected from the different input
+  // Attach the fused metadata collected from the different input
   // kernels to the fused function.
-  attachFusedMetadata(FusedFunction, "kernel_arg_addr_space",
-                      KernelArgAddressSpaces);
-  attachFusedMetadata(FusedFunction, "kernel_arg_access_qual",
-                      KernelArgAccessQualifiers);
-  attachFusedMetadata(FusedFunction, "kernel_arg_type", KernelArgTypes);
-  attachFusedMetadata(FusedFunction, "kernel_arg_base_type",
-                      KernelArgBaseTypes);
-  attachFusedMetadata(FusedFunction, "kernel_arg_type_qual",
-                      KernelArgTypeQualifiers);
-  attachFusedMetadata(FusedFunction, "kernel_arg_name", KernelArgNames);
+  MDCollection.attachToFunction(FusedFunction);
   // Add metadata for reqd_work_group_size and work_group_size_hint
   attachKernelAttributeMD(LLVMCtx, FusedFunction, FusedKernelInfo);
 
-  // The fused kernel should be a SPIR-V kernel again.
-  // NOTE: If this pass is used in a scenario where input and output
-  // of the compilation are not SPIR-V, care must be taken of other
-  // potential calling conventions here (e.g., nvptx).
-  FusedFunction->setCallingConv(CallingConv::SPIR_KERNEL);
+  // Mark the fused function as a kernel by calling TargetFusionInfo, because
+  // this is target-specific.
+  TargetInfo.addKernelFunction(FusedFunction);
 
   // Fusion is implemented as a two step process: In the first step, we
   // simply create calls to the functions that should be fused into this
@@ -557,9 +539,9 @@ Error SYCLKernelFusion::fuseKernel(
         unsigned ParamIdx = ParamMapping[{FuncIndex, I}];
         CallArgs.push_back(FusedFunction->getArg(ParamIdx));
       }
-      auto CallOrErr = createFusionCall(Builder, IF, CallArgs, KF.ND, NDRange,
-                                        FuncIndex == BarriersEnd, BarriersFlags,
-                                        Remapper, IsHeterogeneousNDRangesList);
+      auto CallOrErr = createFusionCall(
+          Builder, IF, CallArgs, KF.ND, NDRange, FuncIndex == BarriersEnd,
+          BarriersFlags, Remapper, IsHeterogeneousNDRangesList, TargetInfo);
       // Add to the set of original kernel functions that can be deleted after
       // fusion is complete.
       ToCleanUp.insert(IF);
@@ -602,46 +584,8 @@ Error SYCLKernelFusion::fuseKernel(
     }
   }
 
-  // Remove all existing calls of the ITT instrumentation functions. Insert new
-  // ones in the entry block of the fused kernel and every exit block if the
-  // functions are present in the module.
-  // We cannot use the existing SPIRITTAnnotations pass, because that pass might
-  // insert calls to functions not present in the module (e.g., ITT
-  // instrumentations for barriers). As the JITed module is not linked with
-  // libdevice anymore, the functions would remain unresolved and cause the
-  // driver to fail.
-  Function *StartWrapperFunc = M.getFunction(ITTStartWrapper);
-  Function *FinishWrapperFunc = M.getFunction(ITTFinishWrapper);
-  bool InsertWrappers =
-      ((StartWrapperFunc && !StartWrapperFunc->isDeclaration()) &&
-       (FinishWrapperFunc && !FinishWrapperFunc->isDeclaration()));
-  auto *WrapperFuncTy =
-      FunctionType::get(Type::getVoidTy(M.getContext()), /*isVarArg*/ false);
-  for (auto &BB : *FusedFunction) {
-    for (auto Inst = BB.begin(); Inst != BB.end();) {
-      if (auto *CB = dyn_cast<CallBase>(Inst)) {
-        if (CB->getCalledFunction()->getName().starts_with("__itt_offload")) {
-          Inst = Inst->eraseFromParent();
-          continue;
-        }
-      }
-      ++Inst;
-    }
-    if (InsertWrappers) {
-      if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator())) {
-        auto *WrapperCall =
-            CallInst::Create(WrapperFuncTy, FinishWrapperFunc, "", RI);
-        WrapperCall->setCallingConv(CallingConv::SPIR_FUNC);
-      }
-    }
-  }
-  if (InsertWrappers) {
-    FusedFunction->getEntryBlock().getFirstInsertionPt();
-    auto *WrapperCall = CallInst::Create(
-        WrapperFuncTy, StartWrapperFunc, "",
-        &*FusedFunction->getEntryBlock().getFirstInsertionPt());
-    WrapperCall->setCallingConv(CallingConv::SPIR_FUNC);
-  }
+  // Perform target-specific post-processing of the new fused kernel.
+  TargetInfo.postProcessKernel(FusedFunction);
   return Error::success();
 }
 
@@ -717,23 +661,6 @@ static unsigned getUnsignedFromMD(Metadata *MD) {
   return ConstInt->getZExtValue();
 }
 
-void SYCLKernelFusion::addToFusedMetadata(
-    Function *InputFunction, const StringRef &Kind,
-    const ArrayRef<bool> IsArgPresentMask,
-    SmallVectorImpl<Metadata *> &FusedMDList) const {
-  // Retrieve metadata from one of the input kernels and add it to the list
-  // of fused metadata.
-  assert(InputFunction->hasMetadata(Kind) &&
-         "Required Metadata not present on input kernel");
-  if (auto *MD = InputFunction->getMetadata(Kind)) {
-    for (auto MaskedOps : llvm::zip(IsArgPresentMask, MD->operands())) {
-      if (std::get<0>(MaskedOps)) {
-        FusedMDList.emplace_back(std::get<1>(MaskedOps).get());
-      }
-    }
-  }
-}
-
 void SYCLKernelFusion::attachFusedMetadata(
     Function *FusedFunction, const StringRef &Kind,
     const ArrayRef<Metadata *> FusedMetadata) const {
diff --git a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.h b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.h
index 5e26595d2d343..5f52f0a317d14 100644
--- a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.h
+++ b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.h
@@ -10,6 +10,7 @@
 #define SYCL_FUSION_PASSES_SYCLKERNELFUSION_H
 
 #include "Kernel.h"
+#include "target/TargetFusionInfo.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
@@ -52,8 +53,6 @@ class SYCLKernelFusion : public llvm::PassInfoMixin<SYCLKernelFusion> {
   // locate our own metadata again.
   static constexpr auto MetadataKind = "sycl.kernel.fused";
   static constexpr auto ParameterMDKind = "sycl.kernel.param";
-  static constexpr auto ITTStartWrapper = "__itt_offload_wi_start_wrapper";
-  static constexpr auto ITTFinishWrapper = "__itt_offload_wi_finish_wrapper";
 
   using MDList = llvm::SmallVector<llvm::Metadata *, 16>;
 
@@ -112,6 +111,7 @@ class SYCLKernelFusion : public llvm::PassInfoMixin<SYCLKernelFusion> {
   llvm::Error
   fuseKernel(llvm::Module &M, llvm::Function &StubFunction,
              jit_compiler::SYCLModuleInfo *ModInfo,
+             llvm::TargetFusionInfo &TargetInfo,
              llvm::SmallPtrSetImpl<llvm::Function *> &ToCleanUp) const;
 
   void canonicalizeParameters(
@@ -119,11 +119,6 @@ class SYCLKernelFusion : public llvm::PassInfoMixin<SYCLKernelFusion> {
 
   Parameter getParamFromMD(llvm::Metadata *MD) const;
 
-  void addToFusedMetadata(
-      llvm::Function *InputFunction, const llvm::StringRef &Kind,
-      const llvm::ArrayRef<bool> IsArgPresentMask,
-      llvm::SmallVectorImpl<llvm::Metadata *> &FusedMDList) const;
-
   void attachFusedMetadata(
       llvm::Function *FusedFunction, const llvm::StringRef &Kind,
       const llvm::ArrayRef<llvm::Metadata *> FusedMetadata) const;
diff --git a/sycl-fusion/passes/target/TargetFusionInfo.cpp b/sycl-fusion/passes/target/TargetFusionInfo.cpp
new file mode 100644
index 0000000000000..191ab676613bb
--- /dev/null
+++ b/sycl-fusion/passes/target/TargetFusionInfo.cpp
@@ -0,0 +1,225 @@
+//==---------------------- TargetFusionInfo.cpp ----------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "TargetFusionInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsNVPTX.h"
+
+using namespace llvm;
+
+//
+// TargetFusionInfo
+//
+
+TargetFusionInfo TargetFusionInfo::getTargetFusionInfo(llvm::Module *Mod) {
+  llvm::Triple Tri(Mod->getTargetTriple());
+  if (Tri.isNVPTX()) {
+    return TargetFusionInfo(
+        std::shared_ptr<NVPTXTargetFusionInfo>(new NVPTXTargetFusionInfo(Mod)));
+  }
+  if (Tri.isSPIRV()) {
+    return TargetFusionInfo(
+        std::shared_ptr<SPIRVTargetFusionInfo>(new SPIRVTargetFusionInfo(Mod)));
+  }
+  assert(false && "Unsupported target for fusion");
+}
+
+//
+// SPIRVTargetFusionInfo
+//
+
+void SPIRVTargetFusionInfo::addKernelFunction(Function *KernelFunc) {
+  KernelFunc->setCallingConv(CallingConv::SPIR_KERNEL);
+}
+
+ArrayRef<StringRef> SPIRVTargetFusionInfo::getKernelMetadataKeys() {
+  static SmallVector<StringRef> Keys{
+      {"kernel_arg_addr_space", "kernel_arg_access_qual", "kernel_arg_type",
+       "kernel_arg_base_type", "kernel_arg_type_qual"}};
+  return Keys;
+}
+
+void SPIRVTargetFusionInfo::postProcessKernel(Function *KernelFunc) {
+  static constexpr auto ITTStartWrapper = "__itt_offload_wi_start_wrapper";
+  static constexpr auto ITTFinishWrapper = "__itt_offload_wi_finish_wrapper";
+  // Remove all existing calls of the ITT instrumentation functions. Insert new
+  // ones in the entry block of the fused kernel and every exit block if the
+  // functions are present in the module.
+  // We cannot use the existing SPIRITTAnnotations pass, because that pass might
+  // insert calls to functions not present in the module (e.g., ITT
+  // instrumentations for barriers). As the JITed module is not linked with
+  // libdevice anymore, the functions would remain unresolved and cause the
+  // driver to fail.
+  Function *StartWrapperFunc = LLVMMod->getFunction(ITTStartWrapper);
+  Function *FinishWrapperFunc = LLVMMod->getFunction(ITTFinishWrapper);
+  bool InsertWrappers =
+      ((StartWrapperFunc && !StartWrapperFunc->isDeclaration()) &&
+       (FinishWrapperFunc && !FinishWrapperFunc->isDeclaration()));
+  auto *WrapperFuncTy = FunctionType::get(
+      Type::getVoidTy(LLVMMod->getContext()), /*isVarArg*/ false);
+  for (auto &BB : *KernelFunc) {
+    for (auto Inst = BB.begin(); Inst != BB.end();) {
+      if (auto *CB = dyn_cast<CallBase>(Inst)) {
+        if (CB->getCalledFunction()->getName().starts_with("__itt_offload")) {
+          Inst = Inst->eraseFromParent();
+          continue;
+        }
+      }
+      ++Inst;
+    }
+    if (InsertWrappers) {
+      if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator())) {
+        auto *WrapperCall =
+            CallInst::Create(WrapperFuncTy, FinishWrapperFunc, "", RI);
+        WrapperCall->setCallingConv(CallingConv::SPIR_FUNC);
+      }
+    }
+  }
+  if (InsertWrappers) {
+    KernelFunc->getEntryBlock().getFirstInsertionPt();
+    auto *WrapperCall =
+        CallInst::Create(WrapperFuncTy, StartWrapperFunc, "",
+                         &*KernelFunc->getEntryBlock().getFirstInsertionPt());
+    WrapperCall->setCallingConv(CallingConv::SPIR_FUNC);
+  }
+}
+
+void SPIRVTargetFusionInfo::createBarrierCall(IRBuilderBase &Builder,
+                                              int BarrierFlags) {
+  if (BarrierFlags == -1) {
+    return;
+  }
+  assert((BarrierFlags == 1 || BarrierFlags == 2 || BarrierFlags == 3) &&
+         "Invalid barrier flags");
+
+  static const auto FnAttrs = AttributeSet::get(
+      LLVMMod->getContext(),
+      {Attribute::get(LLVMMod->getContext(), Attribute::AttrKind::Convergent),
+       Attribute::get(LLVMMod->getContext(), Attribute::AttrKind::NoUnwind)});
+
+  static constexpr StringLiteral N{"_Z22__spirv_ControlBarrierjjj"};
+
+  Function *F = LLVMMod->getFunction(N);
+  if (!F) {
+    constexpr auto Linkage = GlobalValue::LinkageTypes::ExternalLinkage;
+
+    auto *Ty = FunctionType::get(
+        Builder.getVoidTy(),
+        {Builder.getInt32Ty(), Builder.getInt32Ty(), Builder.getInt32Ty()},
+        false /* isVarArg*/);
+
+    F = Function::Create(Ty, Linkage, N, *LLVMMod);
+
+    F->setAttributes(
+        AttributeList::get(LLVMMod->getContext(), FnAttrs, {}, {}));
+    F->setCallingConv(CallingConv::SPIR_FUNC);
+  }
+
+  // See
+  // https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#Memory_Semantics_-id-
+  SmallVector<Value *> Args{
+      Builder.getInt32(/*Exec Scope : Workgroup = */ 2),
+      Builder.getInt32(/*Exec Scope : Workgroup = */ 2),
+      Builder.getInt32(0x10 | (BarrierFlags % 2 == 1 ? 0x100 : 0x0) |
+                       ((BarrierFlags >> 1 == 1 ? 0x200 : 0x0)))};
+
+  auto *BarrierCallInst = Builder.CreateCall(F, Args);
+  BarrierCallInst->setAttributes(
+      AttributeList::get(LLVMMod->getContext(), FnAttrs, {}, {}));
+  BarrierCallInst->setCallingConv(CallingConv::SPIR_FUNC);
+}
+
+//
+// NVPTXTargetFusionInfo
+//
+
+void NVPTXTargetFusionInfo::notifyFunctionsDelete(
+    llvm::ArrayRef<Function *> Funcs) {
+  SmallPtrSet<Constant *, 8> DeletedFuncs{Funcs.begin(), Funcs.end()};
+  SmallVector<MDNode *> ValidKernels;
+  auto *OldAnnotations = LLVMMod->getNamedMetadata("nvvm.annotations");
+  for (auto *Op : OldAnnotations->operands()) {
+    if (auto *TOp = dyn_cast<MDTuple>(Op)) {
+      if (auto *COp = dyn_cast_if_present<ConstantAsMetadata>(
+              TOp->getOperand(0).get())) {
+        if (!DeletedFuncs.contains(COp->getValue())) {
+          ValidKernels.push_back(Op);
+          // Add to the set to also remove duplicate entries.
+          DeletedFuncs.insert(COp->getValue());
+        }
+      }
+    }
+  }
+  LLVMMod->eraseNamedMetadata(OldAnnotations);
+  auto *NewAnnotations = LLVMMod->getOrInsertNamedMetadata("nvvm.annotations");
+  for (auto *Kernel : ValidKernels) {
+    NewAnnotations->addOperand(Kernel);
+  }
+}
+
+void NVPTXTargetFusionInfo::addKernelFunction(Function *KernelFunc) {
+  auto *NVVMAnnotations = LLVMMod->getOrInsertNamedMetadata("nvvm.annotations");
+  auto *MDOne = ConstantAsMetadata::get(
+      ConstantInt::get(Type::getInt32Ty(LLVMMod->getContext()), 1));
+  auto *MDKernelString = MDString::get(LLVMMod->getContext(), "kernel");
+  auto *MDFunc = ConstantAsMetadata::get(KernelFunc);
+  SmallVector<Metadata *, 3> KernelMD({MDFunc, MDKernelString, MDOne});
+  auto *Tuple = MDTuple::get(LLVMMod->getContext(), KernelMD);
+  NVVMAnnotations->addOperand(Tuple);
+}
+
+ArrayRef<StringRef> NVPTXTargetFusionInfo::getKernelMetadataKeys() {
+  // FIXME: Check whether we need to take care of sycl_fixed_targets.
+  static SmallVector<StringRef> Keys{{"kernel_arg_buffer_location",
+                                      "kernel_arg_runtime_aligned",
+                                      "kernel_arg_exclusive_ptr"}};
+  return Keys;
+}
+
+void NVPTXTargetFusionInfo::createBarrierCall(IRBuilderBase &Builder,
+                                              int BarrierFlags) {
+  if (BarrierFlags == -1) {
+    return;
+  }
+  // Emit a call to llvm.nvvm.barrier0. From the user manual of the NVPTX
+  // backend: "The ‘@llvm.nvvm.barrier0()’ intrinsic emits a PTX bar.sync 0
+  // instruction, equivalent to the __syncthreads() call in CUDA."
+  Builder.CreateIntrinsic(Intrinsic::NVVMIntrinsics::nvvm_barrier0, {}, {});
+}
+
+//
+// MetadataCollection
+//
+
+MetadataCollection::MetadataCollection(ArrayRef<StringRef> MDKeys)
+    : Keys{MDKeys}, Collection(MDKeys.size()) {}
+
+void MetadataCollection::collectFromFunction(
+    llvm::Function *Func, const ArrayRef<bool> IsArgPresentMask) {
+  for (auto &Key : Keys) {
+    // TODO: Do we want to assert for the presence of the metadata here?
+    if (auto *MD = Func->getMetadata(Key)) {
+      for (auto MaskedOps : llvm::zip(IsArgPresentMask, MD->operands())) {
+        if (std::get<0>(MaskedOps)) {
+          Collection[Key].emplace_back(std::get<1>(MaskedOps).get());
+        }
+      }
+    }
+  }
+}
+
+void MetadataCollection::attachToFunction(llvm::Function *Func) {
+  for (auto &Key : Keys) {
+    // Attach a list of fused metadata for a kind to the fused function.
+    auto *MDEntries = MDNode::get(Func->getContext(), Collection[Key]);
+    Func->setMetadata(Key, MDEntries);
+  }
+}
diff --git a/sycl-fusion/passes/target/TargetFusionInfo.h b/sycl-fusion/passes/target/TargetFusionInfo.h
new file mode 100644
index 0000000000000..653af4904e2ae
--- /dev/null
+++ b/sycl-fusion/passes/target/TargetFusionInfo.h
@@ -0,0 +1,137 @@
+//==-- TargetFusionInfo.h - Encapsule target-specific fusion functionality -==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+
+namespace llvm {
+
+class TargetFusionInfoImpl {
+
+public:
+  virtual ~TargetFusionInfoImpl() = default;
+
+  virtual void notifyFunctionsDelete(llvm::ArrayRef<Function *> Funcs) {
+    (void)Funcs;
+  }
+
+  virtual void addKernelFunction(Function *KernelFunc) { (void)KernelFunc; }
+
+  virtual void postProcessKernel(Function *KernelFunc) { (void)KernelFunc; }
+
+  virtual ArrayRef<StringRef> getKernelMetadataKeys() { return {}; }
+
+  virtual void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) = 0;
+
+protected:
+  explicit TargetFusionInfoImpl(llvm::Module *Mod) : LLVMMod{Mod} {};
+
+  llvm::Module *LLVMMod;
+
+  friend class TargetFusionInfo;
+};
+
+class SPIRVTargetFusionInfo : public TargetFusionInfoImpl {
+public:
+  void addKernelFunction(Function *KernelFunc) override;
+
+  ArrayRef<StringRef> getKernelMetadataKeys() override;
+
+  void postProcessKernel(Function *KernelFunc) override;
+
+  void createBarrierCall(IRBuilderBase& Builder, int BarrierFlags) override;
+
+private:
+  using TargetFusionInfoImpl::TargetFusionInfoImpl;
+};
+
+class NVPTXTargetFusionInfo : public TargetFusionInfoImpl {
+public:
+  void notifyFunctionsDelete(llvm::ArrayRef<Function *> Funcs) override;
+
+  void addKernelFunction(Function *KernelFunc) override;
+
+  ArrayRef<StringRef> getKernelMetadataKeys() override;
+
+  void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) override;
+
+private:
+  using TargetFusionInfoImpl::TargetFusionInfoImpl;
+};
+
+///
+/// Common interface to target-specific logic around handling of kernel
+/// functions.
+class TargetFusionInfo {
+public:
+  ///
+  /// Create the correct target-specific implementation based on the target
+  /// triple of \p Module.
+  static TargetFusionInfo getTargetFusionInfo(llvm::Module *Module);
+
+  ///
+  /// Notify the target-specific implementation that set of functions \p Funcs
+  /// is about to be erased from the module. This should be called BEFORE
+  /// erasing the functions.
+  void notifyFunctionsDelete(llvm::ArrayRef<Function *> Funcs) {
+    Impl->notifyFunctionsDelete(Funcs);
+  }
+
+  ///
+  /// Notify the target-specific implementation that the function \p KernelFunc
+  /// was added as a new kernel. This should be called AFTER the function has
+  /// been added.
+  void addKernelFunction(llvm::Function *KernelFunc) {
+    Impl->addKernelFunction(KernelFunc);
+  }
+
+  ///
+  /// Target-specific post-processing of the new kernel function \p KernelFunc.
+  /// This should be called AFTER the function has been added and defined.
+  void postProcessKernel(Function *KernelFunc) {
+    Impl->postProcessKernel(KernelFunc);
+  }
+
+  ///
+  /// Get the target-specific list of argument metadata attached to each
+  /// function that should be collected and attached to the fused kernel.
+  llvm::ArrayRef<llvm::StringRef> getKernelMetadataKeys() {
+    return Impl->getKernelMetadataKeys();
+  }
+
+  void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) {
+    Impl->createBarrierCall(Builder, BarrierFlags);
+  }
+
+private:
+  using ImplPtr = std::shared_ptr<TargetFusionInfoImpl>;
+
+  TargetFusionInfo(ImplPtr &&I) : Impl{I} {}
+
+  ImplPtr Impl;
+};
+
+///
+/// Simple helper to collect a target-specific set of kernel argument metadata
+/// from input functions and attach it to a fused kernel.
+class MetadataCollection {
+public:
+  explicit MetadataCollection(llvm::ArrayRef<llvm::StringRef> MDKeys);
+
+  void collectFromFunction(llvm::Function *Func,
+                           const ArrayRef<bool> IsArgPresentMask);
+
+  void attachToFunction(llvm::Function *Func);
+
+private:
+  llvm::SmallVector<llvm::StringRef> Keys;
+
+  llvm::StringMap<llvm::SmallVector<llvm::Metadata *>> Collection;
+};
+} // namespace llvm

From ce6855a23a4da643c58cd1b0947d5f6044ea1f92 Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Tue, 17 Jan 2023 08:19:21 +0000
Subject: [PATCH 08/16] [SYCL][Fusion] Do not require null terminator

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 .../jit-compiler/lib/translation/KernelTranslation.cpp       | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
index 31fa850d58d63..7e1318c70a4a9 100644
--- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
+++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
@@ -111,7 +111,10 @@ KernelTranslator::loadLLVMKernel(llvm::LLVMContext &LLVMCtx,
   llvm::StringRef RawData(reinterpret_cast<const char *>(BinInfo.BinaryStart),
                           BinInfo.BinarySize);
   return llvm::parseBitcodeFile(
-      MemoryBuffer::getMemBuffer(RawData)->getMemBufferRef(), LLVMCtx);
+      MemoryBuffer::getMemBuffer(RawData, Kernel.Name,
+                                 /* RequiresNullTermnator*/ false)
+          ->getMemBufferRef(),
+      LLVMCtx);
 }
 
 llvm::Expected<std::unique_ptr<llvm::Module>>

From e89cb781a8ae68fa4542cec0a81c840297e2ba58 Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Tue, 17 Jan 2023 14:42:40 +0000
Subject: [PATCH 09/16] [SYCL][Fusion] Refactor more target-specific code

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 sycl-fusion/jit-compiler/lib/KernelFusion.cpp |  1 -
 .../lib/fusion/FusionPipeline.cpp             | 19 ++++-
 .../lib/translation/KernelTranslation.cpp     |  3 -
 sycl-fusion/passes/CMakeLists.txt             |  1 +
 sycl-fusion/passes/cleanup/Cleanup.cpp        | 17 ++--
 sycl-fusion/passes/cleanup/Cleanup.h          |  4 +-
 .../internalization/Internalization.cpp       | 61 ++++---------
 .../passes/kernel-fusion/SYCLKernelFusion.cpp | 13 +--
 sycl-fusion/passes/syclcp/SYCLCP.cpp          | 12 +--
 .../passes/target/TargetFusionInfo.cpp        | 60 +++++++++++--
 sycl-fusion/passes/target/TargetFusionInfo.h  | 85 +++++++++++++++----
 11 files changed, 177 insertions(+), 99 deletions(-)

diff --git a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
index 6c14ac182514a..821ad4cd36369 100644
--- a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
+++ b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
@@ -137,7 +137,6 @@ FusionResult KernelFusion::fuseKernels(
 
   SYCLKernelInfo &FusedKernelInfo = *NewModInfo->getKernelFor(FusedKernelName);
 
-  // TODO
   BinaryFormat TargetFormat = ConfigHelper::get<option::JITTargetFormat>();
 
   if (auto Error = translation::KernelTranslator::translateKernel(
diff --git a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp
index 3ede007aa69a4..fb2e2b15200a7 100644
--- a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp
+++ b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp
@@ -23,6 +23,7 @@
 #ifndef NDEBUG
 #include "llvm/IR/Verifier.h"
 #endif // NDEBUG
+#include "llvm/ADT/Triple.h"
 #include "llvm/Passes/PassBuilder.h"
 #include "llvm/Transforms/InstCombine/InstCombine.h"
 #include "llvm/Transforms/Scalar/ADCE.h"
@@ -35,6 +36,21 @@ using namespace llvm;
 using namespace jit_compiler;
 using namespace jit_compiler::fusion;
 
+static unsigned getFlatAddressSpace(Module &Mod) {
+  // Ideally, we could get this information from the TargetTransformInfo, but
+  // the SPIR-V backend does not yet seem to have an implementation for that.
+  llvm::Triple Tri(Mod.getTargetTriple());
+  if (Tri.isNVPTX()) {
+    return 0;
+  }
+  if (Tri.isSPIRV() || Tri.isSPIR()) {
+    return 4;
+  }
+  // Identical to the definition of "UninitializedAddressSpace" in
+  // "InferAddressSpaces.cpp".
+  return std::numeric_limits<unsigned>::max();
+}
+
 std::unique_ptr<SYCLModuleInfo>
 FusionPipeline::runFusionPasses(Module &Mod, SYCLModuleInfo &InputInfo,
                                 int BarriersFlags) {
@@ -86,9 +102,8 @@ FusionPipeline::runFusionPasses(Module &Mod, SYCLModuleInfo &InputInfo,
     // Run the InferAddressSpace pass to remove as many address-space casts
     // to/from generic address-space as possible, because these hinder
     // internalization.
-    // FIXME: TTI should tell the pass which address space to use.
     // Ideally, the static compiler should have performed that job.
-    constexpr unsigned FlatAddressSpace = 4;
+    unsigned FlatAddressSpace = getFlatAddressSpace(Mod);
     FPM.addPass(InferAddressSpacesPass(FlatAddressSpace));
     MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
   }
diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
index 7e1318c70a4a9..3db02dbfd717c 100644
--- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
+++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
@@ -217,8 +217,5 @@ KernelTranslator::translateToPTX(llvm::Module &Mod, JITContext &JITCtx) {
     ASMStream.flush();
   }
 
-  llvm::dbgs() << "PTX size: " << PTXASM.size() << "\n";
-  llvm::dbgs() << "PTX:\n" << PTXASM << "\n";
-
   return &JITCtx.emplaceSPIRVBinary(PTXASM, BinaryFormat::PTX);
 }
diff --git a/sycl-fusion/passes/CMakeLists.txt b/sycl-fusion/passes/CMakeLists.txt
index 41398f6ee9e99..c74c3c40e34e0 100644
--- a/sycl-fusion/passes/CMakeLists.txt
+++ b/sycl-fusion/passes/CMakeLists.txt
@@ -8,6 +8,7 @@ add_llvm_library(SYCLKernelFusion MODULE
   syclcp/SYCLCP.cpp
   cleanup/Cleanup.cpp
   debug/PassDebug.cpp
+  target/TargetFusionInfo.cpp
 
   DEPENDS
   intrinsics_gen
diff --git a/sycl-fusion/passes/cleanup/Cleanup.cpp b/sycl-fusion/passes/cleanup/Cleanup.cpp
index 07fc86d9d0dd3..a5bc3e634c527 100644
--- a/sycl-fusion/passes/cleanup/Cleanup.cpp
+++ b/sycl-fusion/passes/cleanup/Cleanup.cpp
@@ -45,7 +45,8 @@ static void copyAttributesFrom(const BitVector &Mask, Function *NF,
                                        PAL.getRetAttrs(), Attributes));
 }
 
-static Function *createMaskedFunction(const BitVector &Mask, Function *F) {
+static Function *createMaskedFunction(const BitVector &Mask, Function *F,
+                                      TargetFusionInfo &TFI) {
   // Declare
   FunctionType *NFTy = createMaskedFunctionType(Mask, F->getFunctionType());
   Function *NF = Function::Create(NFTy, F->getLinkage(), F->getAddressSpace(),
@@ -78,7 +79,9 @@ static Function *createMaskedFunction(const BitVector &Mask, Function *F) {
   }
 
   // Erase old function
+  TFI.notifyFunctionsDelete(F);
   F->eraseFromParent();
+  TFI.addKernelFunction(NF);
   return NF;
 }
 
@@ -104,9 +107,9 @@ static void updateArgUsageMask(jit_compiler::SYCLKernelInfo *Info,
 
 static void applyArgMask(const jit_compiler::ArgUsageMask &NewArgInfo,
                          const BitVector &Mask, Function *F,
-                         ModuleAnalysisManager &AM) {
+                         ModuleAnalysisManager &AM, TargetFusionInfo &TFI) {
   // Create the function without the masked-out args.
-  Function *NF = createMaskedFunction(Mask, F);
+  Function *NF = createMaskedFunction(Mask, F, TFI);
   // Update the unused args mask.
   jit_compiler::SYCLModuleInfo *ModuleInfo =
       AM.getResult<SYCLModuleInfoAnalysis>(*NF->getParent()).ModuleInfo;
@@ -125,9 +128,7 @@ static void maskMD(const BitVector &Mask, Function *F) {
   SmallVector<std::pair<unsigned, MDNode *>> MD;
   F->getAllMetadata(MD);
   for (const auto &Entry : MD) {
-    auto MDKind = Entry.first;
-    if (MDKind == F->getContext().getMDKindID("reqd_work_group_size") ||
-        MDKind == F->getContext().getMDKindID("work_group_size_hint")) {
+    if (Entry.second->getNumOperands() != Mask.size()) {
       // Some metadata, e.g., the metadata for reqd_work_group_size and
       // work_group_size_hint is independent from the number of arguments
       // and must not be filtered by the argument usage mask.
@@ -144,7 +145,7 @@ static void maskMD(const BitVector &Mask, Function *F) {
 
 void llvm::fullCleanup(const jit_compiler::ArgUsageMask &ArgUsageInfo,
                        Function *F, ModuleAnalysisManager &AM,
-                       ArrayRef<StringRef> MDToErase) {
+                       TargetFusionInfo &TFI, ArrayRef<StringRef> MDToErase) {
   // Erase metadata.
   for (auto Key : MDToErase) {
     F->setMetadata(Key, nullptr);
@@ -158,5 +159,5 @@ void llvm::fullCleanup(const jit_compiler::ArgUsageMask &ArgUsageInfo,
   // Update metadata.
   maskMD(CleanupMask, F);
   // Remove arguments.
-  applyArgMask(ArgUsageInfo, CleanupMask, F, AM);
+  applyArgMask(ArgUsageInfo, CleanupMask, F, AM, TFI);
 }
diff --git a/sycl-fusion/passes/cleanup/Cleanup.h b/sycl-fusion/passes/cleanup/Cleanup.h
index 49619e4b9af07..491d96f46a886 100644
--- a/sycl-fusion/passes/cleanup/Cleanup.h
+++ b/sycl-fusion/passes/cleanup/Cleanup.h
@@ -10,6 +10,7 @@
 #define SYCL_FUSION_PASSES_CLEANUP_H
 
 #include "Kernel.h"
+#include "target/TargetFusionInfo.h"
 #include <llvm/ADT/ArrayRef.h>
 #include <llvm/ADT/BitVector.h>
 #include <llvm/ADT/StringRef.h>
@@ -25,7 +26,8 @@ namespace llvm {
 /// @param[in] AM Module analysis manager.
 /// @param[in] EraseMD Keys of metadata to remove.
 void fullCleanup(const jit_compiler::ArgUsageMask &ArgUsageInfo, Function *F,
-                 ModuleAnalysisManager &AM, ArrayRef<StringRef> EraseMD);
+                 ModuleAnalysisManager &AM, TargetFusionInfo &TFI,
+                 ArrayRef<StringRef> EraseMD);
 } // namespace llvm
 
 #endif // SYCL_FUSION_PASSES_CLEANUP_H
diff --git a/sycl-fusion/passes/internalization/Internalization.cpp b/sycl-fusion/passes/internalization/Internalization.cpp
index 61f3a0738921b..bca46ba43acb3 100644
--- a/sycl-fusion/passes/internalization/Internalization.cpp
+++ b/sycl-fusion/passes/internalization/Internalization.cpp
@@ -19,16 +19,12 @@
 #include "cleanup/Cleanup.h"
 #include "debug/PassDebug.h"
 #include "metadata/MDParsing.h"
+#include "target/TargetFusionInfo.h"
 
 #define DEBUG_TYPE "sycl-fusion"
 
 using namespace llvm;
 
-// Corresponds to definition of spir_private and spir_local in
-// "clang/lib/Basic/Target/SPIR.h", "SPIRDefIsGenMap".
-constexpr static unsigned PrivateAS{0};
-constexpr static unsigned LocalAS{3};
-
 constexpr static StringLiteral PrivatePromotion{"private"};
 constexpr static StringLiteral LocalPromotion{"local"};
 constexpr static StringLiteral NoPromotion{"none"};
@@ -44,6 +40,8 @@ struct SYCLInternalizerImpl {
   StringRef Kind;
   /// Whether or not to create allocas.
   bool CreateAllocas;
+  /// Interface to target-specific information.
+  TargetFusionInfo TargetInfo;
 
   /// Implements internalization the pass run.
   PreservedAnalyses operator()(Module &M, ModuleAnalysisManager &AM) const;
@@ -338,11 +336,14 @@ Error SYCLInternalizerImpl::checkArgsPromotable(
 
 ///
 /// Function to perform the required cleaning actions.
-static void cleanup(Function *OldF, Function *NewF, bool KeepOriginal) {
+static void cleanup(Function *OldF, Function *NewF, bool KeepOriginal,
+                    const TargetFusionInfo &TFI) {
   if (!KeepOriginal) {
     NewF->takeName(OldF);
+    TFI.notifyFunctionsDelete(OldF);
     OldF->eraseFromParent();
   }
+  TFI.addKernelFunction(NewF);
 }
 
 void SYCLInternalizerImpl::promoteCall(CallBase *C, const Value *Val,
@@ -499,11 +500,6 @@ Value *replaceByNewAlloca(Argument *Arg, unsigned AS, std::size_t LocalSize) {
 Function *SYCLInternalizerImpl::promoteFunctionArgs(
     Function *OldF, ArrayRef<std::size_t> PromoteToLocal, bool CreateAllocas,
     bool KeepOriginal) const {
-  constexpr unsigned AddressSpaceBitWidth{32};
-
-  auto *NewAddrspace = ConstantAsMetadata::get(ConstantInt::get(
-      IntegerType::get(OldF->getContext(), AddressSpaceBitWidth), AS));
-
   // We first declare the promoted function with the new signature.
   Function *NewF =
       getPromotedFunctionDeclaration(OldF, PromoteToLocal, AS,
@@ -542,32 +538,9 @@ Function *SYCLInternalizerImpl::promoteFunctionArgs(
     promoteValue(Arg, LocalSize);
   }
 
-  {
-    constexpr StringLiteral KernelArgAddrSpaceMD{"kernel_arg_addr_space"};
-    if (auto *AddrspaceMD =
-            dyn_cast_or_null<MDNode>(NewF->getMetadata(KernelArgAddrSpaceMD))) {
-      // If we have kernel_arg_addr_space metadata in the original function,
-      // we should update it in the new one.
-      SmallVector<Metadata *> NewInfo{AddrspaceMD->op_begin(),
-                                      AddrspaceMD->op_end()};
-      for (auto I : enumerate(PromoteToLocal)) {
-        if (I.value() == 0) {
-          continue;
-        }
-        const auto Index = I.index();
-        if (const auto *PtrTy =
-                dyn_cast<PointerType>(NewF->getArg(Index)->getType())) {
-          if (PtrTy->getAddressSpace() == LocalAS) {
-            NewInfo[Index] = NewAddrspace;
-          }
-        }
-      }
-      NewF->setMetadata(KernelArgAddrSpaceMD,
-                        MDNode::get(NewF->getContext(), NewInfo));
-    }
-  }
+  TargetInfo.updateAddressSpaceMetadata(NewF, PromoteToLocal, AS);
 
-  cleanup(OldF, NewF, KeepOriginal);
+  cleanup(OldF, NewF, KeepOriginal, TargetInfo);
 
   return NewF;
 }
@@ -625,7 +598,8 @@ SYCLInternalizerImpl::operator()(Module &M, ModuleAnalysisManager &AM) const {
   return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
 }
 
-static void moduleCleanup(Module &M, ModuleAnalysisManager &AM) {
+static void moduleCleanup(Module &M, ModuleAnalysisManager &AM,
+                          TargetFusionInfo &TFI) {
   SmallVector<Function *> ToProcess;
   for (auto &F : M) {
     if (F.hasMetadata(SYCLInternalizer::Key)) {
@@ -650,24 +624,25 @@ static void moduleCleanup(Module &M, ModuleAnalysisManager &AM) {
         NewArgInfo.push_back(jit_compiler::ArgUsage::Used);
       }
     }
-    fullCleanup(NewArgInfo, F, AM,
+    fullCleanup(NewArgInfo, F, AM, TFI,
                 {SYCLInternalizer::Key, SYCLInternalizer::LocalSizeKey});
   }
 }
 
 PreservedAnalyses llvm::SYCLInternalizer::run(Module &M,
                                               ModuleAnalysisManager &AM) {
+  auto TFI = TargetFusionInfo::getTargetFusionInfo(&M);
   // Private promotion
-  const PreservedAnalyses Tmp =
-      SYCLInternalizerImpl{PrivateAS, PrivatePromotion, true}(M, AM);
+  const PreservedAnalyses Tmp = SYCLInternalizerImpl{
+      TFI.getPrivateAddressSpace(), PrivatePromotion, true, TFI}(M, AM);
   // Local promotion
-  PreservedAnalyses Res =
-      SYCLInternalizerImpl{LocalAS, LocalPromotion, false}(M, AM);
+  PreservedAnalyses Res = SYCLInternalizerImpl{
+      TFI.getLocalAddressSpace(), LocalPromotion, false, TFI}(M, AM);
 
   Res.intersect(Tmp);
 
   if (!Res.areAllPreserved()) {
-    moduleCleanup(M, AM);
+    moduleCleanup(M, AM, TFI);
   }
   return Res;
 }
diff --git a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp
index d3dec7f16a81f..a0079affbaa06 100644
--- a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp
+++ b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp
@@ -423,8 +423,7 @@ Error SYCLKernelFusion::fuseKernel(
       ++ParamIndex;
     }
     // Add the metadata corresponding to the used arguments to the different
-    // lists. NOTE: We do not collect the "kernel_arg_name" metadata, because
-    // the kernel arguments receive new names in the fused kernel.
+    // lists.
     MDCollection.collectFromFunction(FF, UsedArgsMask);
 
     // Update the fused kernel's KernelInfo with information from this input
@@ -484,16 +483,12 @@ Error SYCLKernelFusion::fuseKernel(
   }
 
   // Attach names to the arguments. The name includes a prefix for the kernel
-  // from which this argument came. The names are also attached as metadata
-  // with kind "kernel_arg_name".
-  // NOTE: While the kernel_arg_name metadata is required, naming the
-  // parameters themselves is not necessary for functionality, it just improves
-  // readibility for debugging purposes.
-  SmallVector<Metadata *, 16> KernelArgNames;
+  // from which this argument came. Naming the parameters themselves is not
+  // necessary for functionality, it just improves readibility for debugging
+  // purposes.
   for (const auto &AI : llvm::enumerate(FusedFunction->args())) {
     auto &ArgName = FusedArgNames[AI.index()];
     AI.value().setName(ArgName);
-    KernelArgNames.push_back(MDString::get(LLVMCtx, ArgName));
   }
   // Attach the fused metadata collected from the different input
   // kernels to the fused function.
diff --git a/sycl-fusion/passes/syclcp/SYCLCP.cpp b/sycl-fusion/passes/syclcp/SYCLCP.cpp
index b520620c232d1..5cb9b00433ef7 100644
--- a/sycl-fusion/passes/syclcp/SYCLCP.cpp
+++ b/sycl-fusion/passes/syclcp/SYCLCP.cpp
@@ -41,7 +41,7 @@ static Expected<SmallVector<ConstantInfo>> getCPFromMD(Function *F) {
   MDNode *MD = F->getMetadata(SYCLCP::Key);
   if (!MD) {
     return createStringError(inconvertibleErrorCode(),
-                             "Private promotion metadata not available");
+                             "Constant progagation metadata not available");
   }
   for (auto I : enumerate(MD->operands())) {
     Expected<SmallVector<unsigned char>> Val =
@@ -205,7 +205,8 @@ static bool propagateConstants(Function *F, ArrayRef<ConstantInfo> Constants) {
   return Changed;
 }
 
-static void moduleCleanup(Module &M, ModuleAnalysisManager &AM) {
+static void moduleCleanup(Module &M, ModuleAnalysisManager &AM,
+                          TargetFusionInfo &TFI) {
   SmallVector<Function *> ToProcess;
   for (auto &F : M) {
     if (F.hasMetadata(SYCLCP::Key)) {
@@ -219,14 +220,13 @@ static void moduleCleanup(Module &M, ModuleAnalysisManager &AM) {
       if (const auto *MDS = dyn_cast<MDString>(I.value().get())) {
         // A value is masked-out if it has a non-empty MDString
         if (MDS->getLength() > 0) {
-          // And is either an integer or a FP number.
           NewArgInfo.push_back(jit_compiler::ArgUsage::Unused);
           continue;
         }
       }
       NewArgInfo.push_back(jit_compiler::ArgUsage::Used);
     }
-    fullCleanup(NewArgInfo, F, AM, {SYCLCP::Key});
+    fullCleanup(NewArgInfo, F, AM, TFI, {SYCLCP::Key});
   }
 }
 
@@ -249,8 +249,10 @@ PreservedAnalyses SYCLCP::run(Module &M, ModuleAnalysisManager &AM) {
     Changed = propagateConstants(F, *ConstantsOrErr) || Changed;
   }
 
+  auto TFI = TargetFusionInfo::getTargetFusionInfo(&M);
+
   if (Changed) {
-    moduleCleanup(M, AM);
+    moduleCleanup(M, AM, TFI);
   }
 
   return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
diff --git a/sycl-fusion/passes/target/TargetFusionInfo.cpp b/sycl-fusion/passes/target/TargetFusionInfo.cpp
index 191ab676613bb..c17e2464ac323 100644
--- a/sycl-fusion/passes/target/TargetFusionInfo.cpp
+++ b/sycl-fusion/passes/target/TargetFusionInfo.cpp
@@ -25,7 +25,7 @@ TargetFusionInfo TargetFusionInfo::getTargetFusionInfo(llvm::Module *Mod) {
     return TargetFusionInfo(
         std::shared_ptr<NVPTXTargetFusionInfo>(new NVPTXTargetFusionInfo(Mod)));
   }
-  if (Tri.isSPIRV()) {
+  if (Tri.isSPIRV() || Tri.isSPIR()) {
     return TargetFusionInfo(
         std::shared_ptr<SPIRVTargetFusionInfo>(new SPIRVTargetFusionInfo(Mod)));
   }
@@ -36,18 +36,28 @@ TargetFusionInfo TargetFusionInfo::getTargetFusionInfo(llvm::Module *Mod) {
 // SPIRVTargetFusionInfo
 //
 
-void SPIRVTargetFusionInfo::addKernelFunction(Function *KernelFunc) {
+void SPIRVTargetFusionInfo::addKernelFunction(Function *KernelFunc) const {
   KernelFunc->setCallingConv(CallingConv::SPIR_KERNEL);
 }
 
-ArrayRef<StringRef> SPIRVTargetFusionInfo::getKernelMetadataKeys() {
+ArrayRef<StringRef> SPIRVTargetFusionInfo::getKernelMetadataKeys() const {
+  // NOTE: We do not collect the "kernel_arg_name" metadata, because
+  // the kernel arguments receive new names in the fused kernel.
   static SmallVector<StringRef> Keys{
       {"kernel_arg_addr_space", "kernel_arg_access_qual", "kernel_arg_type",
        "kernel_arg_base_type", "kernel_arg_type_qual"}};
   return Keys;
 }
 
-void SPIRVTargetFusionInfo::postProcessKernel(Function *KernelFunc) {
+void SPIRVTargetFusionInfo::postProcessKernel(Function *KernelFunc) const {
+  // Attach the kernel_arg_name metadata.
+  SmallVector<Metadata *, 16> KernelArgNames;
+  for (auto &P : KernelFunc->args()) {
+    KernelArgNames.push_back(MDString::get(LLVMMod->getContext(), P.getName()));
+  }
+  auto *ArgNameMD = MDTuple::get(LLVMMod->getContext(), KernelArgNames);
+  KernelFunc->setMetadata("kernel_arg_name", ArgNameMD);
+
   static constexpr auto ITTStartWrapper = "__itt_offload_wi_start_wrapper";
   static constexpr auto ITTFinishWrapper = "__itt_offload_wi_finish_wrapper";
   // Remove all existing calls of the ITT instrumentation functions. Insert new
@@ -93,7 +103,7 @@ void SPIRVTargetFusionInfo::postProcessKernel(Function *KernelFunc) {
 }
 
 void SPIRVTargetFusionInfo::createBarrierCall(IRBuilderBase &Builder,
-                                              int BarrierFlags) {
+                                              int BarrierFlags) const {
   if (BarrierFlags == -1) {
     return;
   }
@@ -137,12 +147,44 @@ void SPIRVTargetFusionInfo::createBarrierCall(IRBuilderBase &Builder,
   BarrierCallInst->setCallingConv(CallingConv::SPIR_FUNC);
 }
 
+void SPIRVTargetFusionInfo::updateAddressSpaceMetadata(
+    Function *KernelFunc, ArrayRef<size_t> LocalSize,
+    unsigned AddressSpace) const {
+  static constexpr unsigned AddressSpaceBitWidth{32};
+  static constexpr StringLiteral KernelArgAddrSpaceMD{"kernel_arg_addr_space"};
+
+  auto *NewAddrspace = ConstantAsMetadata::get(ConstantInt::get(
+      IntegerType::get(LLVMMod->getContext(), AddressSpaceBitWidth),
+      AddressSpace));
+  if (auto *AddrspaceMD = dyn_cast_or_null<MDNode>(
+          KernelFunc->getMetadata(KernelArgAddrSpaceMD))) {
+    // If we have kernel_arg_addr_space metadata in the original function,
+    // we should update it in the new one.
+    SmallVector<Metadata *> NewInfo{AddrspaceMD->op_begin(),
+                                    AddrspaceMD->op_end()};
+    for (auto I : enumerate(LocalSize)) {
+      if (I.value() == 0) {
+        continue;
+      }
+      const auto Index = I.index();
+      if (const auto *PtrTy =
+              dyn_cast<PointerType>(KernelFunc->getArg(Index)->getType())) {
+        if (PtrTy->getAddressSpace() == getLocalAddressSpace()) {
+          NewInfo[Index] = NewAddrspace;
+        }
+      }
+    }
+    KernelFunc->setMetadata(KernelArgAddrSpaceMD,
+                            MDNode::get(KernelFunc->getContext(), NewInfo));
+  }
+}
+
 //
 // NVPTXTargetFusionInfo
 //
 
 void NVPTXTargetFusionInfo::notifyFunctionsDelete(
-    llvm::ArrayRef<Function *> Funcs) {
+    llvm::ArrayRef<Function *> Funcs) const {
   SmallPtrSet<Constant *, 8> DeletedFuncs{Funcs.begin(), Funcs.end()};
   SmallVector<MDNode *> ValidKernels;
   auto *OldAnnotations = LLVMMod->getNamedMetadata("nvvm.annotations");
@@ -165,7 +207,7 @@ void NVPTXTargetFusionInfo::notifyFunctionsDelete(
   }
 }
 
-void NVPTXTargetFusionInfo::addKernelFunction(Function *KernelFunc) {
+void NVPTXTargetFusionInfo::addKernelFunction(Function *KernelFunc) const {
   auto *NVVMAnnotations = LLVMMod->getOrInsertNamedMetadata("nvvm.annotations");
   auto *MDOne = ConstantAsMetadata::get(
       ConstantInt::get(Type::getInt32Ty(LLVMMod->getContext()), 1));
@@ -176,7 +218,7 @@ void NVPTXTargetFusionInfo::addKernelFunction(Function *KernelFunc) {
   NVVMAnnotations->addOperand(Tuple);
 }
 
-ArrayRef<StringRef> NVPTXTargetFusionInfo::getKernelMetadataKeys() {
+ArrayRef<StringRef> NVPTXTargetFusionInfo::getKernelMetadataKeys() const {
   // FIXME: Check whether we need to take care of sycl_fixed_targets.
   static SmallVector<StringRef> Keys{{"kernel_arg_buffer_location",
                                       "kernel_arg_runtime_aligned",
@@ -185,7 +227,7 @@ ArrayRef<StringRef> NVPTXTargetFusionInfo::getKernelMetadataKeys() {
 }
 
 void NVPTXTargetFusionInfo::createBarrierCall(IRBuilderBase &Builder,
-                                              int BarrierFlags) {
+                                              int BarrierFlags) const {
   if (BarrierFlags == -1) {
     return;
   }
diff --git a/sycl-fusion/passes/target/TargetFusionInfo.h b/sycl-fusion/passes/target/TargetFusionInfo.h
index 653af4904e2ae..c307de2dd764e 100644
--- a/sycl-fusion/passes/target/TargetFusionInfo.h
+++ b/sycl-fusion/passes/target/TargetFusionInfo.h
@@ -6,6 +6,9 @@
 //
 //===----------------------------------------------------------------------===//
 
+#ifndef SYCL_FUSION_PASSES_TARGETFUSIONINFO_H
+#define SYCL_FUSION_PASSES_TARGETFUSIONINFO_H
+
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Module.h"
@@ -17,17 +20,33 @@ class TargetFusionInfoImpl {
 public:
   virtual ~TargetFusionInfoImpl() = default;
 
-  virtual void notifyFunctionsDelete(llvm::ArrayRef<Function *> Funcs) {
+  virtual void notifyFunctionsDelete(llvm::ArrayRef<Function *> Funcs) const {
     (void)Funcs;
   }
 
-  virtual void addKernelFunction(Function *KernelFunc) { (void)KernelFunc; }
+  virtual void addKernelFunction(Function *KernelFunc) const {
+    (void)KernelFunc;
+  }
+
+  virtual void postProcessKernel(Function *KernelFunc) const {
+    (void)KernelFunc;
+  }
+
+  virtual ArrayRef<StringRef> getKernelMetadataKeys() const { return {}; }
+
+  virtual void createBarrierCall(IRBuilderBase &Builder,
+                                 int BarrierFlags) const = 0;
 
-  virtual void postProcessKernel(Function *KernelFunc) { (void)KernelFunc; }
+  virtual unsigned getPrivateAddressSpace() const = 0;
 
-  virtual ArrayRef<StringRef> getKernelMetadataKeys() { return {}; }
+  virtual unsigned getLocalAddressSpace() const = 0;
 
-  virtual void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) = 0;
+  virtual void updateAddressSpaceMetadata(Function *KernelFunc,
+                                          ArrayRef<size_t> LocalSize,
+                                          unsigned AddressSpace) const {
+    (void)KernelFunc;
+    (void)LocalSize;
+  }
 
 protected:
   explicit TargetFusionInfoImpl(llvm::Module *Mod) : LLVMMod{Mod} {};
@@ -39,13 +58,23 @@ class TargetFusionInfoImpl {
 
 class SPIRVTargetFusionInfo : public TargetFusionInfoImpl {
 public:
-  void addKernelFunction(Function *KernelFunc) override;
+  void addKernelFunction(Function *KernelFunc) const override;
+
+  ArrayRef<StringRef> getKernelMetadataKeys() const override;
 
-  ArrayRef<StringRef> getKernelMetadataKeys() override;
+  void postProcessKernel(Function *KernelFunc) const override;
 
-  void postProcessKernel(Function *KernelFunc) override;
+  void createBarrierCall(IRBuilderBase &Builder,
+                         int BarrierFlags) const override;
 
-  void createBarrierCall(IRBuilderBase& Builder, int BarrierFlags) override;
+  // Corresponds to definition of spir_private and spir_local in
+  // "clang/lib/Basic/Target/SPIR.h", "SPIRDefIsGenMap".
+  unsigned getPrivateAddressSpace() const override { return 0; }
+  unsigned getLocalAddressSpace() const override { return 3; }
+
+  void updateAddressSpaceMetadata(Function *KernelFunc,
+                                  ArrayRef<size_t> LocalSize,
+                                  unsigned AddressSpace) const override;
 
 private:
   using TargetFusionInfoImpl::TargetFusionInfoImpl;
@@ -53,13 +82,19 @@ class SPIRVTargetFusionInfo : public TargetFusionInfoImpl {
 
 class NVPTXTargetFusionInfo : public TargetFusionInfoImpl {
 public:
-  void notifyFunctionsDelete(llvm::ArrayRef<Function *> Funcs) override;
+  void notifyFunctionsDelete(llvm::ArrayRef<Function *> Funcs) const override;
+
+  void addKernelFunction(Function *KernelFunc) const override;
 
-  void addKernelFunction(Function *KernelFunc) override;
+  ArrayRef<StringRef> getKernelMetadataKeys() const override;
 
-  ArrayRef<StringRef> getKernelMetadataKeys() override;
+  void createBarrierCall(IRBuilderBase &Builder,
+                         int BarrierFlags) const override;
 
-  void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) override;
+  // Corresponds to the definitions in the LLVM NVPTX backend user guide:
+  // https://llvm.org/docs/NVPTXUsage.html#address-spaces
+  unsigned getPrivateAddressSpace() const override { return 0; }
+  unsigned getLocalAddressSpace() const override { return 3; }
 
 private:
   using TargetFusionInfoImpl::TargetFusionInfoImpl;
@@ -79,7 +114,7 @@ class TargetFusionInfo {
   /// Notify the target-specific implementation that set of functions \p Funcs
   /// is about to be erased from the module. This should be called BEFORE
   /// erasing the functions.
-  void notifyFunctionsDelete(llvm::ArrayRef<Function *> Funcs) {
+  void notifyFunctionsDelete(llvm::ArrayRef<Function *> Funcs) const {
     Impl->notifyFunctionsDelete(Funcs);
   }
 
@@ -87,28 +122,40 @@ class TargetFusionInfo {
   /// Notify the target-specific implementation that the function \p KernelFunc
   /// was added as a new kernel. This should be called AFTER the function has
   /// been added.
-  void addKernelFunction(llvm::Function *KernelFunc) {
+  void addKernelFunction(llvm::Function *KernelFunc) const {
     Impl->addKernelFunction(KernelFunc);
   }
 
   ///
   /// Target-specific post-processing of the new kernel function \p KernelFunc.
   /// This should be called AFTER the function has been added and defined.
-  void postProcessKernel(Function *KernelFunc) {
+  void postProcessKernel(Function *KernelFunc) const {
     Impl->postProcessKernel(KernelFunc);
   }
 
   ///
   /// Get the target-specific list of argument metadata attached to each
   /// function that should be collected and attached to the fused kernel.
-  llvm::ArrayRef<llvm::StringRef> getKernelMetadataKeys() {
+  llvm::ArrayRef<llvm::StringRef> getKernelMetadataKeys() const {
     return Impl->getKernelMetadataKeys();
   }
 
-  void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) {
+  void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) const {
     Impl->createBarrierCall(Builder, BarrierFlags);
   }
 
+  unsigned getPrivateAddressSpace() const {
+    return Impl->getPrivateAddressSpace();
+  }
+
+  unsigned getLocalAddressSpace() const { return Impl->getLocalAddressSpace(); }
+
+  void updateAddressSpaceMetadata(Function *KernelFunc,
+                                  ArrayRef<size_t> LocalSize,
+                                  unsigned AddressSpace) const {
+    Impl->updateAddressSpaceMetadata(KernelFunc, LocalSize, AddressSpace);
+  }
+
 private:
   using ImplPtr = std::shared_ptr<TargetFusionInfoImpl>;
 
@@ -135,3 +182,5 @@ class MetadataCollection {
   llvm::StringMap<llvm::SmallVector<llvm::Metadata *>> Collection;
 };
 } // namespace llvm
+
+#endif // SYCL_FUSION_PASSES_TARGETFUSIONINFO_H

From da50ca721d796d5e94a4b2782bafdd5d3c0e944a Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Tue, 14 Feb 2023 13:20:48 +0000
Subject: [PATCH 10/16] [SYCL][Fusion] Handle attributes for CUDA fusion

Handle reqd_work_group_size and work_group_size_hint attributes.

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 sycl-fusion/jit-compiler/include/JITContext.h |  3 +-
 .../lib/translation/KernelTranslation.cpp     | 42 +++++++++++++++++
 .../lib/translation/KernelTranslation.h       |  2 +-
 .../lib/translation/SPIRVLLVMTranslation.cpp  | 37 ---------------
 .../lib/translation/SPIRVLLVMTranslation.h    | 18 +-------
 sycl/source/detail/jit_compiler.cpp           | 46 +++++++++++++++++--
 sycl/source/detail/jit_compiler.hpp           |  7 ++-
 sycl/source/detail/jit_device_binaries.cpp    |  6 ++-
 sycl/source/detail/jit_device_binaries.hpp    |  5 +-
 9 files changed, 102 insertions(+), 64 deletions(-)

diff --git a/sycl-fusion/jit-compiler/include/JITContext.h b/sycl-fusion/jit-compiler/include/JITContext.h
index d4654f19820ce..c26312ceb70e6 100644
--- a/sycl-fusion/jit-compiler/include/JITContext.h
+++ b/sycl-fusion/jit-compiler/include/JITContext.h
@@ -65,8 +65,7 @@ class JITContext {
 
   llvm::LLVMContext *getLLVMContext();
 
-  KernelBinary &emplaceSPIRVBinary(std::string Binary,
-                                   BinaryFormat Format);
+  KernelBinary &emplaceSPIRVBinary(std::string Binary, BinaryFormat Format);
 
   std::optional<SYCLKernelInfo> getCacheEntry(CacheKeyT &Identifier) const;
 
diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
index 3db02dbfd717c..1436f51f40aaa 100644
--- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
+++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
@@ -9,6 +9,7 @@
 #include "KernelTranslation.h"
 #include "SPIRVLLVMTranslation.h"
 #include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/Linker/Linker.h"
 #include "llvm/MC/TargetRegistry.h"
@@ -21,6 +22,43 @@ using namespace jit_compiler;
 using namespace jit_compiler::translation;
 using namespace llvm;
 
+///
+/// Get an attribute value consisting of NumValues scalar constant integers
+/// from the MDNode.
+static void getAttributeValues(std::vector<std::string> &Values, MDNode *MD) {
+  for (const auto &MDOp : MD->operands()) {
+    auto *ConstantMD = cast<ConstantAsMetadata>(MDOp);
+    auto *ConstInt = cast<ConstantInt>(ConstantMD->getValue());
+    Values.push_back(std::to_string(ConstInt->getZExtValue()));
+  }
+}
+
+// NOLINTNEXTLINE(readability-identifier-naming)
+static const char *REQD_WORK_GROUP_SIZE_ATTR = "reqd_work_group_size";
+// NOLINTNEXTLINE(readability-identifier-naming)
+static const char *WORK_GROUP_SIZE_HINT_ATTR = "work_group_size_hint";
+
+///
+/// Restore kernel attributes for the kernel in Info from the metadata
+/// attached to its kernel function in the LLVM module Mod.
+/// Currently supported attributes:
+///   - reqd_work_group_size
+///   - work_group_size_hint
+static void restoreKernelAttributes(Module *Mod, SYCLKernelInfo &Info) {
+  auto *KernelFunction = Mod->getFunction(Info.Name);
+  assert(KernelFunction && "Kernel function not present in module");
+  if (auto *MD = KernelFunction->getMetadata(REQD_WORK_GROUP_SIZE_ATTR)) {
+    SYCLKernelAttribute ReqdAttr{REQD_WORK_GROUP_SIZE_ATTR};
+    getAttributeValues(ReqdAttr.Values, MD);
+    Info.Attributes.push_back(ReqdAttr);
+  }
+  if (auto *MD = KernelFunction->getMetadata(WORK_GROUP_SIZE_HINT_ATTR)) {
+    SYCLKernelAttribute HintAttr{WORK_GROUP_SIZE_HINT_ATTR};
+    getAttributeValues(HintAttr.Values, MD);
+    Info.Attributes.push_back(HintAttr);
+  }
+}
+
 llvm::Expected<std::unique_ptr<llvm::Module>>
 KernelTranslator::loadKernels(llvm::LLVMContext &LLVMCtx,
                               std::vector<SYCLKernelInfo> &Kernels) {
@@ -100,6 +138,10 @@ KernelTranslator::loadKernels(llvm::LLVMContext &LLVMCtx,
             "Number of address bits between SPIR-V modules does not match");
       }
     }
+    // Restore SYCL/OpenCL kernel attributes such as 'reqd_work_group_size' or
+    // 'work_group_size_hint' from metadata attached to the kernel function and
+    // store it in the SYCLKernelInfo.
+    restoreKernelAttributes(Result.get(), Kernel);
   }
   return std::move(Result);
 }
diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h
index 3d6824edbdd37..276b6e536d269 100644
--- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h
+++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h
@@ -23,7 +23,7 @@ class KernelTranslator {
   loadKernels(llvm::LLVMContext &LLVMCtx, std::vector<SYCLKernelInfo> &Kernels);
 
   static llvm::Error translateKernel(SYCLKernelInfo &Kernel, llvm::Module &Mod,
-                              JITContext &JITCtx, BinaryFormat Format);
+                                     JITContext &JITCtx, BinaryFormat Format);
 
 private:
   ///
diff --git a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp
index 1046fd6dc2907..a5effb33a85e1 100644
--- a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp
+++ b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp
@@ -11,9 +11,7 @@
 #include "Kernel.h"
 #include "LLVMSPIRVLib.h"
 #include "helper/ErrorHandling.h"
-#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/IR/Constants.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/raw_ostream.h"
@@ -24,37 +22,6 @@ using namespace jit_compiler;
 using namespace jit_compiler::translation;
 using namespace llvm;
 
-void SPIRVLLVMTranslator::getAttributeValues(std::vector<std::string> &Values,
-                                             MDNode *MD, size_t NumValues) {
-  assert(MD->getNumOperands() == NumValues && "Incorrect number of values");
-  for (const auto &MDOp : MD->operands()) {
-    auto *ConstantMD = cast<ConstantAsMetadata>(MDOp);
-    auto *ConstInt = cast<ConstantInt>(ConstantMD->getValue());
-    Values.push_back(std::to_string(ConstInt->getZExtValue()));
-  }
-}
-
-// NOLINTNEXTLINE(readability-identifier-naming)
-static const char *REQD_WORK_GROUP_SIZE_ATTR = "reqd_work_group_size";
-// NOLINTNEXTLINE(readability-identifier-naming)
-static const char *WORK_GROUP_SIZE_HINT_ATTR = "work_group_size_hint";
-
-void SPIRVLLVMTranslator::restoreKernelAttributes(Module *Mod,
-                                                  SYCLKernelInfo &Info) {
-  auto *KernelFunction = Mod->getFunction(Info.Name);
-  assert(KernelFunction && "Kernel function not present in module");
-  if (auto *MD = KernelFunction->getMetadata(REQD_WORK_GROUP_SIZE_ATTR)) {
-    SYCLKernelAttribute ReqdAttr{REQD_WORK_GROUP_SIZE_ATTR};
-    getAttributeValues(ReqdAttr.Values, MD, 3);
-    Info.Attributes.push_back(ReqdAttr);
-  }
-  if (auto *MD = KernelFunction->getMetadata(WORK_GROUP_SIZE_HINT_ATTR)) {
-    SYCLKernelAttribute HintAttr{WORK_GROUP_SIZE_HINT_ATTR};
-    getAttributeValues(HintAttr.Values, MD, 3);
-    Info.Attributes.push_back(HintAttr);
-  }
-}
-
 SPIRV::TranslatorOpts &SPIRVLLVMTranslator::translatorOpts() {
   static auto Opts = []() -> SPIRV::TranslatorOpts {
     // Options for translation between SPIR-V and LLVM IR.
@@ -113,10 +80,6 @@ SPIRVLLVMTranslator::loadSPIRVKernel(llvm::LLVMContext &LLVMCtx,
   }
   std::unique_ptr<Module> NewMod{LLVMMod};
 
-  // Restore SYCL/OpenCL kernel attributes such as 'reqd_work_group_size' or
-  // 'work_group_size_hint' from metadata attached to the kernel function and
-  // store it in the SYCLKernelInfo.
-  restoreKernelAttributes(NewMod.get(), Kernel);
   return std::move(NewMod);
 }
 
diff --git a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h
index 440c00103b0d5..c8cdf2bf90ca0 100644
--- a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h
+++ b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h
@@ -30,24 +30,10 @@ class SPIRVLLVMTranslator {
   ///
   /// Translate the LLVM IR module Mod to SPIR-V, store it in the JITContext and
   /// return a pointer to its container.
-  static llvm::Expected<KernelBinary *> translateLLVMtoSPIRV(llvm::Module &Mod,
-                                                            JITContext &JITCtx);
+  static llvm::Expected<KernelBinary *>
+  translateLLVMtoSPIRV(llvm::Module &Mod, JITContext &JITCtx);
 
 private:
-  ///
-  /// Get an attribute value consisting of NumValues scalar constant integers
-  /// from the MDNode.
-  static void getAttributeValues(std::vector<std::string> &Values,
-                                 llvm::MDNode *MD, size_t NumValues);
-
-  ///
-  /// Restore kernel attributes for the kernel in Info from the metadata
-  /// attached to its kernel function in the LLVM module Mod.
-  /// Currently supported attributes:
-  ///   - reqd_work_group_size
-  ///   - work_group_size_hint
-  static void restoreKernelAttributes(llvm::Module *Mod, SYCLKernelInfo &Info);
-
   ///
   /// Default settings for the SPIRV translation options.
   static SPIRV::TranslatorOpts &translatorOpts();
diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
index fe645e43ab28a..8770f58172548 100644
--- a/sycl/source/detail/jit_compiler.cpp
+++ b/sycl/source/detail/jit_compiler.cpp
@@ -921,10 +921,33 @@ pi_device_binaries jit_compiler::createPIDeviceBinary(
 
   Binary.addProperty(std::move(ArgMaskPropSet));
 
+  if (Format == ::jit_compiler::BinaryFormat::PTX) {
+    // Add a program metadata property with the reqd_work_group_size attribute.
+    // See CUDA PI (pi_cuda.cpp) _pi_program::set_metadata for reference.
+    auto ReqdWGS = std::find_if(
+        FusedKernelInfo.Attributes.begin(), FusedKernelInfo.Attributes.end(),
+        [](const ::jit_compiler::SYCLKernelAttribute &Attr) {
+          return Attr.AttributeName == "reqd_work_group_size";
+        });
+    if (ReqdWGS != FusedKernelInfo.Attributes.end()) {
+      auto Encoded = encodeReqdWorkGroupSize(*ReqdWGS);
+      std::stringstream PropName;
+      PropName << FusedKernelInfo.Name;
+      PropName << __SYCL_PI_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE;
+      PropertyContainer ReqdWorkGroupSizeProp{
+          PropName.str(), Encoded.data(), Encoded.size(),
+          pi_property_type::PI_PROPERTY_TYPE_BYTE_ARRAY};
+      PropertySetContainer ProgramMetadata{
+          __SYCL_PI_PROPERTY_SET_PROGRAM_METADATA};
+      ProgramMetadata.addProperty(std::move(ReqdWorkGroupSizeProp));
+      Binary.addProperty(std::move(ProgramMetadata));
+    }
+  }
+
   DeviceBinariesCollection Collection;
-  Collection.addDeviceBinary(std::move(Binary),
-                             FusedKernelInfo.BinaryInfo.BinaryStart,
-                             FusedKernelInfo.BinaryInfo.BinarySize, TargetSpec, BinFormat);
+  Collection.addDeviceBinary(
+      std::move(Binary), FusedKernelInfo.BinaryInfo.BinaryStart,
+      FusedKernelInfo.BinaryInfo.BinarySize, TargetSpec, BinFormat);
 
   JITDeviceBinaries.push_back(std::move(Collection));
   return JITDeviceBinaries.back().getPIDeviceStruct();
@@ -960,6 +983,23 @@ std::vector<uint8_t> jit_compiler::encodeArgUsageMask(
   return Encoded;
 }
 
+std::vector<uint8_t> jit_compiler::encodeReqdWorkGroupSize(
+    const ::jit_compiler::SYCLKernelAttribute &Attr) const {
+  assert(Attr.AttributeName == "reqd_work_group_size");
+  size_t NumBytes = sizeof(uint64_t) + (Attr.Values.size() * sizeof(uint32_t));
+  std::vector<uint8_t> Encoded(NumBytes, 0u);
+  uint8_t *Ptr = Encoded.data();
+  // Skip 64-bit wide size argument with value 0 at the start of the data.
+  // See CUDA PI (pi_cuda.cpp) _pi_program::set_metadata for reference.
+  Ptr += sizeof(uint64_t);
+  for (const auto &Val : Attr.Values) {
+    uint32_t UVal = std::stoul(Val);
+    std::memcpy(Ptr, &UVal, sizeof(uint32_t));
+    Ptr += sizeof(uint32_t);
+  }
+  return Encoded;
+}
+
 } // namespace detail
 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
 } // namespace sycl
diff --git a/sycl/source/detail/jit_compiler.hpp b/sycl/source/detail/jit_compiler.hpp
index e02be562de3ee..4b299572bacbc 100644
--- a/sycl/source/detail/jit_compiler.hpp
+++ b/sycl/source/detail/jit_compiler.hpp
@@ -16,6 +16,7 @@ namespace jit_compiler {
 enum class BinaryFormat;
 class JITContext;
 struct SYCLKernelInfo;
+struct SYCLKernelAttribute;
 using ArgUsageMask = std::vector<unsigned char>;
 } // namespace jit_compiler
 
@@ -47,11 +48,15 @@ class jit_compiler {
   jit_compiler &operator=(const jit_compiler &&) = delete;
 
   pi_device_binaries
-  createPIDeviceBinary(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo, ::jit_compiler::BinaryFormat Format);
+  createPIDeviceBinary(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo,
+                       ::jit_compiler::BinaryFormat Format);
 
   std::vector<uint8_t>
   encodeArgUsageMask(const ::jit_compiler::ArgUsageMask &Mask) const;
 
+  std::vector<uint8_t> encodeReqdWorkGroupSize(
+      const ::jit_compiler::SYCLKernelAttribute &Attr) const;
+
   // Manages the lifetime of the PI structs for device binaries.
   std::vector<DeviceBinariesCollection> JITDeviceBinaries;
 
diff --git a/sycl/source/detail/jit_device_binaries.cpp b/sycl/source/detail/jit_device_binaries.cpp
index 59530a0e691ce..d0cc9e824bc07 100644
--- a/sycl/source/detail/jit_device_binaries.cpp
+++ b/sycl/source/detail/jit_device_binaries.cpp
@@ -81,7 +81,8 @@ void DeviceBinaryContainer::addProperty(PropertySetContainer &&Cont) {
 }
 
 pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary(
-    const unsigned char *BinaryStart, size_t BinarySize, const char* TargetSpec, pi_device_binary_type Format) {
+    const unsigned char *BinaryStart, size_t BinarySize, const char *TargetSpec,
+    pi_device_binary_type Format) {
   pi_device_binary_struct DeviceBinary;
   DeviceBinary.Version = PI_DEVICE_BINARY_VERSION;
   DeviceBinary.Kind = PI_DEVICE_BINARY_OFFLOAD_KIND_SYCL;
@@ -106,7 +107,8 @@ pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary(
 void DeviceBinariesCollection::addDeviceBinary(DeviceBinaryContainer &&Cont,
                                                const unsigned char *BinaryStart,
                                                size_t BinarySize,
-                                               const char* TargetSpec, pi_device_binary_type Format) {
+                                               const char *TargetSpec,
+                                               pi_device_binary_type Format) {
   // Adding to the vectors might trigger reallocation, which would invalidate
   // the pointers used for PI structs if a PI struct has already been created
   // via getPIDeviceStruct(). Forbid calls to this method after the first PI
diff --git a/sycl/source/detail/jit_device_binaries.hpp b/sycl/source/detail/jit_device_binaries.hpp
index 96079d3a25a19..cecab17870650 100644
--- a/sycl/source/detail/jit_device_binaries.hpp
+++ b/sycl/source/detail/jit_device_binaries.hpp
@@ -111,7 +111,8 @@ class DeviceBinaryContainer {
 
   pi_device_binary_struct getPIDeviceBinary(const unsigned char *BinaryStart,
                                             size_t BinarySize,
-                                            const char* TargetSpec, pi_device_binary_type Format);
+                                            const char *TargetSpec,
+                                            pi_device_binary_type Format);
 
 private:
   bool Fused = true;
@@ -138,7 +139,7 @@ class DeviceBinariesCollection {
 
   void addDeviceBinary(DeviceBinaryContainer &&Cont,
                        const unsigned char *BinaryStart, size_t BinarySize,
-                       const char* TargetSpec, pi_device_binary_type Format);
+                       const char *TargetSpec, pi_device_binary_type Format);
   pi_device_binaries getPIDeviceStruct();
 
 private:

From 57e02a664d84a20cf89daff2a2876d89d41e2b26 Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Tue, 14 Feb 2023 13:39:10 +0000
Subject: [PATCH 11/16] [SYCL][Fusion] Cache and groom input binaries

Parse each input binary only once.

Groom the nvvm annotations for functions deleted before fusion.

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 .../jit-compiler/lib/fusion/ModuleHelper.cpp  |   8 ++
 .../lib/translation/KernelTranslation.cpp     | 104 +++++++++---------
 2 files changed, 60 insertions(+), 52 deletions(-)

diff --git a/sycl-fusion/jit-compiler/lib/fusion/ModuleHelper.cpp b/sycl-fusion/jit-compiler/lib/fusion/ModuleHelper.cpp
index c197fedf47e69..0a90600530f74 100644
--- a/sycl-fusion/jit-compiler/lib/fusion/ModuleHelper.cpp
+++ b/sycl-fusion/jit-compiler/lib/fusion/ModuleHelper.cpp
@@ -8,6 +8,7 @@
 
 #include "ModuleHelper.h"
 
+#include "target/TargetFusionInfo.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/IR/Function.h"
 #include "llvm/Transforms/Utils/Cloning.h"
@@ -22,6 +23,13 @@ helper::ModuleHelper::cloneAndPruneModule(Module *Mod,
   SmallPtrSet<Function *, 16> UnusedFunctions;
   identifyUnusedFunctions(Mod, CGRoots, UnusedFunctions);
 
+  {
+    auto TFI = llvm::TargetFusionInfo::getTargetFusionInfo(Mod);
+    SmallVector<Function *> Unused{UnusedFunctions.begin(),
+                                   UnusedFunctions.end()};
+    TFI.notifyFunctionsDelete(Unused);
+  }
+
   // Clone the module, but use an external reference in place of the global
   // definition for unused functions.
   auto FunctionCloneMask = [&](const GlobalValue *GV) -> bool {
diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
index 1436f51f40aaa..b877829782658 100644
--- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
+++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
@@ -75,68 +75,68 @@ KernelTranslator::loadKernels(llvm::LLVMContext &LLVMCtx,
     const unsigned char *ModulePtr = BinInfo.BinaryStart;
     size_t ModuleSize = BinInfo.BinarySize;
     BinaryBlob BinBlob{ModulePtr, ModuleSize};
-    if (ParsedBinaries.contains(BinBlob)) {
+    if (!ParsedBinaries.contains(BinBlob)) {
       // Multiple kernels can be stored in the same SPIR-V or LLVM IR module.
-      // If we encountered the same binary module before, skip.
+      // We only load if we did not encounter the same binary module before.
       // NOTE: We compare the pointer as well as the size, in case
       // a previous kernel only referenced part of the SPIR-V/LLVM IR module.
       // Not sure this can actually happen, but better safe than sorry.
-      continue;
-    }
-    // Simply load and translate the SPIR-V into the currently still empty
-    // module.
-    std::unique_ptr<llvm::Module> NewMod;
+      // Simply load and translate the SPIR-V into the currently still empty
+      // module.
+      std::unique_ptr<llvm::Module> NewMod;
 
-    switch (BinInfo.Format) {
-    case BinaryFormat::LLVM: {
-      auto ModOrError = loadLLVMKernel(LLVMCtx, Kernel);
-      if (auto Err = ModOrError.takeError()) {
-        return std::move(Err);
+      switch (BinInfo.Format) {
+      case BinaryFormat::LLVM: {
+        auto ModOrError = loadLLVMKernel(LLVMCtx, Kernel);
+        if (auto Err = ModOrError.takeError()) {
+          return std::move(Err);
+        }
+        NewMod = std::move(*ModOrError);
+        break;
       }
-      NewMod = std::move(*ModOrError);
-      break;
-    }
-    case BinaryFormat::SPIRV: {
-      auto ModOrError = loadSPIRVKernel(LLVMCtx, Kernel);
-      if (auto Err = ModOrError.takeError()) {
-        return std::move(Err);
+      case BinaryFormat::SPIRV: {
+        auto ModOrError = loadSPIRVKernel(LLVMCtx, Kernel);
+        if (auto Err = ModOrError.takeError()) {
+          return std::move(Err);
+        }
+        NewMod = std::move(*ModOrError);
+        break;
       }
-      NewMod = std::move(*ModOrError);
-      break;
-    }
-    default: {
-      return createStringError(
-          inconvertibleErrorCode(),
-          "Failed to load kernel from unsupported input format");
-    }
-    }
-
-    // We do not assume that the input binary information has the address bits
-    // set, but rather retrieve this information from the SPIR-V/LLVM module's
-    // data-layout.
-    BinInfo.AddressBits = NewMod->getDataLayout().getPointerSizeInBits();
-
-    if (First) {
-      // We can simply assign the module we just loaded from SPIR-V to the
-      // empty pointer on the first iteration.
-      Result = std::move(NewMod);
-      // The first module will dictate the address bits for the remaining.
-      AddressBits = BinInfo.AddressBits;
-      First = false;
-    } else {
-      // We have already loaded some module, so now we need to
-      // link the module we just loaded with the result so far.
-      // FIXME: We allow duplicates to be overridden by the module
-      // read last. This could cause problems if different modules contain
-      // definitions with the same name, but different body/content.
-      // Check that this is not problematic.
-      Linker::linkModules(*Result, std::move(NewMod),
-                          Linker::Flags::OverrideFromSrc);
-      if (AddressBits != BinInfo.AddressBits) {
+      default: {
         return createStringError(
             inconvertibleErrorCode(),
-            "Number of address bits between SPIR-V modules does not match");
+            "Failed to load kernel from unsupported input format");
+      }
+      }
+
+      // We do not assume that the input binary information has the address bits
+      // set, but rather retrieve this information from the SPIR-V/LLVM module's
+      // data-layout.
+      BinInfo.AddressBits = NewMod->getDataLayout().getPointerSizeInBits();
+
+      if (First) {
+        // We can simply assign the module we just loaded from SPIR-V to the
+        // empty pointer on the first iteration.
+        Result = std::move(NewMod);
+        // The first module will dictate the address bits for the remaining.
+        AddressBits = BinInfo.AddressBits;
+        First = false;
+      } else {
+        // We have already loaded some module, so now we need to
+        // link the module we just loaded with the result so far.
+        // FIXME: We allow duplicates to be overridden by the module
+        // read last. This could cause problems if different modules contain
+        // definitions with the same name, but different body/content.
+        // Check that this is not problematic.
+        Linker::linkModules(*Result, std::move(NewMod),
+                            Linker::Flags::OverrideFromSrc);
+        if (AddressBits != BinInfo.AddressBits) {
+          return createStringError(
+              inconvertibleErrorCode(),
+              "Number of address bits between SPIR-V modules does not match");
+        }
       }
+      ParsedBinaries.insert(BinBlob);
     }
     // Restore SYCL/OpenCL kernel attributes such as 'reqd_work_group_size' or
     // 'work_group_size_hint' from metadata attached to the kernel function and

From 1d8bab8cd6c4eda25d06a5a59ea12f91a95035fa Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Tue, 14 Feb 2023 15:44:25 +0000
Subject: [PATCH 12/16] [SYCL][Fusion] Disable heterogeneous ND ranges on CUDA

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 sycl-fusion/jit-compiler/CMakeLists.txt       |  1 +
 sycl-fusion/jit-compiler/include/JITContext.h |  1 -
 sycl-fusion/jit-compiler/lib/KernelFusion.cpp | 15 ++++++++++-----
 sycl-fusion/passes/CMakeLists.txt             |  1 +
 sycl-fusion/passes/kernel-fusion/Builtins.cpp | 12 ------------
 sycl-fusion/passes/kernel-fusion/Builtins.h   |  4 ----
 sycl/source/detail/jit_compiler.cpp           |  2 +-
 sycl/source/detail/jit_compiler.hpp           |  2 +-
 8 files changed, 14 insertions(+), 24 deletions(-)

diff --git a/sycl-fusion/jit-compiler/CMakeLists.txt b/sycl-fusion/jit-compiler/CMakeLists.txt
index f67eed5fc6c76..777f06b303e0c 100644
--- a/sycl-fusion/jit-compiler/CMakeLists.txt
+++ b/sycl-fusion/jit-compiler/CMakeLists.txt
@@ -21,6 +21,7 @@ add_llvm_library(sycl-fusion
    ScalarOpts
    InstCombine
    Target
+   TargetParser
    NVPTX
    X86
    MC
diff --git a/sycl-fusion/jit-compiler/include/JITContext.h b/sycl-fusion/jit-compiler/include/JITContext.h
index c26312ceb70e6..eda8c83d8b21c 100644
--- a/sycl-fusion/jit-compiler/include/JITContext.h
+++ b/sycl-fusion/jit-compiler/include/JITContext.h
@@ -88,7 +88,6 @@ class JITContext {
   mutable MutexT CacheMutex;
 
   std::unordered_map<CacheKeyT, SYCLKernelInfo> Cache;
-  
 };
 } // namespace jit_compiler
 
diff --git a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
index 821ad4cd36369..e2f321bbcb81b 100644
--- a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
+++ b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
@@ -56,6 +56,10 @@ FusionResult KernelFusion::fuseKernels(
     int BarriersFlags,
     const std::vector<jit_compiler::ParameterInternalization> &Internalization,
     const std::vector<jit_compiler::JITConstant> &Constants) {
+  // Initialize the configuration helper to make the options for this invocation
+  // available (on a per-thread basis).
+  ConfigHelper::setConfig(std::move(JITConfig));
+
   const auto NDRanges = gatherNDRanges(KernelInformation);
 
   if (!isValidCombination(NDRanges)) {
@@ -64,9 +68,12 @@ FusionResult KernelFusion::fuseKernels(
         "different global sizes in dimensions [2, N) and non-zero offsets"};
   }
 
-  // Initialize the configuration helper to make the options for this invocation
-  // available (on a per-thread basis).
-  ConfigHelper::setConfig(std::move(JITConfig));
+  bool IsHeterogeneousList = jit_compiler::isHeterogeneousList(NDRanges);
+
+  BinaryFormat TargetFormat = ConfigHelper::get<option::JITTargetFormat>();
+  if (TargetFormat == BinaryFormat::PTX && IsHeterogeneousList) {
+    return FusionResult{"Heterogeneous ND ranges not supported for CUDA"};
+  }
 
   bool CachingEnabled = ConfigHelper::get<option::JITEnableCaching>();
   CacheKeyT CacheKey{KernelsToFuse,
@@ -137,8 +144,6 @@ FusionResult KernelFusion::fuseKernels(
 
   SYCLKernelInfo &FusedKernelInfo = *NewModInfo->getKernelFor(FusedKernelName);
 
-  BinaryFormat TargetFormat = ConfigHelper::get<option::JITTargetFormat>();
-
   if (auto Error = translation::KernelTranslator::translateKernel(
           FusedKernelInfo, *NewMod, JITCtx, TargetFormat)) {
     return errorToFusionResult(std::move(Error),
diff --git a/sycl-fusion/passes/CMakeLists.txt b/sycl-fusion/passes/CMakeLists.txt
index c74c3c40e34e0..95f504091dc30 100644
--- a/sycl-fusion/passes/CMakeLists.txt
+++ b/sycl-fusion/passes/CMakeLists.txt
@@ -46,6 +46,7 @@ add_llvm_library(SYCLKernelFusionPasses
   Support
   TransformUtils
   Passes
+  TargetParser
 )
 
 target_include_directories(SYCLKernelFusionPasses
diff --git a/sycl-fusion/passes/kernel-fusion/Builtins.cpp b/sycl-fusion/passes/kernel-fusion/Builtins.cpp
index 6de1c40e8f4cd..9cff120cd5888 100644
--- a/sycl-fusion/passes/kernel-fusion/Builtins.cpp
+++ b/sycl-fusion/passes/kernel-fusion/Builtins.cpp
@@ -595,18 +595,6 @@ jit_compiler::Remapper::remapBuiltins(Function *F, const NDRange &SrcNDRange,
   return Clone;
 }
 
-void jit_compiler::barrierCall(IRBuilderBase &Builder, int Flags) {
-  assert((Flags == 1 || Flags == 2 || Flags == 3) && "Invalid barrier flags");
-
-  // See
-  // https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#Memory_Semantics_-id-
-  createSPIRVCall(Builder, BarrierName,
-                  {Builder.getInt32(/*Exec Scope : Workgroup = */ 2),
-                   Builder.getInt32(/*Exec Scope : Workgroup = */ 2),
-                   Builder.getInt32(0x10 | (Flags % 2 == 1 ? 0x100 : 0x0) |
-                                    ((Flags >> 1 == 1 ? 0x200 : 0x0)))});
-}
-
 Value *jit_compiler::createSPIRVCall(IRBuilderBase &Builder,
                                      StringRef FunctionName,
                                      ArrayRef<Value *> Args) {
diff --git a/sycl-fusion/passes/kernel-fusion/Builtins.h b/sycl-fusion/passes/kernel-fusion/Builtins.h
index bc8e186f14188..a55b6efab39f9 100644
--- a/sycl-fusion/passes/kernel-fusion/Builtins.h
+++ b/sycl-fusion/passes/kernel-fusion/Builtins.h
@@ -60,10 +60,6 @@ constexpr llvm::StringLiteral OffloadStartWrapperName{
 llvm::Value *getGlobalLinearID(llvm::IRBuilderBase &Builder,
                                const NDRange &FusedNDRange);
 
-///
-/// Creates a call to a barrier function.
-void barrierCall(llvm::IRBuilderBase &Builder, int Flags);
-
 ///
 /// @return A call to a SPIRV function, which will be declared if not already in
 /// the module.
diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
index 8770f58172548..e4b16db373df8 100644
--- a/sycl/source/detail/jit_compiler.cpp
+++ b/sycl/source/detail/jit_compiler.cpp
@@ -799,7 +799,7 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
   JITConfig.set<::jit_compiler::option::JITEnableVerbose>(DebugEnabled);
   JITConfig.set<::jit_compiler::option::JITEnableCaching>(
       detail::SYCLConfig<detail::SYCL_ENABLE_FUSION_CACHING>::get());
-      
+
   ::jit_compiler::BinaryFormat TargetFormat = getTargetFormat(Queue);
   JITConfig.set<::jit_compiler::option::JITTargetFormat>(TargetFormat);
 
diff --git a/sycl/source/detail/jit_compiler.hpp b/sycl/source/detail/jit_compiler.hpp
index 4b299572bacbc..71a57723ea6bb 100644
--- a/sycl/source/detail/jit_compiler.hpp
+++ b/sycl/source/detail/jit_compiler.hpp
@@ -13,7 +13,7 @@
 #include <detail/scheduler/scheduler.hpp>
 
 namespace jit_compiler {
-enum class BinaryFormat;
+enum class BinaryFormat : uint32_t;
 class JITContext;
 struct SYCLKernelInfo;
 struct SYCLKernelAttribute;

From 4506db6bc0232ef4e7d0292813987ff4a0527002 Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Tue, 14 Feb 2023 16:14:50 +0000
Subject: [PATCH 13/16] [SYCL][Fusion] Enable JIT caching for CUDA fusion

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 sycl/source/detail/jit_compiler.cpp | 11 ++++++++---
 sycl/source/detail/jit_compiler.hpp |  4 ++++
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
index e4b16db373df8..d5742a3580445 100644
--- a/sycl/source/detail/jit_compiler.cpp
+++ b/sycl/source/detail/jit_compiler.cpp
@@ -847,9 +847,14 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
     auto PIDeviceBinaries = createPIDeviceBinary(FusedKernelInfo, TargetFormat);
     detail::ProgramManager::getInstance().addImages(PIDeviceBinaries);
     Handle = OSUtil::getOSModuleHandle(PIDeviceBinaries->DeviceBinaries);
-  } else if (DebugEnabled) {
-    // TODO(Lukas): Create correct OSModuleHandle when using a cached binary.
-    std::cerr << "INFO: Re-using existing device binary for fused kernel\n";
+    CachedModules.emplace(FusedKernelInfo.Name, Handle);
+  } else {
+    if (DebugEnabled) {
+      std::cerr << "INFO: Re-using existing device binary for fused kernel\n";
+    }
+    // Retrieve an OSModuleHandle for the cached binary.
+    assert(CachedModules.count(FusedKernelInfo.Name) && "No cached binary");
+    Handle = CachedModules.at(FusedKernelInfo.Name);
   }
 
   // Create a kernel bundle for the fused kernel.
diff --git a/sycl/source/detail/jit_compiler.hpp b/sycl/source/detail/jit_compiler.hpp
index 71a57723ea6bb..fae774cadd09a 100644
--- a/sycl/source/detail/jit_compiler.hpp
+++ b/sycl/source/detail/jit_compiler.hpp
@@ -12,6 +12,8 @@
 #include <detail/scheduler/commands.hpp>
 #include <detail/scheduler/scheduler.hpp>
 
+#include <unordered_map>
+
 namespace jit_compiler {
 enum class BinaryFormat : uint32_t;
 class JITContext;
@@ -60,6 +62,8 @@ class jit_compiler {
   // Manages the lifetime of the PI structs for device binaries.
   std::vector<DeviceBinariesCollection> JITDeviceBinaries;
 
+  std::unordered_map<std::string, OSModuleHandle> CachedModules;
+
   std::unique_ptr<::jit_compiler::JITContext> MJITContext;
 };
 

From 26978b2586cf5f096a8cbd15aae245110fe08bad Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Wed, 15 Feb 2023 12:56:58 +0000
Subject: [PATCH 14/16] [SYCL][Fusion] Catch empty standard arguments

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 sycl/source/detail/jit_compiler.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
index d5742a3580445..de07a86623219 100644
--- a/sycl/source/detail/jit_compiler.cpp
+++ b/sycl/source/detail/jit_compiler.cpp
@@ -457,7 +457,10 @@ static ParamIterator preProcessArguments(
     // which will go out-of-scope before we execute the fused kernel. Therefore,
     // we need to copy the argument to a permant location and update the
     // argument.
-    Arg->Arg.MPtr = storePlainArgRaw(ArgStorage, Arg->Arg.MPtr, Arg->Arg.MSize);
+    if (Arg->Arg.MPtr) {
+      Arg->Arg.MPtr =
+          storePlainArgRaw(ArgStorage, Arg->Arg.MPtr, Arg->Arg.MSize);
+    }
     // Standard layout arguments do not participate in identical argument
     // detection, but we still add it to the list here. As the SYCL runtime can
     // only check the raw bytes for identical content, but is unaware of the
@@ -474,6 +477,7 @@ static ParamIterator preProcessArguments(
         ::jit_compiler::Parameter{Arg->KernelIndex, Arg->ArgIndex},
         Arg->Arg.MPtr, Arg->Arg.MSize);
     return ++Arg;
+    
   }
   // First check if there's already another parameter with identical
   // value.

From aa2b629f4f98d1d16d7fbe4c8bce7b9785a177f3 Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Tue, 7 Mar 2023 14:03:28 +0000
Subject: [PATCH 15/16] [SYCL][Fusion] Rebase and address feedback

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 clang/include/clang/Driver/Action.h           |   2 +-
 clang/lib/Driver/Action.cpp                   |   2 +-
 sycl-fusion/jit-compiler/CMakeLists.txt       |   7 +-
 sycl-fusion/jit-compiler/include/JITContext.h |   7 +-
 sycl-fusion/jit-compiler/include/Options.h    |   1 +
 sycl-fusion/jit-compiler/lib/JITContext.cpp   |  11 +-
 sycl-fusion/jit-compiler/lib/KernelFusion.cpp |  22 +
 .../lib/fusion/FusionPipeline.cpp             |   4 +-
 .../jit-compiler/lib/fusion/ModuleHelper.cpp  |   2 +-
 .../lib/translation/KernelTranslation.cpp     |  43 +-
 .../lib/translation/KernelTranslation.h       |  10 +-
 .../lib/translation/SPIRVLLVMTranslation.cpp  |   2 +-
 .../internalization/Internalization.cpp       |   2 +-
 .../passes/kernel-fusion/SYCLKernelFusion.cpp |  13 +-
 sycl-fusion/passes/syclcp/SYCLCP.cpp          |   2 +-
 .../passes/target/TargetFusionInfo.cpp        | 458 +++++++++++-------
 sycl-fusion/passes/target/TargetFusionInfo.h  | 131 +----
 sycl/source/detail/device_info.hpp            |   3 +-
 sycl/source/detail/jit_compiler.cpp           |  38 +-
 sycl/source/detail/scheduler/commands.hpp     |   2 +-
 20 files changed, 412 insertions(+), 350 deletions(-)

diff --git a/clang/include/clang/Driver/Action.h b/clang/include/clang/Driver/Action.h
index 9dab2e32b2ffc..45e9133b7ed13 100644
--- a/clang/include/clang/Driver/Action.h
+++ b/clang/include/clang/Driver/Action.h
@@ -665,7 +665,7 @@ class OffloadWrapperJobAction : public JobAction {
 public:
   OffloadWrapperJobAction(ActionList &Inputs, types::ID Type);
   OffloadWrapperJobAction(Action *Input, types::ID OutputType,
-                          bool IsEmbeddedIR = false);
+                          bool EmbedIR = false);
 
   bool isEmbeddedIR() const { return EmbedIR; }
 
diff --git a/clang/lib/Driver/Action.cpp b/clang/lib/Driver/Action.cpp
index f55233c59a6db..4cb0225cad293 100644
--- a/clang/lib/Driver/Action.cpp
+++ b/clang/lib/Driver/Action.cpp
@@ -478,7 +478,7 @@ void OffloadWrapperJobAction::anchor() {}
 
 OffloadWrapperJobAction::OffloadWrapperJobAction(ActionList &Inputs,
                                                  types::ID Type)
-  : JobAction(OffloadWrapperJobClass, Inputs, Type) {}
+    : JobAction(OffloadWrapperJobClass, Inputs, Type), EmbedIR(false) {}
 
 OffloadWrapperJobAction::OffloadWrapperJobAction(Action *Input, types::ID Type,
                                                  bool IsEmbeddedIR)
diff --git a/sycl-fusion/jit-compiler/CMakeLists.txt b/sycl-fusion/jit-compiler/CMakeLists.txt
index 777f06b303e0c..bf323239679b4 100644
--- a/sycl-fusion/jit-compiler/CMakeLists.txt
+++ b/sycl-fusion/jit-compiler/CMakeLists.txt
@@ -22,9 +22,8 @@ add_llvm_library(sycl-fusion
    InstCombine
    Target
    TargetParser
-   NVPTX
-   X86
    MC
+   ${LLVM_TARGETS_TO_BUILD}
 )
 
 target_include_directories(sycl-fusion
@@ -47,6 +46,10 @@ target_link_libraries(sycl-fusion
   ${CMAKE_THREAD_LIBS_INIT}
 )
 
+if("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD)
+  target_compile_definitions(sycl-fusion PRIVATE FUSION_JIT_SUPPORT_PTX)
+endif()
+
 if (BUILD_SHARED_LIBS)
   if(NOT MSVC AND NOT APPLE)
     # Manage symbol visibility through the linker to make sure no LLVM symbols
diff --git a/sycl-fusion/jit-compiler/include/JITContext.h b/sycl-fusion/jit-compiler/include/JITContext.h
index eda8c83d8b21c..69465a74d8371 100644
--- a/sycl-fusion/jit-compiler/include/JITContext.h
+++ b/sycl-fusion/jit-compiler/include/JITContext.h
@@ -39,7 +39,7 @@ using CacheKeyT =
 /// Wrapper around a kernel binary.
 class KernelBinary {
 public:
-  explicit KernelBinary(std::string Binary, BinaryFormat Format);
+  explicit KernelBinary(std::string &&Binary, BinaryFormat Format);
 
   jit_compiler::BinaryAddress address() const;
 
@@ -65,7 +65,10 @@ class JITContext {
 
   llvm::LLVMContext *getLLVMContext();
 
-  KernelBinary &emplaceSPIRVBinary(std::string Binary, BinaryFormat Format);
+  template <typename... Ts> KernelBinary &emplaceKernelBinary(Ts &&...Args) {
+    WriteLockT WriteLock{BinariesMutex};
+    return Binaries.emplace_back(std::forward<Ts>(Args)...);
+  }
 
   std::optional<SYCLKernelInfo> getCacheEntry(CacheKeyT &Identifier) const;
 
diff --git a/sycl-fusion/jit-compiler/include/Options.h b/sycl-fusion/jit-compiler/include/Options.h
index 4fe7787df00db..841a229adb7a3 100644
--- a/sycl-fusion/jit-compiler/include/Options.h
+++ b/sycl-fusion/jit-compiler/include/Options.h
@@ -10,6 +10,7 @@
 #define SYCL_FUSION_JIT_COMPILER_OPTIONS_H
 
 #include "Kernel.h"
+
 #include <memory>
 #include <unordered_map>
 
diff --git a/sycl-fusion/jit-compiler/lib/JITContext.cpp b/sycl-fusion/jit-compiler/lib/JITContext.cpp
index e1dda8b928c45..4499dd93f4d76 100644
--- a/sycl-fusion/jit-compiler/lib/JITContext.cpp
+++ b/sycl-fusion/jit-compiler/lib/JITContext.cpp
@@ -11,7 +11,7 @@
 
 using namespace jit_compiler;
 
-KernelBinary::KernelBinary(std::string Binary, BinaryFormat Fmt)
+KernelBinary::KernelBinary(std::string &&Binary, BinaryFormat Fmt)
     : Blob{std::move(Binary)}, Format{Fmt} {}
 
 jit_compiler::BinaryAddress KernelBinary::address() const {
@@ -29,15 +29,6 @@ JITContext::~JITContext() = default;
 
 llvm::LLVMContext *JITContext::getLLVMContext() { return LLVMCtx.get(); }
 
-KernelBinary &JITContext::emplaceSPIRVBinary(std::string Binary,
-                                             BinaryFormat Format) {
-  WriteLockT WriteLock{BinariesMutex};
-  // NOTE: With C++17, which returns a reference from emplace_back, the
-  // following code would be even simpler.
-  Binaries.emplace_back(std::move(Binary), Format);
-  return Binaries.back();
-}
-
 std::optional<SYCLKernelInfo>
 JITContext::getCacheEntry(CacheKeyT &Identifier) const {
   ReadLockT ReadLock{CacheMutex};
diff --git a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
index e2f321bbcb81b..be7515d935247 100644
--- a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
+++ b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
@@ -48,6 +48,22 @@ gatherNDRanges(llvm::ArrayRef<SYCLKernelInfo> KernelInformation) {
   return NDRanges;
 }
 
+static bool isTargetFormatSupported(BinaryFormat TargetFormat) {
+  switch (TargetFormat) {
+  case BinaryFormat::SPIRV:
+    return true;
+  case BinaryFormat::PTX: {
+#ifdef FUSION_JIT_SUPPORT_PTX
+    return true;
+#else  // FUSION_JIT_SUPPORT_PTX
+    return false;
+#endif // FUSION_JIT_SUPPORT_PTX
+  }
+  default:
+    return false;
+  }
+}
+
 FusionResult KernelFusion::fuseKernels(
     JITContext &JITCtx, Config &&JITConfig,
     const std::vector<SYCLKernelInfo> &KernelInformation,
@@ -71,6 +87,12 @@ FusionResult KernelFusion::fuseKernels(
   bool IsHeterogeneousList = jit_compiler::isHeterogeneousList(NDRanges);
 
   BinaryFormat TargetFormat = ConfigHelper::get<option::JITTargetFormat>();
+
+  if (!isTargetFormatSupported(TargetFormat)) {
+    return FusionResult(
+        "Fusion output target format not supported by this build");
+  }
+
   if (TargetFormat == BinaryFormat::PTX && IsHeterogeneousList) {
     return FusionResult{"Heterogeneous ND ranges not supported for CUDA"};
   }
diff --git a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp
index fb2e2b15200a7..965202fe5bbfa 100644
--- a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp
+++ b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp
@@ -23,8 +23,8 @@
 #ifndef NDEBUG
 #include "llvm/IR/Verifier.h"
 #endif // NDEBUG
-#include "llvm/ADT/Triple.h"
 #include "llvm/Passes/PassBuilder.h"
+#include "llvm/TargetParser/Triple.h"
 #include "llvm/Transforms/InstCombine/InstCombine.h"
 #include "llvm/Transforms/Scalar/ADCE.h"
 #include "llvm/Transforms/Scalar/EarlyCSE.h"
@@ -103,7 +103,7 @@ FusionPipeline::runFusionPasses(Module &Mod, SYCLModuleInfo &InputInfo,
     // to/from generic address-space as possible, because these hinder
     // internalization.
     // Ideally, the static compiler should have performed that job.
-    unsigned FlatAddressSpace = getFlatAddressSpace(Mod);
+    const unsigned FlatAddressSpace = getFlatAddressSpace(Mod);
     FPM.addPass(InferAddressSpacesPass(FlatAddressSpace));
     MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
   }
diff --git a/sycl-fusion/jit-compiler/lib/fusion/ModuleHelper.cpp b/sycl-fusion/jit-compiler/lib/fusion/ModuleHelper.cpp
index 0a90600530f74..0d6ac7f48fbbe 100644
--- a/sycl-fusion/jit-compiler/lib/fusion/ModuleHelper.cpp
+++ b/sycl-fusion/jit-compiler/lib/fusion/ModuleHelper.cpp
@@ -24,7 +24,7 @@ helper::ModuleHelper::cloneAndPruneModule(Module *Mod,
   identifyUnusedFunctions(Mod, CGRoots, UnusedFunctions);
 
   {
-    auto TFI = llvm::TargetFusionInfo::getTargetFusionInfo(Mod);
+    TargetFusionInfo TFI{Mod};
     SmallVector<Function *> Unused{UnusedFunctions.begin(),
                                    UnusedFunctions.end()};
     TFI.notifyFunctionsDelete(Unused);
diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
index b877829782658..8b1f3d913026d 100644
--- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
+++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "KernelTranslation.h"
+
 #include "SPIRVLLVMTranslation.h"
 #include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/IR/Constants.h"
@@ -182,7 +183,8 @@ llvm::Error KernelTranslator::translateKernel(SYCLKernelInfo &Kernel,
     break;
   }
   case BinaryFormat::PTX: {
-    llvm::Expected<KernelBinary *> BinaryOrError = translateToPTX(Mod, JITCtx);
+    llvm::Expected<KernelBinary *> BinaryOrError =
+        translateToPTX(Kernel, Mod, JITCtx);
     if (auto Error = BinaryOrError.takeError()) {
       return Error;
     }
@@ -215,12 +217,20 @@ KernelTranslator::translateToSPIRV(llvm::Module &Mod, JITContext &JITCtx) {
 }
 
 llvm::Expected<KernelBinary *>
-KernelTranslator::translateToPTX(llvm::Module &Mod, JITContext &JITCtx) {
-  // FIXME: Can we limit this to the NVPTX specific target?
-  llvm::InitializeAllTargets();
-  llvm::InitializeAllAsmParsers();
-  llvm::InitializeAllAsmPrinters();
-  llvm::InitializeAllTargetMCs();
+KernelTranslator::translateToPTX(SYCLKernelInfo &KernelInfo, llvm::Module &Mod,
+                                 JITContext &JITCtx) {
+#ifndef FUSION_JIT_SUPPORT_PTX
+  return createStringError(inconvertibleErrorCode(),
+                           "PTX translation not supported in this build");
+#else  // FUSION_JIT_SUPPORT_PTX
+  LLVMInitializeNVPTXTargetInfo();
+  LLVMInitializeNVPTXTarget();
+  LLVMInitializeNVPTXAsmPrinter();
+  LLVMInitializeNVPTXTargetMC();
+#endif // FUSION_JIT_SUPPORT_PTX
+
+  static const char *TARGET_CPU_ATTRIBUTE = "target-cpu";
+  static const char *TARGET_FEATURE_ATTRIBUTE = "target-features";
 
   std::string TargetTriple{"nvptx64-nvidia-cuda"};
 
@@ -231,13 +241,26 @@ KernelTranslator::translateToPTX(llvm::Module &Mod, JITContext &JITCtx) {
   if (!Target) {
     return createStringError(
         inconvertibleErrorCode(),
-        "Failed to load and translate SPIR-V module with error %s",
+        "Failed to load and translate PTX LLVM IR module with error %s",
         ErrorMessage.c_str());
   }
 
+  llvm::StringRef TargetCPU{"sm_50"};
+  llvm::StringRef TargetFeatures{"+sm_50,+ptx76"};
+  if (auto *KernelFunc = Mod.getFunction(KernelInfo.Name)) {
+    if (KernelFunc->hasFnAttribute(TARGET_CPU_ATTRIBUTE)) {
+      TargetCPU =
+          KernelFunc->getFnAttribute(TARGET_CPU_ATTRIBUTE).getValueAsString();
+    }
+    if (KernelFunc->hasFnAttribute(TARGET_FEATURE_ATTRIBUTE)) {
+      TargetFeatures = KernelFunc->getFnAttribute(TARGET_FEATURE_ATTRIBUTE)
+                           .getValueAsString();
+    }
+  }
+
   // FIXME: Check whether we can provide more accurate target information here
   auto *TargetMachine = Target->createTargetMachine(
-      TargetTriple, "sm_50", "+sm_50,+ptx76", {}, llvm::Reloc::PIC_,
+      TargetTriple, TargetCPU, TargetFeatures, {}, llvm::Reloc::PIC_,
       std::nullopt, llvm::CodeGenOpt::Default);
 
   llvm::legacy::PassManager PM;
@@ -259,5 +282,5 @@ KernelTranslator::translateToPTX(llvm::Module &Mod, JITContext &JITCtx) {
     ASMStream.flush();
   }
 
-  return &JITCtx.emplaceSPIRVBinary(PTXASM, BinaryFormat::PTX);
+  return &JITCtx.emplaceKernelBinary(std::move(PTXASM), BinaryFormat::PTX);
 }
diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h
index 276b6e536d269..7e4816df9bf94 100644
--- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h
+++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h
@@ -5,12 +5,14 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+#ifndef SYCL_FUSION_JIT_COMPILER_TRANSLATION_KERNELTRANSLATION_H
+#define SYCL_FUSION_JIT_COMPILER_TRANSLATION_KERNELTRANSLATION_H
 
 #include "JITContext.h"
 #include "Kernel.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
-#include <llvm/Support/Error.h>
+#include "llvm/Support/Error.h"
 #include <vector>
 
 namespace jit_compiler {
@@ -39,8 +41,10 @@ class KernelTranslator {
   static llvm::Expected<KernelBinary *> translateToSPIRV(llvm::Module &Mod,
                                                          JITContext &JITCtx);
 
-  static llvm::Expected<KernelBinary *> translateToPTX(llvm::Module &Mod,
-                                                       JITContext &JITCtx);
+  static llvm::Expected<KernelBinary *>
+  translateToPTX(SYCLKernelInfo &Kernel, llvm::Module &Mod, JITContext &JITCtx);
 };
 } // namespace translation
 } // namespace jit_compiler
+
+#endif // SYCL_FUSION_JIT_COMPILER_TRANSLATION_KERNELTRANSLATION_H
diff --git a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp
index a5effb33a85e1..4092f9dd96fc8 100644
--- a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp
+++ b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp
@@ -94,5 +94,5 @@ SPIRVLLVMTranslator::translateLLVMtoSPIRV(Module &Mod, JITContext &JITCtx) {
         "Translation of LLVM IR to SPIR-V failed with error %s",
         ErrMsg.c_str());
   }
-  return &JITCtx.emplaceSPIRVBinary(BinaryStream.str(), BinaryFormat::SPIRV);
+  return &JITCtx.emplaceKernelBinary(BinaryStream.str(), BinaryFormat::SPIRV);
 }
diff --git a/sycl-fusion/passes/internalization/Internalization.cpp b/sycl-fusion/passes/internalization/Internalization.cpp
index bca46ba43acb3..3d5c38c799e8f 100644
--- a/sycl-fusion/passes/internalization/Internalization.cpp
+++ b/sycl-fusion/passes/internalization/Internalization.cpp
@@ -631,7 +631,7 @@ static void moduleCleanup(Module &M, ModuleAnalysisManager &AM,
 
 PreservedAnalyses llvm::SYCLInternalizer::run(Module &M,
                                               ModuleAnalysisManager &AM) {
-  auto TFI = TargetFusionInfo::getTargetFusionInfo(&M);
+  TargetFusionInfo TFI{&M};
   // Private promotion
   const PreservedAnalyses Tmp = SYCLInternalizerImpl{
       TFI.getPrivateAddressSpace(), PrivatePromotion, true, TFI}(M, AM);
diff --git a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp
index a0079affbaa06..b61a0936eb32e 100644
--- a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp
+++ b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp
@@ -138,7 +138,7 @@ PreservedAnalyses SYCLKernelFusion::run(Module &M, ModuleAnalysisManager &AM) {
       AM.getResult<SYCLModuleInfoAnalysis>(M).ModuleInfo;
   assert(ModuleInfo && "No module information available");
 
-  auto TFI = TargetFusionInfo::getTargetFusionInfo(&M);
+  TargetFusionInfo TFI{&M};
 
   // Iterate over the functions in the module and locate all
   // stub functions identified by metadata.
@@ -456,11 +456,20 @@ Error SYCLKernelFusion::fuseKernel(
       FT, GlobalValue::LinkageTypes::ExternalLinkage,
       M.getDataLayout().getProgramAddressSpace(), KernelName->getString(), &M);
   {
+    auto DefaultAttr = FusedFunction->getAttributes();
+    // Add uniform function attributes, i.e., attributes with identical value on
+    // each input function, to the fused function.
+    auto *FirstFunction = InputFunctions.front().F;
+    for (const auto &UniformKey : TargetInfo.getUniformKernelAttributes()) {
+      if (FirstFunction->hasFnAttribute(UniformKey)) {
+        DefaultAttr = DefaultAttr.addFnAttribute(
+            LLVMCtx, FirstFunction->getFnAttribute(UniformKey));
+      }
+    }
     // Add the collected parameter attributes to the fused function.
     // Copying the parameter attributes from their original definition in the
     // input kernels should be safe and they most likely can't be deducted later
     // on, as no caller is present in the module.
-    auto DefaultAttr = FusedFunction->getAttributes();
     auto FusedFnAttrs =
         AttributeList::get(LLVMCtx, DefaultAttr.getFnAttrs(),
                            DefaultAttr.getRetAttrs(), FusedParamAttributes);
diff --git a/sycl-fusion/passes/syclcp/SYCLCP.cpp b/sycl-fusion/passes/syclcp/SYCLCP.cpp
index 5cb9b00433ef7..b928a33760da0 100644
--- a/sycl-fusion/passes/syclcp/SYCLCP.cpp
+++ b/sycl-fusion/passes/syclcp/SYCLCP.cpp
@@ -249,7 +249,7 @@ PreservedAnalyses SYCLCP::run(Module &M, ModuleAnalysisManager &AM) {
     Changed = propagateConstants(F, *ConstantsOrErr) || Changed;
   }
 
-  auto TFI = TargetFusionInfo::getTargetFusionInfo(&M);
+  TargetFusionInfo TFI{&M};
 
   if (Changed) {
     moduleCleanup(M, AM, TFI);
diff --git a/sycl-fusion/passes/target/TargetFusionInfo.cpp b/sycl-fusion/passes/target/TargetFusionInfo.cpp
index c17e2464ac323..27514bbb80269 100644
--- a/sycl-fusion/passes/target/TargetFusionInfo.cpp
+++ b/sycl-fusion/passes/target/TargetFusionInfo.cpp
@@ -7,234 +7,336 @@
 //===----------------------------------------------------------------------===//
 
 #include "TargetFusionInfo.h"
-#include "llvm/ADT/Triple.h"
+
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicsNVPTX.h"
+#include "llvm/TargetParser/Triple.h"
 
-using namespace llvm;
+namespace llvm {
+class TargetFusionInfoImpl {
 
-//
-// TargetFusionInfo
-//
+public:
+  explicit TargetFusionInfoImpl(llvm::Module *Mod) : LLVMMod{Mod} {};
 
-TargetFusionInfo TargetFusionInfo::getTargetFusionInfo(llvm::Module *Mod) {
-  llvm::Triple Tri(Mod->getTargetTriple());
-  if (Tri.isNVPTX()) {
-    return TargetFusionInfo(
-        std::shared_ptr<NVPTXTargetFusionInfo>(new NVPTXTargetFusionInfo(Mod)));
-  }
-  if (Tri.isSPIRV() || Tri.isSPIR()) {
-    return TargetFusionInfo(
-        std::shared_ptr<SPIRVTargetFusionInfo>(new SPIRVTargetFusionInfo(Mod)));
-  }
-  assert(false && "Unsupported target for fusion");
-}
+  virtual ~TargetFusionInfoImpl() = default;
+
+  virtual void notifyFunctionsDelete(
+      [[maybe_unused]] llvm::ArrayRef<Function *> Funcs) const {}
+
+  virtual void addKernelFunction([[maybe_unused]] Function *KernelFunc) const {}
+
+  virtual void postProcessKernel([[maybe_unused]] Function *KernelFunc) const {}
+
+  virtual ArrayRef<StringRef> getKernelMetadataKeys() const { return {}; }
+
+  virtual ArrayRef<StringRef> getUniformKernelAttributes() const { return {}; }
+
+  virtual void createBarrierCall(IRBuilderBase &Builder,
+                                 int BarrierFlags) const = 0;
+
+  virtual unsigned getPrivateAddressSpace() const = 0;
+
+  virtual unsigned getLocalAddressSpace() const = 0;
+
+  virtual void
+  updateAddressSpaceMetadata([[maybe_unused]] Function *KernelFunc,
+                             [[maybe_unused]] ArrayRef<size_t> LocalSize,
+                             [[maybe_unused]] unsigned AddressSpace) const {}
+
+protected:
+  llvm::Module *LLVMMod;
+};
 
 //
 // SPIRVTargetFusionInfo
 //
+class SPIRVTargetFusionInfo : public TargetFusionInfoImpl {
+public:
+  using TargetFusionInfoImpl::TargetFusionInfoImpl;
 
-void SPIRVTargetFusionInfo::addKernelFunction(Function *KernelFunc) const {
-  KernelFunc->setCallingConv(CallingConv::SPIR_KERNEL);
-}
-
-ArrayRef<StringRef> SPIRVTargetFusionInfo::getKernelMetadataKeys() const {
-  // NOTE: We do not collect the "kernel_arg_name" metadata, because
-  // the kernel arguments receive new names in the fused kernel.
-  static SmallVector<StringRef> Keys{
-      {"kernel_arg_addr_space", "kernel_arg_access_qual", "kernel_arg_type",
-       "kernel_arg_base_type", "kernel_arg_type_qual"}};
-  return Keys;
-}
+  void addKernelFunction(Function *KernelFunc) const override {
+    KernelFunc->setCallingConv(CallingConv::SPIR_KERNEL);
+  }
 
-void SPIRVTargetFusionInfo::postProcessKernel(Function *KernelFunc) const {
-  // Attach the kernel_arg_name metadata.
-  SmallVector<Metadata *, 16> KernelArgNames;
-  for (auto &P : KernelFunc->args()) {
-    KernelArgNames.push_back(MDString::get(LLVMMod->getContext(), P.getName()));
+  ArrayRef<StringRef> getKernelMetadataKeys() const override {
+    // NOTE: We do not collect the "kernel_arg_name" metadata, because
+    // the kernel arguments receive new names in the fused kernel.
+    static SmallVector<StringRef> Keys{
+        {"kernel_arg_addr_space", "kernel_arg_access_qual", "kernel_arg_type",
+         "kernel_arg_base_type", "kernel_arg_type_qual"}};
+    return Keys;
   }
-  auto *ArgNameMD = MDTuple::get(LLVMMod->getContext(), KernelArgNames);
-  KernelFunc->setMetadata("kernel_arg_name", ArgNameMD);
-
-  static constexpr auto ITTStartWrapper = "__itt_offload_wi_start_wrapper";
-  static constexpr auto ITTFinishWrapper = "__itt_offload_wi_finish_wrapper";
-  // Remove all existing calls of the ITT instrumentation functions. Insert new
-  // ones in the entry block of the fused kernel and every exit block if the
-  // functions are present in the module.
-  // We cannot use the existing SPIRITTAnnotations pass, because that pass might
-  // insert calls to functions not present in the module (e.g., ITT
-  // instrumentations for barriers). As the JITed module is not linked with
-  // libdevice anymore, the functions would remain unresolved and cause the
-  // driver to fail.
-  Function *StartWrapperFunc = LLVMMod->getFunction(ITTStartWrapper);
-  Function *FinishWrapperFunc = LLVMMod->getFunction(ITTFinishWrapper);
-  bool InsertWrappers =
-      ((StartWrapperFunc && !StartWrapperFunc->isDeclaration()) &&
-       (FinishWrapperFunc && !FinishWrapperFunc->isDeclaration()));
-  auto *WrapperFuncTy = FunctionType::get(
-      Type::getVoidTy(LLVMMod->getContext()), /*isVarArg*/ false);
-  for (auto &BB : *KernelFunc) {
-    for (auto Inst = BB.begin(); Inst != BB.end();) {
-      if (auto *CB = dyn_cast<CallBase>(Inst)) {
-        if (CB->getCalledFunction()->getName().starts_with("__itt_offload")) {
-          Inst = Inst->eraseFromParent();
-          continue;
+
+  void postProcessKernel(Function *KernelFunc) const override {
+    // Attach the kernel_arg_name metadata.
+    SmallVector<Metadata *> KernelArgNames;
+    for (auto &P : KernelFunc->args()) {
+      KernelArgNames.push_back(
+          MDString::get(LLVMMod->getContext(), P.getName()));
+    }
+    auto *ArgNameMD = MDTuple::get(LLVMMod->getContext(), KernelArgNames);
+    KernelFunc->setMetadata("kernel_arg_name", ArgNameMD);
+
+    static constexpr auto ITTStartWrapper = "__itt_offload_wi_start_wrapper";
+    static constexpr auto ITTFinishWrapper = "__itt_offload_wi_finish_wrapper";
+    // Remove all existing calls of the ITT instrumentation functions. Insert
+    // new ones in the entry block of the fused kernel and every exit block if
+    // the functions are present in the module. We cannot use the existing
+    // SPIRITTAnnotations pass, because that pass might insert calls to
+    // functions not present in the module (e.g., ITT instrumentations for
+    // barriers). As the JITed module is not linked with libdevice anymore, the
+    // functions would remain unresolved and cause the driver to fail.
+    Function *StartWrapperFunc = LLVMMod->getFunction(ITTStartWrapper);
+    Function *FinishWrapperFunc = LLVMMod->getFunction(ITTFinishWrapper);
+    bool InsertWrappers =
+        ((StartWrapperFunc && !StartWrapperFunc->isDeclaration()) &&
+         (FinishWrapperFunc && !FinishWrapperFunc->isDeclaration()));
+    auto *WrapperFuncTy = FunctionType::get(
+        Type::getVoidTy(LLVMMod->getContext()), /*isVarArg*/ false);
+    for (auto &BB : *KernelFunc) {
+      for (auto Inst = BB.begin(); Inst != BB.end();) {
+        if (auto *CB = dyn_cast<CallBase>(Inst)) {
+          if (CB->getCalledFunction()->getName().starts_with("__itt_offload")) {
+            Inst = Inst->eraseFromParent();
+            continue;
+          }
+        }
+        ++Inst;
+      }
+      if (InsertWrappers) {
+        if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator())) {
+          auto *WrapperCall =
+              CallInst::Create(WrapperFuncTy, FinishWrapperFunc, "", RI);
+          WrapperCall->setCallingConv(CallingConv::SPIR_FUNC);
         }
       }
-      ++Inst;
     }
     if (InsertWrappers) {
-      if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator())) {
-        auto *WrapperCall =
-            CallInst::Create(WrapperFuncTy, FinishWrapperFunc, "", RI);
-        WrapperCall->setCallingConv(CallingConv::SPIR_FUNC);
-      }
+      KernelFunc->getEntryBlock().getFirstInsertionPt();
+      auto *WrapperCall =
+          CallInst::Create(WrapperFuncTy, StartWrapperFunc, "",
+                           &*KernelFunc->getEntryBlock().getFirstInsertionPt());
+      WrapperCall->setCallingConv(CallingConv::SPIR_FUNC);
     }
   }
-  if (InsertWrappers) {
-    KernelFunc->getEntryBlock().getFirstInsertionPt();
-    auto *WrapperCall =
-        CallInst::Create(WrapperFuncTy, StartWrapperFunc, "",
-                         &*KernelFunc->getEntryBlock().getFirstInsertionPt());
-    WrapperCall->setCallingConv(CallingConv::SPIR_FUNC);
-  }
-}
 
-void SPIRVTargetFusionInfo::createBarrierCall(IRBuilderBase &Builder,
-                                              int BarrierFlags) const {
-  if (BarrierFlags == -1) {
-    return;
-  }
-  assert((BarrierFlags == 1 || BarrierFlags == 2 || BarrierFlags == 3) &&
-         "Invalid barrier flags");
+  void createBarrierCall(IRBuilderBase &Builder,
+                         int BarrierFlags) const override {
+    if (BarrierFlags == -1) {
+      return;
+    }
+    assert((BarrierFlags == 1 || BarrierFlags == 2 || BarrierFlags == 3) &&
+           "Invalid barrier flags");
+
+    static const auto FnAttrs = AttributeSet::get(
+        LLVMMod->getContext(),
+        {Attribute::get(LLVMMod->getContext(), Attribute::AttrKind::Convergent),
+         Attribute::get(LLVMMod->getContext(), Attribute::AttrKind::NoUnwind)});
 
-  static const auto FnAttrs = AttributeSet::get(
-      LLVMMod->getContext(),
-      {Attribute::get(LLVMMod->getContext(), Attribute::AttrKind::Convergent),
-       Attribute::get(LLVMMod->getContext(), Attribute::AttrKind::NoUnwind)});
+    static constexpr StringLiteral N{"_Z22__spirv_ControlBarrierjjj"};
 
-  static constexpr StringLiteral N{"_Z22__spirv_ControlBarrierjjj"};
+    Function *F = LLVMMod->getFunction(N);
+    if (!F) {
+      constexpr auto Linkage = GlobalValue::LinkageTypes::ExternalLinkage;
 
-  Function *F = LLVMMod->getFunction(N);
-  if (!F) {
-    constexpr auto Linkage = GlobalValue::LinkageTypes::ExternalLinkage;
+      auto *Ty = FunctionType::get(
+          Builder.getVoidTy(),
+          {Builder.getInt32Ty(), Builder.getInt32Ty(), Builder.getInt32Ty()},
+          false /* isVarArg*/);
 
-    auto *Ty = FunctionType::get(
-        Builder.getVoidTy(),
-        {Builder.getInt32Ty(), Builder.getInt32Ty(), Builder.getInt32Ty()},
-        false /* isVarArg*/);
+      F = Function::Create(Ty, Linkage, N, *LLVMMod);
 
-    F = Function::Create(Ty, Linkage, N, *LLVMMod);
+      F->setAttributes(
+          AttributeList::get(LLVMMod->getContext(), FnAttrs, {}, {}));
+      F->setCallingConv(CallingConv::SPIR_FUNC);
+    }
+
+    // See
+    // https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#Memory_Semantics_-id-
+    SmallVector<Value *> Args{
+        Builder.getInt32(/*Exec Scope : Workgroup = */ 2),
+        Builder.getInt32(/*Exec Scope : Workgroup = */ 2),
+        Builder.getInt32(0x10 | (BarrierFlags % 2 == 1 ? 0x100 : 0x0) |
+                         ((BarrierFlags >> 1 == 1 ? 0x200 : 0x0)))};
 
-    F->setAttributes(
+    auto *BarrierCallInst = Builder.CreateCall(F, Args);
+    BarrierCallInst->setAttributes(
         AttributeList::get(LLVMMod->getContext(), FnAttrs, {}, {}));
-    F->setCallingConv(CallingConv::SPIR_FUNC);
+    BarrierCallInst->setCallingConv(CallingConv::SPIR_FUNC);
   }
 
-  // See
-  // https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#Memory_Semantics_-id-
-  SmallVector<Value *> Args{
-      Builder.getInt32(/*Exec Scope : Workgroup = */ 2),
-      Builder.getInt32(/*Exec Scope : Workgroup = */ 2),
-      Builder.getInt32(0x10 | (BarrierFlags % 2 == 1 ? 0x100 : 0x0) |
-                       ((BarrierFlags >> 1 == 1 ? 0x200 : 0x0)))};
-
-  auto *BarrierCallInst = Builder.CreateCall(F, Args);
-  BarrierCallInst->setAttributes(
-      AttributeList::get(LLVMMod->getContext(), FnAttrs, {}, {}));
-  BarrierCallInst->setCallingConv(CallingConv::SPIR_FUNC);
-}
+  // Corresponds to definition of spir_private and spir_local in
+  // "clang/lib/Basic/Target/SPIR.h", "SPIRDefIsGenMap".
+  unsigned getPrivateAddressSpace() const override { return 0; }
+  unsigned getLocalAddressSpace() const override { return 3; }
 
-void SPIRVTargetFusionInfo::updateAddressSpaceMetadata(
-    Function *KernelFunc, ArrayRef<size_t> LocalSize,
-    unsigned AddressSpace) const {
-  static constexpr unsigned AddressSpaceBitWidth{32};
-  static constexpr StringLiteral KernelArgAddrSpaceMD{"kernel_arg_addr_space"};
-
-  auto *NewAddrspace = ConstantAsMetadata::get(ConstantInt::get(
-      IntegerType::get(LLVMMod->getContext(), AddressSpaceBitWidth),
-      AddressSpace));
-  if (auto *AddrspaceMD = dyn_cast_or_null<MDNode>(
-          KernelFunc->getMetadata(KernelArgAddrSpaceMD))) {
-    // If we have kernel_arg_addr_space metadata in the original function,
-    // we should update it in the new one.
-    SmallVector<Metadata *> NewInfo{AddrspaceMD->op_begin(),
-                                    AddrspaceMD->op_end()};
-    for (auto I : enumerate(LocalSize)) {
-      if (I.value() == 0) {
-        continue;
-      }
-      const auto Index = I.index();
-      if (const auto *PtrTy =
-              dyn_cast<PointerType>(KernelFunc->getArg(Index)->getType())) {
-        if (PtrTy->getAddressSpace() == getLocalAddressSpace()) {
-          NewInfo[Index] = NewAddrspace;
+  void updateAddressSpaceMetadata(Function *KernelFunc,
+                                  ArrayRef<size_t> LocalSize,
+                                  unsigned AddressSpace) const override {
+    static constexpr unsigned AddressSpaceBitWidth{32};
+    static constexpr StringLiteral KernelArgAddrSpaceMD{
+        "kernel_arg_addr_space"};
+
+    auto *NewAddrspace = ConstantAsMetadata::get(ConstantInt::get(
+        IntegerType::get(LLVMMod->getContext(), AddressSpaceBitWidth),
+        AddressSpace));
+    if (auto *AddrspaceMD = dyn_cast_or_null<MDNode>(
+            KernelFunc->getMetadata(KernelArgAddrSpaceMD))) {
+      // If we have kernel_arg_addr_space metadata in the original function,
+      // we should update it in the new one.
+      SmallVector<Metadata *> NewInfo{AddrspaceMD->op_begin(),
+                                      AddrspaceMD->op_end()};
+      for (auto I : enumerate(LocalSize)) {
+        if (I.value() == 0) {
+          continue;
+        }
+        const auto Index = I.index();
+        if (const auto *PtrTy =
+                dyn_cast<PointerType>(KernelFunc->getArg(Index)->getType())) {
+          if (PtrTy->getAddressSpace() == getLocalAddressSpace()) {
+            NewInfo[Index] = NewAddrspace;
+          }
         }
       }
+      KernelFunc->setMetadata(KernelArgAddrSpaceMD,
+                              MDNode::get(KernelFunc->getContext(), NewInfo));
     }
-    KernelFunc->setMetadata(KernelArgAddrSpaceMD,
-                            MDNode::get(KernelFunc->getContext(), NewInfo));
   }
-}
+};
 
 //
 // NVPTXTargetFusionInfo
 //
+class NVPTXTargetFusionInfo : public TargetFusionInfoImpl {
+public:
+  using TargetFusionInfoImpl::TargetFusionInfoImpl;
 
-void NVPTXTargetFusionInfo::notifyFunctionsDelete(
-    llvm::ArrayRef<Function *> Funcs) const {
-  SmallPtrSet<Constant *, 8> DeletedFuncs{Funcs.begin(), Funcs.end()};
-  SmallVector<MDNode *> ValidKernels;
-  auto *OldAnnotations = LLVMMod->getNamedMetadata("nvvm.annotations");
-  for (auto *Op : OldAnnotations->operands()) {
-    if (auto *TOp = dyn_cast<MDTuple>(Op)) {
-      if (auto *COp = dyn_cast_if_present<ConstantAsMetadata>(
-              TOp->getOperand(0).get())) {
-        if (!DeletedFuncs.contains(COp->getValue())) {
-          ValidKernels.push_back(Op);
-          // Add to the set to also remove duplicate entries.
-          DeletedFuncs.insert(COp->getValue());
+  void notifyFunctionsDelete(llvm::ArrayRef<Function *> Funcs) const override {
+    SmallPtrSet<Constant *, 8> DeletedFuncs{Funcs.begin(), Funcs.end()};
+    SmallVector<MDNode *> ValidKernels;
+    auto *OldAnnotations = LLVMMod->getNamedMetadata("nvvm.annotations");
+    for (auto *Op : OldAnnotations->operands()) {
+      if (auto *TOp = dyn_cast<MDTuple>(Op)) {
+        if (auto *COp = dyn_cast_if_present<ConstantAsMetadata>(
+                TOp->getOperand(0).get())) {
+          if (!DeletedFuncs.contains(COp->getValue())) {
+            ValidKernels.push_back(Op);
+            // Add to the set to also remove duplicate entries.
+            DeletedFuncs.insert(COp->getValue());
+          }
         }
       }
     }
+    LLVMMod->eraseNamedMetadata(OldAnnotations);
+    auto *NewAnnotations =
+        LLVMMod->getOrInsertNamedMetadata("nvvm.annotations");
+    for (auto *Kernel : ValidKernels) {
+      NewAnnotations->addOperand(Kernel);
+    }
+  }
+
+  void addKernelFunction(Function *KernelFunc) const override {
+    auto *NVVMAnnotations =
+        LLVMMod->getOrInsertNamedMetadata("nvvm.annotations");
+    auto *MDOne = ConstantAsMetadata::get(
+        ConstantInt::get(Type::getInt32Ty(LLVMMod->getContext()), 1));
+    auto *MDKernelString = MDString::get(LLVMMod->getContext(), "kernel");
+    auto *MDFunc = ConstantAsMetadata::get(KernelFunc);
+    SmallVector<Metadata *, 3> KernelMD({MDFunc, MDKernelString, MDOne});
+    auto *Tuple = MDTuple::get(LLVMMod->getContext(), KernelMD);
+    NVVMAnnotations->addOperand(Tuple);
+  }
+
+  ArrayRef<StringRef> getKernelMetadataKeys() const override {
+    // FIXME: Check whether we need to take care of sycl_fixed_targets.
+    static SmallVector<StringRef> Keys{{"kernel_arg_buffer_location",
+                                        "kernel_arg_runtime_aligned",
+                                        "kernel_arg_exclusive_ptr"}};
+    return Keys;
+  }
+
+  ArrayRef<StringRef> getUniformKernelAttributes() const override {
+    static SmallVector<StringRef> Keys{
+        {"target-cpu", "target-features", "uniform-work-group-size"}};
+    return Keys;
+  }
+
+  void createBarrierCall(IRBuilderBase &Builder,
+                         int BarrierFlags) const override {
+    if (BarrierFlags == -1) {
+      return;
+    }
+    // Emit a call to llvm.nvvm.barrier0. From the user manual of the NVPTX
+    // backend: "The ‘@llvm.nvvm.barrier0()’ intrinsic emits a PTX bar.sync 0
+    // instruction, equivalent to the __syncthreads() call in CUDA."
+    Builder.CreateIntrinsic(Intrinsic::NVVMIntrinsics::nvvm_barrier0, {}, {});
+  }
+
+  // Corresponds to the definitions in the LLVM NVPTX backend user guide:
+  // https://llvm.org/docs/NVPTXUsage.html#address-spaces
+  unsigned getPrivateAddressSpace() const override { return 0; }
+  unsigned getLocalAddressSpace() const override { return 3; }
+};
+
+//
+// TargetFusionInfo
+//
+
+TargetFusionInfo::TargetFusionInfo(llvm::Module *Mod) {
+  llvm::Triple Tri(Mod->getTargetTriple());
+  if (Tri.isNVPTX()) {
+    Impl = std::make_shared<NVPTXTargetFusionInfo>(Mod);
+    return;
   }
-  LLVMMod->eraseNamedMetadata(OldAnnotations);
-  auto *NewAnnotations = LLVMMod->getOrInsertNamedMetadata("nvvm.annotations");
-  for (auto *Kernel : ValidKernels) {
-    NewAnnotations->addOperand(Kernel);
+  if (Tri.isSPIRV() || Tri.isSPIR()) {
+    Impl = std::make_shared<SPIRVTargetFusionInfo>(Mod);
+    return;
   }
+  llvm_unreachable("Unsupported target for fusion");
 }
 
-void NVPTXTargetFusionInfo::addKernelFunction(Function *KernelFunc) const {
-  auto *NVVMAnnotations = LLVMMod->getOrInsertNamedMetadata("nvvm.annotations");
-  auto *MDOne = ConstantAsMetadata::get(
-      ConstantInt::get(Type::getInt32Ty(LLVMMod->getContext()), 1));
-  auto *MDKernelString = MDString::get(LLVMMod->getContext(), "kernel");
-  auto *MDFunc = ConstantAsMetadata::get(KernelFunc);
-  SmallVector<Metadata *, 3> KernelMD({MDFunc, MDKernelString, MDOne});
-  auto *Tuple = MDTuple::get(LLVMMod->getContext(), KernelMD);
-  NVVMAnnotations->addOperand(Tuple);
+void TargetFusionInfo::notifyFunctionsDelete(
+    llvm::ArrayRef<Function *> Funcs) const {
+  Impl->notifyFunctionsDelete(Funcs);
 }
 
-ArrayRef<StringRef> NVPTXTargetFusionInfo::getKernelMetadataKeys() const {
-  // FIXME: Check whether we need to take care of sycl_fixed_targets.
-  static SmallVector<StringRef> Keys{{"kernel_arg_buffer_location",
-                                      "kernel_arg_runtime_aligned",
-                                      "kernel_arg_exclusive_ptr"}};
-  return Keys;
+void TargetFusionInfo::addKernelFunction(llvm::Function *KernelFunc) const {
+  Impl->addKernelFunction(KernelFunc);
 }
 
-void NVPTXTargetFusionInfo::createBarrierCall(IRBuilderBase &Builder,
-                                              int BarrierFlags) const {
-  if (BarrierFlags == -1) {
-    return;
-  }
-  // Emit a call to llvm.nvvm.barrier0. From the user manual of the NVPTX
-  // backend: "The ‘@llvm.nvvm.barrier0()’ intrinsic emits a PTX bar.sync 0
-  // instruction, equivalent to the __syncthreads() call in CUDA."
-  Builder.CreateIntrinsic(Intrinsic::NVVMIntrinsics::nvvm_barrier0, {}, {});
+void TargetFusionInfo::postProcessKernel(Function *KernelFunc) const {
+  Impl->postProcessKernel(KernelFunc);
+}
+
+llvm::ArrayRef<llvm::StringRef>
+TargetFusionInfo::getKernelMetadataKeys() const {
+  return Impl->getKernelMetadataKeys();
+}
+
+void TargetFusionInfo::createBarrierCall(IRBuilderBase &Builder,
+                                         int BarrierFlags) const {
+  Impl->createBarrierCall(Builder, BarrierFlags);
+}
+
+unsigned TargetFusionInfo::getPrivateAddressSpace() const {
+  return Impl->getPrivateAddressSpace();
+}
+
+unsigned TargetFusionInfo::getLocalAddressSpace() const {
+  return Impl->getLocalAddressSpace();
+}
+
+void TargetFusionInfo::updateAddressSpaceMetadata(Function *KernelFunc,
+                                                  ArrayRef<size_t> LocalSize,
+                                                  unsigned AddressSpace) const {
+  Impl->updateAddressSpaceMetadata(KernelFunc, LocalSize, AddressSpace);
+}
+
+llvm::ArrayRef<llvm::StringRef>
+TargetFusionInfo::getUniformKernelAttributes() const {
+  return Impl->getUniformKernelAttributes();
 }
 
 //
@@ -265,3 +367,5 @@ void MetadataCollection::attachToFunction(llvm::Function *Func) {
     Func->setMetadata(Key, MDEntries);
   }
 }
+
+} // namespace llvm
diff --git a/sycl-fusion/passes/target/TargetFusionInfo.h b/sycl-fusion/passes/target/TargetFusionInfo.h
index c307de2dd764e..f88476c01ebc3 100644
--- a/sycl-fusion/passes/target/TargetFusionInfo.h
+++ b/sycl-fusion/passes/target/TargetFusionInfo.h
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef SYCL_FUSION_PASSES_TARGETFUSIONINFO_H
-#define SYCL_FUSION_PASSES_TARGETFUSIONINFO_H
+#ifndef SYCL_FUSION_PASSES_TARGET_TARGETFUSIONINFO_H
+#define SYCL_FUSION_PASSES_TARGET_TARGETFUSIONINFO_H
 
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
@@ -15,90 +15,7 @@
 
 namespace llvm {
 
-class TargetFusionInfoImpl {
-
-public:
-  virtual ~TargetFusionInfoImpl() = default;
-
-  virtual void notifyFunctionsDelete(llvm::ArrayRef<Function *> Funcs) const {
-    (void)Funcs;
-  }
-
-  virtual void addKernelFunction(Function *KernelFunc) const {
-    (void)KernelFunc;
-  }
-
-  virtual void postProcessKernel(Function *KernelFunc) const {
-    (void)KernelFunc;
-  }
-
-  virtual ArrayRef<StringRef> getKernelMetadataKeys() const { return {}; }
-
-  virtual void createBarrierCall(IRBuilderBase &Builder,
-                                 int BarrierFlags) const = 0;
-
-  virtual unsigned getPrivateAddressSpace() const = 0;
-
-  virtual unsigned getLocalAddressSpace() const = 0;
-
-  virtual void updateAddressSpaceMetadata(Function *KernelFunc,
-                                          ArrayRef<size_t> LocalSize,
-                                          unsigned AddressSpace) const {
-    (void)KernelFunc;
-    (void)LocalSize;
-  }
-
-protected:
-  explicit TargetFusionInfoImpl(llvm::Module *Mod) : LLVMMod{Mod} {};
-
-  llvm::Module *LLVMMod;
-
-  friend class TargetFusionInfo;
-};
-
-class SPIRVTargetFusionInfo : public TargetFusionInfoImpl {
-public:
-  void addKernelFunction(Function *KernelFunc) const override;
-
-  ArrayRef<StringRef> getKernelMetadataKeys() const override;
-
-  void postProcessKernel(Function *KernelFunc) const override;
-
-  void createBarrierCall(IRBuilderBase &Builder,
-                         int BarrierFlags) const override;
-
-  // Corresponds to definition of spir_private and spir_local in
-  // "clang/lib/Basic/Target/SPIR.h", "SPIRDefIsGenMap".
-  unsigned getPrivateAddressSpace() const override { return 0; }
-  unsigned getLocalAddressSpace() const override { return 3; }
-
-  void updateAddressSpaceMetadata(Function *KernelFunc,
-                                  ArrayRef<size_t> LocalSize,
-                                  unsigned AddressSpace) const override;
-
-private:
-  using TargetFusionInfoImpl::TargetFusionInfoImpl;
-};
-
-class NVPTXTargetFusionInfo : public TargetFusionInfoImpl {
-public:
-  void notifyFunctionsDelete(llvm::ArrayRef<Function *> Funcs) const override;
-
-  void addKernelFunction(Function *KernelFunc) const override;
-
-  ArrayRef<StringRef> getKernelMetadataKeys() const override;
-
-  void createBarrierCall(IRBuilderBase &Builder,
-                         int BarrierFlags) const override;
-
-  // Corresponds to the definitions in the LLVM NVPTX backend user guide:
-  // https://llvm.org/docs/NVPTXUsage.html#address-spaces
-  unsigned getPrivateAddressSpace() const override { return 0; }
-  unsigned getLocalAddressSpace() const override { return 3; }
-
-private:
-  using TargetFusionInfoImpl::TargetFusionInfoImpl;
-};
+class TargetFusionInfoImpl;
 
 ///
 /// Common interface to target-specific logic around handling of kernel
@@ -108,59 +25,49 @@ class TargetFusionInfo {
   ///
   /// Create the correct target-specific implementation based on the target
   /// triple of \p Module.
-  static TargetFusionInfo getTargetFusionInfo(llvm::Module *Module);
+  explicit TargetFusionInfo(llvm::Module *Module);
 
   ///
   /// Notify the target-specific implementation that set of functions \p Funcs
   /// is about to be erased from the module. This should be called BEFORE
   /// erasing the functions.
-  void notifyFunctionsDelete(llvm::ArrayRef<Function *> Funcs) const {
-    Impl->notifyFunctionsDelete(Funcs);
-  }
+  void notifyFunctionsDelete(llvm::ArrayRef<Function *> Funcs) const;
 
   ///
   /// Notify the target-specific implementation that the function \p KernelFunc
   /// was added as a new kernel. This should be called AFTER the function has
   /// been added.
-  void addKernelFunction(llvm::Function *KernelFunc) const {
-    Impl->addKernelFunction(KernelFunc);
-  }
+  void addKernelFunction(llvm::Function *KernelFunc) const;
 
   ///
   /// Target-specific post-processing of the new kernel function \p KernelFunc.
   /// This should be called AFTER the function has been added and defined.
-  void postProcessKernel(Function *KernelFunc) const {
-    Impl->postProcessKernel(KernelFunc);
-  }
+  void postProcessKernel(Function *KernelFunc) const;
 
   ///
   /// Get the target-specific list of argument metadata attached to each
   /// function that should be collected and attached to the fused kernel.
-  llvm::ArrayRef<llvm::StringRef> getKernelMetadataKeys() const {
-    return Impl->getKernelMetadataKeys();
-  }
+  llvm::ArrayRef<llvm::StringRef> getKernelMetadataKeys() const;
+
+  ///
+  /// Get the target-specific list of kernel function attributes that are
+  /// uniform across all input kernels and should be attached to the fused
+  /// kernel.
+  llvm::ArrayRef<llvm::StringRef> getUniformKernelAttributes() const;
 
-  void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) const {
-    Impl->createBarrierCall(Builder, BarrierFlags);
-  }
+  void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) const;
 
-  unsigned getPrivateAddressSpace() const {
-    return Impl->getPrivateAddressSpace();
-  }
+  unsigned getPrivateAddressSpace() const;
 
-  unsigned getLocalAddressSpace() const { return Impl->getLocalAddressSpace(); }
+  unsigned getLocalAddressSpace() const;
 
   void updateAddressSpaceMetadata(Function *KernelFunc,
                                   ArrayRef<size_t> LocalSize,
-                                  unsigned AddressSpace) const {
-    Impl->updateAddressSpaceMetadata(KernelFunc, LocalSize, AddressSpace);
-  }
+                                  unsigned AddressSpace) const;
 
 private:
   using ImplPtr = std::shared_ptr<TargetFusionInfoImpl>;
 
-  TargetFusionInfo(ImplPtr &&I) : Impl{I} {}
-
   ImplPtr Impl;
 };
 
@@ -183,4 +90,4 @@ class MetadataCollection {
 };
 } // namespace llvm
 
-#endif // SYCL_FUSION_PASSES_TARGETFUSIONINFO_H
+#endif // SYCL_FUSION_PASSES_TARGET_TARGETFUSIONINFO_H
diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp
index 4a0d004ff8a11..7c96bef0ec5b8 100644
--- a/sycl/source/detail/device_info.hpp
+++ b/sycl/source/detail/device_info.hpp
@@ -762,7 +762,8 @@ struct get_device_info_impl<
     // and LevelZero.
     (void)dev;
     return (Plugin.getBackend() == backend::ext_oneapi_level_zero) ||
-           (Plugin.getBackend() == backend::opencl);
+           (Plugin.getBackend() == backend::opencl) ||
+           (Plugin.getBackend() == backend::ext_oneapi_cuda);
 #else  // SYCL_EXT_CODEPLAY_KERNEL_FUSION
     (void)dev;
     (void)Plugin;
diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
index de07a86623219..ae172158e5e67 100644
--- a/sycl/source/detail/jit_compiler.cpp
+++ b/sycl/source/detail/jit_compiler.cpp
@@ -62,21 +62,16 @@ retrieveKernelBinary(QueueImplPtr &Queue, CGExecKernel *KernelCG) {
                   backend::ext_oneapi_cuda;
   if (isNvidia) {
     auto KernelID = ProgramManager::getInstance().getSYCLKernelID(KernelName);
-    std::vector<kernel_id> KernelIds;
-    KernelIds.push_back(KernelID);
+    std::vector<kernel_id> KernelIds{KernelID};
     auto DeviceImages =
         ProgramManager::getInstance().getRawDeviceImages(KernelIds);
-    const RTDeviceBinaryImage *DeviceImage = nullptr;
-    for (auto *DI : DeviceImages) {
-      // We are looking for a device image with LLVM IR format and target spec
-      // "llvm_nvptx64", which has been set by the offload-wrapper action.
-      if (DI->getFormat() == PI_DEVICE_BINARY_TYPE_LLVMIR_BITCODE &&
-          DI->getRawData().DeviceTargetSpec == std::string("llvm_nvptx64")) {
-        DeviceImage = DI;
-        break;
-      }
-    }
-    if (!DeviceImage) {
+    auto DeviceImage = std::find_if(
+        DeviceImages.begin(), DeviceImages.end(), [](RTDeviceBinaryImage *DI) {
+          return DI->getFormat() == PI_DEVICE_BINARY_TYPE_LLVMIR_BITCODE &&
+                 DI->getRawData().DeviceTargetSpec ==
+                     std::string("llvm_nvptx64");
+        });
+    if (DeviceImage == DeviceImages.end()) {
       return {nullptr, nullptr};
     }
     auto ContextImpl = Queue->getContextImplPtr();
@@ -84,9 +79,9 @@ retrieveKernelBinary(QueueImplPtr &Queue, CGExecKernel *KernelCG) {
     auto DeviceImpl = Queue->getDeviceImplPtr();
     auto Device = detail::createSyclObjFromImpl<device>(DeviceImpl);
     RT::PiProgram Program =
-        detail::ProgramManager::getInstance().createPIProgram(*DeviceImage,
+        detail::ProgramManager::getInstance().createPIProgram(**DeviceImage,
                                                               Context, Device);
-    return {DeviceImage, Program};
+    return {*DeviceImage, Program};
   }
 
   const RTDeviceBinaryImage *DeviceImage = nullptr;
@@ -460,6 +455,11 @@ static ParamIterator preProcessArguments(
     if (Arg->Arg.MPtr) {
       Arg->Arg.MPtr =
           storePlainArgRaw(ArgStorage, Arg->Arg.MPtr, Arg->Arg.MSize);
+      // Propagate values of scalar parameters as constants to the JIT
+      // compiler.
+      JITConstants.emplace_back(
+          ::jit_compiler::Parameter{Arg->KernelIndex, Arg->ArgIndex},
+          Arg->Arg.MPtr, Arg->Arg.MSize);
     }
     // Standard layout arguments do not participate in identical argument
     // detection, but we still add it to the list here. As the SYCL runtime can
@@ -468,16 +468,10 @@ static ParamIterator preProcessArguments(
     // not be materialized by the JIT compiler. Instead of removing some
     // standard layout arguments due to identity and missing some in case the
     // materialization is not possible, we rely on constant propagation to
-    // replace standard layout arguments by constants (see below).
+    // replace standard layout arguments by constants.
     NonIdenticalParams.emplace_back(Arg->Arg, Arg->KernelIndex, Arg->ArgIndex,
                                     true);
-    // Propagate values of scalar parameters as constants to the JIT
-    // compiler.
-    JITConstants.emplace_back(
-        ::jit_compiler::Parameter{Arg->KernelIndex, Arg->ArgIndex},
-        Arg->Arg.MPtr, Arg->Arg.MSize);
     return ++Arg;
-    
   }
   // First check if there's already another parameter with identical
   // value.
diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp
index 482c7a1d7d309..cdabd6f1f0c68 100644
--- a/sycl/source/detail/scheduler/commands.hpp
+++ b/sycl/source/detail/scheduler/commands.hpp
@@ -289,7 +289,7 @@ class Command {
   // XPTI instrumentation. Copy code location details to the internal struct.
   // Memory is allocated in this method and released in destructor.
   void copySubmissionCodeLocation();
-  
+
   /// Clear all dependency events for device and host dependencies. This should
   /// only be used if a command is about to be deleted without being executed
   /// before that.

From d323beb24f82f4a1de7b8549df19c7f5a5cbada9 Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Wed, 22 Mar 2023 16:40:10 +0000
Subject: [PATCH 16/16] [SYCL][Fusion] Update linkage graph diagram

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 clang/lib/Driver/Driver.cpp | 106 ++++++++++++++++++------------------
 1 file changed, 54 insertions(+), 52 deletions(-)

diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 5c60b7b33fbd8..afb7103e2c1ed 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -5506,6 +5506,8 @@ class OffloadingActionBuilder final {
         //   s - device code split requested
         //   r - relocatable device code is requested
         //   f - link object output type is TY_Tempfilelist (fat archive)
+        //   e - Embedded IR for fusion (-fsycl-embed-ir) was requested
+        //       and target is NVPTX.
         //   * - "all other cases"
         //     - no condition means output/input is "always" present
         // First symbol indicates output/input type
@@ -5525,58 +5527,58 @@ class OffloadingActionBuilder final {
         //                |             |
         //                |             |
         //         .---------------------------------------.
-        //         |               PostLink                |
-        //         .---------------------------------------.
-        //                           [+*]                [+]
-        //                             |                  |
-        //                             |                  |
-        //                             |---------         |
-        //                             |        |         |
-        //                             |        |         |
-        //                             |      [+!rf]      |
-        //                             |  .-------------. |
-        //                             |  | llvm-foreach| |
-        //                             |  .-------------. |
-        //                             |        |         |
-        //                            [+*]    [+!rf]      |
-        //                      .-----------------.       |
-        //                      | FileTableTform  |       |
-        //                      | (extract "Code")|       |
-        //                      .-----------------.       |
-        //                              [-]               |-----------
-        //           --------------------|                           |
-        //           |                   |                           |
-        //           |                   |-----------------          |
-        //           |                   |                |          |
-        //           |                   |               [-!rf]      |
-        //           |                   |         .--------------.  |
-        //           |                   |         |FileTableTform|  |
-        //           |                   |         |   (merge)    |  |
-        //           |                   |         .--------------.  |
-        //           |                   |               [-]         |-------
-        //           |                   |                |          |      |
-        //           |                   |                |    ------|      |
-        //           |                   |        --------|    |            |
-        //          [.]                 [-*]   [-!rf]        [+!rf]         |
-        //   .---------------.  .-------------------. .--------------.      |
-        //   | finalizeNVPTX  | |  SPIRVTranslator  | |FileTableTform|      |
-        //   | finalizeAMDGCN | |                   | |   (merge)    |      |
-        //   .---------------.  .-------------------. . -------------.      |
-        //          [.]             [-as]      [-!a]         |              |
-        //           |                |          |           |              |
-        //           |              [-s]         |           |              |
-        //           |       .----------------.  |           |              |
-        //           |       | BackendCompile |  |           |              |
-        //           |       .----------------.  |     ------|              |
-        //           |              [-s]         |     |                    |
-        //           |                |          |     |                    |
-        //           |              [-a]      [-!a]  [-!rf]                 |
-        //           |              .--------------------.                  |
-        //           -----------[-n]|   FileTableTform   |[+*]--------------|
-        //                          |  (replace "Code")  |
-        //                          .--------------------.
-        //                                      |
-        //                                    [+*]
+        //         |               PostLink                |[+e]----------------
+        //         .---------------------------------------.                   |
+        //                           [+*]                [+]                   |
+        //                             |                  |                    |
+        //                             |                  |                    |
+        //                             |---------         |                    |
+        //                             |        |         |                    |
+        //                             |        |         |                    |
+        //                             |      [+!rf]      |                    |
+        //                             |  .-------------. |                    |
+        //                             |  | llvm-foreach| |                    |
+        //                             |  .-------------. |                    |
+        //                             |        |         |                    |
+        //                            [+*]    [+!rf]      |                    |
+        //                      .-----------------.       |                    |
+        //                      | FileTableTform  |       |                    |
+        //                      | (extract "Code")|       |                    |
+        //                      .-----------------.       |                    |
+        //                              [-]               |-----------         |
+        //           --------------------|                           |         |
+        //           |                   |                           |         |
+        //           |                   |-----------------          |         |
+        //           |                   |                |          |         |
+        //           |                   |               [-!rf]      |         |
+        //           |                   |         .--------------.  |         |
+        //           |                   |         |FileTableTform|  |         |
+        //           |                   |         |   (merge)    |  |         |
+        //           |                   |         .--------------.  |         |
+        //           |                   |               [-]         |-------  |
+        //           |                   |                |          |      |  |
+        //           |                   |                |    ------|      |  |
+        //           |                   |        --------|    |            |  |
+        //          [.]                 [-*]   [-!rf]        [+!rf]         |  |
+        //   .---------------.  .-------------------. .--------------.      |  |
+        //   | finalizeNVPTX  | |  SPIRVTranslator  | |FileTableTform|      |  |
+        //   | finalizeAMDGCN | |                   | |   (merge)    |      |  |
+        //   .---------------.  .-------------------. . -------------.      |  |
+        //          [.]             [-as]      [-!a]         |              |  |
+        //           |                |          |           |              |  |
+        //           |              [-s]         |           |              |  |
+        //           |       .----------------.  |           |              |  |
+        //           |       | BackendCompile |  |           |              |  |
+        //           |       .----------------.  |     ------|              |  |
+        //           |              [-s]         |     |                    |  |
+        //           |                |          |     |                    |  |
+        //           |              [-a]      [-!a]  [-!rf]                 |  |
+        //           |              .--------------------.                  |  |
+        //           -----------[-n]|   FileTableTform   |[+*]--------------|  |
+        //                          |  (replace "Code")  |                     |
+        //                          .--------------------.                     |
+        //                                      |      -------------------------
+        //                                    [+*]     | [+e]
         //         .--------------------------------------.
         //         |            OffloadWrapper            |
         //         .--------------------------------------.