From 94ac8d587c4228f03ffd43657d52c843e0b3d7cd Mon Sep 17 00:00:00 2001
From: Artur Gainullin <artur.gainullin@intel.com>
Date: Tue, 20 Jun 2023 13:02:35 -0700
Subject: [PATCH 01/14] Fix frontend issues after PR#8280

---
 clang/lib/CodeGen/CGBuiltin.cpp | 9 +++++++--
 clang/lib/CodeGen/CGCall.cpp    | 5 +++--
 2 files changed, 10 insertions(+), 4 deletions(-)
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 57d3bc03bc264..8013a8db2e47a 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -22144,7 +22144,8 @@ llvm::CallInst *CodeGenFunction::EmitFPBuiltinIndirectCall(
     // Even if the current function doesn't have a clang builtin, create
     // an 'fpbuiltin-max-error' attribute for it; unless it's marked with
     // an NoBuiltin attribute.
-    if (!FD->hasAttr<NoBuiltinAttr>()) {
+    if (!FD->hasAttr<NoBuiltinAttr>() &&
+        FD->getNameInfo().getName().isIdentifier()) {
       Name = FD->getName();
       FPAccuracyIntrinsicID =
           llvm::StringSwitch<unsigned>(Name)
@@ -22155,7 +22156,11 @@ llvm::CallInst *CodeGenFunction::EmitFPBuiltinIndirectCall(
               .Case("frem", llvm::Intrinsic::fpbuiltin_frem)
               .Case("sincos", llvm::Intrinsic::fpbuiltin_sincos)
               .Case("exp10", llvm::Intrinsic::fpbuiltin_exp10)
-              .Case("rsqrt", llvm::Intrinsic::fpbuiltin_rsqrt);
+              .Case("rsqrt", llvm::Intrinsic::fpbuiltin_rsqrt)
+              .Default(0);
+      if (!FPAccuracyIntrinsicID) {
+        return nullptr;
+      }
     } else {
       return nullptr;
     }
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 866c7d72ad6f1..5cf934407653b 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -5620,8 +5620,9 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
   // Emit the actual call/invoke instruction.
   llvm::CallBase *CI;
   if (!InvokeDest) {
-    if (!getLangOpts().FPAccuracyFuncMap.empty() ||
-        !getLangOpts().FPAccuracyVal.empty()) {
+    if ((!getLangOpts().FPAccuracyFuncMap.empty() ||
+         !getLangOpts().FPAccuracyVal.empty()) &&
+        isa_and_nonnull<FunctionDecl>(TargetDecl)) {
       const auto *FD = dyn_cast_if_present<FunctionDecl>(TargetDecl);
       assert(FD && "expecting a function");
       CI = EmitFPBuiltinIndirectCall(IRFuncTy, IRCallArgs, CalleePtr, FD);

From f235c4441c7528ea8afc9fd5c10a5654aea64613 Mon Sep 17 00:00:00 2001
From: Artur Gainullin <artur.gainullin@intel.com>
Date: Wed, 28 Jun 2023 14:38:31 -0700
Subject: [PATCH 02/14] [SYCL] Split device images based on accuracy level
 provided in option

This PR reuses optional kernel features mechanism to provide this
splitting logic based on accuracy level:
1. When the frontend emits an fp intrinsic call and attaches the maximum
   error attribute, we also attach "sycl_used_aspects" metadata to the call
   instruction with a value which corresponds to high, medium, low, sycl
   or cuda. The mapping for those values needs to be visible to the SYCL
   device compiler only, and we intentionally don't put those values into
   the aspects enum because user-visible aspects are not appropriate here,
   for the reasons described below.
2. Make SYCLPropagateAspectsUsage propagate sycl_used_aspects
   metadata from instructions to the kernel.
3. Don't add internal aspects into the requirements, because we don't need
   processing of these fake aspects (with negative values) in the SYCL RT.
After these changes splitting functionality based on sycl_used_aspects
metadata is available for free.

More details:
Currently the accuracy level can be controlled using the following options.
For entire translation unit:
-ffp-accuracy=high
-ffp-accuracy=medium
-ffp-accuracy=low
-ffp-accuracy=sycl
-ffp-accuracy=cuda

For particular functions in the translation unit:
-ffp-accuracy=low:sin,cos

Whenever the frontend sees a math function in a kernel or a device function,
it emits an fp intrinsic call with an attached callsite attribute indicating
the value of the maximum error. llvm-spirv is going to translate these
builtins to regular __ocl intrinsics and translate the callsite attribute to
a decorator (which is a new spirv extension). If that extension is not supported
by the backend, it is going to emit an error. An error is also emitted if
the backend supports the extension but can't compile the kernel because
it doesn't have a corresponding implementation of the math function complying
with the required maximum error.

Aspects corresponding to different levels of accuracy are not suitable in
this case because the aforementioned options are SYCL program compilation
options, i.e. it doesn't make sense to give the user an opportunity to write
something like this:
if (dev.has(aspect::ext_oneapi_fp_intrinsic_accuracy_high)) {
  /* submit kernel using high accuracy intrinsics */
}

But on our side we still would like to put kernels and device functions
into different images based on the required accuracy level. It is necessary
because some backends may support, for example, low and medium accuracy but
not high accuracy. In this case we want to make kernels using low
and medium accuracy levels buildable, so we can't put kernels requiring
high accuracy and low/medium accuracy together.
---
 clang/lib/CodeGen/CGBuiltin.cpp               |   8 +-
 clang/lib/CodeGen/CGCall.cpp                  |  19 ++-
 clang/lib/CodeGen/CGSYCLRuntime.h             |  13 ++
 clang/lib/CodeGen/CodeGenModule.cpp           |  10 +-
 clang/lib/CodeGen/CodeGenModule.h             |  13 +-
 .../SYCLLowerIR/SYCLPropagateAspectsUsage.cpp |   7 +
 .../sycl-post-link/SYCLDeviceRequirements.cpp |  17 ++-
 .../optional_kernel_features/fp-accuracy.cpp  | 138 ++++++++++++++++++
 8 files changed, 207 insertions(+), 18 deletions(-)
 create mode 100644 sycl/test/optional_kernel_features/fp-accuracy.cpp

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 8013a8db2e47a..83c6b69c64dc7 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -16,6 +16,7 @@
 #include "CGObjCRuntime.h"
 #include "CGOpenCLRuntime.h"
 #include "CGRecordLayout.h"
+#include "CGSYCLRuntime.h"
 #include "CodeGenFunction.h"
 #include "CodeGenModule.h"
 #include "ConstantEmitter.h"
@@ -513,12 +514,17 @@ static CallInst *CreateBuiltinCallWithAttr(CodeGenFunction &CGF, StringRef Name,
   // TODO: Replace AttrList with a single attribute. The call can only have a
   // single FPAccuracy attribute.
   llvm::AttributeList AttrList;
+  // "sycl_used_aspects" metadata associated with the call.
+  SmallVector<llvm::Metadata *, 4> AspectsMD;
   // sincos() doesn't return a value, but it still has a type associated with
   // it that corresponds to the operand type.
   CGF.CGM.getFPAccuracyFuncAttributes(
-      Name, AttrList, ID,
+      Name, AttrList, AspectsMD, ID,
       Name == "sincos" ? Args[0]->getType() : FPBuiltinF->getReturnType());
   CI->setAttributes(AttrList);
+  if (!AspectsMD.empty())
+    CI->setMetadata("sycl_used_aspects",
+                    llvm::MDNode::get(CGF.CGM.getLLVMContext(), AspectsMD));
   return CI;
 }
 
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 5cf934407653b..9e09a00d6a3aa 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -17,6 +17,7 @@
 #include "CGCXXABI.h"
 #include "CGCleanup.h"
 #include "CGRecordLayout.h"
+#include "CGSYCLRuntime.h"
 #include "CodeGenFunction.h"
 #include "CodeGenModule.h"
 #include "TargetInfo.h"
@@ -1846,8 +1847,18 @@ static llvm::fp::FPAccuracy convertFPAccuracy(StringRef FPAccuracyStr) {
       .Case("cuda", llvm::fp::FPAccuracy::CUDA);
 }
 
+static int32_t convertFPAccuracyToAspect(StringRef FPAccuracyStr) {
+  return llvm::StringSwitch<int32_t>(FPAccuracyStr)
+      .Case("high", SYCLInternalAspect::fp_intrinsic_accuracy_high)
+      .Case("medium", SYCLInternalAspect::fp_intrinsic_accuracy_medium)
+      .Case("low", SYCLInternalAspect::fp_intrinsic_accuracy_low)
+      .Case("sycl", SYCLInternalAspect::fp_intrinsic_accuracy_sycl)
+      .Case("cuda", SYCLInternalAspect::fp_intrinsic_accuracy_cuda);
+}
+
 void CodeGenModule::getDefaultFunctionFPAccuracyAttributes(
-    StringRef Name, llvm::AttrBuilder &FuncAttrs, unsigned ID,
+    StringRef Name, llvm::AttrBuilder &FuncAttrs,
+    SmallVector<llvm::Metadata *, 4> &MDs, unsigned ID,
     const llvm::Type *FuncType) {
   // Priority is given to to the accuracy specific to the function.
   // So, if the command line is something like this:
@@ -1864,6 +1875,9 @@ void CodeGenModule::getDefaultFunctionFPAccuracyAttributes(
           ID, FuncType, convertFPAccuracy(FuncMapIt->second));
       assert(!FPAccuracyVal.empty() && "A valid accuracy value is expected");
       FuncAttrs.addAttribute("fpbuiltin-max-error=", FPAccuracyVal);
+      if (getLangOpts().SYCLIsDevice)
+        MDs.push_back(llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
+            Int32Ty, convertFPAccuracyToAspect(FuncMapIt->second))));
     }
   }
   if (FuncAttrs.attrs().size() == 0)
@@ -1872,6 +1886,9 @@ void CodeGenModule::getDefaultFunctionFPAccuracyAttributes(
           ID, FuncType, convertFPAccuracy(getLangOpts().FPAccuracyVal));
       assert(!FPAccuracyVal.empty() && "A valid accuracy value is expected");
       FuncAttrs.addAttribute("fpbuiltin-max-error=", FPAccuracyVal);
+      if (getLangOpts().SYCLIsDevice)
+        MDs.push_back(llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
+            Int32Ty, convertFPAccuracyToAspect(getLangOpts().FPAccuracyVal))));
     }
 }
 
diff --git a/clang/lib/CodeGen/CGSYCLRuntime.h b/clang/lib/CodeGen/CGSYCLRuntime.h
index 71bc45599516f..095931af0a70e 100644
--- a/clang/lib/CodeGen/CGSYCLRuntime.h
+++ b/clang/lib/CodeGen/CGSYCLRuntime.h
@@ -23,6 +23,19 @@ namespace CodeGen {
 
 class CodeGenModule;
 
+// These aspects are internal and used for device image splitting purposes only.
+// They are not exposed to the DPCPP users through "aspect" enum. That's why
+// they are intentionally assigned negative values to filter them out at the
+// stage of embedding used aspects as device requirements to the executable.
+// We don't pass these internal aspects to the DPCPP RT.
+enum SYCLInternalAspect : int32_t {
+  fp_intrinsic_accuracy_high = -1,
+  fp_intrinsic_accuracy_medium = -2,
+  fp_intrinsic_accuracy_low = -3,
+  fp_intrinsic_accuracy_sycl = -4,
+  fp_intrinsic_accuracy_cuda = -5,
+};
+
 class CGSYCLRuntime {
 protected:
   CodeGenModule &CGM;
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 6178ce2840f79..8f4884a4a23e7 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -7882,12 +7882,12 @@ void CodeGenModule::moveLazyEmissionStates(CodeGenModule *NewBuilder) {
   NewBuilder->ABI->MangleCtx = std::move(ABI->MangleCtx);
 }
 
-void CodeGenModule::getFPAccuracyFuncAttributes(StringRef Name,
-                                                llvm::AttributeList &AttrList,
-                                                unsigned ID,
-                                                const llvm::Type *FuncType) {
+void CodeGenModule::getFPAccuracyFuncAttributes(
+    StringRef Name, llvm::AttributeList &AttrList,
+    SmallVector<llvm::Metadata *, 4> &MDs, unsigned ID,
+    const llvm::Type *FuncType) {
   llvm::AttrBuilder FuncAttrs(getLLVMContext());
-  getDefaultFunctionFPAccuracyAttributes(Name, FuncAttrs, ID, FuncType);
+  getDefaultFunctionFPAccuracyAttributes(Name, FuncAttrs, MDs, ID, FuncType);
   AttrList = llvm::AttributeList::get(
       getLLVMContext(), llvm::AttributeList::FunctionIndex, FuncAttrs);
 }
diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h
index 5d1521da2da63..20109372d854a 100644
--- a/clang/lib/CodeGen/CodeGenModule.h
+++ b/clang/lib/CodeGen/CodeGenModule.h
@@ -1594,8 +1594,9 @@ class CodeGenModule : public CodeGenTypeCache {
   void moveLazyEmissionStates(CodeGenModule *NewBuilder);
 
   void getFPAccuracyFuncAttributes(StringRef Name,
-                                   llvm::AttributeList &AttrList, unsigned ID,
-                                   const llvm::Type *FuncType);
+                                   llvm::AttributeList &AttrList,
+                                   SmallVector<llvm::Metadata *, 4> &MDs,
+                                   unsigned ID, const llvm::Type *FuncType);
 
 private:
   llvm::Constant *GetOrCreateLLVMFunction(
@@ -1791,10 +1792,10 @@ class CodeGenModule : public CodeGenTypeCache {
                                     bool AttrOnCallSite,
                                     llvm::AttrBuilder &FuncAttrs);
 
-  void getDefaultFunctionFPAccuracyAttributes(StringRef Name,
-                                              llvm::AttrBuilder &FuncAttrs,
-                                              unsigned ID,
-                                              const llvm::Type *FuncType);
+  void getDefaultFunctionFPAccuracyAttributes(
+      StringRef Name, llvm::AttrBuilder &FuncAttrs,
+      SmallVector<llvm::Metadata *, 4> &MDs, unsigned ID,
+      const llvm::Type *FuncType);
 
   llvm::Metadata *CreateMetadataIdentifierImpl(QualType T, MetadataTypeMap &Map,
                                                StringRef Suffix);
diff --git a/llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp b/llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp
index e846b9141d63c..4b37c267f2353 100644
--- a/llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp
+++ b/llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp
@@ -255,6 +255,13 @@ AspectsSetTy getAspectsUsedByInstruction(const Instruction &I,
     Result.insert(Aspects.begin(), Aspects.end());
   }
 
+  if (const MDNode *InstApsects = I.getMetadata("sycl_used_aspects")) {
+    for (const MDOperand &MDOp : InstApsects->operands()) {
+      const Constant *C = cast<ConstantAsMetadata>(MDOp)->getValue();
+      Result.insert(cast<ConstantInt>(C)->getSExtValue());
+    }
+  }
+
   return Result;
 }
 
diff --git a/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp b/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp
index 4aa28dc4ff643..a9c791877a079 100644
--- a/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp
+++ b/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp
@@ -22,10 +22,10 @@ void llvm::getSYCLDeviceRequirements(
     const module_split::ModuleDesc &MD,
     std::map<StringRef, util::PropertyValue> &Requirements) {
   auto ExtractIntegerFromMDNodeOperand = [=](const MDNode *N,
-                                             unsigned OpNo) -> unsigned {
+                                             unsigned OpNo) -> int32_t {
     Constant *C =
         cast<ConstantAsMetadata>(N->getOperand(OpNo).get())->getValue();
-    return static_cast<uint32_t>(C->getUniqueInteger().getZExtValue());
+    return static_cast<int32_t>(C->getUniqueInteger().getSExtValue());
   };
 
   // { LLVM-IR metadata name , [SYCL/Device requirements] property name }, see:
@@ -41,10 +41,16 @@ void llvm::getSYCLDeviceRequirements(
     std::set<uint32_t> Values;
     for (const Function &F : MD.getModule()) {
       if (const MDNode *MDN = F.getMetadata(MDName)) {
-        for (size_t I = 0, E = MDN->getNumOperands(); I < E; ++I)
-          Values.insert(ExtractIntegerFromMDNodeOperand(MDN, I));
+        for (size_t I = 0, E = MDN->getNumOperands(); I < E; ++I) {
+          // Don't put internal aspects (with negative integer value) into the
+          // requirements, they are used only for device image splitting.
+          auto Val = ExtractIntegerFromMDNodeOperand(MDN, I);
+          if (Val >= 0)
+            Values.insert(Val);
+        }
       }
     }
+
     // We don't need the "fixed_target" property if it's empty
     if (std::string(MDName) == "sycl_fixed_targets" && Values.empty())
       continue;
@@ -64,10 +70,11 @@ void llvm::getSYCLDeviceRequirements(
     if (auto *MDN = F->getMetadata("intel_reqd_sub_group_size")) {
       assert(MDN->getNumOperands() == 1);
       auto MDValue = ExtractIntegerFromMDNodeOperand(MDN, 0);
+      assert(MDValue >= 0);
       if (!SubGroupSize)
         SubGroupSize = MDValue;
       else
-        assert(*SubGroupSize == MDValue);
+        assert(*SubGroupSize == static_cast<uint32_t>(MDValue));
     }
   }
   // Do not attach reqd_sub_group_size if there is no attached metadata
diff --git a/sycl/test/optional_kernel_features/fp-accuracy.cpp b/sycl/test/optional_kernel_features/fp-accuracy.cpp
new file mode 100644
index 0000000000000..795fd60bf7aa0
--- /dev/null
+++ b/sycl/test/optional_kernel_features/fp-accuracy.cpp
@@ -0,0 +1,138 @@
+// RUN: %clangxx %s -o %test.bc -ffp-accuracy=high:sin,sqrt -ffp-accuracy=medium:cos -ffp-accuracy=low:tan -ffp-accuracy=cuda:exp,acos -ffp-accuracy=sycl:log,asin  -fno-math-errno  -fsycl -fsycl-device-only
+// RUN: sycl-post-link -split=auto -symbols %test.bc -o %test.table
+// RUN: FileCheck %s -input-file=%test.table --check-prefixes CHECK-TABLE
+// RUN: FileCheck %s -input-file=%test_0.sym --check-prefixes CHECK-M0-SYMS
+// RUN: FileCheck %s -input-file=%test_1.sym --check-prefixes CHECK-M1-SYMS
+// RUN: FileCheck %s -input-file=%test_2.sym --check-prefixes CHECK-M2-SYMS
+// RUN: FileCheck %s -input-file=%test_3.sym --check-prefixes CHECK-M3-SYMS
+// RUN: FileCheck %s -input-file=%test_4.sym --check-prefixes CHECK-M4-SYMS
+// RUN: FileCheck %s -input-file=%test_5.sym --check-prefixes CHECK-M5-SYMS
+
+// Tests that kernels which use different fp-accuracy level end up in different
+// device images.
+
+// CHECK-TABLE: Code
+// CHECK-TABLE-NEXT: _0.sym
+// CHECK-TABLE-NEXT: _1.sym
+// CHECK-TABLE-NEXT: _2.sym
+// CHECK-TABLE-NEXT: _3.sym
+// CHECK-TABLE-NEXT: _4.sym
+// CHECK-TABLE-NEXT: _5.sym
+// CHECK-TABLE-NEXT: _6.sym
+// CHECK-TABLE-EMPTY:
+
+// CHECK-M0-SYMS: __pf_kernel_wrapper{{.*}}Kernel1
+// CHECK-M0-SYMS-NEXT: Kernel1
+// CHECK-M0-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel7
+// CHECK-M0-SYMS-NEXT: Kernel7
+// CHECK-M0-SYMS-EMPTY:
+
+// CHECK-M1-SYMS: __pf_kernel_wrapper{{.*}}Kernel2
+// CHECK-M1-SYMS-NEXT: Kernel2
+// CHECK-M1-SYMS-EMPTY:
+
+// CHECK-M2-SYMS: __pf_kernel_wrapper{{.*}}Kernel3
+// CHECK-M2-SYMS-NEXT: Kernel3
+// CHECK-M2-SYMS-EMPTY:
+
+// CHECK-M3-SYMS: __pf_kernel_wrapper{{.*}}Kernel6
+// CHECK-M3-SYMS-NEXT: Kernel6
+// CHECK-M3-SYMS-EMPTY:
+
+// CHECK-M4-SYMS: __pf_kernel_wrapper{{.*}}Kernel4
+// CHECK-M4-SYMS-NEXT: Kernel4
+// CHECK-M4-SYMS-EMPTY:
+
+// CHECK-M5-SYMS: __pf_kernel_wrapper{{.*}}Kernel5
+// CHECK-M5-SYMS-NEXT: Kernel5
+// CHECK-M5-SYMS-EMPTY:
+
+// CHECK-M6-SYMS: __pf_kernel_wrapper{{.*}}Kernel0
+// CHECK-M6-SYMS-NEXT: Kernel0
+// CHECK-M6-SYMS-EMPTY:
+
+#include <array>
+#include <cmath>
+#include <iostream>
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+constexpr access::mode sycl_read = access::mode::read;
+constexpr access::mode sycl_write = access::mode::write;
+
+int main() {
+  const size_t array_size = 4;
+  std::array<double, array_size> D = {{1., 2., 3., 4.}}, E;
+  queue deviceQueue;
+  range<1> numOfItems{array_size};
+  double Value = 5.;
+  buffer<double, 1> bufferOut(E.data(), numOfItems);
+
+  // Kernel0 doesn't use math functions.
+  deviceQueue.submit([&](handler &cgh) {
+    auto accessorOut = bufferOut.template get_access<sycl_write>(cgh);
+
+    cgh.parallel_for<class Kernel0>(
+        numOfItems, [=](id<1> wiID) { accessorOut[wiID] = Value; });
+  });
+
+  // Kernel1 uses high-accuracy sin.
+  deviceQueue.submit([&](handler &cgh) {
+    auto accessorOut = bufferOut.template get_access<sycl_write>(cgh);
+
+    cgh.parallel_for<class Kernel1>(
+        numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::sin(Value); });
+  });
+
+  // Kernel2 uses medium-accuracy cos.
+  deviceQueue.submit([&](handler &cgh) {
+    auto accessorOut = bufferOut.template get_access<sycl_write>(cgh);
+
+    cgh.parallel_for<class Kernel2>(
+        numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::cos(Value); });
+  });
+
+  // Kernel3 uses low-accuracy tan.
+  deviceQueue.submit([&](handler &cgh) {
+    auto accessorOut = bufferOut.template get_access<sycl_write>(cgh);
+
+    cgh.parallel_for<class Kernel3>(
+        numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::tan(Value); });
+  });
+
+  // Kernel4 uses cuda-accuracy exp and sycl-accuracy log.
+  deviceQueue.submit([&](handler &cgh) {
+    auto accessorOut = bufferOut.template get_access<sycl_write>(cgh);
+
+    cgh.parallel_for<class Kernel4>(numOfItems, [=](id<1> wiID) {
+      accessorOut[wiID] = std::log(std::exp(Value));
+    });
+  });
+
+  // Kernel5 uses cuda-accuracy acos.
+  deviceQueue.submit([&](handler &cgh) {
+    auto accessorOut = bufferOut.template get_access<sycl_write>(cgh);
+
+    cgh.parallel_for<class Kernel5>(
+        numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::acos(Value); });
+  });
+
+  // Kernel6 uses sycl-accuracy asin.
+  deviceQueue.submit([&](handler &cgh) {
+    auto accessorOut = bufferOut.template get_access<sycl_write>(cgh);
+
+    cgh.parallel_for<class Kernel6>(
+        numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::asin(Value); });
+  });
+
+  // Kernel7 uses high-accuracy sqrt.
+  deviceQueue.submit([&](handler &cgh) {
+    auto accessorOut = bufferOut.template get_access<sycl_write>(cgh);
+
+    cgh.parallel_for<class Kernel7>(
+        numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::sqrt(Value); });
+  });
+
+  return 0;
+}

From 649fd15d888ac2273cf59e08330a221489247c9c Mon Sep 17 00:00:00 2001
From: Artur Gainullin <artur.gainullin@intel.com>
Date: Fri, 7 Jul 2023 12:11:18 -0700
Subject: [PATCH 03/14] Add frontend test

---
 clang/test/CodeGenSYCL/fp-accuracy.cpp | 102 +++++++++++++++++++++++++
 1 file changed, 102 insertions(+)
 create mode 100644 clang/test/CodeGenSYCL/fp-accuracy.cpp

diff --git a/clang/test/CodeGenSYCL/fp-accuracy.cpp b/clang/test/CodeGenSYCL/fp-accuracy.cpp
new file mode 100644
index 0000000000000..911842b395db5
--- /dev/null
+++ b/clang/test/CodeGenSYCL/fp-accuracy.cpp
@@ -0,0 +1,102 @@
+// RUN: %clang_cc1  -fsycl-is-device -ffp-builtin-accuracy=high:sin,sqrt -ffp-builtin-accuracy=medium:cos -ffp-builtin-accuracy=low:tan -ffp-builtin-accuracy=cuda:exp,acos -ffp-builtin-accuracy=sycl:log,asin -emit-llvm -triple spir64-unknown-unknown -disable-llvm-passes %s -o - | FileCheck %s
+
+// Tests that sycl_used_aspects metadata is attached to the fpbuiltin call based on -ffp-accuracy option.
+
+#include "Inputs/sycl.hpp"
+
+extern "C" SYCL_EXTERNAL double sin(double);
+extern "C" SYCL_EXTERNAL double cos(double);
+extern "C" SYCL_EXTERNAL double tan(double);
+extern "C" SYCL_EXTERNAL double log(double);
+extern "C" SYCL_EXTERNAL double exp(double);
+extern "C" SYCL_EXTERNAL double acos(double);
+extern "C" SYCL_EXTERNAL double asin(double);
+extern "C" SYCL_EXTERNAL double sqrt(double);
+
+using namespace sycl;
+
+int main() {
+  const unsigned array_size = 4;
+  double Value = .5;
+  queue deviceQueue;
+  range<1> numOfItems{array_size};
+
+  // Kernel0 doesn't use math functions.
+  deviceQueue.submit([&](handler& cgh) {
+    cgh.parallel_for<class Kernel0>(numOfItems,
+    [=](id<1> wiID) {
+      (void)Value;
+    });
+  });
+
+  // Kernel1 uses high-accuracy sin.
+  deviceQueue.submit([&](handler& cgh) {
+    cgh.parallel_for<class Kernel1>(numOfItems,
+    [=](id<1> wiID) {
+// CHECK: call double @llvm.fpbuiltin.sin.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[ASPECT1:[0-9]+]]
+      (void)sin(Value);
+    });
+  });
+
+  deviceQueue.submit([&](handler& cgh) {
+    cgh.parallel_for<class Kernel2>(numOfItems,
+    [=](id<1> wiID) {
+// CHECK: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[ASPECT2:[0-9]+]]
+      (void)cos(Value);
+    });
+  });
+
+  // Kernel3 uses low-accuracy tan.
+  deviceQueue.submit([&](handler& cgh) {
+    cgh.parallel_for<class Kernel3>(numOfItems,
+    [=](id<1> wiID) {
+// CHECK: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[ASPECT3:[0-9]+]]
+      (void)tan(Value);
+    });
+  });
+
+  // Kernel4 uses cuda-accuracy exp and sycl-accuracy log.
+  deviceQueue.submit([&](handler& cgh) {
+    cgh.parallel_for<class Kernel4>(numOfItems,
+    [=](id<1> wiID) {
+// CHECK: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[ASPECT4:[0-9]+]]
+// CHECK: call double @llvm.fpbuiltin.log.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[ASPECT5:[0-9]+]]
+      (void)log(exp(Value));
+    });
+  });
+  deviceQueue.wait();
+
+  // Kernel5 uses cuda-accuracy acos.
+  deviceQueue.submit([&](handler& cgh) {
+    cgh.parallel_for<class Kernel5>(numOfItems,
+    [=](id<1> wiID) {
+// CHECK: call double @llvm.fpbuiltin.acos.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[ASPECT4:[0-9]+]]
+      (void)acos(Value);
+    });
+  });
+
+  // Kernel6 uses sycl-accuracy asin.
+  deviceQueue.submit([&](handler& cgh) {
+    cgh.parallel_for<class Kernel6>(numOfItems,
+    [=](id<1> wiID) {
+// CHECK: call double @llvm.fpbuiltin.asin.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[ASPECT5:[0-9]+]]
+      (void)asin(Value);
+    });
+  });
+
+  // Kernel7 uses high-accuracy sqrt.
+  deviceQueue.submit([&](handler& cgh) {
+    cgh.parallel_for<class Kernel7>(numOfItems,
+    [=](id<1> wiID) {
+// CHECK: call double @llvm.fpbuiltin.sqrt.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[ASPECT1:[0-9]+]]
+      (void)sqrt(Value);
+    });
+  });
+  return 0;
+}
+
+// CHECK: [[ASPECT1]] = !{i32 -1}
+// CHECK: [[ASPECT2]] = !{i32 -2}
+// CHECK: [[ASPECT3]] = !{i32 -3}
+// CHECK: [[ASPECT4]] = !{i32 -5}
+// CHECK: [[ASPECT5]] = !{i32 -4}

From 404f82eaa7ba9dc7dbd32839e2e261089aa39a4d Mon Sep 17 00:00:00 2001
From: Artur Gainullin <artur.gainullin@intel.com>
Date: Fri, 7 Jul 2023 15:52:16 -0700
Subject: [PATCH 04/14] Metadata propagation test

---
 .../PropagateAspectsUsage/call-graph-inst.ll  | 66 +++++++++++++++++++
 1 file changed, 66 insertions(+)
 create mode 100644 llvm/test/SYCLLowerIR/PropagateAspectsUsage/call-graph-inst.ll

diff --git a/llvm/test/SYCLLowerIR/PropagateAspectsUsage/call-graph-inst.ll b/llvm/test/SYCLLowerIR/PropagateAspectsUsage/call-graph-inst.ll
new file mode 100644
index 0000000000000..06f276b919c5c
--- /dev/null
+++ b/llvm/test/SYCLLowerIR/PropagateAspectsUsage/call-graph-inst.ll
@@ -0,0 +1,66 @@
+; RUN: opt -passes=sycl-propagate-aspects-usage < %s -S | FileCheck %s
+;
+; Test checks that the pass is able to propagate information about aspects
+; used in the instruction through a call graph
+;
+;   K1  K2
+;  /  \/  \
+; F1  F2   F3
+;
+; F1 doesn't use optional type.
+; F2 uses optional A.
+; F3 uses optional B.
+
+%Optional.A = type { i32 }
+%Optional.B = type { i32 }
+
+; CHECK: spir_kernel void @kernel1() !sycl_used_aspects ![[#ID1:]]
+define spir_kernel void @kernel1() {
+  call spir_func void @func1()
+  call spir_func void @func2()
+  ret void
+}
+
+; CHECK: spir_kernel void @kernel2() !sycl_used_aspects ![[#ID2:]]
+define spir_kernel void @kernel2() {
+  call spir_func void @func2()
+  call spir_func void @func3()
+  ret void
+}
+
+; CHECK: spir_func void @func1() {
+define spir_func void @func1() {
+  %tmp = alloca i32
+  ret void
+}
+
+declare void @llvm.fpbuiltin.f64()
+
+; CHECK: spir_func void @func2() !sycl_used_aspects ![[#ID1]] {
+define spir_func void @func2() {
+  %tmp1 = alloca %Optional.A
+  call void @llvm.fpbuiltin.f64(), !sycl_used_aspects !3
+  ret void
+}
+
+; CHECK: spir_func void @func3() !sycl_used_aspects ![[#ID3:]] {
+define spir_func void @func3() {
+  %tmp = alloca %Optional.B
+  call void @llvm.fpbuiltin.f64(), !sycl_used_aspects !4
+  ret void
+}
+
+!sycl_types_that_use_aspects = !{!0, !1}
+!0 = !{!"Optional.A", i32 1}
+!1 = !{!"Optional.B", i32 2}
+
+!sycl_aspects = !{!2}
+!2 = !{!"fp64", i32 6}
+!3 = !{i32 -1}
+!4 = !{i32 -2}
+
+; CHECK: ![[#ID1]] = !{i32 1, i32 -1}
+; CHECK: ![[#ID2]] = !{i32 1, i32 -1, i32 2, i32 -2}
+; CHECK: ![[#ID3]] = !{i32 2, i32 -2}
+
+

From d1077406b89ef3602cfafe7da7e0817b3a6c83d3 Mon Sep 17 00:00:00 2001
From: Artur Gainullin <artur.gainullin@intel.com>
Date: Mon, 10 Jul 2023 12:07:59 -0700
Subject: [PATCH 05/14] Add info to design documentation

---
 sycl/doc/design/OptionalDeviceFeatures.md | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/sycl/doc/design/OptionalDeviceFeatures.md b/sycl/doc/design/OptionalDeviceFeatures.md
index 3bb601abb12c6..9c03825b3b1db 100644
--- a/sycl/doc/design/OptionalDeviceFeatures.md
+++ b/sycl/doc/design/OptionalDeviceFeatures.md
@@ -1148,6 +1148,24 @@ Kernel has a required sub-group size of '32' but device does not support this
 sub-group size.
 ```
 
+### SYCL internal aspects for device image splitting
+
+There are scenarios when we would like to split device images based on
+optional kernel features but we don't want to expose corresponding
+aspects to the user. Internal SYCL aspects are used for this purpose.
+
+To differentiate them from regular aspects, internal aspects are assigned
+negative values. If optional feature is used in the kernel then SYCL
+device compiler adds value of internal aspect to 'sycl_used_aspects' metadata,
+it gets propagated through the call graph and participates in device image
+splitting together with regular aspects but it's not passed to the SYCL runtime,
+it is filtered out when generating a set of device requirements.
+
+New value can be added to 'SYCLInternalAspect' enum to introduce new internal
+aspect.
+
+Example of internal aspects usage is splitting device images based on floating
+point accuracy level for math functions provided by user using -ffp-accuracy option.
 
 ## Appendix: Adding an attribute to 8-byte `atomic_ref`
 

From a537ca92d30d83e9a06a24209e61f7c1789611d0 Mon Sep 17 00:00:00 2001
From: Artur Gainullin <artur.gainullin@intel.com>
Date: Mon, 10 Jul 2023 13:11:58 -0700
Subject: [PATCH 06/14] Address review comments

---
 clang/lib/CodeGen/CGBuiltin.cpp               |  9 +++++----
 clang/lib/CodeGen/CGCall.cpp                  | 19 ++++++++++---------
 clang/lib/CodeGen/CodeGenModule.cpp           | 11 ++++++-----
 clang/lib/CodeGen/CodeGenModule.h             | 12 ++++++------
 .../PropagateAspectsUsage/call-graph-inst.ll  |  6 +++---
 sycl/doc/design/OptionalDeviceFeatures.md     |  3 ++-
 6 files changed, 32 insertions(+), 28 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 83c6b69c64dc7..6bca890464aee 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -515,16 +515,17 @@ static CallInst *CreateBuiltinCallWithAttr(CodeGenFunction &CGF, StringRef Name,
   // single FPAccuracy attribute.
   llvm::AttributeList AttrList;
   // "sycl_used_aspects" metadata associated with the call.
-  SmallVector<llvm::Metadata *, 4> AspectsMD;
+  llvm::Metadata *AspectMD;
   // sincos() doesn't return a value, but it still has a type associated with
   // it that corresponds to the operand type.
   CGF.CGM.getFPAccuracyFuncAttributes(
-      Name, AttrList, AspectsMD, ID,
+      Name, AttrList, AspectMD, ID,
       Name == "sincos" ? Args[0]->getType() : FPBuiltinF->getReturnType());
   CI->setAttributes(AttrList);
-  if (!AspectsMD.empty())
+
+  if (CGF.getLangOpts().SYCLIsDevice && AspectMD)
     CI->setMetadata("sycl_used_aspects",
-                    llvm::MDNode::get(CGF.CGM.getLLVMContext(), AspectsMD));
+                    llvm::MDNode::get(CGF.CGM.getLLVMContext(), AspectMD));
   return CI;
 }
 
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 9e09a00d6a3aa..968962f8c5c23 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -1848,6 +1848,9 @@ static llvm::fp::FPAccuracy convertFPAccuracy(StringRef FPAccuracyStr) {
 }
 
 static int32_t convertFPAccuracyToAspect(StringRef FPAccuracyStr) {
+  assert(FPAccuracyStr.equals("high") || FPAccuracyStr.equals("medium") ||
+         FPAccuracyStr.equals("low") || FPAccuracyStr.equals("sycl") ||
+         FPAccuracyStr.equals("cuda"));
   return llvm::StringSwitch<int32_t>(FPAccuracyStr)
       .Case("high", SYCLInternalAspect::fp_intrinsic_accuracy_high)
       .Case("medium", SYCLInternalAspect::fp_intrinsic_accuracy_medium)
@@ -1857,9 +1860,8 @@ static int32_t convertFPAccuracyToAspect(StringRef FPAccuracyStr) {
 }
 
 void CodeGenModule::getDefaultFunctionFPAccuracyAttributes(
-    StringRef Name, llvm::AttrBuilder &FuncAttrs,
-    SmallVector<llvm::Metadata *, 4> &MDs, unsigned ID,
-    const llvm::Type *FuncType) {
+    StringRef Name, llvm::AttrBuilder &FuncAttrs, llvm::Metadata *&MD,
+    unsigned ID, const llvm::Type *FuncType) {
   // Priority is given to to the accuracy specific to the function.
   // So, if the command line is something like this:
   // 'clang -fp-accuracy = high -fp-accuracy = low:[sin]'.
@@ -1868,6 +1870,7 @@ void CodeGenModule::getDefaultFunctionFPAccuracyAttributes(
   // To ensure that, first check if Name has a required accuracy by visiting
   // the 'FPAccuracyFuncMap'; if no accuracy is mapped to Name (FuncAttrs
   // is empty), then set its accuracy from the TU's accuracy value.
+  MD = nullptr;
   if (!getLangOpts().FPAccuracyFuncMap.empty()) {
     auto FuncMapIt = getLangOpts().FPAccuracyFuncMap.find(Name.str());
     if (FuncMapIt != getLangOpts().FPAccuracyFuncMap.end()) {
@@ -1875,9 +1878,8 @@ void CodeGenModule::getDefaultFunctionFPAccuracyAttributes(
           ID, FuncType, convertFPAccuracy(FuncMapIt->second));
       assert(!FPAccuracyVal.empty() && "A valid accuracy value is expected");
       FuncAttrs.addAttribute("fpbuiltin-max-error=", FPAccuracyVal);
-      if (getLangOpts().SYCLIsDevice)
-        MDs.push_back(llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
-            Int32Ty, convertFPAccuracyToAspect(FuncMapIt->second))));
+      MD = llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
+          Int32Ty, convertFPAccuracyToAspect(FuncMapIt->second)));
     }
   }
   if (FuncAttrs.attrs().size() == 0)
@@ -1886,9 +1888,8 @@ void CodeGenModule::getDefaultFunctionFPAccuracyAttributes(
           ID, FuncType, convertFPAccuracy(getLangOpts().FPAccuracyVal));
       assert(!FPAccuracyVal.empty() && "A valid accuracy value is expected");
       FuncAttrs.addAttribute("fpbuiltin-max-error=", FPAccuracyVal);
-      if (getLangOpts().SYCLIsDevice)
-        MDs.push_back(llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
-            Int32Ty, convertFPAccuracyToAspect(getLangOpts().FPAccuracyVal))));
+      MD = llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
+          Int32Ty, convertFPAccuracyToAspect(getLangOpts().FPAccuracyVal)));
     }
 }
 
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 8f4884a4a23e7..41b5a2160ace4 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -7882,12 +7882,13 @@ void CodeGenModule::moveLazyEmissionStates(CodeGenModule *NewBuilder) {
   NewBuilder->ABI->MangleCtx = std::move(ABI->MangleCtx);
 }
 
-void CodeGenModule::getFPAccuracyFuncAttributes(
-    StringRef Name, llvm::AttributeList &AttrList,
-    SmallVector<llvm::Metadata *, 4> &MDs, unsigned ID,
-    const llvm::Type *FuncType) {
+void CodeGenModule::getFPAccuracyFuncAttributes(StringRef Name,
+                                                llvm::AttributeList &AttrList,
+                                                llvm::Metadata *&MD,
+                                                unsigned ID,
+                                                const llvm::Type *FuncType) {
   llvm::AttrBuilder FuncAttrs(getLLVMContext());
-  getDefaultFunctionFPAccuracyAttributes(Name, FuncAttrs, MDs, ID, FuncType);
+  getDefaultFunctionFPAccuracyAttributes(Name, FuncAttrs, MD, ID, FuncType);
   AttrList = llvm::AttributeList::get(
       getLLVMContext(), llvm::AttributeList::FunctionIndex, FuncAttrs);
 }
diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h
index 20109372d854a..56e9a9358ba3e 100644
--- a/clang/lib/CodeGen/CodeGenModule.h
+++ b/clang/lib/CodeGen/CodeGenModule.h
@@ -1595,8 +1595,8 @@ class CodeGenModule : public CodeGenTypeCache {
 
   void getFPAccuracyFuncAttributes(StringRef Name,
                                    llvm::AttributeList &AttrList,
-                                   SmallVector<llvm::Metadata *, 4> &MDs,
-                                   unsigned ID, const llvm::Type *FuncType);
+                                   llvm::Metadata *&MDs, unsigned ID,
+                                   const llvm::Type *FuncType);
 
 private:
   llvm::Constant *GetOrCreateLLVMFunction(
@@ -1792,10 +1792,10 @@ class CodeGenModule : public CodeGenTypeCache {
                                     bool AttrOnCallSite,
                                     llvm::AttrBuilder &FuncAttrs);
 
-  void getDefaultFunctionFPAccuracyAttributes(
-      StringRef Name, llvm::AttrBuilder &FuncAttrs,
-      SmallVector<llvm::Metadata *, 4> &MDs, unsigned ID,
-      const llvm::Type *FuncType);
+  void getDefaultFunctionFPAccuracyAttributes(StringRef Name,
+                                              llvm::AttrBuilder &FuncAttrs,
+                                              llvm::Metadata *&MD, unsigned ID,
+                                              const llvm::Type *FuncType);
 
   llvm::Metadata *CreateMetadataIdentifierImpl(QualType T, MetadataTypeMap &Map,
                                                StringRef Suffix);
diff --git a/llvm/test/SYCLLowerIR/PropagateAspectsUsage/call-graph-inst.ll b/llvm/test/SYCLLowerIR/PropagateAspectsUsage/call-graph-inst.ll
index 06f276b919c5c..47df6a804eabb 100644
--- a/llvm/test/SYCLLowerIR/PropagateAspectsUsage/call-graph-inst.ll
+++ b/llvm/test/SYCLLowerIR/PropagateAspectsUsage/call-graph-inst.ll
@@ -7,9 +7,9 @@
 ;  /  \/  \
 ; F1  F2   F3
 ;
-; F1 doesn't use optional type.
-; F2 uses optional A.
-; F3 uses optional B.
+; F1 doesn't use optional type and doesn't have an instruction with attached 'sycl_used_aspects' metadata.
+; F2 uses optional A and has an instruction with attached 'sycl_used_aspects' metadata.
+; F3 uses optional B and has an instruction with attached 'sycl_used_aspects' metadata.
 
 %Optional.A = type { i32 }
 %Optional.B = type { i32 }
diff --git a/sycl/doc/design/OptionalDeviceFeatures.md b/sycl/doc/design/OptionalDeviceFeatures.md
index 9c03825b3b1db..c3a1202d4ec6c 100644
--- a/sycl/doc/design/OptionalDeviceFeatures.md
+++ b/sycl/doc/design/OptionalDeviceFeatures.md
@@ -553,7 +553,8 @@ type because the front-end does not include that type in the
 `!sycl_types_that_use_aspects` set.  If a function references the `double`
 type, the implementation implicitly assumes that the function uses
 `aspect::fp64` and adds that aspect to the function's `!sycl_used_aspects`
-set.
+set. If `!sycl_used_aspects` metadata is attached to an instruction, its aspects
+are also added to the function's `!sycl_used_aspects` set.
 
 **NOTE**: This scan of the IR will require comparing the type referenced by
 each IR instruction with the names of the types in the

From c4b0c5685c47c3e5107d81323e217ef213ce27c1 Mon Sep 17 00:00:00 2001
From: Artur Gainullin <artur.gainullin@intel.com>
Date: Wed, 12 Jul 2023 09:47:27 -0700
Subject: [PATCH 07/14] Remove unnecessary include

---
 clang/lib/CodeGen/CGBuiltin.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 6bca890464aee..1911bcf226ef0 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -16,7 +16,6 @@
 #include "CGObjCRuntime.h"
 #include "CGOpenCLRuntime.h"
 #include "CGRecordLayout.h"
-#include "CGSYCLRuntime.h"
 #include "CodeGenFunction.h"
 #include "CodeGenModule.h"
 #include "ConstantEmitter.h"

From 756690c11102154dab55ea2f93cb24b5ccd4a8f7 Mon Sep 17 00:00:00 2001
From: Artur Gainullin <artur.gainullin@intel.com>
Date: Wed, 12 Jul 2023 09:48:14 -0700
Subject: [PATCH 08/14] Initialize local variable to avoid static analyzer
 issues

---
 clang/lib/CodeGen/CGBuiltin.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 1911bcf226ef0..78266f28db6c9 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -514,7 +514,7 @@ static CallInst *CreateBuiltinCallWithAttr(CodeGenFunction &CGF, StringRef Name,
   // single FPAccuracy attribute.
   llvm::AttributeList AttrList;
   // "sycl_used_aspects" metadata associated with the call.
-  llvm::Metadata *AspectMD;
+  llvm::Metadata *AspectMD = nullptr;
   // sincos() doesn't return a value, but it still has a type associated with
   // it that corresponds to the operand type.
   CGF.CGM.getFPAccuracyFuncAttributes(

From 4dd7de1c97a88698abd26991f5a5076dc55977ea Mon Sep 17 00:00:00 2001
From: Artur Gainullin <artur.gainullin@intel.com>
Date: Wed, 12 Jul 2023 10:07:26 -0700
Subject: [PATCH 09/14] Remove unnecessary fix

---
 clang/lib/CodeGen/CGCall.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 968962f8c5c23..8deb1b01e0dcb 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -5639,8 +5639,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
   llvm::CallBase *CI;
   if (!InvokeDest) {
     if ((!getLangOpts().FPAccuracyFuncMap.empty() ||
-         !getLangOpts().FPAccuracyVal.empty()) &&
-        isa_and_nonnull<FunctionDecl>(TargetDecl)) {
+         !getLangOpts().FPAccuracyVal.empty())) {
       const auto *FD = dyn_cast_if_present<FunctionDecl>(TargetDecl);
       assert(FD && "expecting a function");
       CI = EmitFPBuiltinIndirectCall(IRFuncTy, IRCallArgs, CalleePtr, FD);

From 9408518666dbed15f62bd36de1a1cb4b185e2e9d Mon Sep 17 00:00:00 2001
From: Artur Gainullin <artur.gainullin@intel.com>
Date: Wed, 12 Jul 2023 10:39:25 -0700
Subject: [PATCH 10/14] Remove parentheses

---
 clang/lib/CodeGen/CGCall.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 8deb1b01e0dcb..2db8906ba9cb3 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -5638,8 +5638,8 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
   // Emit the actual call/invoke instruction.
   llvm::CallBase *CI;
   if (!InvokeDest) {
-    if ((!getLangOpts().FPAccuracyFuncMap.empty() ||
-         !getLangOpts().FPAccuracyVal.empty())) {
+    if (!getLangOpts().FPAccuracyFuncMap.empty() ||
+         !getLangOpts().FPAccuracyVal.empty()) {
       const auto *FD = dyn_cast_if_present<FunctionDecl>(TargetDecl);
       assert(FD && "expecting a function");
       CI = EmitFPBuiltinIndirectCall(IRFuncTy, IRCallArgs, CalleePtr, FD);

From 9807f6403652c1d03285f9fce2fda130719d7397 Mon Sep 17 00:00:00 2001
From: Artur Gainullin <artur.gainullin@intel.com>
Date: Wed, 12 Jul 2023 12:50:17 -0700
Subject: [PATCH 11/14] Add additional RUN lines for TU and mixed cases

---
 clang/test/CodeGenSYCL/fp-accuracy.cpp        |  53 +++--
 .../optional_kernel_features/fp-accuracy.cpp  | 196 +++++++++++++-----
 2 files changed, 181 insertions(+), 68 deletions(-)

diff --git a/clang/test/CodeGenSYCL/fp-accuracy.cpp b/clang/test/CodeGenSYCL/fp-accuracy.cpp
index 911842b395db5..02b5d9c283431 100644
--- a/clang/test/CodeGenSYCL/fp-accuracy.cpp
+++ b/clang/test/CodeGenSYCL/fp-accuracy.cpp
@@ -1,4 +1,6 @@
-// RUN: %clang_cc1  -fsycl-is-device -ffp-builtin-accuracy=high:sin,sqrt -ffp-builtin-accuracy=medium:cos -ffp-builtin-accuracy=low:tan -ffp-builtin-accuracy=cuda:exp,acos -ffp-builtin-accuracy=sycl:log,asin -emit-llvm -triple spir64-unknown-unknown -disable-llvm-passes %s -o - | FileCheck %s
+// RUN: %clang_cc1  -fsycl-is-device -ffp-builtin-accuracy=high:sin,sqrt -ffp-builtin-accuracy=medium:cos -ffp-builtin-accuracy=low:tan -ffp-builtin-accuracy=cuda:exp,acos -ffp-builtin-accuracy=sycl:log,asin -emit-llvm -triple spir64-unknown-unknown -disable-llvm-passes %s -o - | FileCheck --check-prefix CHECK-FUNC %s
+// RUN: %clang_cc1  -fsycl-is-device -ffp-builtin-accuracy=high -emit-llvm -triple spir64-unknown-unknown -disable-llvm-passes %s -o - | FileCheck --check-prefix CHECK-TU %s
+// RUN: %clang_cc1  -fsycl-is-device -ffp-builtin-accuracy=medium -ffp-builtin-accuracy=high:sin,sqrt -ffp-builtin-accuracy=medium:cos -ffp-builtin-accuracy=cuda:exp -ffp-builtin-accuracy=sycl:log -emit-llvm -triple spir64-unknown-unknown -disable-llvm-passes %s -o - | FileCheck --check-prefix CHECK-MIX %s
 
 // Tests that sycl_used_aspects metadata is attached to the fpbuiltin call based on -ffp-accuracy option.
 
@@ -33,7 +35,9 @@ int main() {
   deviceQueue.submit([&](handler& cgh) {
     cgh.parallel_for<class Kernel1>(numOfItems,
     [=](id<1> wiID) {
-// CHECK: call double @llvm.fpbuiltin.sin.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[ASPECT1:[0-9]+]]
+// CHECK-FUNC: call double @llvm.fpbuiltin.sin.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC:[0-9]+]]
+// CHECK-TU: call double @llvm.fpbuiltin.sin.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC:[0-9]+]]
+// CHECK-MIX: call double @llvm.fpbuiltin.sin.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC:[0-9]+]]
       (void)sin(Value);
     });
   });
@@ -41,7 +45,9 @@ int main() {
   deviceQueue.submit([&](handler& cgh) {
     cgh.parallel_for<class Kernel2>(numOfItems,
     [=](id<1> wiID) {
-// CHECK: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[ASPECT2:[0-9]+]]
+// CHECK-FUNC: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[MEDIUM_ACC:[0-9]+]]
+// CHECK-TU: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]]
+// CHECK-MIX: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[MEDIUM_ACC:[0-9]+]]
       (void)cos(Value);
     });
   });
@@ -50,7 +56,9 @@ int main() {
   deviceQueue.submit([&](handler& cgh) {
     cgh.parallel_for<class Kernel3>(numOfItems,
     [=](id<1> wiID) {
-// CHECK: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[ASPECT3:[0-9]+]]
+// CHECK-FUNC: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[LOW_ACC:[0-9]+]]
+// CHECK-TU: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]]
+// CHECK-MIX: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[MEDIUM_ACC]]
       (void)tan(Value);
     });
   });
@@ -59,8 +67,12 @@ int main() {
   deviceQueue.submit([&](handler& cgh) {
     cgh.parallel_for<class Kernel4>(numOfItems,
     [=](id<1> wiID) {
-// CHECK: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[ASPECT4:[0-9]+]]
-// CHECK: call double @llvm.fpbuiltin.log.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[ASPECT5:[0-9]+]]
+// CHECK-FUNC: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[CUDA_ACC:[0-9]+]]
+// CHECK-FUNC: call double @llvm.fpbuiltin.log.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[SYCL_ACC:[0-9]+]]
+// CHECK-TU: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]]
+// CHECK-TU: call double @llvm.fpbuiltin.log.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]]
+// CHECK-MIX: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[CUDA_ACC:[0-9]+]]
+// CHECK-MIX: call double @llvm.fpbuiltin.log.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[SYCL_ACC:[0-9]+]]
       (void)log(exp(Value));
     });
   });
@@ -70,7 +82,9 @@ int main() {
   deviceQueue.submit([&](handler& cgh) {
     cgh.parallel_for<class Kernel5>(numOfItems,
     [=](id<1> wiID) {
-// CHECK: call double @llvm.fpbuiltin.acos.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[ASPECT4:[0-9]+]]
+// CHECK-FUNC: call double @llvm.fpbuiltin.acos.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[CUDA_ACC]]
+// CHECK-TU: call double @llvm.fpbuiltin.acos.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]]
+// CHECK-MIX: call double @llvm.fpbuiltin.acos.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[MEDIUM_ACC]]
       (void)acos(Value);
     });
   });
@@ -79,7 +93,9 @@ int main() {
   deviceQueue.submit([&](handler& cgh) {
     cgh.parallel_for<class Kernel6>(numOfItems,
     [=](id<1> wiID) {
-// CHECK: call double @llvm.fpbuiltin.asin.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[ASPECT5:[0-9]+]]
+// CHECK-FUNC: call double @llvm.fpbuiltin.asin.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[SYCL_ACC]]
+// CHECK-TU: call double @llvm.fpbuiltin.asin.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]]
+// CHECK-MIX: call double @llvm.fpbuiltin.asin.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[MEDIUM_ACC]]
       (void)asin(Value);
     });
   });
@@ -88,15 +104,24 @@ int main() {
   deviceQueue.submit([&](handler& cgh) {
     cgh.parallel_for<class Kernel7>(numOfItems,
     [=](id<1> wiID) {
-// CHECK: call double @llvm.fpbuiltin.sqrt.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[ASPECT1:[0-9]+]]
+// CHECK-FUNC: call double @llvm.fpbuiltin.sqrt.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]]
+// CHECK-TU: call double @llvm.fpbuiltin.sqrt.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]]
+// CHECK-MIX: call double @llvm.fpbuiltin.sqrt.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]]
       (void)sqrt(Value);
     });
   });
   return 0;
 }
 
-// CHECK: [[ASPECT1]] = !{i32 -1}
-// CHECK: [[ASPECT2]] = !{i32 -2}
-// CHECK: [[ASPECT3]] = !{i32 -3}
-// CHECK: [[ASPECT4]] = !{i32 -5}
-// CHECK: [[ASPECT5]] = !{i32 -4}
+// CHECK-FUNC: [[HIGH_ACC]] = !{i32 -1}
+// CHECK-FUNC: [[MEDIUM_ACC]] = !{i32 -2}
+// CHECK-FUNC: [[LOW_ACC]] = !{i32 -3}
+// CHECK-FUNC: [[CUDA_ACC]] = !{i32 -5}
+// CHECK-FUNC: [[SYCL_ACC]] = !{i32 -4}
+
+// CHECK-TU: [[HIGH_ACC]] = !{i32 -1}
+
+// CHECK-MIX: [[HIGH_ACC]] = !{i32 -1}
+// CHECK-MIX: [[MEDIUM_ACC]] = !{i32 -2}
+// CHECK-MIX: [[CUDA_ACC]] = !{i32 -5}
+// CHECK-MIX: [[SYCL_ACC]] = !{i32 -4}
\ No newline at end of file
diff --git a/sycl/test/optional_kernel_features/fp-accuracy.cpp b/sycl/test/optional_kernel_features/fp-accuracy.cpp
index 795fd60bf7aa0..80acc2baa893f 100644
--- a/sycl/test/optional_kernel_features/fp-accuracy.cpp
+++ b/sycl/test/optional_kernel_features/fp-accuracy.cpp
@@ -1,55 +1,128 @@
-// RUN: %clangxx %s -o %test.bc -ffp-accuracy=high:sin,sqrt -ffp-accuracy=medium:cos -ffp-accuracy=low:tan -ffp-accuracy=cuda:exp,acos -ffp-accuracy=sycl:log,asin  -fno-math-errno  -fsycl -fsycl-device-only
-// RUN: sycl-post-link -split=auto -symbols %test.bc -o %test.table
-// RUN: FileCheck %s -input-file=%test.table --check-prefixes CHECK-TABLE
-// RUN: FileCheck %s -input-file=%test_0.sym --check-prefixes CHECK-M0-SYMS
-// RUN: FileCheck %s -input-file=%test_1.sym --check-prefixes CHECK-M1-SYMS
-// RUN: FileCheck %s -input-file=%test_2.sym --check-prefixes CHECK-M2-SYMS
-// RUN: FileCheck %s -input-file=%test_3.sym --check-prefixes CHECK-M3-SYMS
-// RUN: FileCheck %s -input-file=%test_4.sym --check-prefixes CHECK-M4-SYMS
-// RUN: FileCheck %s -input-file=%test_5.sym --check-prefixes CHECK-M5-SYMS
-
 // Tests that kernels which use different fp-accuracy level end up in different
 // device images.
 
-// CHECK-TABLE: Code
-// CHECK-TABLE-NEXT: _0.sym
-// CHECK-TABLE-NEXT: _1.sym
-// CHECK-TABLE-NEXT: _2.sym
-// CHECK-TABLE-NEXT: _3.sym
-// CHECK-TABLE-NEXT: _4.sym
-// CHECK-TABLE-NEXT: _5.sym
-// CHECK-TABLE-NEXT: _6.sym
-// CHECK-TABLE-EMPTY:
-
-// CHECK-M0-SYMS: __pf_kernel_wrapper{{.*}}Kernel1
-// CHECK-M0-SYMS-NEXT: Kernel1
-// CHECK-M0-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel7
-// CHECK-M0-SYMS-NEXT: Kernel7
-// CHECK-M0-SYMS-EMPTY:
-
-// CHECK-M1-SYMS: __pf_kernel_wrapper{{.*}}Kernel2
-// CHECK-M1-SYMS-NEXT: Kernel2
-// CHECK-M1-SYMS-EMPTY:
-
-// CHECK-M2-SYMS: __pf_kernel_wrapper{{.*}}Kernel3
-// CHECK-M2-SYMS-NEXT: Kernel3
-// CHECK-M2-SYMS-EMPTY:
-
-// CHECK-M3-SYMS: __pf_kernel_wrapper{{.*}}Kernel6
-// CHECK-M3-SYMS-NEXT: Kernel6
-// CHECK-M3-SYMS-EMPTY:
-
-// CHECK-M4-SYMS: __pf_kernel_wrapper{{.*}}Kernel4
-// CHECK-M4-SYMS-NEXT: Kernel4
-// CHECK-M4-SYMS-EMPTY:
-
-// CHECK-M5-SYMS: __pf_kernel_wrapper{{.*}}Kernel5
-// CHECK-M5-SYMS-NEXT: Kernel5
-// CHECK-M5-SYMS-EMPTY:
-
-// CHECK-M6-SYMS: __pf_kernel_wrapper{{.*}}Kernel0
-// CHECK-M6-SYMS-NEXT: Kernel0
-// CHECK-M6-SYMS-EMPTY:
+// 1. Accuracy is specified for particular math functions.
+// RUN: %clangxx %s -o %test_func.bc -ffp-accuracy=high:sin,sqrt -ffp-accuracy=medium:cos -ffp-accuracy=low:tan -ffp-accuracy=cuda:exp,acos -ffp-accuracy=sycl:log,asin  -fno-math-errno  -fsycl -fsycl-device-only
+// RUN: sycl-post-link -split=auto -symbols %test_func.bc -o %test_func.table
+// RUN: FileCheck %s -input-file=%test_func.table --check-prefixes CHECK-FUNC-TABLE
+// RUN: FileCheck %s -input-file=%test_func_0.sym --check-prefixes CHECK-FUNC-M0-SYMS
+// RUN: FileCheck %s -input-file=%test_func_1.sym --check-prefixes CHECK-FUNC-M1-SYMS
+// RUN: FileCheck %s -input-file=%test_func_2.sym --check-prefixes CHECK-FUNC-M2-SYMS
+// RUN: FileCheck %s -input-file=%test_func_3.sym --check-prefixes CHECK-FUNC-M3-SYMS
+// RUN: FileCheck %s -input-file=%test_func_4.sym --check-prefixes CHECK-FUNC-M4-SYMS
+// RUN: FileCheck %s -input-file=%test_func_5.sym --check-prefixes CHECK-FUNC-M5-SYMS
+
+// 2. Accuracy is specified for TU.
+// RUN: %clangxx %s -o %test_tu.bc -ffp-accuracy=high -fno-math-errno -fsycl -fsycl-device-only
+// RUN: sycl-post-link -split=auto -symbols %test_tu.bc -o %test_tu.table
+// RUN: FileCheck %s -input-file=%test_tu.table --check-prefixes CHECK-TU-TABLE
+// RUN: FileCheck %s -input-file=%test_tu_0.sym --check-prefixes CHECK-TU-M0-SYMS
+// RUN: FileCheck %s -input-file=%test_tu_1.sym --check-prefixes CHECK-TU-M1-SYMS
+
+// 3. Mixed case.
+// RUN: %clangxx %s -o %test_mix.bc -ffp-accuracy=medium -ffp-accuracy=high:sin,sqrt -ffp-accuracy=medium:cos -ffp-accuracy=cuda:exp -ffp-accuracy=sycl:log  -fno-math-errno  -fsycl -fsycl-device-only
+// RUN: sycl-post-link -split=auto -symbols %test_mix.bc -o %test_mix.table
+// RUN: FileCheck %s -input-file=%test_mix.table --check-prefixes CHECK-MIX-TABLE
+// RUN: FileCheck %s -input-file=%test_mix_0.sym --check-prefixes CHECK-MIX-M0-SYMS
+// RUN: FileCheck %s -input-file=%test_mix_1.sym --check-prefixes CHECK-MIX-M1-SYMS
+// RUN: FileCheck %s -input-file=%test_mix_2.sym --check-prefixes CHECK-MIX-M2-SYMS
+// RUN: FileCheck %s -input-file=%test_mix_3.sym --check-prefixes CHECK-MIX-M3-SYMS
+
+// CHECK-FUNC-TABLE: Code
+// CHECK-FUNC-TABLE-NEXT: _0.sym
+// CHECK-FUNC-TABLE-NEXT: _1.sym
+// CHECK-FUNC-TABLE-NEXT: _2.sym
+// CHECK-FUNC-TABLE-NEXT: _3.sym
+// CHECK-FUNC-TABLE-NEXT: _4.sym
+// CHECK-FUNC-TABLE-NEXT: _5.sym
+// CHECK-FUNC-TABLE-NEXT: _6.sym
+// CHECK-FUNC-TABLE-EMPTY:
+
+// CHECK-TU-TABLE: Code
+// CHECK-TU-TABLE-NEXT: _0.sym
+// CHECK-TU-TABLE-NEXT: _1.sym
+// CHECK-TU-TABLE-EMPTY:
+
+// CHECK-MIX-TABLE: Code
+// CHECK-MIX-TABLE-NEXT: _0.sym
+// CHECK-MIX-TABLE-NEXT: _1.sym
+// CHECK-MIX-TABLE-NEXT: _2.sym
+// CHECK-MIX-TABLE-NEXT: _3.sym
+// CHECK-MIX-TABLE-EMPTY:
+
+// CHECK-FUNC-M0-SYMS: __pf_kernel_wrapper{{.*}}Kernel1
+// CHECK-FUNC-M0-SYMS-NEXT: Kernel1
+// CHECK-FUNC-M0-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel7
+// CHECK-FUNC-M0-SYMS-NEXT: Kernel7
+// CHECK-FUNC-M0-SYMS-EMPTY:
+
+// CHECK-FUNC-M1-SYMS: __pf_kernel_wrapper{{.*}}Kernel2
+// CHECK-FUNC-M1-SYMS-NEXT: Kernel2
+// CHECK-FUNC-M1-SYMS-EMPTY:
+
+// CHECK-FUNC-M2-SYMS: __pf_kernel_wrapper{{.*}}Kernel3
+// CHECK-FUNC-M2-SYMS-NEXT: Kernel3
+// CHECK-FUNC-M2-SYMS-EMPTY:
+
+// CHECK-FUNC-M3-SYMS: __pf_kernel_wrapper{{.*}}Kernel6
+// CHECK-FUNC-M3-SYMS-NEXT: Kernel6
+// CHECK-FUNC-M3-SYMS-EMPTY:
+
+// CHECK-FUNC-M4-SYMS: __pf_kernel_wrapper{{.*}}Kernel4
+// CHECK-FUNC-M4-SYMS-NEXT: Kernel4
+// CHECK-FUNC-M4-SYMS-EMPTY:
+
+// CHECK-FUNC-M5-SYMS: __pf_kernel_wrapper{{.*}}Kernel5
+// CHECK-FUNC-M5-SYMS-NEXT: Kernel5
+// CHECK-FUNC-M5-SYMS-EMPTY:
+
+// CHECK-FUNC-M6-SYMS: __pf_kernel_wrapper{{.*}}Kernel0
+// CHECK-FUNC-M6-SYMS-NEXT: Kernel0
+// CHECK-FUNC-M6-SYMS-EMPTY:
+
+// CHECK-TU-M0-SYMS: __pf_kernel_wrapper{{.*}}Kernel1
+// CHECK-TU-M0-SYMS-NEXT: Kernel1
+// CHECK-TU-M0-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel2
+// CHECK-TU-M0-SYMS-NEXT: Kernel2
+// CHECK-TU-M0-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel3
+// CHECK-TU-M0-SYMS-NEXT: Kernel3
+// CHECK-TU-M0-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel4
+// CHECK-TU-M0-SYMS-NEXT: Kernel4
+// CHECK-TU-M0-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel5
+// CHECK-TU-M0-SYMS-NEXT: Kernel5
+// CHECK-TU-M0-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel6
+// CHECK-TU-M0-SYMS-NEXT: Kernel6
+// CHECK-TU-M0-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel7
+// CHECK-TU-M0-SYMS-NEXT: Kernel7
+// CHECK-TU-M0-SYMS-EMPTY:
+
+// CHECK-TU-M1-SYMS: __pf_kernel_wrapper{{.*}}Kernel0
+// CHECK-TU-M1-SYMS-NEXT: Kernel0
+// CHECK-TU-M1-SYMS-EMPTY:
+
+// CHECK-MIX-M0-SYMS: __pf_kernel_wrapper{{.*}}Kernel1
+// CHECK-MIX-M0-SYMS-NEXT: Kernel1
+// CHECK-MIX-M0-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel7
+// CHECK-MIX-M0-SYMS-NEXT: Kernel7
+// CHECK-MIX-M0-SYMS-EMPTY:
+
+// CHECK-MIX-M1-SYMS: __pf_kernel_wrapper{{.*}}Kernel2
+// CHECK-MIX-M1-SYMS-NEXT: Kernel2
+// CHECK-MIX-M1-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel3
+// CHECK-MIX-M1-SYMS-NEXT: Kernel3
+// CHECK-MIX-M1-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel5
+// CHECK-MIX-M1-SYMS-NEXT: Kernel5
+// CHECK-MIX-M1-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel6
+// CHECK-MIX-M1-SYMS-NEXT: Kernel6
+// CHECK-MIX-M1-SYMS-EMPTY:
+
+// CHECK-MIX-M2-SYMS: __pf_kernel_wrapper{{.*}}Kernel4
+// CHECK-MIX-M2-SYMS-NEXT: Kernel4
+// CHECK-MIX-M2-SYMS-EMPTY:
+
+// CHECK-MIX-M3-SYMS: __pf_kernel_wrapper{{.*}}Kernel0
+// CHECK-MIX-M3-SYMS-NEXT: Kernel0
+// CHECK-MIX-M3-SYMS-EMPTY:
 
 #include <array>
 #include <cmath>
@@ -85,7 +158,10 @@ int main() {
         numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::sin(Value); });
   });
 
-  // Kernel2 uses medium-accuracy cos.
+  // Kernel2 uses:
+  // 1. medium-accuracy cos
+  // 2. high-accuracy cos
+  // 3. medium-accuracy cos
   deviceQueue.submit([&](handler &cgh) {
     auto accessorOut = bufferOut.template get_access<sycl_write>(cgh);
 
@@ -93,7 +169,10 @@ int main() {
         numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::cos(Value); });
   });
 
-  // Kernel3 uses low-accuracy tan.
+  // Kernel3 uses:
+  // 1. low-accuracy tan
+  // 2. high-accuracy tan
+  // 3. medium-accuracy tan.
   deviceQueue.submit([&](handler &cgh) {
     auto accessorOut = bufferOut.template get_access<sycl_write>(cgh);
 
@@ -101,7 +180,10 @@ int main() {
         numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::tan(Value); });
   });
 
-  // Kernel4 uses cuda-accuracy exp and sycl-accuracy log.
+  // Kernel4 uses:
+  // 1. cuda-accuracy exp and sycl-accuracy log.
+  // 2. high-accuracy exp and high-accuracy log.
+  // 3. cuda-accuracy exp and sycl-accuracy log.
   deviceQueue.submit([&](handler &cgh) {
     auto accessorOut = bufferOut.template get_access<sycl_write>(cgh);
 
@@ -110,7 +192,10 @@ int main() {
     });
   });
 
-  // Kernel5 uses cuda-accuracy acos.
+  // Kernel5 uses:
+  // 1. cuda-accuracy acos.
+  // 2. high-accuracy acos.
+  // 3. medium-accuracy acos.
   deviceQueue.submit([&](handler &cgh) {
     auto accessorOut = bufferOut.template get_access<sycl_write>(cgh);
 
@@ -118,7 +203,10 @@ int main() {
         numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::acos(Value); });
   });
 
-  // Kernel6 uses sycl-accuracy asin.
+  // Kernel6 uses:
+  // 1. sycl-accuracy asin.
+  // 2. high-accuracy asin.
+  // 3. medium-accuracy asin.
   deviceQueue.submit([&](handler &cgh) {
     auto accessorOut = bufferOut.template get_access<sycl_write>(cgh);
 

From a13c1ffcecdee21ad2ed27e6f7119205784a5003 Mon Sep 17 00:00:00 2001
From: Artur Gainullin <artur.gainullin@intel.com>
Date: Wed, 12 Jul 2023 13:03:08 -0700
Subject: [PATCH 12/14] Format

---
 clang/lib/CodeGen/CGCall.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 2db8906ba9cb3..dd7d277a82b04 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -5639,7 +5639,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
   llvm::CallBase *CI;
   if (!InvokeDest) {
     if (!getLangOpts().FPAccuracyFuncMap.empty() ||
-         !getLangOpts().FPAccuracyVal.empty()) {
+        !getLangOpts().FPAccuracyVal.empty()) {
       const auto *FD = dyn_cast_if_present<FunctionDecl>(TargetDecl);
       assert(FD && "expecting a function");
       CI = EmitFPBuiltinIndirectCall(IRFuncTy, IRCallArgs, CalleePtr, FD);

From c3afa411dda8b1ffb8c5baa35abcb5a9d558e1ab Mon Sep 17 00:00:00 2001
From: Artur Gainullin <artur.gainullin@intel.com>
Date: Wed, 12 Jul 2023 13:05:02 -0700
Subject: [PATCH 13/14] Fix EOL

---
 clang/test/CodeGenSYCL/fp-accuracy.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/test/CodeGenSYCL/fp-accuracy.cpp b/clang/test/CodeGenSYCL/fp-accuracy.cpp
index 02b5d9c283431..74ead5aec61d0 100644
--- a/clang/test/CodeGenSYCL/fp-accuracy.cpp
+++ b/clang/test/CodeGenSYCL/fp-accuracy.cpp
@@ -124,4 +124,4 @@ int main() {
 // CHECK-MIX: [[HIGH_ACC]] = !{i32 -1}
 // CHECK-MIX: [[MEDIUM_ACC]] = !{i32 -2}
 // CHECK-MIX: [[CUDA_ACC]] = !{i32 -5}
-// CHECK-MIX: [[SYCL_ACC]] = !{i32 -4}
\ No newline at end of file
+// CHECK-MIX: [[SYCL_ACC]] = !{i32 -4}

From 8e025fdfa80859f83be2d06a520e4c26a6f3f022 Mon Sep 17 00:00:00 2001
From: Artur Gainullin <artur.gainullin@intel.com>
Date: Thu, 13 Jul 2023 09:02:31 -0700
Subject: [PATCH 14/14] Address review comments

---
 clang/lib/CodeGen/CGBuiltin.cpp        | 37 +++++++++++++-------------
 clang/lib/CodeGen/CGCall.cpp           |  1 -
 clang/lib/CodeGen/CGSYCLRuntime.h      |  4 +--
 clang/test/CodeGenSYCL/fp-accuracy.cpp |  8 +++---
 4 files changed, 24 insertions(+), 26 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 78266f28db6c9..d70896c30ba4b 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -22150,26 +22150,22 @@ llvm::CallInst *CodeGenFunction::EmitFPBuiltinIndirectCall(
     // Even if the current function doesn't have a clang builtin, create
     // an 'fpbuiltin-max-error' attribute for it; unless it's marked with
     // an NoBuiltin attribute.
-    if (!FD->hasAttr<NoBuiltinAttr>() &&
-        FD->getNameInfo().getName().isIdentifier()) {
-      Name = FD->getName();
-      FPAccuracyIntrinsicID =
-          llvm::StringSwitch<unsigned>(Name)
-              .Case("fadd", llvm::Intrinsic::fpbuiltin_fadd)
-              .Case("fdiv", llvm::Intrinsic::fpbuiltin_fdiv)
-              .Case("fmul", llvm::Intrinsic::fpbuiltin_fmul)
-              .Case("fsub", llvm::Intrinsic::fpbuiltin_fsub)
-              .Case("frem", llvm::Intrinsic::fpbuiltin_frem)
-              .Case("sincos", llvm::Intrinsic::fpbuiltin_sincos)
-              .Case("exp10", llvm::Intrinsic::fpbuiltin_exp10)
-              .Case("rsqrt", llvm::Intrinsic::fpbuiltin_rsqrt)
-              .Default(0);
-      if (!FPAccuracyIntrinsicID) {
-        return nullptr;
-      }
-    } else {
+    if (FD->hasAttr<NoBuiltinAttr>() ||
+        !FD->getNameInfo().getName().isIdentifier())
       return nullptr;
-    }
+
+    Name = FD->getName();
+    FPAccuracyIntrinsicID =
+        llvm::StringSwitch<unsigned>(Name)
+            .Case("fadd", llvm::Intrinsic::fpbuiltin_fadd)
+            .Case("fdiv", llvm::Intrinsic::fpbuiltin_fdiv)
+            .Case("fmul", llvm::Intrinsic::fpbuiltin_fmul)
+            .Case("fsub", llvm::Intrinsic::fpbuiltin_fsub)
+            .Case("frem", llvm::Intrinsic::fpbuiltin_frem)
+            .Case("sincos", llvm::Intrinsic::fpbuiltin_sincos)
+            .Case("exp10", llvm::Intrinsic::fpbuiltin_exp10)
+            .Case("rsqrt", llvm::Intrinsic::fpbuiltin_rsqrt)
+            .Default(0);
   } else {
     // The function has a clang builtin. Create an attribute for it
     // only if it has an fpbuiltin intrinsic.
@@ -22249,6 +22245,9 @@ llvm::CallInst *CodeGenFunction::EmitFPBuiltinIndirectCall(
       break;
     }
   }
+  if (!FPAccuracyIntrinsicID)
+    return nullptr;
+
   Func = CGM.getIntrinsic(FPAccuracyIntrinsicID, IRArgs[0]->getType());
   return CreateBuiltinCallWithAttr(*this, Name, Func, ArrayRef(IRArgs),
                                    FPAccuracyIntrinsicID);
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index dd7d277a82b04..15b84cb73875d 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -1870,7 +1870,6 @@ void CodeGenModule::getDefaultFunctionFPAccuracyAttributes(
   // To ensure that, first check if Name has a required accuracy by visiting
   // the 'FPAccuracyFuncMap'; if no accuracy is mapped to Name (FuncAttrs
   // is empty), then set its accuracy from the TU's accuracy value.
-  MD = nullptr;
   if (!getLangOpts().FPAccuracyFuncMap.empty()) {
     auto FuncMapIt = getLangOpts().FPAccuracyFuncMap.find(Name.str());
     if (FuncMapIt != getLangOpts().FPAccuracyFuncMap.end()) {
diff --git a/clang/lib/CodeGen/CGSYCLRuntime.h b/clang/lib/CodeGen/CGSYCLRuntime.h
index 095931af0a70e..85a18e4bb590e 100644
--- a/clang/lib/CodeGen/CGSYCLRuntime.h
+++ b/clang/lib/CodeGen/CGSYCLRuntime.h
@@ -24,10 +24,10 @@ namespace CodeGen {
 class CodeGenModule;
 
 // These aspects are internal and used for device image splitting purposes only.
-// They are not exposed to the DPCPP users through "aspect" enum. That's why
+// They are not exposed to SYCL users through the "aspect" enum. That's why
 // they are intentionally assigned negative values to filter them out at the
 // stage of embedding used aspects as device requirements to the executable.
-// We don't pass these internal aspects to the DPCPP RT.
+// We don't pass these internal aspects to the SYCL RT.
 enum SYCLInternalAspect : int32_t {
   fp_intrinsic_accuracy_high = -1,
   fp_intrinsic_accuracy_medium = -2,
diff --git a/clang/test/CodeGenSYCL/fp-accuracy.cpp b/clang/test/CodeGenSYCL/fp-accuracy.cpp
index 74ead5aec61d0..322b7f8ac65a7 100644
--- a/clang/test/CodeGenSYCL/fp-accuracy.cpp
+++ b/clang/test/CodeGenSYCL/fp-accuracy.cpp
@@ -1,10 +1,10 @@
-// RUN: %clang_cc1  -fsycl-is-device -ffp-builtin-accuracy=high:sin,sqrt -ffp-builtin-accuracy=medium:cos -ffp-builtin-accuracy=low:tan -ffp-builtin-accuracy=cuda:exp,acos -ffp-builtin-accuracy=sycl:log,asin -emit-llvm -triple spir64-unknown-unknown -disable-llvm-passes %s -o - | FileCheck --check-prefix CHECK-FUNC %s
-// RUN: %clang_cc1  -fsycl-is-device -ffp-builtin-accuracy=high -emit-llvm -triple spir64-unknown-unknown -disable-llvm-passes %s -o - | FileCheck --check-prefix CHECK-TU %s
-// RUN: %clang_cc1  -fsycl-is-device -ffp-builtin-accuracy=medium -ffp-builtin-accuracy=high:sin,sqrt -ffp-builtin-accuracy=medium:cos -ffp-builtin-accuracy=cuda:exp -ffp-builtin-accuracy=sycl:log -emit-llvm -triple spir64-unknown-unknown -disable-llvm-passes %s -o - | FileCheck --check-prefix CHECK-MIX %s
+// RUN: %clang_cc1 -internal-isystem %S/Inputs -fsycl-is-device -ffp-builtin-accuracy=high:sin,sqrt -ffp-builtin-accuracy=medium:cos -ffp-builtin-accuracy=low:tan -ffp-builtin-accuracy=cuda:exp,acos -ffp-builtin-accuracy=sycl:log,asin -emit-llvm -triple spir64-unknown-unknown %s -o - | FileCheck --check-prefix CHECK-FUNC %s
+// RUN: %clang_cc1 -internal-isystem %S/Inputs -fsycl-is-device -ffp-builtin-accuracy=high -emit-llvm -triple spir64-unknown-unknown %s -o - | FileCheck --check-prefix CHECK-TU %s
+// RUN: %clang_cc1 -internal-isystem %S/Inputs -fsycl-is-device -ffp-builtin-accuracy=medium -ffp-builtin-accuracy=high:sin,sqrt -ffp-builtin-accuracy=medium:cos -ffp-builtin-accuracy=cuda:exp -ffp-builtin-accuracy=sycl:log -emit-llvm -triple spir64-unknown-unknown %s -o - | FileCheck --check-prefix CHECK-MIX %s
 
 // Tests that sycl_used_aspects metadata is attached to the fpbuiltin call based on -ffp-accuracy option.
 
-#include "Inputs/sycl.hpp"
+#include "sycl.hpp"
 
 extern "C" SYCL_EXTERNAL double sin(double);
 extern "C" SYCL_EXTERNAL double cos(double);