[SYCL] Split device images based on accuracy level provided in option (#10140)

againull · web-flow · commit 8d77da7660a2 · 2023-07-14T10:52:15.000-03:00
This PR reuses optional kernel features mechanism to split device image
based on accuracy level provided using -ffp-accuracy compilation option
(introduced in PR#8280):
1. When frontend emits fp intrinsic call and attaches the maximum error
   attribute we also attach "sycl_used_aspects" metadata to the call
   instruction with a value which corresponds to high, medium, low, sycl
   or cuda. Mapping for those values is needed to be visible for SYCL
   device compiler only and we intentionally don't put those values to
   aspects enum because we don't need these aspects to be visible to 
the user in this case (because of the reasons described in Details
below).
2. Make SYCLPropagateAspectsUsage to propagate sycl_used_aspects
   metadata from instructions to kernel.
3. Don't add internal aspects into the device requirements, because we
don't need processing of these internal aspects (with negative values)
in the SYCL RT.


Splitting functionality based on sycl_used_aspects metadata is available
for free.

Details:
Currently accruracy level can be controlled using the following options.
For entire translation unit:
-ffp-accuracy=high
-ffp-accuracy=medium
-ffp-accuracy=low
-ffp-accuracy=sycl
-ffp-accuracy=cuda

For particular funcions in the translation unit:
-ffp-accuracy=low:sin,cos

Whenever frontend sees a math function in a kernel or a device function
it emits fp intrinsic call with attached callsite attribute indicating
value of the maximum error. llvm-spirv is going to translate this
builtins to regular __ocl intrinsics and translate callsite attribute to
decorator (which is a new spirv extension). If that extension is not
supported
by the backend, it is going to emit an error. Error is emitted also in
the case if backend supports the extension but can't compile the kernel
because
it doesn't have corresponding implemenation of math function complying
with
required maximum error.

Aspects corrsponding to different levels of accuracy are not suitable in
this case because aforementioned options are sycl program compilation
options, i.e.
it doesn't make sense to provide an opportunity to the user to write
something like this:
if (dev.has(aspect::ext_oneapi_fp_intrinsic_accuracy_high)) {
  /* submit kernel using high accuracy intrinsics */
}

But on our side we still would like to put kernels and device functions
to different images based on required accuracy level. It is necessary
because
some backends may support, for example, low and medium accuracy but
don't
support high accuracy.
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -540,12 +540,18 @@ static CallInst *CreateBuiltinCallWithAttr(CodeGenFunction &CGF, StringRef Name,
   // TODO: Replace AttrList with a single attribute. The call can only have a
   // single FPAccuracy attribute.
   llvm::AttributeList AttrList;
+  // "sycl_used_aspects" metadata associated with the call.
+  llvm::Metadata *AspectMD = nullptr;
   // sincos() doesn't return a value, but it still has a type associated with
   // it that corresponds to the operand type.
   CGF.CGM.getFPAccuracyFuncAttributes(
-      Name, AttrList, ID,
+      Name, AttrList, AspectMD, ID,
       Name == "sincos" ? Args[0]->getType() : FPBuiltinF->getReturnType());
   CI->setAttributes(AttrList);
+
+  if (CGF.getLangOpts().SYCLIsDevice && AspectMD)
+    CI->setMetadata("sycl_used_aspects",
+                    llvm::MDNode::get(CGF.CGM.getLLVMContext(), AspectMD));
   return CI;
 }
 
@@ -22418,21 +22424,22 @@ llvm::CallInst *CodeGenFunction::EmitFPBuiltinIndirectCall(
     // Even if the current function doesn't have a clang builtin, create
     // an 'fpbuiltin-max-error' attribute for it; unless it's marked with
     // an NoBuiltin attribute.
-    if (!FD->hasAttr<NoBuiltinAttr>()) {
-      Name = FD->getName();
-      FPAccuracyIntrinsicID =
-          llvm::StringSwitch<unsigned>(Name)
-              .Case("fadd", llvm::Intrinsic::fpbuiltin_fadd)
-              .Case("fdiv", llvm::Intrinsic::fpbuiltin_fdiv)
-              .Case("fmul", llvm::Intrinsic::fpbuiltin_fmul)
-              .Case("fsub", llvm::Intrinsic::fpbuiltin_fsub)
-              .Case("frem", llvm::Intrinsic::fpbuiltin_frem)
-              .Case("sincos", llvm::Intrinsic::fpbuiltin_sincos)
-              .Case("exp10", llvm::Intrinsic::fpbuiltin_exp10)
-              .Case("rsqrt", llvm::Intrinsic::fpbuiltin_rsqrt);
-    } else {
+    if (FD->hasAttr<NoBuiltinAttr>() ||
+        !FD->getNameInfo().getName().isIdentifier())
       return nullptr;
-    }
+
+    Name = FD->getName();
+    FPAccuracyIntrinsicID =
+        llvm::StringSwitch<unsigned>(Name)
+            .Case("fadd", llvm::Intrinsic::fpbuiltin_fadd)
+            .Case("fdiv", llvm::Intrinsic::fpbuiltin_fdiv)
+            .Case("fmul", llvm::Intrinsic::fpbuiltin_fmul)
+            .Case("fsub", llvm::Intrinsic::fpbuiltin_fsub)
+            .Case("frem", llvm::Intrinsic::fpbuiltin_frem)
+            .Case("sincos", llvm::Intrinsic::fpbuiltin_sincos)
+            .Case("exp10", llvm::Intrinsic::fpbuiltin_exp10)
+            .Case("rsqrt", llvm::Intrinsic::fpbuiltin_rsqrt)
+            .Default(0);
   } else {
     // The function has a clang builtin. Create an attribute for it
     // only if it has an fpbuiltin intrinsic.
@@ -22512,6 +22519,9 @@ llvm::CallInst *CodeGenFunction::EmitFPBuiltinIndirectCall(
       break;
     }
   }
+  if (!FPAccuracyIntrinsicID)
+    return nullptr;
+
   Func = CGM.getIntrinsic(FPAccuracyIntrinsicID, IRArgs[0]->getType());
   return CreateBuiltinCallWithAttr(*this, Name, Func, ArrayRef(IRArgs),
                                    FPAccuracyIntrinsicID);
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
@@ -17,6 +17,7 @@
 #include "CGCXXABI.h"
 #include "CGCleanup.h"
 #include "CGRecordLayout.h"
+#include "CGSYCLRuntime.h"
 #include "CodeGenFunction.h"
 #include "CodeGenModule.h"
 #include "TargetInfo.h"
@@ -1860,9 +1861,21 @@ static llvm::fp::FPAccuracy convertFPAccuracy(StringRef FPAccuracyStr) {
       .Case("cuda", llvm::fp::FPAccuracy::CUDA);
 }
 
+static int32_t convertFPAccuracyToAspect(StringRef FPAccuracyStr) {
+  assert(FPAccuracyStr.equals("high") || FPAccuracyStr.equals("medium") ||
+         FPAccuracyStr.equals("low") || FPAccuracyStr.equals("sycl") ||
+         FPAccuracyStr.equals("cuda"));
+  return llvm::StringSwitch<int32_t>(FPAccuracyStr)
+      .Case("high", SYCLInternalAspect::fp_intrinsic_accuracy_high)
+      .Case("medium", SYCLInternalAspect::fp_intrinsic_accuracy_medium)
+      .Case("low", SYCLInternalAspect::fp_intrinsic_accuracy_low)
+      .Case("sycl", SYCLInternalAspect::fp_intrinsic_accuracy_sycl)
+      .Case("cuda", SYCLInternalAspect::fp_intrinsic_accuracy_cuda);
+}
+
 void CodeGenModule::getDefaultFunctionFPAccuracyAttributes(
-    StringRef Name, llvm::AttrBuilder &FuncAttrs, unsigned ID,
-    const llvm::Type *FuncType) {
+    StringRef Name, llvm::AttrBuilder &FuncAttrs, llvm::Metadata *&MD,
+    unsigned ID, const llvm::Type *FuncType) {
   // Priority is given to to the accuracy specific to the function.
   // So, if the command line is something like this:
   // 'clang -fp-accuracy = high -fp-accuracy = low:[sin]'.
@@ -1878,6 +1891,8 @@ void CodeGenModule::getDefaultFunctionFPAccuracyAttributes(
           ID, FuncType, convertFPAccuracy(FuncMapIt->second));
       assert(!FPAccuracyVal.empty() && "A valid accuracy value is expected");
       FuncAttrs.addAttribute("fpbuiltin-max-error=", FPAccuracyVal);
+      MD = llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
+          Int32Ty, convertFPAccuracyToAspect(FuncMapIt->second)));
     }
   }
   if (FuncAttrs.attrs().size() == 0)
@@ -1886,6 +1901,8 @@ void CodeGenModule::getDefaultFunctionFPAccuracyAttributes(
           ID, FuncType, convertFPAccuracy(getLangOpts().FPAccuracyVal));
       assert(!FPAccuracyVal.empty() && "A valid accuracy value is expected");
       FuncAttrs.addAttribute("fpbuiltin-max-error=", FPAccuracyVal);
+      MD = llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
+          Int32Ty, convertFPAccuracyToAspect(getLangOpts().FPAccuracyVal)));
     }
 }
 
diff --git a/clang/lib/CodeGen/CGSYCLRuntime.h b/clang/lib/CodeGen/CGSYCLRuntime.h
@@ -23,6 +23,19 @@ namespace CodeGen {
 
 class CodeGenModule;
 
+// These aspects are internal and used for device image splitting purposes only.
+// They are not exposed to the SYCL users through "aspect" enum. That's why
+// they are intentionally assigned negative values to filter them out at the
+// stage of embedding used aspects as device requirements to the executable.
+// We don't pass these internal aspects to the SYCL RT.
+enum SYCLInternalAspect : int32_t {
+  fp_intrinsic_accuracy_high = -1,
+  fp_intrinsic_accuracy_medium = -2,
+  fp_intrinsic_accuracy_low = -3,
+  fp_intrinsic_accuracy_sycl = -4,
+  fp_intrinsic_accuracy_cuda = -5,
+};
+
 class CGSYCLRuntime {
 protected:
   CodeGenModule &CGM;
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -8122,10 +8122,11 @@ void CodeGenModule::moveLazyEmissionStates(CodeGenModule *NewBuilder) {
 
 void CodeGenModule::getFPAccuracyFuncAttributes(StringRef Name,
                                                 llvm::AttributeList &AttrList,
+                                                llvm::Metadata *&MD,
                                                 unsigned ID,
                                                 const llvm::Type *FuncType) {
   llvm::AttrBuilder FuncAttrs(getLLVMContext());
-  getDefaultFunctionFPAccuracyAttributes(Name, FuncAttrs, ID, FuncType);
+  getDefaultFunctionFPAccuracyAttributes(Name, FuncAttrs, MD, ID, FuncType);
   AttrList = llvm::AttributeList::get(
       getLLVMContext(), llvm::AttributeList::FunctionIndex, FuncAttrs);
 }
diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h
@@ -1589,7 +1589,8 @@ class CodeGenModule : public CodeGenTypeCache {
   void moveLazyEmissionStates(CodeGenModule *NewBuilder);
 
   void getFPAccuracyFuncAttributes(StringRef Name,
-                                   llvm::AttributeList &AttrList, unsigned ID,
+                                   llvm::AttributeList &AttrList,
+                                   llvm::Metadata *&MDs, unsigned ID,
                                    const llvm::Type *FuncType);
 
 private:
@@ -1789,7 +1790,7 @@ class CodeGenModule : public CodeGenTypeCache {
 
   void getDefaultFunctionFPAccuracyAttributes(StringRef Name,
                                               llvm::AttrBuilder &FuncAttrs,
-                                              unsigned ID,
+                                              llvm::Metadata *&MD, unsigned ID,
                                               const llvm::Type *FuncType);
 
   llvm::Metadata *CreateMetadataIdentifierImpl(QualType T, MetadataTypeMap &Map,
diff --git a/clang/test/CodeGenSYCL/fp-accuracy.cpp b/clang/test/CodeGenSYCL/fp-accuracy.cpp
@@ -0,0 +1,127 @@
+// RUN: %clang_cc1 -internal-isystem %S/Inputs -fsycl-is-device -ffp-builtin-accuracy=high:sin,sqrt -ffp-builtin-accuracy=medium:cos -ffp-builtin-accuracy=low:tan -ffp-builtin-accuracy=cuda:exp,acos -ffp-builtin-accuracy=sycl:log,asin -emit-llvm -triple spir64-unknown-unknown %s -o - | FileCheck --check-prefix CHECK-FUNC %s
+// RUN: %clang_cc1 -internal-isystem %S/Inputs -fsycl-is-device -ffp-builtin-accuracy=high -emit-llvm -triple spir64-unknown-unknown %s -o - | FileCheck --check-prefix CHECK-TU %s
+// RUN: %clang_cc1 -internal-isystem %S/Inputs -fsycl-is-device -ffp-builtin-accuracy=medium -ffp-builtin-accuracy=high:sin,sqrt -ffp-builtin-accuracy=medium:cos -ffp-builtin-accuracy=cuda:exp -ffp-builtin-accuracy=sycl:log -emit-llvm -triple spir64-unknown-unknown %s -o - | FileCheck --check-prefix CHECK-MIX %s
+
+// Tests that sycl_used_aspects metadata is attached to the fpbuiltin call based on -ffp-accuracy option.
+
+#include "sycl.hpp"
+
+extern "C" SYCL_EXTERNAL double sin(double);
+extern "C" SYCL_EXTERNAL double cos(double);
+extern "C" SYCL_EXTERNAL double tan(double);
+extern "C" SYCL_EXTERNAL double log(double);
+extern "C" SYCL_EXTERNAL double exp(double);
+extern "C" SYCL_EXTERNAL double acos(double);
+extern "C" SYCL_EXTERNAL double asin(double);
+extern "C" SYCL_EXTERNAL double sqrt(double);
+
+using namespace sycl;
+
+int main() {
+  const unsigned array_size = 4;
+  double Value = .5;
+  queue deviceQueue;
+  range<1> numOfItems{array_size};
+
+  // Kernel0 doesn't use math functions.
+  deviceQueue.submit([&](handler& cgh) {
+    cgh.parallel_for<class Kernel0>(numOfItems,
+    [=](id<1> wiID) {
+      (void)Value;
+    });
+  });
+
+  // Kernel1 uses high-accuracy sin.
+  deviceQueue.submit([&](handler& cgh) {
+    cgh.parallel_for<class Kernel1>(numOfItems,
+    [=](id<1> wiID) {
+// CHECK-FUNC: call double @llvm.fpbuiltin.sin.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC:[0-9]+]]
+// CHECK-TU: call double @llvm.fpbuiltin.sin.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC:[0-9]+]]
+// CHECK-MIX: call double @llvm.fpbuiltin.sin.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC:[0-9]+]]
+      (void)sin(Value);
+    });
+  });
+
+  deviceQueue.submit([&](handler& cgh) {
+    cgh.parallel_for<class Kernel2>(numOfItems,
+    [=](id<1> wiID) {
+// CHECK-FUNC: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[MEDIUM_ACC:[0-9]+]]
+// CHECK-TU: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]]
+// CHECK-MIX: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[MEDIUM_ACC:[0-9]+]]
+      (void)cos(Value);
+    });
+  });
+
+  // Kernel3 uses low-accuracy tan.
+  deviceQueue.submit([&](handler& cgh) {
+    cgh.parallel_for<class Kernel3>(numOfItems,
+    [=](id<1> wiID) {
+// CHECK-FUNC: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[LOW_ACC:[0-9]+]]
+// CHECK-TU: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]]
+// CHECK-MIX: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[MEDIUM_ACC]]
+      (void)tan(Value);
+    });
+  });
+
+  // Kernel4 uses cuda-accuracy exp and sycl-accuracy log.
+  deviceQueue.submit([&](handler& cgh) {
+    cgh.parallel_for<class Kernel4>(numOfItems,
+    [=](id<1> wiID) {
+// CHECK-FUNC: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[CUDA_ACC:[0-9]+]]
+// CHECK-FUNC: call double @llvm.fpbuiltin.log.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[SYCL_ACC:[0-9]+]]
+// CHECK-TU: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]]
+// CHECK-TU: call double @llvm.fpbuiltin.log.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]]
+// CHECK-MIX: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[CUDA_ACC:[0-9]+]]
+// CHECK-MIX: call double @llvm.fpbuiltin.log.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[SYCL_ACC:[0-9]+]]
+      (void)log(exp(Value));
+    });
+  });
+  deviceQueue.wait();
+
+  // Kernel5 uses cuda-accuracy acos.
+  deviceQueue.submit([&](handler& cgh) {
+    cgh.parallel_for<class Kernel5>(numOfItems,
+    [=](id<1> wiID) {
+// CHECK-FUNC: call double @llvm.fpbuiltin.acos.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[CUDA_ACC]]
+// CHECK-TU: call double @llvm.fpbuiltin.acos.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]]
+// CHECK-MIX: call double @llvm.fpbuiltin.acos.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[MEDIUM_ACC]]
+      (void)acos(Value);
+    });
+  });
+
+  // Kernel6 uses sycl-accuracy asin.
+  deviceQueue.submit([&](handler& cgh) {
+    cgh.parallel_for<class Kernel6>(numOfItems,
+    [=](id<1> wiID) {
+// CHECK-FUNC: call double @llvm.fpbuiltin.asin.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[SYCL_ACC]]
+// CHECK-TU: call double @llvm.fpbuiltin.asin.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]]
+// CHECK-MIX: call double @llvm.fpbuiltin.asin.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[MEDIUM_ACC]]
+      (void)asin(Value);
+    });
+  });
+
+  // Kernel7 uses high-accuracy sqrt.
+  deviceQueue.submit([&](handler& cgh) {
+    cgh.parallel_for<class Kernel7>(numOfItems,
+    [=](id<1> wiID) {
+// CHECK-FUNC: call double @llvm.fpbuiltin.sqrt.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]]
+// CHECK-TU: call double @llvm.fpbuiltin.sqrt.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]]
+// CHECK-MIX: call double @llvm.fpbuiltin.sqrt.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]]
+      (void)sqrt(Value);
+    });
+  });
+  return 0;
+}
+
+// CHECK-FUNC: [[HIGH_ACC]] = !{i32 -1}
+// CHECK-FUNC: [[MEDIUM_ACC]] = !{i32 -2}
+// CHECK-FUNC: [[LOW_ACC]] = !{i32 -3}
+// CHECK-FUNC: [[CUDA_ACC]] = !{i32 -5}
+// CHECK-FUNC: [[SYCL_ACC]] = !{i32 -4}
+
+// CHECK-TU: [[HIGH_ACC]] = !{i32 -1}
+
+// CHECK-MIX: [[HIGH_ACC]] = !{i32 -1}
+// CHECK-MIX: [[MEDIUM_ACC]] = !{i32 -2}
+// CHECK-MIX: [[CUDA_ACC]] = !{i32 -5}
+// CHECK-MIX: [[SYCL_ACC]] = !{i32 -4}
diff --git a/llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp b/llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp
@@ -255,6 +255,13 @@ AspectsSetTy getAspectsUsedByInstruction(const Instruction &I,
     Result.insert(Aspects.begin(), Aspects.end());
   }
 
+  if (const MDNode *InstApsects = I.getMetadata("sycl_used_aspects")) {
+    for (const MDOperand &MDOp : InstApsects->operands()) {
+      const Constant *C = cast<ConstantAsMetadata>(MDOp)->getValue();
+      Result.insert(cast<ConstantInt>(C)->getSExtValue());
+    }
+  }
+
   return Result;
 }
 
diff --git a/llvm/test/SYCLLowerIR/PropagateAspectsUsage/call-graph-inst.ll b/llvm/test/SYCLLowerIR/PropagateAspectsUsage/call-graph-inst.ll
@@ -0,0 +1,66 @@
+; RUN: opt -passes=sycl-propagate-aspects-usage < %s -S | FileCheck %s
+;
+; Test checks that the pass is able to propagate information about aspects
+; used in the instruction through a call graph
+;
+;   K1  K2
+;  /  \/  \
+; F1  F2   F3
+;
+; F1 doesn't use optional type and doesn't have instruction with attached 'sycl_used_aspects' metadata.
+; F2 uses optional A and has instruction with attached 'sycl_used_aspects' metadata.
+; F3 uses optional B and has instruction with attached 'sycl_used_aspects' metadata.
+
+%Optional.A = type { i32 }
+%Optional.B = type { i32 }
+
+; CHECK: spir_kernel void @kernel1() !sycl_used_aspects ![[#ID1:]]
+define spir_kernel void @kernel1() {
+  call spir_func void @func1()
+  call spir_func void @func2()
+  ret void
+}
+
+; CHECK: spir_kernel void @kernel2() !sycl_used_aspects ![[#ID2:]]
+define spir_kernel void @kernel2() {
+  call spir_func void @func2()
+  call spir_func void @func3()
+  ret void
+}
+
+; CHECK: spir_func void @func1() {
+define spir_func void @func1() {
+  %tmp = alloca i32
+  ret void
+}
+
+declare void @llvm.fpbuiltin.f64()
+
+; CHECK: spir_func void @func2() !sycl_used_aspects ![[#ID1]] {
+define spir_func void @func2() {
+  %tmp1 = alloca %Optional.A
+  call void @llvm.fpbuiltin.f64(), !sycl_used_aspects !3
+  ret void
+}
+
+; CHECK: spir_func void @func3() !sycl_used_aspects ![[#ID3:]] {
+define spir_func void @func3() {
+  %tmp = alloca %Optional.B
+  call void @llvm.fpbuiltin.f64(), !sycl_used_aspects !4
+  ret void
+}
+
+!sycl_types_that_use_aspects = !{!0, !1}
+!0 = !{!"Optional.A", i32 1}
+!1 = !{!"Optional.B", i32 2}
+
+!sycl_aspects = !{!2}
+!2 = !{!"fp64", i32 6}
+!3 = !{i32 -1}
+!4 = !{i32 -2}
+
+; CHECK: ![[#ID1]] = !{i32 1, i32 -1}
+; CHECK: ![[#ID2]] = !{i32 1, i32 -1, i32 2, i32 -2}
+; CHECK: ![[#ID3]] = !{i32 2, i32 -2}
+
+
diff --git a/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp b/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp
diff --git a/sycl/doc/design/OptionalDeviceFeatures.md b/sycl/doc/design/OptionalDeviceFeatures.md
diff --git a/sycl/test/optional_kernel_features/fp-accuracy.cpp b/sycl/test/optional_kernel_features/fp-accuracy.cpp

Original file line number	Diff line number	Diff line change
`@@ -255,6 +255,13 @@ AspectsSetTy getAspectsUsedByInstruction(const Instruction &I,`
`255`	`255`	`Result.insert(Aspects.begin(), Aspects.end());`
`256`	`256`	`}`
`257`	`257`
	`258`	`+ if (const MDNode *InstApsects = I.getMetadata("sycl_used_aspects")) {`
	`259`	`+ for (const MDOperand &MDOp : InstApsects->operands()) {`
	`260`	`+ const Constant *C = cast<ConstantAsMetadata>(MDOp)->getValue();`
	`261`	`+ Result.insert(cast<ConstantInt>(C)->getSExtValue());`
	`262`	`+ }`
	`263`	`+ }`
	`264`	`+`
`258`	`265`	`return Result;`
`259`	`266`	`}`
`260`	`267`