From 2c4695c05e3a7197cd57ce81893c4cce9956fa73 Mon Sep 17 00:00:00 2001 From: Farzon Lotfi Date: Wed, 18 Jun 2025 11:53:52 -0400 Subject: [PATCH 1/7] [DirectX] add support for i64 buffer load/stores --- .../Target/DirectX/DXILIntrinsicExpansion.cpp | 179 +++++++++++++----- llvm/test/CodeGen/DirectX/BufferLoadDouble.ll | 4 +- llvm/test/CodeGen/DirectX/BufferLoadInt64.ll | 56 ++++++ .../test/CodeGen/DirectX/BufferStoreDouble.ll | 43 +++++ llvm/test/CodeGen/DirectX/BufferStoreInt64.ll | 46 +++++ 5 files changed, 281 insertions(+), 47 deletions(-) create mode 100644 llvm/test/CodeGen/DirectX/BufferLoadInt64.ll create mode 100644 llvm/test/CodeGen/DirectX/BufferStoreInt64.ll diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp index f99e8e7ccdc5d..eb9268e78a9ad 100644 --- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp +++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp @@ -25,6 +25,7 @@ #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" @@ -70,15 +71,17 @@ static bool isIntrinsicExpansion(Function &F) { case Intrinsic::vector_reduce_add: case Intrinsic::vector_reduce_fadd: return true; - case Intrinsic::dx_resource_load_typedbuffer: - // We need to handle doubles and vector of doubles. - return F.getReturnType() - ->getStructElementType(0) - ->getScalarType() - ->isDoubleTy(); - case Intrinsic::dx_resource_store_typedbuffer: - // We need to handle doubles and vector of doubles. - return F.getFunctionType()->getParamType(2)->getScalarType()->isDoubleTy(); + case Intrinsic::dx_resource_load_typedbuffer: { + // We need to handle i64, doubles, and vectors of them. 
+ Type *ScalarTy = + F.getReturnType()->getStructElementType(0)->getScalarType(); + return ScalarTy->isDoubleTy() || ScalarTy->isIntegerTy(64); + } + case Intrinsic::dx_resource_store_typedbuffer: { + // We need to handle i64 and doubles and vectors of i64 and doubles. + Type *ScalarTy = F.getFunctionType()->getParamType(2)->getScalarType(); + return ScalarTy->isDoubleTy() || ScalarTy->isIntegerTy(64); + } } return false; } @@ -545,13 +548,15 @@ static bool expandTypedBufferLoadIntrinsic(CallInst *Orig) { IRBuilder<> Builder(Orig); Type *BufferTy = Orig->getType()->getStructElementType(0); - assert(BufferTy->getScalarType()->isDoubleTy() && - "Only expand double or double2"); + Type *ScalarTy = BufferTy->getScalarType(); + bool IsDouble = ScalarTy->isDoubleTy(); + assert(IsDouble || ScalarTy->isIntegerTy(64) && + "Only expand double or int64 scalars or vectors"); unsigned ExtractNum = 2; if (auto *VT = dyn_cast(BufferTy)) { assert(VT->getNumElements() == 2 && - "TypedBufferLoad double vector has wrong size"); + "TypedBufferLoad vector must be size 2"); ExtractNum = 4; } @@ -570,22 +575,54 @@ static bool expandTypedBufferLoadIntrinsic(CallInst *Orig) { ExtractElements.push_back( Builder.CreateExtractElement(Extract, Builder.getInt32(I))); - // combine into double(s) + // combine into double(s) or int64(s) Value *Result = PoisonValue::get(BufferTy); for (unsigned I = 0; I < ExtractNum; I += 2) { - Value *Dbl = - Builder.CreateIntrinsic(Builder.getDoubleTy(), Intrinsic::dx_asdouble, - {ExtractElements[I], ExtractElements[I + 1]}); + Value *Combined = nullptr; + if (IsDouble) { + // For doubles, use dx_asdouble intrinsic + Combined = + Builder.CreateIntrinsic(Builder.getDoubleTy(), Intrinsic::dx_asdouble, + {ExtractElements[I], ExtractElements[I + 1]}); + } else { + // For int64, manually combine two int32s + // First, zero-extend both values to i64 + Value *Lo = Builder.CreateZExt(ExtractElements[I], Builder.getInt64Ty()); + Value *Hi = + 
Builder.CreateZExt(ExtractElements[I + 1], Builder.getInt64Ty()); + // Shift the high bits left by 32 bits + Value *ShiftedHi = Builder.CreateShl(Hi, Builder.getInt64(32)); + // OR the high and low bits together + Combined = Builder.CreateOr(Lo, ShiftedHi); + } + if (ExtractNum == 4) - Result = - Builder.CreateInsertElement(Result, Dbl, Builder.getInt32(I / 2)); + Result = Builder.CreateInsertElement(Result, Combined, + Builder.getInt32(I / 2)); else - Result = Dbl; + Result = Combined; } Value *CheckBit = nullptr; for (User *U : make_early_inc_range(Orig->users())) { - auto *EVI = cast(U); + if (auto *Ret = dyn_cast(U)) { + // For return instructions, we need to handle the case where the function + // is directly returning the result of the call + Type *RetTy = Ret->getFunction()->getReturnType(); + Value *StructRet = PoisonValue::get(RetTy); + StructRet = Builder.CreateInsertValue(StructRet, Result, {0}); + Value *CheckBitForRet = Builder.CreateExtractValue(Load, {1}); + StructRet = Builder.CreateInsertValue(StructRet, CheckBitForRet, {1}); + Ret->setOperand(0, StructRet); + continue; + } + auto *EVI = dyn_cast(U); + if (!EVI) { + // If it's not a ReturnInst or ExtractValueInst, we don't know how to + // handle it + llvm_unreachable("Unexpected user of typedbufferload"); + } + ArrayRef Indices = EVI->getIndices(); assert(Indices.size() == 1); @@ -609,38 +646,90 @@ static bool expandTypedBufferStoreIntrinsic(CallInst *Orig) { IRBuilder<> Builder(Orig); Type *BufferTy = Orig->getFunctionType()->getParamType(2); - assert(BufferTy->getScalarType()->isDoubleTy() && - "Only expand double or double2"); + Type *ScalarTy = BufferTy->getScalarType(); + bool IsDouble = ScalarTy->isDoubleTy(); + assert((IsDouble || ScalarTy->isIntegerTy(64)) && + "Only expand double or int64 scalars or vectors"); unsigned ExtractNum = 2; if (auto *VT = dyn_cast(BufferTy)) { assert(VT->getNumElements() == 2 && - "TypedBufferStore double vector has wrong size"); + "TypedBufferStore vector must 
be size 2"); ExtractNum = 4; } + if (IsDouble) { + Type *SplitElementTy = Builder.getInt32Ty(); + if (ExtractNum == 4) + SplitElementTy = VectorType::get(SplitElementTy, 2, false); + + // Handle double type(s) - keep original behavior + auto *SplitTy = llvm::StructType::get(SplitElementTy, SplitElementTy); + Value *Split = Builder.CreateIntrinsic(SplitTy, Intrinsic::dx_splitdouble, + {Orig->getOperand(2)}); + // create our vector + Value *LowBits = Builder.CreateExtractValue(Split, 0); + Value *HighBits = Builder.CreateExtractValue(Split, 1); + Value *Val; + if (ExtractNum == 2) { + Val = PoisonValue::get(VectorType::get(Builder.getInt32Ty(), 2, false)); + Val = Builder.CreateInsertElement(Val, LowBits, Builder.getInt32(0)); + Val = Builder.CreateInsertElement(Val, HighBits, Builder.getInt32(1)); + } else + Val = Builder.CreateShuffleVector(LowBits, HighBits, {0, 2, 1, 3}); + + Builder.CreateIntrinsic(Builder.getVoidTy(), + Intrinsic::dx_resource_store_typedbuffer, + {Orig->getOperand(0), Orig->getOperand(1), Val}); + } else { + // Handle int64 type(s) + Value *InputVal = Orig->getOperand(2); + Value *Val; + + if (ExtractNum == 4) { + // Handle vector of int64 + Type *Int32x4Ty = VectorType::get(Builder.getInt32Ty(), 4, false); + Val = PoisonValue::get(Int32x4Ty); + + for (unsigned I = 0; I < 2; ++I) { + // Extract each int64 element + Value *Int64Val = + Builder.CreateExtractElement(InputVal, Builder.getInt32(I)); + + // Get low 32 bits by truncating to i32 + Value *LowBits = Builder.CreateTrunc(Int64Val, Builder.getInt32Ty()); + + // Get high 32 bits by shifting right by 32 and truncating + Value *ShiftedVal = Builder.CreateLShr(Int64Val, Builder.getInt64(32)); + Value *HighBits = Builder.CreateTrunc(ShiftedVal, Builder.getInt32Ty()); + + // Insert into our final vector + Val = + Builder.CreateInsertElement(Val, LowBits, Builder.getInt32(I * 2)); + Val = Builder.CreateInsertElement(Val, HighBits, + Builder.getInt32(I * 2 + 1)); + } + } else { + // Handle scalar 
int64 + Type *Int32x2Ty = VectorType::get(Builder.getInt32Ty(), 2, false); + Val = PoisonValue::get(Int32x2Ty); + + // Get low 32 bits by truncating to i32 + Value *LowBits = Builder.CreateTrunc(InputVal, Builder.getInt32Ty()); + + // Get high 32 bits by shifting right by 32 and truncating + Value *ShiftedVal = Builder.CreateLShr(InputVal, Builder.getInt64(32)); + Value *HighBits = Builder.CreateTrunc(ShiftedVal, Builder.getInt32Ty()); + + // Insert into our final vector + Val = Builder.CreateInsertElement(Val, LowBits, Builder.getInt32(0)); + Val = Builder.CreateInsertElement(Val, HighBits, Builder.getInt32(1)); + } + + Builder.CreateIntrinsic(Builder.getVoidTy(), + Intrinsic::dx_resource_store_typedbuffer, + {Orig->getOperand(0), Orig->getOperand(1), Val}); + } - Type *SplitElementTy = Builder.getInt32Ty(); - if (ExtractNum == 4) - SplitElementTy = VectorType::get(SplitElementTy, 2, false); - - // split our double(s) - auto *SplitTy = llvm::StructType::get(SplitElementTy, SplitElementTy); - Value *Split = Builder.CreateIntrinsic(SplitTy, Intrinsic::dx_splitdouble, - Orig->getOperand(2)); - // create our vector - Value *LowBits = Builder.CreateExtractValue(Split, 0); - Value *HighBits = Builder.CreateExtractValue(Split, 1); - Value *Val; - if (ExtractNum == 2) { - Val = PoisonValue::get(VectorType::get(SplitElementTy, 2, false)); - Val = Builder.CreateInsertElement(Val, LowBits, Builder.getInt32(0)); - Val = Builder.CreateInsertElement(Val, HighBits, Builder.getInt32(1)); - } else - Val = Builder.CreateShuffleVector(LowBits, HighBits, {0, 2, 1, 3}); - - Builder.CreateIntrinsic(Builder.getVoidTy(), - Intrinsic::dx_resource_store_typedbuffer, - {Orig->getOperand(0), Orig->getOperand(1), Val}); Orig->eraseFromParent(); return true; } diff --git a/llvm/test/CodeGen/DirectX/BufferLoadDouble.ll b/llvm/test/CodeGen/DirectX/BufferLoadDouble.ll index 80a071a66364b..af3ec9df37967 100644 --- a/llvm/test/CodeGen/DirectX/BufferLoadDouble.ll +++ 
b/llvm/test/CodeGen/DirectX/BufferLoadDouble.ll @@ -1,6 +1,6 @@ ; RUN: opt -S -dxil-intrinsic-expansion %s | FileCheck %s -target triple = "dxil-pc-shadermodel6.6-compute" +target triple = "dxil-pc-shadermodel6.2-compute" define void @loadf64() { ; check the handle from binding is unchanged @@ -88,4 +88,4 @@ define void @loadf64WithCheckBit() { ; CHECK-NOT: extractvalue { double, i1 } %cb = extractvalue {double, i1} %load0, 1 ret void -} \ No newline at end of file +} diff --git a/llvm/test/CodeGen/DirectX/BufferLoadInt64.ll b/llvm/test/CodeGen/DirectX/BufferLoadInt64.ll new file mode 100644 index 0000000000000..cea475524945c --- /dev/null +++ b/llvm/test/CodeGen/DirectX/BufferLoadInt64.ll @@ -0,0 +1,56 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -dxil-intrinsic-expansion %s | FileCheck %s + +target triple = "dxil-pc-shadermodel6.2-compute" + +define { i64, i1 } @loadi64() { +; CHECK-LABEL: define { i64, i1 } @loadi64() { +; CHECK-NEXT: [[BUFFER:%.*]] = tail call target("dx.TypedBuffer", i64, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null) +; CHECK-NEXT: [[TMP1:%.*]] = call { <2 x i32>, i1 } @llvm.dx.resource.load.typedbuffer.v2i32.tdx.TypedBuffer_i64_1_0_0t(target("dx.TypedBuffer", i64, 1, 0, 0) [[BUFFER]], i32 0) +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <2 x i32>, i1 } [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = zext i32 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 32 +; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = insertvalue { i64, i1 } poison, i64 [[TMP8]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <2 x i32>, i1 } [[TMP1]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = 
insertvalue { i64, i1 } [[TMP9]], i1 [[TMP10]], 1 +; CHECK-NEXT: ret { i64, i1 } [[TMP11]] +; + %buffer = tail call target("dx.TypedBuffer", i64, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null) + %result = call { i64, i1 } @llvm.dx.resource.load.typedbuffer.tdx.TypedBuffer_i64_1_0_0t( + target("dx.TypedBuffer", i64, 1, 0, 0) %buffer, i32 0) + ret { i64, i1 } %result +} + +define { <2 x i64>, i1 } @loadv2i64() { +; CHECK-LABEL: define { <2 x i64>, i1 } @loadv2i64() { +; CHECK-NEXT: [[BUFFER:%.*]] = tail call target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v2i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null) +; CHECK-NEXT: [[TMP1:%.*]] = call { <4 x i32>, i1 } @llvm.dx.resource.load.typedbuffer.v4i32.tdx.TypedBuffer_v2i64_1_0_0t(target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) [[BUFFER]], i32 0) +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, i1 } [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = shl i64 [[TMP8]], 32 +; CHECK-NEXT: [[TMP10:%.*]] = or i64 [[TMP7]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP6]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = shl i64 [[TMP13]], 32 +; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP12]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP15]], i32 1 +; CHECK-NEXT: [[TMP17:%.*]] = insertvalue { <2 x i64>, i1 } poison, <2 x i64> [[TMP16]], 0 +; 
CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <4 x i32>, i1 } [[TMP1]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = insertvalue { <2 x i64>, i1 } [[TMP17]], i1 [[TMP18]], 1 +; CHECK-NEXT: ret { <2 x i64>, i1 } [[TMP19]] +; + %buffer = tail call target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v2i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null) + %result = call { <2 x i64>, i1 } @llvm.dx.resource.load.typedbuffer.tdx.TypedBuffer_v2i64_1_0_0t( + target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) %buffer, i32 0) + ret { <2 x i64>, i1 } %result +} diff --git a/llvm/test/CodeGen/DirectX/BufferStoreDouble.ll b/llvm/test/CodeGen/DirectX/BufferStoreDouble.ll index 9c3dab0cc1e46..882948b6dce74 100644 --- a/llvm/test/CodeGen/DirectX/BufferStoreDouble.ll +++ b/llvm/test/CodeGen/DirectX/BufferStoreDouble.ll @@ -45,3 +45,46 @@ define void @storev2f64(<2 x double> %0) { <2 x double> %0) ret void } + +define { double, i1 } @loadAndReturnf64() { +; CHECK-LABEL: define { double, i1 } @loadAndReturnf64() { +; CHECK-NEXT: [[BUFFER:%.*]] = tail call target("dx.TypedBuffer", double, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_f64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null) +; CHECK-NEXT: [[TMP1:%.*]] = call { <2 x i32>, i1 } @llvm.dx.resource.load.typedbuffer.v2i32.tdx.TypedBuffer_f64_1_0_0t(target("dx.TypedBuffer", double, 1, 0, 0) [[BUFFER]], i32 0) +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <2 x i32>, i1 } [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = call double @llvm.dx.asdouble.i32(i32 [[TMP3]], i32 [[TMP4]]) +; CHECK-NEXT: [[TMP6:%.*]] = insertvalue { double, i1 } poison, double [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <2 x i32>, i1 } [[TMP1]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } [[TMP6]], i1 [[TMP7]], 1 +; CHECK-NEXT: ret { double, i1 } 
[[TMP8]] +; + %buffer = tail call target("dx.TypedBuffer", double, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_f64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null) + %result = call { double, i1 } @llvm.dx.resource.load.typedbuffer.tdx.TypedBuffer_f64_1_0_0t( + target("dx.TypedBuffer", double, 1, 0, 0) %buffer, i32 0) + ret { double, i1 } %result +} + +define { <2 x double>, i1 } @loadAndReturnv2f64() { +; CHECK-LABEL: define { <2 x double>, i1 } @loadAndReturnv2f64() { +; CHECK-NEXT: [[BUFFER:%.*]] = tail call target("dx.TypedBuffer", <2 x double>, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v2f64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null) +; CHECK-NEXT: [[TMP1:%.*]] = call { <4 x i32>, i1 } @llvm.dx.resource.load.typedbuffer.v4i32.tdx.TypedBuffer_v2f64_1_0_0t(target("dx.TypedBuffer", <2 x double>, 1, 0, 0) [[BUFFER]], i32 0) +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, i1 } [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; CHECK-NEXT: [[TMP7:%.*]] = call double @llvm.dx.asdouble.i32(i32 [[TMP3]], i32 [[TMP4]]) +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> poison, double [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = call double @llvm.dx.asdouble.i32(i32 [[TMP5]], i32 [[TMP6]]) +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> [[TMP8]], double [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertvalue { <2 x double>, i1 } poison, <2 x double> [[TMP10]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, i1 } [[TMP1]], 1 +; CHECK-NEXT: [[TMP13:%.*]] = insertvalue { <2 x double>, i1 } [[TMP11]], i1 [[TMP12]], 1 +; CHECK-NEXT: ret { <2 x double>, i1 } [[TMP13]] +; + %buffer = tail call target("dx.TypedBuffer", <2 x double>, 1, 0, 0) 
@llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v2f64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null) + %result = call { <2 x double>, i1 } @llvm.dx.resource.load.typedbuffer.tdx.TypedBuffer_v2f64_1_0_0t( + target("dx.TypedBuffer", <2 x double>, 1, 0, 0) %buffer, i32 0) + ret { <2 x double>, i1 } %result +} diff --git a/llvm/test/CodeGen/DirectX/BufferStoreInt64.ll b/llvm/test/CodeGen/DirectX/BufferStoreInt64.ll new file mode 100644 index 0000000000000..efb7c0ac104ed --- /dev/null +++ b/llvm/test/CodeGen/DirectX/BufferStoreInt64.ll @@ -0,0 +1,46 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -dxil-intrinsic-expansion %s | FileCheck %s + +target triple = "dxil-pc-shadermodel6.6-compute" + +define void @storei64(i64 %0) { +; CHECK-LABEL: define void @storei64( +; CHECK-SAME: i64 [[TMP0:%.*]]) { +; CHECK-NEXT: [[BUFFER:%.*]] = tail call target("dx.TypedBuffer", i64, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null) +; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP0]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP0]], 32 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i32 1 +; CHECK-NEXT: call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_i64_1_0_0t.v2i32(target("dx.TypedBuffer", i64, 1, 0, 0) [[BUFFER]], i32 0, <2 x i32> [[TMP6]]) +; CHECK-NEXT: ret void +; + %buffer = tail call target("dx.TypedBuffer", i64, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null) + call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_i64_1_0_0t(target("dx.TypedBuffer", i64, 1, 0, 0) %buffer, i32 0,i64 %0) + ret void +} + + +define void @storev2i64(<2 x i64> %0) { +; CHECK-LABEL: define void 
@storev2i64( +; CHECK-SAME: <2 x i64> [[TMP0:%.*]]) { +; CHECK-NEXT: [[BUFFER:%.*]] = tail call target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v2i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null) +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i64 [[TMP2]], 32 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP4]] to i32 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP8]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = lshr i64 [[TMP8]], 32 +; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP9]], i32 2 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP11]], i32 3 +; CHECK-NEXT: call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v2i64_1_0_0t.v4i32(target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) [[BUFFER]], i32 0, <4 x i32> [[TMP13]]) +; CHECK-NEXT: ret void +; + %buffer = tail call target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v2i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null) + call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v2i64_1_0_0t(target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) %buffer, i32 0, <2 x i64> %0) + ret void +} From ecd32db1d289d0bfd361db4fb823de4c7ae7beea Mon Sep 17 00:00:00 2001 From: Farzon Lotfi Date: Fri, 20 Jun 2025 10:58:21 -0400 Subject: [PATCH 2/7] minimize code diff between double and i64 --- .../Target/DirectX/DXILIntrinsicExpansion.cpp | 74 +++++++++---------- 1 file changed, 33 insertions(+), 41 deletions(-) diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp 
b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp index eb9268e78a9ad..45d8e497165cf 100644 --- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp +++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp @@ -651,58 +651,56 @@ static bool expandTypedBufferStoreIntrinsic(CallInst *Orig) { assert((IsDouble || ScalarTy->isIntegerTy(64)) && "Only expand double or int64 scalars or vectors"); - unsigned ExtractNum = 2; - if (auto *VT = dyn_cast(BufferTy)) { - assert(VT->getNumElements() == 2 && + // Determine if we're dealing with a vector or scalar + bool IsVector = isa(BufferTy); + if (IsVector) { + assert(cast(BufferTy)->getNumElements() == 2 && "TypedBufferStore vector must be size 2"); - ExtractNum = 4; } + + // Create the appropriate vector type for the result + Type *Int32Ty = Builder.getInt32Ty(); + Type *ResultTy = VectorType::get(Int32Ty, IsVector ? 4 : 2, false); + Value *Val = PoisonValue::get(ResultTy); + + // Split the 64-bit values into 32-bit components if (IsDouble) { - Type *SplitElementTy = Builder.getInt32Ty(); - if (ExtractNum == 4) + // Handle double type(s) + Type *SplitElementTy = Int32Ty; + if (IsVector) SplitElementTy = VectorType::get(SplitElementTy, 2, false); - // Handle double type(s) - keep original behavior auto *SplitTy = llvm::StructType::get(SplitElementTy, SplitElementTy); Value *Split = Builder.CreateIntrinsic(SplitTy, Intrinsic::dx_splitdouble, {Orig->getOperand(2)}); - // create our vector Value *LowBits = Builder.CreateExtractValue(Split, 0); Value *HighBits = Builder.CreateExtractValue(Split, 1); - Value *Val; - if (ExtractNum == 2) { - Val = PoisonValue::get(VectorType::get(Builder.getInt32Ty(), 2, false)); + + if (IsVector) { + // For vector doubles, use shuffle to create the final vector + Val = Builder.CreateShuffleVector(LowBits, HighBits, {0, 2, 1, 3}); + } else { + // For scalar doubles, insert the elements Val = Builder.CreateInsertElement(Val, LowBits, Builder.getInt32(0)); Val = Builder.CreateInsertElement(Val, 
HighBits, Builder.getInt32(1)); - } else - Val = Builder.CreateShuffleVector(LowBits, HighBits, {0, 2, 1, 3}); - - Builder.CreateIntrinsic(Builder.getVoidTy(), - Intrinsic::dx_resource_store_typedbuffer, - {Orig->getOperand(0), Orig->getOperand(1), Val}); + } } else { // Handle int64 type(s) Value *InputVal = Orig->getOperand(2); - Value *Val; - if (ExtractNum == 4) { + if (IsVector) { // Handle vector of int64 - Type *Int32x4Ty = VectorType::get(Builder.getInt32Ty(), 4, false); - Val = PoisonValue::get(Int32x4Ty); - for (unsigned I = 0; I < 2; ++I) { // Extract each int64 element Value *Int64Val = Builder.CreateExtractElement(InputVal, Builder.getInt32(I)); - // Get low 32 bits by truncating to i32 - Value *LowBits = Builder.CreateTrunc(Int64Val, Builder.getInt32Ty()); - - // Get high 32 bits by shifting right by 32 and truncating + // Split into low and high 32-bit parts + Value *LowBits = Builder.CreateTrunc(Int64Val, Int32Ty); Value *ShiftedVal = Builder.CreateLShr(Int64Val, Builder.getInt64(32)); - Value *HighBits = Builder.CreateTrunc(ShiftedVal, Builder.getInt32Ty()); + Value *HighBits = Builder.CreateTrunc(ShiftedVal, Int32Ty); - // Insert into our final vector + // Insert into result vector Val = Builder.CreateInsertElement(Val, LowBits, Builder.getInt32(I * 2)); Val = Builder.CreateInsertElement(Val, HighBits, @@ -710,26 +708,20 @@ static bool expandTypedBufferStoreIntrinsic(CallInst *Orig) { } } else { // Handle scalar int64 - Type *Int32x2Ty = VectorType::get(Builder.getInt32Ty(), 2, false); - Val = PoisonValue::get(Int32x2Ty); - - // Get low 32 bits by truncating to i32 - Value *LowBits = Builder.CreateTrunc(InputVal, Builder.getInt32Ty()); - - // Get high 32 bits by shifting right by 32 and truncating + Value *LowBits = Builder.CreateTrunc(InputVal, Int32Ty); Value *ShiftedVal = Builder.CreateLShr(InputVal, Builder.getInt64(32)); - Value *HighBits = Builder.CreateTrunc(ShiftedVal, Builder.getInt32Ty()); + Value *HighBits = 
Builder.CreateTrunc(ShiftedVal, Int32Ty); - // Insert into our final vector Val = Builder.CreateInsertElement(Val, LowBits, Builder.getInt32(0)); Val = Builder.CreateInsertElement(Val, HighBits, Builder.getInt32(1)); } - - Builder.CreateIntrinsic(Builder.getVoidTy(), - Intrinsic::dx_resource_store_typedbuffer, - {Orig->getOperand(0), Orig->getOperand(1), Val}); } + // Create the final intrinsic call + Builder.CreateIntrinsic(Builder.getVoidTy(), + Intrinsic::dx_resource_store_typedbuffer, + {Orig->getOperand(0), Orig->getOperand(1), Val}); + Orig->eraseFromParent(); return true; } From 8d8782d45ebf0e3b1f77778e91b1c2c7fc1161cd Mon Sep 17 00:00:00 2001 From: Farzon Lotfi Date: Fri, 20 Jun 2025 14:16:19 -0400 Subject: [PATCH 3/7] remove return handling --- .../Target/DirectX/DXILIntrinsicExpansion.cpp | 18 ++------ llvm/test/CodeGen/DirectX/BufferLoadDouble.ll | 2 +- llvm/test/CodeGen/DirectX/BufferLoadInt64.ll | 28 +++++------- .../test/CodeGen/DirectX/BufferStoreDouble.ll | 43 ------------------- 4 files changed, 14 insertions(+), 77 deletions(-) diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp index 45d8e497165cf..d50279461800e 100644 --- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp +++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp @@ -605,23 +605,11 @@ static bool expandTypedBufferLoadIntrinsic(CallInst *Orig) { Value *CheckBit = nullptr; for (User *U : make_early_inc_range(Orig->users())) { - if (auto *Ret = dyn_cast(U)) { - // For return instructions, we need to handle the case where the function - // is directly returning the result of the call - Type *RetTy = Ret->getFunction()->getReturnType(); - Value *StructRet = PoisonValue::get(RetTy); - StructRet = Builder.CreateInsertValue(StructRet, Result, {0}); - Value *CheckBitForRet = Builder.CreateExtractValue(Load, {1}); - StructRet = Builder.CreateInsertValue(StructRet, CheckBitForRet, {1}); - Ret->setOperand(0, StructRet); - 
continue; - } + // If it's not a ExtractValueInst, we don't know how to + // handle it auto *EVI = dyn_cast(U); - if (!EVI) { - // If it's not a ReturnInst or ExtractValueInst, we don't know how to - // handle it + if (!EVI) llvm_unreachable("Unexpected user of typedbufferload"); - } ArrayRef Indices = EVI->getIndices(); assert(Indices.size() == 1); diff --git a/llvm/test/CodeGen/DirectX/BufferLoadDouble.ll b/llvm/test/CodeGen/DirectX/BufferLoadDouble.ll index af3ec9df37967..25abf2111060c 100644 --- a/llvm/test/CodeGen/DirectX/BufferLoadDouble.ll +++ b/llvm/test/CodeGen/DirectX/BufferLoadDouble.ll @@ -1,6 +1,6 @@ ; RUN: opt -S -dxil-intrinsic-expansion %s | FileCheck %s -target triple = "dxil-pc-shadermodel6.2-compute" +target triple = "dxil-pc-shadermodel6.6-compute" define void @loadf64() { ; check the handle from binding is unchanged diff --git a/llvm/test/CodeGen/DirectX/BufferLoadInt64.ll b/llvm/test/CodeGen/DirectX/BufferLoadInt64.ll index cea475524945c..42c0012ff3475 100644 --- a/llvm/test/CodeGen/DirectX/BufferLoadInt64.ll +++ b/llvm/test/CodeGen/DirectX/BufferLoadInt64.ll @@ -3,8 +3,8 @@ target triple = "dxil-pc-shadermodel6.2-compute" -define { i64, i1 } @loadi64() { -; CHECK-LABEL: define { i64, i1 } @loadi64() { +define void @loadi64() { +; CHECK-LABEL: define void @loadi64() { ; CHECK-NEXT: [[BUFFER:%.*]] = tail call target("dx.TypedBuffer", i64, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null) ; CHECK-NEXT: [[TMP1:%.*]] = call { <2 x i32>, i1 } @llvm.dx.resource.load.typedbuffer.v2i32.tdx.TypedBuffer_i64_1_0_0t(target("dx.TypedBuffer", i64, 1, 0, 0) [[BUFFER]], i32 0) ; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <2 x i32>, i1 } [[TMP1]], 0 @@ -14,19 +14,15 @@ define { i64, i1 } @loadi64() { ; CHECK-NEXT: [[TMP6:%.*]] = zext i32 [[TMP4]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 32 ; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = 
insertvalue { i64, i1 } poison, i64 [[TMP8]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <2 x i32>, i1 } [[TMP1]], 1 -; CHECK-NEXT: [[TMP11:%.*]] = insertvalue { i64, i1 } [[TMP9]], i1 [[TMP10]], 1 -; CHECK-NEXT: ret { i64, i1 } [[TMP11]] +; CHECK-NEXT: ret void ; %buffer = tail call target("dx.TypedBuffer", i64, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null) - %result = call { i64, i1 } @llvm.dx.resource.load.typedbuffer.tdx.TypedBuffer_i64_1_0_0t( - target("dx.TypedBuffer", i64, 1, 0, 0) %buffer, i32 0) - ret { i64, i1 } %result + %result = call { i64, i1 } @llvm.dx.resource.load.typedbuffer.tdx.TypedBuffer_i64_1_0_0t(target("dx.TypedBuffer", i64, 1, 0, 0) %buffer, i32 0) + ret void } -define { <2 x i64>, i1 } @loadv2i64() { -; CHECK-LABEL: define { <2 x i64>, i1 } @loadv2i64() { +define void @loadv2i64() { +; CHECK-LABEL: define void @loadv2i64() { ; CHECK-NEXT: [[BUFFER:%.*]] = tail call target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v2i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null) ; CHECK-NEXT: [[TMP1:%.*]] = call { <4 x i32>, i1 } @llvm.dx.resource.load.typedbuffer.v4i32.tdx.TypedBuffer_v2i64_1_0_0t(target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) [[BUFFER]], i32 0) ; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, i1 } [[TMP1]], 0 @@ -44,13 +40,9 @@ define { <2 x i64>, i1 } @loadv2i64() { ; CHECK-NEXT: [[TMP14:%.*]] = shl i64 [[TMP13]], 32 ; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP12]], [[TMP14]] ; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP15]], i32 1 -; CHECK-NEXT: [[TMP17:%.*]] = insertvalue { <2 x i64>, i1 } poison, <2 x i64> [[TMP16]], 0 -; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <4 x i32>, i1 } [[TMP1]], 1 -; CHECK-NEXT: [[TMP19:%.*]] = insertvalue { <2 x i64>, i1 } [[TMP17]], i1 [[TMP18]], 1 -; CHECK-NEXT: ret { <2 x i64>, i1 } [[TMP19]] +; CHECK-NEXT: ret void ; %buffer = tail 
call target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v2i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null) - %result = call { <2 x i64>, i1 } @llvm.dx.resource.load.typedbuffer.tdx.TypedBuffer_v2i64_1_0_0t( - target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) %buffer, i32 0) - ret { <2 x i64>, i1 } %result + %result = call { <2 x i64>, i1 } @llvm.dx.resource.load.typedbuffer.tdx.TypedBuffer_v2i64_1_0_0t(target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) %buffer, i32 0) + ret void } diff --git a/llvm/test/CodeGen/DirectX/BufferStoreDouble.ll b/llvm/test/CodeGen/DirectX/BufferStoreDouble.ll index 882948b6dce74..9c3dab0cc1e46 100644 --- a/llvm/test/CodeGen/DirectX/BufferStoreDouble.ll +++ b/llvm/test/CodeGen/DirectX/BufferStoreDouble.ll @@ -45,46 +45,3 @@ define void @storev2f64(<2 x double> %0) { <2 x double> %0) ret void } - -define { double, i1 } @loadAndReturnf64() { -; CHECK-LABEL: define { double, i1 } @loadAndReturnf64() { -; CHECK-NEXT: [[BUFFER:%.*]] = tail call target("dx.TypedBuffer", double, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_f64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null) -; CHECK-NEXT: [[TMP1:%.*]] = call { <2 x i32>, i1 } @llvm.dx.resource.load.typedbuffer.v2i32.tdx.TypedBuffer_f64_1_0_0t(target("dx.TypedBuffer", double, 1, 0, 0) [[BUFFER]], i32 0) -; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <2 x i32>, i1 } [[TMP1]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = call double @llvm.dx.asdouble.i32(i32 [[TMP3]], i32 [[TMP4]]) -; CHECK-NEXT: [[TMP6:%.*]] = insertvalue { double, i1 } poison, double [[TMP5]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <2 x i32>, i1 } [[TMP1]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } [[TMP6]], i1 [[TMP7]], 1 -; CHECK-NEXT: ret { double, i1 } [[TMP8]] -; - %buffer = tail call 
target("dx.TypedBuffer", double, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_f64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null) - %result = call { double, i1 } @llvm.dx.resource.load.typedbuffer.tdx.TypedBuffer_f64_1_0_0t( - target("dx.TypedBuffer", double, 1, 0, 0) %buffer, i32 0) - ret { double, i1 } %result -} - -define { <2 x double>, i1 } @loadAndReturnv2f64() { -; CHECK-LABEL: define { <2 x double>, i1 } @loadAndReturnv2f64() { -; CHECK-NEXT: [[BUFFER:%.*]] = tail call target("dx.TypedBuffer", <2 x double>, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v2f64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null) -; CHECK-NEXT: [[TMP1:%.*]] = call { <4 x i32>, i1 } @llvm.dx.resource.load.typedbuffer.v4i32.tdx.TypedBuffer_v2f64_1_0_0t(target("dx.TypedBuffer", <2 x double>, 1, 0, 0) [[BUFFER]], i32 0) -; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, i1 } [[TMP1]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = call double @llvm.dx.asdouble.i32(i32 [[TMP3]], i32 [[TMP4]]) -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> poison, double [[TMP7]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = call double @llvm.dx.asdouble.i32(i32 [[TMP5]], i32 [[TMP6]]) -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> [[TMP8]], double [[TMP9]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = insertvalue { <2 x double>, i1 } poison, <2 x double> [[TMP10]], 0 -; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, i1 } [[TMP1]], 1 -; CHECK-NEXT: [[TMP13:%.*]] = insertvalue { <2 x double>, i1 } [[TMP11]], i1 [[TMP12]], 1 -; CHECK-NEXT: ret { <2 x double>, i1 } [[TMP13]] -; - %buffer = tail call target("dx.TypedBuffer", <2 x double>, 1, 0, 0) 
@llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v2f64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null) - %result = call { <2 x double>, i1 } @llvm.dx.resource.load.typedbuffer.tdx.TypedBuffer_v2f64_1_0_0t( - target("dx.TypedBuffer", <2 x double>, 1, 0, 0) %buffer, i32 0) - ret { <2 x double>, i1 } %result -} From 3de2cdf01a202cbb9c8a74bc42e79e80bcda907f Mon Sep 17 00:00:00 2001 From: Farzon Lotfi Date: Thu, 26 Jun 2025 17:34:09 -0400 Subject: [PATCH 4/7] address pr comments --- .../Target/DirectX/DXILIntrinsicExpansion.cpp | 92 ++++++++----------- llvm/test/CodeGen/DirectX/BufferStoreInt64.ll | 16 +--- 2 files changed, 43 insertions(+), 65 deletions(-) diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp index d50279461800e..1d92f995cc57d 100644 --- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp +++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp @@ -544,6 +544,18 @@ static Value *expandRadiansIntrinsic(CallInst *Orig) { return Builder.CreateFMul(X, PiOver180); } +static Value* createCombinedi32toi64Expansion(IRBuilder<> &Builder, Value *LoBytes, Value *HighBytes) { + // For int64, manually combine two int32s + // First, zero-extend both values to i64 + Value *Lo = Builder.CreateZExt(LoBytes, Builder.getInt64Ty()); + Value *Hi = + Builder.CreateZExt(HighBytes, Builder.getInt64Ty()); + // Shift the high bits left by 32 bits + Value *ShiftedHi = Builder.CreateShl(Hi, Builder.getInt64(32)); + // OR the high and low bits together + return Builder.CreateOr(Lo, ShiftedHi); +} + static bool expandTypedBufferLoadIntrinsic(CallInst *Orig) { IRBuilder<> Builder(Orig); @@ -579,22 +591,14 @@ static bool expandTypedBufferLoadIntrinsic(CallInst *Orig) { Value *Result = PoisonValue::get(BufferTy); for (unsigned I = 0; I < ExtractNum; I += 2) { Value *Combined = nullptr; - if (IsDouble) { + if (IsDouble) // For doubles, use dx_asdouble intrinsic Combined = 
Builder.CreateIntrinsic(Builder.getDoubleTy(), Intrinsic::dx_asdouble, {ExtractElements[I], ExtractElements[I + 1]}); - } else { - // For int64, manually combine two int32s - // First, zero-extend both values to i64 - Value *Lo = Builder.CreateZExt(ExtractElements[I], Builder.getInt64Ty()); - Value *Hi = - Builder.CreateZExt(ExtractElements[I + 1], Builder.getInt64Ty()); - // Shift the high bits left by 32 bits - Value *ShiftedHi = Builder.CreateShl(Hi, Builder.getInt64(32)); - // OR the high and low bits together - Combined = Builder.CreateOr(Lo, ShiftedHi); - } + else + Combined = + createCombinedi32toi64Expansion(Builder, ExtractElements[I], ExtractElements[I + 1]); if (ExtractNum == 4) Result = Builder.CreateInsertElement(Result, Combined, @@ -650,60 +654,42 @@ static bool expandTypedBufferStoreIntrinsic(CallInst *Orig) { Type *Int32Ty = Builder.getInt32Ty(); Type *ResultTy = VectorType::get(Int32Ty, IsVector ? 4 : 2, false); Value *Val = PoisonValue::get(ResultTy); - + + // Handle double type(s) + Type *SplitElementTy = Int32Ty; + if (IsVector) + SplitElementTy = VectorType::get(SplitElementTy, 2, false); + + Value *LowBits = nullptr; + Value *HighBits = nullptr; // Split the 64-bit values into 32-bit components if (IsDouble) { - // Handle double type(s) - Type *SplitElementTy = Int32Ty; - if (IsVector) - SplitElementTy = VectorType::get(SplitElementTy, 2, false); - auto *SplitTy = llvm::StructType::get(SplitElementTy, SplitElementTy); Value *Split = Builder.CreateIntrinsic(SplitTy, Intrinsic::dx_splitdouble, {Orig->getOperand(2)}); - Value *LowBits = Builder.CreateExtractValue(Split, 0); - Value *HighBits = Builder.CreateExtractValue(Split, 1); - - if (IsVector) { - // For vector doubles, use shuffle to create the final vector - Val = Builder.CreateShuffleVector(LowBits, HighBits, {0, 2, 1, 3}); - } else { - // For scalar doubles, insert the elements - Val = Builder.CreateInsertElement(Val, LowBits, Builder.getInt32(0)); - Val = 
Builder.CreateInsertElement(Val, HighBits, Builder.getInt32(1)); - } + LowBits = Builder.CreateExtractValue(Split, 0); + HighBits = Builder.CreateExtractValue(Split, 1); } else { // Handle int64 type(s) Value *InputVal = Orig->getOperand(2); + Constant *ShiftAmt = Builder.getInt64(32); + if (IsVector) + ShiftAmt = ConstantVector::getSplat(ElementCount::getFixed(2), ShiftAmt); - if (IsVector) { - // Handle vector of int64 - for (unsigned I = 0; I < 2; ++I) { - // Extract each int64 element - Value *Int64Val = - Builder.CreateExtractElement(InputVal, Builder.getInt32(I)); - - // Split into low and high 32-bit parts - Value *LowBits = Builder.CreateTrunc(Int64Val, Int32Ty); - Value *ShiftedVal = Builder.CreateLShr(Int64Val, Builder.getInt64(32)); - Value *HighBits = Builder.CreateTrunc(ShiftedVal, Int32Ty); - - // Insert into result vector - Val = - Builder.CreateInsertElement(Val, LowBits, Builder.getInt32(I * 2)); - Val = Builder.CreateInsertElement(Val, HighBits, - Builder.getInt32(I * 2 + 1)); - } - } else { - // Handle scalar int64 - Value *LowBits = Builder.CreateTrunc(InputVal, Int32Ty); - Value *ShiftedVal = Builder.CreateLShr(InputVal, Builder.getInt64(32)); - Value *HighBits = Builder.CreateTrunc(ShiftedVal, Int32Ty); + // Split into low and high 32-bit parts + LowBits = Builder.CreateTrunc(InputVal, SplitElementTy); + Value *ShiftedVal = Builder.CreateLShr(InputVal, ShiftAmt); + HighBits = Builder.CreateTrunc(ShiftedVal, SplitElementTy); + } + if (IsVector) { + // For vector doubles, use shuffle to create the final vector + Val = Builder.CreateShuffleVector(LowBits, HighBits, {0, 2, 1, 3}); + } else { + // For scalar doubles, insert the elements Val = Builder.CreateInsertElement(Val, LowBits, Builder.getInt32(0)); Val = Builder.CreateInsertElement(Val, HighBits, Builder.getInt32(1)); } - } // Create the final intrinsic call Builder.CreateIntrinsic(Builder.getVoidTy(), diff --git a/llvm/test/CodeGen/DirectX/BufferStoreInt64.ll 
b/llvm/test/CodeGen/DirectX/BufferStoreInt64.ll index efb7c0ac104ed..c97a02d1873a0 100644 --- a/llvm/test/CodeGen/DirectX/BufferStoreInt64.ll +++ b/llvm/test/CodeGen/DirectX/BufferStoreInt64.ll @@ -25,18 +25,10 @@ define void @storev2i64(<2 x i64> %0) { ; CHECK-LABEL: define void @storev2i64( ; CHECK-SAME: <2 x i64> [[TMP0:%.*]]) { ; CHECK-NEXT: [[BUFFER:%.*]] = tail call target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v2i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[TMP4:%.*]] = lshr i64 [[TMP2]], 32 -; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP4]] to i32 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP5]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP8]] to i32 -; CHECK-NEXT: [[TMP10:%.*]] = lshr i64 [[TMP8]], 32 -; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP9]], i32 2 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP11]], i32 3 +; CHECK-NEXT: [[TMP2:%.*]] = trunc <2 x i64> [[TMP0]] to <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = lshr <2 x i64> [[TMP0]], splat (i64 32) +; CHECK-NEXT: [[TMP4:%.*]] = trunc <2 x i64> [[TMP3]] to <2 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP4]], <4 x i32> ; CHECK-NEXT: call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v2i64_1_0_0t.v4i32(target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) [[BUFFER]], i32 0, <4 x i32> [[TMP13]]) ; CHECK-NEXT: ret void ; From 45dbfb14541d714a875024dd8ac1c9b14b1941b8 Mon Sep 17 00:00:00 2001 From: Farzon Lotfi Date: Thu, 26 Jun 2025 18:48:52 -0400 Subject: [PATCH 5/7] 
fix formatting --- .../Target/DirectX/DXILIntrinsicExpansion.cpp | 31 ++++++++++--------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp index 1d92f995cc57d..dcf26185bc925 100644 --- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp +++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp @@ -544,12 +544,13 @@ static Value *expandRadiansIntrinsic(CallInst *Orig) { return Builder.CreateFMul(X, PiOver180); } -static Value* createCombinedi32toi64Expansion(IRBuilder<> &Builder, Value *LoBytes, Value *HighBytes) { +static Value *createCombinedi32toi64Expansion(IRBuilder<> &Builder, + Value *LoBytes, + Value *HighBytes) { // For int64, manually combine two int32s // First, zero-extend both values to i64 Value *Lo = Builder.CreateZExt(LoBytes, Builder.getInt64Ty()); - Value *Hi = - Builder.CreateZExt(HighBytes, Builder.getInt64Ty()); + Value *Hi = Builder.CreateZExt(HighBytes, Builder.getInt64Ty()); // Shift the high bits left by 32 bits Value *ShiftedHi = Builder.CreateShl(Hi, Builder.getInt64(32)); // OR the high and low bits together @@ -591,14 +592,14 @@ static bool expandTypedBufferLoadIntrinsic(CallInst *Orig) { Value *Result = PoisonValue::get(BufferTy); for (unsigned I = 0; I < ExtractNum; I += 2) { Value *Combined = nullptr; - if (IsDouble) + if (IsDouble) // For doubles, use dx_asdouble intrinsic Combined = Builder.CreateIntrinsic(Builder.getDoubleTy(), Intrinsic::dx_asdouble, {ExtractElements[I], ExtractElements[I + 1]}); else - Combined = - createCombinedi32toi64Expansion(Builder, ExtractElements[I], ExtractElements[I + 1]); + Combined = createCombinedi32toi64Expansion(Builder, ExtractElements[I], + ExtractElements[I + 1]); if (ExtractNum == 4) Result = Builder.CreateInsertElement(Result, Combined, @@ -654,7 +655,7 @@ static bool expandTypedBufferStoreIntrinsic(CallInst *Orig) { Type *Int32Ty = Builder.getInt32Ty(); Type *ResultTy = 
VectorType::get(Int32Ty, IsVector ? 4 : 2, false); Value *Val = PoisonValue::get(ResultTy); - + // Handle double type(s) Type *SplitElementTy = Int32Ty; if (IsVector) @@ -672,7 +673,7 @@ static bool expandTypedBufferStoreIntrinsic(CallInst *Orig) { } else { // Handle int64 type(s) Value *InputVal = Orig->getOperand(2); - Constant *ShiftAmt = Builder.getInt64(32); + Constant *ShiftAmt = Builder.getInt64(32); if (IsVector) ShiftAmt = ConstantVector::getSplat(ElementCount::getFixed(2), ShiftAmt); @@ -683,13 +684,13 @@ static bool expandTypedBufferStoreIntrinsic(CallInst *Orig) { } if (IsVector) { - // For vector doubles, use shuffle to create the final vector - Val = Builder.CreateShuffleVector(LowBits, HighBits, {0, 2, 1, 3}); - } else { - // For scalar doubles, insert the elements - Val = Builder.CreateInsertElement(Val, LowBits, Builder.getInt32(0)); - Val = Builder.CreateInsertElement(Val, HighBits, Builder.getInt32(1)); - } + // For vector doubles, use shuffle to create the final vector + Val = Builder.CreateShuffleVector(LowBits, HighBits, {0, 2, 1, 3}); + } else { + // For scalar doubles, insert the elements + Val = Builder.CreateInsertElement(Val, LowBits, Builder.getInt32(0)); + Val = Builder.CreateInsertElement(Val, HighBits, Builder.getInt32(1)); + } // Create the final intrinsic call Builder.CreateIntrinsic(Builder.getVoidTy(), From cca0e2d91bf46af0a9fdd0893bc6bab22527ca0b Mon Sep 17 00:00:00 2001 From: Farzon Lotfi Date: Fri, 27 Jun 2025 13:09:40 -0400 Subject: [PATCH 6/7] address pr feedback --- llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp index dcf26185bc925..7d0e3cc0cdd6e 100644 --- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp +++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp @@ -656,7 +656,6 @@ static bool expandTypedBufferStoreIntrinsic(CallInst *Orig) { Type *ResultTy = 
VectorType::get(Int32Ty, IsVector ? 4 : 2, false); Value *Val = PoisonValue::get(ResultTy); - // Handle double type(s) Type *SplitElementTy = Int32Ty; if (IsVector) SplitElementTy = VectorType::get(SplitElementTy, 2, false); @@ -684,10 +683,8 @@ static bool expandTypedBufferStoreIntrinsic(CallInst *Orig) { } if (IsVector) { - // For vector doubles, use shuffle to create the final vector Val = Builder.CreateShuffleVector(LowBits, HighBits, {0, 2, 1, 3}); } else { - // For scalar doubles, insert the elements Val = Builder.CreateInsertElement(Val, LowBits, Builder.getInt32(0)); Val = Builder.CreateInsertElement(Val, HighBits, Builder.getInt32(1)); } From 04232d9cb875b3225c0baa08a682657e7f965afa Mon Sep 17 00:00:00 2001 From: Farzon Lotfi Date: Fri, 27 Jun 2025 13:46:52 -0400 Subject: [PATCH 7/7] compromise was to revert back to original --- .../Target/DirectX/DXILIntrinsicExpansion.cpp | 27 ++++++++----------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp index 7d0e3cc0cdd6e..435b80ecaec64 100644 --- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp +++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp @@ -544,19 +544,6 @@ static Value *expandRadiansIntrinsic(CallInst *Orig) { return Builder.CreateFMul(X, PiOver180); } -static Value *createCombinedi32toi64Expansion(IRBuilder<> &Builder, - Value *LoBytes, - Value *HighBytes) { - // For int64, manually combine two int32s - // First, zero-extend both values to i64 - Value *Lo = Builder.CreateZExt(LoBytes, Builder.getInt64Ty()); - Value *Hi = Builder.CreateZExt(HighBytes, Builder.getInt64Ty()); - // Shift the high bits left by 32 bits - Value *ShiftedHi = Builder.CreateShl(Hi, Builder.getInt64(32)); - // OR the high and low bits together - return Builder.CreateOr(Lo, ShiftedHi); -} - static bool expandTypedBufferLoadIntrinsic(CallInst *Orig) { IRBuilder<> Builder(Orig); @@ -597,9 +584,17 @@
static bool expandTypedBufferLoadIntrinsic(CallInst *Orig) { Combined = Builder.CreateIntrinsic(Builder.getDoubleTy(), Intrinsic::dx_asdouble, {ExtractElements[I], ExtractElements[I + 1]}); - else - Combined = createCombinedi32toi64Expansion(Builder, ExtractElements[I], - ExtractElements[I + 1]); + else { + // For int64, manually combine two int32s + // First, zero-extend both values to i64 + Value *Lo = Builder.CreateZExt(ExtractElements[I], Builder.getInt64Ty()); + Value *Hi = + Builder.CreateZExt(ExtractElements[I + 1], Builder.getInt64Ty()); + // Shift the high bits left by 32 bits + Value *ShiftedHi = Builder.CreateShl(Hi, Builder.getInt64(32)); + // OR the high and low bits together + Combined = Builder.CreateOr(Lo, ShiftedHi); + } if (ExtractNum == 4) Result = Builder.CreateInsertElement(Result, Combined,