From 2c4695c05e3a7197cd57ce81893c4cce9956fa73 Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzonlotfi@microsoft.com>
Date: Wed, 18 Jun 2025 11:53:52 -0400
Subject: [PATCH 1/7] [DirectX] add support for i64 buffer load/stores

---
 .../Target/DirectX/DXILIntrinsicExpansion.cpp | 179 +++++++++++++-----
 llvm/test/CodeGen/DirectX/BufferLoadDouble.ll |   4 +-
 llvm/test/CodeGen/DirectX/BufferLoadInt64.ll  |  56 ++++++
 .../test/CodeGen/DirectX/BufferStoreDouble.ll |  43 +++++
 llvm/test/CodeGen/DirectX/BufferStoreInt64.ll |  46 +++++
 5 files changed, 281 insertions(+), 47 deletions(-)
 create mode 100644 llvm/test/CodeGen/DirectX/BufferLoadInt64.ll
 create mode 100644 llvm/test/CodeGen/DirectX/BufferStoreInt64.ll
diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
index f99e8e7ccdc5d..eb9268e78a9ad 100644
--- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
+++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
@@ -25,6 +25,7 @@
 #include "llvm/IR/PassManager.h"
 #include "llvm/IR/Type.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 
@@ -70,15 +71,17 @@ static bool isIntrinsicExpansion(Function &F) {
   case Intrinsic::vector_reduce_add:
   case Intrinsic::vector_reduce_fadd:
     return true;
-  case Intrinsic::dx_resource_load_typedbuffer:
-    // We need to handle doubles and vector of doubles.
-    return F.getReturnType()
-        ->getStructElementType(0)
-        ->getScalarType()
-        ->isDoubleTy();
-  case Intrinsic::dx_resource_store_typedbuffer:
-    // We need to handle doubles and vector of doubles.
-    return F.getFunctionType()->getParamType(2)->getScalarType()->isDoubleTy();
+  case Intrinsic::dx_resource_load_typedbuffer: {
+    // We need to handle i64, doubles, and vectors of them.
+    Type *ScalarTy =
+        F.getReturnType()->getStructElementType(0)->getScalarType();
+    return ScalarTy->isDoubleTy() || ScalarTy->isIntegerTy(64);
+  }
+  case Intrinsic::dx_resource_store_typedbuffer: {
+    // We need to handle i64 and doubles and vectors of i64 and doubles.
+    Type *ScalarTy = F.getFunctionType()->getParamType(2)->getScalarType();
+    return ScalarTy->isDoubleTy() || ScalarTy->isIntegerTy(64);
+  }
   }
   return false;
 }
@@ -545,13 +548,15 @@ static bool expandTypedBufferLoadIntrinsic(CallInst *Orig) {
   IRBuilder<> Builder(Orig);
 
   Type *BufferTy = Orig->getType()->getStructElementType(0);
-  assert(BufferTy->getScalarType()->isDoubleTy() &&
-         "Only expand double or double2");
+  Type *ScalarTy = BufferTy->getScalarType();
+  bool IsDouble = ScalarTy->isDoubleTy();
+  assert((IsDouble || ScalarTy->isIntegerTy(64)) &&
+         "Only expand double or int64 scalars or vectors");
 
   unsigned ExtractNum = 2;
   if (auto *VT = dyn_cast<FixedVectorType>(BufferTy)) {
     assert(VT->getNumElements() == 2 &&
-           "TypedBufferLoad double vector has wrong size");
+           "TypedBufferLoad vector must be size 2");
     ExtractNum = 4;
   }
 
@@ -570,22 +575,54 @@ static bool expandTypedBufferLoadIntrinsic(CallInst *Orig) {
     ExtractElements.push_back(
         Builder.CreateExtractElement(Extract, Builder.getInt32(I)));
 
-  // combine into double(s)
+  // combine into double(s) or int64(s)
   Value *Result = PoisonValue::get(BufferTy);
   for (unsigned I = 0; I < ExtractNum; I += 2) {
-    Value *Dbl =
-        Builder.CreateIntrinsic(Builder.getDoubleTy(), Intrinsic::dx_asdouble,
-                                {ExtractElements[I], ExtractElements[I + 1]});
+    Value *Combined = nullptr;
+    if (IsDouble) {
+      // For doubles, use dx_asdouble intrinsic
+      Combined =
+          Builder.CreateIntrinsic(Builder.getDoubleTy(), Intrinsic::dx_asdouble,
+                                  {ExtractElements[I], ExtractElements[I + 1]});
+    } else {
+      // For int64, manually combine two int32s
+      // First, zero-extend both values to i64
+      Value *Lo = Builder.CreateZExt(ExtractElements[I], Builder.getInt64Ty());
+      Value *Hi =
+          Builder.CreateZExt(ExtractElements[I + 1], Builder.getInt64Ty());
+      // Shift the high bits left by 32 bits
+      Value *ShiftedHi = Builder.CreateShl(Hi, Builder.getInt64(32));
+      // OR the high and low bits together
+      Combined = Builder.CreateOr(Lo, ShiftedHi);
+    }
+
     if (ExtractNum == 4)
-      Result =
-          Builder.CreateInsertElement(Result, Dbl, Builder.getInt32(I / 2));
+      Result = Builder.CreateInsertElement(Result, Combined,
+                                           Builder.getInt32(I / 2));
     else
-      Result = Dbl;
+      Result = Combined;
   }
 
   Value *CheckBit = nullptr;
   for (User *U : make_early_inc_range(Orig->users())) {
-    auto *EVI = cast<ExtractValueInst>(U);
+    if (auto *Ret = dyn_cast<ReturnInst>(U)) {
+      // For return instructions, we need to handle the case where the function
+      // is directly returning the result of the call
+      Type *RetTy = Ret->getFunction()->getReturnType();
+      Value *StructRet = PoisonValue::get(RetTy);
+      StructRet = Builder.CreateInsertValue(StructRet, Result, {0});
+      Value *CheckBitForRet = Builder.CreateExtractValue(Load, {1});
+      StructRet = Builder.CreateInsertValue(StructRet, CheckBitForRet, {1});
+      Ret->setOperand(0, StructRet);
+      continue;
+    }
+    auto *EVI = dyn_cast<ExtractValueInst>(U);
+    if (!EVI) {
+      // If it's not a ReturnInst or ExtractValueInst, we don't know how to
+      // handle it
+      llvm_unreachable("Unexpected user of typedbufferload");
+    }
+
     ArrayRef<unsigned> Indices = EVI->getIndices();
     assert(Indices.size() == 1);
 
@@ -609,38 +646,90 @@ static bool expandTypedBufferStoreIntrinsic(CallInst *Orig) {
   IRBuilder<> Builder(Orig);
 
   Type *BufferTy = Orig->getFunctionType()->getParamType(2);
-  assert(BufferTy->getScalarType()->isDoubleTy() &&
-         "Only expand double or double2");
+  Type *ScalarTy = BufferTy->getScalarType();
+  bool IsDouble = ScalarTy->isDoubleTy();
+  assert((IsDouble || ScalarTy->isIntegerTy(64)) &&
+         "Only expand double or int64 scalars or vectors");
 
   unsigned ExtractNum = 2;
   if (auto *VT = dyn_cast<FixedVectorType>(BufferTy)) {
     assert(VT->getNumElements() == 2 &&
-           "TypedBufferStore double vector has wrong size");
+           "TypedBufferStore vector must be size 2");
     ExtractNum = 4;
   }
+  if (IsDouble) {
+    Type *SplitElementTy = Builder.getInt32Ty();
+    if (ExtractNum == 4)
+      SplitElementTy = VectorType::get(SplitElementTy, 2, false);
+
+    // Handle double type(s) - keep original behavior
+    auto *SplitTy = llvm::StructType::get(SplitElementTy, SplitElementTy);
+    Value *Split = Builder.CreateIntrinsic(SplitTy, Intrinsic::dx_splitdouble,
+                                           {Orig->getOperand(2)});
+    // create our vector
+    Value *LowBits = Builder.CreateExtractValue(Split, 0);
+    Value *HighBits = Builder.CreateExtractValue(Split, 1);
+    Value *Val;
+    if (ExtractNum == 2) {
+      Val = PoisonValue::get(VectorType::get(Builder.getInt32Ty(), 2, false));
+      Val = Builder.CreateInsertElement(Val, LowBits, Builder.getInt32(0));
+      Val = Builder.CreateInsertElement(Val, HighBits, Builder.getInt32(1));
+    } else
+      Val = Builder.CreateShuffleVector(LowBits, HighBits, {0, 2, 1, 3});
+
+    Builder.CreateIntrinsic(Builder.getVoidTy(),
+                            Intrinsic::dx_resource_store_typedbuffer,
+                            {Orig->getOperand(0), Orig->getOperand(1), Val});
+  } else {
+    // Handle int64 type(s)
+    Value *InputVal = Orig->getOperand(2);
+    Value *Val;
+
+    if (ExtractNum == 4) {
+      // Handle vector of int64
+      Type *Int32x4Ty = VectorType::get(Builder.getInt32Ty(), 4, false);
+      Val = PoisonValue::get(Int32x4Ty);
+
+      for (unsigned I = 0; I < 2; ++I) {
+        // Extract each int64 element
+        Value *Int64Val =
+            Builder.CreateExtractElement(InputVal, Builder.getInt32(I));
+
+        // Get low 32 bits by truncating to i32
+        Value *LowBits = Builder.CreateTrunc(Int64Val, Builder.getInt32Ty());
+
+        // Get high 32 bits by shifting right by 32 and truncating
+        Value *ShiftedVal = Builder.CreateLShr(Int64Val, Builder.getInt64(32));
+        Value *HighBits = Builder.CreateTrunc(ShiftedVal, Builder.getInt32Ty());
+
+        // Insert into our final vector
+        Val =
+            Builder.CreateInsertElement(Val, LowBits, Builder.getInt32(I * 2));
+        Val = Builder.CreateInsertElement(Val, HighBits,
+                                          Builder.getInt32(I * 2 + 1));
+      }
+    } else {
+      // Handle scalar int64
+      Type *Int32x2Ty = VectorType::get(Builder.getInt32Ty(), 2, false);
+      Val = PoisonValue::get(Int32x2Ty);
+
+      // Get low 32 bits by truncating to i32
+      Value *LowBits = Builder.CreateTrunc(InputVal, Builder.getInt32Ty());
+
+      // Get high 32 bits by shifting right by 32 and truncating
+      Value *ShiftedVal = Builder.CreateLShr(InputVal, Builder.getInt64(32));
+      Value *HighBits = Builder.CreateTrunc(ShiftedVal, Builder.getInt32Ty());
+
+      // Insert into our final vector
+      Val = Builder.CreateInsertElement(Val, LowBits, Builder.getInt32(0));
+      Val = Builder.CreateInsertElement(Val, HighBits, Builder.getInt32(1));
+    }
+
+    Builder.CreateIntrinsic(Builder.getVoidTy(),
+                            Intrinsic::dx_resource_store_typedbuffer,
+                            {Orig->getOperand(0), Orig->getOperand(1), Val});
+  }
 
-  Type *SplitElementTy = Builder.getInt32Ty();
-  if (ExtractNum == 4)
-    SplitElementTy = VectorType::get(SplitElementTy, 2, false);
-
-  // split our double(s)
-  auto *SplitTy = llvm::StructType::get(SplitElementTy, SplitElementTy);
-  Value *Split = Builder.CreateIntrinsic(SplitTy, Intrinsic::dx_splitdouble,
-                                         Orig->getOperand(2));
-  // create our vector
-  Value *LowBits = Builder.CreateExtractValue(Split, 0);
-  Value *HighBits = Builder.CreateExtractValue(Split, 1);
-  Value *Val;
-  if (ExtractNum == 2) {
-    Val = PoisonValue::get(VectorType::get(SplitElementTy, 2, false));
-    Val = Builder.CreateInsertElement(Val, LowBits, Builder.getInt32(0));
-    Val = Builder.CreateInsertElement(Val, HighBits, Builder.getInt32(1));
-  } else
-    Val = Builder.CreateShuffleVector(LowBits, HighBits, {0, 2, 1, 3});
-
-  Builder.CreateIntrinsic(Builder.getVoidTy(),
-                          Intrinsic::dx_resource_store_typedbuffer,
-                          {Orig->getOperand(0), Orig->getOperand(1), Val});
   Orig->eraseFromParent();
   return true;
 }
diff --git a/llvm/test/CodeGen/DirectX/BufferLoadDouble.ll b/llvm/test/CodeGen/DirectX/BufferLoadDouble.ll
index 80a071a66364b..af3ec9df37967 100644
--- a/llvm/test/CodeGen/DirectX/BufferLoadDouble.ll
+++ b/llvm/test/CodeGen/DirectX/BufferLoadDouble.ll
@@ -1,6 +1,6 @@
 ; RUN: opt -S -dxil-intrinsic-expansion %s | FileCheck %s
 
-target triple = "dxil-pc-shadermodel6.6-compute"
+target triple = "dxil-pc-shadermodel6.2-compute"
 
 define void @loadf64() {
   ; check the handle from binding is unchanged
@@ -88,4 +88,4 @@ define void @loadf64WithCheckBit() {
   ; CHECK-NOT: extractvalue { double, i1 }
   %cb = extractvalue {double, i1} %load0, 1
   ret void
-}
\ No newline at end of file
+}
diff --git a/llvm/test/CodeGen/DirectX/BufferLoadInt64.ll b/llvm/test/CodeGen/DirectX/BufferLoadInt64.ll
new file mode 100644
index 0000000000000..cea475524945c
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/BufferLoadInt64.ll
@@ -0,0 +1,56 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -dxil-intrinsic-expansion %s | FileCheck %s
+
+target triple = "dxil-pc-shadermodel6.2-compute"
+
+define { i64, i1 } @loadi64() {
+; CHECK-LABEL: define { i64, i1 } @loadi64() {
+; CHECK-NEXT:    [[BUFFER:%.*]] = tail call target("dx.TypedBuffer", i64, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
+; CHECK-NEXT:    [[TMP1:%.*]] = call { <2 x i32>, i1 } @llvm.dx.resource.load.typedbuffer.v2i32.tdx.TypedBuffer_i64_1_0_0t(target("dx.TypedBuffer", i64, 1, 0, 0) [[BUFFER]], i32 0)
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <2 x i32>, i1 } [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
+; CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = insertvalue { i64, i1 } poison, i64 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <2 x i32>, i1 } [[TMP1]], 1
+; CHECK-NEXT:    [[TMP11:%.*]] = insertvalue { i64, i1 } [[TMP9]], i1 [[TMP10]], 1
+; CHECK-NEXT:    ret { i64, i1 } [[TMP11]]
+;
+  %buffer = tail call target("dx.TypedBuffer", i64, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
+  %result = call { i64, i1 } @llvm.dx.resource.load.typedbuffer.tdx.TypedBuffer_i64_1_0_0t(
+  target("dx.TypedBuffer", i64, 1, 0, 0) %buffer, i32 0)
+  ret { i64, i1 } %result
+}
+
+define { <2 x i64>, i1 } @loadv2i64() {
+; CHECK-LABEL: define { <2 x i64>, i1 } @loadv2i64() {
+; CHECK-NEXT:    [[BUFFER:%.*]] = tail call target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v2i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
+; CHECK-NEXT:    [[TMP1:%.*]] = call { <4 x i32>, i1 } @llvm.dx.resource.load.typedbuffer.v4i32.tdx.TypedBuffer_v2i64_1_0_0t(target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) [[BUFFER]], i32 0)
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x i32>, i1 } [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP4]] to i64
+; CHECK-NEXT:    [[TMP9:%.*]] = shl i64 [[TMP8]], 32
+; CHECK-NEXT:    [[TMP10:%.*]] = or i64 [[TMP7]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = zext i32 [[TMP5]] to i64
+; CHECK-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP6]] to i64
+; CHECK-NEXT:    [[TMP14:%.*]] = shl i64 [[TMP13]], 32
+; CHECK-NEXT:    [[TMP15:%.*]] = or i64 [[TMP12]], [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP15]], i32 1
+; CHECK-NEXT:    [[TMP17:%.*]] = insertvalue { <2 x i64>, i1 } poison, <2 x i64> [[TMP16]], 0
+; CHECK-NEXT:    [[TMP18:%.*]] = extractvalue { <4 x i32>, i1 } [[TMP1]], 1
+; CHECK-NEXT:    [[TMP19:%.*]] = insertvalue { <2 x i64>, i1 } [[TMP17]], i1 [[TMP18]], 1
+; CHECK-NEXT:    ret { <2 x i64>, i1 } [[TMP19]]
+;
+  %buffer = tail call target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v2i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
+  %result = call { <2 x i64>, i1 } @llvm.dx.resource.load.typedbuffer.tdx.TypedBuffer_v2i64_1_0_0t(
+  target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) %buffer, i32 0)
+  ret { <2 x i64>, i1 } %result
+}
diff --git a/llvm/test/CodeGen/DirectX/BufferStoreDouble.ll b/llvm/test/CodeGen/DirectX/BufferStoreDouble.ll
index 9c3dab0cc1e46..882948b6dce74 100644
--- a/llvm/test/CodeGen/DirectX/BufferStoreDouble.ll
+++ b/llvm/test/CodeGen/DirectX/BufferStoreDouble.ll
@@ -45,3 +45,46 @@ define void @storev2f64(<2 x double> %0) {
       <2 x double> %0)
   ret void
 }
+
+define { double, i1 } @loadAndReturnf64() {
+; CHECK-LABEL: define { double, i1 } @loadAndReturnf64() {
+; CHECK-NEXT:    [[BUFFER:%.*]] = tail call target("dx.TypedBuffer", double, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_f64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
+; CHECK-NEXT:    [[TMP1:%.*]] = call { <2 x i32>, i1 } @llvm.dx.resource.load.typedbuffer.v2i32.tdx.TypedBuffer_f64_1_0_0t(target("dx.TypedBuffer", double, 1, 0, 0) [[BUFFER]], i32 0)
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <2 x i32>, i1 } [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = call double @llvm.dx.asdouble.i32(i32 [[TMP3]], i32 [[TMP4]])
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue { double, i1 } poison, double [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i32>, i1 } [[TMP1]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { double, i1 } [[TMP6]], i1 [[TMP7]], 1
+; CHECK-NEXT:    ret { double, i1 } [[TMP8]]
+;
+  %buffer = tail call target("dx.TypedBuffer", double, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_f64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
+  %result = call { double, i1 } @llvm.dx.resource.load.typedbuffer.tdx.TypedBuffer_f64_1_0_0t(
+  target("dx.TypedBuffer", double, 1, 0, 0) %buffer, i32 0)
+  ret { double, i1 } %result
+}
+
+define { <2 x double>, i1 } @loadAndReturnv2f64() {
+; CHECK-LABEL: define { <2 x double>, i1 } @loadAndReturnv2f64() {
+; CHECK-NEXT:    [[BUFFER:%.*]] = tail call target("dx.TypedBuffer", <2 x double>, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v2f64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
+; CHECK-NEXT:    [[TMP1:%.*]] = call { <4 x i32>, i1 } @llvm.dx.resource.load.typedbuffer.v4i32.tdx.TypedBuffer_v2f64_1_0_0t(target("dx.TypedBuffer", <2 x double>, 1, 0, 0) [[BUFFER]], i32 0)
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x i32>, i1 } [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; CHECK-NEXT:    [[TMP7:%.*]] = call double @llvm.dx.asdouble.i32(i32 [[TMP3]], i32 [[TMP4]])
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> poison, double [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = call double @llvm.dx.asdouble.i32(i32 [[TMP5]], i32 [[TMP6]])
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x double> [[TMP8]], double [[TMP9]], i32 1
+; CHECK-NEXT:    [[TMP11:%.*]] = insertvalue { <2 x double>, i1 } poison, <2 x double> [[TMP10]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = extractvalue { <4 x i32>, i1 } [[TMP1]], 1
+; CHECK-NEXT:    [[TMP13:%.*]] = insertvalue { <2 x double>, i1 } [[TMP11]], i1 [[TMP12]], 1
+; CHECK-NEXT:    ret { <2 x double>, i1 } [[TMP13]]
+;
+  %buffer = tail call target("dx.TypedBuffer", <2 x double>, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v2f64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
+  %result = call { <2 x double>, i1 } @llvm.dx.resource.load.typedbuffer.tdx.TypedBuffer_v2f64_1_0_0t(
+  target("dx.TypedBuffer", <2 x double>, 1, 0, 0) %buffer, i32 0)
+  ret { <2 x double>, i1 } %result
+}
diff --git a/llvm/test/CodeGen/DirectX/BufferStoreInt64.ll b/llvm/test/CodeGen/DirectX/BufferStoreInt64.ll
new file mode 100644
index 0000000000000..efb7c0ac104ed
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/BufferStoreInt64.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -dxil-intrinsic-expansion %s | FileCheck %s
+
+target triple = "dxil-pc-shadermodel6.6-compute"
+
+define void @storei64(i64 %0) {
+; CHECK-LABEL: define void @storei64(
+; CHECK-SAME: i64 [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[BUFFER:%.*]] = tail call target("dx.TypedBuffer", i64, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[TMP0]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = lshr i64 [[TMP0]], 32
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i32 1
+; CHECK-NEXT:    call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_i64_1_0_0t.v2i32(target("dx.TypedBuffer", i64, 1, 0, 0) [[BUFFER]], i32 0, <2 x i32> [[TMP6]])
+; CHECK-NEXT:    ret void
+;
+  %buffer = tail call target("dx.TypedBuffer", i64, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
+  call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_i64_1_0_0t(target("dx.TypedBuffer", i64, 1, 0, 0) %buffer, i32 0,i64 %0)
+  ret void
+}
+
+
+define void @storev2i64(<2 x i64> %0) {
+; CHECK-LABEL: define void @storev2i64(
+; CHECK-SAME: <2 x i64> [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[BUFFER:%.*]] = tail call target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v2i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = lshr i64 [[TMP2]], 32
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[TMP4]] to i32
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc i64 [[TMP8]] to i32
+; CHECK-NEXT:    [[TMP10:%.*]] = lshr i64 [[TMP8]], 32
+; CHECK-NEXT:    [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP9]], i32 2
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP11]], i32 3
+; CHECK-NEXT:    call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v2i64_1_0_0t.v4i32(target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) [[BUFFER]], i32 0, <4 x i32> [[TMP13]])
+; CHECK-NEXT:    ret void
+;
+  %buffer = tail call target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v2i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
+  call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v2i64_1_0_0t(target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) %buffer, i32 0, <2 x i64> %0)
+  ret void
+}

From ecd32db1d289d0bfd361db4fb823de4c7ae7beea Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzonlotfi@microsoft.com>
Date: Fri, 20 Jun 2025 10:58:21 -0400
Subject: [PATCH 2/7] minimize code diff between double and i64

---
 .../Target/DirectX/DXILIntrinsicExpansion.cpp | 74 +++++++++----------
 1 file changed, 33 insertions(+), 41 deletions(-)

diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
index eb9268e78a9ad..45d8e497165cf 100644
--- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
+++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
@@ -651,58 +651,56 @@ static bool expandTypedBufferStoreIntrinsic(CallInst *Orig) {
   assert((IsDouble || ScalarTy->isIntegerTy(64)) &&
          "Only expand double or int64 scalars or vectors");
 
-  unsigned ExtractNum = 2;
-  if (auto *VT = dyn_cast<FixedVectorType>(BufferTy)) {
-    assert(VT->getNumElements() == 2 &&
+  // Determine if we're dealing with a vector or scalar
+  bool IsVector = isa<FixedVectorType>(BufferTy);
+  if (IsVector) {
+    assert(cast<FixedVectorType>(BufferTy)->getNumElements() == 2 &&
            "TypedBufferStore vector must be size 2");
-    ExtractNum = 4;
   }
+
+  // Create the appropriate vector type for the result
+  Type *Int32Ty = Builder.getInt32Ty();
+  Type *ResultTy = VectorType::get(Int32Ty, IsVector ? 4 : 2, false);
+  Value *Val = PoisonValue::get(ResultTy);
+
+  // Split the 64-bit values into 32-bit components
   if (IsDouble) {
-    Type *SplitElementTy = Builder.getInt32Ty();
-    if (ExtractNum == 4)
+    // Handle double type(s)
+    Type *SplitElementTy = Int32Ty;
+    if (IsVector)
       SplitElementTy = VectorType::get(SplitElementTy, 2, false);
 
-    // Handle double type(s) - keep original behavior
     auto *SplitTy = llvm::StructType::get(SplitElementTy, SplitElementTy);
     Value *Split = Builder.CreateIntrinsic(SplitTy, Intrinsic::dx_splitdouble,
                                            {Orig->getOperand(2)});
-    // create our vector
     Value *LowBits = Builder.CreateExtractValue(Split, 0);
     Value *HighBits = Builder.CreateExtractValue(Split, 1);
-    Value *Val;
-    if (ExtractNum == 2) {
-      Val = PoisonValue::get(VectorType::get(Builder.getInt32Ty(), 2, false));
+
+    if (IsVector) {
+      // For vector doubles, use shuffle to create the final vector
+      Val = Builder.CreateShuffleVector(LowBits, HighBits, {0, 2, 1, 3});
+    } else {
+      // For scalar doubles, insert the elements
       Val = Builder.CreateInsertElement(Val, LowBits, Builder.getInt32(0));
       Val = Builder.CreateInsertElement(Val, HighBits, Builder.getInt32(1));
-    } else
-      Val = Builder.CreateShuffleVector(LowBits, HighBits, {0, 2, 1, 3});
-
-    Builder.CreateIntrinsic(Builder.getVoidTy(),
-                            Intrinsic::dx_resource_store_typedbuffer,
-                            {Orig->getOperand(0), Orig->getOperand(1), Val});
+    }
   } else {
     // Handle int64 type(s)
     Value *InputVal = Orig->getOperand(2);
-    Value *Val;
 
-    if (ExtractNum == 4) {
+    if (IsVector) {
       // Handle vector of int64
-      Type *Int32x4Ty = VectorType::get(Builder.getInt32Ty(), 4, false);
-      Val = PoisonValue::get(Int32x4Ty);
-
       for (unsigned I = 0; I < 2; ++I) {
         // Extract each int64 element
         Value *Int64Val =
             Builder.CreateExtractElement(InputVal, Builder.getInt32(I));
 
-        // Get low 32 bits by truncating to i32
-        Value *LowBits = Builder.CreateTrunc(Int64Val, Builder.getInt32Ty());
-
-        // Get high 32 bits by shifting right by 32 and truncating
+        // Split into low and high 32-bit parts
+        Value *LowBits = Builder.CreateTrunc(Int64Val, Int32Ty);
         Value *ShiftedVal = Builder.CreateLShr(Int64Val, Builder.getInt64(32));
-        Value *HighBits = Builder.CreateTrunc(ShiftedVal, Builder.getInt32Ty());
+        Value *HighBits = Builder.CreateTrunc(ShiftedVal, Int32Ty);
 
-        // Insert into our final vector
+        // Insert into result vector
         Val =
             Builder.CreateInsertElement(Val, LowBits, Builder.getInt32(I * 2));
         Val = Builder.CreateInsertElement(Val, HighBits,
@@ -710,26 +708,20 @@ static bool expandTypedBufferStoreIntrinsic(CallInst *Orig) {
       }
     } else {
       // Handle scalar int64
-      Type *Int32x2Ty = VectorType::get(Builder.getInt32Ty(), 2, false);
-      Val = PoisonValue::get(Int32x2Ty);
-
-      // Get low 32 bits by truncating to i32
-      Value *LowBits = Builder.CreateTrunc(InputVal, Builder.getInt32Ty());
-
-      // Get high 32 bits by shifting right by 32 and truncating
+      Value *LowBits = Builder.CreateTrunc(InputVal, Int32Ty);
       Value *ShiftedVal = Builder.CreateLShr(InputVal, Builder.getInt64(32));
-      Value *HighBits = Builder.CreateTrunc(ShiftedVal, Builder.getInt32Ty());
+      Value *HighBits = Builder.CreateTrunc(ShiftedVal, Int32Ty);
 
-      // Insert into our final vector
       Val = Builder.CreateInsertElement(Val, LowBits, Builder.getInt32(0));
       Val = Builder.CreateInsertElement(Val, HighBits, Builder.getInt32(1));
     }
-
-    Builder.CreateIntrinsic(Builder.getVoidTy(),
-                            Intrinsic::dx_resource_store_typedbuffer,
-                            {Orig->getOperand(0), Orig->getOperand(1), Val});
   }
 
+  // Create the final intrinsic call
+  Builder.CreateIntrinsic(Builder.getVoidTy(),
+                          Intrinsic::dx_resource_store_typedbuffer,
+                          {Orig->getOperand(0), Orig->getOperand(1), Val});
+
   Orig->eraseFromParent();
   return true;
 }

From 8d8782d45ebf0e3b1f77778e91b1c2c7fc1161cd Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzonlotfi@microsoft.com>
Date: Fri, 20 Jun 2025 14:16:19 -0400
Subject: [PATCH 3/7] remove return handling

---
 .../Target/DirectX/DXILIntrinsicExpansion.cpp | 18 ++------
 llvm/test/CodeGen/DirectX/BufferLoadDouble.ll |  2 +-
 llvm/test/CodeGen/DirectX/BufferLoadInt64.ll  | 28 +++++-------
 .../test/CodeGen/DirectX/BufferStoreDouble.ll | 43 -------------------
 4 files changed, 14 insertions(+), 77 deletions(-)

diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
index 45d8e497165cf..d50279461800e 100644
--- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
+++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
@@ -605,23 +605,11 @@ static bool expandTypedBufferLoadIntrinsic(CallInst *Orig) {
 
   Value *CheckBit = nullptr;
   for (User *U : make_early_inc_range(Orig->users())) {
-    if (auto *Ret = dyn_cast<ReturnInst>(U)) {
-      // For return instructions, we need to handle the case where the function
-      // is directly returning the result of the call
-      Type *RetTy = Ret->getFunction()->getReturnType();
-      Value *StructRet = PoisonValue::get(RetTy);
-      StructRet = Builder.CreateInsertValue(StructRet, Result, {0});
-      Value *CheckBitForRet = Builder.CreateExtractValue(Load, {1});
-      StructRet = Builder.CreateInsertValue(StructRet, CheckBitForRet, {1});
-      Ret->setOperand(0, StructRet);
-      continue;
-    }
+    // If it's not an ExtractValueInst, we don't know how to
+    // handle it.
     auto *EVI = dyn_cast<ExtractValueInst>(U);
-    if (!EVI) {
-      // If it's not a ReturnInst or ExtractValueInst, we don't know how to
-      // handle it
+    if (!EVI)
       llvm_unreachable("Unexpected user of typedbufferload");
-    }
 
     ArrayRef<unsigned> Indices = EVI->getIndices();
     assert(Indices.size() == 1);
diff --git a/llvm/test/CodeGen/DirectX/BufferLoadDouble.ll b/llvm/test/CodeGen/DirectX/BufferLoadDouble.ll
index af3ec9df37967..25abf2111060c 100644
--- a/llvm/test/CodeGen/DirectX/BufferLoadDouble.ll
+++ b/llvm/test/CodeGen/DirectX/BufferLoadDouble.ll
@@ -1,6 +1,6 @@
 ; RUN: opt -S -dxil-intrinsic-expansion %s | FileCheck %s
 
-target triple = "dxil-pc-shadermodel6.2-compute"
+target triple = "dxil-pc-shadermodel6.6-compute"
 
 define void @loadf64() {
   ; check the handle from binding is unchanged
diff --git a/llvm/test/CodeGen/DirectX/BufferLoadInt64.ll b/llvm/test/CodeGen/DirectX/BufferLoadInt64.ll
index cea475524945c..42c0012ff3475 100644
--- a/llvm/test/CodeGen/DirectX/BufferLoadInt64.ll
+++ b/llvm/test/CodeGen/DirectX/BufferLoadInt64.ll
@@ -3,8 +3,8 @@
 
 target triple = "dxil-pc-shadermodel6.2-compute"
 
-define { i64, i1 } @loadi64() {
-; CHECK-LABEL: define { i64, i1 } @loadi64() {
+define void @loadi64() {
+; CHECK-LABEL: define void @loadi64() {
 ; CHECK-NEXT:    [[BUFFER:%.*]] = tail call target("dx.TypedBuffer", i64, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
 ; CHECK-NEXT:    [[TMP1:%.*]] = call { <2 x i32>, i1 } @llvm.dx.resource.load.typedbuffer.v2i32.tdx.TypedBuffer_i64_1_0_0t(target("dx.TypedBuffer", i64, 1, 0, 0) [[BUFFER]], i32 0)
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <2 x i32>, i1 } [[TMP1]], 0
@@ -14,19 +14,15 @@ define { i64, i1 } @loadi64() {
 ; CHECK-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
 ; CHECK-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
 ; CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = insertvalue { i64, i1 } poison, i64 [[TMP8]], 0
-; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <2 x i32>, i1 } [[TMP1]], 1
-; CHECK-NEXT:    [[TMP11:%.*]] = insertvalue { i64, i1 } [[TMP9]], i1 [[TMP10]], 1
-; CHECK-NEXT:    ret { i64, i1 } [[TMP11]]
+; CHECK-NEXT:    ret void
 ;
   %buffer = tail call target("dx.TypedBuffer", i64, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
-  %result = call { i64, i1 } @llvm.dx.resource.load.typedbuffer.tdx.TypedBuffer_i64_1_0_0t(
-  target("dx.TypedBuffer", i64, 1, 0, 0) %buffer, i32 0)
-  ret { i64, i1 } %result
+  %result = call { i64, i1 } @llvm.dx.resource.load.typedbuffer.tdx.TypedBuffer_i64_1_0_0t(target("dx.TypedBuffer", i64, 1, 0, 0) %buffer, i32 0)
+  ret void
 }
 
-define { <2 x i64>, i1 } @loadv2i64() {
-; CHECK-LABEL: define { <2 x i64>, i1 } @loadv2i64() {
+define void @loadv2i64() {
+; CHECK-LABEL: define void @loadv2i64() {
 ; CHECK-NEXT:    [[BUFFER:%.*]] = tail call target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v2i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
 ; CHECK-NEXT:    [[TMP1:%.*]] = call { <4 x i32>, i1 } @llvm.dx.resource.load.typedbuffer.v4i32.tdx.TypedBuffer_v2i64_1_0_0t(target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) [[BUFFER]], i32 0)
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x i32>, i1 } [[TMP1]], 0
@@ -44,13 +40,9 @@ define { <2 x i64>, i1 } @loadv2i64() {
 ; CHECK-NEXT:    [[TMP14:%.*]] = shl i64 [[TMP13]], 32
 ; CHECK-NEXT:    [[TMP15:%.*]] = or i64 [[TMP12]], [[TMP14]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP15]], i32 1
-; CHECK-NEXT:    [[TMP17:%.*]] = insertvalue { <2 x i64>, i1 } poison, <2 x i64> [[TMP16]], 0
-; CHECK-NEXT:    [[TMP18:%.*]] = extractvalue { <4 x i32>, i1 } [[TMP1]], 1
-; CHECK-NEXT:    [[TMP19:%.*]] = insertvalue { <2 x i64>, i1 } [[TMP17]], i1 [[TMP18]], 1
-; CHECK-NEXT:    ret { <2 x i64>, i1 } [[TMP19]]
+; CHECK-NEXT:    ret void
 ;
   %buffer = tail call target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v2i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
-  %result = call { <2 x i64>, i1 } @llvm.dx.resource.load.typedbuffer.tdx.TypedBuffer_v2i64_1_0_0t(
-  target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) %buffer, i32 0)
-  ret { <2 x i64>, i1 } %result
+  %result = call { <2 x i64>, i1 } @llvm.dx.resource.load.typedbuffer.tdx.TypedBuffer_v2i64_1_0_0t(target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) %buffer, i32 0)
+  ret void
 }
diff --git a/llvm/test/CodeGen/DirectX/BufferStoreDouble.ll b/llvm/test/CodeGen/DirectX/BufferStoreDouble.ll
index 882948b6dce74..9c3dab0cc1e46 100644
--- a/llvm/test/CodeGen/DirectX/BufferStoreDouble.ll
+++ b/llvm/test/CodeGen/DirectX/BufferStoreDouble.ll
@@ -45,46 +45,3 @@ define void @storev2f64(<2 x double> %0) {
       <2 x double> %0)
   ret void
 }
-
-define { double, i1 } @loadAndReturnf64() {
-; CHECK-LABEL: define { double, i1 } @loadAndReturnf64() {
-; CHECK-NEXT:    [[BUFFER:%.*]] = tail call target("dx.TypedBuffer", double, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_f64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
-; CHECK-NEXT:    [[TMP1:%.*]] = call { <2 x i32>, i1 } @llvm.dx.resource.load.typedbuffer.v2i32.tdx.TypedBuffer_f64_1_0_0t(target("dx.TypedBuffer", double, 1, 0, 0) [[BUFFER]], i32 0)
-; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <2 x i32>, i1 } [[TMP1]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = call double @llvm.dx.asdouble.i32(i32 [[TMP3]], i32 [[TMP4]])
-; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue { double, i1 } poison, double [[TMP5]], 0
-; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i32>, i1 } [[TMP1]], 1
-; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { double, i1 } [[TMP6]], i1 [[TMP7]], 1
-; CHECK-NEXT:    ret { double, i1 } [[TMP8]]
-;
-  %buffer = tail call target("dx.TypedBuffer", double, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_f64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
-  %result = call { double, i1 } @llvm.dx.resource.load.typedbuffer.tdx.TypedBuffer_f64_1_0_0t(
-  target("dx.TypedBuffer", double, 1, 0, 0) %buffer, i32 0)
-  ret { double, i1 } %result
-}
-
-define { <2 x double>, i1 } @loadAndReturnv2f64() {
-; CHECK-LABEL: define { <2 x double>, i1 } @loadAndReturnv2f64() {
-; CHECK-NEXT:    [[BUFFER:%.*]] = tail call target("dx.TypedBuffer", <2 x double>, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v2f64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
-; CHECK-NEXT:    [[TMP1:%.*]] = call { <4 x i32>, i1 } @llvm.dx.resource.load.typedbuffer.v4i32.tdx.TypedBuffer_v2f64_1_0_0t(target("dx.TypedBuffer", <2 x double>, 1, 0, 0) [[BUFFER]], i32 0)
-; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x i32>, i1 } [[TMP1]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
-; CHECK-NEXT:    [[TMP7:%.*]] = call double @llvm.dx.asdouble.i32(i32 [[TMP3]], i32 [[TMP4]])
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> poison, double [[TMP7]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = call double @llvm.dx.asdouble.i32(i32 [[TMP5]], i32 [[TMP6]])
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x double> [[TMP8]], double [[TMP9]], i32 1
-; CHECK-NEXT:    [[TMP11:%.*]] = insertvalue { <2 x double>, i1 } poison, <2 x double> [[TMP10]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = extractvalue { <4 x i32>, i1 } [[TMP1]], 1
-; CHECK-NEXT:    [[TMP13:%.*]] = insertvalue { <2 x double>, i1 } [[TMP11]], i1 [[TMP12]], 1
-; CHECK-NEXT:    ret { <2 x double>, i1 } [[TMP13]]
-;
-  %buffer = tail call target("dx.TypedBuffer", <2 x double>, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v2f64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
-  %result = call { <2 x double>, i1 } @llvm.dx.resource.load.typedbuffer.tdx.TypedBuffer_v2f64_1_0_0t(
-  target("dx.TypedBuffer", <2 x double>, 1, 0, 0) %buffer, i32 0)
-  ret { <2 x double>, i1 } %result
-}

From 3de2cdf01a202cbb9c8a74bc42e79e80bcda907f Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzonlotfi@microsoft.com>
Date: Thu, 26 Jun 2025 17:34:09 -0400
Subject: [PATCH 4/7] address pr comments

---
 .../Target/DirectX/DXILIntrinsicExpansion.cpp | 92 ++++++++-----------
 llvm/test/CodeGen/DirectX/BufferStoreInt64.ll | 16 +---
 2 files changed, 43 insertions(+), 65 deletions(-)

diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
index d50279461800e..1d92f995cc57d 100644
--- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
+++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
@@ -544,6 +544,18 @@ static Value *expandRadiansIntrinsic(CallInst *Orig) {
   return Builder.CreateFMul(X, PiOver180);
 }
 
+static Value* createCombinedi32toi64Expansion(IRBuilder<> &Builder, Value *LoBytes, Value *HighBytes) {
+  // For int64, manually combine two int32s
+  // First, zero-extend both values to i64
+  Value *Lo = Builder.CreateZExt(LoBytes, Builder.getInt64Ty());
+  Value *Hi =
+    Builder.CreateZExt(HighBytes, Builder.getInt64Ty());
+  // Shift the high bits left by 32 bits
+  Value *ShiftedHi = Builder.CreateShl(Hi, Builder.getInt64(32));
+  // OR the high and low bits together
+  return Builder.CreateOr(Lo, ShiftedHi);
+}
+
 static bool expandTypedBufferLoadIntrinsic(CallInst *Orig) {
   IRBuilder<> Builder(Orig);
 
@@ -579,22 +591,14 @@ static bool expandTypedBufferLoadIntrinsic(CallInst *Orig) {
   Value *Result = PoisonValue::get(BufferTy);
   for (unsigned I = 0; I < ExtractNum; I += 2) {
     Value *Combined = nullptr;
-    if (IsDouble) {
+    if (IsDouble) 
       // For doubles, use dx_asdouble intrinsic
       Combined =
           Builder.CreateIntrinsic(Builder.getDoubleTy(), Intrinsic::dx_asdouble,
                                   {ExtractElements[I], ExtractElements[I + 1]});
-    } else {
-      // For int64, manually combine two int32s
-      // First, zero-extend both values to i64
-      Value *Lo = Builder.CreateZExt(ExtractElements[I], Builder.getInt64Ty());
-      Value *Hi =
-          Builder.CreateZExt(ExtractElements[I + 1], Builder.getInt64Ty());
-      // Shift the high bits left by 32 bits
-      Value *ShiftedHi = Builder.CreateShl(Hi, Builder.getInt64(32));
-      // OR the high and low bits together
-      Combined = Builder.CreateOr(Lo, ShiftedHi);
-    }
+    else
+      Combined = 
+          createCombinedi32toi64Expansion(Builder, ExtractElements[I], ExtractElements[I + 1]);
 
     if (ExtractNum == 4)
       Result = Builder.CreateInsertElement(Result, Combined,
@@ -650,60 +654,42 @@ static bool expandTypedBufferStoreIntrinsic(CallInst *Orig) {
   Type *Int32Ty = Builder.getInt32Ty();
   Type *ResultTy = VectorType::get(Int32Ty, IsVector ? 4 : 2, false);
   Value *Val = PoisonValue::get(ResultTy);
-
+  
+  // Handle double type(s)
+  Type *SplitElementTy = Int32Ty;
+  if (IsVector)
+    SplitElementTy = VectorType::get(SplitElementTy, 2, false);
+
+  Value *LowBits = nullptr;
+  Value *HighBits = nullptr;
   // Split the 64-bit values into 32-bit components
   if (IsDouble) {
-    // Handle double type(s)
-    Type *SplitElementTy = Int32Ty;
-    if (IsVector)
-      SplitElementTy = VectorType::get(SplitElementTy, 2, false);
-
     auto *SplitTy = llvm::StructType::get(SplitElementTy, SplitElementTy);
     Value *Split = Builder.CreateIntrinsic(SplitTy, Intrinsic::dx_splitdouble,
                                            {Orig->getOperand(2)});
-    Value *LowBits = Builder.CreateExtractValue(Split, 0);
-    Value *HighBits = Builder.CreateExtractValue(Split, 1);
-
-    if (IsVector) {
-      // For vector doubles, use shuffle to create the final vector
-      Val = Builder.CreateShuffleVector(LowBits, HighBits, {0, 2, 1, 3});
-    } else {
-      // For scalar doubles, insert the elements
-      Val = Builder.CreateInsertElement(Val, LowBits, Builder.getInt32(0));
-      Val = Builder.CreateInsertElement(Val, HighBits, Builder.getInt32(1));
-    }
+    LowBits = Builder.CreateExtractValue(Split, 0);
+    HighBits = Builder.CreateExtractValue(Split, 1);
   } else {
     // Handle int64 type(s)
     Value *InputVal = Orig->getOperand(2);
+     Constant *ShiftAmt = Builder.getInt64(32);
+    if (IsVector)
+      ShiftAmt = ConstantVector::getSplat(ElementCount::getFixed(2), ShiftAmt);
 
-    if (IsVector) {
-      // Handle vector of int64
-      for (unsigned I = 0; I < 2; ++I) {
-        // Extract each int64 element
-        Value *Int64Val =
-            Builder.CreateExtractElement(InputVal, Builder.getInt32(I));
-
-        // Split into low and high 32-bit parts
-        Value *LowBits = Builder.CreateTrunc(Int64Val, Int32Ty);
-        Value *ShiftedVal = Builder.CreateLShr(Int64Val, Builder.getInt64(32));
-        Value *HighBits = Builder.CreateTrunc(ShiftedVal, Int32Ty);
-
-        // Insert into result vector
-        Val =
-            Builder.CreateInsertElement(Val, LowBits, Builder.getInt32(I * 2));
-        Val = Builder.CreateInsertElement(Val, HighBits,
-                                          Builder.getInt32(I * 2 + 1));
-      }
-    } else {
-      // Handle scalar int64
-      Value *LowBits = Builder.CreateTrunc(InputVal, Int32Ty);
-      Value *ShiftedVal = Builder.CreateLShr(InputVal, Builder.getInt64(32));
-      Value *HighBits = Builder.CreateTrunc(ShiftedVal, Int32Ty);
+    // Split into low and high 32-bit parts
+    LowBits = Builder.CreateTrunc(InputVal, SplitElementTy);
+    Value *ShiftedVal = Builder.CreateLShr(InputVal, ShiftAmt);
+    HighBits = Builder.CreateTrunc(ShiftedVal, SplitElementTy);
+  }
 
+  if (IsVector) {
+      // For vector doubles, use shuffle to create the final vector
+      Val = Builder.CreateShuffleVector(LowBits, HighBits, {0, 2, 1, 3});
+    } else {
+      // For scalar doubles, insert the elements
       Val = Builder.CreateInsertElement(Val, LowBits, Builder.getInt32(0));
       Val = Builder.CreateInsertElement(Val, HighBits, Builder.getInt32(1));
     }
-  }
 
   // Create the final intrinsic call
   Builder.CreateIntrinsic(Builder.getVoidTy(),
diff --git a/llvm/test/CodeGen/DirectX/BufferStoreInt64.ll b/llvm/test/CodeGen/DirectX/BufferStoreInt64.ll
index efb7c0ac104ed..c97a02d1873a0 100644
--- a/llvm/test/CodeGen/DirectX/BufferStoreInt64.ll
+++ b/llvm/test/CodeGen/DirectX/BufferStoreInt64.ll
@@ -25,18 +25,10 @@ define void @storev2i64(<2 x i64> %0) {
 ; CHECK-LABEL: define void @storev2i64(
 ; CHECK-SAME: <2 x i64> [[TMP0:%.*]]) {
 ; CHECK-NEXT:    [[BUFFER:%.*]] = tail call target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v2i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = lshr i64 [[TMP2]], 32
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[TMP4]] to i32
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP5]], i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP9:%.*]] = trunc i64 [[TMP8]] to i32
-; CHECK-NEXT:    [[TMP10:%.*]] = lshr i64 [[TMP8]], 32
-; CHECK-NEXT:    [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP9]], i32 2
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP11]], i32 3
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc <2 x i64> [[TMP0]] to <2 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = lshr <2 x i64> [[TMP0]], splat (i64 32)
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc <2 x i64> [[TMP3]] to <2 x i32>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP4]], <4 x i32> <i32 0, i32 2, i32 1, i32 3>
 ; CHECK-NEXT:    call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v2i64_1_0_0t.v4i32(target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) [[BUFFER]], i32 0, <4 x i32> [[TMP13]])
 ; CHECK-NEXT:    ret void
 ;

From 45dbfb14541d714a875024dd8ac1c9b14b1941b8 Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzonlotfi@microsoft.com>
Date: Thu, 26 Jun 2025 18:48:52 -0400
Subject: [PATCH 5/7] fix formatting

---
 .../Target/DirectX/DXILIntrinsicExpansion.cpp | 31 ++++++++++---------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
index 1d92f995cc57d..dcf26185bc925 100644
--- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
+++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
@@ -544,12 +544,13 @@ static Value *expandRadiansIntrinsic(CallInst *Orig) {
   return Builder.CreateFMul(X, PiOver180);
 }
 
-static Value* createCombinedi32toi64Expansion(IRBuilder<> &Builder, Value *LoBytes, Value *HighBytes) {
+static Value *createCombinedi32toi64Expansion(IRBuilder<> &Builder,
+                                              Value *LoBytes,
+                                              Value *HighBytes) {
   // For int64, manually combine two int32s
   // First, zero-extend both values to i64
   Value *Lo = Builder.CreateZExt(LoBytes, Builder.getInt64Ty());
-  Value *Hi =
-    Builder.CreateZExt(HighBytes, Builder.getInt64Ty());
+  Value *Hi = Builder.CreateZExt(HighBytes, Builder.getInt64Ty());
   // Shift the high bits left by 32 bits
   Value *ShiftedHi = Builder.CreateShl(Hi, Builder.getInt64(32));
   // OR the high and low bits together
@@ -591,14 +592,14 @@ static bool expandTypedBufferLoadIntrinsic(CallInst *Orig) {
   Value *Result = PoisonValue::get(BufferTy);
   for (unsigned I = 0; I < ExtractNum; I += 2) {
     Value *Combined = nullptr;
-    if (IsDouble) 
+    if (IsDouble)
       // For doubles, use dx_asdouble intrinsic
       Combined =
           Builder.CreateIntrinsic(Builder.getDoubleTy(), Intrinsic::dx_asdouble,
                                   {ExtractElements[I], ExtractElements[I + 1]});
     else
-      Combined = 
-          createCombinedi32toi64Expansion(Builder, ExtractElements[I], ExtractElements[I + 1]);
+      Combined = createCombinedi32toi64Expansion(Builder, ExtractElements[I],
+                                                 ExtractElements[I + 1]);
 
     if (ExtractNum == 4)
       Result = Builder.CreateInsertElement(Result, Combined,
@@ -654,7 +655,7 @@ static bool expandTypedBufferStoreIntrinsic(CallInst *Orig) {
   Type *Int32Ty = Builder.getInt32Ty();
   Type *ResultTy = VectorType::get(Int32Ty, IsVector ? 4 : 2, false);
   Value *Val = PoisonValue::get(ResultTy);
-  
+
   // Handle double type(s)
   Type *SplitElementTy = Int32Ty;
   if (IsVector)
@@ -672,7 +673,7 @@ static bool expandTypedBufferStoreIntrinsic(CallInst *Orig) {
   } else {
     // Handle int64 type(s)
     Value *InputVal = Orig->getOperand(2);
-     Constant *ShiftAmt = Builder.getInt64(32);
+    Constant *ShiftAmt = Builder.getInt64(32);
     if (IsVector)
       ShiftAmt = ConstantVector::getSplat(ElementCount::getFixed(2), ShiftAmt);
 
@@ -683,13 +684,13 @@ static bool expandTypedBufferStoreIntrinsic(CallInst *Orig) {
   }
 
   if (IsVector) {
-      // For vector doubles, use shuffle to create the final vector
-      Val = Builder.CreateShuffleVector(LowBits, HighBits, {0, 2, 1, 3});
-    } else {
-      // For scalar doubles, insert the elements
-      Val = Builder.CreateInsertElement(Val, LowBits, Builder.getInt32(0));
-      Val = Builder.CreateInsertElement(Val, HighBits, Builder.getInt32(1));
-    }
+    // For vector doubles, use shuffle to create the final vector
+    Val = Builder.CreateShuffleVector(LowBits, HighBits, {0, 2, 1, 3});
+  } else {
+    // For scalar doubles, insert the elements
+    Val = Builder.CreateInsertElement(Val, LowBits, Builder.getInt32(0));
+    Val = Builder.CreateInsertElement(Val, HighBits, Builder.getInt32(1));
+  }
 
   // Create the final intrinsic call
   Builder.CreateIntrinsic(Builder.getVoidTy(),

From cca0e2d91bf46af0a9fdd0893bc6bab22527ca0b Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzonlotfi@microsoft.com>
Date: Fri, 27 Jun 2025 13:09:40 -0400
Subject: [PATCH 6/7] address pr feedback

---
 llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
index dcf26185bc925..7d0e3cc0cdd6e 100644
--- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
+++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
@@ -656,7 +656,6 @@ static bool expandTypedBufferStoreIntrinsic(CallInst *Orig) {
   Type *ResultTy = VectorType::get(Int32Ty, IsVector ? 4 : 2, false);
   Value *Val = PoisonValue::get(ResultTy);
 
-  // Handle double type(s)
   Type *SplitElementTy = Int32Ty;
   if (IsVector)
     SplitElementTy = VectorType::get(SplitElementTy, 2, false);
@@ -684,10 +683,8 @@ static bool expandTypedBufferStoreIntrinsic(CallInst *Orig) {
   }
 
   if (IsVector) {
-    // For vector doubles, use shuffle to create the final vector
     Val = Builder.CreateShuffleVector(LowBits, HighBits, {0, 2, 1, 3});
   } else {
-    // For scalar doubles, insert the elements
     Val = Builder.CreateInsertElement(Val, LowBits, Builder.getInt32(0));
     Val = Builder.CreateInsertElement(Val, HighBits, Builder.getInt32(1));
   }

From 04232d9cb875b3225c0baa08a682657e7f965afa Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzonlotfi@microsoft.com>
Date: Fri, 27 Jun 2025 13:46:52 -0400
Subject: [PATCH 7/7] compromise was to revert to the original

---
 .../Target/DirectX/DXILIntrinsicExpansion.cpp | 27 ++++++++-----------
 1 file changed, 11 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
index 7d0e3cc0cdd6e..435b80ecaec64 100644
--- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
+++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
@@ -544,19 +544,6 @@ static Value *expandRadiansIntrinsic(CallInst *Orig) {
   return Builder.CreateFMul(X, PiOver180);
 }
 
-static Value *createCombinedi32toi64Expansion(IRBuilder<> &Builder,
-                                              Value *LoBytes,
-                                              Value *HighBytes) {
-  // For int64, manually combine two int32s
-  // First, zero-extend both values to i64
-  Value *Lo = Builder.CreateZExt(LoBytes, Builder.getInt64Ty());
-  Value *Hi = Builder.CreateZExt(HighBytes, Builder.getInt64Ty());
-  // Shift the high bits left by 32 bits
-  Value *ShiftedHi = Builder.CreateShl(Hi, Builder.getInt64(32));
-  // OR the high and low bits together
-  return Builder.CreateOr(Lo, ShiftedHi);
-}
-
 static bool expandTypedBufferLoadIntrinsic(CallInst *Orig) {
   IRBuilder<> Builder(Orig);
 
@@ -597,9 +584,17 @@ static bool expandTypedBufferLoadIntrinsic(CallInst *Orig) {
       Combined =
           Builder.CreateIntrinsic(Builder.getDoubleTy(), Intrinsic::dx_asdouble,
                                   {ExtractElements[I], ExtractElements[I + 1]});
-    else
-      Combined = createCombinedi32toi64Expansion(Builder, ExtractElements[I],
-                                                 ExtractElements[I + 1]);
+    else {
+      // For int64, manually combine two int32s
+      // First, zero-extend both values to i64
+      Value *Lo = Builder.CreateZExt(ExtractElements[I], Builder.getInt64Ty());
+      Value *Hi =
+          Builder.CreateZExt(ExtractElements[I + 1], Builder.getInt64Ty());
+      // Shift the high bits left by 32 bits
+      Value *ShiftedHi = Builder.CreateShl(Hi, Builder.getInt64(32));
+      // OR the high and low bits together
+      Combined = Builder.CreateOr(Lo, ShiftedHi);
+    }
 
     if (ExtractNum == 4)
       Result = Builder.CreateInsertElement(Result, Combined,