diff --git a/llvm/docs/DirectX/DXILResources.rst b/llvm/docs/DirectX/DXILResources.rst index 8e43bfaaaf32e..a982c3a29fcc3 100644 --- a/llvm/docs/DirectX/DXILResources.rst +++ b/llvm/docs/DirectX/DXILResources.rst @@ -361,11 +361,60 @@ Examples: - ``i32`` - Index into the buffer +Texture and Typed Buffer Stores +------------------------------- + +*relevant types: Textures and TypedBuffer* + +The `TextureStore`_ and `BufferStore`_ DXIL operations always write all four +32-bit components to a texture or a typed buffer. While both operations include +a mask parameter, it is specified that the mask must cover all components when +used with these types. + +The store operations that we define as intrinsics behave similarly, and will +only accept writes to the whole of the contained type. This differs from the +loads above, but this makes sense to do from a semantics preserving point of +view. Thus, texture and buffer stores may only operate on 4-element vectors of +types that are 32-bits or fewer, such as ``<4 x i32>``, ``<4 x float>``, and +``<4 x half>``, and 2 element vectors of 64-bit types like ``<2 x double>`` and +``<2 x i64>``. + +.. _BufferStore: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#bufferstore +.. _TextureStore: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#texturestore + Examples: -.. code-block:: llvm +.. list-table:: ``@llvm.dx.typedBufferStore`` + :header-rows: 1 - %ret = call {<4 x float>, i1} - @llvm.dx.typedBufferLoad.checkbit.v4f32.tdx.TypedBuffer_v4f32_0_0_0t( - target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %buffer, i32 %index) + * - Argument + - + - Type + - Description + * - Return value + - + - ``void`` + - + * - ``%buffer`` + - 0 + - ``target(dx.TypedBuffer, ...)`` + - The buffer to store into + * - ``%index`` + - 1 + - ``i32`` + - Index into the buffer + * - ``%data`` + - 2 + - A 4- or 2-element vector of the type of the buffer + - The data to store + +Examples: + +.. 
code-block:: llvm + call void @llvm.dx.typedBufferStore.tdx.TypedBuffer_v4f32_1_0_0t( + target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buf, i32 %index, <4 x float> %data) + call void @llvm.dx.typedBufferStore.tdx.TypedBuffer_v4f16_1_0_0t( + target("dx.TypedBuffer", <4 x half>, 1, 0, 0) %buf, i32 %index, <4 x half> %data) + call void @llvm.dx.typedBufferStore.tdx.TypedBuffer_v2f64_1_0_0t( + target("dx.TypedBuffer", <2 x double>, 1, 0, 0) %buf, i32 %index, <2 x double> %data) diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index 40c9ac3f0da34..c36e98f040ab8 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -32,6 +32,8 @@ def int_dx_handle_fromBinding def int_dx_typedBufferLoad : DefaultAttrsIntrinsic<[llvm_any_ty], [llvm_any_ty, llvm_i32_ty]>; +def int_dx_typedBufferStore + : DefaultAttrsIntrinsic<[], [llvm_any_ty, llvm_i32_ty, llvm_anyvector_ty]>; // Cast between target extension handle types and dxil-style opaque handles def int_dx_cast_handle : Intrinsic<[llvm_any_ty], [llvm_any_ty]>; diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 67a9b9d02bb6a..759a58ed3930e 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -707,6 +707,18 @@ def BufferLoad : DXILOp<68, bufferLoad> { let stages = [Stages]; } +def BufferStore : DXILOp<69, bufferStore> { + let Doc = "writes to an RWTypedBuffer"; + // Handle, Coord0, Coord1, Val0, Val1, Val2, Val3, Mask + let arguments = [ + HandleTy, Int32Ty, Int32Ty, OverloadTy, OverloadTy, OverloadTy, OverloadTy, + Int8Ty + ]; + let result = VoidTy; + let overloads = [Overloads]; + let stages = [Stages]; +} + def ThreadId : DXILOp<93, threadId> { let Doc = "Reads the thread ID"; let LLVMIntrinsic = int_dx_thread_id; diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp index df2751d99576a..f968cab1dccf1 100644 --- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp +++ 
b/llvm/lib/Target/DirectX/DXILOpLowering.cpp @@ -82,8 +82,11 @@ class OpLowerer { public: OpLowerer(Module &M, DXILResourceMap &DRM) : M(M), OpBuilder(M), DRM(DRM) {} - void replaceFunction(Function &F, - llvm::function_ref ReplaceCall) { + /// Replace every call to \c F using \c ReplaceCall, and then erase \c F. If + /// there is an error replacing a call, we emit a diagnostic and return true. + [[nodiscard]] bool + replaceFunction(Function &F, + llvm::function_ref ReplaceCall) { for (User *U : make_early_inc_range(F.users())) { CallInst *CI = dyn_cast(U); if (!CI) @@ -94,16 +97,18 @@ class OpLowerer { DiagnosticInfoUnsupported Diag(*CI->getFunction(), Message, CI->getDebugLoc()); M.getContext().diagnose(Diag); - continue; + return true; } } if (F.user_empty()) F.eraseFromParent(); + return false; } - void replaceFunctionWithOp(Function &F, dxil::OpCode DXILOp) { + [[nodiscard]] + bool replaceFunctionWithOp(Function &F, dxil::OpCode DXILOp) { bool IsVectorArgExpansion = isVectorArgExpansion(F); - replaceFunction(F, [&](CallInst *CI) -> Error { + return replaceFunction(F, [&](CallInst *CI) -> Error { SmallVector Args; OpBuilder.getIRB().SetInsertPoint(CI); if (IsVectorArgExpansion) { @@ -175,12 +180,12 @@ class OpLowerer { CleanupCasts.clear(); } - void lowerToCreateHandle(Function &F) { + [[nodiscard]] bool lowerToCreateHandle(Function &F) { IRBuilder<> &IRB = OpBuilder.getIRB(); Type *Int8Ty = IRB.getInt8Ty(); Type *Int32Ty = IRB.getInt32Ty(); - replaceFunction(F, [&](CallInst *CI) -> Error { + return replaceFunction(F, [&](CallInst *CI) -> Error { IRB.SetInsertPoint(CI); auto *It = DRM.find(CI); @@ -205,10 +210,10 @@ class OpLowerer { }); } - void lowerToBindAndAnnotateHandle(Function &F) { + [[nodiscard]] bool lowerToBindAndAnnotateHandle(Function &F) { IRBuilder<> &IRB = OpBuilder.getIRB(); - replaceFunction(F, [&](CallInst *CI) -> Error { + return replaceFunction(F, [&](CallInst *CI) -> Error { IRB.SetInsertPoint(CI); auto *It = DRM.find(CI); @@ -251,12 
+256,11 @@ class OpLowerer { /// Lower `dx.handle.fromBinding` intrinsics depending on the shader model and /// taking into account binding information from DXILResourceAnalysis. - void lowerHandleFromBinding(Function &F) { + bool lowerHandleFromBinding(Function &F) { Triple TT(Triple(M.getTargetTriple())); if (TT.getDXILVersion() < VersionTuple(1, 6)) - lowerToCreateHandle(F); - else - lowerToBindAndAnnotateHandle(F); + return lowerToCreateHandle(F); + return lowerToBindAndAnnotateHandle(F); } /// Replace uses of \c Intrin with the values in the `dx.ResRet` of \c Op. @@ -342,11 +346,11 @@ class OpLowerer { return Error::success(); } - void lowerTypedBufferLoad(Function &F) { + [[nodiscard]] bool lowerTypedBufferLoad(Function &F) { IRBuilder<> &IRB = OpBuilder.getIRB(); Type *Int32Ty = IRB.getInt32Ty(); - replaceFunction(F, [&](CallInst *CI) -> Error { + return replaceFunction(F, [&](CallInst *CI) -> Error { IRB.SetInsertPoint(CI); Value *Handle = @@ -368,8 +372,51 @@ class OpLowerer { }); } + [[nodiscard]] bool lowerTypedBufferStore(Function &F) { + IRBuilder<> &IRB = OpBuilder.getIRB(); + Type *Int8Ty = IRB.getInt8Ty(); + Type *Int32Ty = IRB.getInt32Ty(); + + return replaceFunction(F, [&](CallInst *CI) -> Error { + IRB.SetInsertPoint(CI); + + Value *Handle = + createTmpHandleCast(CI->getArgOperand(0), OpBuilder.getHandleType()); + Value *Index0 = CI->getArgOperand(1); + Value *Index1 = UndefValue::get(Int32Ty); + // For typed stores, the mask must always cover all four elements. 
+ Constant *Mask = ConstantInt::get(Int8Ty, 0xF); + + Value *Data = CI->getArgOperand(2); + auto *DataTy = dyn_cast(Data->getType()); + if (!DataTy || DataTy->getNumElements() != 4) + return make_error( + "typedBufferStore data must be a vector of 4 elements", + inconvertibleErrorCode()); + Value *Data0 = + IRB.CreateExtractElement(Data, ConstantInt::get(Int32Ty, 0)); + Value *Data1 = + IRB.CreateExtractElement(Data, ConstantInt::get(Int32Ty, 1)); + Value *Data2 = + IRB.CreateExtractElement(Data, ConstantInt::get(Int32Ty, 2)); + Value *Data3 = + IRB.CreateExtractElement(Data, ConstantInt::get(Int32Ty, 3)); + + std::array Args{Handle, Index0, Index1, Data0, + Data1, Data2, Data3, Mask}; + Expected OpCall = + OpBuilder.tryCreateOp(OpCode::BufferStore, Args); + if (Error E = OpCall.takeError()) + return E; + + CI->eraseFromParent(); + return Error::success(); + }); + } + bool lowerIntrinsics() { bool Updated = false; + bool HasErrors = false; for (Function &F : make_early_inc_range(M.functions())) { if (!F.isDeclaration()) @@ -380,19 +427,22 @@ class OpLowerer { continue; #define DXIL_OP_INTRINSIC(OpCode, Intrin) \ case Intrin: \ - replaceFunctionWithOp(F, OpCode); \ + HasErrors |= replaceFunctionWithOp(F, OpCode); \ break; #include "DXILOperation.inc" case Intrinsic::dx_handle_fromBinding: - lowerHandleFromBinding(F); + HasErrors |= lowerHandleFromBinding(F); break; case Intrinsic::dx_typedBufferLoad: - lowerTypedBufferLoad(F); + HasErrors |= lowerTypedBufferLoad(F); + break; + case Intrinsic::dx_typedBufferStore: + HasErrors |= lowerTypedBufferStore(F); break; } Updated = true; } - if (Updated) + if (Updated && !HasErrors) cleanupHandleCasts(); return Updated; diff --git a/llvm/test/CodeGen/DirectX/BufferStore-errors.ll b/llvm/test/CodeGen/DirectX/BufferStore-errors.ll new file mode 100644 index 0000000000000..b00fc38c901f9 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/BufferStore-errors.ll @@ -0,0 +1,37 @@ +; We use llc for this test so that we don't abort after 
the first error. +; RUN: not llc %s -o /dev/null 2>&1 | FileCheck %s + +target triple = "dxil-pc-shadermodel6.6-compute" + +; CHECK: error: +; CHECK-SAME: in function storetoomany +; CHECK-SAME: typedBufferStore data must be a vector of 4 elements +define void @storetoomany(<5 x float> %data, i32 %index) { + %buffer = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0) + @llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4f32_1_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + call void @llvm.dx.typedBufferStore.tdx.TypedBuffer_v4f32_1_0_0t.v5f32( + target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, + i32 %index, <5 x float> %data) + + ret void +} + +; CHECK: error: +; CHECK-SAME: in function storetoofew +; CHECK-SAME: typedBufferStore data must be a vector of 4 elements +define void @storetoofew(<3 x i32> %data, i32 %index) { + %buffer = call target("dx.TypedBuffer", <4 x i32>, 1, 0, 0) + @llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4i32_1_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + call void @llvm.dx.typedBufferStore.tdx.TypedBuffer_v4i32_1_0_0t.v3i32( + target("dx.TypedBuffer", <4 x i32>, 1, 0, 0) %buffer, + i32 %index, <3 x i32> %data) + + ret void +} + +declare void @llvm.dx.typedBufferStore.tdx.TypedBuffer_v4f32_1_0_0t.v5f32(target("dx.TypedBuffer", <4 x float>, 1, 0, 0), i32, <5 x float>) +declare void @llvm.dx.typedBufferStore.tdx.TypedBuffer_v4i32_1_0_0t.v3i32(target("dx.TypedBuffer", <4 x i32>, 1, 0, 0), i32, <3 x i32>) diff --git a/llvm/test/CodeGen/DirectX/BufferStore.ll b/llvm/test/CodeGen/DirectX/BufferStore.ll new file mode 100644 index 0000000000000..4aebbe155dc99 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/BufferStore.ll @@ -0,0 +1,92 @@ +; RUN: opt -S -dxil-op-lower %s | FileCheck %s + +target triple = "dxil-pc-shadermodel6.6-compute" + +define void @storefloat(<4 x float> %data, i32 %index) { + + ; CHECK: [[BIND:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding + ; CHECK: [[HANDLE:%.*]] = call %dx.types.Handle 
@dx.op.annotateHandle(i32 217, %dx.types.Handle [[BIND]] + %buffer = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0) + @llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4f32_1_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; The temporary casts should all have been cleaned up + ; CHECK-NOT: %dx.cast_handle + + ; CHECK: [[DATA0_0:%.*]] = extractelement <4 x float> %data, i32 0 + ; CHECK: [[DATA0_1:%.*]] = extractelement <4 x float> %data, i32 1 + ; CHECK: [[DATA0_2:%.*]] = extractelement <4 x float> %data, i32 2 + ; CHECK: [[DATA0_3:%.*]] = extractelement <4 x float> %data, i32 3 + ; CHECK: call void @dx.op.bufferStore.f32(i32 69, %dx.types.Handle [[HANDLE]], i32 %index, i32 undef, float [[DATA0_0]], float [[DATA0_1]], float [[DATA0_2]], float [[DATA0_3]], i8 15) + call void @llvm.dx.typedBufferStore( + target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, + i32 %index, <4 x float> %data) + + ret void +} + +define void @storeint(<4 x i32> %data, i32 %index) { + + ; CHECK: [[BIND:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding + ; CHECK: [[HANDLE:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 217, %dx.types.Handle [[BIND]] + %buffer = call target("dx.TypedBuffer", <4 x i32>, 1, 0, 0) + @llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4i32_1_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: [[DATA0_0:%.*]] = extractelement <4 x i32> %data, i32 0 + ; CHECK: [[DATA0_1:%.*]] = extractelement <4 x i32> %data, i32 1 + ; CHECK: [[DATA0_2:%.*]] = extractelement <4 x i32> %data, i32 2 + ; CHECK: [[DATA0_3:%.*]] = extractelement <4 x i32> %data, i32 3 + ; CHECK: call void @dx.op.bufferStore.i32(i32 69, %dx.types.Handle [[HANDLE]], i32 %index, i32 undef, i32 [[DATA0_0]], i32 [[DATA0_1]], i32 [[DATA0_2]], i32 [[DATA0_3]], i8 15) + call void @llvm.dx.typedBufferStore( + target("dx.TypedBuffer", <4 x i32>, 1, 0, 0) %buffer, + i32 %index, <4 x i32> %data) + + ret void +} + +define void @storehalf(<4 x half> %data, i32 %index) { + + ; CHECK: [[BIND:%.*]] 
= call %dx.types.Handle @dx.op.createHandleFromBinding + ; CHECK: [[HANDLE:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 217, %dx.types.Handle [[BIND]] + %buffer = call target("dx.TypedBuffer", <4 x half>, 1, 0, 0) + @llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4f16_1_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; The temporary casts should all have been cleaned up + ; CHECK-NOT: %dx.cast_handle + + ; CHECK: [[DATA0_0:%.*]] = extractelement <4 x half> %data, i32 0 + ; CHECK: [[DATA0_1:%.*]] = extractelement <4 x half> %data, i32 1 + ; CHECK: [[DATA0_2:%.*]] = extractelement <4 x half> %data, i32 2 + ; CHECK: [[DATA0_3:%.*]] = extractelement <4 x half> %data, i32 3 + ; CHECK: call void @dx.op.bufferStore.f16(i32 69, %dx.types.Handle [[HANDLE]], i32 %index, i32 undef, half [[DATA0_0]], half [[DATA0_1]], half [[DATA0_2]], half [[DATA0_3]], i8 15) + call void @llvm.dx.typedBufferStore( + target("dx.TypedBuffer", <4 x half>, 1, 0, 0) %buffer, + i32 %index, <4 x half> %data) + + ret void +} + +define void @storei16(<4 x i16> %data, i32 %index) { + + ; CHECK: [[BIND:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding + ; CHECK: [[HANDLE:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 217, %dx.types.Handle [[BIND]] + %buffer = call target("dx.TypedBuffer", <4 x i16>, 1, 0, 0) + @llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4i16_1_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; The temporary casts should all have been cleaned up + ; CHECK-NOT: %dx.cast_handle + + ; CHECK: [[DATA0_0:%.*]] = extractelement <4 x i16> %data, i32 0 + ; CHECK: [[DATA0_1:%.*]] = extractelement <4 x i16> %data, i32 1 + ; CHECK: [[DATA0_2:%.*]] = extractelement <4 x i16> %data, i32 2 + ; CHECK: [[DATA0_3:%.*]] = extractelement <4 x i16> %data, i32 3 + ; CHECK: call void @dx.op.bufferStore.i16(i32 69, %dx.types.Handle [[HANDLE]], i32 %index, i32 undef, i16 [[DATA0_0]], i16 [[DATA0_1]], i16 [[DATA0_2]], i16 [[DATA0_3]], i8 15) + call void @llvm.dx.typedBufferStore( + 
target("dx.TypedBuffer", <4 x i16>, 1, 0, 0) %buffer, + i32 %index, <4 x i16> %data) + + ret void +}