diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 3769aae7b620f..8bf0723220093 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -2056,18 +2056,28 @@ SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, return DAG.getMergeValues(Ops, SDLoc()); } + SDLoc DL(Op.getNode()); SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); - uint64_t Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); - SDLoc DL(Op.getNode()); + uint64_t Align = Op.getConstantOperandVal(2); + + // The alignment on an ISD::DYNAMIC_STACKALLOC node may be 0 to indicate that + // the default stack alignment should be used. + if (Align == 0) + Align = DAG.getSubtarget().getFrameLowering()->getStackAlign().value(); // The size for ptx alloca instruction is 64-bit for m64 and 32-bit for m32. - MVT ValueSizeTy = nvTM->is64Bit() ? MVT::i64 : MVT::i32; + const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL); + + SDValue Alloc = + DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL, {LocalVT, MVT::Other}, + {Chain, DAG.getZExtOrTrunc(Size, DL, LocalVT), + DAG.getTargetConstant(Align, DL, MVT::i32)}); + + SDValue ASC = DAG.getAddrSpaceCast( + DL, Op.getValueType(), Alloc, ADDRESS_SPACE_LOCAL, ADDRESS_SPACE_GENERIC); - SDValue AllocOps[] = {Chain, DAG.getZExtOrTrunc(Size, DL, ValueSizeTy), - DAG.getTargetConstant(Align, DL, MVT::i32)}; - EVT RetTypes[] = {ValueSizeTy, MVT::Other}; - return DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL, RetTypes, AllocOps); + return DAG.getMergeValues({ASC, SDValue(Alloc.getNode(), 1)}, DL); } SDValue NVPTXTargetLowering::LowerSTACKRESTORE(SDValue Op, diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 6639554e450f2..a90dfe7a0e6ca 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -3102,28 +3102,20 @@ def CALL_PROTOTYPE : 
"$ident", [(CallPrototype (i32 texternalsym:$ident))]>; def SDTDynAllocaOp : - SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisInt<1>, SDTCisInt<2>]>; + SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisInt<1>, SDTCisVT<2, i32>]>; def dyn_alloca : SDNode<"NVPTXISD::DYNAMIC_STACKALLOC", SDTDynAllocaOp, [SDNPHasChain, SDNPSideEffect]>; -def DYNAMIC_STACKALLOC32 : - NVPTXInst<(outs Int32Regs:$ptr), - (ins Int32Regs:$size, i32imm:$align), - "alloca.u32 \t$ptr, $size, $align;\n\t" - "cvta.local.u32 \t$ptr, $ptr;", - [(set i32:$ptr, (dyn_alloca i32:$size, (i32 timm:$align)))]>, - Requires<[hasPTX<73>, hasSM<52>]>; - -def DYNAMIC_STACKALLOC64 : - NVPTXInst<(outs Int64Regs:$ptr), - (ins Int64Regs:$size, i32imm:$align), - "alloca.u64 \t$ptr, $size, $align;\n\t" - "cvta.local.u64 \t$ptr, $ptr;", - [(set i64:$ptr, (dyn_alloca i64:$size, (i32 timm:$align)))]>, - Requires<[hasPTX<73>, hasSM<52>]>; - +foreach t = [I32RT, I64RT] in { + def DYNAMIC_STACKALLOC # t.Size : + NVPTXInst<(outs t.RC:$ptr), + (ins t.RC:$size, i32imm:$align), + "alloca.u" # t.Size # " \t$ptr, $size, $align;", + [(set t.Ty:$ptr, (dyn_alloca t.Ty:$size, timm:$align))]>, + Requires<[hasPTX<73>, hasSM<52>]>; +} // // BRX diff --git a/llvm/test/CodeGen/NVPTX/dynamic-stackalloc-regression.ll b/llvm/test/CodeGen/NVPTX/dynamic-stackalloc-regression.ll index f70831cc97ae1..0474d82556c1e 100644 --- a/llvm/test/CodeGen/NVPTX/dynamic-stackalloc-regression.ll +++ b/llvm/test/CodeGen/NVPTX/dynamic-stackalloc-regression.ll @@ -6,20 +6,20 @@ target triple = "nvptx64-nvidia-cuda" define void @foo(i64 %a, ptr %p0, ptr %p1) { ; CHECK-LABEL: foo( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-NEXT: .reg .b64 %rd<10>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [foo_param_0]; ; CHECK-NEXT: add.s64 %rd2, %rd1, 7; ; CHECK-NEXT: and.b64 %rd3, %rd2, -8; ; CHECK-NEXT: alloca.u64 %rd4, %rd3, 16; -; CHECK-NEXT: cvta.local.u64 %rd4, %rd4; -; CHECK-NEXT: ld.param.b64 %rd5, [foo_param_1]; -; CHECK-NEXT: 
alloca.u64 %rd6, %rd3, 16; -; CHECK-NEXT: cvta.local.u64 %rd6, %rd6; -; CHECK-NEXT: ld.param.b64 %rd7, [foo_param_2]; -; CHECK-NEXT: st.b64 [%rd5], %rd4; -; CHECK-NEXT: st.b64 [%rd7], %rd6; +; CHECK-NEXT: cvta.local.u64 %rd5, %rd4; +; CHECK-NEXT: ld.param.b64 %rd6, [foo_param_1]; +; CHECK-NEXT: alloca.u64 %rd7, %rd3, 16; +; CHECK-NEXT: cvta.local.u64 %rd8, %rd7; +; CHECK-NEXT: ld.param.b64 %rd9, [foo_param_2]; +; CHECK-NEXT: st.b64 [%rd6], %rd5; +; CHECK-NEXT: st.b64 [%rd9], %rd8; ; CHECK-NEXT: ret; %b = alloca i8, i64 %a, align 16 %c = alloca i8, i64 %a, align 16 diff --git a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll index 664569e3c525c..28bef0de48166 100644 --- a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll +++ b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll @@ -1,42 +1,103 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: not llc < %s -mtriple=nvptx -mattr=+ptx72 -mcpu=sm_52 2>&1 | FileCheck %s --check-prefixes=CHECK-FAILS ; RUN: not llc < %s -mtriple=nvptx -mattr=+ptx73 -mcpu=sm_50 2>&1 | FileCheck %s --check-prefixes=CHECK-FAILS -; RUN: llc < %s -mtriple=nvptx -mattr=+ptx73 -mcpu=sm_52 | FileCheck %s --check-prefixes=CHECK,CHECK-32 -; RUN: llc < %s -mtriple=nvptx64 -mattr=+ptx73 -mcpu=sm_52 | FileCheck %s --check-prefixes=CHECK,CHECK-64 +; RUN: llc < %s -mtriple=nvptx -mattr=+ptx73 -mcpu=sm_52 | FileCheck %s --check-prefixes=CHECK-32 +; RUN: llc < %s -mtriple=nvptx64 -mattr=+ptx73 -mcpu=sm_52 | FileCheck %s --check-prefixes=CHECK-64 ; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mattr=+ptx73 -mcpu=sm_52 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mattr=+ptx73 -mcpu=sm_52 | %ptxas-verify %} ; CHECK-FAILS: in function test_dynamic_stackalloc{{.*}}: Support for dynamic alloca introduced in PTX ISA version 7.3 and requires target sm_52. 
-; CHECK-LABEL: .visible .func (.param .b32 func_retval0) test_dynamic_stackalloc( -; CHECK-NOT: __local_depot - -; CHECK-32: ld.param.b32 %r[[SIZE:[0-9]]], [test_dynamic_stackalloc_param_0]; -; CHECK-32-NEXT: add.s32 %r[[SIZE2:[0-9]]], %r[[SIZE]], 7; -; CHECK-32-NEXT: and.b32 %r[[SIZE3:[0-9]]], %r[[SIZE2]], -8; -; CHECK-32-NEXT: alloca.u32 %r[[ALLOCA:[0-9]]], %r[[SIZE3]], 16; -; CHECK-32-NEXT: cvta.local.u32 %r[[ALLOCA]], %r[[ALLOCA]]; -; CHECK-32-NEXT: { // callseq 0, 0 -; CHECK-32-NEXT: .param .b32 param0; -; CHECK-32-NEXT: st.param.b32 [param0], %r[[ALLOCA]]; - -; CHECK-64: ld.param.b64 %rd[[SIZE:[0-9]]], [test_dynamic_stackalloc_param_0]; -; CHECK-64-NEXT: add.s64 %rd[[SIZE2:[0-9]]], %rd[[SIZE]], 7; -; CHECK-64-NEXT: and.b64 %rd[[SIZE3:[0-9]]], %rd[[SIZE2]], -8; -; CHECK-64-NEXT: alloca.u64 %rd[[ALLOCA:[0-9]]], %rd[[SIZE3]], 16; -; CHECK-64-NEXT: cvta.local.u64 %rd[[ALLOCA]], %rd[[ALLOCA]]; -; CHECK-64-NEXT: { // callseq 0, 0 -; CHECK-64-NEXT: .param .b64 param0; -; CHECK-64-NEXT: st.param.b64 [param0], %rd[[ALLOCA]]; - -; CHECK-NEXT: .param .b32 retval0; -; CHECK-NEXT: call.uni (retval0), -; CHECK-NEXT: bar, - define i32 @test_dynamic_stackalloc(i64 %n) { +; CHECK-32-LABEL: test_dynamic_stackalloc( +; CHECK-32: { +; CHECK-32-NEXT: .reg .b32 %r<8>; +; CHECK-32-EMPTY: +; CHECK-32-NEXT: // %bb.0: +; CHECK-32-NEXT: ld.param.b32 %r1, [test_dynamic_stackalloc_param_0]; +; CHECK-32-NEXT: add.s32 %r2, %r1, 7; +; CHECK-32-NEXT: and.b32 %r3, %r2, -8; +; CHECK-32-NEXT: alloca.u32 %r4, %r3, 16; +; CHECK-32-NEXT: cvta.local.u32 %r5, %r4; +; CHECK-32-NEXT: { // callseq 0, 0 +; CHECK-32-NEXT: .param .b32 param0; +; CHECK-32-NEXT: st.param.b32 [param0], %r5; +; CHECK-32-NEXT: .param .b32 retval0; +; CHECK-32-NEXT: call.uni (retval0), +; CHECK-32-NEXT: bar, +; CHECK-32-NEXT: ( +; CHECK-32-NEXT: param0 +; CHECK-32-NEXT: ); +; CHECK-32-NEXT: ld.param.b32 %r6, [retval0]; +; CHECK-32-NEXT: } // callseq 0 +; CHECK-32-NEXT: st.param.b32 [func_retval0], %r6; +; CHECK-32-NEXT: ret; 
+; +; CHECK-64-LABEL: test_dynamic_stackalloc( +; CHECK-64: { +; CHECK-64-NEXT: .reg .b32 %r<3>; +; CHECK-64-NEXT: .reg .b64 %rd<6>; +; CHECK-64-EMPTY: +; CHECK-64-NEXT: // %bb.0: +; CHECK-64-NEXT: ld.param.b64 %rd1, [test_dynamic_stackalloc_param_0]; +; CHECK-64-NEXT: add.s64 %rd2, %rd1, 7; +; CHECK-64-NEXT: and.b64 %rd3, %rd2, -8; +; CHECK-64-NEXT: alloca.u64 %rd4, %rd3, 16; +; CHECK-64-NEXT: cvta.local.u64 %rd5, %rd4; +; CHECK-64-NEXT: { // callseq 0, 0 +; CHECK-64-NEXT: .param .b64 param0; +; CHECK-64-NEXT: st.param.b64 [param0], %rd5; +; CHECK-64-NEXT: .param .b32 retval0; +; CHECK-64-NEXT: call.uni (retval0), +; CHECK-64-NEXT: bar, +; CHECK-64-NEXT: ( +; CHECK-64-NEXT: param0 +; CHECK-64-NEXT: ); +; CHECK-64-NEXT: ld.param.b32 %r1, [retval0]; +; CHECK-64-NEXT: } // callseq 0 +; CHECK-64-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-64-NEXT: ret; %alloca = alloca i8, i64 %n, align 16 %call = call i32 @bar(ptr %alloca) ret i32 %call } +define float @test_dynamic_stackalloc_unaligned(i64 %0) { +; CHECK-32-LABEL: test_dynamic_stackalloc_unaligned( +; CHECK-32: { +; CHECK-32-NEXT: .reg .b32 %r<6>; +; CHECK-32-NEXT: .reg .b32 %f<2>; +; CHECK-32-EMPTY: +; CHECK-32-NEXT: // %bb.0: +; CHECK-32-NEXT: ld.param.b32 %r1, [test_dynamic_stackalloc_unaligned_param_0]; +; CHECK-32-NEXT: shl.b32 %r2, %r1, 2; +; CHECK-32-NEXT: add.s32 %r3, %r2, 7; +; CHECK-32-NEXT: and.b32 %r4, %r3, -8; +; CHECK-32-NEXT: alloca.u32 %r5, %r4, 8; +; CHECK-32-NEXT: ld.local.b32 %f1, [%r5]; +; CHECK-32-NEXT: st.param.b32 [func_retval0], %f1; +; CHECK-32-NEXT: ret; +; +; CHECK-64-LABEL: test_dynamic_stackalloc_unaligned( +; CHECK-64: { +; CHECK-64-NEXT: .reg .b32 %f<2>; +; CHECK-64-NEXT: .reg .b64 %rd<6>; +; CHECK-64-EMPTY: +; CHECK-64-NEXT: // %bb.0: +; CHECK-64-NEXT: ld.param.b64 %rd1, [test_dynamic_stackalloc_unaligned_param_0]; +; CHECK-64-NEXT: shl.b64 %rd2, %rd1, 2; +; CHECK-64-NEXT: add.s64 %rd3, %rd2, 7; +; CHECK-64-NEXT: and.b64 %rd4, %rd3, -8; +; CHECK-64-NEXT: alloca.u64 %rd5, %rd4, 8; 
+; CHECK-64-NEXT: ld.local.b32 %f1, [%rd5]; +; CHECK-64-NEXT: st.param.b32 [func_retval0], %f1; +; CHECK-64-NEXT: ret; + %4 = alloca float, i64 %0, align 4 + %5 = getelementptr float, ptr %4, i64 0 + %6 = load float, ptr %5, align 4 + ret float %6 +} + declare i32 @bar(ptr) +