From 01d58d91559fb73cc6198e229a12749532fc3834 Mon Sep 17 00:00:00 2001
From: Alex Maclean
Date: Wed, 11 Jun 2025 18:19:37 +0000
Subject: [PATCH 1/2] pre-commit tests

---
 llvm/test/CodeGen/NVPTX/bug26185-2.ll | 24 +++++++--
 llvm/test/CodeGen/NVPTX/bug26185.ll   | 77 ++++++++++++++++++++++-----
 2 files changed, 86 insertions(+), 15 deletions(-)

diff --git a/llvm/test/CodeGen/NVPTX/bug26185-2.ll b/llvm/test/CodeGen/NVPTX/bug26185-2.ll
index c4d1537557cad..2778f59d50fd3 100644
--- a/llvm/test/CodeGen/NVPTX/bug26185-2.ll
+++ b/llvm/test/CodeGen/NVPTX/bug26185-2.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck %s
 ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
 
@@ -10,14 +11,31 @@ target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 
-; CHECK-LABEL: spam
 define ptx_kernel void @spam(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture %arg1, i64 %arg2, i64 %arg3) #0 {
+; CHECK-LABEL: spam(
+; CHECK: .maxntid 1, 1, 1
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %bb
+; CHECK-NEXT: ld.param.b64 %rd1, [spam_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [spam_param_3];
+; CHECK-NEXT: shl.b64 %rd3, %rd2, 1;
+; CHECK-NEXT: add.s64 %rd4, %rd1, %rd3;
+; CHECK-NEXT: ld.param.b64 %rd5, [spam_param_1];
+; CHECK-NEXT: ld.global.nc.b16 %rs1, [%rd4+16];
+; CHECK-NEXT: cvt.s32.s16 %r1, %rs1;
+; CHECK-NEXT: mul.wide.s32 %rd6, %r1, %r1;
+; CHECK-NEXT: ld.global.b64 %rd7, [%rd5];
+; CHECK-NEXT: add.s64 %rd8, %rd6, %rd7;
+; CHECK-NEXT: st.global.b64 [%rd5], %rd8;
+; CHECK-NEXT: ret;
 bb:
   %tmp5 = add nsw i64 %arg3, 8
   %tmp6 = getelementptr i16, ptr addrspace(1) %arg, i64 %tmp5
-; CHECK: ld.global.nc.b16
   %tmp7 = load i16, ptr addrspace(1) %tmp6, align 2
-; CHECK: cvt.s32.s16
   %tmp8 = sext i16 %tmp7 to i64
   %tmp9 = mul nsw i64 %tmp8, %tmp8
   %tmp10 = load i64, ptr addrspace(1) %arg1, align 8
diff --git a/llvm/test/CodeGen/NVPTX/bug26185.ll b/llvm/test/CodeGen/NVPTX/bug26185.ll
index 3b30ce560edbc..1663d335c7724 100644
--- a/llvm/test/CodeGen/NVPTX/bug26185.ll
+++ b/llvm/test/CodeGen/NVPTX/bug26185.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck %s
 ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
 
@@ -7,45 +8,97 @@ target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-unknown-unknown"
 
-; CHECK-LABEL: ex_zext
 define ptx_kernel void @ex_zext(ptr noalias readonly %data, ptr %res) {
+; CHECK-LABEL: ex_zext(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd1, [ex_zext_param_0];
+; CHECK-NEXT: cvta.to.global.u64 %rd2, %rd1;
+; CHECK-NEXT: ld.param.b64 %rd3, [ex_zext_param_1];
+; CHECK-NEXT: cvta.to.global.u64 %rd4, %rd3;
+; CHECK-NEXT: ld.global.nc.b8 %rs1, [%rd2];
+; CHECK-NEXT: cvt.u32.u8 %r1, %rs1;
+; CHECK-NEXT: st.global.b32 [%rd4], %r1;
+; CHECK-NEXT: ret;
 entry:
-; CHECK: ld.global.nc.b8
   %val = load i8, ptr %data
-; CHECK: cvt.u32.u8
   %valext = zext i8 %val to i32
   store i32 %valext, ptr %res
   ret void
 }
 
-; CHECK-LABEL: ex_sext
 define ptx_kernel void @ex_sext(ptr noalias readonly %data, ptr %res) {
+; CHECK-LABEL: ex_sext(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd1, [ex_sext_param_0];
+; CHECK-NEXT: cvta.to.global.u64 %rd2, %rd1;
+; CHECK-NEXT: ld.param.b64 %rd3, [ex_sext_param_1];
+; CHECK-NEXT: cvta.to.global.u64 %rd4, %rd3;
+; CHECK-NEXT: ld.global.nc.b8 %rs1, [%rd2];
+; CHECK-NEXT: cvt.s32.s8 %r1, %rs1;
+; CHECK-NEXT: st.global.b32 [%rd4], %r1;
+; CHECK-NEXT: ret;
 entry:
-; CHECK: ld.global.nc.b8
   %val = load i8, ptr %data
-; CHECK: cvt.s32.s8
   %valext = sext i8 %val to i32
   store i32 %valext, ptr %res
   ret void
 }
 
-; CHECK-LABEL: ex_zext_v2
 define ptx_kernel void @ex_zext_v2(ptr noalias readonly %data, ptr %res) {
+; CHECK-LABEL: ex_zext_v2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd1, [ex_zext_v2_param_0];
+; CHECK-NEXT: cvta.to.global.u64 %rd2, %rd1;
+; CHECK-NEXT: ld.param.b64 %rd3, [ex_zext_v2_param_1];
+; CHECK-NEXT: cvta.to.global.u64 %rd4, %rd3;
+; CHECK-NEXT: ld.global.nc.v2.b8 {%rs1, %rs2}, [%rd2];
+; CHECK-NEXT: cvt.u32.u16 %r1, %rs2;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs1;
+; CHECK-NEXT: st.global.v2.b32 [%rd4], {%r2, %r1};
+; CHECK-NEXT: ret;
 entry:
-; CHECK: ld.global.nc.v2.b8
   %val = load <2 x i8>, ptr %data
-; CHECK: cvt.u32.u16
   %valext = zext <2 x i8> %val to <2 x i32>
   store <2 x i32> %valext, ptr %res
   ret void
 }
 
-; CHECK-LABEL: ex_sext_v2
 define ptx_kernel void @ex_sext_v2(ptr noalias readonly %data, ptr %res) {
+; CHECK-LABEL: ex_sext_v2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd1, [ex_sext_v2_param_0];
+; CHECK-NEXT: cvta.to.global.u64 %rd2, %rd1;
+; CHECK-NEXT: ld.param.b64 %rd3, [ex_sext_v2_param_1];
+; CHECK-NEXT: cvta.to.global.u64 %rd4, %rd3;
+; CHECK-NEXT: ld.global.nc.v2.b8 {%rs1, %rs2}, [%rd2];
+; CHECK-NEXT: cvt.u32.u16 %r1, %rs2;
+; CHECK-NEXT: cvt.s32.s8 %r2, %r1;
+; CHECK-NEXT: cvt.u32.u16 %r3, %rs1;
+; CHECK-NEXT: cvt.s32.s8 %r4, %r3;
+; CHECK-NEXT: st.global.v2.b32 [%rd4], {%r4, %r2};
+; CHECK-NEXT: ret;
 entry:
-; CHECK: ld.global.nc.v2.b8
   %val = load <2 x i8>, ptr %data
-; CHECK: cvt.s32.s8
   %valext = sext <2 x i8> %val to <2 x i32>
   store <2 x i32> %valext, ptr %res
   ret void

From 706c502ad40c110dee5fda5eab097a4f5372c850 Mon Sep 17 00:00:00 2001
From: Alex Maclean
Date: Wed, 11 Jun 2025 19:44:25 +0000
Subject: [PATCH 2/2] [NVPTX] Cleanup ld/st lowering

---
 llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp  | 450 +++++++------------
 llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h    |   3 +-
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td      |   4 -
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td     |  94 ++--
 llvm/test/CodeGen/NVPTX/bug26185-2.ll        |   4 +-
 llvm/test/CodeGen/NVPTX/bug26185.ll          |   8 +-
 llvm/test/CodeGen/NVPTX/i1-ext-load.ll       |   4 +-
 llvm/test/CodeGen/NVPTX/ldu-ldg.ll           |   8 +-
 llvm/test/CodeGen/NVPTX/variadics-backend.ll |  19 +-
 9 files changed, 229 insertions(+), 365 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 79b1bfbc8072b..ff10eea371049 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -136,7 +136,7 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
     break;
   case NVPTXISD::LDUV2:
   case NVPTXISD::LDUV4:
-    if (tryLDGLDU(N))
+    if (tryLDU(N))
       return;
     break;
   case NVPTXISD::StoreV2:
@@ -324,7 +324,7 @@ bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
   case Intrinsic::nvvm_ldu_global_f:
   case Intrinsic::nvvm_ldu_global_i:
   case Intrinsic::nvvm_ldu_global_p:
-    return tryLDGLDU(N);
+    return tryLDU(N);
 
   case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
   case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
@@ -1048,35 +1048,28 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
   assert(LD->readMem() && "Expected load");
 
   // do not support pre/post inc/dec
-  LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
+  const LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(LD);
   if (PlainLoad && PlainLoad->isIndexed())
     return false;
 
-  EVT LoadedVT = LD->getMemoryVT();
-  if (!LoadedVT.isSimple())
+  const EVT LoadedEVT = LD->getMemoryVT();
+  if (!LoadedEVT.isSimple())
     return false;
+  const MVT LoadedVT = LoadedEVT.getSimpleVT();
 
   // Address Space Setting
   const unsigned CodeAddrSpace = getCodeAddrSpace(LD);
   if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace))
-    return tryLDGLDU(N);
+    return tryLDG(LD);
 
-  SDLoc DL(N);
+  SDLoc DL(LD);
   SDValue Chain = N->getOperand(0);
-  auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);
+  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);
 
-  // Type Setting: fromType + fromTypeWidth
-  //
-  // Sign   : ISD::SEXTLOAD
-  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
-  //          type is integer
-  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
-  MVT SimpleVT = LoadedVT.getSimpleVT();
-  // Read at least 8 bits (predicates are stored as 8-bit values)
-  unsigned FromTypeWidth = std::max(8U, (unsigned)SimpleVT.getSizeInBits());
+  const unsigned FromTypeWidth = LoadedVT.getSizeInBits();
 
   // Vector Setting
-  unsigned int FromType =
+  const unsigned FromType =
       (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
           ? NVPTX::PTXLdStInstCode::Signed
          : NVPTX::PTXLdStInstCode::Untyped;
@@ -1102,29 +1095,17 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
   if (!Opcode)
     return false;
 
-  SDNode *NVPTXLD =
-      CurDAG->getMachineNode(*Opcode, DL, TargetVT, MVT::Other, Ops);
+  SDNode *NVPTXLD = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
   if (!NVPTXLD)
     return false;
 
-  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
+  MachineMemOperand *MemRef = LD->getMemOperand();
   CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});
 
-  ReplaceNode(N, NVPTXLD);
+  ReplaceNode(LD, NVPTXLD);
   return true;
 }
 
-static bool isSubVectorPackedInI32(EVT EltVT) {
-  // Despite vectors like v8i8, v16i8, v8i16 being within the bit-limit for
-  // total load/store size, PTX syntax only supports v2/v4. Thus, we can't use
-  // vectorized loads/stores with the actual element type for i8/i16 as that
-  // would require v8/v16 variants that do not exist.
-  // In order to load/store such vectors efficiently, in Type Legalization
-  // we split the vector into word-sized chunks (v2x16/v4i8). Now, we will
-  // lower to PTX as vectors of b32.
-  return Isv2x16VT(EltVT) || EltVT == MVT::v4i8;
-}
-
 static unsigned getLoadStoreVectorNumElts(SDNode *N) {
   switch (N->getOpcode()) {
   case NVPTXISD::LoadV2:
@@ -1142,21 +1123,21 @@ static unsigned getLoadStoreVectorNumElts(SDNode *N) {
 }
 
 bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
-  MemSDNode *MemSD = cast<MemSDNode>(N);
-  const EVT MemEVT = MemSD->getMemoryVT();
+  MemSDNode *LD = cast<MemSDNode>(N);
+  const EVT MemEVT = LD->getMemoryVT();
   if (!MemEVT.isSimple())
     return false;
   const MVT MemVT = MemEVT.getSimpleVT();
 
   // Address Space Setting
-  const unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
-  if (canLowerToLDG(*MemSD, *Subtarget, CodeAddrSpace))
-    return tryLDGLDU(N);
+  const unsigned CodeAddrSpace = getCodeAddrSpace(LD);
+  if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace))
+    return tryLDG(LD);
 
-  EVT EltVT = N->getValueType(0);
-  SDLoc DL(N);
-  SDValue Chain = N->getOperand(0);
-  auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, MemSD);
+  const MVT EltVT = LD->getSimpleValueType(0);
+  SDLoc DL(LD);
+  SDValue Chain = LD->getChain();
+  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);
 
   // Type Setting: fromType + fromTypeWidth
   //
@@ -1167,18 +1148,15 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
   // Read at least 8 bits (predicates are stored as 8-bit values)
   // The last operand holds the original LoadSDNode::getExtensionType() value
   const unsigned TotalWidth = MemVT.getSizeInBits();
-  unsigned ExtensionType = N->getConstantOperandVal(N->getNumOperands() - 1);
-  unsigned FromType = (ExtensionType == ISD::SEXTLOAD)
-                          ? NVPTX::PTXLdStInstCode::Signed
-                          : NVPTX::PTXLdStInstCode::Untyped;
+  const unsigned ExtensionType =
+      N->getConstantOperandVal(N->getNumOperands() - 1);
+  const unsigned FromType = (ExtensionType == ISD::SEXTLOAD)
+                                ? NVPTX::PTXLdStInstCode::Signed
+                                : NVPTX::PTXLdStInstCode::Untyped;
 
-  unsigned FromTypeWidth = TotalWidth / getLoadStoreVectorNumElts(N);
-
-  if (isSubVectorPackedInI32(EltVT)) {
-    assert(ExtensionType == ISD::NON_EXTLOAD);
-    EltVT = MVT::i32;
-  }
+  const unsigned FromTypeWidth = TotalWidth / getLoadStoreVectorNumElts(N);
 
+  assert(!(EltVT.isVector() && ExtensionType != ISD::NON_EXTLOAD));
   assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 &&
          FromTypeWidth <= 128 && TotalWidth <= 256 && "Invalid width for load");
@@ -1196,192 +1174,183 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
   std::optional<unsigned> Opcode;
   switch (N->getOpcode()) {
   default:
-    return false;
+    llvm_unreachable("Unexpected opcode");
   case NVPTXISD::LoadV2:
-    Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2,
-                             NVPTX::LDV_i16_v2, NVPTX::LDV_i32_v2,
-                             NVPTX::LDV_i64_v2);
+    Opcode =
+        pickOpcodeForVT(EltVT.SimpleTy, NVPTX::LDV_i8_v2, NVPTX::LDV_i16_v2,
+                        NVPTX::LDV_i32_v2, NVPTX::LDV_i64_v2);
     break;
   case NVPTXISD::LoadV4:
-    Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4,
-                             NVPTX::LDV_i16_v4, NVPTX::LDV_i32_v4,
-                             NVPTX::LDV_i64_v4);
+    Opcode =
+        pickOpcodeForVT(EltVT.SimpleTy, NVPTX::LDV_i8_v4, NVPTX::LDV_i16_v4,
+                        NVPTX::LDV_i32_v4, NVPTX::LDV_i64_v4);
    break;
   case NVPTXISD::LoadV8:
-    Opcode =
-        pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, {/* no v8i8 */},
-                        {/* no v8i16 */}, NVPTX::LDV_i32_v8, {/* no v8i64 */});
+    Opcode = pickOpcodeForVT(EltVT.SimpleTy, {/* no v8i8 */}, {/* no v8i16 */},
+                             NVPTX::LDV_i32_v8, {/* no v8i64 */});
     break;
   }
   if (!Opcode)
     return false;
 
-  SDNode *LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
+  SDNode *NVPTXLD = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
 
-  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
-  CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});
+  MachineMemOperand *MemRef = LD->getMemOperand();
+  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});
 
-  ReplaceNode(N, LD);
+  ReplaceNode(LD, NVPTXLD);
   return true;
 }
 
-bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
-  auto *Mem = cast<MemSDNode>(N);
-
-  // If this is an LDG intrinsic, the address is the third operand. If its an
-  // LDG/LDU SD node (from custom vector handling), then its the second operand
-  SDValue Op1 = N->getOperand(N->getOpcode() == ISD::INTRINSIC_W_CHAIN ? 2 : 1);
+bool NVPTXDAGToDAGISel::tryLDG(MemSDNode *LD) {
+  const EVT LoadedEVT = LD->getMemoryVT();
+  if (!LoadedEVT.isSimple())
+    return false;
+  const MVT LoadedVT = LoadedEVT.getSimpleVT();
 
-  const EVT OrigType = N->getValueType(0);
-  EVT EltVT = Mem->getMemoryVT();
-  unsigned NumElts = 1;
+  SDLoc DL(LD);
 
-  if (EltVT == MVT::i128 || EltVT == MVT::f128) {
-    EltVT = MVT::i64;
-    NumElts = 2;
-  }
-  if (EltVT.isVector()) {
-    NumElts = EltVT.getVectorNumElements();
-    EltVT = EltVT.getVectorElementType();
-    // vectors of 8/16bits type are loaded/stored as multiples of v4i8/v2x16
-    // elements.
-    if ((EltVT == MVT::f16 && OrigType == MVT::v2f16) ||
-        (EltVT == MVT::bf16 && OrigType == MVT::v2bf16) ||
-        (EltVT == MVT::i16 && OrigType == MVT::v2i16) ||
-        (EltVT == MVT::i8 && OrigType == MVT::v4i8)) {
-      assert(NumElts % OrigType.getVectorNumElements() == 0 &&
-             "NumElts must be divisible by the number of elts in subvectors");
-      EltVT = OrigType;
-      NumElts /= OrigType.getVectorNumElements();
-    }
+  const unsigned TotalWidth = LoadedVT.getSizeInBits();
+  unsigned ExtensionType;
+  unsigned NumElts;
+  if (const auto *Load = dyn_cast<LoadSDNode>(LD)) {
+    ExtensionType = Load->getExtensionType();
+    NumElts = 1;
+  } else {
+    ExtensionType = LD->getConstantOperandVal(LD->getNumOperands() - 1);
+    NumElts = getLoadStoreVectorNumElts(LD);
   }
+  const unsigned FromType = (ExtensionType == ISD::SEXTLOAD)
+                                ? NVPTX::PTXLdStInstCode::Signed
+                                : NVPTX::PTXLdStInstCode::Untyped;
 
-  // Build the "promoted" result VTList for the load. If we are really loading
-  // i8s, then the return type will be promoted to i16 since we do not expose
-  // 8-bit registers in NVPTX.
-  const EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
-  SmallVector<EVT> InstVTs;
-  InstVTs.append(NumElts, NodeVT);
-  InstVTs.push_back(MVT::Other);
-  SDVTList InstVTList = CurDAG->getVTList(InstVTs);
-  SDValue Chain = N->getOperand(0);
+  const unsigned FromTypeWidth = TotalWidth / NumElts;
+
+  assert(!(LD->getSimpleValueType(0).isVector() &&
+           ExtensionType != ISD::NON_EXTLOAD));
+  assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 &&
+         FromTypeWidth <= 128 && TotalWidth <= 256 && "Invalid width for load");
 
   SDValue Base, Offset;
-  SelectADDR(Op1, Base, Offset);
-  SDValue Ops[] = {Base, Offset, Chain};
+  SelectADDR(LD->getOperand(1), Base, Offset);
+  SDValue Ops[] = {getI32Imm(FromType, DL), getI32Imm(FromTypeWidth, DL), Base,
+                   Offset, LD->getChain()};
 
+  const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
   std::optional<unsigned> Opcode;
   switch (LD->getOpcode()) {
   default:
-    return false;
+    llvm_unreachable("Unexpected opcode");
   case ISD::LOAD:
-    Opcode = pickOpcodeForVT(
-        EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_GLOBAL_i8,
-        NVPTX::INT_PTX_LDG_GLOBAL_i16, NVPTX::INT_PTX_LDG_GLOBAL_i32,
-        NVPTX::INT_PTX_LDG_GLOBAL_i64);
-    break;
-  case ISD::INTRINSIC_W_CHAIN:
-    Opcode = pickOpcodeForVT(
-        EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_GLOBAL_i8,
-        NVPTX::INT_PTX_LDU_GLOBAL_i16, NVPTX::INT_PTX_LDU_GLOBAL_i32,
-        NVPTX::INT_PTX_LDU_GLOBAL_i64);
+    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_GLOBAL_NC_i8,
+                             NVPTX::LD_GLOBAL_NC_i16, NVPTX::LD_GLOBAL_NC_i32,
+                             NVPTX::LD_GLOBAL_NC_i64);
     break;
   case NVPTXISD::LoadV2:
     Opcode = pickOpcodeForVT(
-        EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v2i8_ELE,
-        NVPTX::INT_PTX_LDG_G_v2i16_ELE, NVPTX::INT_PTX_LDG_G_v2i32_ELE,
-        NVPTX::INT_PTX_LDG_G_v2i64_ELE);
-    break;
-  case NVPTXISD::LDUV2:
-    Opcode = pickOpcodeForVT(
-        EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v2i8_ELE,
-        NVPTX::INT_PTX_LDU_G_v2i16_ELE, NVPTX::INT_PTX_LDU_G_v2i32_ELE,
-        NVPTX::INT_PTX_LDU_G_v2i64_ELE);
+        TargetVT, NVPTX::LD_GLOBAL_NC_v2i8, NVPTX::LD_GLOBAL_NC_v2i16,
+        NVPTX::LD_GLOBAL_NC_v2i32, NVPTX::LD_GLOBAL_NC_v2i64);
     break;
   case NVPTXISD::LoadV4:
     Opcode = pickOpcodeForVT(
-        EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE,
-        NVPTX::INT_PTX_LDG_G_v4i16_ELE, NVPTX::INT_PTX_LDG_G_v4i32_ELE,
-        NVPTX::INT_PTX_LDG_G_v4i64_ELE);
-    break;
-  case NVPTXISD::LDUV4:
-    Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
-                             NVPTX::INT_PTX_LDU_G_v4i8_ELE,
-                             NVPTX::INT_PTX_LDU_G_v4i16_ELE,
-                             NVPTX::INT_PTX_LDU_G_v4i32_ELE, {/* no v4i64 */});
+        TargetVT, NVPTX::LD_GLOBAL_NC_v4i8, NVPTX::LD_GLOBAL_NC_v4i16,
+        NVPTX::LD_GLOBAL_NC_v4i32, NVPTX::LD_GLOBAL_NC_v4i64);
     break;
   case NVPTXISD::LoadV8:
-    Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, {/* no v8i8 */},
-                             {/* no v8i16 */}, NVPTX::INT_PTX_LDG_G_v8i32_ELE,
-                             {/* no v8i64 */});
+    Opcode = pickOpcodeForVT(TargetVT, {/* no v8i8 */}, {/* no v8i16 */},
+                             NVPTX::LD_GLOBAL_NC_v8i32, {/* no v8i64 */});
     break;
   }
   if (!Opcode)
     return false;
 
-  SDLoc DL(N);
-  SDNode *LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
+  SDNode *NVPTXLDG = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
 
-  // For automatic generation of LDG (through SelectLoad[Vector], not the
-  // intrinsics), we may have an extending load like:
-  //
-  //   i32,ch = load<LD1[%data(addrspace=1)], zext from i8> t0, t7, undef:i64
-  //
-  // In this case, the matching logic above will select a load for the original
-  // memory type (in this case, i8) and our types will not match (the node needs
-  // to return an i32 in this case). Our LDG/LDU nodes do not support the
-  // concept of sign-/zero-extension, so emulate it here by adding an explicit
-  // CVT instruction. Ptxas should clean up any redundancies here.
-
-  LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);
-
-  if (OrigType != EltVT &&
-      (LdNode || (OrigType.isFloatingPoint() && EltVT.isFloatingPoint()))) {
-    // We have an extending-load. The instruction we selected operates on the
-    // smaller type, but the SDNode we are replacing has the larger type. We
-    // need to emit a CVT to make the types match.
-    unsigned CvtOpc =
-        GetConvertOpcode(OrigType.getSimpleVT(), EltVT.getSimpleVT(), LdNode);
-
-    // For each output value, apply the manual sign/zero-extension and make sure
-    // all users of the load go through that CVT.
-    for (unsigned i = 0; i != NumElts; ++i) {
-      SDValue Res(LD, i);
-      SDValue OrigVal(N, i);
-
-      SDNode *CvtNode =
-          CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
-                                 CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
-                                                           DL, MVT::i32));
-      ReplaceUses(OrigVal, SDValue(CvtNode, 0));
-    }
   }
 
-  ReplaceNode(N, LD);
+  ReplaceNode(LD, NVPTXLDG);
+  return true;
+}
+
+bool NVPTXDAGToDAGISel::tryLDU(SDNode *N) {
+  auto *LD = cast<MemSDNode>(N);
+
+  unsigned NumElts;
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Unexpected opcode");
+  case ISD::INTRINSIC_W_CHAIN:
+    NumElts = 1;
+    break;
+  case NVPTXISD::LDUV2:
+    NumElts = 2;
+    break;
+  case NVPTXISD::LDUV4:
+    NumElts = 4;
+    break;
+  }
+
+  const MVT::SimpleValueType SelectVT =
+      MVT::getIntegerVT(LD->getMemoryVT().getSizeInBits() / NumElts).SimpleTy;
+
+  // If this is an LDU intrinsic, the address is the third operand. If it's an
+  // LDU SD node (from custom vector handling), then it's the second operand.
+  SDValue Addr =
+      LD->getOperand(LD->getOpcode() == ISD::INTRINSIC_W_CHAIN ? 2 : 1);
+
+  SDValue Base, Offset;
+  SelectADDR(Addr, Base, Offset);
+  SDValue Ops[] = {Base, Offset, LD->getChain()};
+
+  std::optional<unsigned> Opcode;
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Unexpected opcode");
+  case ISD::INTRINSIC_W_CHAIN:
+    Opcode =
+        pickOpcodeForVT(SelectVT, NVPTX::LDU_GLOBAL_i8, NVPTX::LDU_GLOBAL_i16,
+                        NVPTX::LDU_GLOBAL_i32, NVPTX::LDU_GLOBAL_i64);
+    break;
+  case NVPTXISD::LDUV2:
+    Opcode = pickOpcodeForVT(SelectVT, NVPTX::LDU_GLOBAL_v2i8,
+                             NVPTX::LDU_GLOBAL_v2i16, NVPTX::LDU_GLOBAL_v2i32,
+                             NVPTX::LDU_GLOBAL_v2i64);
+    break;
+  case NVPTXISD::LDUV4:
+    Opcode = pickOpcodeForVT(SelectVT, NVPTX::LDU_GLOBAL_v4i8,
+                             NVPTX::LDU_GLOBAL_v4i16, NVPTX::LDU_GLOBAL_v4i32,
+                             {/* no v4i64 */});
+    break;
+  }
+  if (!Opcode)
+    return false;
+
+  SDLoc DL(N);
+  SDNode *NVPTXLDU = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
+
+  ReplaceNode(LD, NVPTXLDU);
   return true;
 }
 
 bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
   MemSDNode *ST = cast<MemSDNode>(N);
   assert(ST->writeMem() && "Expected store");
-  StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N);
-  AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N);
+  StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(ST);
+  AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(ST);
   assert((PlainStore || AtomicStore) && "Expected store");
 
   // do not support pre/post inc/dec
   if (PlainStore && PlainStore->isIndexed())
     return false;
 
-  EVT StoreVT = ST->getMemoryVT();
+  const EVT StoreVT = ST->getMemoryVT();
   if (!StoreVT.isSimple())
     return false;
 
   // Address Space Setting
-  unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
+  const unsigned CodeAddrSpace = getCodeAddrSpace(ST);
 
-  SDLoc DL(N);
+  SDLoc DL(ST);
   SDValue Chain = ST->getChain();
-  auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
+  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
 
   // Vector Setting
   const unsigned ToTypeWidth = StoreVT.getSimpleVT().getSizeInBits();
@@ -1417,85 +1386,78 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
   if (!NVPTXST)
     return false;
 
-  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
+  MachineMemOperand *MemRef = ST->getMemOperand();
   CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
 
-  ReplaceNode(N, NVPTXST);
+  ReplaceNode(ST, NVPTXST);
   return true;
 }
 
 bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
-  SDValue Op1 = N->getOperand(1);
-  EVT EltVT = Op1.getValueType();
-  MemSDNode *MemSD = cast<MemSDNode>(N);
-  EVT StoreVT = MemSD->getMemoryVT();
+  MemSDNode *ST = cast<MemSDNode>(N);
+  const EVT StoreVT = ST->getMemoryVT();
   assert(StoreVT.isSimple() && "Store value is not simple");
 
   // Address Space Setting
-  unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
+  const unsigned CodeAddrSpace = getCodeAddrSpace(ST);
   if (CodeAddrSpace == NVPTX::AddressSpace::Const) {
     report_fatal_error("Cannot store to pointer that points to constant "
                        "memory space");
   }
 
-  SDLoc DL(N);
-  SDValue Chain = N->getOperand(0);
-  auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, MemSD);
+  SDLoc DL(ST);
+  SDValue Chain = ST->getChain();
+  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
 
   // Type Setting: toType + toTypeWidth
   //  - for integer type, always use 'u'
   const unsigned TotalWidth = StoreVT.getSimpleVT().getSizeInBits();
 
-  unsigned NumElts = getLoadStoreVectorNumElts(N);
-
-  SmallVector<SDValue> Ops(N->ops().slice(1, NumElts));
-  SDValue N2 = N->getOperand(NumElts + 1);
-  unsigned ToTypeWidth = TotalWidth / NumElts;
-
-  if (isSubVectorPackedInI32(EltVT)) {
-    EltVT = MVT::i32;
-  }
+  const unsigned NumElts = getLoadStoreVectorNumElts(ST);
 
+  SmallVector<SDValue> Ops(ST->ops().slice(1, NumElts));
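+  // Operands 1..NumElts of the store node are the values being stored; the
+  // operand immediately after them is the address.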
+  SDValue Addr = N->getOperand(NumElts + 1);
+  const unsigned ToTypeWidth = TotalWidth / NumElts;
 
   assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 &&
          ToTypeWidth <= 128 && TotalWidth <= 256 && "Invalid width for store");
 
   SDValue Offset, Base;
-  SelectADDR(N2, Base, Offset);
+  SelectADDR(Addr, Base, Offset);
 
   Ops.append({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
               getI32Imm(CodeAddrSpace, DL),
              getI32Imm(NVPTX::PTXLdStInstCode::Untyped, DL),
              getI32Imm(ToTypeWidth, DL), Base, Offset, Chain});
 
+  const MVT::SimpleValueType EltVT =
+      ST->getOperand(1).getSimpleValueType().SimpleTy;
   std::optional<unsigned> Opcode;
-  switch (N->getOpcode()) {
+  switch (ST->getOpcode()) {
   default:
     return false;
   case NVPTXISD::StoreV2:
-    Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2,
-                             NVPTX::STV_i16_v2, NVPTX::STV_i32_v2,
-                             NVPTX::STV_i64_v2);
+    Opcode = pickOpcodeForVT(EltVT, NVPTX::STV_i8_v2, NVPTX::STV_i16_v2,
+                             NVPTX::STV_i32_v2, NVPTX::STV_i64_v2);
     break;
   case NVPTXISD::StoreV4:
-    Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4,
-                             NVPTX::STV_i16_v4, NVPTX::STV_i32_v4,
-                             NVPTX::STV_i64_v4);
+    Opcode = pickOpcodeForVT(EltVT, NVPTX::STV_i8_v4, NVPTX::STV_i16_v4,
+                             NVPTX::STV_i32_v4, NVPTX::STV_i64_v4);
     break;
   case NVPTXISD::StoreV8:
-    Opcode =
-        pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, {/* no v8i8 */},
-                        {/* no v8i16 */}, NVPTX::STV_i32_v8, {/* no v8i64 */});
+    Opcode = pickOpcodeForVT(EltVT, {/* no v8i8 */}, {/* no v8i16 */},
+                             NVPTX::STV_i32_v8, {/* no v8i64 */});
     break;
   }
 
   if (!Opcode)
     return false;
 
-  SDNode *ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
+  SDNode *NVPTXST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
 
-  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
-  CurDAG->setNodeMemRefs(cast<MachineSDNode>(ST), {MemRef});
+  MachineMemOperand *MemRef = ST->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
 
-  ReplaceNode(N, ST);
+  ReplaceNode(ST, NVPTXST);
   return true;
 }
 
@@ -2285,70 +2247,6 @@ void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) {
   ReplaceNode(N, Mov);
 }
 
-/// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
-/// conversion from \p SrcTy to \p DestTy.
-unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
-                                             LoadSDNode *LdNode) {
-  bool IsSigned = LdNode && LdNode->getExtensionType() == ISD::SEXTLOAD;
-  switch (SrcTy.SimpleTy) {
-  default:
-    llvm_unreachable("Unhandled source type");
-  case MVT::i8:
-    switch (DestTy.SimpleTy) {
-    default:
-      llvm_unreachable("Unhandled dest type");
-    case MVT::i16:
-      return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
-    case MVT::i32:
-      return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
-    case MVT::i64:
-      return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
-    }
-  case MVT::i16:
-    switch (DestTy.SimpleTy) {
-    default:
-      llvm_unreachable("Unhandled dest type");
-    case MVT::i8:
-      return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
-    case MVT::i32:
-      return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
-    case MVT::i64:
-      return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
-    }
-  case MVT::i32:
-    switch (DestTy.SimpleTy) {
-    default:
-      llvm_unreachable("Unhandled dest type");
-    case MVT::i8:
-      return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
-    case MVT::i16:
-      return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
-    case MVT::i64:
-      return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
-    }
-  case MVT::i64:
-    switch (DestTy.SimpleTy) {
-    default:
-      llvm_unreachable("Unhandled dest type");
-    case MVT::i8:
-      return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
-    case MVT::i16:
-      return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
-    case MVT::i32:
-      return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
-    }
-  case MVT::f16:
-    switch (DestTy.SimpleTy) {
-    default:
-      llvm_unreachable("Unhandled dest type");
-    case MVT::f32:
-      return NVPTX::CVT_f32_f16;
-    case MVT::f64:
-      return NVPTX::CVT_f64_f16;
-    }
-  }
-}
-
 bool NVPTXDAGToDAGISel::tryFence(SDNode *N) {
   SDLoc DL(N);
   assert(N->getOpcode() == ISD::ATOMIC_FENCE);
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 473f4781a6c38..ff58e4486a222 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -75,7 +75,8 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
   void SelectTexSurfHandle(SDNode *N);
   bool tryLoad(SDNode *N);
   bool tryLoadVector(SDNode *N);
-  bool tryLDGLDU(SDNode *N);
+  bool tryLDU(SDNode *N);
+  bool tryLDG(MemSDNode *N);
   bool tryStore(SDNode *N);
   bool tryStoreVector(SDNode *N);
   bool tryLoadParam(SDNode *N);
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 4c3501df57f84..5dbdce52f0553 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -135,11 +135,7 @@ def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">;
 def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">;
 def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
 def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">;
-def hasVote : Predicate<"Subtarget->hasVote()">;
-def hasDouble : Predicate<"Subtarget->hasDouble()">;
 def hasClusters : Predicate<"Subtarget->hasClusters()">;
-def hasLDG : Predicate<"Subtarget->hasLDG()">;
-def hasLDU : Predicate<"Subtarget->hasLDU()">;
 def hasPTXASUnreachableBug : Predicate<"Subtarget->hasPTXASUnreachableBug()">;
 def noPTXASUnreachableBug : Predicate<"!Subtarget->hasPTXASUnreachableBug()">;
 def hasOptEnabled : Predicate<"TM.getOptLevel() != CodeGenOptLevel::None">;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index b3c1296cf0ca6..5de3dee1fb344 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -2143,15 +2143,12 @@ defm INT_PTX_SATOM_XOR : ATOM2_bitwise_impl<"xor">;
 
 class LDU_G<string TyStr, NVPTXRegClass regclass>
   : NVPTXInst<(outs regclass:$result), (ins ADDR:$src),
-              "ldu.global." # TyStr # " \t$result, [$src];",
-              []>, Requires<[hasLDU]>;
+              "ldu.global." # TyStr # " \t$result, [$src];", []>;
 
-def INT_PTX_LDU_GLOBAL_i8 : LDU_G<"b8", Int16Regs>;
-def INT_PTX_LDU_GLOBAL_i16 : LDU_G<"b16", Int16Regs>;
-def INT_PTX_LDU_GLOBAL_i32 : LDU_G<"b32", Int32Regs>;
-def INT_PTX_LDU_GLOBAL_i64 : LDU_G<"b64", Int64Regs>;
-def INT_PTX_LDU_GLOBAL_f32 : LDU_G<"b32", Float32Regs>;
-def INT_PTX_LDU_GLOBAL_f64 : LDU_G<"b64", Float64Regs>;
+def LDU_GLOBAL_i8 : LDU_G<"b8", Int16Regs>;
+def LDU_GLOBAL_i16 : LDU_G<"b16", Int16Regs>;
+def LDU_GLOBAL_i32 : LDU_G<"b32", Int32Regs>;
+def LDU_GLOBAL_i64 : LDU_G<"b64", Int64Regs>;
 
 // vector
 
@@ -2168,19 +2165,14 @@ class VLDU_G_ELE_V4<string TyStr, NVPTXRegClass regclass>
             "ldu.global.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
 
-def INT_PTX_LDU_G_v2i8_ELE : VLDU_G_ELE_V2<"b8", Int16Regs>;
-def INT_PTX_LDU_G_v2i16_ELE : VLDU_G_ELE_V2<"b16", Int16Regs>;
-def INT_PTX_LDU_G_v2i32_ELE : VLDU_G_ELE_V2<"b32", Int32Regs>;
-def INT_PTX_LDU_G_v2f32_ELE : VLDU_G_ELE_V2<"b32", Float32Regs>;
-def INT_PTX_LDU_G_v2i64_ELE : VLDU_G_ELE_V2<"b64", Int64Regs>;
-def INT_PTX_LDU_G_v2f64_ELE : VLDU_G_ELE_V2<"b64", Float64Regs>;
+def LDU_GLOBAL_v2i8 : VLDU_G_ELE_V2<"b8", Int16Regs>;
+def LDU_GLOBAL_v2i16 : VLDU_G_ELE_V2<"b16", Int16Regs>;
+def LDU_GLOBAL_v2i32 : VLDU_G_ELE_V2<"b32", Int32Regs>;
+def LDU_GLOBAL_v2i64 : VLDU_G_ELE_V2<"b64", Int64Regs>;
 
-def INT_PTX_LDU_G_v4i8_ELE : VLDU_G_ELE_V4<"b8", Int16Regs>;
-def INT_PTX_LDU_G_v4i16_ELE : VLDU_G_ELE_V4<"b16", Int16Regs>;
-def INT_PTX_LDU_G_v4i32_ELE : VLDU_G_ELE_V4<"b32", Int32Regs>;
-def INT_PTX_LDU_G_v4f16_ELE : VLDU_G_ELE_V4<"b16", Int16Regs>;
-def INT_PTX_LDU_G_v4f16x2_ELE : VLDU_G_ELE_V4<"b32", Int32Regs>;
-def INT_PTX_LDU_G_v4f32_ELE : VLDU_G_ELE_V4<"b32", Float32Regs>;
+def LDU_GLOBAL_v4i8 : VLDU_G_ELE_V4<"b8", Int16Regs>;
+def LDU_GLOBAL_v4i16 : VLDU_G_ELE_V4<"b16", Int16Regs>;
+def LDU_GLOBAL_v4i32 : VLDU_G_ELE_V4<"b32", Int32Regs>;
 
 //-----------------------------------
 
@@ -2191,55 +2183,47 @@ def INT_PTX_LDU_G_v4f32_ELE : VLDU_G_ELE_V4<"b32", Float32Regs>;
 // non-coherent texture cache, and therefore the values read must be read-only
 // during the lifetime of the kernel.
 
-class LDG_G<string TyStr, NVPTXRegClass regclass>
-  : NVPTXInst<(outs regclass:$result), (ins ADDR:$src),
-            "ld.global.nc." # TyStr # " \t$result, [$src];",
-            []>, Requires<[hasLDG]>;
+class LDG_G<NVPTXRegClass regclass>
+  : NVPTXInst<(outs regclass:$result), (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src),
+              "ld.global.nc.${Sign:sign}$fromWidth \t$result, [$src];", []>;
 
-def INT_PTX_LDG_GLOBAL_i8 : LDG_G<"b8", Int16Regs>;
-def INT_PTX_LDG_GLOBAL_i16 : LDG_G<"b16", Int16Regs>;
-def INT_PTX_LDG_GLOBAL_i32 : LDG_G<"b32", Int32Regs>;
-def INT_PTX_LDG_GLOBAL_i64 : LDG_G<"b64", Int64Regs>;
-def INT_PTX_LDG_GLOBAL_f32 : LDG_G<"b32", Float32Regs>;
-def INT_PTX_LDG_GLOBAL_f64 : LDG_G<"b64", Float64Regs>;
+def LD_GLOBAL_NC_i8 : LDG_G<Int16Regs>;
+def LD_GLOBAL_NC_i16 : LDG_G<Int16Regs>;
+def LD_GLOBAL_NC_i32 : LDG_G<Int32Regs>;
+def LD_GLOBAL_NC_i64 : LDG_G<Int64Regs>;
 
 // vector
 
 // Elementized vector ldg
-class VLDG_G_ELE_V2<string TyStr, NVPTXRegClass regclass> :
+class VLDG_G_ELE_V2<NVPTXRegClass regclass> :
   NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
-            (ins ADDR:$src),
-            "ld.global.nc.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src];", []>;
+            (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src),
+            "ld.global.nc.v2.${Sign:sign}$fromWidth \t{{$dst1, $dst2}}, [$src];", []>;
 
-class VLDG_G_ELE_V4<string TyStr, NVPTXRegClass regclass> :
+class VLDG_G_ELE_V4<NVPTXRegClass regclass> :
   NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
-            (ins ADDR:$src),
-            "ld.global.nc.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
+            (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src),
+            "ld.global.nc.v4.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
 
-class VLDG_G_ELE_V8<string TyStr, NVPTXRegClass regclass> :
+class VLDG_G_ELE_V8<NVPTXRegClass regclass> :
   NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4,
                   regclass:$dst5, regclass:$dst6, regclass:$dst7, regclass:$dst8),
-            (ins ADDR:$src),
-            "ld.global.nc.v8." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, [$src];", []>;
+            (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src),
+            "ld.global.nc.v8.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, [$src];", []>;
 
 // FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads.
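+// Note: the ${Sign:sign}$fromWidth operands let a single LD_GLOBAL_NC class
+// cover the b8-b64 widths in both signed and untyped forms.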
-def INT_PTX_LDG_G_v2i8_ELE : VLDG_G_ELE_V2<"b8", Int16Regs>;
-def INT_PTX_LDG_G_v2i16_ELE : VLDG_G_ELE_V2<"b16", Int16Regs>;
-def INT_PTX_LDG_G_v2i32_ELE : VLDG_G_ELE_V2<"b32", Int32Regs>;
-def INT_PTX_LDG_G_v2f32_ELE : VLDG_G_ELE_V2<"b32", Float32Regs>;
-def INT_PTX_LDG_G_v2i64_ELE : VLDG_G_ELE_V2<"b64", Int64Regs>;
-def INT_PTX_LDG_G_v2f64_ELE : VLDG_G_ELE_V2<"b64", Float64Regs>;
-
-def INT_PTX_LDG_G_v4i8_ELE : VLDG_G_ELE_V4<"b8", Int16Regs>;
-def INT_PTX_LDG_G_v4i16_ELE : VLDG_G_ELE_V4<"b16", Int16Regs>;
-def INT_PTX_LDG_G_v4i32_ELE : VLDG_G_ELE_V4<"b32", Int32Regs>;
-def INT_PTX_LDG_G_v4f32_ELE : VLDG_G_ELE_V4<"b32", Float32Regs>;
-
-def INT_PTX_LDG_G_v4i64_ELE : VLDG_G_ELE_V4<"b64", Int64Regs>;
-def INT_PTX_LDG_G_v4f64_ELE : VLDG_G_ELE_V4<"b64", Float64Regs>;
-def INT_PTX_LDG_G_v8i32_ELE : VLDG_G_ELE_V8<"b32", Int32Regs>;
-def INT_PTX_LDG_G_v8f32_ELE : VLDG_G_ELE_V8<"b32", Float32Regs>;
+def LD_GLOBAL_NC_v2i8 : VLDG_G_ELE_V2<Int16Regs>;
+def LD_GLOBAL_NC_v2i16 : VLDG_G_ELE_V2<Int16Regs>;
+def LD_GLOBAL_NC_v2i32 : VLDG_G_ELE_V2<Int32Regs>;
+def LD_GLOBAL_NC_v2i64 : VLDG_G_ELE_V2<Int64Regs>;
+
+def LD_GLOBAL_NC_v4i8 : VLDG_G_ELE_V4<Int16Regs>;
+def LD_GLOBAL_NC_v4i16 : VLDG_G_ELE_V4<Int16Regs>;
+def LD_GLOBAL_NC_v4i32 : VLDG_G_ELE_V4<Int32Regs>;
+
+def LD_GLOBAL_NC_v4i64 : VLDG_G_ELE_V4<Int64Regs>;
+def LD_GLOBAL_NC_v8i32 : VLDG_G_ELE_V8<Int32Regs>;
 
 multiclass NG_TO_G<string Str, bit Supports32 = true, list<Predicate> Preds = []> {
   if Supports32 then
diff --git a/llvm/test/CodeGen/NVPTX/bug26185-2.ll b/llvm/test/CodeGen/NVPTX/bug26185-2.ll
index 2778f59d50fd3..4e11f58f85ee0 100644
--- a/llvm/test/CodeGen/NVPTX/bug26185-2.ll
+++ b/llvm/test/CodeGen/NVPTX/bug26185-2.ll
@@ -15,7 +15,6 @@ define ptx_kernel void @spam(ptr addrspace(1) noalias nocapture readonly %arg, p
 ; CHECK-LABEL: spam(
 ; CHECK: .maxntid 1, 1, 1
 ; CHECK-NEXT: {
-; CHECK-NEXT: .reg .b16 %rs<2>;
 ; CHECK-NEXT: .reg .b32 %r<2>;
 ; CHECK-NEXT: .reg .b64 %rd<9>;
 ; CHECK-EMPTY:
@@ -25,8 +24,7 @@ define ptx_kernel void @spam(ptr addrspace(1) noalias nocapture readonly %arg, p
 ; CHECK-NEXT: shl.b64 %rd3, %rd2, 1;
 ; CHECK-NEXT: add.s64 %rd4, %rd1, %rd3;
 ; CHECK-NEXT: ld.param.b64 %rd5, [spam_param_1];
-; CHECK-NEXT: ld.global.nc.b16 %rs1, [%rd4+16];
-; CHECK-NEXT: cvt.s32.s16 %r1, %rs1;
+; CHECK-NEXT: ld.global.nc.s16 %r1, [%rd4+16];
 ; CHECK-NEXT: mul.wide.s32 %rd6, %r1, %r1;
 ; CHECK-NEXT: ld.global.b64 %rd7, [%rd5];
 ; CHECK-NEXT: add.s64 %rd8, %rd6, %rd7;
diff --git a/llvm/test/CodeGen/NVPTX/bug26185.ll b/llvm/test/CodeGen/NVPTX/bug26185.ll
index 1663d335c7724..6148c0756e393 100644
--- a/llvm/test/CodeGen/NVPTX/bug26185.ll
+++ b/llvm/test/CodeGen/NVPTX/bug26185.ll
@@ -11,7 +11,6 @@ target triple = "nvptx64-unknown-unknown"
 define ptx_kernel void @ex_zext(ptr noalias readonly %data, ptr %res) {
 ; CHECK-LABEL: ex_zext(
 ; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<2>;
 ; CHECK-NEXT: .reg .b32 %r<2>;
 ; CHECK-NEXT: .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
@@ -20,8 +19,7 @@ define ptx_kernel void @ex_zext(ptr noalias readonly %data, ptr %res) {
 ; CHECK-NEXT: cvta.to.global.u64 %rd2, %rd1;
 ; CHECK-NEXT: ld.param.b64 %rd3, [ex_zext_param_1];
 ; CHECK-NEXT: cvta.to.global.u64 %rd4, %rd3;
-; CHECK-NEXT: ld.global.nc.b8 %rs1, [%rd2];
-; CHECK-NEXT: cvt.u32.u8 %r1, %rs1;
+; CHECK-NEXT: ld.global.nc.b8 %r1, [%rd2];
 ; CHECK-NEXT: st.global.b32 [%rd4], %r1;
 ; CHECK-NEXT: ret;
 entry:
@@ -34,7 +32,6 @@ entry:
 define ptx_kernel void @ex_sext(ptr noalias readonly %data, ptr %res) {
 ; CHECK-LABEL: ex_sext(
 ; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<2>;
 ; CHECK-NEXT: .reg .b32 %r<2>;
 ; CHECK-NEXT: .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
@@ -43,8 +40,7 @@ define ptx_kernel void @ex_sext(ptr noalias readonly %data, ptr %res) {
 ; CHECK-NEXT: cvta.to.global.u64 %rd2, %rd1;
 ; CHECK-NEXT: ld.param.b64 %rd3, [ex_sext_param_1];
 ; CHECK-NEXT: cvta.to.global.u64 %rd4, %rd3;
-; CHECK-NEXT: ld.global.nc.b8 %rs1, [%rd2];
-; CHECK-NEXT: cvt.s32.s8 %r1, %rs1;
+; CHECK-NEXT: ld.global.nc.s8 %r1, [%rd2];
 ; CHECK-NEXT: st.global.b32 [%rd4], %r1;
 ; CHECK-NEXT: ret;
 entry:
diff --git a/llvm/test/CodeGen/NVPTX/i1-ext-load.ll b/llvm/test/CodeGen/NVPTX/i1-ext-load.ll
index bb88d1f2755ca..3dceefb93a47d 100644
--- a/llvm/test/CodeGen/NVPTX/i1-ext-load.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-ext-load.ll
@@ -7,7 +7,6 @@ target triple = "nvptx-nvidia-cuda"
 
 define ptx_kernel void @foo(ptr noalias readonly %ptr, ptr noalias %retval) {
 ; CHECK-LABEL: foo(
-; CHECK: .reg .b16 %rs<2>;
 ; CHECK: .reg .b32 %r<4>;
 ; CHECK: .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
@@ -15,8 +14,7 @@ define ptx_kernel void @foo(ptr noalias readonly %ptr, ptr noalias %retval) {
 ; CHECK: cvta.to.global.u64 %rd2, %rd1;
 ; CHECK: ld.param.b64 %rd3, [foo_param_1];
 ; CHECK: cvta.to.global.u64 %rd4, %rd3;
-; CHECK: ld.global.nc.b8 %rs1, [%rd2];
-; CHECK: cvt.u32.u8 %r1, %rs1;
+; CHECK: ld.global.nc.b8 %r1, [%rd2];
 ; CHECK: add.s32 %r2, %r1, 1;
 ; CHECK: and.b32 %r3, %r2, 1;
 ; CHECK: st.global.b32 [%rd4], %r3;
diff --git a/llvm/test/CodeGen/NVPTX/ldu-ldg.ll b/llvm/test/CodeGen/NVPTX/ldu-ldg.ll
index 7ac697c4ce203..7f4b049af84fb 100644
--- a/llvm/test/CodeGen/NVPTX/ldu-ldg.ll
+++ b/llvm/test/CodeGen/NVPTX/ldu-ldg.ll
@@ -163,14 +163,12 @@ define <2 x half> @test_ldu_v2f16(ptr addrspace(1) %ptr) {
 define i8 @test_ldg_i8(ptr addrspace(1) %ptr) {
 ; CHECK-LABEL: test_ldg_i8(
 ; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<2>;
 ; CHECK-NEXT: .reg .b32 %r<2>;
 ; CHECK-NEXT: .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.b64 %rd1, [test_ldg_i8_param_0];
-; CHECK-NEXT: ld.global.nc.b8 %rs1, [%rd1];
-; CHECK-NEXT: cvt.u32.u8 %r1, %rs1;
+; CHECK-NEXT: ld.global.nc.b8 %r1, [%rd1];
 ; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT: ret;
 %val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1(ptr addrspace(1) %ptr, i32 4)
@@ -180,14 +178,12 @@ define i8 @test_ldg_i8(ptr addrspace(1) %ptr) {
 
 define i16 @test_ldg_i16(ptr addrspace(1) %ptr) {
 ; CHECK-LABEL: test_ldg_i16(
 ; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<2>;
 ; CHECK-NEXT: .reg .b32 %r<2>;
 ; CHECK-NEXT: .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.b64 %rd1, [test_ldg_i16_param_0];
-; CHECK-NEXT: ld.global.nc.b16 %rs1, [%rd1];
-; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
+; CHECK-NEXT: ld.global.nc.b16 %r1, [%rd1];
 ; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT: ret;
 %val = tail call i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 2)
diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
index 3bbdf641ade26..ddaa9fd831af7 100644
--- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll
+++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
@@ -211,7 +211,7 @@ define dso_local i32 @bar() {
 ; CHECK-PTX-NEXT: .local .align 8 .b8 __local_depot3[24];
 ; CHECK-PTX-NEXT: .reg .b64 %SP;
 ; CHECK-PTX-NEXT: .reg .b64 %SPL;
-; CHECK-PTX-NEXT: .reg .b16 %rs<8>;
+; CHECK-PTX-NEXT: .reg .b16 %rs<5>;
 ; CHECK-PTX-NEXT: .reg .b32 %r<4>;
 ; CHECK-PTX-NEXT: .reg .b64 %rd<5>;
 ; CHECK-PTX-EMPTY:
@@ -220,18 +220,15 @@ define dso_local i32 @bar() {
 ; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL;
 ; CHECK-PTX-NEXT: add.u64 %rd2, %SPL, 0;
 ; CHECK-PTX-NEXT: ld.global.nc.b8 %rs1, [__const_$_bar_$_s1+7];
-; CHECK-PTX-NEXT: cvt.u16.u8 %rs2, %rs1;
-; CHECK-PTX-NEXT: st.local.b8 [%rd2+2], %rs2;
-; CHECK-PTX-NEXT: ld.global.nc.b8 %rs3, [__const_$_bar_$_s1+6];
-; CHECK-PTX-NEXT: cvt.u16.u8 %rs4, %rs3;
-; CHECK-PTX-NEXT: st.local.b8 [%rd2+1], %rs4;
-; CHECK-PTX-NEXT: ld.global.nc.b8 %rs5, [__const_$_bar_$_s1+5];
-; CHECK-PTX-NEXT: cvt.u16.u8 %rs6, %rs5;
-; CHECK-PTX-NEXT: st.local.b8 [%rd2], %rs6;
+; CHECK-PTX-NEXT: st.local.b8 [%rd2+2], %rs1;
+; CHECK-PTX-NEXT: ld.global.nc.b8 %rs2, [__const_$_bar_$_s1+6];
+; CHECK-PTX-NEXT: st.local.b8 [%rd2+1], %rs2;
+; CHECK-PTX-NEXT: ld.global.nc.b8 %rs3, [__const_$_bar_$_s1+5];
+; CHECK-PTX-NEXT: st.local.b8 [%rd2], %rs3;
 ; CHECK-PTX-NEXT: mov.b32 %r1, 1;
 ; CHECK-PTX-NEXT: st.b32 [%SP+8], %r1;
-; CHECK-PTX-NEXT: mov.b16 %rs7, 1;
-; CHECK-PTX-NEXT: st.b8 [%SP+12], %rs7;
+; CHECK-PTX-NEXT: mov.b16 %rs4, 1;
+; CHECK-PTX-NEXT: st.b8 [%SP+12], %rs4;
 ; CHECK-PTX-NEXT: mov.b64 %rd3, 1;
 ; CHECK-PTX-NEXT: st.b64 [%SP+16], %rd3;
 ; CHECK-PTX-NEXT: add.u64 %rd4, %SP, 8;