From 0c2df80ac8d1dff137d7c1918c425ac2771b3554 Mon Sep 17 00:00:00 2001
From: Artem Belevich
Date: Fri, 23 Aug 2024 17:36:37 -0700
Subject: [PATCH 1/6] [NVPTX] Improve copy avoidance during lowering.

On newer GPUs, where the `cvta.param` instruction is available, we can avoid
making local copies of byval arguments when their pointers are used in a few
more cases, even when `__grid_constant__` is not specified:

- phi
- select
- memcpy from the parameter.
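For illustration, a kernel along these lines (a hypothetical example, not
taken from the tests below; the names are made up) only reads its byval
arguments through a select, and no longer needs local copies on sm_70+:

```cuda
struct S { int x; int y; };

__global__ void choose(S a, S b, bool p, int *out) {
  // &a and &b point into the kernel param space; selecting between them
  // used to force local copies even though the structs are only read.
  const S *s = p ? &a : &b;
  *out = s->x;
}
```

The same applies when the parameter pointer feeds a phi across basic blocks
or a memcpy whose source is the parameter.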
---
 llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp      | 267 ++++++---
 llvm/lib/Target/NVPTX/NVPTXSubtarget.h        |   3 +
 .../CodeGen/NVPTX/lower-args-gridconstant.ll  | 556 ++++++++++++------
 llvm/test/CodeGen/NVPTX/lower-byval-args.ll   | 402 ++++++++----
 4 files changed, 839 insertions(+), 389 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index 1205ad4c6b008..243f39d8a1671 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -79,15 +79,17 @@
 //
 // define void @foo({i32*, i32*}* byval %input) {
 // %b_param = addrspacecast ptr %input to ptr addrspace(101)
-// %b_ptr = getelementptr {ptr, ptr}, ptr addrspace(101) %b_param, i64 0, i32 1
-// %b = load ptr, ptr addrspace(101) %b_ptr
-// %b_global = addrspacecast ptr %b to ptr addrspace(1)
-// ; use %b_generic
+// %b_ptr = getelementptr {ptr, ptr}, ptr addrspace(101) %b_param,
+//                        i64 0, i32 1
+// %b = load ptr, ptr addrspace(101) %b_ptr
+// %b_global = addrspacecast ptr %b to ptr addrspace(1)
+// ; use %b_generic
 // }
 //
-// Create a local copy of kernel byval parameters used in a way that *might* mutate
-// the parameter, by storing it in an alloca. Mutations to "grid_constant" parameters
-// are undefined behaviour, and don't require local copies.
+// Create a local copy of kernel byval parameters used in a way that *might*
+// mutate the parameter, by storing it in an alloca. Mutations to
+// "grid_constant" parameters are undefined behaviour, and don't require
+// local copies.
 //
 // define void @foo(ptr byval(%struct.s) align 4 %input) {
 // store i32 42, ptr %input
@@ -124,11 +124,13 @@
 //
 // define void @foo(ptr byval(%struct.s) %input) {
 // %input1 = addrspacecast ptr %input to ptr addrspace(101)
-// ; the following intrinsic converts pointer to generic. We don't use an addrspacecast
-// ; to prevent generic -> param -> generic from getting cancelled out
-// %input1.gen = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) %input1)
-// %call = call i32 @escape(ptr %input1.gen)
-// ret void
+// ; the following intrinsic converts pointer to generic. We don't use an
+// ; addrspacecast to prevent generic -> param -> generic from getting
+// ; cancelled out
+// %input1.gen = call ptr
+//    @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) %input1)
+// %call = call i32 @escape(ptr %input1.gen)
+// ret void
 // }
 //
 // TODO: merge this pass with NVPTXInferAddressSpaces so that other passes don't
@@ -139,16 +139,21 @@
 #include "NVPTX.h"
 #include "NVPTXTargetMachine.h"
 #include "NVPTXUtilities.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/PtrUseVisitor.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsNVPTX.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
 #include <numeric>
 #include <queue>
@@ -217,7 +222,8 @@ INITIALIZE_PASS_END(NVPTXLowerArgs, "nvptx-lower-args",
 // pointer in parameter AS.
 // For "escapes" (to memory, a function call, or a ptrtoint), cast the OldUse to
 // generic using cvta.param.
-static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
+static void convertToParamAS(Use *OldUse, Value *Param, bool HasCvtaParam,
+                             bool IsGridConstant) {
   Instruction *I = dyn_cast<Instruction>(OldUse->getUser());
   assert(I && "OldUse must be in an instruction");
   struct IP {
@@ -228,7 +234,8 @@ static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
   SmallVector<IP> ItemsToConvert = {{OldUse, I, Param}};
   SmallVector<Instruction *> InstructionsToDelete;
 
-  auto CloneInstInParamAS = [GridConstant](const IP &I) -> Value * {
+  auto CloneInstInParamAS = [HasCvtaParam,
+                             IsGridConstant](const IP &I) -> Value * {
     if (auto *LI = dyn_cast<LoadInst>(I.OldInstruction)) {
       LI->setOperand(0, I.NewParam);
       return LI;
@@ -252,8 +259,25 @@ static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
       // Just pass through the argument, the old ASC is no longer needed.
       return I.NewParam;
     }
+    if (auto *MI = dyn_cast<MemTransferInst>(I.OldInstruction)) {
+      if (MI->getRawSource() == I.OldUse->get()) {
+        // Convert to memcpy/memmove from param space.
+        IRBuilder<> Builder(I.OldInstruction);
+        Intrinsic::ID ID = MI->getIntrinsicID();
+
+        CallInst *B = Builder.CreateMemTransferInst(
+            ID, MI->getRawDest(), MI->getDestAlign(), I.NewParam,
+            MI->getSourceAlign(), MI->getLength(), MI->isVolatile());
+        for (unsigned I : {0, 1})
+          if (uint64_t Bytes = MI->getParamDereferenceableBytes(I))
+            B->addDereferenceableParamAttr(I, Bytes);
+        return B;
+      }
+      // We may be able to handle other cases if the argument is
+      // __grid_constant__
+    }
 
-    if (GridConstant) {
+    if (HasCvtaParam) {
       auto GetParamAddrCastToGeneric =
           [](Value *Addr, Instruction *OriginalUser) -> Value * {
         PointerType *ReturnTy =
@@ -269,24 +293,44 @@ static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
           OriginalUser->getIterator());
         return CvtToGenCall;
       };
-
-      if (auto *CI = dyn_cast<CallInst>(I.OldInstruction)) {
-        I.OldUse->set(GetParamAddrCastToGeneric(I.NewParam, CI));
-        return CI;
+      auto *ParamInGenericAS =
+          GetParamAddrCastToGeneric(I.NewParam, I.OldInstruction);
+
+      // phi/select could use generic arg pointers w/o __grid_constant__
+      if (auto *PHI = dyn_cast<PHINode>(I.OldInstruction)) {
+        for (auto [Idx, V] : enumerate(PHI->incoming_values())) {
+          if (V.get() == I.OldUse->get())
+            PHI->setIncomingValue(Idx, ParamInGenericAS);
+        }
       }
-      if (auto *SI = dyn_cast<StoreInst>(I.OldInstruction)) {
-        // byval address is being stored, cast it to generic
-        if (SI->getValueOperand() == I.OldUse->get())
-          SI->setOperand(0, GetParamAddrCastToGeneric(I.NewParam, SI));
-        return SI;
+      if (auto *SI = dyn_cast<SelectInst>(I.OldInstruction)) {
+        if (SI->getTrueValue() == I.OldUse->get())
+          SI->setTrueValue(ParamInGenericAS);
+        if (SI->getFalseValue() == I.OldUse->get())
+          SI->setFalseValue(ParamInGenericAS);
       }
-      if (auto *PI = dyn_cast<PtrToIntInst>(I.OldInstruction)) {
-        if (PI->getPointerOperand() == I.OldUse->get())
-          PI->setOperand(0, GetParamAddrCastToGeneric(I.NewParam, PI));
-        return PI;
+
+      // Escapes or writes can only use generic param pointers if
+      // __grid_constant__ is in effect.
+      if (IsGridConstant) {
+        if (auto *CI = dyn_cast<CallInst>(I.OldInstruction)) {
+          I.OldUse->set(ParamInGenericAS);
+          return CI;
+        }
+        if (auto *SI = dyn_cast<StoreInst>(I.OldInstruction)) {
+          // byval address is being stored, cast it to generic
+          if (SI->getValueOperand() == I.OldUse->get())
+            SI->setOperand(0, ParamInGenericAS);
+          return SI;
+        }
+        if (auto *PI = dyn_cast<PtrToIntInst>(I.OldInstruction)) {
+          if (PI->getPointerOperand() == I.OldUse->get())
+            PI->setOperand(0, ParamInGenericAS);
+          return PI;
+        }
+        // TODO: If we allow stores, we should allow memcpy/memset to
+        // parameter, too.
       }
-      llvm_unreachable(
-          "Instruction unsupported even for grid_constant argument");
     }
 
     llvm_unreachable("Unsupported instruction");
@@ -409,49 +453,121 @@ static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS,
   }
 }
 
+namespace {
+struct ArgUseChecker : PtrUseVisitor<ArgUseChecker> {
+  using Base = PtrUseVisitor<ArgUseChecker>;
+
+  bool IsGridConstant;
+  SmallPtrSet<Value *, 16> AllArgUsers;
+  // Set of phi/select instructions using the Arg
+  SmallPtrSet<Instruction *, 4> Conditionals;
+
+  ArgUseChecker(const DataLayout &DL, bool IsGridConstant)
+      : PtrUseVisitor(DL), IsGridConstant(IsGridConstant) {}
+
+  PtrInfo visitArgPtr(Argument &A) {
+    assert(A.getType()->isPointerTy());
+    IntegerType *IntIdxTy = cast<IntegerType>(DL.getIndexType(A.getType()));
+    IsOffsetKnown = false;
+    Offset = APInt(IntIdxTy->getBitWidth(), 0);
+    PI.reset();
+    AllArgUsers.clear();
+    Conditionals.clear();
+
+    LLVM_DEBUG(dbgs() << "Checking Argument " << A << "\n");
+    // Enqueue the uses of this pointer.
+    enqueueUsers(A);
+    AllArgUsers.insert(&A);
+
+    // Visit all the uses off the worklist until it is empty.
+    // Note that unlike PtrUseVisitor, we intentionally do not track offsets.
+    // We're only interested in how we use the pointer.
+    while (!(Worklist.empty() || PI.isAborted())) {
+      UseToVisit ToVisit = Worklist.pop_back_val();
+      U = ToVisit.UseAndIsOffsetKnown.getPointer();
+      Instruction *I = cast<Instruction>(U->getUser());
+      AllArgUsers.insert(I);
+      if (isa<PHINode>(I) || isa<SelectInst>(I))
+        Conditionals.insert(I);
+      LLVM_DEBUG(dbgs() << "Processing " << *I << "\n");
+      Base::visit(I);
+    }
+    if (PI.isEscaped())
+      LLVM_DEBUG(dbgs() << "Argument pointer escaped: " << *PI.getEscapingInst()
+                        << "\n");
+    else if (PI.isAborted())
+      LLVM_DEBUG(dbgs() << "Pointer use needs a copy: " << *PI.getAbortingInst()
+                        << "\n");
+    LLVM_DEBUG(dbgs() << "Traversed " << AllArgUsers.size() << " users with "
+                      << Conditionals.size() << " conditionals\n");
+    return PI;
+  }
+
+  void visitStoreInst(StoreInst &SI) {
+    // Storing the pointer escapes it.
+    if (U->get() == SI.getValueOperand())
+      return PI.setEscapedAndAborted(&SI);
+    // Writes to the pointer are UB w/ __grid_constant__, but do not force a
+    // copy.
+    if (!IsGridConstant)
+      return PI.setAborted(&SI);
+  }
+
+  void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
+    // ASCs to param space are no-ops and do not need a copy
+    if (ASC.getDestAddressSpace() != ADDRESS_SPACE_PARAM)
+      return PI.setEscapedAndAborted(&ASC);
+    Base::visitAddrSpaceCastInst(ASC);
+  }
+
+  void visitPtrToIntInst(PtrToIntInst &I) {
+    if (IsGridConstant)
+      return;
+    Base::visitPtrToIntInst(I);
+  }
+  void visitPHINodeOrSelectInst(Instruction &I) {
+    assert(isa<PHINode>(I) || isa<SelectInst>(I));
+  }
+  // PHI and select just pass through the pointers.
+  void visitPHINode(PHINode &PN) { enqueueUsers(PN); }
+  void visitSelectInst(SelectInst &SI) { enqueueUsers(SI); }
+
+  void visitMemTransferInst(MemTransferInst &II) {
+    if (*U == II.getRawDest() && !IsGridConstant)
+      PI.setAborted(&II);
+
+    // TODO: memcpy from arg is OK as it can get unrolled into ld.param.
+    // However, memcpys are currently expected to be unrolled before we
+    // get here, so we never see them in practice, and we do not currently
+    // handle them when we convert IR to access param space directly. So,
+    // we'll mark it as an escape for now. It would still force a copy on
+    // pre-sm_70 GPUs where we can't take address of a parameter w/o a copy.
+    //
+    // PI.setEscaped(&II);
+  }
+
+  void visitMemSetInst(MemSetInst &II) {
+    if (*U == II.getRawDest() && !IsGridConstant)
+      PI.setAborted(&II);
+  }
+  // Debug-only helper.
+  auto &getVisitedUses() { return VisitedUses; }
+};
+} // namespace
 void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
                                       Argument *Arg) {
-  bool IsGridConstant = isParamGridConstant(*Arg);
   Function *Func = Arg->getParent();
+  bool HasCvtaParam = TM.getSubtargetImpl(*Func)->hasCvtaParam();
+  bool IsGridConstant = HasCvtaParam && isParamGridConstant(*Arg);
+  const DataLayout &DL = Func->getDataLayout();
   BasicBlock::iterator FirstInst = Func->getEntryBlock().begin();
   Type *StructType = Arg->getParamByValType();
   assert(StructType && "Missing byval type");
 
-  auto AreSupportedUsers = [&](Value *Start) {
-    SmallVector<Value *, 16> ValuesToCheck = {Start};
-    auto IsSupportedUse = [IsGridConstant](Value *V) -> bool {
-      if (isa<GetElementPtrInst>(V) || isa<BitCastInst>(V) || isa<LoadInst>(V))
-        return true;
-      // ASC to param space are OK, too -- we'll just strip them.
-      if (auto *ASC = dyn_cast<AddrSpaceCastInst>(V)) {
-        if (ASC->getDestAddressSpace() == ADDRESS_SPACE_PARAM)
-          return true;
-      }
-      // Simple calls and stores are supported for grid_constants
-      // writes to these pointers are undefined behaviour
-      if (IsGridConstant &&
-          (isa<CallInst>(V) || isa<StoreInst>(V) || isa<PtrToIntInst>(V)))
-        return true;
-      return false;
-    };
-
-    while (!ValuesToCheck.empty()) {
-      Value *V = ValuesToCheck.pop_back_val();
-      if (!IsSupportedUse(V)) {
-        LLVM_DEBUG(dbgs() << "Need a "
-                          << (isParamGridConstant(*Arg) ? "cast " : "copy ")
-                          << "of " << *Arg << " because of " << *V << "\n");
-        (void)Arg;
-        return false;
-      }
-      if (!isa<LoadInst>(V) && !isa<CallInst>(V) && !isa<StoreInst>(V) &&
-          !isa<PtrToIntInst>(V))
-        llvm::append_range(ValuesToCheck, V->users());
-    }
-    return true;
-  };
-
-  if (llvm::all_of(Arg->users(), AreSupportedUsers)) {
+  ArgUseChecker AUC(DL, IsGridConstant);
+  ArgUseChecker::PtrInfo PI = AUC.visitArgPtr(*Arg);
+  // Easy case, accessing parameter directly is fine.
+  if (!(PI.isEscaped() || PI.isAborted()) && AUC.Conditionals.empty()) {
     // Convert all loads and intermediate operations to use parameter AS and
     // skip creation of a local copy of the argument.
     SmallVector<Use *, 16> UsesToUpdate;
@@ -462,7 +578,7 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
         Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(),
         FirstInst);
     for (Use *U : UsesToUpdate)
-      convertToParamAS(U, ArgInParamAS, IsGridConstant);
+      convertToParamAS(U, ArgInParamAS, HasCvtaParam, IsGridConstant);
     LLVM_DEBUG(dbgs() << "No need to copy or cast " << *Arg << "\n");
 
     const auto *TLI =
@@ -473,13 +589,17 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
     return;
   }
 
-  const DataLayout &DL = Func->getDataLayout();
+  // We can't access byval arg directly and need a pointer. On sm_70+ we have
+  // the ability to take a pointer to the argument without making a local
+  // copy. However, we're still not allowed to write to it. If the user
+  // specified `__grid_constant__` for the argument, we'll consider the
+  // escaped pointer read-only.
   unsigned AS = DL.getAllocaAddrSpace();
-  if (isParamGridConstant(*Arg)) {
-    // Writes to a grid constant are undefined behaviour. We do not need a
-    // temporary copy. When a pointer might have escaped, conservatively replace
-    // all of its uses (which might include a device function call) with a cast
-    // to the generic address space.
+  if (HasCvtaParam && (!(PI.isEscaped() || PI.isAborted()) || IsGridConstant)) {
+    LLVM_DEBUG(dbgs() << "Using non-copy pointer to " << *Arg << "\n");
+    // Replace all argument pointer uses (which might include a device function
+    // call) with a cast to the generic address space using the cvta.param
+    // instruction, which avoids a local copy.
     IRBuilder<> IRB(&Func->getEntryBlock().front());
 
     // Cast argument to param address space
@@ -500,6 +620,7 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
     // Do not replace Arg in the cast to param space
     CastToParam->setOperand(0, Arg);
   } else {
+    LLVM_DEBUG(dbgs() << "Creating a local copy of " << *Arg << "\n");
    // Otherwise we have to create a temporary copy.
AllocaInst *AllocA = new AllocaInst(StructType, AS, Arg->getName(), FirstInst); diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 0591782e8148b..526032781d857 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -94,6 +94,9 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { bool hasDotInstructions() const { return SmVersion >= 61 && PTXVersion >= 50; } + bool hasCvtaParam() const { + return SmVersion >= 70 && PTXVersion >= 77; + } unsigned int getFullSmVersion() const { return FullSmVersion; } unsigned int getSmVersion() const { return getFullSmVersion() / 10; } // GPUs with "a" suffix have include architecture-accelerated features that diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll index f6db9c429dba5..176dfee11cfb0 100644 --- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll +++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll @@ -1,18 +1,30 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes OPT -; RUN: llc < %s -mcpu=sm_70 --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes PTX +; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda -mcpu=sm_70 -mattr=+ptx77 | FileCheck %s --check-prefixes OPT +; RUN: llc < %s --mtriple nvptx64-nvidia-cuda -mcpu=sm_70 -mattr=+ptx77 | FileCheck %s --check-prefixes PTX define void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %out, i32 %n) { ; PTX-LABEL: grid_const_int( -; PTX-NOT: ld.u32 -; PTX: ld.param.{{.*}} [[R2:%.*]], [grid_const_int_param_0]; -; +; PTX: { +; PTX-NEXT: .reg .b32 %r<4>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [grid_const_int_param_2]; +; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; +; PTX-NEXT: ld.param.u32 %r1, [grid_const_int_param_1]; +; PTX-NEXT: ld.param.u32 %r2, [grid_const_int_param_0]; +; PTX-NEXT: add.s32 %r3, %r2, %r1; +; PTX-NEXT: st.global.u32 [%rd2], %r3; +; PTX-NEXT: ret; ; OPT-LABEL: define void @grid_const_int( -; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) { -; OPT-NOT: alloca -; OPT: [[INPUT11:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) -; OPT: [[TMP:%.*]] = load i32, ptr addrspace(101) [[INPUT11]], align 4 -; +; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; OPT-NEXT: [[OUT2:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; OPT-NEXT: [[OUT3:%.*]] = addrspacecast ptr addrspace(1) [[OUT2]] to ptr +; OPT-NEXT: [[INPUT11:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; OPT-NEXT: [[TMP:%.*]] = load i32, ptr addrspace(101) [[INPUT11]], align 4 +; OPT-NEXT: [[ADD:%.*]] = add i32 [[TMP]], [[INPUT2]] +; OPT-NEXT: store i32 [[ADD]], ptr [[OUT3]], align 4 +; OPT-NEXT: ret void %tmp = load i32, ptr %input1, align 4 %add = add i32 %tmp, %input2 store i32 %add, ptr %out @@ -24,19 +36,29 @@ define void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %ou define void @grid_const_struct(ptr byval(%struct.s) align 4 %input, ptr %out){ ; PTX-LABEL: grid_const_struct( ; PTX: { -; PTX-NOT: ld.u32 -; PTX: ld.param.{{.*}} [[R1:%.*]], [grid_const_struct_param_0]; -; PTX: ld.param.{{.*}} [[R2:%.*]], [grid_const_struct_param_0+4]; -; +; 
PTX-NEXT: .reg .b32 %r<4>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [grid_const_struct_param_1]; +; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; +; PTX-NEXT: ld.param.u32 %r1, [grid_const_struct_param_0]; +; PTX-NEXT: ld.param.u32 %r2, [grid_const_struct_param_0+4]; +; PTX-NEXT: add.s32 %r3, %r1, %r2; +; PTX-NEXT: st.global.u32 [%rd2], %r3; +; PTX-NEXT: ret; ; OPT-LABEL: define void @grid_const_struct( -; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[OUT:%.*]]) { -; OPT-NOT: alloca -; OPT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) -; OPT: [[GEP13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 0 -; OPT: [[GEP22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 1 -; OPT: [[TMP1:%.*]] = load i32, ptr addrspace(101) [[GEP13]], align 4 -; OPT: [[TMP2:%.*]] = load i32, ptr addrspace(101) [[GEP22]], align 4 -; +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[OUT4:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; OPT-NEXT: [[OUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUT4]] to ptr +; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) +; OPT-NEXT: [[GEP13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 0 +; OPT-NEXT: [[GEP22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 1 +; OPT-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(101) [[GEP13]], align 4 +; OPT-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(101) [[GEP22]], align 4 +; OPT-NEXT: [[ADD:%.*]] = add i32 [[TMP1]], [[TMP2]] +; OPT-NEXT: store i32 [[ADD]], ptr [[OUT5]], align 4 +; OPT-NEXT: ret void %gep1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0 %gep2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1 %int1 = load i32, ptr %gep1 @@ -49,41 +71,85 @@ define void @grid_const_struct(ptr byval(%struct.s) align 4 %input, ptr %out){ define void @grid_const_escape(ptr byval(%struct.s) align 4 %input) { ; PTX-LABEL: grid_const_escape( ; PTX: { -; PTX-NOT: .local -; PTX: cvta.param.{{.*}} +; PTX-NEXT: .reg .b32 %r<3>; +; PTX-NEXT: .reg .b64 %rd<4>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: mov.b64 %rd1, grid_const_escape_param_0; +; PTX-NEXT: mov.u64 %rd2, %rd1; +; PTX-NEXT: cvta.param.u64 %rd3, %rd2; +; PTX-NEXT: { // callseq 0, 0 +; PTX-NEXT: .param .b64 param0; +; PTX-NEXT: st.param.b64 [param0+0], %rd3; +; PTX-NEXT: .param .b32 retval0; +; PTX-NEXT: call.uni (retval0), +; PTX-NEXT: escape, +; PTX-NEXT: ( +; PTX-NEXT: param0 +; PTX-NEXT: ); +; PTX-NEXT: ld.param.b32 %r1, [retval0+0]; +; PTX-NEXT: } // callseq 0 +; PTX-NEXT: ret; ; OPT-LABEL: define void @grid_const_escape( -; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]]) { -; OPT-NOT: alloca [[STRUCT_S]] -; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) -; OPT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]]) -; OPT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT_PARAM_GEN]]) -; +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]]) +; OPT-NEXT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT_PARAM_GEN]]) 
+; OPT-NEXT: ret void %call = call i32 @escape(ptr %input) ret void } define void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32 %a, ptr byval(i32) align 4 %b) { ; PTX-LABEL: multiple_grid_const_escape( -; PTX: mov.{{.*}} [[RD1:%.*]], multiple_grid_const_escape_param_0; -; PTX: mov.{{.*}} [[RD2:%.*]], multiple_grid_const_escape_param_2; -; PTX: mov.{{.*}} [[RD3:%.*]], [[RD2]]; -; PTX: mov.{{.*}} [[RD4:%.*]], [[RD1]]; -; PTX: cvta.param.{{.*}} [[RD5:%.*]], [[RD4]]; -; PTX: cvta.param.{{.*}} [[RD6:%.*]], [[RD3]]; -; PTX: { -; PTX: st.param.b64 [param0+0], [[RD5]]; -; PTX: st.param.b64 [param2+0], [[RD6]]; -; +; PTX: { +; PTX-NEXT: .local .align 4 .b8 __local_depot3[4]; +; PTX-NEXT: .reg .b64 %SP; +; PTX-NEXT: .reg .b64 %SPL; +; PTX-NEXT: .reg .b32 %r<4>; +; PTX-NEXT: .reg .b64 %rd<9>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: mov.u64 %SPL, __local_depot3; +; PTX-NEXT: cvta.local.u64 %SP, %SPL; +; PTX-NEXT: mov.b64 %rd1, multiple_grid_const_escape_param_0; +; PTX-NEXT: mov.b64 %rd2, multiple_grid_const_escape_param_2; +; PTX-NEXT: mov.u64 %rd3, %rd2; +; PTX-NEXT: ld.param.u32 %r1, [multiple_grid_const_escape_param_1]; +; PTX-NEXT: cvta.param.u64 %rd4, %rd3; +; PTX-NEXT: mov.u64 %rd5, %rd1; +; PTX-NEXT: cvta.param.u64 %rd6, %rd5; +; PTX-NEXT: add.u64 %rd7, %SP, 0; +; PTX-NEXT: add.u64 %rd8, %SPL, 0; +; PTX-NEXT: st.local.u32 [%rd8], %r1; +; PTX-NEXT: { // callseq 1, 0 +; PTX-NEXT: .param .b64 param0; +; PTX-NEXT: st.param.b64 [param0+0], %rd6; +; PTX-NEXT: .param .b64 param1; +; PTX-NEXT: st.param.b64 [param1+0], %rd7; +; PTX-NEXT: .param .b64 param2; +; PTX-NEXT: st.param.b64 [param2+0], %rd4; +; PTX-NEXT: .param .b32 retval0; +; PTX-NEXT: call.uni (retval0), +; PTX-NEXT: escape3, +; PTX-NEXT: ( +; PTX-NEXT: param0, +; PTX-NEXT: param1, +; PTX-NEXT: param2 +; PTX-NEXT: ); +; PTX-NEXT: ld.param.b32 %r2, [retval0+0]; +; PTX-NEXT: } // callseq 1 +; PTX-NEXT: ret; ; OPT-LABEL: define void @multiple_grid_const_escape( -; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 [[B:%.*]]) { -; OPT: [[B_PARAM:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101) -; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) -; OPT-NOT: alloca %struct.s -; OPT: [[A_ADDR:%.*]] = alloca i32, align 4 -; OPT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]]) -; OPT: [[B_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[B_PARAM]]) +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 [[B:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[B_PARAM:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101) +; OPT-NEXT: [[B_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[B_PARAM]]) +; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]]) +; OPT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +; OPT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 ; OPT-NEXT: [[CALL:%.*]] = call i32 @escape3(ptr [[INPUT_PARAM_GEN]], ptr [[A_ADDR]], ptr [[B_PARAM_GEN]]) -; +; OPT-NEXT: ret void %a.addr = alloca i32, align 4 store i32 %a, ptr %a.addr, align 4 %call = call i32 @escape3(ptr %input, ptr %a.addr, ptr %b) @@ -92,40 +158,58 @@ define void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32 define void 
@grid_const_memory_escape(ptr byval(%struct.s) align 4 %input, ptr %addr) { ; PTX-LABEL: grid_const_memory_escape( -; PTX-NOT: .local -; PTX: mov.b64 [[RD1:%.*]], grid_const_memory_escape_param_0; -; PTX: cvta.param.u64 [[RD3:%.*]], [[RD2:%.*]]; -; PTX: st.global.u64 [[[RD4:%.*]]], [[RD3]]; -; +; PTX: { +; PTX-NEXT: .reg .b64 %rd<6>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: mov.b64 %rd1, grid_const_memory_escape_param_0; +; PTX-NEXT: ld.param.u64 %rd2, [grid_const_memory_escape_param_1]; +; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2; +; PTX-NEXT: mov.u64 %rd4, %rd1; +; PTX-NEXT: cvta.param.u64 %rd5, %rd4; +; PTX-NEXT: st.global.u64 [%rd3], %rd5; +; PTX-NEXT: ret; ; OPT-LABEL: define void @grid_const_memory_escape( -; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[ADDR:%.*]]) { -; OPT-NOT: alloca [[STRUCT_S]] -; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) -; OPT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]]) -; OPT: store ptr [[INPUT_PARAM_GEN]], ptr {{.*}}, align 8 -; +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[ADDR4:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(1) +; OPT-NEXT: [[ADDR5:%.*]] = addrspacecast ptr addrspace(1) [[ADDR4]] to ptr +; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]]) +; OPT-NEXT: store ptr [[INPUT1]], ptr [[ADDR5]], align 8 +; OPT-NEXT: ret void store ptr %input, ptr %addr, align 8 ret void } define void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, ptr %result) { ; PTX-LABEL: grid_const_inlineasm_escape( -; PTX-NOT .local -; PTX: add.{{.*}} [[RD2:%.*]], [[RD1:%.*]], 4; -; PTX: cvta.param.u64 [[RD4:%.*]], [[RD2]] -; PTX: cvta.param.u64 [[RD3:%.*]], [[RD1]] -; PTX: add.s64 [[RD5:%.*]], [[RD3]], [[RD4]]; -; +; PTX: { +; PTX-NEXT: .reg .b64 %rd<8>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: mov.b64 %rd4, grid_const_inlineasm_escape_param_0; +; PTX-NEXT: ld.param.u64 %rd5, [grid_const_inlineasm_escape_param_1]; +; PTX-NEXT: cvta.to.global.u64 %rd6, %rd5; +; PTX-NEXT: mov.u64 %rd7, %rd4; +; PTX-NEXT: cvta.param.u64 %rd2, %rd7; +; PTX-NEXT: add.s64 %rd3, %rd2, 4; +; PTX-NEXT: // begin inline asm +; PTX-NEXT: add.s64 %rd1, %rd2, %rd3; +; PTX-NEXT: // end inline asm +; PTX-NEXT: st.global.u64 [%rd6], %rd1; +; PTX-NEXT: ret; +; PTX-NOT .local ; OPT-LABEL: define void @grid_const_inlineasm_escape( -; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[RESULT:%.*]]) { -; OPT-NOT: alloca [[STRUCT_S]] -; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) -; OPT: [[TMPPTR13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT_PARAM]], i32 0, i32 0 -; OPT: [[TMPPTR22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT_PARAM]], i32 0, i32 1 -; OPT: [[TMPPTR22_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[TMPPTR22]]) -; OPT: [[TMPPTR13_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[TMPPTR13]]) -; OPT: [[TMP2:%.*]] = call i64 asm "add.s64 $0, $1, $2 -; +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[RESULT:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[RESULT4:%.*]] = addrspacecast ptr [[RESULT]] to ptr addrspace(1) +; OPT-NEXT: [[RESULT5:%.*]] = addrspacecast ptr addrspace(1) 
[[RESULT4]] to ptr +; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]]) +; OPT-NEXT: [[TMPPTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0 +; OPT-NEXT: [[TMPPTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1 +; OPT-NEXT: [[TMP2:%.*]] = call i64 asm "add.s64 $0, $1, $2 +; OPT-NEXT: store i64 [[TMP2]], ptr [[RESULT5]], align 8 +; OPT-NEXT: ret void %tmpptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0 %tmpptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1 %1 = call i64 asm "add.s64 $0, $1, $2;", "=l,l,l"(ptr %tmpptr1, ptr %tmpptr2) #1 @@ -135,24 +219,42 @@ define void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, pt define void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) { ; PTX-LABEL: grid_const_partial_escape( -; PTX-NOT: .local -; PTX: ld.param.{{.*}} [[R1:%.*]], [grid_const_partial_escape_param_0]; -; PTX: add.{{.*}} -; PTX: cvta.param.u64 [[RD3:%.*]], {{%.*}} -; PTX: st.param.{{.*}} [param0+0], [[RD3]] -; PTX: call -; +; PTX: { +; PTX-NEXT: .reg .b32 %r<5>; +; PTX-NEXT: .reg .b64 %rd<6>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: mov.b64 %rd1, grid_const_partial_escape_param_0; +; PTX-NEXT: ld.param.u64 %rd2, [grid_const_partial_escape_param_1]; +; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2; +; PTX-NEXT: mov.u64 %rd4, %rd1; +; PTX-NEXT: cvta.param.u64 %rd5, %rd4; +; PTX-NEXT: ld.u32 %r1, [%rd5]; +; PTX-NEXT: add.s32 %r2, %r1, %r1; +; PTX-NEXT: st.global.u32 [%rd3], %r2; +; PTX-NEXT: { // callseq 2, 0 +; PTX-NEXT: .param .b64 param0; +; PTX-NEXT: st.param.b64 [param0+0], %rd5; +; PTX-NEXT: .param .b32 retval0; +; PTX-NEXT: call.uni (retval0), +; PTX-NEXT: escape, +; PTX-NEXT: ( +; PTX-NEXT: param0 +; PTX-NEXT: ); +; PTX-NEXT: ld.param.b32 %r3, [retval0+0]; +; PTX-NEXT: } // callseq 2 +; PTX-NEXT: ret; ; OPT-LABEL: define void @grid_const_partial_escape( -; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]], ptr {{%.*}}) { -; OPT-NOT: alloca -; OPT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) -; OPT: [[VAL:%.*]] = load i32, ptr addrspace(101) [[INPUT1]], align 4 -; OPT: [[TWICE:%.*]] = add i32 [[VAL]], [[VAL]] -; OPT: store i32 [[TWICE]] -; OPT: [[INPUT1_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1]]) -; OPT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT1_GEN]]) -; OPT: ret void -; +; OPT-SAME: ptr byval(i32) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[OUTPUT4:%.*]] = addrspacecast ptr [[OUTPUT]] to ptr addrspace(1) +; OPT-NEXT: [[OUTPUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUTPUT4]] to ptr +; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT1_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1]]) +; OPT-NEXT: [[VAL1:%.*]] = load i32, ptr [[INPUT1_GEN]], align 4 +; OPT-NEXT: [[TWICE:%.*]] = add i32 [[VAL1]], [[VAL1]] +; OPT-NEXT: store i32 [[TWICE]], ptr [[OUTPUT5]], align 4 +; OPT-NEXT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT1_GEN]]) +; OPT-NEXT: ret void %val = load i32, ptr %input %twice = add i32 %val, %val store i32 %twice, ptr %output @@ -163,27 +265,46 @@ define void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) { define i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %output) { ; PTX-LABEL: 
grid_const_partial_escapemem( ; PTX: { -; PTX: ld.param.{{.*}} [[R1:%.*]], [grid_const_partial_escapemem_param_0]; -; PTX: ld.param.{{.*}} [[R2:%.*]], [grid_const_partial_escapemem_param_0+4]; -; PTX: cvta.param.{{.*}} [[RD5:%.*]], {{%.*}}; -; PTX: st.global.{{.*}} [{{.*}}], [[RD5]]; -; PTX: add.s32 [[R3:%.*]], [[R1]], [[R2]] -; PTX: st.param.{{.*}} [param0+0], [[RD5]] -; PTX: escape +; PTX-NEXT: .reg .b32 %r<6>; +; PTX-NEXT: .reg .b64 %rd<6>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: mov.b64 %rd1, grid_const_partial_escapemem_param_0; +; PTX-NEXT: ld.param.u64 %rd2, [grid_const_partial_escapemem_param_1]; +; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2; +; PTX-NEXT: mov.u64 %rd4, %rd1; +; PTX-NEXT: cvta.param.u64 %rd5, %rd4; +; PTX-NEXT: ld.u32 %r1, [%rd5]; +; PTX-NEXT: ld.u32 %r2, [%rd5+4]; +; PTX-NEXT: st.global.u64 [%rd3], %rd5; +; PTX-NEXT: add.s32 %r3, %r1, %r2; +; PTX-NEXT: { // callseq 3, 0 +; PTX-NEXT: .param .b64 param0; +; PTX-NEXT: st.param.b64 [param0+0], %rd5; +; PTX-NEXT: .param .b32 retval0; +; PTX-NEXT: call.uni (retval0), +; PTX-NEXT: escape, +; PTX-NEXT: ( +; PTX-NEXT: param0 +; PTX-NEXT: ); +; PTX-NEXT: ld.param.b32 %r4, [retval0+0]; +; PTX-NEXT: } // callseq 3 +; PTX-NEXT: st.param.b32 [func_retval0+0], %r3; +; PTX-NEXT: ret; ; OPT-LABEL: define i32 @grid_const_partial_escapemem( -; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr {{%.*}}) { -; OPT-NOT: alloca -; OPT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) -; OPT: [[PTR13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT2]], i32 0, i32 0 -; OPT: [[VAL1:%.*]] = load i32, ptr addrspace(101) [[PTR13]], align 4 -; OPT: [[PTR22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT2]], i32 0, i32 1 -; OPT: [[VAL2:%.*]] = load i32, ptr addrspace(101) [[PTR22]], align 4 -; OPT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]]) -; OPT: store ptr [[INPUT1]] -; OPT: [[ADD:%.*]] = add i32 [[VAL1]], [[VAL2]] -; OPT: [[PTR1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[PTR13]]) -; OPT: [[CALL2:%.*]] = call i32 @escape(ptr [[PTR1]]) -; +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[OUTPUT4:%.*]] = addrspacecast ptr [[OUTPUT]] to ptr addrspace(1) +; OPT-NEXT: [[OUTPUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUTPUT4]] to ptr +; OPT-NEXT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]]) +; OPT-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0 +; OPT-NEXT: [[VAL1:%.*]] = load i32, ptr [[PTR1]], align 4 +; OPT-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1 +; OPT-NEXT: [[VAL2:%.*]] = load i32, ptr [[PTR2]], align 4 +; OPT-NEXT: store ptr [[INPUT1]], ptr [[OUTPUT5]], align 8 +; OPT-NEXT: [[ADD:%.*]] = add i32 [[VAL1]], [[VAL2]] +; OPT-NEXT: [[CALL2:%.*]] = call i32 @escape(ptr [[PTR1]]) +; OPT-NEXT: ret i32 [[ADD]] %ptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0 %val1 = load i32, ptr %ptr1 %ptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1 @@ -194,29 +315,48 @@ define i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %outpu ret i32 %add } -define void @grid_const_phi_escape(ptr byval(%struct.s) align 4 %input1, ptr %inout) { -; PTX-LABEL: grid_const_phi_escape( -; PTX: 
cvta.param.{{.*}} [[RD1:%.*]], {{.*}} -; PTX: @[[P1:%.*]] bra $L__BB[[TARGET_LABEL:[_0-9]+]]; -; PTX: $L__BB[[TARGET_LABEL]]: -; PTX: ld.{{.*}} [[R1:%.*]], [[[RD1]]]; -; -; OPT-LABEL: define void @grid_const_phi_escape( -; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr {{%.*}}) { -; OPT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) -; OPT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]]) -; OPT: br i1 {{.*}}, label %[[FIRST:.*]], label %[[SECOND:.*]] +define void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr %inout) { +; PTX-LABEL: grid_const_phi( +; PTX: { +; PTX-NEXT: .reg .pred %p<2>; +; PTX-NEXT: .reg .b32 %r<3>; +; PTX-NEXT: .reg .b64 %rd<9>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: mov.b64 %rd5, grid_const_phi_param_0; +; PTX-NEXT: ld.param.u64 %rd6, [grid_const_phi_param_1]; +; PTX-NEXT: cvta.to.global.u64 %rd1, %rd6; +; PTX-NEXT: mov.u64 %rd7, %rd5; +; PTX-NEXT: cvta.param.u64 %rd8, %rd7; +; PTX-NEXT: ld.global.u32 %r1, [%rd1]; +; PTX-NEXT: setp.lt.s32 %p1, %r1, 0; +; PTX-NEXT: @%p1 bra $L__BB8_2; +; PTX-NEXT: // %bb.1: // %second +; PTX-NEXT: add.s64 %rd8, %rd8, 4; +; PTX-NEXT: $L__BB8_2: // %merge +; PTX-NEXT: ld.u32 %r2, [%rd8]; +; PTX-NEXT: st.global.u32 [%rd1], %r2; +; PTX-NEXT: ret; +; OPT-LABEL: define void @grid_const_phi( +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1) +; OPT-NEXT: [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr +; OPT-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]]) +; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT2]], align 4 +; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0 +; OPT-NEXT: br i1 [[LESS]], label %[[FIRST:.*]], label %[[SECOND:.*]] ; OPT: [[FIRST]]: -; OPT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0 -; OPT: br label %[[MERGE:.*]] +; OPT-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0 +; OPT-NEXT: br label %[[MERGE:.*]] ; OPT: [[SECOND]]: -; OPT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 1 -; OPT: br label %[[MERGE]] +; OPT-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 1 +; OPT-NEXT: br label %[[MERGE]] ; OPT: [[MERGE]]: -; OPT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] -; OPT-NOT: load i32, ptr addrspace(101) -; OPT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 -; +; OPT-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] +; OPT-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 +; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4 +; OPT-NEXT: ret void %val = load i32, ptr %inout %less = icmp slt i32 %val, 0 @@ -235,32 +375,53 @@ merge: } ; NOTE: %input2 is *not* grid_constant -define void @grid_const_phi_escape2(ptr byval(%struct.s) align 4 %input1, ptr byval(%struct.s) %input2, ptr %inout) { -; PTX-LABEL: grid_const_phi_escape2( -; PTX: ld.param.{{.*}} [[R1:%.*]], [grid_const_phi_escape2_param_1+4]; -; PTX: @[[P1:%.*]] bra $L__BB[[LABEL:[_0-9]+]]; -; PTX: cvta.param.u64 [[RD1:%.*]], [[RD2:%.*]]; -; PTX: ld.u32 [[R1]], [[[RD1]]]; -; PTX: 
$L__BB[[LABEL]]: -; PTX: st.global.u32 [[[RD3:%.*]]], [[R1]] -; OPT-LABEL: define void @grid_const_phi_escape2( -; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr {{%.*}}) { -; OPT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8 -; OPT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) -; OPT: [[INPUT26:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT25]], align 8 -; OPT: store [[STRUCT_S]] [[INPUT26]], ptr [[INPUT24]], align 4 -; OPT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) -; OPT: [[INPUT11:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT12]]) -; OPT: br i1 [[LESS:%.*]], label %[[FIRST:.*]], label %[[SECOND:.*]] +define void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1, ptr byval(%struct.s) %input2, ptr %inout) { +; PTX-LABEL: grid_const_phi_ngc( +; PTX: { +; PTX-NEXT: .reg .pred %p<2>; +; PTX-NEXT: .reg .b32 %r<3>; +; PTX-NEXT: .reg .b64 %rd<12>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: mov.b64 %rd6, grid_const_phi_ngc_param_0; +; PTX-NEXT: ld.param.u64 %rd7, [grid_const_phi_ngc_param_2]; +; PTX-NEXT: cvta.to.global.u64 %rd1, %rd7; +; PTX-NEXT: mov.u64 %rd10, %rd6; +; PTX-NEXT: cvta.param.u64 %rd11, %rd10; +; PTX-NEXT: ld.global.u32 %r1, [%rd1]; +; PTX-NEXT: setp.lt.s32 %p1, %r1, 0; +; PTX-NEXT: @%p1 bra $L__BB9_2; +; PTX-NEXT: // %bb.1: // %second +; PTX-NEXT: mov.b64 %rd8, grid_const_phi_ngc_param_1; +; PTX-NEXT: mov.u64 %rd9, %rd8; +; PTX-NEXT: cvta.param.u64 %rd2, %rd9; +; PTX-NEXT: add.s64 %rd11, %rd2, 4; +; PTX-NEXT: $L__BB9_2: // %merge +; PTX-NEXT: ld.u32 %r2, [%rd11]; +; PTX-NEXT: st.global.u32 [%rd1], %r2; +; PTX-NEXT: ret; +; OPT-LABEL: define void @grid_const_phi_ngc( +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1) +; OPT-NEXT: [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr +; OPT-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]]) +; OPT-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]]) +; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT2]], align 4 +; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0 +; OPT-NEXT: br i1 [[LESS]], label %[[FIRST:.*]], label %[[SECOND:.*]] ; OPT: [[FIRST]]: -; OPT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0 -; OPT: br label %[[MERGE:.*]] +; OPT-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0 +; OPT-NEXT: br label %[[MERGE:.*]] ; OPT: [[SECOND]]: -; OPT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT24]], i32 0, i32 1 -; OPT: br label %[[MERGE]] +; OPT-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT2_PARAM_GEN]], i32 0, i32 1 +; OPT-NEXT: br label %[[MERGE]] ; OPT: [[MERGE]]: -; OPT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] -; +; OPT-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] +; OPT-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 +; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4 +; 
OPT-NEXT: ret void %val = load i32, ptr %inout %less = icmp slt i32 %val, 0 br i1 %less, label %first, label %second @@ -278,22 +439,42 @@ merge: } ; NOTE: %input2 is *not* grid_constant -define void @grid_const_select_escape(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %inout) { -; PTX-LABEL: grid_const_select_escape( -; PTX: cvta.param.{{.*}} [[RD2:%.*]], [[RD1:%.*]] -; PTX: setp.lt.{{.*}} [[P1:%.*]], {{%.*}}, 0 -; PTX: add.{{.*}} [[RD3:%.*]], %SP, 0; -; PTX: selp.{{.*}} [[RD4:%.*]], [[RD2]], [[RD3]], [[P1]]; -; PTX: ld.u32 {{%.*}}, [[[RD4]]]; -; OPT-LABEL: define void @grid_const_select_escape( -; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) { -; OPT: [[INPUT24:%.*]] = alloca i32, align 4 -; OPT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) -; OPT: [[INPUT11:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT12]]) -; OPT: load i32, ptr [[INOUT]] -; OPT: [[PTRNEW:%.*]] = select i1 [[LESS:%.*]], ptr [[INPUT11]], ptr [[INPUT24]] -; OPT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 -; +define void @grid_const_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %inout) { +; PTX-LABEL: grid_const_select( +; PTX: { +; PTX-NEXT: .reg .pred %p<2>; +; PTX-NEXT: .reg .b32 %r<3>; +; PTX-NEXT: .reg .b64 %rd<10>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: mov.b64 %rd1, grid_const_select_param_0; +; PTX-NEXT: ld.param.u64 %rd2, [grid_const_select_param_2]; +; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2; +; PTX-NEXT: mov.b64 %rd4, grid_const_select_param_1; +; PTX-NEXT: mov.u64 %rd5, %rd4; +; PTX-NEXT: cvta.param.u64 %rd6, %rd5; +; PTX-NEXT: mov.u64 %rd7, %rd1; +; PTX-NEXT: cvta.param.u64 %rd8, %rd7; +; PTX-NEXT: ld.global.u32 %r1, [%rd3]; +; PTX-NEXT: setp.lt.s32 %p1, %r1, 0; +; PTX-NEXT: selp.b64 %rd9, %rd8, %rd6, %p1; +; PTX-NEXT: ld.u32 %r2, [%rd9]; +; PTX-NEXT: st.global.u32 [%rd3], %r2; +; PTX-NEXT: ret; +; OPT-LABEL: define void @grid_const_select( +; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1) +; OPT-NEXT: [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr +; OPT-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]]) +; OPT-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]]) +; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT2]], align 4 +; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0 +; OPT-NEXT: [[PTRNEW:%.*]] = select i1 [[LESS]], ptr [[INPUT1_PARAM_GEN]], ptr [[INPUT2_PARAM_GEN]] +; OPT-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 +; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4 +; OPT-NEXT: ret void %val = load i32, ptr %inout %less = icmp slt i32 %val, 0 %ptrnew = select i1 %less, ptr %input1, ptr %input2 @@ -304,16 +485,27 @@ define void @grid_const_select_escape(ptr byval(i32) align 4 %input1, ptr byval( define i32 @grid_const_ptrtoint(ptr byval(i32) %input) { ; PTX-LABEL: grid_const_ptrtoint( -; PTX-NOT: .local -; PTX: ld.param.{{.*}} {{%.*}}, [grid_const_ptrtoint_param_0]; -; PTX: cvta.param.u64 [[RD1:%.*]], {{%.*}} -; PTX: cvt.u32.u64 {{%.*}}, [[RD1]] +; PTX: { 
+; PTX-NEXT: .reg .b32 %r<4>; +; PTX-NEXT: .reg .b64 %rd<4>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: mov.b64 %rd1, grid_const_ptrtoint_param_0; +; PTX-NEXT: mov.u64 %rd2, %rd1; +; PTX-NEXT: ld.param.u32 %r1, [grid_const_ptrtoint_param_0]; +; PTX-NEXT: cvta.param.u64 %rd3, %rd2; +; PTX-NEXT: cvt.u32.u64 %r2, %rd3; +; PTX-NEXT: add.s32 %r3, %r1, %r2; +; PTX-NEXT: st.param.b32 [func_retval0+0], %r3; +; PTX-NEXT: ret; ; OPT-LABEL: define i32 @grid_const_ptrtoint( -; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) { -; OPT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) -; OPT: [[VAL:%.*]] = load i32, ptr addrspace(101) [[INPUT2]] -; OPT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]]) -; OPT: [[PTRVAL:%.*]] = ptrtoint ptr [[INPUT1]] to i32 +; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT3:%.*]] = load i32, ptr addrspace(101) [[INPUT2]], align 4 +; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]]) +; OPT-NEXT: [[PTRVAL:%.*]] = ptrtoint ptr [[INPUT1]] to i32 +; OPT-NEXT: [[KEEPALIVE:%.*]] = add i32 [[INPUT3]], [[PTRVAL]] +; OPT-NEXT: ret i32 [[KEEPALIVE]] %val = load i32, ptr %input %ptrval = ptrtoint ptr %input to i32 %keepalive = add i32 %val, %ptrval @@ -352,13 +544,13 @@ declare dso_local ptr @escape3(ptr, ptr, ptr) local_unnamed_addr !14 = !{ptr @grid_const_partial_escapemem, !"kernel", i32 1, !"grid_constant", !15} !15 = !{i32 1} -!16 = !{ptr @grid_const_phi_escape, !"kernel", i32 1, !"grid_constant", !17} +!16 = !{ptr @grid_const_phi, !"kernel", i32 1, !"grid_constant", !17} !17 = !{i32 1} -!18 = !{ptr @grid_const_phi_escape2, !"kernel", i32 1, !"grid_constant", !19} +!18 = !{ptr @grid_const_phi_ngc, !"kernel", i32 1, !"grid_constant", !19} !19 = !{i32 1} -!20 = !{ptr @grid_const_select_escape, !"kernel", i32 1, !"grid_constant", !21} +!20 = !{ptr @grid_const_select, !"kernel", i32 1, !"grid_constant", !21} !21 = !{i32 1} !22 = !{ptr @grid_const_ptrtoint, !"kernel", i32 1, !"grid_constant", !23} diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll index f041f202777f6..7aec67a2ea628 100644 --- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll +++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll @@ -1,166 +1,300 @@ -; RUN: llc < %s -mtriple nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK32 -; RUN: llc < %s -mtriple nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK64 -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} -; RUN: %if ptxas %{ llc < %s -mtriple nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} - -%struct.ham = type { [4 x i32] } - -; // Verify that load with static offset into parameter is done directly. 
-; CHECK-LABEL: .visible .entry static_offset -; CHECK-NOT: .local -; CHECK64: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0] -; CHECK64: mov.b64 %[[param_addr:rd[0-9]+]], {{.*}}_param_1 -; CHECK64: mov.u64 %[[param_addr1:rd[0-9]+]], %[[param_addr]] -; CHECK64: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]] -; -; CHECK32: ld.param.u32 [[result_addr:%r[0-9]+]], [{{.*}}_param_0] -; CHECK32: mov.b32 %[[param_addr:r[0-9]+]], {{.*}}_param_1 -; CHECK32: mov.u32 %[[param_addr1:r[0-9]+]], %[[param_addr]] -; CHECK32: cvta.to.global.u32 [[result_addr_g:%r[0-9]+]], [[result_addr]] -; -; CHECK: ld.param.u32 [[value:%r[0-9]+]], [%[[param_addr1]]+12]; -; CHECK: st.global.u32 [[[result_addr_g]]], [[value]]; -; Function Attrs: nofree norecurse nounwind willreturn mustprogress -define dso_local void @static_offset(ptr nocapture %arg, ptr nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 { -bb: - %tmp = icmp eq i32 %arg2, 3 - br i1 %tmp, label %bb3, label %bb6 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mtriple nvptx -mcpu=sm_70 -nvptx-lower-args -S | FileCheck %s --check-prefixes=CHECK,CHECK32 +source_filename = "" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +%struct.S = type { i32, i32 } + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +declare dso_local void @_Z6escapePv(ptr noundef) local_unnamed_addr #0 -bb3: ; preds = %bb - %tmp4 = getelementptr inbounds %struct.ham, ptr %arg1, i64 0, i32 0, i64 3 - %tmp5 = load i32, ptr %tmp4, align 4 - store i32 %tmp5, ptr %arg, align 4 - br label %bb6 +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +declare dso_local void @_Z6escapei(i32 noundef) local_unnamed_addr #0 -bb6: ; preds = %bb3, %bb +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite) +declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #1 + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite) +declare void @llvm.memmove.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1 immarg) #1 + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #2 + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void @read_only(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +; CHECK-LABEL: define dso_local void @read_only( +; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; CHECK-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; CHECK-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[S3]], align 4 +; CHECK-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 +; CHECK-NEXT: ret void +; +entry: + %i = load i32, ptr %s, align 4 + store i32 %i, ptr %out, align 4 ret void } -; // Verify that load with dynamic offset into parameter is also 
done directly. -; CHECK-LABEL: .visible .entry dynamic_offset -; CHECK-NOT: .local -; CHECK64: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0] -; CHECK64: mov.b64 %[[param_addr:rd[0-9]+]], {{.*}}_param_1 -; CHECK64: mov.u64 %[[param_addr1:rd[0-9]+]], %[[param_addr]] -; CHECK64: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]] -; CHECK64: add.s64 %[[param_w_offset:rd[0-9]+]], %[[param_addr1]], +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void @read_only_gep(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +; CHECK-LABEL: define dso_local void @read_only_gep( +; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; CHECK-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; CHECK-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; CHECK-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4 +; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4 +; CHECK-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 +; CHECK-NEXT: ret void ; -; CHECK32: ld.param.u32 [[result_addr:%r[0-9]+]], [{{.*}}_param_0] -; CHECK32: mov.b32 %[[param_addr:r[0-9]+]], {{.*}}_param_1 -; CHECK32: mov.u32 %[[param_addr1:r[0-9]+]], %[[param_addr]] -; CHECK32: cvta.to.global.u32 [[result_addr_g:%r[0-9]+]], [[result_addr]] -; CHECK32: add.s32 %[[param_w_offset:r[0-9]+]], %[[param_addr1]], +entry: + %b = getelementptr inbounds nuw i8, ptr %s, i64 4 + %i = load i32, ptr %b, align 4 + store i32 %i, ptr %out, align 4 + ret void +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void @read_only_gep_asc(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +; CHECK-LABEL: define dso_local void @read_only_gep_asc( +; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; CHECK-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; CHECK-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; CHECK-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4 +; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4 +; CHECK-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 +; CHECK-NEXT: ret void ; -; CHECK: ld.param.u32 [[value:%r[0-9]+]], [%[[param_w_offset]]]; -; CHECK: st.global.u32 [[[result_addr_g]]], [[value]]; +entry: + %b = getelementptr inbounds nuw i8, ptr %s, i64 4 + %asc = addrspacecast ptr %b to ptr addrspace(101) + %i = load i32, ptr addrspace(101) %asc, align 4 + store i32 %i, ptr %out, align 4 + ret void +} -; Function Attrs: nofree norecurse nounwind willreturn mustprogress -define dso_local void @dynamic_offset(ptr nocapture %arg, ptr nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 { -bb: - %tmp = sext i32 %arg2 to i64 - %tmp3 = getelementptr inbounds %struct.ham, ptr %arg1, i64 0, i32 0, i64 %tmp - %tmp4 = load i32, ptr 
%tmp3, align 4 - store i32 %tmp4, ptr %arg, align 4 +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void @read_only_gep_asc0(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +; CHECK-LABEL: define dso_local void @read_only_gep_asc0( +; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; CHECK-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; CHECK-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 +; CHECK-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; CHECK-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; CHECK-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 +; CHECK-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101) +; CHECK-NEXT: [[ASC0:%.*]] = addrspacecast ptr addrspace(101) [[ASC]] to ptr +; CHECK-NEXT: [[I:%.*]] = load i32, ptr [[ASC0]], align 4 +; CHECK-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 +; CHECK-NEXT: ret void +; +entry: + %b = getelementptr inbounds nuw i8, ptr %s, i64 4 + %asc = addrspacecast ptr %b to ptr addrspace(101) + %asc0 = addrspacecast ptr addrspace(101) %asc to ptr + %i = load i32, ptr %asc0, align 4 + store i32 %i, ptr %out, align 4 ret void } -; Same as above, but with a bitcast present in the chain -; CHECK-LABEL:.visible .entry gep_bitcast -; CHECK-NOT: .local -; CHECK64-DAG: ld.param.u64 [[out:%rd[0-9]+]], [gep_bitcast_param_0] -; CHECK64-DAG: mov.b64 {{%rd[0-9]+}}, gep_bitcast_param_1 +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void @escape_ptr(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +; CHECK-LABEL: define dso_local void @escape_ptr( +; CHECK-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; CHECK-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; CHECK-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 +; CHECK-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; CHECK-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; CHECK-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; CHECK-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S3]]) #[[ATTR0]] +; CHECK-NEXT: ret void ; -; CHECK32-DAG: ld.param.u32 [[out:%r[0-9]+]], [gep_bitcast_param_0] -; CHECK32-DAG: mov.b32 {{%r[0-9]+}}, gep_bitcast_param_1 +entry: + call void @_Z6escapePv(ptr noundef nonnull %s) #0 + ret void +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void @escape_ptr_gep(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +; CHECK-LABEL: define dso_local void @escape_ptr_gep( +; CHECK-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; 
CHECK-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; CHECK-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; CHECK-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 +; CHECK-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; CHECK-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; CHECK-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 +; CHECK-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) #[[ATTR0]] +; CHECK-NEXT: ret void ; -; CHECK-DAG: ld.param.u32 {{%r[0-9]+}}, [gep_bitcast_param_2] -; CHECK64: ld.param.u8 [[value:%rs[0-9]+]], [{{%rd[0-9]+}}] -; CHECK64: st.global.u8 [{{%rd[0-9]+}}], [[value]]; -; CHECK32: ld.param.u8 [[value:%rs[0-9]+]], [{{%r[0-9]+}}] -; CHECK32: st.global.u8 [{{%r[0-9]+}}], [[value]]; +entry: + %b = getelementptr inbounds nuw i8, ptr %s, i64 4 + call void @_Z6escapePv(ptr noundef nonnull %b) #0 + ret void +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void @escape_ptr_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +; CHECK-LABEL: define dso_local void @escape_ptr_store( +; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; CHECK-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; CHECK-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 +; CHECK-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; CHECK-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; CHECK-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; CHECK-NEXT: [[I:%.*]] = ptrtoint ptr [[S3]] to i64 +; CHECK-NEXT: store i64 [[I]], ptr [[OUT2]], align 8 +; CHECK-NEXT: ret void ; -; Function Attrs: nofree norecurse nounwind willreturn mustprogress -define dso_local void @gep_bitcast(ptr nocapture %out, ptr nocapture readonly byval(%struct.ham) align 4 %in, i32 %n) local_unnamed_addr #0 { -bb: - %n64 = sext i32 %n to i64 - %gep = getelementptr inbounds %struct.ham, ptr %in, i64 0, i32 0, i64 %n64 - %load = load i8, ptr %gep, align 4 - store i8 %load, ptr %out, align 4 +entry: + %i = ptrtoint ptr %s to i64 + store i64 %i, ptr %out, align 8 ret void } -; Same as above, but with an ASC(101) present in the chain -; CHECK-LABEL:.visible .entry gep_bitcast_asc -; CHECK-NOT: .local -; CHECK64-DAG: ld.param.u64 [[out:%rd[0-9]+]], [gep_bitcast_asc_param_0] -; CHECK64-DAG: mov.b64 {{%rd[0-9]+}}, gep_bitcast_asc_param_1 +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void @escape_ptr_gep_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +; CHECK-LABEL: define dso_local void @escape_ptr_gep_store( +; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; CHECK-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; CHECK-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 +; CHECK-NEXT: store [[STRUCT_S]] [[S5]], ptr 
[[S3]], align 4 +; CHECK-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; CHECK-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 +; CHECK-NEXT: [[I:%.*]] = ptrtoint ptr [[B]] to i64 +; CHECK-NEXT: store i64 [[I]], ptr [[OUT2]], align 8 +; CHECK-NEXT: ret void ; -; CHECK32-DAG: ld.param.u32 [[out:%r[0-9]+]], [gep_bitcast_asc_param_0] -; CHECK32-DAG: mov.b32 {{%r[0-9]+}}, gep_bitcast_asc_param_1 +entry: + %b = getelementptr inbounds nuw i8, ptr %s, i64 4 + %i = ptrtoint ptr %b to i64 + store i64 %i, ptr %out, align 8 + ret void +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void @escape_math_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +; CHECK-LABEL: define dso_local void @escape_math_store( +; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; CHECK-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; CHECK-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 +; CHECK-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; CHECK-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; CHECK-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; CHECK-NEXT: [[I:%.*]] = ptrtoint ptr [[S3]] to i64 +; CHECK-NEXT: [[ADD:%.*]] = or disjoint i64 [[I]], 1 +; CHECK-NEXT: store i64 [[ADD]], ptr [[OUT2]], align 8 +; CHECK-NEXT: ret void ; -; CHECK-DAG: ld.param.u32 {{%r[0-9]+}}, [gep_bitcast_asc_param_2] -; CHECK64: ld.param.u8 [[value:%rs[0-9]+]], [{{%rd[0-9]+}}] -; CHECK64: st.global.u8 [{{%rd[0-9]+}}], [[value]]; -; CHECK32: ld.param.u8 [[value:%rs[0-9]+]], [{{%r[0-9]+}}] -; CHECK32: st.global.u8 [{{%r[0-9]+}}], [[value]]; +entry: + %i = ptrtoint ptr %s to i64 + %add = or disjoint i64 %i, 1 + store i64 %add, ptr %out, align 8 + ret void +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void @memcpy_from_param(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +; CHECK-LABEL: define dso_local void @memcpy_from_param( +; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; CHECK-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; CHECK-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; CHECK-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S4]], i64 16, i1 true) +; CHECK-NEXT: ret void ; -; Function Attrs: nofree norecurse nounwind willreturn mustprogress -define dso_local void @gep_bitcast_asc(ptr nocapture %out, ptr nocapture readonly byval(%struct.ham) align 4 %in, i32 %n) local_unnamed_addr #0 { -bb: - %n64 = sext i32 %n to i64 - %gep = getelementptr inbounds %struct.ham, ptr %in, i64 0, i32 0, i64 %n64 - %asc = addrspacecast ptr %gep to ptr addrspace(101) - %load = load i8, ptr addrspace(101) %asc, align 4 - store i8 %load, ptr %out, align 4 +entry: + tail call void 
@llvm.memcpy.p0.p0.i64(ptr %out, ptr %s, i64 16, i1 true) ret void } +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr nocapture noundef readnone byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +; CHECK-LABEL: define dso_local void @memcpy_to_param( +; CHECK-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef readnone byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; CHECK-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; CHECK-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 +; CHECK-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; CHECK-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1) +; CHECK-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr +; CHECK-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S3]], ptr [[IN2]], i64 16, i1 true) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %s, ptr %in, i64 16, i1 true) + ret void +} -; Verify that if the pointer escapes, then we do fall back onto using a temp copy. -; CHECK-LABEL: .visible .entry pointer_escapes -; CHECK: .local .align 4 .b8 __local_depot{{.*}} -; CHECK64: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0] -; CHECK64: add.u64 %[[copy_addr:rd[0-9]+]], %SPL, 0; -; CHECK32: ld.param.u32 [[result_addr:%r[0-9]+]], [{{.*}}_param_0] -; CHECK32: add.u32 %[[copy_addr:r[0-9]+]], %SPL, 0; -; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+12]; -; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+8]; -; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+4]; -; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1]; -; CHECK-DAG: st.local.u32 [%[[copy_addr]]+12], -; CHECK-DAG: st.local.u32 [%[[copy_addr]]+8], -; CHECK-DAG: st.local.u32 [%[[copy_addr]]+4], -; CHECK-DAG: st.local.u32 [%[[copy_addr]]], -; CHECK64: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]] -; CHECK64: add.s64 %[[copy_w_offset:rd[0-9]+]], %[[copy_addr]], -; CHECK32: cvta.to.global.u32 [[result_addr_g:%r[0-9]+]], [[result_addr]] -; CHECK32: add.s32 %[[copy_w_offset:r[0-9]+]], %[[copy_addr]], -; CHECK: ld.local.u32 [[value:%r[0-9]+]], [%[[copy_w_offset]]]; -; CHECK: st.global.u32 [[[result_addr_g]]], [[value]]; - -; Function Attrs: convergent norecurse nounwind mustprogress -define dso_local void @pointer_escapes(ptr nocapture %arg, ptr byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #1 { +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void @copy_on_store(ptr nocapture noundef readonly %in, ptr nocapture noundef byval(%struct.S) align 4 %s, i1 noundef zeroext %b) local_unnamed_addr #0 { +; CHECK-LABEL: define dso_local void @copy_on_store( +; CHECK-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; CHECK-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; CHECK-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 +; CHECK-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; CHECK-NEXT: [[IN1:%.*]] = addrspacecast ptr 
[[IN]] to ptr addrspace(1)
+; CHECK-NEXT:    [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr
+; CHECK-NEXT:    [[I:%.*]] = load i32, ptr [[IN2]], align 4
+; CHECK-NEXT:    store i32 [[I]], ptr [[S3]], align 4
+; CHECK-NEXT:    ret void
+;
 bb:
-  %tmp = sext i32 %arg2 to i64
-  %tmp3 = getelementptr inbounds %struct.ham, ptr %arg1, i64 0, i32 0, i64 %tmp
-  %tmp4 = load i32, ptr %tmp3, align 4
-  store i32 %tmp4, ptr %arg, align 4
-  %tmp5 = call ptr @escape(ptr nonnull %tmp3) #3
+  %i = load i32, ptr %in, align 4
+  store i32 %i, ptr %s, align 4
   ret void
 }
 
-; Function Attrs: convergent nounwind
-declare dso_local ptr @escape(ptr) local_unnamed_addr
-
+attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) "no-trapping-math"="true" "target-cpu"="sm_60" "target-features"="+ptx78,+sm_60" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) }
 
-!llvm.module.flags = !{!0, !1, !2}
-!nvvm.annotations = !{!3, !4, !5, !6, !7}
+!llvm.module.flags = !{!0, !1, !2, !3}
+!nvvm.annotations = !{!4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15}
+!llvm.ident = !{!16, !17}
 
-!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 9, i32 1]}
+!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 8]}
 !1 = !{i32 1, !"wchar_size", i32 4}
 !2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
-!3 = !{ptr @static_offset, !"kernel", i32 1}
-!4 = !{ptr @dynamic_offset, !"kernel", i32 1}
-!5 = !{ptr @pointer_escapes, !"kernel", i32 1}
-!6 = !{ptr @gep_bitcast, !"kernel", i32 1}
-!7 = !{ptr @gep_bitcast_asc, !"kernel", i32 1}
+!3 = !{i32 7, !"frame-pointer", i32 2}
+!4 = !{ptr @read_only, !"kernel", i32 1}
+!5 = !{ptr @escape_ptr, !"kernel", i32 1}
+!6 = !{ptr @escape_ptr_gep, !"kernel", i32 1}
+!7 = !{ptr @escape_ptr_store, !"kernel", i32 1}
+!8 = !{ptr @escape_ptr_gep_store, !"kernel", i32 1}
+!9 = !{ptr @escape_math_store, !"kernel", i32 1}
+!10 = !{ptr @memcpy_from_param, !"kernel", i32 1}
+!11 = !{ptr @memcpy_to_param, !"kernel", i32 1}
+!12 = !{ptr @copy_on_store, !"kernel", i32 1}
+!13 = !{ptr @read_only_gep, !"kernel", i32 1}
+!14 = !{ptr @read_only_gep_asc, !"kernel", i32 1}
+!15 = !{ptr @read_only_gep_asc0, !"kernel", i32 1}
+!16 = !{!"clang version 20.0.0git"}
+!17 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK32: {{.*}}

From c620b9cf8b00adaadb1e5ce24691b1d81dae39af Mon Sep 17 00:00:00 2001
From: Artem Belevich
Date: Wed, 28 Aug 2024 11:47:05 -0700
Subject: [PATCH 2/6] fixups

---
 llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp | 2 +-
 llvm/lib/Target/NVPTX/NVPTXSubtarget.h   | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index 243f39d8a1671..a79dd23abeec6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -507,7 +507,7 @@ struct ArgUseChecker : PtrUseVisitor<ArgUseChecker> {
     // Storing the pointer escapes it.
     if (U->get() == SI.getValueOperand())
      return PI.setEscapedAndAborted(&SI);
-    // Writes to the pointer are UB w/ __gid_constant__, but do not force a
+    // Writes to the pointer are UB w/ __grid_constant__, but do not force a
     // copy.
     if (!IsGridConstant)
      return PI.setAborted(&SI);
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 526032781d857..457f10f1d64a2 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -94,9 +94,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
   bool hasDotInstructions() const {
     return SmVersion >= 61 && PTXVersion >= 50;
   }
-  bool hasCvtaParam() const {
-    return SmVersion >= 70 && PTXVersion >= 77;
-  }
+  bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; }
   unsigned int getFullSmVersion() const { return FullSmVersion; }
   unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
   // GPUs with "a" suffix have include architecture-accelerated features that

From 169233b42194759bd8f11cfce2d3d617f6e30144 Mon Sep 17 00:00:00 2001
From: Artem Belevich
Date: Wed, 28 Aug 2024 14:06:25 -0700
Subject: [PATCH 3/6] Improving tests

---
 llvm/test/CodeGen/NVPTX/lower-byval-args.ll | 473 +++++++++++++-------
 1 file changed, 321 insertions(+), 152 deletions(-)

diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
index 7aec67a2ea628..a414a6c41cd5b 100644
--- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -1,5 +1,6 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -mtriple nvptx -mcpu=sm_70 -nvptx-lower-args -S | FileCheck %s --check-prefixes=CHECK,CHECK32
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --scrub-attributes --version 5
+; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_60 -mattr=+ptx77 -nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,SM_60
+; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_70 -mattr=+ptx77 -nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,SM_70
 source_filename = ""
 target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
@@ -23,15 +24,15 @@ declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #2
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
 define dso_local void @read_only(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; CHECK-LABEL: define dso_local void @read_only(
-; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; CHECK-NEXT:    [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; CHECK-NEXT:    [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
-; CHECK-NEXT:    [[I:%.*]] = load i32, ptr addrspace(101) [[S3]], align 4
-; CHECK-NEXT:    store i32 [[I]], ptr [[OUT2]], align 4
-; CHECK-NEXT:    ret void
+; COMMON-LABEL: define dso_local void @read_only(
+; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; COMMON-NEXT:  [[ENTRY:.*:]]
+; COMMON-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT:    [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; COMMON-NEXT:    [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; COMMON-NEXT:    [[I:%.*]] = load i32,
ptr addrspace(101) [[S3]], align 4 +; COMMON-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 +; COMMON-NEXT: ret void ; entry: %i = load i32, ptr %s, align 4 @@ -41,16 +42,16 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local void @read_only_gep(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; CHECK-LABEL: define dso_local void @read_only_gep( -; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; CHECK-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; CHECK-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; CHECK-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4 -; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4 -; CHECK-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 -; CHECK-NEXT: ret void +; COMMON-LABEL: define dso_local void @read_only_gep( +; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; COMMON-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4 +; COMMON-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4 +; COMMON-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 +; COMMON-NEXT: ret void ; entry: %b = getelementptr inbounds nuw i8, ptr %s, i64 4 @@ -61,16 +62,16 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local void @read_only_gep_asc(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; CHECK-LABEL: define dso_local void @read_only_gep_asc( -; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; CHECK-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; CHECK-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; CHECK-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4 -; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4 -; CHECK-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 -; CHECK-NEXT: ret void +; COMMON-LABEL: define dso_local void @read_only_gep_asc( +; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; COMMON-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4 +; COMMON-NEXT: [[I:%.*]] = 
load i32, ptr addrspace(101) [[B4]], align 4 +; COMMON-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 +; COMMON-NEXT: ret void ; entry: %b = getelementptr inbounds nuw i8, ptr %s, i64 4 @@ -82,21 +83,21 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local void @read_only_gep_asc0(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; CHECK-LABEL: define dso_local void @read_only_gep_asc0( -; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; CHECK-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; CHECK-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 -; CHECK-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 -; CHECK-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; CHECK-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 -; CHECK-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101) -; CHECK-NEXT: [[ASC0:%.*]] = addrspacecast ptr addrspace(101) [[ASC]] to ptr -; CHECK-NEXT: [[I:%.*]] = load i32, ptr [[ASC0]], align 4 -; CHECK-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 -; CHECK-NEXT: ret void +; COMMON-LABEL: define dso_local void @read_only_gep_asc0( +; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 +; COMMON-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101) +; COMMON-NEXT: [[ASC0:%.*]] = addrspacecast ptr addrspace(101) [[ASC]] to ptr +; COMMON-NEXT: [[I:%.*]] = load i32, ptr [[ASC0]], align 4 +; COMMON-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 +; COMMON-NEXT: ret void ; entry: %b = getelementptr inbounds nuw i8, ptr %s, i64 4 @@ -109,17 +110,17 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local void @escape_ptr(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; CHECK-LABEL: define dso_local void @escape_ptr( -; CHECK-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; CHECK-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; CHECK-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 -; CHECK-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 -; CHECK-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; CHECK-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) 
[[OUT1]] to ptr -; CHECK-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S3]]) #[[ATTR0]] -; CHECK-NEXT: ret void +; COMMON-LABEL: define dso_local void @escape_ptr( +; COMMON-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S3]]) +; COMMON-NEXT: ret void ; entry: call void @_Z6escapePv(ptr noundef nonnull %s) #0 @@ -128,18 +129,18 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local void @escape_ptr_gep(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; CHECK-LABEL: define dso_local void @escape_ptr_gep( -; CHECK-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; CHECK-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; CHECK-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 -; CHECK-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 -; CHECK-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; CHECK-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 -; CHECK-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) #[[ATTR0]] -; CHECK-NEXT: ret void +; COMMON-LABEL: define dso_local void @escape_ptr_gep( +; COMMON-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 +; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) +; COMMON-NEXT: ret void ; entry: %b = getelementptr inbounds nuw i8, ptr %s, i64 4 @@ -149,81 +150,75 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local void @escape_ptr_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; CHECK-LABEL: define dso_local void @escape_ptr_store( -; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; CHECK-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr 
addrspace(101) -; CHECK-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 -; CHECK-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 -; CHECK-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; CHECK-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; CHECK-NEXT: [[I:%.*]] = ptrtoint ptr [[S3]] to i64 -; CHECK-NEXT: store i64 [[I]], ptr [[OUT2]], align 8 -; CHECK-NEXT: ret void +; COMMON-LABEL: define dso_local void @escape_ptr_store( +; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; COMMON-NEXT: store ptr [[S3]], ptr [[OUT2]], align 8 +; COMMON-NEXT: ret void ; entry: - %i = ptrtoint ptr %s to i64 - store i64 %i, ptr %out, align 8 + store ptr %s, ptr %out, align 8 ret void } ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local void @escape_ptr_gep_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; CHECK-LABEL: define dso_local void @escape_ptr_gep_store( -; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; CHECK-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; CHECK-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 -; CHECK-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 -; CHECK-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; CHECK-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 -; CHECK-NEXT: [[I:%.*]] = ptrtoint ptr [[B]] to i64 -; CHECK-NEXT: store i64 [[I]], ptr [[OUT2]], align 8 -; CHECK-NEXT: ret void +; COMMON-LABEL: define dso_local void @escape_ptr_gep_store( +; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 +; COMMON-NEXT: store ptr [[B]], ptr [[OUT2]], align 8 +; COMMON-NEXT: ret void ; entry: %b = getelementptr inbounds nuw i8, ptr %s, i64 4 - %i = ptrtoint ptr %b to i64 - store i64 %i, ptr %out, align 8 + store ptr %b, ptr %out, align 8 ret void } ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) -define dso_local 
void @escape_math_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; CHECK-LABEL: define dso_local void @escape_math_store( -; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; CHECK-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; CHECK-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 -; CHECK-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 -; CHECK-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; CHECK-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; CHECK-NEXT: [[I:%.*]] = ptrtoint ptr [[S3]] to i64 -; CHECK-NEXT: [[ADD:%.*]] = or disjoint i64 [[I]], 1 -; CHECK-NEXT: store i64 [[ADD]], ptr [[OUT2]], align 8 -; CHECK-NEXT: ret void +define dso_local void @escape_ptrtoint(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +; COMMON-LABEL: define dso_local void @escape_ptrtoint( +; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; COMMON-NEXT: [[I:%.*]] = ptrtoint ptr [[S3]] to i64 +; COMMON-NEXT: store i64 [[I]], ptr [[OUT2]], align 8 +; COMMON-NEXT: ret void ; entry: %i = ptrtoint ptr %s to i64 - %add = or disjoint i64 %i, 1 - store i64 %add, ptr %out, align 8 + store i64 %i, ptr %out, align 8 ret void } ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local void @memcpy_from_param(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; CHECK-LABEL: define dso_local void @memcpy_from_param( -; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; CHECK-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; CHECK-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; CHECK-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S4]], i64 16, i1 true) -; CHECK-NEXT: ret void +; COMMON-LABEL: define dso_local void @memcpy_from_param( +; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true) +; 
COMMON-NEXT: ret void ; entry: tail call void @llvm.memcpy.p0.p0.i64(ptr %out, ptr %s, i64 16, i1 true) @@ -232,17 +227,17 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr nocapture noundef readnone byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; CHECK-LABEL: define dso_local void @memcpy_to_param( -; CHECK-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef readnone byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; CHECK-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; CHECK-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 -; CHECK-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 -; CHECK-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1) -; CHECK-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr -; CHECK-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S3]], ptr [[IN2]], i64 16, i1 true) -; CHECK-NEXT: ret void +; COMMON-LABEL: define dso_local void @memcpy_to_param( +; COMMON-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef readnone byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1) +; COMMON-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr +; COMMON-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S3]], ptr [[IN2]], i64 16, i1 true) +; COMMON-NEXT: ret void ; entry: tail call void @llvm.memcpy.p0.p0.i64(ptr %s, ptr %in, i64 16, i1 true) @@ -251,18 +246,18 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local void @copy_on_store(ptr nocapture noundef readonly %in, ptr nocapture noundef byval(%struct.S) align 4 %s, i1 noundef zeroext %b) local_unnamed_addr #0 { -; CHECK-LABEL: define dso_local void @copy_on_store( -; CHECK-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { -; CHECK-NEXT: [[BB:.*:]] -; CHECK-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; CHECK-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; CHECK-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 -; CHECK-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 -; CHECK-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1) -; CHECK-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr -; CHECK-NEXT: [[I:%.*]] = load i32, ptr [[IN2]], align 4 -; CHECK-NEXT: store i32 [[I]], ptr [[S3]], align 4 -; CHECK-NEXT: ret void +; COMMON-LABEL: define dso_local void @copy_on_store( +; COMMON-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[BB:.*:]] +; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[S4:%.*]] = 
addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1) +; COMMON-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr +; COMMON-NEXT: [[I:%.*]] = load i32, ptr [[IN2]], align 4 +; COMMON-NEXT: store i32 [[I]], ptr [[S3]], align 4 +; COMMON-NEXT: ret void ; bb: %i = load i32, ptr %in, align 4 @@ -270,13 +265,185 @@ bb: ret void } +define void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %out, i1 %cond) { +; SM_60-LABEL: define void @test_select( +; SM_60-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] { +; SM_60-NEXT: [[BB:.*:]] +; SM_60-NEXT: [[OUT7:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; SM_60-NEXT: [[OUT8:%.*]] = addrspacecast ptr addrspace(1) [[OUT7]] to ptr +; SM_60-NEXT: [[INPUT24:%.*]] = alloca i32, align 4 +; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; SM_60-NEXT: [[INPUT26:%.*]] = load i32, ptr addrspace(101) [[INPUT25]], align 4 +; SM_60-NEXT: store i32 [[INPUT26]], ptr [[INPUT24]], align 4 +; SM_60-NEXT: [[INPUT11:%.*]] = alloca i32, align 4 +; SM_60-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; SM_60-NEXT: [[INPUT13:%.*]] = load i32, ptr addrspace(101) [[INPUT12]], align 4 +; SM_60-NEXT: store i32 [[INPUT13]], ptr [[INPUT11]], align 4 +; SM_60-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT24]] +; SM_60-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 +; SM_60-NEXT: store i32 [[VALLOADED]], ptr [[OUT8]], align 4 +; SM_60-NEXT: ret void +; +; SM_70-LABEL: define void @test_select( +; SM_70-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] { +; SM_70-NEXT: [[BB:.*:]] +; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; SM_70-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; SM_70-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]]) +; SM_70-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; SM_70-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]]) +; SM_70-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT1_PARAM_GEN]], ptr [[INPUT2_PARAM_GEN]] +; SM_70-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 +; SM_70-NEXT: store i32 [[VALLOADED]], ptr [[OUT2]], align 4 +; SM_70-NEXT: ret void +; +bb: + %ptrnew = select i1 %cond, ptr %input1, ptr %input2 + %valloaded = load i32, ptr %ptrnew, align 4 + store i32 %valloaded, ptr %out, align 4 + ret void +} + +define void @test_select_write(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %out, i1 %cond) { +; COMMON-LABEL: define void @test_select_write( +; COMMON-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] { +; COMMON-NEXT: [[BB:.*:]] +; COMMON-NEXT: [[OUT7:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; COMMON-NEXT: [[OUT8:%.*]] = addrspacecast ptr addrspace(1) [[OUT7]] to ptr +; COMMON-NEXT: [[INPUT24:%.*]] = 
alloca i32, align 4 +; COMMON-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; COMMON-NEXT: [[INPUT26:%.*]] = load i32, ptr addrspace(101) [[INPUT25]], align 4 +; COMMON-NEXT: store i32 [[INPUT26]], ptr [[INPUT24]], align 4 +; COMMON-NEXT: [[INPUT11:%.*]] = alloca i32, align 4 +; COMMON-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; COMMON-NEXT: [[INPUT13:%.*]] = load i32, ptr addrspace(101) [[INPUT12]], align 4 +; COMMON-NEXT: store i32 [[INPUT13]], ptr [[INPUT11]], align 4 +; COMMON-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT24]] +; COMMON-NEXT: store i32 1, ptr [[PTRNEW]], align 4 +; COMMON-NEXT: ret void +; +bb: + %ptrnew = select i1 %cond, ptr %input1, ptr %input2 + store i32 1, ptr %ptrnew, align 4 + ret void +} + +define void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S) %input2, ptr %inout, i1 %cond) { +; SM_60-LABEL: define void @test_phi( +; SM_60-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] { +; SM_60-NEXT: [[BB:.*:]] +; SM_60-NEXT: [[INOUT7:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1) +; SM_60-NEXT: [[INOUT8:%.*]] = addrspacecast ptr addrspace(1) [[INOUT7]] to ptr +; SM_60-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8 +; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; SM_60-NEXT: [[INPUT26:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT25]], align 8 +; SM_60-NEXT: store [[STRUCT_S]] [[INPUT26]], ptr [[INPUT24]], align 4 +; SM_60-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4 +; SM_60-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; SM_60-NEXT: [[INPUT13:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT12]], align 4 +; SM_60-NEXT: store [[STRUCT_S]] [[INPUT13]], ptr [[INPUT11]], align 4 +; SM_60-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; SM_60: [[FIRST]]: +; SM_60-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0 +; SM_60-NEXT: br label %[[MERGE:.*]] +; SM_60: [[SECOND]]: +; SM_60-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT24]], i32 0, i32 1 +; SM_60-NEXT: br label %[[MERGE]] +; SM_60: [[MERGE]]: +; SM_60-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] +; SM_60-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 +; SM_60-NEXT: store i32 [[VALLOADED]], ptr [[INOUT8]], align 4 +; SM_60-NEXT: ret void +; +; SM_70-LABEL: define void @test_phi( +; SM_70-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] { +; SM_70-NEXT: [[BB:.*:]] +; SM_70-NEXT: [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1) +; SM_70-NEXT: [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr +; SM_70-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; SM_70-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]]) +; SM_70-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; SM_70-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]]) +; SM_70-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; SM_70: [[FIRST]]: +; SM_70-NEXT: [[PTR1:%.*]] = getelementptr inbounds 
[[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0 +; SM_70-NEXT: br label %[[MERGE:.*]] +; SM_70: [[SECOND]]: +; SM_70-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT2_PARAM_GEN]], i32 0, i32 1 +; SM_70-NEXT: br label %[[MERGE]] +; SM_70: [[MERGE]]: +; SM_70-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] +; SM_70-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 +; SM_70-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4 +; SM_70-NEXT: ret void +; +bb: + br i1 %cond, label %first, label %second + +first: ; preds = %bb + %ptr1 = getelementptr inbounds %struct.S, ptr %input1, i32 0, i32 0 + br label %merge + +second: ; preds = %bb + %ptr2 = getelementptr inbounds %struct.S, ptr %input2, i32 0, i32 1 + br label %merge + +merge: ; preds = %second, %first + %ptrnew = phi ptr [ %ptr1, %first ], [ %ptr2, %second ] + %valloaded = load i32, ptr %ptrnew, align 4 + store i32 %valloaded, ptr %inout, align 4 + ret void +} + +define void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S) %input2, i1 %cond) { +; COMMON-LABEL: define void @test_phi_write( +; COMMON-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] { +; COMMON-NEXT: [[BB:.*:]] +; COMMON-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8 +; COMMON-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; COMMON-NEXT: [[INPUT26:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT25]], align 8 +; COMMON-NEXT: store [[STRUCT_S]] [[INPUT26]], ptr [[INPUT24]], align 4 +; COMMON-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; COMMON-NEXT: [[INPUT13:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT12]], align 4 +; COMMON-NEXT: store [[STRUCT_S]] [[INPUT13]], ptr [[INPUT11]], align 4 +; COMMON-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; COMMON: [[FIRST]]: +; COMMON-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0 +; COMMON-NEXT: br label %[[MERGE:.*]] +; COMMON: [[SECOND]]: +; COMMON-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT24]], i32 0, i32 1 +; COMMON-NEXT: br label %[[MERGE]] +; COMMON: [[MERGE]]: +; COMMON-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] +; COMMON-NEXT: store i32 1, ptr [[PTRNEW]], align 4 +; COMMON-NEXT: ret void +; +bb: + br i1 %cond, label %first, label %second + +first: ; preds = %bb + %ptr1 = getelementptr inbounds %struct.S, ptr %input1, i32 0, i32 0 + br label %merge + +second: ; preds = %bb + %ptr2 = getelementptr inbounds %struct.S, ptr %input2, i32 0, i32 1 + br label %merge + +merge: ; preds = %second, %first + %ptrnew = phi ptr [ %ptr1, %first ], [ %ptr2, %second ] + store i32 1, ptr %ptrnew, align 4 + ret void +} + attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) "no-trapping-math"="true" "target-cpu"="sm_60" "target-features"="+ptx78,+sm_60" "uniform-work-group-size"="true" } attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) } !llvm.module.flags = !{!0, !1, !2, !3} -!nvvm.annotations = !{!4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15} -!llvm.ident = !{!16, !17} +!nvvm.annotations = !{!4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, 
!16, !17, !18, !19}
+!llvm.ident = !{!20, !21}
 
 !0 = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 8]}
 !1 = !{i32 1, !"wchar_size", i32 4}
@@ -287,14 +454,16 @@ attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) }
 !6 = !{ptr @escape_ptr_gep, !"kernel", i32 1}
 !7 = !{ptr @escape_ptr_store, !"kernel", i32 1}
 !8 = !{ptr @escape_ptr_gep_store, !"kernel", i32 1}
-!9 = !{ptr @escape_math_store, !"kernel", i32 1}
+!9 = !{ptr @escape_ptrtoint, !"kernel", i32 1}
 !10 = !{ptr @memcpy_from_param, !"kernel", i32 1}
 !11 = !{ptr @memcpy_to_param, !"kernel", i32 1}
 !12 = !{ptr @copy_on_store, !"kernel", i32 1}
 !13 = !{ptr @read_only_gep, !"kernel", i32 1}
 !14 = !{ptr @read_only_gep_asc, !"kernel", i32 1}
 !15 = !{ptr @read_only_gep_asc0, !"kernel", i32 1}
-!16 = !{!"clang version 20.0.0git"}
-!17 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK32: {{.*}}
+!16 = !{ptr @test_select, !"kernel", i32 1}
+!17 = !{ptr @test_phi, !"kernel", i32 1}
+!18 = !{ptr @test_phi_write, !"kernel", i32 1}
+!19 = !{ptr @test_select_write, !"kernel", i32 1}
+!20 = !{!"clang version 20.0.0git"}
+!21 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}

From 00d73007e7ddf25ac22b6a33efe963f4cc6df69d Mon Sep 17 00:00:00 2001
From: Artem Belevich
Date: Wed, 28 Aug 2024 15:33:27 -0700
Subject: [PATCH 4/6] Undo unintentional comment reformat.

---
 llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index a79dd23abeec6..8585ab756ae43 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -79,15 +79,15 @@
 //
 // define void @foo({i32*, i32*}* byval %input) {
 //   %b_param = addrspacecat ptr %input to ptr addrspace(101)
-//   %b_ptr = getelementptr {ptr, ptr}, ptr addrspace(101) %b_param, i64 0,
-//   i32 1 %b = load ptr, ptr addrspace(101) %b_ptr %b_global = addrspacecast
-//   ptr %b to ptr addrspace(1) ; use %b_generic
+//   %b_ptr = getelementptr {ptr, ptr}, ptr addrspace(101) %b_param, i64 0, i32 1
+//   %b = load ptr, ptr addrspace(101) %b_ptr
+//   %b_global = addrspacecast ptr %b to ptr addrspace(1)
+//   ; use %b_generic
 // }
 //
-// Create a local copy of kernel byval parameters used in a way that *might*
-// mutate the parameter, by storing it in an alloca. Mutations to
-// "grid_constant" parameters are undefined behaviour, and don't require
-// local copies.
+// Create a local copy of kernel byval parameters used in a way that *might* mutate
+// the parameter, by storing it in an alloca. Mutations to "grid_constant" parameters
+// are undefined behaviour, and don't require local copies.
 //
 // define void @foo(ptr byval(%struct.s) align 4 %input) {
 //   store i32 42, ptr %input
@@ -124,11 +124,11 @@
 //
 // define void @foo(ptr byval(%struct.s) %input) {
 //   %input1 = addrspacecast ptr %input to ptr addrspace(101)
-//   ; the following intrinsic converts pointer to generic. We don't use an
-//   addrspacecast ; to prevent generic -> param -> generic from getting
-//   cancelled out %input1.gen = call ptr
-//   @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) %input1) %call =
-//   call i32 @escape(ptr %input1.gen) ret void
+//   ; the following intrinsic converts pointer to generic. We don't use an addrspacecast
+//   ; to prevent generic -> param -> generic from getting cancelled out
+//   %input1.gen = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) %input1)
+//   %call = call i32 @escape(ptr %input1.gen)
+//   ret void
 // }
 //
 // TODO: merge this pass with NVPTXInferAddressSpaces so that other passes don't

From 909970a7222f23e15d416271e4304309d7f1b3a8 Mon Sep 17 00:00:00 2001
From: Artem Belevich
Date: Thu, 5 Sep 2024 14:33:26 -0700
Subject: [PATCH 5/6] Removed some unused data. Improved readability a bit.

---
 llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp | 31 ++++++++----------------
 1 file changed, 10 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index 8585ab756ae43..130075f22f55c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -458,7 +458,6 @@ struct ArgUseChecker : PtrUseVisitor<ArgUseChecker> {
   using Base = PtrUseVisitor<ArgUseChecker>;
 
   bool IsGridConstant;
-  SmallPtrSet<Value *, 16> AllArgUsers;
   // Set of phi/select instructions using the Arg
   SmallPtrSet<Instruction *, 4> Conditionals;
 
@@ -471,13 +470,11 @@ struct ArgUseChecker : PtrUseVisitor<ArgUseChecker> {
     IsOffsetKnown = false;
     Offset = APInt(IntIdxTy->getBitWidth(), 0);
     PI.reset();
-    AllArgUsers.clear();
     Conditionals.clear();
 
     LLVM_DEBUG(dbgs() << "Checking Argument " << A << "\n");
     // Enqueue the uses of this pointer.
     enqueueUsers(A);
-    AllArgUsers.insert(&A);
 
     // Visit all the uses off the worklist until it is empty.
     // Note that unlike PtrUseVisitor we're intentionally do not track offset.
     // We're only interested in how we use the pointer.
@@ -486,7 +483,6 @@ struct ArgUseChecker : PtrUseVisitor<ArgUseChecker> {
       UseToVisit ToVisit = Worklist.pop_back_val();
      U = ToVisit.UseAndIsOffsetKnown.getPointer();
      Instruction *I = cast<Instruction>(U->getUser());
-      AllArgUsers.insert(I);
      if (isa<PHINode>(I) || isa<SelectInst>(I))
        Conditionals.insert(I);
      LLVM_DEBUG(dbgs() << "Processing " << *I << "\n");
@@ -498,8 +494,8 @@ struct ArgUseChecker : PtrUseVisitor<ArgUseChecker> {
     else if (PI.isAborted())
      LLVM_DEBUG(dbgs() << "Pointer use needs a copy: "
                        << *PI.getAbortingInst() << "\n");
-    LLVM_DEBUG(dbgs() << "Traversed " << AllArgUsers.size() << " with "
-               << Conditionals.size() << " conditionals\n");
+    LLVM_DEBUG(dbgs() << "Traversed " << Conditionals.size()
+                      << " conditionals\n");
     return PI;
   }
@@ -535,25 +531,17 @@ struct ArgUseChecker : PtrUseVisitor<ArgUseChecker> {
   void visitMemTransferInst(MemTransferInst &II) {
     if (*U == II.getRawDest() && !IsGridConstant)
      PI.setAborted(&II);
-
-    // TODO: memcpy from arg is OK as it can get unrolled into ld.param.
-    // However, memcpys are currently expected to be unrolled before we
-    // get here, so we never see them in practice, and we do not currently
-    // handle them when we convert IR to access param space directly. So,
-    // we'll mark it as an escape for now. It would still force a copy on
-    // pre-sm_70 GPUs where we can't take address of a parameter w/o a copy.
-    //
-    // PI.setEscaped(&II);
+    // memcpy/memmove are OK when the pointer is source. We can convert them to
+    // AS-specific memcpy.
   }
 
   void visitMemSetInst(MemSetInst &II) {
-    if (*U == II.getRawDest() && !IsGridConstant)
+    if (!IsGridConstant)
      PI.setAborted(&II);
   }
-  // debug only helper.
-  auto &getVisitedUses() { return VisitedUses; }
-};
+}; // struct ArgUseChecker
 } // namespace
+
 void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
                                       Argument *Arg) {
   Function *Func = Arg->getParent();
@@ -566,8 +554,9 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
   ArgUseChecker AUC(DL, IsGridConstant);
   ArgUseChecker::PtrInfo PI = AUC.visitArgPtr(*Arg);
+  bool ArgUseIsReadOnly = !(PI.isEscaped() || PI.isAborted());
   // Easy case, accessing parameter directly is fine.
-  if (!(PI.isEscaped() || PI.isAborted()) && AUC.Conditionals.empty()) {
+  if (ArgUseIsReadOnly && AUC.Conditionals.empty()) {
     // Convert all loads and intermediate operations to use parameter AS and
     // skip creation of a local copy of the argument.
     SmallVector<Use *, 16> UsesToUpdate;
@@ -595,7 +584,7 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
   // `__grid_constant__` for the argument, we'll consider escaped pointer as
   // read-only.
   unsigned AS = DL.getAllocaAddrSpace();
-  if (HasCvtaParam && (!(PI.isEscaped() || PI.isAborted()) || IsGridConstant)) {
+  if (HasCvtaParam && (ArgUseIsReadOnly || IsGridConstant)) {
     LLVM_DEBUG(dbgs() << "Using non-copy pointer to " << *Arg << "\n");
     // Replace all argument pointer uses (which might include a device function
     // call) with a cast to the generic address space using cvta.param

From e38eaf346194215bffbd2dae03291c64e32e9b4c Mon Sep 17 00:00:00 2001
From: Artem Belevich
Date: Wed, 11 Sep 2024 10:27:48 -0700
Subject: [PATCH 6/6] Typo fix

---
 llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index 130075f22f55c..082546c4dd72f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -477,7 +477,7 @@ struct ArgUseChecker : PtrUseVisitor<ArgUseChecker> {
     enqueueUsers(A);
 
     // Visit all the uses off the worklist until it is empty.
-    // Note that unlike PtrUseVisitor we're intentionally do not track offset.
+    // Note that unlike PtrUseVisitor we intentionally do not track offsets.
     // We're only interested in how we use the pointer.
     while (!(Worklist.empty() || PI.isAborted())) {
      UseToVisit ToVisit = Worklist.pop_back_val();
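
For quick orientation, the SM_60/SM_70 differences encoded in the test_select checks above boil down to the short sketch below. This is an illustrative distillation only: the @example kernel and its value names are invented here, not taken from the patches, and the sketch assumes @example is registered as a kernel via !nvvm.annotations, the same way the tests register @test_select.

; A read-only byval pointer that flows through a select:
define void @example(ptr byval(i32) align 4 %a, ptr byval(i32) %b, ptr %out, i1 %c) {
  %p = select i1 %c, ptr %a, ptr %b
  %v = load i32, ptr %p, align 4
  store i32 %v, ptr %out, align 4
  ret void
}

; On sm_60 (no cvta.param), -nvptx-lower-args still materializes an alloca
; copy of each byval argument before the select, as in the SM_60 checks.
; On sm_70 with ptx77 (hasCvtaParam), no copies are made; both select
; operands are rewritten through the param-to-generic intrinsic instead:
;
;   %b.param = addrspacecast ptr %b to ptr addrspace(101)
;   %b.gen = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) %b.param)
;   %a.param = addrspacecast ptr %a to ptr addrspace(101)
;   %a.gen = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) %a.param)
;   %p = select i1 %c, ptr %a.gen, ptr %b.gen
;   %v = load i32, ptr %p, align 4
;
; Writes through such a pointer (test_select_write, test_phi_write above)
; still force the local copies on both subtargets, since the param space
; stays read-only unless the argument is a __grid_constant__.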