diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 318528021ef75..36062e69ef107 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -499,6 +499,18 @@ class TargetTransformInfo {
   LLVM_ABI bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const;
 
+  // Given an address space cast of the given pointer value, calculate the known
+  // bits of the source pointer in the source addrspace and the destination
+  // pointer in the destination addrspace.
+  LLVM_ABI std::pair<KnownBits, KnownBits>
+  computeKnownBitsAddrSpaceCast(unsigned ToAS, const Value &PtrOp) const;
+
+  // Given an address space cast, calculate the known bits of the resulting ptr
+  // in the destination addrspace using the known bits of the source pointer in
+  // the source addrspace.
+  LLVM_ABI KnownBits computeKnownBitsAddrSpaceCast(
+      unsigned FromAS, unsigned ToAS, const KnownBits &FromPtrBits) const;
+
   /// Return true if globals in this address space can have initializers other
   /// than `undef`.
   LLVM_ABI bool
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index bb299becfdcba..460a1b2cff67e 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -16,6 +16,7 @@
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/GetElementPtrTypeIterator.h"
@@ -151,6 +152,52 @@ class TargetTransformInfoImplBase {
   }
 
   virtual bool isNoopAddrSpaceCast(unsigned, unsigned) const { return false; }
+
+  virtual std::pair<KnownBits, KnownBits>
+  computeKnownBitsAddrSpaceCast(unsigned ToAS, const Value &PtrOp) const {
+    const Type *PtrTy = PtrOp.getType();
+    assert(PtrTy->isPtrOrPtrVectorTy() &&
+           "expected pointer or pointer vector type");
+    unsigned FromAS = PtrTy->getPointerAddressSpace();
+
+    if (DL.isNonIntegralAddressSpace(FromAS))
+      return std::pair(KnownBits(DL.getPointerSizeInBits(FromAS)),
+                       KnownBits(DL.getPointerSizeInBits(ToAS)));
+
+    KnownBits FromPtrBits;
+    if (const AddrSpaceCastInst *CastI = dyn_cast<AddrSpaceCastInst>(&PtrOp)) {
+      std::pair<KnownBits, KnownBits> KB = computeKnownBitsAddrSpaceCast(
+          CastI->getDestAddressSpace(), *CastI->getPointerOperand());
+      FromPtrBits = KB.second;
+    } else if (FromAS == 0 &&
+               PatternMatch::match(&PtrOp, PatternMatch::m_Zero())) {
+      // For addrspace 0, we know that a null pointer has the value 0.
+      FromPtrBits = KnownBits::makeConstant(
+          APInt::getZero(DL.getPointerSizeInBits(FromAS)));
+    } else {
+      FromPtrBits = computeKnownBits(&PtrOp, DL, nullptr);
+    }
+
+    KnownBits ToPtrBits =
+        computeKnownBitsAddrSpaceCast(FromAS, ToAS, FromPtrBits);
+
+    return {FromPtrBits, ToPtrBits};
+  }
+
+  virtual KnownBits
+  computeKnownBitsAddrSpaceCast(unsigned FromAS, unsigned ToAS,
+                                const KnownBits &FromPtrBits) const {
+    unsigned ToASBitSize = DL.getPointerSizeInBits(ToAS);
+
+    if (DL.isNonIntegralAddressSpace(FromAS))
+      return KnownBits(ToASBitSize);
+
+    // By default, we assume that all valid "larger" (e.g. 64-bit) to "smaller"
+    // (e.g. 32-bit) casts work by chopping off the high bits.
+    // By default, we do not assume that null results in null again.
+    return FromPtrBits.anyextOrTrunc(ToASBitSize);
+  }
+
   virtual bool
   canHaveNonUndefGlobalInitializerInAddressSpace(unsigned AS) const {
     return AS == 0;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index b9dec59a1ecad..1b2d35ad67eb1 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -330,6 +330,17 @@ bool TargetTransformInfo::isNoopAddrSpaceCast(unsigned FromAS,
   return TTIImpl->isNoopAddrSpaceCast(FromAS, ToAS);
 }
 
+std::pair<KnownBits, KnownBits>
+TargetTransformInfo::computeKnownBitsAddrSpaceCast(unsigned ToAS,
+                                                   const Value &PtrOp) const {
+  return TTIImpl->computeKnownBitsAddrSpaceCast(ToAS, PtrOp);
+}
+
+KnownBits TargetTransformInfo::computeKnownBitsAddrSpaceCast(
+    unsigned FromAS, unsigned ToAS, const KnownBits &FromPtrBits) const {
+  return TTIImpl->computeKnownBitsAddrSpaceCast(FromAS, ToAS, FromPtrBits);
+}
+
 bool TargetTransformInfo::canHaveNonUndefGlobalInitializerInAddressSpace(
     unsigned AS) const {
   return TTIImpl->canHaveNonUndefGlobalInitializerInAddressSpace(AS);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 03d16fdd54c42..558b7c2491e8b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1150,41 +1150,6 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
       ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
     return NewVal;
   }
-  case Intrinsic::ptrmask: {
-    unsigned OldAS = OldV->getType()->getPointerAddressSpace();
-    unsigned NewAS = NewV->getType()->getPointerAddressSpace();
-    Value *MaskOp = II->getArgOperand(1);
-    Type *MaskTy = MaskOp->getType();
-
-    bool DoTruncate = false;
-
-    const GCNTargetMachine &TM =
-        static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());
-    if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {
-      // All valid 64-bit to 32-bit casts work by chopping off the high
-      // bits. Any masking only clearing the low bits will also apply in the new
-      // address space.
-      if (DL.getPointerSizeInBits(OldAS) != 64 ||
-          DL.getPointerSizeInBits(NewAS) != 32)
-        return nullptr;
-
-      // TODO: Do we need to thread more context in here?
-      KnownBits Known = computeKnownBits(MaskOp, DL, nullptr, II);
-      if (Known.countMinLeadingOnes() < 32)
-        return nullptr;
-
-      DoTruncate = true;
-    }
-
-    IRBuilder<> B(II);
-    if (DoTruncate) {
-      MaskTy = B.getInt32Ty();
-      MaskOp = B.CreateTrunc(MaskOp, MaskTy);
-    }
-
-    return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
-                             {NewV, MaskOp});
-  }
   case Intrinsic::amdgcn_flat_atomic_fmax_num:
   case Intrinsic::amdgcn_flat_atomic_fmin_num: {
     Type *DestTy = II->getType();
diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
index 3ad87545953ff..f44283c1591b2 100644
--- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -206,6 +206,12 @@ class InferAddressSpacesImpl {
   bool isSafeToCastConstAddrSpace(Constant *C, unsigned NewAS) const;
 
+  Value *clonePtrMaskWithNewAddressSpace(
+      IntrinsicInst *I, unsigned NewAddrSpace,
+      const ValueToValueMapTy &ValueWithNewAddrSpace,
+      const PredicatedAddrSpaceMapTy &PredicatedAS,
+      SmallVectorImpl<Instruction *> *PoisonUsesToFix) const;
+
   Value *cloneInstructionWithNewAddressSpace(
       Instruction *I, unsigned NewAddrSpace,
       const ValueToValueMapTy &ValueWithNewAddrSpace,
@@ -651,6 +657,66 @@ static Value *operandWithNewAddressSpaceOrCreatePoison(
   return PoisonValue::get(NewPtrTy);
 }
 
+// A helper function for cloneInstructionWithNewAddressSpace. Handles the
+// conversion of a ptrmask intrinsic instruction.
+Value *InferAddressSpacesImpl::clonePtrMaskWithNewAddressSpace(
+    IntrinsicInst *I, unsigned NewAddrSpace,
+    const ValueToValueMapTy &ValueWithNewAddrSpace,
+    const PredicatedAddrSpaceMapTy &PredicatedAS,
+    SmallVectorImpl<Instruction *> *PoisonUsesToFix) const {
+  const Use &PtrOpUse = I->getArgOperandUse(0);
+  unsigned OldAddrSpace = PtrOpUse->getType()->getPointerAddressSpace();
+  Value *MaskOp = I->getArgOperand(1);
+  Type *MaskTy = MaskOp->getType();
+
+  KnownBits OldPtrBits{DL->getPointerSizeInBits(OldAddrSpace)};
+  KnownBits NewPtrBits{DL->getPointerSizeInBits(NewAddrSpace)};
+  if (!TTI->isNoopAddrSpaceCast(OldAddrSpace, NewAddrSpace)) {
+    std::tie(OldPtrBits, NewPtrBits) =
+        TTI->computeKnownBitsAddrSpaceCast(NewAddrSpace, *PtrOpUse.get());
+  }
+
+  // If the pointers in both addrspaces have a bitwise representation and if the
+  // representation of the new pointer is smaller (fewer bits) than the old one,
+  // check if the mask is applicable to the ptr in the new addrspace. Any
+  // masking only clearing the low bits will also apply in the new addrspace.
+  // Note: checking if the mask clears high bits is not sufficient as those
+  // might have already been 0 in the old ptr.
+  if (OldPtrBits.getBitWidth() > NewPtrBits.getBitWidth()) {
+    KnownBits MaskBits =
+        computeKnownBits(MaskOp, *DL, /*AssumptionCache=*/nullptr, I);
+    // Set all unknown bits of the old ptr to 1, so that we are conservative in
+    // checking which bits are cleared by the mask.
+    OldPtrBits.One |= ~OldPtrBits.Zero;
+    // Check which bits are cleared by the mask in the old ptr.
+    KnownBits ClearedBits = KnownBits::sub(OldPtrBits, OldPtrBits & MaskBits);
+
+    // If the mask isn't applicable to the new ptr, leave the ptrmask as-is and
+    // insert an addrspacecast after it.
+    if (ClearedBits.countMaxActiveBits() > NewPtrBits.countMaxActiveBits()) {
+      std::optional<BasicBlock::iterator> InsertPoint =
+          I->getInsertionPointAfterDef();
+      assert(InsertPoint && "insertion after ptrmask should be possible");
+      Type *NewPtrType = getPtrOrVecOfPtrsWithNewAS(I->getType(), NewAddrSpace);
+      Instruction *AddrSpaceCast =
+          new AddrSpaceCastInst(I, NewPtrType, "", *InsertPoint);
+      AddrSpaceCast->setDebugLoc(I->getDebugLoc());
+      return AddrSpaceCast;
+    }
+  }
+
+  IRBuilder<> B(I);
+  if (NewPtrBits.getBitWidth() < MaskTy->getScalarSizeInBits()) {
+    MaskTy = MaskTy->getWithNewBitWidth(NewPtrBits.getBitWidth());
+    MaskOp = B.CreateTrunc(MaskOp, MaskTy);
+  }
+  Value *NewPtr = operandWithNewAddressSpaceOrCreatePoison(
+      PtrOpUse, NewAddrSpace, ValueWithNewAddrSpace, PredicatedAS,
+      PoisonUsesToFix);
+  return B.CreateIntrinsic(Intrinsic::ptrmask, {NewPtr->getType(), MaskTy},
+                           {NewPtr, MaskOp});
+}
+
 // Returns a clone of `I` with its operands converted to those specified in
 // ValueWithNewAddrSpace. Due to potential cycles in the data flow graph, an
 // operand whose address space needs to be modified might not exist in
 // Note that we do not necessarily clone `I`, e.g., if it is an addrspacecast
 // from a pointer whose type already matches. Therefore, this function returns a
 // Value* instead of an Instruction*.
-//
-// This may also return nullptr in the case the instruction could not be
-// rewritten.
 Value *InferAddressSpacesImpl::cloneInstructionWithNewAddressSpace(
     Instruction *I, unsigned NewAddrSpace,
     const ValueToValueMapTy &ValueWithNewAddrSpace,
@@ -683,17 +746,8 @@ Value *InferAddressSpacesImpl::cloneInstructionWithNewAddressSpace(
     // Technically the intrinsic ID is a pointer typed argument, so specially
     // handle calls early.
     assert(II->getIntrinsicID() == Intrinsic::ptrmask);
-    Value *NewPtr = operandWithNewAddressSpaceOrCreatePoison(
-        II->getArgOperandUse(0), NewAddrSpace, ValueWithNewAddrSpace,
-        PredicatedAS, PoisonUsesToFix);
-    Value *Rewrite =
-        TTI->rewriteIntrinsicWithAddressSpace(II, II->getArgOperand(0), NewPtr);
-    if (Rewrite) {
-      assert(Rewrite != II && "cannot modify this pointer operation in place");
-      return Rewrite;
-    }
-
-    return nullptr;
+    return clonePtrMaskWithNewAddressSpace(
+        II, NewAddrSpace, ValueWithNewAddrSpace, PredicatedAS, PoisonUsesToFix);
   }
 
   unsigned AS = TTI->getAssumedAddrSpace(I);
@@ -1331,7 +1385,10 @@ bool InferAddressSpacesImpl::rewriteWithNewAddressSpaces(
       unsigned OperandNo = PoisonUse->getOperandNo();
       assert(isa<PoisonValue>(NewV->getOperand(OperandNo)));
-      NewV->setOperand(OperandNo, ValueWithNewAddrSpace.lookup(PoisonUse->get()));
+      WeakTrackingVH NewOp = ValueWithNewAddrSpace.lookup(PoisonUse->get());
+      assert(NewOp &&
+             "poison replacements in ValueWithNewAddrSpace shouldn't be null");
+      NewV->setOperand(OperandNo, NewOp);
     }
 
     SmallVector<Instruction *, 16> DeadInstructions;
diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/ptrmask.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/ptrmask.ll
index 6ef926f935830..6c911339ab760 100644
--- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/ptrmask.ll
+++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/ptrmask.ll
@@ -3,9 +3,10 @@
 define i8 @ptrmask_cast_local_to_flat(ptr addrspace(3) %src.ptr, i64 %mask) {
 ; CHECK-LABEL: @ptrmask_cast_local_to_flat(
-; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast ptr addrspace(3) [[SRC_PTR:%.*]] to ptr
+; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast ptr addrspace(3) [[SRC_PTR1:%.*]] to ptr
 ; CHECK-NEXT:    [[MASKED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[CAST]], i64 [[MASK:%.*]])
-; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr [[MASKED]], align 1
+; CHECK-NEXT:    [[SRC_PTR:%.*]] = addrspacecast ptr [[MASKED]] to ptr addrspace(3)
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(3) [[SRC_PTR]], align 1
 ; CHECK-NEXT:    ret i8 [[LOAD]]
 ;
   %cast = addrspacecast ptr addrspace(3) %src.ptr to ptr
@@ -14,11 +15,49 @@ define i8 @ptrmask_cast_local_to_flat(ptr addrspace(3) %src.ptr, i64 %mask) {
   ret i8 %load
 }
 
+define <3 x ptr addrspace(3)> @ptrmask_vector_cast_local_to_flat(<3 x ptr addrspace(3)> %src.ptr, <3 x i64> %mask) {
+; CHECK-LABEL: @ptrmask_vector_cast_local_to_flat(
+; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast <3 x ptr addrspace(3)> [[SRC_PTR:%.*]] to <3 x ptr>
+; CHECK-NEXT:    [[MASKED:%.*]] = call <3 x ptr> @llvm.ptrmask.v3p0.v3i64(<3 x ptr> [[CAST]], <3 x i64> [[MASK:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast <3 x ptr> [[MASKED]] to <3 x ptr addrspace(3)>
+; CHECK-NEXT:    ret <3 x ptr addrspace(3)> [[TMP1]]
+;
+  %cast = addrspacecast <3 x ptr addrspace(3)> %src.ptr to <3 x ptr>
+  %masked = call <3 x ptr> @llvm.ptrmask.v3p0.v3i64(<3 x ptr> %cast, <3 x i64> %mask)
+  %cast2 = addrspacecast <3 x ptr> %masked to <3 x ptr addrspace(3)>
+  ret <3 x ptr addrspace(3)> %cast2
+}
+
+; Casting null does not necessarily result in null again.
+define i8 @ptrmask_cast_local_null_to_flat(i64 %mask) {
+; CHECK-LABEL: @ptrmask_cast_local_null_to_flat(
+; CHECK-NEXT:    [[MASKED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr addrspacecast (ptr addrspace(3) null to ptr), i64 [[MASK:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[MASKED]] to ptr addrspace(3)
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(3) [[TMP1]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %masked = call ptr @llvm.ptrmask.p0.i64(ptr addrspacecast (ptr addrspace(3) null to ptr), i64 %mask)
+  %load = load i8, ptr %masked
+  ret i8 %load
+}
+
+define <3 x ptr addrspace(3)> @ptrmask_vector_cast_local_null_to_flat(<3 x i64> %mask) {
+; CHECK-LABEL: @ptrmask_vector_cast_local_null_to_flat(
+; CHECK-NEXT:    [[MASKED:%.*]] = call <3 x ptr> @llvm.ptrmask.v3p0.v3i64(<3 x ptr> <ptr addrspacecast (ptr addrspace(3) null to ptr), ptr addrspacecast (ptr addrspace(3) null to ptr), ptr addrspacecast (ptr addrspace(3) null to ptr)>, <3 x i64> [[MASK:%.*]])
+; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast <3 x ptr> [[MASKED]] to <3 x ptr addrspace(3)>
+; CHECK-NEXT:    ret <3 x ptr addrspace(3)> [[CAST]]
+;
+  %masked = call <3 x ptr> @llvm.ptrmask.v3p0.v3i64(<3 x ptr> addrspacecast (<3 x ptr addrspace(3)> <ptr addrspace(3) null, ptr addrspace(3) null, ptr addrspace(3) null> to <3 x ptr>), <3 x i64> %mask)
+  %cast = addrspacecast <3 x ptr> %masked to <3 x ptr addrspace(3)>
+  ret <3 x ptr addrspace(3)> %cast
+}
+
 define i8 @ptrmask_cast_private_to_flat(ptr addrspace(5) %src.ptr, i64 %mask) {
 ; CHECK-LABEL: @ptrmask_cast_private_to_flat(
-; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_PTR:%.*]] to ptr
+; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_PTR1:%.*]] to ptr
 ; CHECK-NEXT:    [[MASKED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[CAST]], i64 [[MASK:%.*]])
-; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr [[MASKED]], align 1
+; CHECK-NEXT:    [[SRC_PTR:%.*]] = addrspacecast ptr [[MASKED]] to ptr addrspace(5)
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(5) [[SRC_PTR]], align 1
 ; CHECK-NEXT:    ret i8 [[LOAD]]
 ;
   %cast = addrspacecast ptr addrspace(5) %src.ptr to ptr
@@ -29,9 +68,10 @@ define i8 @ptrmask_cast_private_to_flat(ptr addrspace(5) %src.ptr, i64 %mask) {
 define i8 @ptrmask_cast_region_to_flat(ptr addrspace(2) %src.ptr, i64 %mask) {
 ; CHECK-LABEL: @ptrmask_cast_region_to_flat(
-; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast ptr addrspace(2) [[SRC_PTR:%.*]] to ptr
+; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast ptr addrspace(2) [[SRC_PTR1:%.*]] to ptr
 ; CHECK-NEXT:    [[MASKED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[CAST]], i64 [[MASK:%.*]])
-; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr [[MASKED]], align 1
+; CHECK-NEXT:    [[SRC_PTR:%.*]] = addrspacecast ptr [[MASKED]] to ptr addrspace(2)
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(2) [[SRC_PTR]], align 1
 ; CHECK-NEXT:    ret i8 [[LOAD]]
 ;
   %cast = addrspacecast ptr addrspace(2) %src.ptr to ptr
@@ -77,6 +117,63 @@ define i8 @ptrmask_cast_flat_to_local(ptr %ptr, i64 %mask) {
   ret i8 %load
 }
 
+define <3 x ptr addrspace(3)> @ptrmask_vector_cast_flat_to_local(<3 x ptr> %src.ptr, <3 x i64> %mask) {
+; CHECK-LABEL: @ptrmask_vector_cast_flat_to_local(
+; CHECK-NEXT:    [[SRC_PTR:%.*]] = call <3 x ptr> @llvm.ptrmask.v3p0.v3i64(<3 x ptr> [[SRC_PTR1:%.*]], <3 x i64> [[MASK:%.*]])
+; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast <3 x ptr> [[SRC_PTR]] to <3 x ptr addrspace(3)>
+; CHECK-NEXT:    ret <3 x ptr addrspace(3)> [[CAST]]
+;
+  %masked = call <3 x ptr> @llvm.ptrmask.v3p0.v3i64(<3 x ptr> %src.ptr, <3 x i64> %mask)
+  %cast = addrspacecast <3 x ptr> %masked to <3 x ptr addrspace(3)>
+  ret <3 x ptr addrspace(3)> %cast
+}
+
+; Casting null *does* result in null again if addrspace 0 is cast to a
+; smaller addrspace (by default we assume that casting to a smaller addrspace =
+; truncating).
+define i8 @ptrmask_cast_flat_null_to_local(i64 %mask) {
+; CHECK-LABEL: @ptrmask_cast_flat_null_to_local(
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %masked = call ptr @llvm.ptrmask.p0.i64(ptr null, i64 %mask)
+  %cast = addrspacecast ptr %masked to ptr addrspace(3)
+  %load = load i8, ptr addrspace(3) %cast
+  ret i8 %load
+}
+
+define i8 @ptrmask_vector_cast_flat_null_to_local(<3 x i64> %mask, i32 %ptridx, i32 %idx) {
+; CHECK-LABEL: @ptrmask_vector_cast_flat_null_to_local(
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), i32 [[IDX:%.*]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(3) [[GEP]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %masked = call <3 x ptr> @llvm.ptrmask.v3p0.v3i64(<3 x ptr> <ptr null, ptr null, ptr null>, <3 x i64> %mask)
+  %cast = addrspacecast <3 x ptr> %masked to <3 x ptr addrspace(3)>
+  %ptr = extractelement <3 x ptr addrspace(3)> %cast, i32 %ptridx
+  %gep = getelementptr i8, ptr addrspace(3) %ptr, i32 %idx
+  %load = load i8, ptr addrspace(3) %gep
+  ret i8 %load
+}
+
+define i8 @ptrmask_vector_cast_flat_null_with_poison_to_local(<3 x i64> %mask, i32 %ptridx, i32 %idx) {
+; CHECK-LABEL: @ptrmask_vector_cast_flat_null_with_poison_to_local(
+; CHECK-NEXT:    [[MASKED:%.*]] = call <3 x ptr> @llvm.ptrmask.v3p0.v3i64(<3 x ptr> <ptr null, ptr poison, ptr null>, <3 x i64> [[MASK:%.*]])
+; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast <3 x ptr> [[MASKED]] to <3 x ptr addrspace(3)>
+; CHECK-NEXT:    [[PTR:%.*]] = extractelement <3 x ptr addrspace(3)> [[CAST]], i32 [[PTRIDX:%.*]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr addrspace(3) [[PTR]], i32 [[IDX:%.*]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(3) [[GEP]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %masked = call <3 x ptr> @llvm.ptrmask.v3p0.v3i64(<3 x ptr> <ptr null, ptr poison, ptr null>, <3 x i64> %mask)
+  %cast = addrspacecast <3 x ptr> %masked to <3 x ptr addrspace(3)>
+  %ptr = extractelement <3 x ptr addrspace(3)> %cast, i32 %ptridx
+  %gep = getelementptr i8, ptr addrspace(3) %ptr, i32 %idx
+  %load = load i8, ptr addrspace(3) %gep
+  ret i8 %load
+}
+
+
 define i8 @ptrmask_cast_flat_to_private(ptr %ptr, i64 %mask) {
 ; CHECK-LABEL: @ptrmask_cast_flat_to_private(
 ; CHECK-NEXT:    [[MASKED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 [[MASK:%.*]])
@@ -109,7 +206,8 @@ define i8 @ptrmask_cast_flat_to_global(ptr %ptr, i64 %mask) {
 define i8 @ptrmask_cast_local_to_flat_global(i64 %mask) {
 ; CHECK-LABEL: @ptrmask_cast_local_to_flat_global(
 ; CHECK-NEXT:    [[MASKED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr addrspacecast (ptr addrspace(3) @lds0 to ptr), i64 [[MASK:%.*]])
-; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr [[MASKED]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[MASKED]] to ptr addrspace(3)
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(3) [[TMP1]], align 1
 ; CHECK-NEXT:    ret i8 [[LOAD]]
 ;
   %masked = call ptr @llvm.ptrmask.p0.i64(ptr addrspacecast (ptr addrspace(3) @lds0 to ptr), i64 %mask)
@@ -150,7 +248,8 @@ define i8 @multi_ptrmask_cast_local_to_flat(ptr addrspace(3) %src.ptr, i64 %mask
 ; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast ptr addrspace(3) [[SRC_PTR:%.*]] to ptr
 ; CHECK-NEXT:    [[LOAD0:%.*]] = load i8, ptr addrspace(3) [[SRC_PTR]], align 1
 ; CHECK-NEXT:    [[MASKED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[CAST]], i64 [[MASK:%.*]])
-; CHECK-NEXT:    [[LOAD1:%.*]] = load i8, ptr [[MASKED]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[MASKED]] to ptr addrspace(3)
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i8, ptr addrspace(3) [[TMP1]], align 1
 ; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[LOAD0]], [[LOAD1]]
 ; CHECK-NEXT:    ret i8 [[ADD]]
 ;
@@ -167,7 +266,8 @@ define i8 @multi_ptrmask_cast_region_to_flat(ptr addrspace(2) %src.ptr, i64 %mas
 ; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast ptr addrspace(2) [[SRC_PTR:%.*]] to ptr
 ; CHECK-NEXT:    [[LOAD0:%.*]] = load i8, ptr addrspace(2) [[SRC_PTR]], align 1
 ; CHECK-NEXT:    [[MASKED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[CAST]], i64 [[MASK:%.*]])
-; CHECK-NEXT:    [[LOAD1:%.*]] = load i8, ptr [[MASKED]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[MASKED]] to ptr addrspace(2)
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i8, ptr addrspace(2) [[TMP1]], align 1
 ; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[LOAD0]], [[LOAD1]]
 ; CHECK-NEXT:    ret i8 [[ADD]]
 ;
@@ -182,9 +282,10 @@ define i8 @multi_ptrmask_cast_region_to_flat(ptr addrspace(2) %src.ptr, i64 %mas
 ; Do not fold this since it clears a single high bit.
 define i8 @ptrmask_cast_local_to_flat_const_mask_fffffffeffffffff(ptr addrspace(3) %src.ptr) {
 ; CHECK-LABEL: @ptrmask_cast_local_to_flat_const_mask_fffffffeffffffff(
-; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast ptr addrspace(3) [[SRC_PTR:%.*]] to ptr
+; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast ptr addrspace(3) [[SRC_PTR1:%.*]] to ptr
 ; CHECK-NEXT:    [[MASKED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[CAST]], i64 -4294967297)
-; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr [[MASKED]], align 1
+; CHECK-NEXT:    [[SRC_PTR:%.*]] = addrspacecast ptr [[MASKED]] to ptr addrspace(3)
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(3) [[SRC_PTR]], align 1
 ; CHECK-NEXT:    ret i8 [[LOAD]]
 ;
   %cast = addrspacecast ptr addrspace(3) %src.ptr to ptr
@@ -193,12 +294,26 @@ define i8 @ptrmask_cast_local_to_flat_const_mask_fffffffeffffffff(ptr addrspace(
   ret i8 %load
 }
 
+define <3 x ptr addrspace(3)> @ptrmask_vector_cast_local_to_flat_const_mask_fffffffeffffffff(<3 x ptr addrspace(3)> %src.ptr) {
+; CHECK-LABEL: @ptrmask_vector_cast_local_to_flat_const_mask_fffffffeffffffff(
+; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast <3 x ptr addrspace(3)> [[SRC_PTR:%.*]] to <3 x ptr>
+; CHECK-NEXT:    [[MASKED:%.*]] = call <3 x ptr> @llvm.ptrmask.v3p0.v3i64(<3 x ptr> [[CAST]], <3 x i64> splat (i64 -4294967297))
+; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast <3 x ptr> [[MASKED]] to <3 x ptr addrspace(3)>
+; CHECK-NEXT:    ret <3 x ptr addrspace(3)> [[TMP1]]
+;
+  %cast = addrspacecast <3 x ptr addrspace(3)> %src.ptr to <3 x ptr>
+  %masked = call <3 x ptr> @llvm.ptrmask.v3p0.v3i64(<3 x ptr> %cast, <3 x i64> <i64 -4294967297, i64 -4294967297, i64 -4294967297>)
+  %cast2 = addrspacecast <3 x ptr> %masked to <3 x ptr addrspace(3)>
+  ret <3 x ptr addrspace(3)> %cast2
+}
+
 ; Do not fold this since it clears a single high bit.
 define i8 @ptrmask_cast_local_to_flat_const_mask_7fffffffffffffff(ptr addrspace(3) %src.ptr) {
 ; CHECK-LABEL: @ptrmask_cast_local_to_flat_const_mask_7fffffffffffffff(
-; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast ptr addrspace(3) [[SRC_PTR:%.*]] to ptr
+; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast ptr addrspace(3) [[SRC_PTR1:%.*]] to ptr
 ; CHECK-NEXT:    [[MASKED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[CAST]], i64 9223372036854775807)
-; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr [[MASKED]], align 1
+; CHECK-NEXT:    [[SRC_PTR:%.*]] = addrspacecast ptr [[MASKED]] to ptr addrspace(3)
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(3) [[SRC_PTR]], align 1
 ; CHECK-NEXT:    ret i8 [[LOAD]]
 ;
   %cast = addrspacecast ptr addrspace(3) %src.ptr to ptr
@@ -207,6 +322,20 @@ define i8 @ptrmask_cast_local_to_flat_const_mask_7fffffffffffffff(ptr addrspace(
   ret i8 %load
 }
 
+; Do not fold: casting null does not necessarily result in null again
+define i8 @ptrmask_cast_local_null_to_flat_const_mask_7fffffffffffffff() {
+; CHECK-LABEL: @ptrmask_cast_local_null_to_flat_const_mask_7fffffffffffffff(
+; CHECK-NEXT:    [[MASKED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr addrspacecast (ptr addrspace(3) null to ptr), i64 9223372036854775807)
+; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[MASKED]] to ptr addrspace(3)
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(3) [[TMP1]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %cast = addrspacecast ptr addrspace(3) null to ptr
+  %masked = call ptr @llvm.ptrmask.p0.i64(ptr %cast, i64 9223372036854775807)
+  %load = load i8, ptr %masked
+  ret i8 %load
+}
+
 define i8 @ptrmask_cast_local_to_flat_const_mask_ffffffff00000000(ptr addrspace(3) %src.ptr) {
 ; CHECK-LABEL: @ptrmask_cast_local_to_flat_const_mask_ffffffff00000000(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[SRC_PTR:%.*]], i32 0)
@@ -219,6 +348,28 @@ define i8 @ptrmask_cast_local_to_flat_const_mask_ffffffff00000000(ptr addrspace(
   ret i8 %load
 }
 
+define <3 x ptr addrspace(3)> @ptrmask_vector_cast_local_to_flat_const_mask_ffffffff00000000(<3 x ptr addrspace(3)> %src.ptr) {
+; CHECK-LABEL: @ptrmask_vector_cast_local_to_flat_const_mask_ffffffff00000000(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <3 x ptr addrspace(3)> @llvm.ptrmask.v3p3.v3i32(<3 x ptr addrspace(3)> [[SRC_PTR:%.*]], <3 x i32> zeroinitializer)
+; CHECK-NEXT:    ret <3 x ptr addrspace(3)> [[TMP1]]
+;
+  %cast = addrspacecast <3 x ptr addrspace(3)> %src.ptr to <3 x ptr>
+  %masked = call <3 x ptr> @llvm.ptrmask.v3p0.v3i64(<3 x ptr> %cast, <3 x i64> <i64 -4294967296, i64 -4294967296, i64 -4294967296>)
+  %cast2 = addrspacecast <3 x ptr> %masked to <3 x ptr addrspace(3)>
+  ret <3 x ptr addrspace(3)> %cast2
+}
+
+define i8 @ptrmask_cast_local_null_to_flat_const_mask_ffffffff00000000() {
+; CHECK-LABEL: @ptrmask_cast_local_null_to_flat_const_mask_ffffffff00000000(
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(3) null, align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %cast = addrspacecast ptr addrspace(3) null to ptr
+  %masked = call ptr @llvm.ptrmask.p0.i64(ptr %cast, i64 -4294967296)
+  %load = load i8, ptr %masked
+  ret i8 %load
+}
+
 define i8 @ptrmask_cast_local_to_flat_const_mask_ffffffff80000000(ptr addrspace(3) %src.ptr) {
 ; CHECK-LABEL: @ptrmask_cast_local_to_flat_const_mask_ffffffff80000000(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[SRC_PTR:%.*]], i32 -2147483648)
@@ -244,6 +395,17 @@ define i8 @ptrmask_cast_local_to_flat_const_mask_ffffffffffff0000(ptr addrspace(
   ret i8 %load
 }
 
+define <3 x ptr addrspace(3)> @ptrmask_vector_cast_local_to_flat_const_mask_ffffffffffff0000(<3 x ptr addrspace(3)> %src.ptr) {
+; CHECK-LABEL: @ptrmask_vector_cast_local_to_flat_const_mask_ffffffffffff0000(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <3 x ptr addrspace(3)> @llvm.ptrmask.v3p3.v3i32(<3 x ptr addrspace(3)> [[SRC_PTR:%.*]], <3 x i32> splat (i32 -65536))
+; CHECK-NEXT:    ret <3 x ptr addrspace(3)> [[TMP1]]
+;
+  %cast = addrspacecast <3 x ptr addrspace(3)> %src.ptr to <3 x ptr>
+  %masked = call <3 x ptr> @llvm.ptrmask.v3p0.v3i64(<3 x ptr> %cast, <3 x i64> <i64 -65536, i64 -65536, i64 -65536>)
+  %cast2 = addrspacecast <3 x ptr> %masked to <3 x ptr addrspace(3)>
+  ret <3 x ptr addrspace(3)> %cast2
+}
+
 define i8 @ptrmask_cast_local_to_flat_const_mask_ffffffffffffff00(ptr addrspace(3) %src.ptr) {
 ; CHECK-LABEL: @ptrmask_cast_local_to_flat_const_mask_ffffffffffffff00(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[SRC_PTR:%.*]], i32 -256)
@@ -343,7 +505,72 @@ define i8 @ptrmask_cast_local_to_flat_load_range_mask(ptr addrspace(3) %src.ptr,
   ret i8 %load
 }
 
+define <2 x ptr addrspace(3)> @ptrmask_vector_cast_local_to_flat_load_range_mask(<2 x ptr addrspace(3)> %src.ptr, ptr addrspace(1) %mask.ptr) {
+; CHECK-LABEL: @ptrmask_vector_cast_local_to_flat_load_range_mask(
+; CHECK-NEXT:    [[LOAD_MASK:%.*]] = load <2 x i64>, ptr addrspace(1) [[MASK_PTR:%.*]], align 16, !range [[RNG0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <2 x i64> [[LOAD_MASK]] to <2 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x ptr addrspace(3)> @llvm.ptrmask.v2p3.v2i32(<2 x ptr addrspace(3)> [[SRC_PTR:%.*]], <2 x i32> [[TMP1]])
+; CHECK-NEXT:    ret <2 x ptr addrspace(3)> [[TMP2]]
+;
+  %load.mask = load <2 x i64>, ptr addrspace(1) %mask.ptr, align 16, !range !0
+  %cast = addrspacecast <2 x ptr addrspace(3)> %src.ptr to <2 x ptr>
+  %masked = call <2 x ptr> @llvm.ptrmask.v2p0.v2i64(<2 x ptr> %cast, <2 x i64> %load.mask)
+  %cast2 = addrspacecast <2 x ptr> %masked to <2 x ptr addrspace(3)>
+  ret <2 x ptr addrspace(3)> %cast2
+}
+
+; Non-const masks with no known range should not prevent other ptr-manipulating
+; instructions (such as gep) from being converted.
+define i8 @ptrmask_cast_local_to_flat_unknown_mask(ptr addrspace(3) %src.ptr, i64 %mask, i64 %idx) {
+; CHECK-LABEL: @ptrmask_cast_local_to_flat_unknown_mask(
+; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast ptr addrspace(3) [[SRC_PTR:%.*]] to ptr
+; CHECK-NEXT:    [[MASKED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[CAST]], i64 [[MASK:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[MASKED]] to ptr addrspace(3)
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP1]], i64 [[IDX:%.*]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(3) [[GEP]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %cast = addrspacecast ptr addrspace(3) %src.ptr to ptr
+  %masked = call ptr @llvm.ptrmask.p0.i64(ptr %cast, i64 %mask)
+  %gep = getelementptr i8, ptr %masked, i64 %idx
+  %load = load i8, ptr %gep
+  ret i8 %load
+}
+
+define <2 x ptr addrspace(3)> @ptrmask_vector_cast_local_to_flat_unknown_mask(<2 x ptr addrspace(3)> %src.ptr, <2 x i64> %mask) {
+; CHECK-LABEL: @ptrmask_vector_cast_local_to_flat_unknown_mask(
+; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast <2 x ptr addrspace(3)> [[SRC_PTR:%.*]] to <2 x ptr>
+; CHECK-NEXT:    [[MASKED:%.*]] = call <2 x ptr> @llvm.ptrmask.v2p0.v2i64(<2 x ptr> [[CAST]], <2 x i64> [[MASK:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast <2 x ptr> [[MASKED]] to <2 x ptr addrspace(3)>
+; CHECK-NEXT:    ret <2 x ptr addrspace(3)> [[TMP1]]
+;
+  %cast = addrspacecast <2 x ptr addrspace(3)> %src.ptr to <2 x ptr>
+  %masked = call <2 x ptr> @llvm.ptrmask.v2p0.v2i64(<2 x ptr> %cast, <2 x i64> %mask)
+  %cast2 = addrspacecast <2 x ptr> %masked to <2 x ptr addrspace(3)>
+  ret <2 x ptr addrspace(3)> %cast2
+}
+
+define i8 @interleaved_ptrmask_cast_local_to_flat_unknown_mask(ptr addrspace(3) %src.ptr, i64 %mask, i64 %idx) {
+; CHECK-LABEL: @interleaved_ptrmask_cast_local_to_flat_unknown_mask(
+; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast ptr addrspace(3) [[GEP1:%.*]] to ptr
+; CHECK-NEXT:    store i64 [[MASK:%.*]], ptr addrspace(3) [[GEP1]], align 8
+; CHECK-NEXT:    [[MASKED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[CAST]], i64 [[MASK]])
+; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[MASKED]] to ptr addrspace(3)
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP1]], i64 [[IDX:%.*]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(3) [[GEP]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %cast = addrspacecast ptr addrspace(3) %src.ptr to ptr
+  store i64 %mask, ptr %cast
+  %masked = call ptr @llvm.ptrmask.p0.i64(ptr %cast, i64 %mask)
+  %gep = getelementptr i8, ptr %masked, i64 %idx
+  %load = load i8, ptr %gep
+  ret i8 %load
+}
+
 declare ptr @llvm.ptrmask.p0.i64(ptr, i64) #0
+declare <2 x ptr> @llvm.ptrmask.v2p0.v2i64(<2 x ptr>, <2 x i64>) #0
+declare <3 x ptr> @llvm.ptrmask.v3p0.v3i64(<3 x ptr>, <3 x i64>) #0
 declare ptr addrspace(5) @llvm.ptrmask.p5.i32(ptr addrspace(5), i32) #0
 declare ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3), i32) #0
 declare ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1), i64) #0
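
Reviewer note, illustrative only and not part of the patch: the default hook added in TargetTransformInfoImplBase treats a wider-to-narrower cast as a truncation and deliberately does not assume that casting null yields null. A target that does guarantee null-to-null for casts out of addrspace 0 could override the KnownBits-based hook roughly as sketched below. The class name MyTargetTTIImpl and the null-preserving property are assumptions made for this example, not names or behavior taken from the patch.

// Hypothetical target override (assumed class name MyTargetTTIImpl); sketch only.
KnownBits MyTargetTTIImpl::computeKnownBitsAddrSpaceCast(
    unsigned FromAS, unsigned ToAS, const KnownBits &FromPtrBits) const {
  unsigned ToBits = DL.getPointerSizeInBits(ToAS);

  // No usable bit-level facts when the source pointer has no stable bitwise
  // representation.
  if (DL.isNonIntegralAddressSpace(FromAS))
    return KnownBits(ToBits);

  // Assumed target property: a generic (addrspace 0) null pointer converts to
  // a null pointer in every other address space. The default hook would leave
  // widened high bits unknown; this target can keep the whole result known
  // zero.
  if (FromAS == 0 && FromPtrBits.isZero())
    return KnownBits::makeConstant(APInt::getZero(ToBits));

  // Otherwise keep the default behaviour: truncate when narrowing, and leave
  // the extra high bits unknown when widening.
  return FromPtrBits.anyextOrTrunc(ToBits);
}

The design point behind the default is that null preservation is a target property, which is why it lives in a virtual TTI hook rather than being hard-coded in InferAddressSpaces.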