diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 4480ced637456..3a51b9e6917c3 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3229,9 +3229,11 @@ class LLVM_ABI TargetLoweringBase {
   /// result is unconditional.
   /// \p SVI is the shufflevector to RE-interleave the stored vector.
   /// \p Factor is the interleave factor.
+  /// \p GapMask is a mask with zeros for components / fields that may not be
+  /// accessed.
   virtual bool lowerInterleavedStore(Instruction *Store, Value *Mask,
-                                     ShuffleVectorInst *SVI,
-                                     unsigned Factor) const {
+                                     ShuffleVectorInst *SVI, unsigned Factor,
+                                     const APInt &GapMask) const {
     return false;
   }
 
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 93f6e39b56ab6..c5e97037be336 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -537,28 +537,26 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
          "number of stored element should be a multiple of Factor");
 
   Value *Mask = nullptr;
+  auto GapMask = APInt::getAllOnes(Factor);
   if (SI) {
     LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *Store << "\n");
   } else {
     // Check mask operand. Handle both all-true/false and interleaved mask.
     unsigned LaneMaskLen = NumStoredElements / Factor;
-    APInt GapMask(Factor, 0);
     std::tie(Mask, GapMask) = getMask(getMaskOperand(II), Factor,
                                       ElementCount::getFixed(LaneMaskLen));
     if (!Mask)
       return false;
-    // We haven't supported gap mask for stores. Yet it is possible that we
-    // already changed the IR, hence returning true here.
-    if (GapMask.popcount() != Factor)
-      return true;
 
     LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store or masked.store: "
                       << *Store << "\n");
+    LLVM_DEBUG(dbgs() << "IA: With nominal factor " << Factor
+                      << " and actual factor " << GapMask.popcount() << "\n");
   }
 
   // Try to create target specific intrinsics to replace the store and
   // shuffle.
-  if (!TLI->lowerInterleavedStore(Store, Mask, SVI, Factor))
+  if (!TLI->lowerInterleavedStore(Store, Mask, SVI, Factor, GapMask))
     return false;
 
   // Already have a new target specific interleaved store. Erase the old store.
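Note (editor's sketch, not part of the patch): the GapMask threaded through here carries one bit per interleave field, derived by getMask() from the store's mask operand. A minimal standalone illustration of the idea, with a hypothetical helper name and a plain std::vector<bool> standing in for the IR mask constant; the real getMask() additionally validates that the surviving lanes form a consistent per-segment mask:

```cpp
#include "llvm/ADT/APInt.h"
#include <vector>

// Bit F of the result is set iff some row of the interleaved store
// actually writes field F; a clear bit marks a "gap" field that the
// lowered store may skip entirely.
llvm::APInt computeGapMask(const std::vector<bool> &MaskBits, unsigned Factor) {
  llvm::APInt GapMask(Factor, 0);
  const unsigned Rows = MaskBits.size() / Factor;
  for (unsigned F = 0; F != Factor; ++F)
    for (unsigned R = 0; R != Rows; ++R)
      if (MaskBits[R * Factor + F]) {
        GapMask.setBit(F);
        break;
      }
  return GapMask;
}
```

With Factor = 3 and the mask pattern 110 110 110 110 used by the tests below, this yields 0b011: field 2 is a gap, which the RISC-V lowering can turn into a stride-12 two-field segment store.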
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d168cc8d1bd06..3bb0fdd841653 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17604,14 +17604,16 @@ bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
 bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
                                                   Value *LaneMask,
                                                   ShuffleVectorInst *SVI,
-                                                  unsigned Factor) const {
+                                                  unsigned Factor,
+                                                  const APInt &GapMask) const {
   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
          "Invalid interleave factor");
 
   auto *SI = dyn_cast<StoreInst>(Store);
   if (!SI)
     return false;
-  assert(!LaneMask && "Unexpected mask on store");
+  assert(!LaneMask && GapMask.popcount() == Factor &&
+         "Unexpected mask on store");
 
   auto *VecTy = cast<FixedVectorType>(SVI->getType());
   assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 071e96e194286..0347fbf21f279 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -233,8 +233,8 @@ class AArch64TargetLowering : public TargetLowering {
                             ArrayRef<unsigned> Indices, unsigned Factor,
                             const APInt &GapMask) const override;
   bool lowerInterleavedStore(Instruction *Store, Value *Mask,
-                             ShuffleVectorInst *SVI,
-                             unsigned Factor) const override;
+                             ShuffleVectorInst *SVI, unsigned Factor,
+                             const APInt &GapMask) const override;
 
   bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
                                         IntrinsicInst *DI) const override;
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 830156359e9e8..57352d9567c13 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21720,13 +21720,15 @@ bool ARMTargetLowering::lowerInterleavedLoad(
 bool ARMTargetLowering::lowerInterleavedStore(Instruction *Store,
                                               Value *LaneMask,
                                               ShuffleVectorInst *SVI,
-                                              unsigned Factor) const {
+                                              unsigned Factor,
+                                              const APInt &GapMask) const {
   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
          "Invalid interleave factor");
   auto *SI = dyn_cast<StoreInst>(Store);
   if (!SI)
     return false;
-  assert(!LaneMask && "Unexpected mask on store");
+  assert(!LaneMask && GapMask.popcount() == Factor &&
+         "Unexpected mask on store");
 
   auto *VecTy = cast<FixedVectorType>(SVI->getType());
   assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 778595e93f84a..d87cb4856c2af 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -688,8 +688,8 @@ class VectorType;
                                ArrayRef<unsigned> Indices, unsigned Factor,
                                const APInt &GapMask) const override;
     bool lowerInterleavedStore(Instruction *Store, Value *Mask,
-                               ShuffleVectorInst *SVI,
-                               unsigned Factor) const override;
+                               ShuffleVectorInst *SVI, unsigned Factor,
+                               const APInt &GapMask) const override;
 
     bool shouldInsertFencesForAtomic(const Instruction *I) const override;
     TargetLoweringBase::AtomicExpansionKind
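Note: the AArch64, ARM, and X86 hooks do not support gaps, so they now assert a full gap mask (popcount() == Factor) and otherwise behave as before; only the RISC-V hook below consumes partial masks. A quick standalone illustration of the two APInt predicates involved:

```cpp
#include "llvm/ADT/APInt.h"
#include <cstdio>

int main() {
  unsigned Factor = 4;
  llvm::APInt Full = llvm::APInt::getAllOnes(Factor); // 0b1111: no gaps
  llvm::APInt Trailing(Factor, 0b0011);               // fields 2 and 3 skipped
  llvm::APInt Middle(Factor, 0b1011);                 // only field 2 skipped
  // popcount() == Factor is what the non-RISC-V targets assert; isMask()
  // (contiguous ones starting at bit 0) is the "gaps are trailing only"
  // shape the RISC-V lowering requires.
  std::printf("full:     popcount=%u isMask=%d\n", Full.popcount(),
              (int)Full.isMask());
  std::printf("trailing: popcount=%u isMask=%d\n", Trailing.popcount(),
              (int)Trailing.isMask());
  std::printf("middle:   popcount=%u isMask=%d\n", Middle.popcount(),
              (int)Middle.isMask());
  return 0;
}
```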
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index fb63ebcfaacea..4581c11356aff 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -435,8 +435,8 @@ class RISCVTargetLowering : public TargetLowering {
                         const APInt &GapMask) const override;
   bool
   lowerInterleavedStore(Instruction *Store, Value *Mask,
-                        ShuffleVectorInst *SVI,
-                        unsigned Factor) const override;
+                        ShuffleVectorInst *SVI, unsigned Factor,
+                        const APInt &GapMask) const override;
 
   bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
                                         IntrinsicInst *DI) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
index c7b96f5c3d0c8..5e1063155ba07 100644
--- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -81,6 +81,12 @@ static const Intrinsic::ID FixedVssegIntrIds[] = {
     Intrinsic::riscv_seg6_store_mask, Intrinsic::riscv_seg7_store_mask,
     Intrinsic::riscv_seg8_store_mask};
 
+static const Intrinsic::ID FixedVsssegIntrIds[] = {
+    Intrinsic::riscv_sseg2_store_mask, Intrinsic::riscv_sseg3_store_mask,
+    Intrinsic::riscv_sseg4_store_mask, Intrinsic::riscv_sseg5_store_mask,
+    Intrinsic::riscv_sseg6_store_mask, Intrinsic::riscv_sseg7_store_mask,
+    Intrinsic::riscv_sseg8_store_mask};
+
 static const Intrinsic::ID ScalableVssegIntrIds[] = {
     Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask,
     Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask,
@@ -275,7 +281,16 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
 bool RISCVTargetLowering::lowerInterleavedStore(Instruction *Store,
                                                 Value *LaneMask,
                                                 ShuffleVectorInst *SVI,
-                                                unsigned Factor) const {
+                                                unsigned Factor,
+                                                const APInt &GapMask) const {
+  assert(GapMask.getBitWidth() == Factor);
+
+  // We only support cases where the skipped fields are the trailing ones.
+  // TODO: Lower to strided store if there is only a single active field.
+  unsigned MaskFactor = GapMask.popcount();
+  if (MaskFactor < 2 || !GapMask.isMask())
+    return false;
+
   IRBuilder<> Builder(Store);
   const DataLayout &DL = Store->getDataLayout();
   auto Mask = SVI->getShuffleMask();
@@ -287,21 +302,31 @@ bool RISCVTargetLowering::lowerInterleavedStore(Instruction *Store,
 
   Value *Ptr, *VL;
   Align Alignment;
-  if (!getMemOperands(Factor, VTy, XLenTy, Store, Ptr, LaneMask, VL, Alignment))
+  if (!getMemOperands(MaskFactor, VTy, XLenTy, Store, Ptr, LaneMask, VL,
+                      Alignment))
     return false;
 
   Type *PtrTy = Ptr->getType();
   unsigned AS = PtrTy->getPointerAddressSpace();
-  if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL))
+  if (!isLegalInterleavedAccessType(VTy, MaskFactor, Alignment, AS, DL))
     return false;
 
-  Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
-      Store->getModule(), FixedVssegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy});
+  Function *SegStoreFunc;
+  if (MaskFactor < Factor)
+    // Strided segmented store.
+    SegStoreFunc = Intrinsic::getOrInsertDeclaration(
+        Store->getModule(), FixedVsssegIntrIds[MaskFactor - 2],
+        {VTy, PtrTy, XLenTy, XLenTy});
+  else
+    // Normal segmented store.
+    SegStoreFunc = Intrinsic::getOrInsertDeclaration(
+        Store->getModule(), FixedVssegIntrIds[Factor - 2],
+        {VTy, PtrTy, XLenTy});
 
   SmallVector<Value *> Ops;
   SmallVector<int> NewShuffleMask;
 
-  for (unsigned i = 0; i < Factor; i++) {
+  for (unsigned i = 0; i < MaskFactor; i++) {
     // Collect shuffle mask for this lane.
     for (unsigned j = 0; j < VTy->getNumElements(); j++)
       NewShuffleMask.push_back(Mask[i + Factor * j]);
@@ -312,8 +337,14 @@ bool RISCVTargetLowering::lowerInterleavedStore(Instruction *Store,
     NewShuffleMask.clear();
   }
 
-  Ops.append({Ptr, LaneMask, VL});
-  Builder.CreateCall(VssegNFunc, Ops);
+  Ops.push_back(Ptr);
+  if (MaskFactor < Factor) {
+    // Insert the stride argument.
+    unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType());
+    Ops.push_back(ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes));
+  }
+  Ops.append({LaneMask, VL});
+  Builder.CreateCall(SegStoreFunc, Ops);
 
   return true;
 }
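The stride fed to the new strided intrinsics is computed from the nominal factor, not the reduced one: skipped trailing fields still occupy space in each in-memory segment, they are just never written. A self-checking sketch of that arithmetic (helper name is mine):

```cpp
#include <cstdint>

// Byte distance between consecutive segments of an interleaved store:
// the nominal interleave factor times the element store size. Trailing
// gap fields stay inside the segment; the stride simply jumps over them.
constexpr uint64_t segStrideBytes(unsigned NominalFactor,
                                  uint64_t ScalarSizeInBytes) {
  return NominalFactor * ScalarSizeInBytes;
}

static_assert(segStrideBytes(3, 4) == 12, "factor-3 i32 tests: li a1, 12");
static_assert(segStrideBytes(4, 4) == 16, "factor-4 i32 tests: li a1, 16");
```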
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 3dd79b3249517..24407a46d6495 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1669,8 +1669,8 @@ namespace llvm {
     /// Lower interleaved store(s) into target specific
     /// instructions/intrinsics.
     bool lowerInterleavedStore(Instruction *Store, Value *Mask,
-                               ShuffleVectorInst *SVI,
-                               unsigned Factor) const override;
+                               ShuffleVectorInst *SVI, unsigned Factor,
+                               const APInt &GapMask) const override;
 
     SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr,
                                    int JTI, SelectionDAG &DAG) const override;
diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
index 632db7e4326e2..4188487d75911 100644
--- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -825,7 +825,8 @@ bool X86TargetLowering::lowerInterleavedLoad(
 bool X86TargetLowering::lowerInterleavedStore(Instruction *Store,
                                               Value *LaneMask,
                                               ShuffleVectorInst *SVI,
-                                              unsigned Factor) const {
+                                              unsigned Factor,
+                                              const APInt &GapMask) const {
   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
          "Invalid interleave factor");
 
@@ -836,7 +837,8 @@ bool X86TargetLowering::lowerInterleavedStore(Instruction *Store,
   auto *SI = dyn_cast<StoreInst>(Store);
   if (!SI)
     return false;
-  assert(!LaneMask && "Unexpected mask on store");
+  assert(!LaneMask && GapMask.popcount() == Factor &&
+         "Unexpected mask on store");
 
   // Holds the indices of SVI that correspond to the starting index of each
   // interleaved shuffle.
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index c426ee7b7d2b1..1dfbdcb516c13 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -1638,6 +1638,37 @@ define void @vpstore_factor3_mask(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i
   ret void
 }
 
+; mask = all ones, skip the last field.
+define void @vpstore_factor3_gap(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
+; CHECK-LABEL: vpstore_factor3_gap:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 12
+; CHECK-NEXT:    vsetivli zero, 6, e32, m1, ta, ma
+; CHECK-NEXT:    vssseg2e32.v v8, (a0), a1
+; CHECK-NEXT:    ret
+  %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+  %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+  tail call void @llvm.vp.store.v12i32.p0(<12 x i32> %interleaved.vec, ptr %ptr, <12 x i1> <i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false>, i32 12)
+  ret void
+}
+
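For readers unfamiliar with the strided segment store checked above, here is a rough reference model of vssseg2e32.v's memory effect (my own C sketch of the RVV semantics, not LLVM code): two fields per segment, one segment per active lane, segments StrideBytes apart, masked-off segments untouched.

```cpp
#include <cstdint>
#include <cstring>

// Reference model: store Field0[i] and Field1[i] to Base + i*StrideBytes
// and Base + i*StrideBytes + 4 for every unmasked i < VL. With
// StrideBytes = 12, the third i32 of each segment is never written,
// which is exactly the "gap" field.
void vssseg2e32Model(uint8_t *Base, int64_t StrideBytes,
                     const uint32_t *Field0, const uint32_t *Field1,
                     const bool *Mask, unsigned VL) {
  for (unsigned I = 0; I != VL; ++I) {
    if (Mask && !Mask[I])
      continue; // masked-off segment: memory is left untouched
    std::memcpy(Base + I * StrideBytes, &Field0[I], sizeof(uint32_t));
    std::memcpy(Base + I * StrideBytes + 4, &Field1[I], sizeof(uint32_t));
  }
}
```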
+; mask = 1010, skip the last field.
+define void @vpstore_factor3_gap_with_mask(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
+; CHECK-LABEL: vpstore_factor3_gap_with_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 6, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v0, 5
+; CHECK-NEXT:    li a1, 12
+; CHECK-NEXT:    vssseg2e32.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+  %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+  %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+  tail call void @llvm.vp.store.v12i32.p0(<12 x i32> %interleaved.vec, ptr %ptr, <12 x i1> <i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>, i32 12)
+  ret void
+}
+
 define void @vpstore_factor4(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
 ; CHECK-LABEL: vpstore_factor4:
 ; CHECK:       # %bb.0:
@@ -1998,8 +2029,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_mask(ptr %ptr) {
 ; RV32-NEXT:    vle32.v v12, (a0), v0.t
 ; RV32-NEXT:    li a0, 36
 ; RV32-NEXT:    vmv.s.x v20, a1
-; RV32-NEXT:    lui a1, %hi(.LCPI63_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI63_0)
+; RV32-NEXT:    lui a1, %hi(.LCPI65_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI65_0)
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vle16.v v21, (a1)
 ; RV32-NEXT:    vcompress.vm v8, v12, v11
@@ -2074,8 +2105,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_evl(ptr %ptr) {
 ; RV32-NEXT:    vmv.s.x v10, a0
 ; RV32-NEXT:    li a0, 146
 ; RV32-NEXT:    vmv.s.x v11, a0
-; RV32-NEXT:    lui a0, %hi(.LCPI64_0)
-; RV32-NEXT:    addi a0, a0, %lo(.LCPI64_0)
+; RV32-NEXT:    lui a0, %hi(.LCPI66_0)
+; RV32-NEXT:    addi a0, a0, %lo(.LCPI66_0)
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vle16.v v20, (a0)
 ; RV32-NEXT:    li a0, 36
@@ -2165,6 +2196,53 @@ define void @maskedstore_factor2(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1) {
   ret void
 }
 
+; mask = all ones, skip the last field.
+define void @maskedstore_factor3_gap(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
+; CHECK-LABEL: maskedstore_factor3_gap:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 12
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vssseg2e32.v v8, (a0), a1
+; CHECK-NEXT:    ret
+  %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+  %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+  tail call void @llvm.masked.store(<12 x i32> %interleaved.vec, ptr %ptr, i32 4, <12 x i1> <i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false>)
+  ret void
+}
+
+; mask = 1010, skip the last two fields.
+define void @maskedstore_factor4_gap_with_mask(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
+; CHECK-LABEL: maskedstore_factor4_gap_with_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v0, 5
+; CHECK-NEXT:    li a1, 16
+; CHECK-NEXT:    vssseg2e32.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+  %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  tail call void @llvm.masked.store(<16 x i32> %interleaved.vec, ptr %ptr, i32 4, <16 x i1> <i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>)
+  ret void
+}
+
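One detail worth calling out in the masked variants above: the v0 operand of the segment store carries one bit per segment (row of the original interleaved store), not one bit per element. A tiny self-checking sketch (helper name is hypothetical) of how the row mask 1,0,1,0 from these tests becomes the immediate 5 in vmv.v.i v0, 5:

```cpp
#include <cassert>
#include <cstdint>

// Pack one bit per row into an integer, LSB = row 0, matching how the
// mask register is populated for a segment store.
uint64_t segmentMaskBits(const bool *RowActive, unsigned Rows) {
  uint64_t Bits = 0;
  for (unsigned R = 0; R != Rows; ++R)
    if (RowActive[R])
      Bits |= uint64_t{1} << R;
  return Bits;
}

int main() {
  const bool Rows[4] = {true, false, true, false}; // the "1010" row mask
  assert(segmentMaskBits(Rows, 4) == 5);           // vmv.v.i v0, 5
  return 0;
}
```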
+; mask = %m, skip the last two fields.
+define void @maskedstore_factor4_gap_by_intrinsic_with_mask(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i1> %m) {
+; CHECK-LABEL: maskedstore_factor4_gap_by_intrinsic_with_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 16
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vssseg2e32.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+  %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  %interleaved.mask = call <16 x i1> @llvm.vector.interleave4(<4 x i1> %m, <4 x i1> %m, <4 x i1> splat (i1 false), <4 x i1> splat (i1 false))
+  tail call void @llvm.masked.store(<16 x i32> %interleaved.vec, ptr %ptr, i32 4, <16 x i1> %interleaved.mask)
+  ret void
+}
+
 define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_mask(ptr %ptr) {
 ; CHECK-LABEL: maskedload_factor3_mask:
 ; CHECK:       # %bb.0:
@@ -2294,8 +2372,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_invalid_skip_field(
 ; RV32-NEXT:    vle32.v v12, (a0), v0.t
 ; RV32-NEXT:    li a0, 36
 ; RV32-NEXT:    vmv.s.x v20, a1
-; RV32-NEXT:    lui a1, %hi(.LCPI73_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI73_0)
+; RV32-NEXT:    lui a1, %hi(.LCPI78_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI78_0)
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vle16.v v21, (a1)
 ; RV32-NEXT:    vcompress.vm v8, v12, v11
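The last store test builds its mask with @llvm.vector.interleave4 rather than a literal: interleaving %m, %m, and two all-false splats produces exactly the shape getMask() recognizes, with %m as the per-segment lane mask and fields 2 and 3 as gaps. A reference model of that interleaving for masks (my own sketch, not LLVM code):

```cpp
#include <vector>

// Model of @llvm.vector.interleave4 on i1 vectors: element I of input K
// lands at position I * 4 + K of the result. Feeding all-false vectors as
// inputs 2 and 3 makes fields 2 and 3 of every segment false, i.e. gaps.
std::vector<bool> interleave4(const std::vector<bool> &M0,
                              const std::vector<bool> &M1,
                              const std::vector<bool> &M2,
                              const std::vector<bool> &M3) {
  std::vector<bool> Out;
  Out.reserve(M0.size() * 4);
  for (size_t I = 0; I != M0.size(); ++I) {
    Out.push_back(M0[I]);
    Out.push_back(M1[I]);
    Out.push_back(M2[I]);
    Out.push_back(M3[I]);
  }
  return Out;
}
```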