diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 43f08b453536c..e35e9b1bb8119 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -21585,6 +21585,8 @@ static const Intrinsic::ID FixedVlsegIntrIds[] = {
 bool RISCVTargetLowering::lowerInterleavedLoad(
     LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
     ArrayRef<unsigned> Indices, unsigned Factor) const {
+  assert(Indices.size() == Shuffles.size());
+
   IRBuilder<> Builder(LI);
 
   auto *VTy = cast<FixedVectorType>(Shuffles[0]->getType());
@@ -21595,6 +21597,27 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
 
   auto *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen());
 
+  // If the segment load is going to be performed segment at a time anyways
+  // and there's only one element used, use a strided load instead. This
+  // will be equally fast, and create less vector register pressure.
+  if (Indices.size() == 1 && !Subtarget.hasOptimizedSegmentLoadStore(Factor)) {
+    unsigned ScalarSizeInBytes = VTy->getScalarSizeInBits() / 8;
+    Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
+    Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes);
+    Value *BasePtr = Builder.CreatePtrAdd(LI->getPointerOperand(), Offset);
+    Value *Mask = Builder.getAllOnesMask(VTy->getElementCount());
+    Value *VL = Builder.getInt32(VTy->getNumElements());
+
+    CallInst *CI =
+        Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load,
+                                {VTy, BasePtr->getType(), Stride->getType()},
+                                {BasePtr, Stride, Mask, VL});
+    CI->addParamAttr(
+        0, Attribute::getWithAlignment(CI->getContext(), LI->getAlign()));
+    Shuffles[0]->replaceAllUsesWith(CI);
+    return true;
+  };
+
   Value *VL = ConstantInt::get(XLenTy, VTy->getNumElements());
 
   CallInst *VlsegN = Builder.CreateIntrinsic(
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index bf9ed3f3d7165..f59a3737ae76f 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -238,6 +238,27 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
     return hasVInstructions() ? MaxInterleaveFactor : 1;
   }
 
+  bool hasOptimizedSegmentLoadStore(unsigned NF) const {
+    switch (NF) {
+    case 2:
+      return hasOptimizedNF2SegmentLoadStore();
+    case 3:
+      return hasOptimizedNF3SegmentLoadStore();
+    case 4:
+      return hasOptimizedNF4SegmentLoadStore();
+    case 5:
+      return hasOptimizedNF5SegmentLoadStore();
+    case 6:
+      return hasOptimizedNF6SegmentLoadStore();
+    case 7:
+      return hasOptimizedNF7SegmentLoadStore();
+    case 8:
+      return hasOptimizedNF8SegmentLoadStore();
+    default:
+      llvm_unreachable("Unexpected NF");
+    }
+  }
+
   // Returns VLEN divided by DLEN. Where DLEN is the datapath width of the
   // vector hardware implementation which may be less than VLEN.
   unsigned getDLenFactor() const {
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 046f63fe1617c..10b8a355e2fe3 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -716,28 +716,6 @@ RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
   return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
 }
 
-static bool hasOptimizedSegmentLoadStore(unsigned NF,
-                                         const RISCVSubtarget *ST) {
-  switch (NF) {
-  case 2:
-    return ST->hasOptimizedNF2SegmentLoadStore();
-  case 3:
-    return ST->hasOptimizedNF3SegmentLoadStore();
-  case 4:
-    return ST->hasOptimizedNF4SegmentLoadStore();
-  case 5:
-    return ST->hasOptimizedNF5SegmentLoadStore();
-  case 6:
-    return ST->hasOptimizedNF6SegmentLoadStore();
-  case 7:
-    return ST->hasOptimizedNF7SegmentLoadStore();
-  case 8:
-    return ST->hasOptimizedNF8SegmentLoadStore();
-  default:
-    llvm_unreachable("Unexpected NF");
-  }
-}
-
 InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
     unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
     Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
@@ -761,7 +739,7 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
 
     // Some processors optimize segment loads/stores as one wide memory op +
     // Factor * LMUL shuffle ops.
-    if (hasOptimizedSegmentLoadStore(Factor, ST)) {
+    if (ST->hasOptimizedSegmentLoadStore(Factor)) {
       InstructionCost Cost =
           getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind);
       MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT();
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index a5419c7cd1c2d..25ef3050e266a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -1198,8 +1198,9 @@ define <4 x i32> @load_factor2_one_active(ptr %ptr) {
 define <4 x i32> @load_factor3_one_active(ptr %ptr) {
 ; CHECK-LABEL: load_factor3_one_active:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 12
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vlseg3e32.v v8, (a0)
+; CHECK-NEXT:    vlse32.v v8, (a0), a1
 ; CHECK-NEXT:    ret
   %interleaved.vec = load <12 x i32>, ptr %ptr
   %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
@@ -1209,8 +1210,9 @@ define <4 x i32> @load_factor3_one_active(ptr %ptr) {
 define <4 x i32> @load_factor4_one_active(ptr %ptr) {
 ; CHECK-LABEL: load_factor4_one_active:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vlseg4e32.v v8, (a0)
+; CHECK-NEXT:    vlse32.v v8, (a0), a1
 ; CHECK-NEXT:    ret
   %interleaved.vec = load <16 x i32>, ptr %ptr
   %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
@@ -1220,8 +1222,9 @@ define <4 x i32> @load_factor4_one_active(ptr %ptr) {
 define <4 x i32> @load_factor5_one_active(ptr %ptr) {
 ; CHECK-LABEL: load_factor5_one_active:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 20
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vlseg5e32.v v8, (a0)
+; CHECK-NEXT:    vlse32.v v8, (a0), a1
 ; CHECK-NEXT:    ret
   %interleaved.vec = load <20 x i32>, ptr %ptr
   %v0 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> <i32 0, i32 5, i32 10, i32 15>
@@ -1231,30 +1234,35 @@ define <4 x i32> @load_factor5_one_active(ptr %ptr) {
 define <2 x i16> @load_factor6_one_active(ptr %ptr) {
 ; CHECK-LABEL: load_factor6_one_active:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a0, a0, 10
+; CHECK-NEXT:    li a1, 12
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT:    vlseg6e16.v v8, (a0)
+; CHECK-NEXT:    vlse16.v v8, (a0), a1
 ; CHECK-NEXT:    ret
   %interleaved.vec = load <12 x i16>, ptr %ptr
-  %v0 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> <i32 0, i32 6>
+  %v0 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> <i32 5, i32 11>
   ret <2 x i16> %v0
 }
 
 define <4 x i8> @load_factor7_one_active(ptr %ptr) vscale_range(8,1024) {
 ; CHECK-LABEL: load_factor7_one_active:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a0, a0, 1
+; CHECK-NEXT:    li a1, 7
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf8, ta, ma
-; CHECK-NEXT:    vlseg7e8.v v8, (a0)
+; CHECK-NEXT:    vlse8.v v8, (a0), a1
 ; CHECK-NEXT:    ret
   %interleaved.vec = load <32 x i8>, ptr %ptr
-  %v0 = shufflevector <32 x i8> %interleaved.vec, <32 x i8> poison, <4 x i32> <i32 0, i32 7, i32 14, i32 21>
+  %v0 = shufflevector <32 x i8> %interleaved.vec, <32 x i8> poison, <4 x i32> <i32 1, i32 8, i32 15, i32 22>
   ret <4 x i8> %v0
 }
 
 define <4 x i8> @load_factor8_one_active(ptr %ptr) vscale_range(8,1024) {
 ; CHECK-LABEL: load_factor8_one_active:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 8
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf8, ta, ma
-; CHECK-NEXT:    vlseg8e8.v v8, (a0)
+; CHECK-NEXT:    vlse8.v v8, (a0), a1
 ; CHECK-NEXT:    ret
   %interleaved.vec = load <32 x i8>, ptr %ptr
   %v0 = shufflevector <32 x i8> %interleaved.vec, <32 x i8> poison, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
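
For reference, here is a hand-written sketch of the IR the new single-active-index path in lowerInterleavedLoad produces for the updated load_factor6_one_active test, before instruction selection turns it into the addi/li/vlse16.v sequence checked above. This is not output from the patch's tests: it assumes an RV64 target (so XLenTy is i64), and the "align 2" parameter attribute is only illustrative of the alignment copied from the original wide load.

; Sketch only (assumes RV64, illustrative alignment); the dead wide load and
; shuffle are omitted since the pass erases them after the rewrite.
declare <2 x i16> @llvm.experimental.vp.strided.load.v2i16.p0.i64(ptr, i64, <2 x i1>, i32)

define <2 x i16> @load_factor6_one_active(ptr %ptr) {
  ; Offset = Indices[0] * ScalarSizeInBytes = 5 * 2 = 10 (CreatePtrAdd emits a gep over i8)
  %base = getelementptr i8, ptr %ptr, i64 10
  ; Stride = Factor * ScalarSizeInBytes = 6 * 2 = 12, EVL = 2 lanes, all-ones mask
  %v0 = call <2 x i16> @llvm.experimental.vp.strided.load.v2i16.p0.i64(ptr align 2 %base, i64 12, <2 x i1> <i1 true, i1 true>, i32 2)
  ret <2 x i16> %v0
}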