Skip to content

[RISCV] Prefer strided load for interleave load with only one lane active #115069

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Nov 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions llvm/lib/Target/RISCV/RISCVISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21585,6 +21585,8 @@ static const Intrinsic::ID FixedVlsegIntrIds[] = {
bool RISCVTargetLowering::lowerInterleavedLoad(
LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices, unsigned Factor) const {
assert(Indices.size() == Shuffles.size());

IRBuilder<> Builder(LI);

auto *VTy = cast<FixedVectorType>(Shuffles[0]->getType());
Expand All @@ -21595,6 +21597,27 @@ bool RISCVTargetLowering::lowerInterleavedLoad(

auto *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen());

// If the segment load is going to be performed one segment at a time anyway
// and only one field of each segment is used, use a strided load instead.
// This will be equally fast, and create less vector register pressure.
if (Indices.size() == 1 && !Subtarget.hasOptimizedSegmentLoadStore(Factor)) {
unsigned ScalarSizeInBytes = VTy->getScalarSizeInBits() / 8;
Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes);
Value *BasePtr = Builder.CreatePtrAdd(LI->getPointerOperand(), Offset);
Value *Mask = Builder.getAllOnesMask(VTy->getElementCount());
Value *VL = Builder.getInt32(VTy->getNumElements());

CallInst *CI =
Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load,
{VTy, BasePtr->getType(), Stride->getType()},
{BasePtr, Stride, Mask, VL});
CI->addParamAttr(
0, Attribute::getWithAlignment(CI->getContext(), LI->getAlign()));
Shuffles[0]->replaceAllUsesWith(CI);
return true;
};

Value *VL = ConstantInt::get(XLenTy, VTy->getNumElements());

CallInst *VlsegN = Builder.CreateIntrinsic(
Expand Down
21 changes: 21 additions & 0 deletions llvm/lib/Target/RISCV/RISCVSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,27 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
return hasVInstructions() ? MaxInterleaveFactor : 1;
}

/// Returns the result of the per-NF predicate
/// hasOptimizedNF<NF>SegmentLoadStore() for a segment load/store with
/// \p NF fields.
///
/// \param NF number of fields in the segment access; only 2 through 8 are
///           handled, any other value is treated as unreachable.
bool hasOptimizedSegmentLoadStore(unsigned NF) const {
  // Each handled NF forwards to its own predicate; exactly one is queried.
  switch (NF) {
  case 2: return hasOptimizedNF2SegmentLoadStore();
  case 3: return hasOptimizedNF3SegmentLoadStore();
  case 4: return hasOptimizedNF4SegmentLoadStore();
  case 5: return hasOptimizedNF5SegmentLoadStore();
  case 6: return hasOptimizedNF6SegmentLoadStore();
  case 7: return hasOptimizedNF7SegmentLoadStore();
  case 8: return hasOptimizedNF8SegmentLoadStore();
  }
  // Falling out of the switch means NF was not in the range 2..8.
  llvm_unreachable("Unexpected NF");
}

// Returns VLEN divided by DLEN. Where DLEN is the datapath width of the
// vector hardware implementation which may be less than VLEN.
unsigned getDLenFactor() const {
Expand Down
24 changes: 1 addition & 23 deletions llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -716,28 +716,6 @@ RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
}

/// Queries subtarget \p ST for the per-NF predicate
/// hasOptimizedNF<NF>SegmentLoadStore() matching a segment load/store with
/// \p NF fields and returns its result.
///
/// \param NF number of fields in the segment access; only 2 through 8 are
///           handled, any other value is treated as unreachable.
/// \param ST subtarget whose predicates are consulted; dereferenced
///           unconditionally for every handled NF.
static bool hasOptimizedSegmentLoadStore(unsigned NF,
                                         const RISCVSubtarget *ST) {
  if (NF == 2)
    return ST->hasOptimizedNF2SegmentLoadStore();
  if (NF == 3)
    return ST->hasOptimizedNF3SegmentLoadStore();
  if (NF == 4)
    return ST->hasOptimizedNF4SegmentLoadStore();
  if (NF == 5)
    return ST->hasOptimizedNF5SegmentLoadStore();
  if (NF == 6)
    return ST->hasOptimizedNF6SegmentLoadStore();
  if (NF == 7)
    return ST->hasOptimizedNF7SegmentLoadStore();
  if (NF == 8)
    return ST->hasOptimizedNF8SegmentLoadStore();

  // No guard matched: NF was not in the range 2..8.
  llvm_unreachable("Unexpected NF");
}

InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
Expand All @@ -761,7 +739,7 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(

// Some processors optimize segment loads/stores as one wide memory op +
// Factor * LMUL shuffle ops.
if (hasOptimizedSegmentLoadStore(Factor, ST)) {
if (ST->hasOptimizedSegmentLoadStore(Factor)) {
InstructionCost Cost =
getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind);
MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT();
Expand Down
24 changes: 16 additions & 8 deletions llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1198,8 +1198,9 @@ define <4 x i32> @load_factor2_one_active(ptr %ptr) {
define <4 x i32> @load_factor3_one_active(ptr %ptr) {
; CHECK-LABEL: load_factor3_one_active:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 12
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vlseg3e32.v v8, (a0)
; CHECK-NEXT: vlse32.v v8, (a0), a1
; CHECK-NEXT: ret
%interleaved.vec = load <12 x i32>, ptr %ptr
%v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
Expand All @@ -1209,8 +1210,9 @@ define <4 x i32> @load_factor3_one_active(ptr %ptr) {
define <4 x i32> @load_factor4_one_active(ptr %ptr) {
; CHECK-LABEL: load_factor4_one_active:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 16
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vlseg4e32.v v8, (a0)
; CHECK-NEXT: vlse32.v v8, (a0), a1
; CHECK-NEXT: ret
%interleaved.vec = load <16 x i32>, ptr %ptr
%v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
Expand All @@ -1220,8 +1222,9 @@ define <4 x i32> @load_factor4_one_active(ptr %ptr) {
define <4 x i32> @load_factor5_one_active(ptr %ptr) {
; CHECK-LABEL: load_factor5_one_active:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 20
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vlseg5e32.v v8, (a0)
; CHECK-NEXT: vlse32.v v8, (a0), a1
; CHECK-NEXT: ret
%interleaved.vec = load <20 x i32>, ptr %ptr
%v0 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> <i32 0, i32 5, i32 10, i32 15>
Expand All @@ -1231,30 +1234,35 @@ define <4 x i32> @load_factor5_one_active(ptr %ptr) {
define <2 x i16> @load_factor6_one_active(ptr %ptr) {
; CHECK-LABEL: load_factor6_one_active:
; CHECK: # %bb.0:
; CHECK-NEXT: addi a0, a0, 10
; CHECK-NEXT: li a1, 12
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; CHECK-NEXT: vlseg6e16.v v8, (a0)
; CHECK-NEXT: vlse16.v v8, (a0), a1
; CHECK-NEXT: ret
%interleaved.vec = load <12 x i16>, ptr %ptr
%v0 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> <i32 0, i32 6>
%v0 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> <i32 5, i32 11>
ret <2 x i16> %v0
}

define <4 x i8> @load_factor7_one_active(ptr %ptr) vscale_range(8,1024) {
; CHECK-LABEL: load_factor7_one_active:
; CHECK: # %bb.0:
; CHECK-NEXT: addi a0, a0, 1
; CHECK-NEXT: li a1, 7
; CHECK-NEXT: vsetivli zero, 4, e8, mf8, ta, ma
; CHECK-NEXT: vlseg7e8.v v8, (a0)
; CHECK-NEXT: vlse8.v v8, (a0), a1
; CHECK-NEXT: ret
%interleaved.vec = load <32 x i8>, ptr %ptr
%v0 = shufflevector <32 x i8> %interleaved.vec, <32 x i8> poison, <4 x i32> <i32 0, i32 7, i32 14, i32 21>
%v0 = shufflevector <32 x i8> %interleaved.vec, <32 x i8> poison, <4 x i32> <i32 1, i32 8, i32 15, i32 22>
ret <4 x i8> %v0
}

define <4 x i8> @load_factor8_one_active(ptr %ptr) vscale_range(8,1024) {
; CHECK-LABEL: load_factor8_one_active:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 8
; CHECK-NEXT: vsetivli zero, 4, e8, mf8, ta, ma
; CHECK-NEXT: vlseg8e8.v v8, (a0)
; CHECK-NEXT: vlse8.v v8, (a0), a1
; CHECK-NEXT: ret
%interleaved.vec = load <32 x i8>, ptr %ptr
%v0 = shufflevector <32 x i8> %interleaved.vec, <32 x i8> poison, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
Expand Down
Loading