Skip to content

Commit 4e970d7

Browse files
authored
[AArch64][GlobalISel] Select llvm.aarch64.neon.st* intrinsics (#65491)
Similar to llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
1 parent 87d77d3 commit 4e970d7

File tree

5 files changed

+6294
-14
lines changed

5 files changed

+6294
-14
lines changed

llvm/lib/Target/AArch64/AArch64InstrGISel.td

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -450,3 +450,48 @@ def : Pat<(i32 (int_aarch64_neon_uminv (v2i32 V64:$Rn))),
450450
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
451451
(UMINPv2i32 V64:$Rn, V64:$Rn), dsub),
452452
ssub))>;

// Match stores from lane 0 of a 64-bit vector to the appropriate subreg's
// scalar store (scaled unsigned-immediate addressing mode).
multiclass VecStoreLane64_0Pat<ComplexPattern UIAddrMode, SDPatternOperator storeop,
                               ValueType VTy, ValueType STy,
                               SubRegIndex SubRegIdx, Operand IndexType,
                               Instruction STR> {
  // Extracting lane 0 and storing it is just a store of the subregister.
  def : Pat<(storeop (STy (vector_extract (VTy VecListOne64:$Vt), (i64 0))),
                     (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
            (STR (EXTRACT_SUBREG VecListOne64:$Vt, SubRegIdx),
                 GPR64sp:$Rn, IndexType:$offset)>;
}
// As VecStoreLane64_0Pat, but for the unscaled (simm9) addressing mode.
multiclass VecStoreULane64_0Pat<SDPatternOperator StoreOp,
                                ValueType VTy, ValueType STy,
                                SubRegIndex SubRegIdx, Instruction STR> {
  defm : VecStoreLane64_0Pat<am_unscaled64, StoreOp, VTy, STy, SubRegIdx, simm9, STR>;
}

// As VecStoreLane64_0Pat, but for register-offset addressing modes, with
// one pattern each for the W-register and X-register offset forms.
multiclass VecROStoreLane64_0Pat<ROAddrMode ro, SDPatternOperator storeop,
                                 ValueType VecTy, ValueType STy,
                                 SubRegIndex SubRegIdx,
                                 Instruction STRW, Instruction STRX> {

  def : Pat<(storeop (STy (vector_extract (VecTy VecListOne64:$Vt), (i64 0))),
                     (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)),
            (STRW (EXTRACT_SUBREG VecListOne64:$Vt, SubRegIdx),
                  GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;

  def : Pat<(storeop (STy (vector_extract (VecTy VecListOne64:$Vt), (i64 0))),
                     (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)),
            (STRX (EXTRACT_SUBREG VecListOne64:$Vt, SubRegIdx),
                  GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
}

// Raise the complexity so these lane-0 store patterns are preferred over the
// generic lane-store (ST1 lane) patterns during instruction selection.
let AddedComplexity = 19 in {
  def : St1Lane128Pat<store, VectorIndexB, v16i8, i8, ST1i8>;
  def : St1Lane64Pat<store, VectorIndexB, v8i8, i8, ST1i8>;

  // Scaled unsigned-immediate addressing.
  defm : VecStoreLane64_0Pat<am_indexed16, store, v4i16, i16, hsub, uimm12s2, STRHui>;
  defm : VecStoreLane64_0Pat<am_indexed32, store, v2i32, i32, ssub, uimm12s4, STRSui>;

  // Unscaled-immediate and register-offset addressing.
  defm : VecStoreULane64_0Pat<store, v4i16, i16, hsub, STURHi>;
  defm : VecStoreULane64_0Pat<store, v2i32, i32, ssub, STURSi>;
  defm : VecROStoreLane64_0Pat<ro16, store, v4i16, i16, hsub, STRHroW, STRHroX>;
  defm : VecROStoreLane64_0Pat<ro32, store, v2i32, i32, ssub, STRSroW, STRSroX>;
}

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3684,12 +3684,12 @@ multiclass VecROStoreLane0Pat<ROAddrMode ro, SDPatternOperator storeop,
36843684
SubRegIndex SubRegIdx,
36853685
Instruction STRW, Instruction STRX> {
36863686

3687-
def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vt), 0)),
3687+
def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vt), (i64 0))),
36883688
(ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)),
36893689
(STRW (SubRegTy (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx)),
36903690
GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;
36913691

3692-
def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vt), 0)),
3692+
def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vt), (i64 0))),
36933693
(ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)),
36943694
(STRX (SubRegTy (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx)),
36953695
GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
@@ -3823,7 +3823,7 @@ multiclass VecStoreLane0Pat<ComplexPattern UIAddrMode, SDPatternOperator storeop
38233823
ValueType SubRegTy,
38243824
SubRegIndex SubRegIdx, Operand IndexType,
38253825
Instruction STR> {
3826-
def : Pat<(storeop (STy (vector_extract (VTy VecListOne128:$Vt), 0)),
3826+
def : Pat<(storeop (STy (vector_extract (VTy VecListOne128:$Vt), (i64 0))),
38273827
(UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
38283828
(STR (SubRegTy (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx)),
38293829
GPR64sp:$Rn, IndexType:$offset)>;

llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp

Lines changed: 229 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,10 @@ class AArch64InstructionSelector : public InstructionSelector {
194194
MachineInstr &I);
195195
bool selectVectorLoadLaneIntrinsic(unsigned Opc, unsigned NumVecs,
196196
MachineInstr &I);
197+
void selectVectorStoreIntrinsic(MachineInstr &I, unsigned NumVecs,
198+
unsigned Opc);
199+
bool selectVectorStoreLaneIntrinsic(MachineInstr &I, unsigned NumVecs,
200+
unsigned Opc);
197201
bool selectIntrinsicWithSideEffects(MachineInstr &I,
198202
MachineRegisterInfo &MRI);
199203
bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI);
@@ -5697,7 +5701,56 @@ bool AArch64InstructionSelector::selectVectorLoadLaneIntrinsic(
56975701
!emitNarrowVector(I.getOperand(Idx).getReg(), WideReg, MIB, MRI))
56985702
return false;
56995703
}
5704+
return true;
5705+
}
5706+
5707+
/// Select a NEON vector-store intrinsic (e.g. st2/st3/st4/st1x2) into the
/// machine instruction \p Opc, which stores a register tuple to memory.
///
/// Operands [1, 1 + NumVecs) of \p I are the source vector registers; the
/// operand following them is the pointer. The sources are gathered into a
/// D- or Q-register tuple depending on the vector width.
void AArch64InstructionSelector::selectVectorStoreIntrinsic(MachineInstr &I,
                                                            unsigned NumVecs,
                                                            unsigned Opc) {
  MachineRegisterInfo &MRI = I.getParent()->getParent()->getRegInfo();
  LLT Ty = MRI.getType(I.getOperand(1).getReg());
  Register Ptr = I.getOperand(1 + NumVecs).getReg();

  SmallVector<Register, 2> Regs(NumVecs);
  // Take the operand by const reference: the original lambda took `auto MO`
  // by value, copying a MachineOperand per element.
  std::transform(I.operands_begin() + 1, I.operands_begin() + 1 + NumVecs,
                 Regs.begin(),
                 [](const MachineOperand &MO) { return MO.getReg(); });

  // 128-bit element vectors go in a Q-register tuple, 64-bit in a D tuple.
  Register Tuple = Ty.getSizeInBits() == 128 ? createQTuple(Regs, MIB)
                                             : createDTuple(Regs, MIB);
  auto Store = MIB.buildInstr(Opc, {}, {Tuple, Ptr});
  Store.cloneMemRefs(I);
  constrainSelectedInstRegOperands(*Store, TII, TRI, RBI);
}
5724+
5725+
/// Select a NEON store-lane intrinsic (st2lane/st3lane/st4lane) into the
/// machine instruction \p Opc, which stores one lane of a Q-register tuple.
///
/// Operands [1, 1 + NumVecs) of \p I are the source vectors, followed by the
/// lane index and then the pointer. Returns false if the lane index is not a
/// compile-time constant (the instruction needs an immediate lane).
bool AArch64InstructionSelector::selectVectorStoreLaneIntrinsic(
    MachineInstr &I, unsigned NumVecs, unsigned Opc) {
  MachineRegisterInfo &MRI = I.getParent()->getParent()->getRegInfo();
  LLT Ty = MRI.getType(I.getOperand(1).getReg());
  bool Narrow = Ty.getSizeInBits() == 64;

  SmallVector<Register, 2> Regs(NumVecs);
  // Take the operand by const reference: the original lambda took `auto MO`
  // by value, copying a MachineOperand per element.
  std::transform(I.operands_begin() + 1, I.operands_begin() + 1 + NumVecs,
                 Regs.begin(),
                 [](const MachineOperand &MO) { return MO.getReg(); });

  // The lane-store instructions take a Q-register tuple, so 64-bit sources
  // are first widened into FPR128 registers.
  if (Narrow)
    transform(Regs, Regs.begin(), [this](Register Reg) {
      return emitScalarToVector(64, &AArch64::FPR128RegClass, Reg, MIB)
          ->getOperand(0)
          .getReg();
    });

  Register Tuple = createQTuple(Regs, MIB);

  // Bail out if the lane number is not an immediate; selection cannot
  // proceed without a constant lane index.
  auto LaneNo = getIConstantVRegVal(I.getOperand(1 + NumVecs).getReg(), MRI);
  if (!LaneNo)
    return false;
  Register Ptr = I.getOperand(1 + NumVecs + 1).getReg();
  auto Store = MIB.buildInstr(Opc, {}, {})
                   .addReg(Tuple)
                   .addImm(LaneNo->getZExtValue())
                   .addReg(Ptr);
  Store.cloneMemRefs(I);
  constrainSelectedInstRegOperands(*Store, TII, TRI, RBI);
  return true;
}
57035756

@@ -6005,11 +6058,80 @@ bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
60056058
selectVectorLoadIntrinsic(Opc, 4, I);
60066059
break;
60076060
}
6061+
case Intrinsic::aarch64_neon_st1x2: {
6062+
LLT Ty = MRI.getType(I.getOperand(1).getReg());
6063+
unsigned Opc;
6064+
if (Ty == LLT::fixed_vector(8, S8))
6065+
Opc = AArch64::ST1Twov8b;
6066+
else if (Ty == LLT::fixed_vector(16, S8))
6067+
Opc = AArch64::ST1Twov16b;
6068+
else if (Ty == LLT::fixed_vector(4, S16))
6069+
Opc = AArch64::ST1Twov4h;
6070+
else if (Ty == LLT::fixed_vector(8, S16))
6071+
Opc = AArch64::ST1Twov8h;
6072+
else if (Ty == LLT::fixed_vector(2, S32))
6073+
Opc = AArch64::ST1Twov2s;
6074+
else if (Ty == LLT::fixed_vector(4, S32))
6075+
Opc = AArch64::ST1Twov4s;
6076+
else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6077+
Opc = AArch64::ST1Twov2d;
6078+
else if (Ty == S64 || Ty == P0)
6079+
Opc = AArch64::ST1Twov1d;
6080+
else
6081+
llvm_unreachable("Unexpected type for st1x2!");
6082+
selectVectorStoreIntrinsic(I, 2, Opc);
6083+
break;
6084+
}
6085+
case Intrinsic::aarch64_neon_st1x3: {
6086+
LLT Ty = MRI.getType(I.getOperand(1).getReg());
6087+
unsigned Opc;
6088+
if (Ty == LLT::fixed_vector(8, S8))
6089+
Opc = AArch64::ST1Threev8b;
6090+
else if (Ty == LLT::fixed_vector(16, S8))
6091+
Opc = AArch64::ST1Threev16b;
6092+
else if (Ty == LLT::fixed_vector(4, S16))
6093+
Opc = AArch64::ST1Threev4h;
6094+
else if (Ty == LLT::fixed_vector(8, S16))
6095+
Opc = AArch64::ST1Threev8h;
6096+
else if (Ty == LLT::fixed_vector(2, S32))
6097+
Opc = AArch64::ST1Threev2s;
6098+
else if (Ty == LLT::fixed_vector(4, S32))
6099+
Opc = AArch64::ST1Threev4s;
6100+
else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6101+
Opc = AArch64::ST1Threev2d;
6102+
else if (Ty == S64 || Ty == P0)
6103+
Opc = AArch64::ST1Threev1d;
6104+
else
6105+
llvm_unreachable("Unexpected type for st1x3!");
6106+
selectVectorStoreIntrinsic(I, 3, Opc);
6107+
break;
6108+
}
6109+
case Intrinsic::aarch64_neon_st1x4: {
6110+
LLT Ty = MRI.getType(I.getOperand(1).getReg());
6111+
unsigned Opc;
6112+
if (Ty == LLT::fixed_vector(8, S8))
6113+
Opc = AArch64::ST1Fourv8b;
6114+
else if (Ty == LLT::fixed_vector(16, S8))
6115+
Opc = AArch64::ST1Fourv16b;
6116+
else if (Ty == LLT::fixed_vector(4, S16))
6117+
Opc = AArch64::ST1Fourv4h;
6118+
else if (Ty == LLT::fixed_vector(8, S16))
6119+
Opc = AArch64::ST1Fourv8h;
6120+
else if (Ty == LLT::fixed_vector(2, S32))
6121+
Opc = AArch64::ST1Fourv2s;
6122+
else if (Ty == LLT::fixed_vector(4, S32))
6123+
Opc = AArch64::ST1Fourv4s;
6124+
else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6125+
Opc = AArch64::ST1Fourv2d;
6126+
else if (Ty == S64 || Ty == P0)
6127+
Opc = AArch64::ST1Fourv1d;
6128+
else
6129+
llvm_unreachable("Unexpected type for st1x4!");
6130+
selectVectorStoreIntrinsic(I, 4, Opc);
6131+
break;
6132+
}
60086133
case Intrinsic::aarch64_neon_st2: {
6009-
Register Src1 = I.getOperand(1).getReg();
6010-
Register Src2 = I.getOperand(2).getReg();
6011-
Register Ptr = I.getOperand(3).getReg();
6012-
LLT Ty = MRI.getType(Src1);
6134+
LLT Ty = MRI.getType(I.getOperand(1).getReg());
60136135
unsigned Opc;
60146136
if (Ty == LLT::fixed_vector(8, S8))
60156137
Opc = AArch64::ST2Twov8b;
@@ -6029,12 +6151,109 @@ bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
60296151
Opc = AArch64::ST1Twov1d;
60306152
else
60316153
llvm_unreachable("Unexpected type for st2!");
6032-
SmallVector<Register, 2> Regs = {Src1, Src2};
6033-
Register Tuple = Ty.getSizeInBits() == 128 ? createQTuple(Regs, MIB)
6034-
: createDTuple(Regs, MIB);
6035-
auto Store = MIB.buildInstr(Opc, {}, {Tuple, Ptr});
6036-
Store.cloneMemRefs(I);
6037-
constrainSelectedInstRegOperands(*Store, TII, TRI, RBI);
6154+
selectVectorStoreIntrinsic(I, 2, Opc);
6155+
break;
6156+
}
6157+
case Intrinsic::aarch64_neon_st3: {
6158+
LLT Ty = MRI.getType(I.getOperand(1).getReg());
6159+
unsigned Opc;
6160+
if (Ty == LLT::fixed_vector(8, S8))
6161+
Opc = AArch64::ST3Threev8b;
6162+
else if (Ty == LLT::fixed_vector(16, S8))
6163+
Opc = AArch64::ST3Threev16b;
6164+
else if (Ty == LLT::fixed_vector(4, S16))
6165+
Opc = AArch64::ST3Threev4h;
6166+
else if (Ty == LLT::fixed_vector(8, S16))
6167+
Opc = AArch64::ST3Threev8h;
6168+
else if (Ty == LLT::fixed_vector(2, S32))
6169+
Opc = AArch64::ST3Threev2s;
6170+
else if (Ty == LLT::fixed_vector(4, S32))
6171+
Opc = AArch64::ST3Threev4s;
6172+
else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6173+
Opc = AArch64::ST3Threev2d;
6174+
else if (Ty == S64 || Ty == P0)
6175+
Opc = AArch64::ST1Threev1d;
6176+
else
6177+
llvm_unreachable("Unexpected type for st3!");
6178+
selectVectorStoreIntrinsic(I, 3, Opc);
6179+
break;
6180+
}
6181+
case Intrinsic::aarch64_neon_st4: {
6182+
LLT Ty = MRI.getType(I.getOperand(1).getReg());
6183+
unsigned Opc;
6184+
if (Ty == LLT::fixed_vector(8, S8))
6185+
Opc = AArch64::ST4Fourv8b;
6186+
else if (Ty == LLT::fixed_vector(16, S8))
6187+
Opc = AArch64::ST4Fourv16b;
6188+
else if (Ty == LLT::fixed_vector(4, S16))
6189+
Opc = AArch64::ST4Fourv4h;
6190+
else if (Ty == LLT::fixed_vector(8, S16))
6191+
Opc = AArch64::ST4Fourv8h;
6192+
else if (Ty == LLT::fixed_vector(2, S32))
6193+
Opc = AArch64::ST4Fourv2s;
6194+
else if (Ty == LLT::fixed_vector(4, S32))
6195+
Opc = AArch64::ST4Fourv4s;
6196+
else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6197+
Opc = AArch64::ST4Fourv2d;
6198+
else if (Ty == S64 || Ty == P0)
6199+
Opc = AArch64::ST1Fourv1d;
6200+
else
6201+
llvm_unreachable("Unexpected type for st4!");
6202+
selectVectorStoreIntrinsic(I, 4, Opc);
6203+
break;
6204+
}
6205+
case Intrinsic::aarch64_neon_st2lane: {
6206+
LLT Ty = MRI.getType(I.getOperand(1).getReg());
6207+
unsigned Opc;
6208+
if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6209+
Opc = AArch64::ST2i8;
6210+
else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6211+
Opc = AArch64::ST2i16;
6212+
else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6213+
Opc = AArch64::ST2i32;
6214+
else if (Ty == LLT::fixed_vector(2, S64) ||
6215+
Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6216+
Opc = AArch64::ST2i64;
6217+
else
6218+
llvm_unreachable("Unexpected type for st2lane!");
6219+
if (!selectVectorStoreLaneIntrinsic(I, 2, Opc))
6220+
return false;
6221+
break;
6222+
}
6223+
case Intrinsic::aarch64_neon_st3lane: {
6224+
LLT Ty = MRI.getType(I.getOperand(1).getReg());
6225+
unsigned Opc;
6226+
if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6227+
Opc = AArch64::ST3i8;
6228+
else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6229+
Opc = AArch64::ST3i16;
6230+
else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6231+
Opc = AArch64::ST3i32;
6232+
else if (Ty == LLT::fixed_vector(2, S64) ||
6233+
Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6234+
Opc = AArch64::ST3i64;
6235+
else
6236+
llvm_unreachable("Unexpected type for st3lane!");
6237+
if (!selectVectorStoreLaneIntrinsic(I, 3, Opc))
6238+
return false;
6239+
break;
6240+
}
6241+
case Intrinsic::aarch64_neon_st4lane: {
6242+
LLT Ty = MRI.getType(I.getOperand(1).getReg());
6243+
unsigned Opc;
6244+
if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6245+
Opc = AArch64::ST4i8;
6246+
else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6247+
Opc = AArch64::ST4i16;
6248+
else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6249+
Opc = AArch64::ST4i32;
6250+
else if (Ty == LLT::fixed_vector(2, S64) ||
6251+
Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6252+
Opc = AArch64::ST4i64;
6253+
else
6254+
llvm_unreachable("Unexpected type for st4lane!");
6255+
if (!selectVectorStoreLaneIntrinsic(I, 4, Opc))
6256+
return false;
60386257
break;
60396258
}
60406259
case Intrinsic::aarch64_mops_memset_tag: {

0 commit comments

Comments
 (0)