Skip to content

Commit a939eb6

Browse files
committed
Support additional data types
1 parent 1b51292 commit a939eb6

File tree

3 files changed

+393
-99
lines changed

3 files changed

+393
-99
lines changed

llvm/lib/Target/AArch64/AArch64InstrInfo.cpp

Lines changed: 207 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -7329,7 +7329,9 @@ bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const {
73297329
case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
73307330
case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
73317331
case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
7332-
case AArch64MachineCombinerPattern::SPLIT_LD:
7332+
case AArch64MachineCombinerPattern::GATHER_i32:
7333+
case AArch64MachineCombinerPattern::GATHER_i16:
7334+
case AArch64MachineCombinerPattern::GATHER_i8:
73337335
return true;
73347336
} // end switch (Pattern)
73357337
return false;
@@ -7370,32 +7372,27 @@ static bool getMiscPatterns(MachineInstr &Root,
73707372
return false;
73717373
}
73727374

7373-
/// Search for patterns where we use LD1i32 instructions to load into
7374-
/// 4 separate lanes of a 128 bit Neon register. We can increase ILP
7375-
/// by loading into 2 Neon registers instead.
7376-
static bool getLoadPatterns(MachineInstr &Root,
7377-
SmallVectorImpl<unsigned> &Patterns) {
7375+
static bool getGatherPattern(MachineInstr &Root,
7376+
SmallVectorImpl<unsigned> &Patterns,
7377+
unsigned LoadLaneOpCode,
7378+
unsigned NumLanes) {
73787379
const MachineRegisterInfo &MRI = Root.getMF()->getRegInfo();
73797380
const TargetRegisterInfo *TRI =
73807381
Root.getMF()->getSubtarget().getRegisterInfo();
7381-
// Enable this only on Darwin targets, where it should be profitable. Other
7382-
// targets can remove this check if it is profitable there as well.
7383-
if (!Root.getMF()->getTarget().getTargetTriple().isOSDarwin())
7384-
return false;
7385-
7386-
// The pattern searches for loads into single lanes.
7387-
if (Root.getOpcode() != AArch64::LD1i32)
7388-
return false;
73897382

73907383
// The root of the pattern must load into the last lane of the vector.
7391-
if (Root.getOperand(2).getImm() != 3)
7384+
if (Root.getOperand(2).getImm() != NumLanes - 1)
73927385
return false;
73937386

73947387
// Check that we have load into all lanes except lane 0.
7388+
// For each load we also want to check that:
7389+
// 1. It has a single non-debug use (since we will be replacing the virtual register)
7390+
// 2. That the addressing mode only uses a single offset register.
73957391
auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
7396-
SmallSet<unsigned, 4> RemainingLanes({1, 2});
7392+
auto Range = llvm::seq<unsigned>(1, NumLanes - 1);
7393+
SmallSet<unsigned, 4> RemainingLanes(Range.begin(), Range.end());
73977394
while (RemainingLanes.begin() != RemainingLanes.end() &&
7398-
CurrInstr->getOpcode() == AArch64::LD1i32 &&
7395+
CurrInstr->getOpcode() == LoadLaneOpCode &&
73997396
MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) &&
74007397
CurrInstr->getNumOperands() == 4) {
74017398
RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
@@ -7409,25 +7406,202 @@ static bool getLoadPatterns(MachineInstr &Root,
74097406
if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
74107407
return false;
74117408

7412-
// Verify that the subreg to reg loads an i32 into the first lane.
7409+
// Verify that the subreg to reg loads an integer into the first lane.
74137410
auto Lane0LoadReg = CurrInstr->getOperand(2).getReg();
7414-
if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != 32)
7411+
unsigned SingleLaneSizeInBits = 128 / NumLanes;
7412+
if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits)
74157413
return false;
74167414

74177415
// Verify that it also has a single non debug use.
74187416
if (!MRI.hasOneNonDBGUse(Lane0LoadReg))
74197417
return false;
74207418

7421-
Patterns.push_back(AArch64MachineCombinerPattern::SPLIT_LD);
7419+
switch (NumLanes) {
7420+
case 4:
7421+
Patterns.push_back(AArch64MachineCombinerPattern::GATHER_i32);
7422+
break;
7423+
case 8:
7424+
Patterns.push_back(AArch64MachineCombinerPattern::GATHER_i16);
7425+
break;
7426+
case 16:
7427+
Patterns.push_back(AArch64MachineCombinerPattern::GATHER_i8);
7428+
break;
7429+
default:
7430+
llvm_unreachable("Got bad number of lanes for gather pattern.");
7431+
}
7432+
74227433
return true;
74237434
}
74247435

7436+
/// Search for patterns where we use LD1 instructions to load into
7437+
/// separate lanes of an 128 bit Neon register. We can increase MLP
7438+
/// by loading into 2 Neon registers instead.
7439+
static bool getLoadPatterns(MachineInstr &Root,
7440+
SmallVectorImpl<unsigned> &Patterns) {
7441+
// Enable this only on Darwin targets, where it should be profitable. Other
7442+
// targets can remove this check if it is profitable there as well.
7443+
if (!Root.getMF()->getTarget().getTargetTriple().isOSDarwin())
7444+
return false;
7445+
7446+
// The pattern searches for loads into single lanes.
7447+
switch (Root.getOpcode()) {
7448+
case AArch64::LD1i32:
7449+
return getGatherPattern(Root, Patterns, Root.getOpcode(), 4);
7450+
case AArch64::LD1i16:
7451+
return getGatherPattern(Root, Patterns, Root.getOpcode(), 8);
7452+
case AArch64::LD1i8:
7453+
return getGatherPattern(Root, Patterns, Root.getOpcode(), 16);
7454+
default:
7455+
return false;
7456+
}
7457+
}
7458+
7459+
/// Generate the replacement sequence for a gather pattern found by
/// getLoadPatterns: a serial chain of LD1 lane loads filling one 128-bit
/// Neon register is rewritten to fill two registers in parallel (increasing
/// memory-level parallelism) which are then combined with a ZIP1.
///
/// \param Root the final lane load of the matched chain.
/// \param InsInstrs [out] replacement instructions, in program order.
/// \param DelInstrs [out] original instructions to delete. Root itself is
///        removed by the machine combiner and must not be added here.
/// \param InstrIdxForVirtReg [out] maps each newly created virtual register
///        to the index of its defining instruction in InsInstrs.
/// \param Pattern the matched combiner pattern. Currently unused here; the
///        lane count is passed explicitly via NumLanes.
/// \param NumLanes number of lanes in the 128-bit vector (4, 8 or 16).
static void generateGatherPattern(
    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned Pattern,
    unsigned NumLanes) {

  MachineFunction &MF = *Root.getParent()->getParent();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();

  // Gather the initial load instructions to build the pattern. Root loads
  // the last lane; walk the use-def chain back through the earlier lane
  // loads to the SUBREG_TO_REG that inserted the lane-0 load.
  SmallVector<MachineInstr *, 16> LoadToLaneInstrs;
  MachineInstr *CurrInstr = &Root;
  for (unsigned i = 0; i < NumLanes - 1; ++i) {
    LoadToLaneInstrs.push_back(CurrInstr);
    CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
  }

  MachineInstr *SubregToReg = CurrInstr;
  LoadToLaneInstrs.push_back(
      MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg()));
  // View the chain in ascending lane order (lane 0 first).
  auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs);

  const TargetRegisterClass *FPR128RegClass =
      MRI.getRegClass(Root.getOperand(0).getReg());

  // Emit a lane load (same opcode as Root) into a fresh 128-bit register and
  // record it in InsInstrs / InstrIdxForVirtReg. Returns the new register so
  // successive calls can be chained.
  auto LoadLaneToRegister = [&](MachineInstr *OriginalInstr,
                                Register SrcRegister, unsigned Lane,
                                Register OffsetRegister) {
    auto NewRegister = MRI.createVirtualRegister(FPR128RegClass);
    MachineInstrBuilder LoadIndexIntoRegister =
        BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
                NewRegister)
            .addReg(SrcRegister)
            .addImm(Lane)
            .addReg(OffsetRegister, getKillRegState(true));
    InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
    InsInstrs.push_back(LoadIndexIntoRegister);
    return NewRegister;
  };

  // Helper to create a scalar load matching the element size implied by the
  // lane count. Parameter renamed from NumLanes to avoid shadowing the
  // enclosing function parameter.
  auto CreateLoadInstruction = [&](unsigned Lanes, Register DestReg,
                                   Register OffsetReg) -> MachineInstrBuilder {
    unsigned Opcode;
    switch (Lanes) {
    case 4:
      Opcode = AArch64::LDRSui;
      break;
    case 8:
      Opcode = AArch64::LDRHui;
      break;
    case 16:
      Opcode = AArch64::LDRBui;
      break;
    default:
      llvm_unreachable(
          "Got unsupported number of lanes in machine-combiner gather pattern");
    }
    // Immediate offset load
    return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg)
        .addReg(OffsetReg)
        .addImm(0); // immediate offset
  };

  // Load the remaining lanes into register 0 (lanes 1 .. NumLanes/2 - 1),
  // starting from the original SUBREG_TO_REG result.
  auto LanesToLoadToReg0 =
      llvm::make_range(LoadToLaneInstrsAscending.begin() + 1,
                       LoadToLaneInstrsAscending.begin() + NumLanes / 2);
  auto PrevReg = SubregToReg->getOperand(0).getReg();
  for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) {
    PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1,
                                 LoadInstr->getOperand(3).getReg());
    DelInstrs.push_back(LoadInstr);
  }
  auto LastLoadReg0 = PrevReg;

  // First load into register 1. Perform a scalar LDR (LDRSui/LDRHui/LDRBui,
  // chosen by CreateLoadInstruction) so the upper lanes are zeroed in a
  // single instruction.
  auto Lane0Load = *LoadToLaneInstrsAscending.begin();
  auto OriginalSplitLoad =
      *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2);
  auto DestRegForMiddleIndex = MRI.createVirtualRegister(
      MRI.getRegClass(Lane0Load->getOperand(0).getReg()));

  MachineInstrBuilder MiddleIndexLoadInstr =
      CreateLoadInstruction(NumLanes, DestRegForMiddleIndex,
                            OriginalSplitLoad->getOperand(3).getReg());

  InstrIdxForVirtReg.insert(
      std::make_pair(DestRegForMiddleIndex, InsInstrs.size()));
  InsInstrs.push_back(MiddleIndexLoadInstr);
  DelInstrs.push_back(OriginalSplitLoad);

  // Subreg To Reg instruction for register 1: widen the scalar load result
  // to a 128-bit register via the subregister index matching the element
  // size.
  auto DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass);
  unsigned SubregType;
  switch (NumLanes) {
  case 4:
    SubregType = AArch64::ssub;
    break;
  case 8:
    SubregType = AArch64::hsub;
    break;
  case 16:
    SubregType = AArch64::bsub;
    break;
  default:
    llvm_unreachable("Got invalid NumLanes for machine-combiner gather pattern");
  }

  auto SubRegToRegInstr =
      BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()),
              DestRegForSubregToReg)
          .addImm(0)
          .addReg(DestRegForMiddleIndex, getKillRegState(true))
          .addImm(SubregType);
  InstrIdxForVirtReg.insert(
      std::make_pair(DestRegForSubregToReg, InsInstrs.size()));
  InsInstrs.push_back(SubRegToRegInstr);

  // Load remaining lanes into register 1.
  auto LanesToLoadToReg1 =
      llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1,
                       LoadToLaneInstrsAscending.end());
  PrevReg = SubRegToRegInstr->getOperand(0).getReg();
  for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) {
    PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1,
                                 LoadInstr->getOperand(3).getReg());
    // The last element of this range is Root; the machine combiner deletes
    // Root itself, so stop before adding it to DelInstrs.
    if (Index == NumLanes / 2 - 2) {
      break;
    }
    DelInstrs.push_back(LoadInstr);
  }
  auto LastLoadReg1 = PrevReg;

  // Create the final zip instruction to combine the results.
  MachineInstrBuilder ZipInstr =
      BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64),
              Root.getOperand(0).getReg())
          .addReg(LastLoadReg0)
          .addReg(LastLoadReg1);
  InsInstrs.push_back(ZipInstr);
}
7596+
74257597
CombinerObjective
74267598
AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
74277599
switch (Pattern) {
74287600
case AArch64MachineCombinerPattern::SUBADD_OP1:
74297601
case AArch64MachineCombinerPattern::SUBADD_OP2:
7430-
case AArch64MachineCombinerPattern::SPLIT_LD:
7602+
case AArch64MachineCombinerPattern::GATHER_i32:
7603+
case AArch64MachineCombinerPattern::GATHER_i16:
7604+
case AArch64MachineCombinerPattern::GATHER_i8:
74317605
return CombinerObjective::MustReduceDepth;
74327606
default:
74337607
return TargetInstrInfo::getCombinerObjective(Pattern);
@@ -8791,82 +8965,18 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
87918965
MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
87928966
break;
87938967
}
8794-
case AArch64MachineCombinerPattern::SPLIT_LD: {
8795-
// Gather the initial load instructions to build the pattern
8796-
MachineInstr *Lane2Load = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
8797-
MachineInstr *Lane1Load =
8798-
MRI.getUniqueVRegDef(Lane2Load->getOperand(1).getReg());
8799-
MachineInstr *SubregToReg =
8800-
MRI.getUniqueVRegDef(Lane1Load->getOperand(1).getReg());
8801-
MachineInstr *Lane0Load =
8802-
MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg());
8803-
8804-
const TargetRegisterClass *FPR128RegClass =
8805-
MRI.getRegClass(Root.getOperand(0).getReg());
8806-
8807-
auto LoadLaneToRegister = [&](MachineInstr *OriginalInstr,
8808-
Register SrcRegister, unsigned Lane,
8809-
Register OffsetRegister) {
8810-
auto NewRegister = MRI.createVirtualRegister(FPR128RegClass);
8811-
MachineInstrBuilder LoadIndexIntoRegister =
8812-
BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
8813-
NewRegister)
8814-
.addReg(SrcRegister)
8815-
.addImm(Lane)
8816-
.addReg(OffsetRegister, getKillRegState(true));
8817-
InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
8818-
InsInstrs.push_back(LoadIndexIntoRegister);
8819-
return NewRegister;
8820-
};
8821-
8822-
// Helper to create load instruction based on opcode
8823-
auto CreateLoadInstruction = [&](unsigned Opcode, Register DestReg,
8824-
Register OffsetReg) -> MachineInstrBuilder {
8825-
return BuildMI(MF, MIMetadata(Root), TII->get(AArch64::LDRSui), DestReg)
8826-
.addReg(OffsetReg)
8827-
.addImm(0); // immediate offset
8828-
};
8829-
8830-
// Load index 1 into register 0 lane 1
8831-
Register Index1LoadReg =
8832-
LoadLaneToRegister(Lane1Load, SubregToReg->getOperand(0).getReg(), 1,
8833-
Lane1Load->getOperand(3).getReg());
8834-
DelInstrs.push_back(Lane1Load);
8835-
8836-
// Load index 2 into register 1 lane 0
8837-
auto DestRegForIndex2 = MRI.createVirtualRegister(
8838-
MRI.getRegClass(Lane0Load->getOperand(0).getReg()));
8839-
8840-
MachineInstrBuilder Index2LoadInstr = CreateLoadInstruction(
8841-
Lane0Load->getOpcode(), DestRegForIndex2,
8842-
Lane2Load->getOperand(3).getReg());
8843-
8844-
InstrIdxForVirtReg.insert(std::make_pair(DestRegForIndex2, InsInstrs.size()));
8845-
InsInstrs.push_back(Index2LoadInstr);
8846-
DelInstrs.push_back(Lane2Load);
8847-
8848-
// Convert fpr32 to fpr128 using subreg
8849-
auto DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass);
8850-
auto SubRegToRegInstr = BuildMI(MF, MIMetadata(Root),
8851-
TII->get(SubregToReg->getOpcode()),
8852-
DestRegForSubregToReg)
8853-
.addImm(0)
8854-
.addReg(DestRegForIndex2, getKillRegState(true))
8855-
.addImm(AArch64::ssub);
8856-
InstrIdxForVirtReg.insert(std::make_pair(DestRegForSubregToReg, InsInstrs.size()));
8857-
InsInstrs.push_back(SubRegToRegInstr);
8858-
8859-
// Load index 3 into register 1 lane 1
8860-
auto Index3LoadReg = LoadLaneToRegister(&Root, DestRegForSubregToReg, 1,
8861-
Root.getOperand(3).getReg());
8862-
8863-
// Create the final zip instruction to combine the results
8864-
MachineInstrBuilder ZipInstr =
8865-
BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64),
8866-
Root.getOperand(0).getReg())
8867-
.addReg(Index1LoadReg)
8868-
.addReg(Index3LoadReg);
8869-
InsInstrs.push_back(ZipInstr);
8968+
case AArch64MachineCombinerPattern::GATHER_i32: {
8969+
generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
8970+
Pattern, 4);
8971+
break;
8972+
}
8973+
case AArch64MachineCombinerPattern::GATHER_i16: {
8974+
generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, Pattern, 8);
8975+
break;
8976+
}
8977+
case AArch64MachineCombinerPattern::GATHER_i8: {
8978+
generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, Pattern, 16);
8979+
break;
88708980
}
88718981

88728982
} // end switch (Pattern)

llvm/lib/Target/AArch64/AArch64InstrInfo.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,9 @@ enum AArch64MachineCombinerPattern : unsigned {
173173

174174
FNMADD,
175175

176-
SPLIT_LD,
176+
GATHER_i32,
177+
GATHER_i16,
178+
GATHER_i8
177179
};
178180
class AArch64InstrInfo final : public AArch64GenInstrInfo {
179181
const AArch64RegisterInfo RI;

0 commit comments

Comments
 (0)