@@ -7329,7 +7329,9 @@ bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const {
  case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
  case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
  case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
- case AArch64MachineCombinerPattern::SPLIT_LD:
+ case AArch64MachineCombinerPattern::GATHER_i32:
+ case AArch64MachineCombinerPattern::GATHER_i16:
+ case AArch64MachineCombinerPattern::GATHER_i8:
    return true;
  } // end switch (Pattern)
  return false;
@@ -7370,32 +7372,27 @@ static bool getMiscPatterns(MachineInstr &Root,
  return false;
}

- /// Search for patterns where we use LD1i32 instructions to load into
- /// 4 separate lanes of a 128 bit Neon register. We can increase ILP
- /// by loading into 2 Neon registers instead.
- static bool getLoadPatterns(MachineInstr &Root,
-                             SmallVectorImpl<unsigned> &Patterns) {
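+ /// Match a chain of LD1 single-lane loads that together fill a 128-bit
+ /// Neon register lane by lane, starting from a scalar load widened via
+ /// SUBREG_TO_REG. LoadLaneOpCode is the lane-load opcode to match and
+ /// NumLanes is the lane count of a 128-bit register at that element width.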
+ static bool getGatherPattern(MachineInstr &Root,
+                              SmallVectorImpl<unsigned> &Patterns,
+                              unsigned LoadLaneOpCode,
+                              unsigned NumLanes) {
  const MachineRegisterInfo &MRI = Root.getMF()->getRegInfo();
  const TargetRegisterInfo *TRI =
      Root.getMF()->getSubtarget().getRegisterInfo();
- // Enable this only on Darwin targets, where it should be profitable. Other
- // targets can remove this check if it is profitable there as well.
- if (!Root.getMF()->getTarget().getTargetTriple().isOSDarwin())
-   return false;
-
- // The pattern searches for loads into single lanes.
- if (Root.getOpcode() != AArch64::LD1i32)
-   return false;

// The root of the pattern must load into the last lane of the vector.
- if (Root.getOperand(2).getImm() != 3)
+ if (Root.getOperand(2).getImm() != NumLanes - 1)
    return false;

  // Check that we load into all lanes except lane 0.
+ // For each load we also want to check that:
+ // 1. It has a single non-debug use (since we will be replacing the
+ //    virtual register).
+ // 2. Its addressing mode uses only a single offset register.
  auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
- SmallSet<unsigned, 4> RemainingLanes({1, 2});
+ auto Range = llvm::seq<unsigned>(1, NumLanes - 1);
+ SmallSet<unsigned, 4> RemainingLanes(Range.begin(), Range.end());
  while (RemainingLanes.begin() != RemainingLanes.end() &&
-        CurrInstr->getOpcode() == AArch64::LD1i32 &&
+        CurrInstr->getOpcode() == LoadLaneOpCode &&
         MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) &&
         CurrInstr->getNumOperands() == 4) {
    RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
@@ -7409,25 +7406,202 @@ static bool getLoadPatterns(MachineInstr &Root,
  if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
    return false;

- // Verify that the subreg to reg loads an i32 into the first lane.
+ // Verify that the subreg to reg loads an integer into the first lane.
  auto Lane0LoadReg = CurrInstr->getOperand(2).getReg();
- if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != 32)
+ unsigned SingleLaneSizeInBits = 128 / NumLanes;
+ if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits)
    return false;

  // Verify that it also has a single non-debug use.
  if (!MRI.hasOneNonDBGUse(Lane0LoadReg))
    return false;

- Patterns.push_back(AArch64MachineCombinerPattern::SPLIT_LD);
+ switch (NumLanes) {
+ case 4:
+   Patterns.push_back(AArch64MachineCombinerPattern::GATHER_i32);
+   break;
+ case 8:
+   Patterns.push_back(AArch64MachineCombinerPattern::GATHER_i16);
+   break;
+ case 16:
+   Patterns.push_back(AArch64MachineCombinerPattern::GATHER_i8);
+   break;
+ default:
+   llvm_unreachable("Got bad number of lanes for gather pattern.");
+ }
+
  return true;
}

+ /// Search for patterns where we use LD1 instructions to load into
+ /// separate lanes of a 128-bit Neon register. We can increase memory-level
+ /// parallelism (MLP) by loading into 2 Neon registers instead.
+ static bool getLoadPatterns(MachineInstr &Root,
+                             SmallVectorImpl<unsigned> &Patterns) {
+   // Enable this only on Darwin targets, where it should be profitable. Other
+   // targets can remove this check if it is profitable there as well.
+   if (!Root.getMF()->getTarget().getTargetTriple().isOSDarwin())
+     return false;
+
+   // The pattern searches for loads into single lanes.
+   switch (Root.getOpcode()) {
+   case AArch64::LD1i32:
+     return getGatherPattern(Root, Patterns, Root.getOpcode(), 4);
+   case AArch64::LD1i16:
+     return getGatherPattern(Root, Patterns, Root.getOpcode(), 8);
+   case AArch64::LD1i8:
+     return getGatherPattern(Root, Patterns, Root.getOpcode(), 16);
+   default:
+     return false;
+   }
+ }
+
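+ /// Rewrite a matched gather chain into two parallel half-chains joined by
+ /// a zip. An illustrative sketch for NumLanes == 4 (register and address
+ /// names are examples, not taken from real output):
+ ///
+ ///   q0 = SUBREG_TO_REG 0, (LDRSui x0, 0), ssub
+ ///   q0 = LD1i32 q0[1], x1
+ ///   q0 = LD1i32 q0[2], x2
+ ///   q0 = LD1i32 q0[3], x3
+ ///
+ /// becomes
+ ///
+ ///   q0 = SUBREG_TO_REG 0, (LDRSui x0, 0), ssub
+ ///   q0 = LD1i32 q0[1], x1
+ ///   q1 = SUBREG_TO_REG 0, (LDRSui x2, 0), ssub
+ ///   q1 = LD1i32 q1[1], x3
+ ///   q0 = ZIP1v2i64 q0, q1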
+ static void generateGatherPattern(
+     MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
+     SmallVectorImpl<MachineInstr *> &DelInstrs,
+     DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned Pattern,
+     unsigned NumLanes) {
+
+   MachineFunction &MF = *Root.getParent()->getParent();
+   MachineRegisterInfo &MRI = MF.getRegInfo();
+   const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+
+   // Gather the initial load instructions to build the pattern.
+   SmallVector<MachineInstr *, 16> LoadToLaneInstrs;
+   MachineInstr *CurrInstr = &Root;
+   for (unsigned i = 0; i < NumLanes - 1; ++i) {
+     LoadToLaneInstrs.push_back(CurrInstr);
+     CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
+   }
+
+   MachineInstr *SubregToReg = CurrInstr;
+   LoadToLaneInstrs.push_back(
+       MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg()));
+   auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs);
+
+   const TargetRegisterClass *FPR128RegClass =
+       MRI.getRegClass(Root.getOperand(0).getReg());
+
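+   // Helper to emit a lane load into a fresh 128-bit vreg, record it in
+   // InsInstrs/InstrIdxForVirtReg, and return the new register for chaining.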
+   auto LoadLaneToRegister = [&](MachineInstr *OriginalInstr,
+                                 Register SrcRegister, unsigned Lane,
+                                 Register OffsetRegister) {
+     auto NewRegister = MRI.createVirtualRegister(FPR128RegClass);
+     MachineInstrBuilder LoadIndexIntoRegister =
+         BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
+                 NewRegister)
+             .addReg(SrcRegister)
+             .addImm(Lane)
+             .addReg(OffsetRegister, getKillRegState(true));
+     InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
+     InsInstrs.push_back(LoadIndexIntoRegister);
+     return NewRegister;
+   };
+
+   // Helper to create the scalar load instruction for the given lane count.
+   auto CreateLoadInstruction = [&](unsigned NumLanes, Register DestReg,
+                                    Register OffsetReg) -> MachineInstrBuilder {
+     unsigned Opcode;
+     switch (NumLanes) {
+     case 4:
+       Opcode = AArch64::LDRSui;
+       break;
+     case 8:
+       Opcode = AArch64::LDRHui;
+       break;
+     case 16:
+       Opcode = AArch64::LDRBui;
+       break;
+     default:
+       llvm_unreachable(
+           "Got unsupported number of lanes in machine-combiner gather pattern");
+     }
+     // Immediate offset load
+     return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg)
+         .addReg(OffsetReg)
+         .addImm(0); // immediate offset
+   };
+
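+   // Split the lanes across two accumulators: lanes [0, NumLanes/2) stay on
+   // the original chain (register 0) and lanes [NumLanes/2, NumLanes) are
+   // rebuilt on an independent chain (register 1), halving the dependency
+   // chain through the loads.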
+   // Load the remaining lanes into register 0.
+   auto LanesToLoadToReg0 =
+       llvm::make_range(LoadToLaneInstrsAscending.begin() + 1,
+                        LoadToLaneInstrsAscending.begin() + NumLanes / 2);
+   auto PrevReg = SubregToReg->getOperand(0).getReg();
+   for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) {
+     PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1,
+                                  LoadInstr->getOperand(3).getReg());
+     DelInstrs.push_back(LoadInstr);
+   }
+   auto LastLoadReg0 = PrevReg;
+
+   // First load into register 1. Perform a scalar load to zero out the
+   // upper lanes in a single instruction.
+   auto Lane0Load = *LoadToLaneInstrsAscending.begin();
+   auto OriginalSplitLoad =
+       *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2);
+   auto DestRegForMiddleIndex = MRI.createVirtualRegister(
+       MRI.getRegClass(Lane0Load->getOperand(0).getReg()));
+
+   MachineInstrBuilder MiddleIndexLoadInstr =
+       CreateLoadInstruction(NumLanes, DestRegForMiddleIndex,
+                             OriginalSplitLoad->getOperand(3).getReg());
+
+   InstrIdxForVirtReg.insert(
+       std::make_pair(DestRegForMiddleIndex, InsInstrs.size()));
+   InsInstrs.push_back(MiddleIndexLoadInstr);
+   DelInstrs.push_back(OriginalSplitLoad);
+
+   // Subreg To Reg instruction for register 1.
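+   // Widen the scalar load result to a full 128-bit vector register so the
+   // remaining lane loads can insert into it; the subreg index must match
+   // the element width.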
+   auto DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass);
+   unsigned SubregType;
+   switch (NumLanes) {
+   case 4:
+     SubregType = AArch64::ssub;
+     break;
+   case 8:
+     SubregType = AArch64::hsub;
+     break;
+   case 16:
+     SubregType = AArch64::bsub;
+     break;
+   default:
+     llvm_unreachable(
+         "Got invalid NumLanes for machine-combiner gather pattern");
+   }
+
+   auto SubRegToRegInstr =
+       BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()),
+               DestRegForSubregToReg)
+           .addImm(0)
+           .addReg(DestRegForMiddleIndex, getKillRegState(true))
+           .addImm(SubregType);
+   InstrIdxForVirtReg.insert(
+       std::make_pair(DestRegForSubregToReg, InsInstrs.size()));
+   InsInstrs.push_back(SubRegToRegInstr);
+
+   // Load the remaining lanes into register 1.
+   auto LanesToLoadToReg1 =
+       llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1,
+                        LoadToLaneInstrsAscending.end());
+   PrevReg = SubRegToRegInstr->getOperand(0).getReg();
+   for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) {
+     PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1,
+                                  LoadInstr->getOperand(3).getReg());
+     // The last lane load is the pattern Root, which the caller already
+     // records for deletion, so do not add it to DelInstrs here.
+     if (Index == NumLanes / 2 - 2) {
+       break;
+     }
+     DelInstrs.push_back(LoadInstr);
+   }
+   auto LastLoadReg1 = PrevReg;
+
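+   // ZIP1v2i64 concatenates the low 64-bit halves of the two accumulators:
+   // register 0 supplies lanes [0, NumLanes/2) and register 1 supplies
+   // lanes [NumLanes/2, NumLanes).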
+   // Create the final zip instruction to combine the results.
+   MachineInstrBuilder ZipInstr =
+       BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64),
+               Root.getOperand(0).getReg())
+           .addReg(LastLoadReg0)
+           .addReg(LastLoadReg1);
+   InsInstrs.push_back(ZipInstr);
+ }
+
CombinerObjective
AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
  switch (Pattern) {
  case AArch64MachineCombinerPattern::SUBADD_OP1:
  case AArch64MachineCombinerPattern::SUBADD_OP2:
- case AArch64MachineCombinerPattern::SPLIT_LD:
+ case AArch64MachineCombinerPattern::GATHER_i32:
+ case AArch64MachineCombinerPattern::GATHER_i16:
+ case AArch64MachineCombinerPattern::GATHER_i8:
    return CombinerObjective::MustReduceDepth;
  default:
    return TargetInstrInfo::getCombinerObjective(Pattern);
@@ -8791,82 +8965,18 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
    MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
    break;
  }
- case AArch64MachineCombinerPattern::SPLIT_LD: {
-   // Gather the initial load instructions to build the pattern
-   MachineInstr *Lane2Load = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
-   MachineInstr *Lane1Load =
-       MRI.getUniqueVRegDef(Lane2Load->getOperand(1).getReg());
-   MachineInstr *SubregToReg =
-       MRI.getUniqueVRegDef(Lane1Load->getOperand(1).getReg());
-   MachineInstr *Lane0Load =
-       MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg());
-
-   const TargetRegisterClass *FPR128RegClass =
-       MRI.getRegClass(Root.getOperand(0).getReg());
-
-   auto LoadLaneToRegister = [&](MachineInstr *OriginalInstr,
-                                 Register SrcRegister, unsigned Lane,
-                                 Register OffsetRegister) {
-     auto NewRegister = MRI.createVirtualRegister(FPR128RegClass);
-     MachineInstrBuilder LoadIndexIntoRegister =
-         BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
-                 NewRegister)
-             .addReg(SrcRegister)
-             .addImm(Lane)
-             .addReg(OffsetRegister, getKillRegState(true));
-     InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
-     InsInstrs.push_back(LoadIndexIntoRegister);
-     return NewRegister;
-   };
-
-   // Helper to create load instruction based on opcode
-   auto CreateLoadInstruction = [&](unsigned Opcode, Register DestReg,
-                                    Register OffsetReg) -> MachineInstrBuilder {
-     return BuildMI(MF, MIMetadata(Root), TII->get(AArch64::LDRSui), DestReg)
-         .addReg(OffsetReg)
-         .addImm(0); // immediate offset
-   };
-
-   // Load index 1 into register 0 lane 1
-   Register Index1LoadReg =
-       LoadLaneToRegister(Lane1Load, SubregToReg->getOperand(0).getReg(), 1,
-                          Lane1Load->getOperand(3).getReg());
-   DelInstrs.push_back(Lane1Load);
-
-   // Load index 2 into register 1 lane 0
-   auto DestRegForIndex2 = MRI.createVirtualRegister(
-       MRI.getRegClass(Lane0Load->getOperand(0).getReg()));
-
-   MachineInstrBuilder Index2LoadInstr = CreateLoadInstruction(
-       Lane0Load->getOpcode(), DestRegForIndex2,
-       Lane2Load->getOperand(3).getReg());
-
-   InstrIdxForVirtReg.insert(std::make_pair(DestRegForIndex2, InsInstrs.size()));
-   InsInstrs.push_back(Index2LoadInstr);
-   DelInstrs.push_back(Lane2Load);
-
-   // Convert fpr32 to fpr128 using subreg
-   auto DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass);
-   auto SubRegToRegInstr = BuildMI(MF, MIMetadata(Root),
-                                   TII->get(SubregToReg->getOpcode()),
-                                   DestRegForSubregToReg)
-                               .addImm(0)
-                               .addReg(DestRegForIndex2, getKillRegState(true))
-                               .addImm(AArch64::ssub);
-   InstrIdxForVirtReg.insert(std::make_pair(DestRegForSubregToReg, InsInstrs.size()));
-   InsInstrs.push_back(SubRegToRegInstr);
-
-   // Load index 3 into register 1 lane 1
-   auto Index3LoadReg = LoadLaneToRegister(&Root, DestRegForSubregToReg, 1,
-                                           Root.getOperand(3).getReg());
-
-   // Create the final zip instruction to combine the results
-   MachineInstrBuilder ZipInstr =
-       BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64),
-               Root.getOperand(0).getReg())
-           .addReg(Index1LoadReg)
-           .addReg(Index3LoadReg);
-   InsInstrs.push_back(ZipInstr);
+ case AArch64MachineCombinerPattern::GATHER_i32: {
+   generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                         Pattern, 4);
+   break;
+ }
+ case AArch64MachineCombinerPattern::GATHER_i16: {
+   generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                         Pattern, 8);
+   break;
+ }
+ case AArch64MachineCombinerPattern::GATHER_i8: {
+   generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                         Pattern, 16);
+   break;
  }
  } // end switch (Pattern)