@@ -187,7 +187,15 @@ class AArch64InstructionSelector : public InstructionSelector {
187187 ComplexRendererFns selectAddrModeIndexed (MachineOperand &Root) const {
188188 return selectAddrModeIndexed (Root, Width / 8 );
189189 }
190+
191+ bool isWorthFoldingIntoExtendedReg (MachineInstr &MI,
192+ const MachineRegisterInfo &MRI) const ;
193+ ComplexRendererFns
194+ selectAddrModeShiftedExtendXReg (MachineOperand &Root,
195+ unsigned SizeInBytes) const ;
190196 ComplexRendererFns selectAddrModeRegisterOffset (MachineOperand &Root) const ;
197+ ComplexRendererFns selectAddrModeXRO (MachineOperand &Root,
198+ unsigned SizeInBytes) const ;
191199
192200 void renderTruncImm (MachineInstrBuilder &MIB, const MachineInstr &MI) const ;
193201
@@ -1238,8 +1246,8 @@ bool AArch64InstructionSelector::earlySelectLoad(
12381246 if (DstSize != 64 )
12391247 return false ;
12401248
1241- // Check if we can do any folding from GEPs etc. into the load.
1242- auto ImmFn = selectAddrModeRegisterOffset (I.getOperand (1 ));
1249+ // Check if we can do any folding from GEPs/shifts etc. into the load.
1250+ auto ImmFn = selectAddrModeXRO (I.getOperand (1 ), MemBytes );
12431251 if (!ImmFn)
12441252 return false ;
12451253
@@ -3995,6 +4003,98 @@ AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const {
39954003 }};
39964004}
39974005
4006+ // / Return true if it is worth folding MI into an extended register. That is,
4007+ // / if it's safe to pull it into the addressing mode of a load or store as a
4008+ // / shift.
4009+ bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg (
4010+ MachineInstr &MI, const MachineRegisterInfo &MRI) const {
4011+ // Always fold if there is one use, or if we're optimizing for size.
4012+ Register DefReg = MI.getOperand (0 ).getReg ();
4013+ if (MRI.hasOneUse (DefReg) ||
4014+ MI.getParent ()->getParent ()->getFunction ().hasMinSize ())
4015+ return true ;
4016+
4017+ // It's better to avoid folding and recomputing shifts when we don't have a
4018+ // fastpath.
4019+ if (!STI.hasLSLFast ())
4020+ return false ;
4021+
4022+ // We have a fastpath, so folding a shift in and potentially computing it
4023+ // many times may be beneficial. Check if this is only used in memory ops.
4024+ // If it is, then we should fold.
4025+ return all_of (MRI.use_instructions (DefReg),
4026+ [](MachineInstr &Use) { return Use.mayLoadOrStore (); });
4027+ }
4028+
4029+ // / This is used for computing addresses like this:
4030+ // /
4031+ // / ldr x1, [x2, x3, lsl #3]
4032+ // /
4033+ // / Where x2 is the base register, and x3 is an offset register. The shift-left
4034+ // / is a constant value specific to this load instruction. That is, we'll never
4035+ // / see anything other than a 3 here (which corresponds to the size of the
4036+ // / element being loaded.)
4037+ InstructionSelector::ComplexRendererFns
4038+ AArch64InstructionSelector::selectAddrModeShiftedExtendXReg (
4039+ MachineOperand &Root, unsigned SizeInBytes) const {
4040+ if (!Root.isReg ())
4041+ return None;
4042+ MachineRegisterInfo &MRI = Root.getParent ()->getMF ()->getRegInfo ();
4043+
4044+ // Make sure that the memory op is a valid size.
4045+ int64_t LegalShiftVal = Log2_32 (SizeInBytes);
4046+ if (LegalShiftVal == 0 )
4047+ return None;
4048+
4049+ // We want to find something like this:
4050+ //
4051+ // val = G_CONSTANT LegalShiftVal
4052+ // shift = G_SHL off_reg val
4053+ // ptr = G_GEP base_reg shift
4054+ // x = G_LOAD ptr
4055+ //
4056+ // And fold it into this addressing mode:
4057+ //
4058+ // ldr x, [base_reg, off_reg, lsl #LegalShiftVal]
4059+
4060+ // Check if we can find the G_GEP.
4061+ MachineInstr *Gep = getOpcodeDef (TargetOpcode::G_GEP, Root.getReg (), MRI);
4062+ if (!Gep || !isWorthFoldingIntoExtendedReg (*Gep, MRI))
4063+ return None;
4064+
4065+ // Now try to match the G_SHL.
4066+ MachineInstr *Shl =
4067+ getOpcodeDef (TargetOpcode::G_SHL, Gep->getOperand (2 ).getReg (), MRI);
4068+ if (!Shl || !isWorthFoldingIntoExtendedReg (*Shl, MRI))
4069+ return None;
4070+
4071+ // Now, try to find the specific G_CONSTANT.
4072+ auto ValAndVReg =
4073+ getConstantVRegValWithLookThrough (Shl->getOperand (2 ).getReg (), MRI);
4074+ if (!ValAndVReg)
4075+ return None;
4076+
4077+ // The value must fit into 3 bits, and must be positive. Make sure that is
4078+ // true.
4079+ int64_t ImmVal = ValAndVReg->Value ;
4080+ if ((ImmVal & 0x7 ) != ImmVal)
4081+ return None;
4082+
4083+ // We are only allowed to shift by LegalShiftVal. This shift value is built
4084+ // into the instruction, so we can't just use whatever we want.
4085+ if (ImmVal != LegalShiftVal)
4086+ return None;
4087+
4088+ // We can use the LHS of the GEP as the base, and the LHS of the shift as an
4089+ // offset. Signify that we are shifting by setting the shift flag to 1.
4090+ return {{
4091+ [=](MachineInstrBuilder &MIB) { MIB.add (Gep->getOperand (1 )); },
4092+ [=](MachineInstrBuilder &MIB) { MIB.add (Shl->getOperand (1 )); },
4093+ [=](MachineInstrBuilder &MIB) { MIB.addImm (0 ); },
4094+ [=](MachineInstrBuilder &MIB) { MIB.addImm (1 ); },
4095+ }};
4096+ }
4097+
39984098// / This is used for computing addresses like this:
39994099// /
40004100// / ldr x1, [x2, x3]
@@ -4008,11 +4108,6 @@ AArch64InstructionSelector::selectAddrModeRegisterOffset(
40084108 MachineOperand &Root) const {
40094109 MachineRegisterInfo &MRI = Root.getParent ()->getMF ()->getRegInfo ();
40104110
4011- // If we have a constant offset, then we probably don't want to match a
4012- // register offset.
4013- if (isBaseWithConstantOffset (Root, MRI))
4014- return None;
4015-
40164111 // We need a GEP.
40174112 MachineInstr *Gep = MRI.getVRegDef (Root.getReg ());
40184113 if (!Gep || Gep->getOpcode () != TargetOpcode::G_GEP)
@@ -4033,6 +4128,28 @@ AArch64InstructionSelector::selectAddrModeRegisterOffset(
40334128 }};
40344129}
40354130
4131+ // / This is intended to be equivalent to selectAddrModeXRO in
4132+ // / AArch64ISelDAGtoDAG. It's used for selecting X register offset loads.
4133+ InstructionSelector::ComplexRendererFns
4134+ AArch64InstructionSelector::selectAddrModeXRO (MachineOperand &Root,
4135+ unsigned SizeInBytes) const {
4136+ MachineRegisterInfo &MRI = Root.getParent ()->getMF ()->getRegInfo ();
4137+
4138+ // If we have a constant offset, then we probably don't want to match a
4139+ // register offset.
4140+ if (isBaseWithConstantOffset (Root, MRI))
4141+ return None;
4142+
4143+ // Try to fold shifts into the addressing mode.
4144+ auto AddrModeFns = selectAddrModeShiftedExtendXReg (Root, SizeInBytes);
4145+ if (AddrModeFns)
4146+ return AddrModeFns;
4147+
4148+ // If that doesn't work, see if it's possible to fold in registers from
4149+ // a GEP.
4150+ return selectAddrModeRegisterOffset (Root);
4151+ }
4152+
40364153// / Select a "register plus unscaled signed 9-bit immediate" address. This
40374154// / should only match when there is an offset that is not valid for a scaled
40384155// / immediate addressing mode. The "Size" argument is the size in bytes of the
0 commit comments