@@ -7527,6 +7527,11 @@ void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
     return;
 
   unsigned Opcode = MI.getOpcode();
+  if (Opcode == AMDGPU::REG_SEQUENCE) {
+    legalizeSpecialInst_t16(MI, MRI);
+    return;
+  }
+
   MachineBasicBlock *MBB = MI.getParent();
   // Legalize operands and check for size mismatch
   if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
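A hypothetical MIR shape (register numbers invented for illustration) that this early-out now routes to legalizeSpecialInst_t16 instead of the per-operand legalization below:

    %1:vgpr_32 = REG_SEQUENCE %0:vgpr_32, %subreg.lo16, %2:vgpr_16, %subreg.hi16

The first source pairs a 32-bit VGPR with a 16-bit subregister index, which is the kind of size mismatch the special handling rewrites.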
@@ -7565,50 +7570,63 @@ void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
     legalizeOperandsVALUt16(MI, OpIdx, MRI);
 }
 
-// Legalize size mismatches between 16bit and 32bit registers in v2s copy
-// lowering (lower the copy itself). Including cases:
-// 1. sreg32 = copy vgpr16 => vgpr32 = REG_SEQUENCE(vgpr16, lo16)
-// 2. sreg32 = copy .lo16:vgpr32 / sreg32 = copy .hi16:vgpr32
-//    => vgpr16 = copy .hi/lo16:vgpr32
-//       vgpr32 = REG_SEQUENCE(vgpr16, lo16)
+// Legalize operands of special instructions with a 16bit/32bit size mismatch
+// during moveToVALU lowering in true16 mode. The mismatch arises because isel
+// places 16bit values in both vgpr16 and sreg32. Covered cases:
+// Copy
+// 1. dst32 = copy vgpr16 => dst32 = REG_SEQUENCE(vgpr16, lo16)
+// 2. dst32 = copy .lo16:vgpr32 / dst32 = copy .hi16:vgpr32
+//    => dst32 = REG_SEQUENCE(.lo16/hi16:vgpr32, lo16)
 // 3. sgpr16 = copy vgpr32/... (skipped, isel do not generate sgpr16)
+//
+// Reg_sequence
+//    dst32 = reg_sequence(vgpr32, lo16/hi16)
+//      => dst32 = reg_sequence(.lo16:vgpr32, lo16/hi16)
+//
 // This can be removed after we have sgpr16 in place.
-bool SIInstrInfo::legalizeV2SCopyt16(MachineInstr &Copy,
-                                     MachineRegisterInfo &MRI,
-                                     SIInstrWorklist &Worklist) const {
-  Register DstReg = Copy.getOperand(0).getReg();
-  Register SrcReg = Copy.getOperand(1).getReg();
-  Register SrcSubReg = Copy.getOperand(1).getSubReg();
-  const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Copy);
-  const TargetRegisterClass *SrcRegRC = getOpRegClass(Copy, 1);
-  bool KeepCopy;
-
-  if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
-    KeepCopy = 0;
-  } else if (NewDstRC == &AMDGPU::VGPR_32RegClass &&
-             (SrcSubReg == AMDGPU::hi16 || SrcSubReg == AMDGPU::lo16)) {
-    KeepCopy = 1;
-    Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
-    Copy.getOperand(0).setReg(NewDstReg);
-    SrcReg = NewDstReg;
-  } else
-    return false;
+void SIInstrInfo::legalizeSpecialInst_t16(MachineInstr &Inst,
+                                          MachineRegisterInfo &MRI) const {
+  unsigned Opcode = Inst.getOpcode();
+  const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
+  switch (Opcode) {
+  case AMDGPU::COPY: {
+    Register SrcReg = Inst.getOperand(1).getReg();
+    if (!SrcReg.isVirtual() || !RI.isVGPR(MRI, SrcReg))
+      return;
 
-  Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
-  Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
-  BuildMI(*Copy.getParent(), &Copy, Copy.getDebugLoc(),
-          get(AMDGPU::IMPLICIT_DEF), Undef);
-  BuildMI(*Copy.getParent(), std::next(Copy.getIterator()), Copy.getDebugLoc(),
-          get(AMDGPU::REG_SEQUENCE), NewDstReg)
-      .addReg(SrcReg)
-      .addImm(AMDGPU::lo16)
-      .addReg(Undef)
-      .addImm(AMDGPU::hi16);
-  if (!KeepCopy)
-    Copy.eraseFromParent();
-  MRI.replaceRegWith(DstReg, NewDstReg);
-  addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
-  return true;
+    bool SetSubReg = false;
+    Register SrcSubReg = Inst.getOperand(1).getSubReg();
+    const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
+    if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
+    } else if (NewDstRC == &AMDGPU::VGPR_32RegClass &&
+               (SrcSubReg == AMDGPU::hi16 || SrcSubReg == AMDGPU::lo16)) {
+      SetSubReg = true;
+    } else
+      return;
+
+    Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
+    BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
+            get(AMDGPU::IMPLICIT_DEF), Undef);
+    Inst.setDesc(get(AMDGPU::REG_SEQUENCE));
+    if (SetSubReg)
+      Inst.getOperand(1).setSubReg(SrcSubReg);
+
+    Inst.addOperand(MachineOperand::CreateImm(AMDGPU::lo16));
+    Inst.addOperand(MachineOperand::CreateReg(Undef, 0));
+    Inst.addOperand(MachineOperand::CreateImm(AMDGPU::hi16));
+  } break;
+  case AMDGPU::REG_SEQUENCE: {
+    for (unsigned I = 0, E = (Inst.getNumOperands() - 1) / 2; I < E; ++I) {
+      Register SrcReg = Inst.getOperand(1 + 2 * I).getReg();
+      auto SubReg = Inst.getOperand(1 + 2 * I + 1).getImm();
+      if (SrcReg.isVirtual() && RI.isVGPR(MRI, SrcReg) &&
+          MRI.constrainRegClass(SrcReg, &AMDGPU::VGPR_32RegClass) &&
+          (SubReg == AMDGPU::lo16 || SubReg == AMDGPU::hi16)) {
+        Inst.getOperand(1 + 2 * I).setSubReg(AMDGPU::lo16);
+      }
+    }
+  } break;
+  }
 }
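For illustration, sketches of the rewrites in MIR-like notation (register numbers are invented, not taken from a test in this change). Copy case 1:

    dst32 = COPY %0:vgpr_16
  =>
    %2:vgpr_16 = IMPLICIT_DEF
    dst32 = REG_SEQUENCE %0:vgpr_16, %subreg.lo16, %2:vgpr_16, %subreg.hi16

Copy case 2 keeps the source subregister:

    dst32 = COPY %0.hi16:vgpr_32
  =>
    %2:vgpr_16 = IMPLICIT_DEF
    dst32 = REG_SEQUENCE %0.hi16:vgpr_32, %subreg.lo16, %2:vgpr_16, %subreg.hi16

The REG_SEQUENCE case narrows a mismatched 32-bit source to its low half:

    dst32 = REG_SEQUENCE %0:vgpr_32, %subreg.lo16
  =>
    dst32 = REG_SEQUENCE %0.lo16:vgpr_32, %subreg.lo16

The destination register class is not changed here; the surrounding moveToVALU machinery is expected to switch it to a VGPR class afterwards.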
 
 void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
@@ -8129,14 +8147,8 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
     return;
   }
 
-  // If this is a v2s copy between 16bit and 32bit reg,
-  // replace vgpr copy to reg_sequence
-  if (ST.useRealTrue16Insts() && Inst.isCopy() &&
-      Inst.getOperand(1).getReg().isVirtual() &&
-      RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
-    if (legalizeV2SCopyt16(Inst, MRI, Worklist))
-      return;
-  }
+  if (ST.useRealTrue16Insts())
+    legalizeSpecialInst_t16(Inst, MRI);
 
   if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
       NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {