Skip to content

Commit e580552

Browse files
committed
AMDGPU/GlobalISel: Select v2s32->v2s16 G_TRUNC
It would be nice if there were a way to avoid the tied operand, but as far as I can tell there is no way to use an OR instruction with op_sel to achieve this.
1 parent 361f2a7 commit e580552

File tree

3 files changed

+259
-11
lines changed

3 files changed

+259
-11
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 75 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1208,9 +1208,6 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
12081208
Register SrcReg = I.getOperand(1).getReg();
12091209
const LLT DstTy = MRI->getType(DstReg);
12101210
const LLT SrcTy = MRI->getType(SrcReg);
1211-
if (!DstTy.isScalar())
1212-
return false;
1213-
12141211
const LLT S1 = LLT::scalar(1);
12151212

12161213
const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
@@ -1225,6 +1222,8 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
12251222
return false;
12261223
}
12271224

1225+
const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
1226+
12281227
unsigned DstSize = DstTy.getSizeInBits();
12291228
unsigned SrcSize = SrcTy.getSizeInBits();
12301229

@@ -1233,24 +1232,89 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
12331232
const TargetRegisterClass *DstRC
12341233
= TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI);
12351234

1235+
if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
1236+
!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
1237+
LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
1238+
return false;
1239+
}
1240+
1241+
if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) {
1242+
MachineBasicBlock *MBB = I.getParent();
1243+
const DebugLoc &DL = I.getDebugLoc();
1244+
1245+
Register LoReg = MRI->createVirtualRegister(DstRC);
1246+
Register HiReg = MRI->createVirtualRegister(DstRC);
1247+
BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
1248+
.addReg(SrcReg, 0, AMDGPU::sub0);
1249+
BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
1250+
.addReg(SrcReg, 0, AMDGPU::sub1);
1251+
1252+
if (IsVALU && STI.hasSDWA()) {
1253+
// Write the low 16-bits of the high element into the high 16-bits of the
1254+
// low element.
1255+
MachineInstr *MovSDWA =
1256+
BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
1257+
.addImm(0) // $src0_modifiers
1258+
.addReg(HiReg) // $src0
1259+
.addImm(0) // $clamp
1260+
.addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
1261+
.addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
1262+
.addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
1263+
.addReg(LoReg, RegState::Implicit);
1264+
MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
1265+
} else {
1266+
Register TmpReg0 = MRI->createVirtualRegister(DstRC);
1267+
Register TmpReg1 = MRI->createVirtualRegister(DstRC);
1268+
Register ImmReg = MRI->createVirtualRegister(DstRC);
1269+
if (IsVALU) {
1270+
BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
1271+
.addImm(16)
1272+
.addReg(HiReg);
1273+
} else {
1274+
BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
1275+
.addReg(HiReg)
1276+
.addImm(16);
1277+
}
1278+
1279+
unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
1280+
unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
1281+
unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
1282+
1283+
BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
1284+
.addImm(0xffff);
1285+
BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
1286+
.addReg(LoReg)
1287+
.addReg(ImmReg);
1288+
BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
1289+
.addReg(TmpReg0)
1290+
.addReg(TmpReg1);
1291+
}
1292+
1293+
I.eraseFromParent();
1294+
return true;
1295+
}
1296+
1297+
if (!DstTy.isScalar())
1298+
return false;
1299+
12361300
if (SrcSize > 32) {
12371301
int SubRegIdx = sizeToSubRegIndex(DstSize);
12381302
if (SubRegIdx == -1)
12391303
return false;
12401304

12411305
// Deal with weird cases where the class only partially supports the subreg
12421306
// index.
1243-
SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
1244-
if (!SrcRC)
1307+
const TargetRegisterClass *SrcWithSubRC
1308+
= TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
1309+
if (!SrcWithSubRC)
12451310
return false;
12461311

1247-
I.getOperand(1).setSubReg(SubRegIdx);
1248-
}
1312+
if (SrcWithSubRC != SrcRC) {
1313+
if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
1314+
return false;
1315+
}
12491316

1250-
if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
1251-
!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
1252-
LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
1253-
return false;
1317+
I.getOperand(1).setSubReg(SubRegIdx);
12541318
}
12551319

12561320
I.setDesc(TII.get(TargetOpcode::COPY));
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
2+
# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX6 %s
3+
# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX8 %s
4+
# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX8 %s
5+
6+
---
7+
8+
name: trunc_sgpr_v2s32_to_v2s16
9+
legalized: true
10+
regBankSelected: true
11+
12+
body: |
13+
bb.0:
14+
liveins: $sgpr0_sgpr1
15+
; GFX6-LABEL: name: trunc_sgpr_v2s32_to_v2s16
16+
; GFX6: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
17+
; GFX6: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
18+
; GFX6: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
19+
; GFX6: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 16, implicit-def $scc
20+
; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
21+
; GFX6: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc
22+
; GFX6: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_LSHL_B32_]], [[S_AND_B32_]], implicit-def $scc
23+
; GFX6: S_ENDPGM 0, implicit [[S_OR_B32_]]
24+
; GFX8-LABEL: name: trunc_sgpr_v2s32_to_v2s16
25+
; GFX8: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
26+
; GFX8: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
27+
; GFX8: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
28+
; GFX8: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 16, implicit-def $scc
29+
; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
30+
; GFX8: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc
31+
; GFX8: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_LSHL_B32_]], [[S_AND_B32_]], implicit-def $scc
32+
; GFX8: S_ENDPGM 0, implicit [[S_OR_B32_]]
33+
%0:sgpr(<2 x s32>) =COPY $sgpr0_sgpr1
34+
%1:sgpr(<2 x s16>) = G_TRUNC %0
35+
S_ENDPGM 0, implicit %1
36+
...
37+
38+
---
39+
40+
name: trunc_vgpr_v2s32_to_v2s16
41+
legalized: true
42+
regBankSelected: true
43+
44+
body: |
45+
bb.0:
46+
liveins: $vgpr0_vgpr1
47+
; GFX6-LABEL: name: trunc_vgpr_v2s32_to_v2s16
48+
; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
49+
; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
50+
; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
51+
; GFX6: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY2]], implicit $exec
52+
; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
53+
; GFX6: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec
54+
; GFX6: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_LSHLREV_B32_e64_]], [[V_AND_B32_e64_]], implicit $exec
55+
; GFX6: S_ENDPGM 0, implicit [[V_OR_B32_e64_]]
56+
; GFX8-LABEL: name: trunc_vgpr_v2s32_to_v2s16
57+
; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
58+
; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
59+
; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
60+
; GFX8: [[V_MOV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_MOV_B32_sdwa 0, [[COPY2]], 0, 5, 2, 4, implicit $exec, implicit [[COPY1]](tied-def 0)
61+
; GFX8: S_ENDPGM 0, implicit [[V_MOV_B32_sdwa]]
62+
%0:vgpr(<2 x s32>) =COPY $vgpr0_vgpr1
63+
%1:vgpr(<2 x s16>) = G_TRUNC %0
64+
S_ENDPGM 0, implicit %1
65+
...
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GFX7 %s
3+
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
4+
5+
define i16 @v_trunc_i32_to_i16(i32 %src) {
6+
; GFX7-LABEL: v_trunc_i32_to_i16:
7+
; GFX7: ; %bb.0:
8+
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9+
; GFX7-NEXT: s_setpc_b64 s[30:31]
10+
;
11+
; GFX8-LABEL: v_trunc_i32_to_i16:
12+
; GFX8: ; %bb.0:
13+
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14+
; GFX8-NEXT: s_setpc_b64 s[30:31]
15+
%trunc = trunc i32 %src to i16
16+
ret i16 %trunc
17+
}
18+
19+
define amdgpu_ps i16 @s_trunc_i32_to_i16(i32 inreg %src) {
20+
; GFX7-LABEL: s_trunc_i32_to_i16:
21+
; GFX7: ; %bb.0:
22+
; GFX7-NEXT: ; return to shader part epilog
23+
;
24+
; GFX8-LABEL: s_trunc_i32_to_i16:
25+
; GFX8: ; %bb.0:
26+
; GFX8-NEXT: ; return to shader part epilog
27+
%trunc = trunc i32 %src to i16
28+
ret i16 %trunc
29+
}
30+
31+
define i16 @v_trunc_i64_to_i16(i64 %src) {
32+
; GFX7-LABEL: v_trunc_i64_to_i16:
33+
; GFX7: ; %bb.0:
34+
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35+
; GFX7-NEXT: s_setpc_b64 s[30:31]
36+
;
37+
; GFX8-LABEL: v_trunc_i64_to_i16:
38+
; GFX8: ; %bb.0:
39+
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40+
; GFX8-NEXT: s_setpc_b64 s[30:31]
41+
%trunc = trunc i64 %src to i16
42+
ret i16 %trunc
43+
}
44+
45+
define amdgpu_ps i16 @s_trunc_i64_to_i16(i64 inreg %src) {
46+
; GFX7-LABEL: s_trunc_i64_to_i16:
47+
; GFX7: ; %bb.0:
48+
; GFX7-NEXT: ; return to shader part epilog
49+
;
50+
; GFX8-LABEL: s_trunc_i64_to_i16:
51+
; GFX8: ; %bb.0:
52+
; GFX8-NEXT: ; return to shader part epilog
53+
%trunc = trunc i64 %src to i16
54+
ret i16 %trunc
55+
}
56+
57+
define amdgpu_ps i16 @s_trunc_i128_to_i16(i128 inreg %src) {
58+
; GFX7-LABEL: s_trunc_i128_to_i16:
59+
; GFX7: ; %bb.0:
60+
; GFX7-NEXT: ; return to shader part epilog
61+
;
62+
; GFX8-LABEL: s_trunc_i128_to_i16:
63+
; GFX8: ; %bb.0:
64+
; GFX8-NEXT: ; return to shader part epilog
65+
%trunc = trunc i128 %src to i16
66+
ret i16 %trunc
67+
}
68+
69+
define i16 @v_trunc_i128_to_i16(i128 %src) {
70+
; GFX7-LABEL: v_trunc_i128_to_i16:
71+
; GFX7: ; %bb.0:
72+
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
73+
; GFX7-NEXT: s_setpc_b64 s[30:31]
74+
;
75+
; GFX8-LABEL: v_trunc_i128_to_i16:
76+
; GFX8: ; %bb.0:
77+
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
78+
; GFX8-NEXT: s_setpc_b64 s[30:31]
79+
%trunc = trunc i128 %src to i16
80+
ret i16 %trunc
81+
}
82+
83+
define i32 @v_trunc_v2i32_to_v2i16(<2 x i32> %src) {
84+
; GFX7-LABEL: v_trunc_v2i32_to_v2i16:
85+
; GFX7: ; %bb.0:
86+
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
87+
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
88+
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
89+
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
90+
; GFX7-NEXT: s_setpc_b64 s[30:31]
91+
;
92+
; GFX8-LABEL: v_trunc_v2i32_to_v2i16:
93+
; GFX8: ; %bb.0:
94+
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
95+
; GFX8-NEXT: v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
96+
; GFX8-NEXT: s_setpc_b64 s[30:31]
97+
%trunc = trunc <2 x i32> %src to <2 x i16>
98+
%cast = bitcast <2 x i16> %trunc to i32
99+
ret i32 %cast
100+
}
101+
102+
define amdgpu_ps i32 @s_trunc_v2i32_to_v2i16(<2 x i32> inreg %src) {
103+
; GFX7-LABEL: s_trunc_v2i32_to_v2i16:
104+
; GFX7: ; %bb.0:
105+
; GFX7-NEXT: s_lshl_b32 s1, s1, 16
106+
; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
107+
; GFX7-NEXT: s_or_b32 s0, s1, s0
108+
; GFX7-NEXT: ; return to shader part epilog
109+
;
110+
; GFX8-LABEL: s_trunc_v2i32_to_v2i16:
111+
; GFX8: ; %bb.0:
112+
; GFX8-NEXT: s_lshl_b32 s1, s1, 16
113+
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
114+
; GFX8-NEXT: s_or_b32 s0, s1, s0
115+
; GFX8-NEXT: ; return to shader part epilog
116+
%trunc = trunc <2 x i32> %src to <2 x i16>
117+
%cast = bitcast <2 x i16> %trunc to i32
118+
ret i32 %cast
119+
}

0 commit comments

Comments
 (0)