Skip to content

Commit cda47e5

Browse files
author
Abhinav Garg
committed
Address review comments: Scalarize v2s16 for uniform operation and implement combine logic for ReadAnyLane + Trunc + AnyExt.
1 parent 48f1915 commit cda47e5

File tree

6 files changed

+99
-132
lines changed

6 files changed

+99
-132
lines changed

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
2525
#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
2626
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
27+
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
2728
#include "llvm/CodeGen/GlobalISel/Utils.h"
2829
#include "llvm/CodeGen/MachineFunctionPass.h"
2930
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
@@ -34,9 +35,17 @@
3435

3536
using namespace llvm;
3637
using namespace AMDGPU;
38+
using namespace llvm::MIPatternMatch;
3739

3840
namespace {
3941

42+
// AMDGPU-specific pattern matchers
43+
// Builds a MIPatternMatch matcher for the AMDGPU::G_AMDGPU_READANYLANE opcode.
// Src is the sub-pattern handed to UnaryOp_match for the instruction's source
// operand (for example m_Reg(R) to capture the source register). The returned
// matcher is meant to be passed to mi_match, optionally nested inside other
// matchers such as m_GTrunc / m_GAnyExt.
template <typename SrcTy>
44+
inline UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE>
45+
m_GAMDGPUReadAnyLane(const SrcTy &Src) {
46+
return UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE>(Src);
47+
}
48+
4049
class AMDGPURegBankLegalize : public MachineFunctionPass {
4150
public:
4251
static char ID;
@@ -160,10 +169,18 @@ AMDGPURegBankLegalizeCombiner::tryMatchRALFromUnmerge(Register Src) {
160169

161170
Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) {
162171
// Src = G_AMDGPU_READANYLANE RALSrc
163-
auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
164-
if (RAL)
172+
Register RALSrc;
173+
if (mi_match(Src, MRI, m_GAMDGPUReadAnyLane(m_Reg(RALSrc))))
165174
return RALSrc;
166175

176+
// TruncSrc = G_AMDGPU_READANYLANE RALSrc
177+
// AextSrc = G_TRUNC TruncSrc
178+
// Src = G_ANYEXT AextSrc
179+
if (mi_match(Src, MRI,
180+
m_GAnyExt(m_GTrunc(m_GAMDGPUReadAnyLane(m_Reg(RALSrc)))))) {
181+
return RALSrc;
182+
}
183+
167184
// LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc
168185
// LoSgpr = G_AMDGPU_READANYLANE LoVgpr
169186
// HiSgpr = G_AMDGPU_READANYLANE HiVgpr

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -616,6 +616,23 @@ void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
616616
MI.eraseFromParent();
617617
}
618618

619+
// Lowering used for the ScalarizeToS16 method: rewrites an instruction with a
// V2S16 result and two source operands (operands 1 and 2) into two
// instructions with the same opcode and flags, one per 16-bit half, and merges
// the two S16 results back into the original destination register. MI is
// erased once the replacement sequence has been built.
void RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
620+
Register Dst = MI.getOperand(0).getReg();
621+
// Only a V2S16 destination is supported by this lowering.
assert(MRI.getType(Dst) == V2S16);
622+
// NOTE(review): unpackAExt presumably returns the low and high halves of the
// packed source, each any-extended to 32 bits - confirm against its definition.
auto [Op0Lo32, Op0Hi32] = unpackAExt(MI.getOperand(1).getReg());
623+
auto [Op1Lo32, Op1Hi32] = unpackAExt(MI.getOperand(2).getReg());
624+
// Reuse the original opcode and MI flags for both per-half instructions.
unsigned Opc = MI.getOpcode();
625+
auto Flags = MI.getFlags();
626+
// Truncate each unpacked 32-bit half to an S16 value created with the
// SgprRB_S16 register attributes.
auto Op0Lo = B.buildTrunc(SgprRB_S16, Op0Lo32);
627+
auto Op0Hi = B.buildTrunc(SgprRB_S16, Op0Hi32);
628+
auto Op1Lo = B.buildTrunc(SgprRB_S16, Op1Lo32);
629+
auto Op1Hi = B.buildTrunc(SgprRB_S16, Op1Hi32);
630+
// Apply the operation to the low halves, then to the high halves.
auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op0Lo, Op1Lo}, Flags);
631+
auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op0Hi, Op1Hi}, Flags);
632+
// Recombine the two S16 results into the original V2S16 destination.
B.buildMergeLikeInstr(Dst, {Lo, Hi});
633+
MI.eraseFromParent();
634+
}
635+
619636
void RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
620637
Register Dst = MI.getOperand(0).getReg();
621638
LLT DstTy = MRI.getType(Dst);
@@ -688,6 +705,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
688705
return lowerUnpackBitShift(MI);
689706
case UnpackMinMax:
690707
return lowerUnpackMinMax(MI);
708+
case ScalarizeToS16:
709+
return lowerSplitTo16(MI);
691710
case Ext32To64: {
692711
const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
693712
MachineInstrBuilder Hi;

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ class RegBankLegalizeHelper {
7272
static constexpr LLT P6 = LLT::pointer(6, 32);
7373

7474
MachineRegisterInfo::VRegAttrs SgprRB_S32 = {SgprRB, S32};
75+
MachineRegisterInfo::VRegAttrs SgprRB_S16 = {SgprRB, S16};
7576
MachineRegisterInfo::VRegAttrs VgprRB_S32 = {VgprRB, S32};
7677
MachineRegisterInfo::VRegAttrs VccRB_S1 = {VccRB, S1};
7778

@@ -121,6 +122,7 @@ class RegBankLegalizeHelper {
121122
void lowerV_BFE(MachineInstr &MI);
122123
void lowerS_BFE(MachineInstr &MI);
123124
void lowerSplitTo32(MachineInstr &MI);
125+
void lowerSplitTo16(MachineInstr &MI);
124126
void lowerSplitTo32Select(MachineInstr &MI);
125127
void lowerSplitTo32SExtInReg(MachineInstr &MI);
126128
void lowerUnpackMinMax(MachineInstr &MI);

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -914,7 +914,9 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
914914
.Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
915915
.Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}})
916916
.Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}})
917-
.Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}})
917+
.Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}}, !hasSALUFloat)
918+
.Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, ScalarizeToS16},
919+
hasSALUFloat)
918920
.Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
919921
.Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32, VgprV2S32}}})
920922
.Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32, VgprV2S32}}});

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,7 @@ enum LoweringMethodID {
221221
V_BFE,
222222
VgprToVccCopy,
223223
SplitTo32,
224+
ScalarizeToS16,
224225
SplitTo32Select,
225226
SplitTo32SExtInReg,
226227
Ext32To64,
Lines changed: 55 additions & 129 deletions
Original file line numberDiff line numberDiff line change
@@ -1,39 +1,19 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
2-
; RUN: llc -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
3-
; RUN: llc -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
4-
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
5-
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
6-
; RUN: llc -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
7-
; RUN: llc -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
8-
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
9-
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
2+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-FAKE16 %s
3+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-TRUE16 %s
4+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-FAKE16 %s
5+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-TRUE16 %s
106

117
define amdgpu_ps half @fadd_s16_uniform(half inreg %a, half inreg %b) {
12-
; GFX11-SDAG-FAKE16-LABEL: fadd_s16_uniform:
13-
; GFX11-SDAG-FAKE16: ; %bb.0:
14-
; GFX11-SDAG-FAKE16-NEXT: v_add_f16_e64 v0, s0, s1
15-
; GFX11-SDAG-FAKE16-NEXT: ; return to shader part epilog
8+
; GFX11-FAKE16-LABEL: fadd_s16_uniform:
9+
; GFX11-FAKE16: ; %bb.0:
10+
; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s0, s1
11+
; GFX11-FAKE16-NEXT: ; return to shader part epilog
1612
;
17-
; GFX11-SDAG-TRUE16-LABEL: fadd_s16_uniform:
18-
; GFX11-SDAG-TRUE16: ; %bb.0:
19-
; GFX11-SDAG-TRUE16-NEXT: v_add_f16_e64 v0.l, s0, s1
20-
; GFX11-SDAG-TRUE16-NEXT: ; return to shader part epilog
21-
;
22-
; GFX11-GISEL-FAKE16-LABEL: fadd_s16_uniform:
23-
; GFX11-GISEL-FAKE16: ; %bb.0:
24-
; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e64 v0, s0, s1
25-
; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26-
; GFX11-GISEL-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
27-
; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s0
28-
; GFX11-GISEL-FAKE16-NEXT: ; return to shader part epilog
29-
;
30-
; GFX11-GISEL-TRUE16-LABEL: fadd_s16_uniform:
31-
; GFX11-GISEL-TRUE16: ; %bb.0:
32-
; GFX11-GISEL-TRUE16-NEXT: v_add_f16_e64 v0.l, s0, s1
33-
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
34-
; GFX11-GISEL-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
35-
; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, s0
36-
; GFX11-GISEL-TRUE16-NEXT: ; return to shader part epilog
13+
; GFX11-TRUE16-LABEL: fadd_s16_uniform:
14+
; GFX11-TRUE16: ; %bb.0:
15+
; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s0, s1
16+
; GFX11-TRUE16-NEXT: ; return to shader part epilog
3717
;
3818
; GFX12-LABEL: fadd_s16_uniform:
3919
; GFX12: ; %bb.0:
@@ -46,45 +26,25 @@ define amdgpu_ps half @fadd_s16_uniform(half inreg %a, half inreg %b) {
4626
}
4727

4828
define amdgpu_ps half @fadd_s16_div(half %a, half %b) {
49-
; GFX11-SDAG-FAKE16-LABEL: fadd_s16_div:
50-
; GFX11-SDAG-FAKE16: ; %bb.0:
51-
; GFX11-SDAG-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1
52-
; GFX11-SDAG-FAKE16-NEXT: ; return to shader part epilog
53-
;
54-
; GFX11-SDAG-TRUE16-LABEL: fadd_s16_div:
55-
; GFX11-SDAG-TRUE16: ; %bb.0:
56-
; GFX11-SDAG-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
57-
; GFX11-SDAG-TRUE16-NEXT: ; return to shader part epilog
58-
;
59-
; GFX11-GISEL-FAKE16-LABEL: fadd_s16_div:
60-
; GFX11-GISEL-FAKE16: ; %bb.0:
61-
; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1
62-
; GFX11-GISEL-FAKE16-NEXT: ; return to shader part epilog
63-
;
64-
; GFX11-GISEL-TRUE16-LABEL: fadd_s16_div:
65-
; GFX11-GISEL-TRUE16: ; %bb.0:
66-
; GFX11-GISEL-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
67-
; GFX11-GISEL-TRUE16-NEXT: ; return to shader part epilog
68-
;
69-
; GFX12-SDAG-FAKE16-LABEL: fadd_s16_div:
70-
; GFX12-SDAG-FAKE16: ; %bb.0:
71-
; GFX12-SDAG-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1
72-
; GFX12-SDAG-FAKE16-NEXT: ; return to shader part epilog
73-
;
74-
; GFX12-SDAG-TRUE16-LABEL: fadd_s16_div:
75-
; GFX12-SDAG-TRUE16: ; %bb.0:
76-
; GFX12-SDAG-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
77-
; GFX12-SDAG-TRUE16-NEXT: ; return to shader part epilog
78-
;
79-
; GFX12-GISEL-FAKE16-LABEL: fadd_s16_div:
80-
; GFX12-GISEL-FAKE16: ; %bb.0:
81-
; GFX12-GISEL-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1
82-
; GFX12-GISEL-FAKE16-NEXT: ; return to shader part epilog
83-
;
84-
; GFX12-GISEL-TRUE16-LABEL: fadd_s16_div:
85-
; GFX12-GISEL-TRUE16: ; %bb.0:
86-
; GFX12-GISEL-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
87-
; GFX12-GISEL-TRUE16-NEXT: ; return to shader part epilog
29+
; GFX11-FAKE16-LABEL: fadd_s16_div:
30+
; GFX11-FAKE16: ; %bb.0:
31+
; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1
32+
; GFX11-FAKE16-NEXT: ; return to shader part epilog
33+
;
34+
; GFX11-TRUE16-LABEL: fadd_s16_div:
35+
; GFX11-TRUE16: ; %bb.0:
36+
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
37+
; GFX11-TRUE16-NEXT: ; return to shader part epilog
38+
;
39+
; GFX12-FAKE16-LABEL: fadd_s16_div:
40+
; GFX12-FAKE16: ; %bb.0:
41+
; GFX12-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1
42+
; GFX12-FAKE16-NEXT: ; return to shader part epilog
43+
;
44+
; GFX12-TRUE16-LABEL: fadd_s16_div:
45+
; GFX12-TRUE16: ; %bb.0:
46+
; GFX12-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
47+
; GFX12-TRUE16-NEXT: ; return to shader part epilog
8848
%fadd = fadd half %a, %b
8949
ret half %fadd
9050
}
@@ -155,92 +115,58 @@ define amdgpu_ps double @fadd_s64_div(double %a, double %b) {
155115
ret double %fadd
156116
}
157117

158-
define <2 x half> @fadd_v2s16_uniform(<2 x half> inreg %a, <2 x half> inreg %b) {
118+
define amdgpu_ps <2 x half> @fadd_v2s16_uniform(<2 x half> inreg %a, <2 x half> inreg %b) {
159119
; GFX11-LABEL: fadd_v2s16_uniform:
160120
; GFX11: ; %bb.0:
161-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
162121
; GFX11-NEXT: v_pk_add_f16 v0, s0, s1
163-
; GFX11-NEXT: s_setpc_b64 s[30:31]
122+
; GFX11-NEXT: ; return to shader part epilog
164123
;
165124
; GFX12-LABEL: fadd_v2s16_uniform:
166125
; GFX12: ; %bb.0:
167-
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
168-
; GFX12-NEXT: s_wait_expcnt 0x0
169-
; GFX12-NEXT: s_wait_samplecnt 0x0
170-
; GFX12-NEXT: s_wait_bvhcnt 0x0
171-
; GFX12-NEXT: s_wait_kmcnt 0x0
172-
; GFX12-NEXT: v_pk_add_f16 v0, s0, s1
173-
; GFX12-NEXT: s_setpc_b64 s[30:31]
126+
; GFX12-NEXT: s_lshr_b32 s2, s0, 16
127+
; GFX12-NEXT: s_lshr_b32 s3, s1, 16
128+
; GFX12-NEXT: s_add_f16 s0, s0, s1
129+
; GFX12-NEXT: s_add_f16 s1, s2, s3
130+
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
131+
; GFX12-NEXT: s_pack_ll_b32_b16 s0, s0, s1
132+
; GFX12-NEXT: v_mov_b32_e32 v0, s0
133+
; GFX12-NEXT: ; return to shader part epilog
174134
%fadd = fadd <2 x half> %a, %b
175135
ret <2 x half> %fadd
176136
}
177137

178-
define <2 x half> @fadd_v2s16_div(<2 x half> %a, <2 x half> %b) {
179-
; GFX11-LABEL: fadd_v2s16_div:
180-
; GFX11: ; %bb.0:
181-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
182-
; GFX11-NEXT: v_pk_add_f16 v0, v0, v1
183-
; GFX11-NEXT: s_setpc_b64 s[30:31]
184-
;
185-
; GFX12-LABEL: fadd_v2s16_div:
186-
; GFX12: ; %bb.0:
187-
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
188-
; GFX12-NEXT: s_wait_expcnt 0x0
189-
; GFX12-NEXT: s_wait_samplecnt 0x0
190-
; GFX12-NEXT: s_wait_bvhcnt 0x0
191-
; GFX12-NEXT: s_wait_kmcnt 0x0
192-
; GFX12-NEXT: v_pk_add_f16 v0, v0, v1
193-
; GFX12-NEXT: s_setpc_b64 s[30:31]
138+
define amdgpu_ps <2 x half> @fadd_v2s16_div(<2 x half> %a, <2 x half> %b) {
139+
; GCN-LABEL: fadd_v2s16_div:
140+
; GCN: ; %bb.0:
141+
; GCN-NEXT: v_pk_add_f16 v0, v0, v1
142+
; GCN-NEXT: ; return to shader part epilog
194143
%fadd = fadd <2 x half> %a, %b
195144
ret <2 x half> %fadd
196145
}
197146

198-
define <2 x float> @fadd_v2s32_uniform(<2 x float> inreg %a, <2 x float> inreg %b) {
147+
define amdgpu_ps <2 x float> @fadd_v2s32_uniform(<2 x float> inreg %a, <2 x float> inreg %b) {
199148
; GFX11-LABEL: fadd_v2s32_uniform:
200149
; GFX11: ; %bb.0:
201-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
202150
; GFX11-NEXT: v_add_f32_e64 v0, s0, s2
203151
; GFX11-NEXT: v_add_f32_e64 v1, s1, s3
204-
; GFX11-NEXT: s_setpc_b64 s[30:31]
152+
; GFX11-NEXT: ; return to shader part epilog
205153
;
206154
; GFX12-LABEL: fadd_v2s32_uniform:
207155
; GFX12: ; %bb.0:
208-
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
209-
; GFX12-NEXT: s_wait_expcnt 0x0
210-
; GFX12-NEXT: s_wait_samplecnt 0x0
211-
; GFX12-NEXT: s_wait_bvhcnt 0x0
212-
; GFX12-NEXT: s_wait_kmcnt 0x0
213156
; GFX12-NEXT: s_add_f32 s0, s0, s2
214157
; GFX12-NEXT: s_add_f32 s1, s1, s3
215-
; GFX12-NEXT: s_wait_alu 0xfffe
216-
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
158+
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
217159
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
218-
; GFX12-NEXT: s_setpc_b64 s[30:31]
160+
; GFX12-NEXT: ; return to shader part epilog
219161
%fadd = fadd <2 x float> %a, %b
220162
ret <2 x float> %fadd
221163
}
222164

223-
define <2 x float> @fadd_v2s32_div(<2 x float> %a, <2 x float> %b) {
224-
; GFX11-LABEL: fadd_v2s32_div:
225-
; GFX11: ; %bb.0:
226-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
227-
; GFX11-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3
228-
; GFX11-NEXT: s_setpc_b64 s[30:31]
229-
;
230-
; GFX12-LABEL: fadd_v2s32_div:
231-
; GFX12: ; %bb.0:
232-
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
233-
; GFX12-NEXT: s_wait_expcnt 0x0
234-
; GFX12-NEXT: s_wait_samplecnt 0x0
235-
; GFX12-NEXT: s_wait_bvhcnt 0x0
236-
; GFX12-NEXT: s_wait_kmcnt 0x0
237-
; GFX12-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3
238-
; GFX12-NEXT: s_setpc_b64 s[30:31]
165+
define amdgpu_ps <2 x float> @fadd_v2s32_div(<2 x float> %a, <2 x float> %b) {
166+
; GCN-LABEL: fadd_v2s32_div:
167+
; GCN: ; %bb.0:
168+
; GCN-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3
169+
; GCN-NEXT: ; return to shader part epilog
239170
%fadd = fadd <2 x float> %a, %b
240171
ret <2 x float> %fadd
241172
}
242-
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
243-
; GFX11-GISEL: {{.*}}
244-
; GFX11-SDAG: {{.*}}
245-
; GFX12-GISEL: {{.*}}
246-
; GFX12-SDAG: {{.*}}

0 commit comments

Comments
 (0)