Skip to content

Commit 1d0a85a

Browse files
authored
[AMDGPU][True16][CodeGen] Add patterns to reduce intermediates (#162047)
Add patterns that reduce `or` operations to register sequences when combining i16 values into an i32. This removes many intermediate VGPRs and reduces register pressure.
1 parent 36f26d4 commit 1d0a85a

14 files changed

+7775
-10223
lines changed

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3726,6 +3726,23 @@ def : GCNPat <
37263726
} // End foreach Ty = ...
37273727
} // End AddedComplexity = 1
37283728

3729+
let True16Predicate = UseRealTrue16Insts in {
3730+
def : GCNPat<
3731+
(i32 (DivergentBinFrag<or>
3732+
(i32 (zext i16:$src_lo)),
3733+
(i32 (bitconvert (v2i16 (build_vector (i16 0), (i16 VGPR_16:$src_hi)))))
3734+
)),
3735+
(REG_SEQUENCE VGPR_32, $src_lo, lo16, $src_hi, hi16)
3736+
>;
3737+
def : GCNPat<
3738+
(i32 (DivergentBinFrag<or>
3739+
(i32 (bitconvert (v2i16 (build_vector (i16 0), (i16 VGPR_16:$src_hi))))),
3740+
(i32 (zext i16:$src_lo))
3741+
)),
3742+
(REG_SEQUENCE VGPR_32, $src_lo, lo16, $src_hi, hi16)
3743+
>;
3744+
}
3745+
37293746
let True16Predicate = UseRealTrue16Insts in
37303747
def : GCNPat <
37313748
(v2i16 (DivergentBinFrag<build_vector> (i16 undef), (i16 (trunc i32:$src1)))),

llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll

Lines changed: 4156 additions & 5167 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll

Lines changed: 357 additions & 518 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll

Lines changed: 420 additions & 568 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll

Lines changed: 800 additions & 1134 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll

Lines changed: 36 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -2279,17 +2279,13 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) {
22792279
; GFX11-TRUE16-NEXT: .LBB22_3: ; %cmp.false
22802280
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
22812281
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
2282-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
2283-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
2284-
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l
2282+
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
2283+
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
2284+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2285+
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
22852286
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h
2286-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
2287-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
22882287
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
22892288
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
2290-
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
2291-
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
2292-
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
22932289
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
22942290
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2
22952291
; GFX11-TRUE16-NEXT: .LBB22_4: ; %cmp.true
@@ -2301,13 +2297,9 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) {
23012297
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
23022298
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
23032299
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
2304-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
2305-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
2306-
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l
2300+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2301+
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
23072302
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
2308-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
2309-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
2310-
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
23112303
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
23122304
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
23132305
;
@@ -4530,17 +4522,13 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) {
45304522
; GFX11-TRUE16-NEXT: .LBB42_3: ; %cmp.false
45314523
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
45324524
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
4533-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
4534-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
4535-
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l
4525+
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
4526+
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
4527+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
4528+
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
45364529
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h
4537-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
4538-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
45394530
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
45404531
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
4541-
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
4542-
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
4543-
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
45444532
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
45454533
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_2
45464534
; GFX11-TRUE16-NEXT: .LBB42_4: ; %cmp.true
@@ -4552,13 +4540,9 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) {
45524540
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
45534541
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
45544542
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
4555-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
4556-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
4557-
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l
4543+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
4544+
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
45584545
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
4559-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
4560-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
4561-
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
45624546
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
45634547
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
45644548
;
@@ -6487,17 +6471,13 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) {
64876471
; GFX11-TRUE16-NEXT: .LBB58_3: ; %cmp.false
64886472
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
64896473
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
6490-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
6491-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
6492-
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l
6474+
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
6475+
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
6476+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
6477+
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
64936478
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h
6494-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
6495-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
64966479
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
64976480
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
6498-
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
6499-
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
6500-
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
65016481
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
65026482
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2
65036483
; GFX11-TRUE16-NEXT: .LBB58_4: ; %cmp.true
@@ -6509,13 +6489,9 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) {
65096489
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
65106490
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
65116491
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
6512-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
6513-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
6514-
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l
6492+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
6493+
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
65156494
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
6516-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
6517-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
6518-
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
65196495
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
65206496
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
65216497
;
@@ -8138,17 +8114,13 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) {
81388114
; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false
81398115
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
81408116
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
8141-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
8142-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
8143-
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l
8117+
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
8118+
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
8119+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
8120+
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
81448121
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h
8145-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
8146-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
81478122
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
81488123
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
8149-
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
8150-
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
8151-
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
81528124
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
81538125
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2
81548126
; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true
@@ -8160,13 +8132,9 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) {
81608132
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
81618133
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
81628134
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
8163-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
8164-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
8165-
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l
8135+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
8136+
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
81668137
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
8167-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
8168-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
8169-
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
81708138
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
81718139
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
81728140
;
@@ -9502,17 +9470,13 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) {
95029470
; GFX11-TRUE16-NEXT: .LBB78_3: ; %cmp.false
95039471
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
95049472
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
9505-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
9506-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
9507-
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l
9473+
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
9474+
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
9475+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
9476+
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
95089477
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h
9509-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
9510-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
95119478
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
95129479
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
9513-
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
9514-
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
9515-
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
95169480
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
95179481
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB78_2
95189482
; GFX11-TRUE16-NEXT: .LBB78_4: ; %cmp.true
@@ -9524,13 +9488,9 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) {
95249488
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
95259489
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
95269490
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
9527-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
9528-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
9529-
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l
9491+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
9492+
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
95309493
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
9531-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
9532-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
9533-
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
95349494
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
95359495
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
95369496
;
@@ -10212,17 +10172,13 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) {
1021210172
; GFX11-TRUE16-NEXT: .LBB82_3: ; %cmp.false
1021310173
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
1021410174
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
10215-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
10216-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
10217-
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l
10175+
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
10176+
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
10177+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
10178+
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
1021810179
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h
10219-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
10220-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
1022110180
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
1022210181
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
10223-
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
10224-
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
10225-
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
1022610182
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
1022710183
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB82_2
1022810184
; GFX11-TRUE16-NEXT: .LBB82_4: ; %cmp.true
@@ -10234,13 +10190,9 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) {
1023410190
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1023510191
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
1023610192
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
10237-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
10238-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
10239-
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l
10193+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
10194+
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
1024010195
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
10241-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
10242-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
10243-
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
1024410196
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
1024510197
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
1024610198
;

0 commit comments

Comments
 (0)