@@ -17,98 +17,93 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
1717; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
1818; CHECK-NEXT: s_mov_b32 exec_lo, s4
1919; CHECK-NEXT: ; implicit-def: $vgpr8
20- ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
21- ; CHECK-NEXT: v_mov_b32_e32 v14, v1
20+ ; CHECK-NEXT: v_mov_b32_e32 v8, v0
21+ ; CHECK-NEXT: s_or_saveexec_b32 s21, -1
22+ ; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
23+ ; CHECK-NEXT: s_mov_b32 exec_lo, s21
24+ ; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
25+ ; CHECK-NEXT: v_mov_b32_e32 v15, v1
26+ ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
27+ ; CHECK-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
28+ ; CHECK-NEXT: v_mov_b32_e32 v14, v2
2229; CHECK-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
23- ; CHECK-NEXT: v_mov_b32_e32 v13, v2
30+ ; CHECK-NEXT: v_mov_b32_e32 v13, v3
2431; CHECK-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
25- ; CHECK-NEXT: v_mov_b32_e32 v12, v3
32+ ; CHECK-NEXT: v_mov_b32_e32 v12, v4
2633; CHECK-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
27- ; CHECK-NEXT: v_mov_b32_e32 v11, v4
34+ ; CHECK-NEXT: v_mov_b32_e32 v11, v5
2835; CHECK-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
29- ; CHECK-NEXT: v_mov_b32_e32 v10, v5
36+ ; CHECK-NEXT: v_mov_b32_e32 v10, v6
3037; CHECK-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
31- ; CHECK-NEXT: v_mov_b32_e32 v9, v6
38+ ; CHECK-NEXT: v_mov_b32_e32 v9, v7
3239; CHECK-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
33- ; CHECK-NEXT: v_mov_b32_e32 v8, v7
34- ; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
35- ; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 killed $exec
36- ; CHECK-NEXT: v_mov_b32_e32 v1, v14
37- ; CHECK-NEXT: v_mov_b32_e32 v2, v13
38- ; CHECK-NEXT: v_mov_b32_e32 v3, v12
39- ; CHECK-NEXT: v_mov_b32_e32 v4, v11
40- ; CHECK-NEXT: v_mov_b32_e32 v5, v10
41- ; CHECK-NEXT: v_mov_b32_e32 v6, v9
42- ; CHECK-NEXT: v_mov_b32_e32 v7, v8
43- ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
40+ ; CHECK-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 killed $exec
41+ ; CHECK-NEXT: v_mov_b32_e32 v2, v15
42+ ; CHECK-NEXT: v_mov_b32_e32 v3, v14
43+ ; CHECK-NEXT: v_mov_b32_e32 v4, v13
44+ ; CHECK-NEXT: v_mov_b32_e32 v5, v12
45+ ; CHECK-NEXT: v_mov_b32_e32 v6, v11
46+ ; CHECK-NEXT: v_mov_b32_e32 v7, v10
47+ ; CHECK-NEXT: v_mov_b32_e32 v8, v9
48+ ; CHECK-NEXT: s_waitcnt vmcnt(0)
4449; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
4550; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
4651; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
4752; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
4853; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
4954; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
5055; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
51- ; CHECK-NEXT: s_or_saveexec_b32 s21, -1
52- ; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
53- ; CHECK-NEXT: s_mov_b32 exec_lo, s21
56+ ; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
5457; CHECK-NEXT: s_mov_b32 s8, 0
5558; CHECK-NEXT: s_mov_b32 s4, s8
5659; CHECK-NEXT: s_mov_b32 s5, s8
5760; CHECK-NEXT: s_mov_b32 s6, s8
5861; CHECK-NEXT: s_mov_b32 s7, s8
59- ; CHECK-NEXT: s_waitcnt vmcnt(0)
6062; CHECK-NEXT: v_writelane_b32 v0, s4, 0
6163; CHECK-NEXT: v_writelane_b32 v0, s5, 1
6264; CHECK-NEXT: v_writelane_b32 v0, s6, 2
6365; CHECK-NEXT: v_writelane_b32 v0, s7, 3
64- ; CHECK-NEXT: s_or_saveexec_b32 s21, -1
65- ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
66- ; CHECK-NEXT: s_mov_b32 exec_lo, s21
6766; CHECK-NEXT: s_mov_b32 s6, 0
6867; CHECK-NEXT: s_mov_b32 s4, s6
6968; CHECK-NEXT: s_mov_b32 s5, s6
70- ; CHECK-NEXT: v_mov_b32_e32 v0, s4
71- ; CHECK-NEXT: v_mov_b32_e32 v1, s5
72- ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
69+ ; CHECK-NEXT: v_mov_b32_e32 v1, s4
70+ ; CHECK-NEXT: v_mov_b32_e32 v2, s5
7371; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
74- ; CHECK-NEXT: s_or_saveexec_b32 s21, -1
75- ; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
76- ; CHECK-NEXT: s_mov_b32 exec_lo, s21
72+ ; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
7773; CHECK-NEXT: s_mov_b32 s4, exec_lo
78- ; CHECK-NEXT: s_waitcnt vmcnt(0)
7974; CHECK-NEXT: v_writelane_b32 v0, s4, 4
8075; CHECK-NEXT: s_or_saveexec_b32 s21, -1
81- ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
76+ ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
8277; CHECK-NEXT: s_mov_b32 exec_lo, s21
8378; CHECK-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
84- ; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
79+ ; CHECK-NEXT: s_or_saveexec_b32 s21, -1
80+ ; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
81+ ; CHECK-NEXT: s_mov_b32 exec_lo, s21
8582; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
8683; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
8784; CHECK-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
8885; CHECK-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
8986; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
9087; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
9188; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
92- ; CHECK-NEXT: buffer_load_dword v0 , off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
89+ ; CHECK-NEXT: buffer_load_dword v16 , off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
9390; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
9491; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
9592; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
9693; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
9794; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
9895; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
9996; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
97+ ; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
10098; CHECK-NEXT: s_waitcnt vmcnt(0)
101- ; CHECK-NEXT: v_readfirstlane_b32 s12, v7
102- ; CHECK-NEXT: v_readfirstlane_b32 s10, v6
103- ; CHECK-NEXT: v_readfirstlane_b32 s9, v5
104- ; CHECK-NEXT: v_readfirstlane_b32 s8, v4
105- ; CHECK-NEXT: v_readfirstlane_b32 s7, v3
106- ; CHECK-NEXT: v_readfirstlane_b32 s6, v2
107- ; CHECK-NEXT: v_readfirstlane_b32 s5, v1
108- ; CHECK-NEXT: v_readfirstlane_b32 s4, v0
109- ; CHECK-NEXT: s_or_saveexec_b32 s21, -1
110- ; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
111- ; CHECK-NEXT: s_mov_b32 exec_lo, s21
99+ ; CHECK-NEXT: v_readfirstlane_b32 s12, v8
100+ ; CHECK-NEXT: v_readfirstlane_b32 s10, v7
101+ ; CHECK-NEXT: v_readfirstlane_b32 s9, v6
102+ ; CHECK-NEXT: v_readfirstlane_b32 s8, v5
103+ ; CHECK-NEXT: v_readfirstlane_b32 s7, v4
104+ ; CHECK-NEXT: v_readfirstlane_b32 s6, v3
105+ ; CHECK-NEXT: v_readfirstlane_b32 s5, v2
106+ ; CHECK-NEXT: v_readfirstlane_b32 s4, v1
112107; CHECK-NEXT: ; kill: def $sgpr12 killed $sgpr12 def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
113108; CHECK-NEXT: s_mov_b32 s13, s10
114109; CHECK-NEXT: s_mov_b32 s14, s9
@@ -117,7 +112,6 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
117112; CHECK-NEXT: s_mov_b32 s17, s6
118113; CHECK-NEXT: s_mov_b32 s18, s5
119114; CHECK-NEXT: s_mov_b32 s19, s4
120- ; CHECK-NEXT: s_waitcnt vmcnt(0)
121115; CHECK-NEXT: v_writelane_b32 v0, s12, 5
122116; CHECK-NEXT: v_writelane_b32 v0, s13, 6
123117; CHECK-NEXT: v_writelane_b32 v0, s14, 7
@@ -126,45 +120,38 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
126120; CHECK-NEXT: v_writelane_b32 v0, s17, 10
127121; CHECK-NEXT: v_writelane_b32 v0, s18, 11
128122; CHECK-NEXT: v_writelane_b32 v0, s19, 12
129- ; CHECK-NEXT: s_or_saveexec_b32 s21, -1
130- ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
131- ; CHECK-NEXT: s_mov_b32 exec_lo, s21
132- ; CHECK-NEXT: v_mov_b32_e32 v6, v8
133123; CHECK-NEXT: v_mov_b32_e32 v7, v9
134- ; CHECK-NEXT: v_mov_b32_e32 v4 , v10
124+ ; CHECK-NEXT: v_mov_b32_e32 v8 , v10
135125; CHECK-NEXT: v_mov_b32_e32 v5, v11
136- ; CHECK-NEXT: v_mov_b32_e32 v2 , v12
126+ ; CHECK-NEXT: v_mov_b32_e32 v6 , v12
137127; CHECK-NEXT: v_mov_b32_e32 v3, v13
138- ; CHECK-NEXT: v_mov_b32_e32 v0 , v14
128+ ; CHECK-NEXT: v_mov_b32_e32 v4 , v14
139129; CHECK-NEXT: v_mov_b32_e32 v1, v15
130+ ; CHECK-NEXT: v_mov_b32_e32 v2, v16
140131; CHECK-NEXT: s_mov_b64 s[4:5], s[12:13]
141132; CHECK-NEXT: s_mov_b64 s[10:11], s[14:15]
142133; CHECK-NEXT: s_mov_b64 s[8:9], s[16:17]
143134; CHECK-NEXT: s_mov_b64 s[6:7], s[18:19]
144- ; CHECK-NEXT: v_cmp_eq_u64_e64 s4, s[4:5], v[6:7 ]
145- ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[10:11], v[4:5 ]
135+ ; CHECK-NEXT: v_cmp_eq_u64_e64 s4, s[4:5], v[7:8 ]
136+ ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[10:11], v[5:6 ]
146137; CHECK-NEXT: s_and_b32 s4, s4, s5
147- ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[8:9], v[2:3 ]
138+ ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[8:9], v[3:4 ]
148139; CHECK-NEXT: s_and_b32 s4, s4, s5
149- ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[6:7], v[0:1]
150- ; CHECK-NEXT: s_or_saveexec_b32 s21, -1
151- ; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
152- ; CHECK-NEXT: s_mov_b32 exec_lo, s21
140+ ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[6:7], v[1:2]
153141; CHECK-NEXT: s_and_b32 s4, s4, s5
154142; CHECK-NEXT: s_and_saveexec_b32 s4, s4
155- ; CHECK-NEXT: s_waitcnt vmcnt(0)
156143; CHECK-NEXT: v_writelane_b32 v0, s4, 13
157144; CHECK-NEXT: s_or_saveexec_b32 s21, -1
158- ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
145+ ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
159146; CHECK-NEXT: s_mov_b32 exec_lo, s21
160147; CHECK-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1
148+ ; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
149+ ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
161150; CHECK-NEXT: s_or_saveexec_b32 s21, -1
162- ; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
151+ ; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
163152; CHECK-NEXT: s_mov_b32 exec_lo, s21
164153; CHECK-NEXT: s_waitcnt vmcnt(0)
165154; CHECK-NEXT: v_readlane_b32 s4, v2, 13
166- ; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
167- ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
168155; CHECK-NEXT: v_readlane_b32 s8, v2, 5
169156; CHECK-NEXT: v_readlane_b32 s9, v2, 6
170157; CHECK-NEXT: v_readlane_b32 s10, v2, 7
@@ -177,24 +164,23 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
177164; CHECK-NEXT: v_readlane_b32 s17, v2, 1
178165; CHECK-NEXT: v_readlane_b32 s18, v2, 2
179166; CHECK-NEXT: v_readlane_b32 s19, v2, 3
180- ; CHECK-NEXT: s_waitcnt vmcnt(0)
181167; CHECK-NEXT: image_sample v0, v[0:1], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D
182168; CHECK-NEXT: s_waitcnt vmcnt(0)
183- ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
169+ ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
184170; CHECK-NEXT: s_xor_b32 exec_lo, exec_lo, s4
185171; CHECK-NEXT: s_cbranch_execnz .LBB0_1
186172; CHECK-NEXT: ; %bb.3:
187173; CHECK-NEXT: s_or_saveexec_b32 s21, -1
188- ; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
174+ ; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
189175; CHECK-NEXT: s_mov_b32 exec_lo, s21
190176; CHECK-NEXT: s_waitcnt vmcnt(0)
191177; CHECK-NEXT: v_readlane_b32 s4, v0, 4
192178; CHECK-NEXT: s_mov_b32 exec_lo, s4
193179; CHECK-NEXT: ; %bb.4:
194180; CHECK-NEXT: s_or_saveexec_b32 s21, -1
195- ; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
181+ ; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
196182; CHECK-NEXT: s_mov_b32 exec_lo, s21
197- ; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
183+ ; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
198184; CHECK-NEXT: ; implicit-def: $sgpr4
199185; CHECK-NEXT: v_mov_b32_e32 v1, s4
200186; CHECK-NEXT: v_mov_b32_e32 v2, s4
0 commit comments