55 ; ModuleID = 'test-wmma-convergent'
66 target triple = "amdgcn-amd-amdhsa"
77
8- define void @wmma_test(ptr addrspace(4) %a, ptr addrspace(4) %b, ptr addrspace(4) %c, float %scale ) {
8+ define void @wmma_test() {
99 entry :
1010 br label %if.then
1111
@@ -25,189 +25,53 @@ body: |
2525 ; CHECK-LABEL: name: wmma_test
2626 ; CHECK: bb.0.entry:
2727 ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
28- ; CHECK-NEXT: liveins: $vgpr0, $sgpr0_sgpr1
2928 ; CHECK-NEXT: {{ $}}
30- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr0_sgpr1
31- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
32- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s128), addrspace 4)
33- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
34- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
35- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
36- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
37- ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
38- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
39- ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4
40- ; CHECK-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[S_MOV_B32_]], [[COPY1]], implicit $exec
41- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR [[REG_SEQUENCE]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s128), addrspace 1)
42- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR1:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR [[REG_SEQUENCE1]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s128), addrspace 1)
43- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_SADDR]].sub3
44- ; CHECK-NEXT: [[V_PK_ADD_F16_:%[0-9]+]]:vgpr_32 = V_PK_ADD_F16 8, [[COPY6]], 8, [[COPY6]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
45- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_SADDR]].sub2
46- ; CHECK-NEXT: [[V_PK_ADD_F16_1:%[0-9]+]]:vgpr_32 = V_PK_ADD_F16 8, [[COPY7]], 8, [[COPY7]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
47- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_SADDR]].sub1
48- ; CHECK-NEXT: [[V_PK_ADD_F16_2:%[0-9]+]]:vgpr_32 = V_PK_ADD_F16 8, [[COPY8]], 8, [[COPY8]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
49- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_SADDR]].sub0
50- ; CHECK-NEXT: [[V_PK_ADD_F16_3:%[0-9]+]]:vgpr_32 = V_PK_ADD_F16 8, [[COPY9]], 8, [[COPY9]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
51- ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_F16_3]], %subreg.sub0, [[V_PK_ADD_F16_2]], %subreg.sub1, [[V_PK_ADD_F16_1]], %subreg.sub2, [[V_PK_ADD_F16_]], %subreg.sub3
52- ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_SADDR1]].sub3
53- ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 939538432
54- ; CHECK-NEXT: [[V_PK_MUL_F16_:%[0-9]+]]:vgpr_32 = V_PK_MUL_F16 8, [[COPY10]], 8, [[S_MOV_B32_1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
55- ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_SADDR1]].sub2
56- ; CHECK-NEXT: [[V_PK_MUL_F16_1:%[0-9]+]]:vgpr_32 = V_PK_MUL_F16 8, [[COPY11]], 8, [[S_MOV_B32_1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
57- ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_SADDR1]].sub1
58- ; CHECK-NEXT: [[V_PK_MUL_F16_2:%[0-9]+]]:vgpr_32 = V_PK_MUL_F16 8, [[COPY12]], 8, [[S_MOV_B32_1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
59- ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_SADDR1]].sub0
60- ; CHECK-NEXT: [[V_PK_MUL_F16_3:%[0-9]+]]:vgpr_32 = V_PK_MUL_F16 8, [[COPY13]], 8, [[S_MOV_B32_1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
61- ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_MUL_F16_3]], %subreg.sub0, [[V_PK_MUL_F16_2]], %subreg.sub1, [[V_PK_MUL_F16_1]], %subreg.sub2, [[V_PK_MUL_F16_]], %subreg.sub3
62- ; CHECK-NEXT: early-clobber %42:vreg_256 = V_WMMA_F32_16X16X16_F16_w32_threeaddr 8, [[REG_SEQUENCE2]], 8, [[REG_SEQUENCE3]], 8, 0, 0, 0, implicit $exec
63- ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 1
64- ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], [[S_MOV_B32_2]], implicit $exec
65- ; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0
66- ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[V_AND_B32_e64_]], [[S_MOV_B32_3]], implicit $exec
67- ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
29+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
30+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
31+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
32+ ; CHECK-NEXT: early-clobber %3:vreg_256 = V_WMMA_F32_16X16X16_F16_w32_threeaddr 8, [[DEF]], 8, [[DEF1]], 8, 0, 0, 0, implicit $exec
33+ ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF [[DEF2]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
6834 ; CHECK-NEXT: S_BRANCH %bb.1
6935 ; CHECK-NEXT: {{ $}}
7036 ; CHECK-NEXT: bb.1.if.then:
7137 ; CHECK-NEXT: successors: %bb.2(0x80000000)
7238 ; CHECK-NEXT: {{ $}}
73- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 16, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
74- ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
75- ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
76- ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_192 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3, [[COPY15]], %subreg.sub4, [[COPY14]], %subreg.sub5
77- ; CHECK-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 3
78- ; CHECK-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[S_MOV_B32_4]], [[COPY1]], implicit $exec
79- ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
80- ; CHECK-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_LSHLREV_B32_e64_1]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
81- ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE4]].sub5
82- ; CHECK-NEXT: [[COPY17:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE4]].sub4
83- ; CHECK-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE [[COPY17]], %subreg.sub0, [[COPY16]], %subreg.sub1
84- ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 24, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
85- ; CHECK-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE5]].sub0
86- ; CHECK-NEXT: [[V_LSHLREV_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[S_MOV_B32_2]], [[COPY18]], implicit $exec
87- ; CHECK-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY %42.sub1
88- ; CHECK-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY %42.sub3
89- ; CHECK-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY %42.sub5
90- ; CHECK-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY %42.sub7
91- ; CHECK-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY %42.sub6
92- ; CHECK-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY %42.sub4
93- ; CHECK-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY %42.sub2
94- ; CHECK-NEXT: [[COPY26:%[0-9]+]]:vgpr_32 = COPY %42.sub0
95- ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
96- ; CHECK-NEXT: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
97- ; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = V_FMA_MIXLO_F16 0, [[S_LOAD_DWORD_IMM]], 0, [[COPY26]], 0, 0, 0, [[COPY27]], 0, 0, implicit $mode, implicit $exec
98- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
99- ; CHECK-NEXT: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[DEF1]]
100- ; CHECK-NEXT: [[V_FMA_MIXLO_F16_1:%[0-9]+]]:vgpr_32 = V_FMA_MIXLO_F16 0, [[S_LOAD_DWORD_IMM]], 0, [[COPY25]], 0, 0, 0, [[COPY28]], 0, 0, implicit $mode, implicit $exec
101- ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
102- ; CHECK-NEXT: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
103- ; CHECK-NEXT: [[V_FMA_MIXLO_F16_2:%[0-9]+]]:vgpr_32 = V_FMA_MIXLO_F16 0, [[S_LOAD_DWORD_IMM]], 0, [[COPY24]], 0, 0, 0, [[COPY29]], 0, 0, implicit $mode, implicit $exec
104- ; CHECK-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
105- ; CHECK-NEXT: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[DEF3]]
106- ; CHECK-NEXT: [[V_FMA_MIXLO_F16_3:%[0-9]+]]:vgpr_32 = V_FMA_MIXLO_F16 0, [[S_LOAD_DWORD_IMM]], 0, [[COPY23]], 0, 0, 0, [[COPY30]], 0, 0, implicit $mode, implicit $exec
107- ; CHECK-NEXT: [[V_FMA_MIXHI_F16_:%[0-9]+]]:vgpr_32 = V_FMA_MIXHI_F16 0, [[S_LOAD_DWORD_IMM]], 0, [[COPY22]], 0, 0, 0, [[V_FMA_MIXLO_F16_3]], 0, 0, implicit $mode, implicit $exec
108- ; CHECK-NEXT: [[V_FMA_MIXHI_F16_1:%[0-9]+]]:vgpr_32 = V_FMA_MIXHI_F16 0, [[S_LOAD_DWORD_IMM]], 0, [[COPY21]], 0, 0, 0, [[V_FMA_MIXLO_F16_2]], 0, 0, implicit $mode, implicit $exec
109- ; CHECK-NEXT: [[V_FMA_MIXHI_F16_2:%[0-9]+]]:vgpr_32 = V_FMA_MIXHI_F16 0, [[S_LOAD_DWORD_IMM]], 0, [[COPY20]], 0, 0, 0, [[V_FMA_MIXLO_F16_1]], 0, 0, implicit $mode, implicit $exec
110- ; CHECK-NEXT: [[V_FMA_MIXHI_F16_3:%[0-9]+]]:vgpr_32 = V_FMA_MIXHI_F16 0, [[S_LOAD_DWORD_IMM]], 0, [[COPY19]], 0, 0, 0, [[V_FMA_MIXLO_F16_]], 0, 0, implicit $mode, implicit $exec
111- ; CHECK-NEXT: [[REG_SEQUENCE7:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_FMA_MIXHI_F16_3]], %subreg.sub0, [[V_FMA_MIXHI_F16_2]], %subreg.sub1, [[V_FMA_MIXHI_F16_1]], %subreg.sub2, [[V_FMA_MIXHI_F16_]], %subreg.sub3
112- ; CHECK-NEXT: [[COPY31:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE7]]
113- ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR [[V_LSHLREV_B32_e64_2]], [[COPY31]], [[REG_SEQUENCE6]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
39+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY %3.sub1
40+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY %3.sub3
41+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY %3.sub5
42+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY %3.sub7
43+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY %3.sub6
44+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY %3.sub4
45+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY %3.sub2
46+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY %3.sub0
11447 ; CHECK-NEXT: {{ $}}
11548 ; CHECK-NEXT: bb.2.if.end:
11649 ; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
11750 ; CHECK-NEXT: S_ENDPGM 0
51+
; bb.0.entry, as rendered by this diff view: each line starts with its
; old/new file line number; a '-' suffix marks a line from the pre-change
; test, a '+' suffix marks its reduced replacement.
; NOTE(review): the RUN line is outside this view, so the pass under test is
; not visible here; the CHECK lines for bb.0.entry expect the WMMA to still
; sit in this block, ahead of SI_IF, in the output.
11852 bb.0.entry:
119- successors: %bb.1(0x40000000), %bb.2(0x40000000)
120- liveins: $vgpr0, $sgpr0_sgpr1
53+ successors: %bb.1, %bb.2
12154
; Pre-change body: loads through the $sgpr0_sgpr1 pointer, builds the two
; vreg_128 WMMA inputs (%99 via V_PK_ADD_F16, %98 via V_PK_MUL_F16), and
; derives the SI_IF condition from (($vgpr0 & 1) == 0).
122- %6:sgpr_64 = COPY $sgpr0_sgpr1
123- %5:vgpr_32 = COPY $vgpr0
124- %7:sgpr_128 = S_LOAD_DWORDX4_IMM %6:sgpr_64, 0, 0 :: (dereferenceable invariant load (s128), addrspace 4)
125- %8:sreg_64_xexec = S_LOAD_DWORDX2_IMM %6:sgpr_64, 16, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
126- %9:sreg_32 = COPY %8.sub1:sreg_64_xexec
127- %10:sreg_32 = COPY %8.sub0:sreg_64_xexec
128- %11:sreg_32 = COPY %7.sub3:sgpr_128
129- %12:sreg_32 = COPY %7.sub2:sgpr_128
130- %13:sreg_32 = COPY %7.sub1:sgpr_128
131- %14:sreg_32 = COPY %7.sub0:sgpr_128
132- %15:sgpr_192 = REG_SEQUENCE %14:sreg_32, %subreg.sub0, %13:sreg_32, %subreg.sub1, %12:sreg_32, %subreg.sub2, %11:sreg_32, %subreg.sub3, %10:sreg_32, %subreg.sub4, %9:sreg_32, %subreg.sub5
133- %1:sgpr_192 = COPY %15:sgpr_192
134- %16:sreg_64_xexec_xnull = REG_SEQUENCE %14:sreg_32, %subreg.sub0, %13:sreg_32, %subreg.sub1
135- %17:sreg_64_xexec_xnull = REG_SEQUENCE %12:sreg_32, %subreg.sub0, %11:sreg_32, %subreg.sub1
136- %18:sreg_32 = S_MOV_B32 3
137- %19:vgpr_32 = V_LSHLREV_B32_e64 %18:sreg_32, %5:vgpr_32, implicit $exec
138- %101:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
139- %100:vreg_64 = REG_SEQUENCE %19:vgpr_32, %subreg.sub0, %101:vgpr_32, %subreg.sub1
140- %2:vreg_64 = COPY %100:vreg_64
141- %22:sreg_32 = S_MOV_B32 4
142- %23:vgpr_32 = V_LSHLREV_B32_e64 %22:sreg_32, %5:vgpr_32, implicit $exec
143- %24:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %16:sreg_64_xexec_xnull, %23:vgpr_32, 0, 0, implicit $exec :: (load (s128), addrspace 1)
144- %25:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %17:sreg_64_xexec_xnull, %23:vgpr_32, 0, 0, implicit $exec :: (load (s128), addrspace 1)
145- %26:vgpr_32 = COPY %24.sub3:vreg_128
146- %27:vgpr_32 = V_PK_ADD_F16 8, %26:vgpr_32, 8, %26:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
147- %28:vgpr_32 = COPY %24.sub2:vreg_128
148- %29:vgpr_32 = V_PK_ADD_F16 8, %28:vgpr_32, 8, %28:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
149- %30:vgpr_32 = COPY %24.sub1:vreg_128
150- %31:vgpr_32 = V_PK_ADD_F16 8, %30:vgpr_32, 8, %30:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
151- %32:vgpr_32 = COPY %24.sub0:vreg_128
152- %33:vgpr_32 = V_PK_ADD_F16 8, %32:vgpr_32, 8, %32:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
153- %99:vreg_128 = REG_SEQUENCE %33:vgpr_32, %subreg.sub0, %31:vgpr_32, %subreg.sub1, %29:vgpr_32, %subreg.sub2, %27:vgpr_32, %subreg.sub3
154- %35:vgpr_32 = COPY %25.sub3:vreg_128
155- %36:sreg_32 = S_MOV_B32 939538432
156- %37:vgpr_32 = V_PK_MUL_F16 8, %35:vgpr_32, 8, %36:sreg_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
157- %38:vgpr_32 = COPY %25.sub2:vreg_128
158- %39:vgpr_32 = V_PK_MUL_F16 8, %38:vgpr_32, 8, %36:sreg_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
159- %40:vgpr_32 = COPY %25.sub1:vreg_128
160- %41:vgpr_32 = V_PK_MUL_F16 8, %40:vgpr_32, 8, %36:sreg_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
161- %42:vgpr_32 = COPY %25.sub0:vreg_128
162- %43:vgpr_32 = V_PK_MUL_F16 8, %42:vgpr_32, 8, %36:sreg_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
163- %98:vreg_128 = REG_SEQUENCE %43:vgpr_32, %subreg.sub0, %41:vgpr_32, %subreg.sub1, %39:vgpr_32, %subreg.sub2, %37:vgpr_32, %subreg.sub3
164- early-clobber %3:vreg_256 = V_WMMA_F32_16X16X16_F16_w32_threeaddr 8, %99:vreg_128, 8, %98:vreg_128, 8, 0, 0, 0, implicit $exec
165- %47:sreg_32 = S_MOV_B32 1
166- %48:vgpr_32 = V_AND_B32_e64 %5:vgpr_32, %47:sreg_32, implicit $exec
167- %49:sreg_32 = S_MOV_B32 0
168- %50:sreg_32 = V_CMP_EQ_U32_e64 %48:vgpr_32, %49:sreg_32, implicit $exec
169- %4:sreg_32 = SI_IF %50:sreg_32, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; Reduced body: both WMMA inputs (%0, %1) and the SI_IF condition (%2) are
; IMPLICIT_DEF, and the liveins line is dropped together with the
; physical-register COPYs. In the visible lines the early-clobber result %3
; is read only by the per-subregister COPYs in bb.1.if.then.
55+ %0:vreg_128 = IMPLICIT_DEF
56+ %1:vreg_128 = IMPLICIT_DEF
57+ %2:sreg_32 = IMPLICIT_DEF
58+ early-clobber %3:vreg_256 = V_WMMA_F32_16X16X16_F16_w32_threeaddr 8, %0:vreg_128, 8, %1:vreg_128, 8, 0, 0, 0, implicit $exec
59+ %4:sreg_32 = SI_IF %2:sreg_32, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
17060 S_BRANCH %bb.1
17161
; bb.1.if.then: one of the two successors of bb.0.entry and the block that
; reads the WMMA result %3 defined there. '-' lines are the pre-change
; body, '+' lines the reduced replacement.
17262 bb.1.if.then:
173- successors: %bb.2(0x80000000)
63+ successors: %bb.2
17464
; Pre-change body: fed the eight %3 subregisters, together with a dword
; loaded at offset 24 from %6, into V_FMA_MIXLO_F16 / V_FMA_MIXHI_F16 and
; wrote the packed vreg_128 out with GLOBAL_STORE_DWORDX4_SADDR.
175- %51:sreg_32 = COPY %1.sub5:sgpr_192
176- %52:sreg_32 = COPY %1.sub4:sgpr_192
177- %53:sreg_64_xexec_xnull = REG_SEQUENCE %52:sreg_32, %subreg.sub0, %51:sreg_32, %subreg.sub1
178- %54:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %6:sgpr_64, 24, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
179- %55:vgpr_32 = COPY %2.sub0:vreg_64
180- %57:vgpr_32 = V_LSHLREV_B32_e64 %47:sreg_32, %55:vgpr_32, implicit $exec
181- %58:vgpr_32 = COPY %3.sub1:vreg_256
182- %59:vgpr_32 = COPY %3.sub3:vreg_256
183- %60:vgpr_32 = COPY %3.sub5:vreg_256
184- %61:vgpr_32 = COPY %3.sub7:vreg_256
185- %62:vgpr_32 = COPY %3.sub6:vreg_256
186- %63:vgpr_32 = COPY %3.sub4:vreg_256
187- %64:vgpr_32 = COPY %3.sub2:vreg_256
188- %65:vgpr_32 = COPY %3.sub0:vreg_256
189- %67:sreg_32 = IMPLICIT_DEF
190- %68:vgpr_32 = COPY %67:sreg_32
191- %66:vgpr_32 = V_FMA_MIXLO_F16 0, %54:sreg_32_xm0_xexec, 0, %65:vgpr_32, 0, 0, 0, %68:vgpr_32, 0, 0, implicit $mode, implicit $exec
192- %70:sreg_32 = IMPLICIT_DEF
193- %71:vgpr_32 = COPY %70:sreg_32
194- %69:vgpr_32 = V_FMA_MIXLO_F16 0, %54:sreg_32_xm0_xexec, 0, %64:vgpr_32, 0, 0, 0, %71:vgpr_32, 0, 0, implicit $mode, implicit $exec
195- %73:sreg_32 = IMPLICIT_DEF
196- %74:vgpr_32 = COPY %73:sreg_32
197- %72:vgpr_32 = V_FMA_MIXLO_F16 0, %54:sreg_32_xm0_xexec, 0, %63:vgpr_32, 0, 0, 0, %74:vgpr_32, 0, 0, implicit $mode, implicit $exec
198- %76:sreg_32 = IMPLICIT_DEF
199- %77:vgpr_32 = COPY %76:sreg_32
200- %75:vgpr_32 = V_FMA_MIXLO_F16 0, %54:sreg_32_xm0_xexec, 0, %62:vgpr_32, 0, 0, 0, %77:vgpr_32, 0, 0, implicit $mode, implicit $exec
201- %78:vgpr_32 = V_FMA_MIXHI_F16 0, %54:sreg_32_xm0_xexec, 0, %61:vgpr_32, 0, 0, 0, %75:vgpr_32, 0, 0, implicit $mode, implicit $exec
202- %79:vgpr_32 = V_FMA_MIXHI_F16 0, %54:sreg_32_xm0_xexec, 0, %60:vgpr_32, 0, 0, 0, %72:vgpr_32, 0, 0, implicit $mode, implicit $exec
203- %80:vgpr_32 = V_FMA_MIXHI_F16 0, %54:sreg_32_xm0_xexec, 0, %59:vgpr_32, 0, 0, 0, %69:vgpr_32, 0, 0, implicit $mode, implicit $exec
204- %81:vgpr_32 = V_FMA_MIXHI_F16 0, %54:sreg_32_xm0_xexec, 0, %58:vgpr_32, 0, 0, 0, %66:vgpr_32, 0, 0, implicit $mode, implicit $exec
205- %97:vreg_128 = REG_SEQUENCE %81:vgpr_32, %subreg.sub0, %80:vgpr_32, %subreg.sub1, %79:vgpr_32, %subreg.sub2, %78:vgpr_32, %subreg.sub3
206- %83:vreg_128 = COPY %97:vreg_128
207- GLOBAL_STORE_DWORDX4_SADDR %57:vgpr_32, %83:vreg_128, %53:sreg_64_xexec_xnull, 0, 0, implicit $exec :: (store (s128), addrspace 1)
; Reduced body: only the eight subregister COPYs of %3 remain, in the same
; sub1/3/5/7/6/4/2/0 order as before; none of %5-%12 is read by any
; instruction visible in this view.
65+ %5:vgpr_32 = COPY %3.sub1:vreg_256
66+ %6:vgpr_32 = COPY %3.sub3:vreg_256
67+ %7:vgpr_32 = COPY %3.sub5:vreg_256
68+ %8:vgpr_32 = COPY %3.sub7:vreg_256
69+ %9:vgpr_32 = COPY %3.sub6:vreg_256
70+ %10:vgpr_32 = COPY %3.sub4:vreg_256
71+ %11:vgpr_32 = COPY %3.sub2:vreg_256
72+ %12:vgpr_32 = COPY %3.sub0:vreg_256
20873
; bb.2.if.end: SI_END_CF takes %4, the value defined by SI_IF in
; bb.0.entry, and S_ENDPGM 0 is the last instruction shown.
; The '-' line after the label is a blank line that the change removes.
20974 bb.2.if.end:
210-
21175 SI_END_CF %4:sreg_32, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
21276 S_ENDPGM 0
21377
0 commit comments