Skip to content

Commit 200b206

Browse files
committed
AMDGPU: Use V_MAC_F32 for fmad.ftz
This avoids regressions in a future patch. I'm confused by the gfx9 use of legacy_mad. Was this a pointless instruction rename, or does it use fmul_legacy handling? Why is regular mac available in that case?
1 parent 75af694 commit 200b206

File tree

5 files changed

+50
-38
lines changed

5 files changed

+50
-38
lines changed

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 20 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -848,20 +848,29 @@ def : GCNPat <
848848
// VOP2 Patterns
849849
//===----------------------------------------------------------------------===//
850850

851-
multiclass FMADPat <ValueType vt, Instruction inst> {
852-
def : GCNPat <
853-
(vt (fmad (VOP3NoMods vt:$src0),
854-
(VOP3NoMods vt:$src1),
855-
(VOP3NoMods vt:$src2))),
851+
// TODO: Check only no src2 mods?
852+
class FMADPat <ValueType vt, Instruction inst, SDPatternOperator node>
853+
: GCNPat <(vt (node (vt (VOP3NoMods vt:$src0)),
854+
(vt (VOP3NoMods vt:$src1)),
855+
(vt (VOP3NoMods vt:$src2)))),
856856
(inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
857857
SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
858-
>;
858+
>;
859+
860+
861+
// Prefer mac form when there are no modifiers.
862+
let AddedComplexity = 9 in {
863+
def : FMADPat <f32, V_MAC_F32_e64, fmad>;
864+
def : FMADPat <f32, V_MAC_F32_e64, AMDGPUfmad_ftz>;
865+
866+
let SubtargetPredicate = Has16BitInsts in {
867+
def : FMADPat <f16, V_MAC_F16_e64, fmad>;
868+
def : FMADPat <f16, V_MAC_F16_e64, AMDGPUfmad_ftz>;
859869
}
860870

861-
defm : FMADPat <f16, V_MAC_F16_e64>;
862-
defm : FMADPat <f32, V_MAC_F32_e64>;
871+
}
863872

864-
class FMADModsPat<Instruction inst, SDPatternOperator mad_opr, ValueType Ty>
873+
class FMADModsPat<ValueType Ty, Instruction inst, SDPatternOperator mad_opr>
865874
: GCNPat<
866875
(Ty (mad_opr (Ty (VOP3Mods Ty:$src0, i32:$src0_mod)),
867876
(Ty (VOP3Mods Ty:$src1, i32:$src1_mod)),
@@ -870,9 +879,8 @@ class FMADModsPat<Instruction inst, SDPatternOperator mad_opr, ValueType Ty>
870879
$src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
871880
>;
872881

873-
// FIXME: This should select to V_MAC_F32
874-
def : FMADModsPat<V_MAD_F32, AMDGPUfmad_ftz, f32>;
875-
def : FMADModsPat<V_MAD_F16, AMDGPUfmad_ftz, f16> {
882+
def : FMADModsPat<f32, V_MAD_F32, AMDGPUfmad_ftz>;
883+
def : FMADModsPat<f16, V_MAD_F16, AMDGPUfmad_ftz> {
876884
let SubtargetPredicate = Has16BitInsts;
877885
}
878886

llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fmad.ftz.mir

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@ body: |
1919
; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
2020
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
2121
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
22-
; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
23-
; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
22+
; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
23+
; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
2424
%0:vgpr(s32) = COPY $vgpr0
2525
%1:vgpr(s32) = COPY $vgpr1
2626
%2:vgpr(s32) = COPY $vgpr2
@@ -43,8 +43,8 @@ body: |
4343
; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
4444
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
4545
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
46-
; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
47-
; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
46+
; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
47+
; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
4848
%0:sgpr(s32) = COPY $sgpr0
4949
%1:vgpr(s32) = COPY $vgpr0
5050
%2:vgpr(s32) = COPY $vgpr1
@@ -67,8 +67,8 @@ body: |
6767
; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
6868
; GCN: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
6969
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
70-
; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
71-
; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
70+
; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
71+
; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
7272
%0:vgpr(s32) = COPY $vgpr0
7373
%1:sgpr(s32) = COPY $sgpr0
7474
%2:vgpr(s32) = COPY $vgpr1
@@ -91,8 +91,9 @@ body: |
9191
; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
9292
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
9393
; GCN: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0
94-
; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
95-
; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
94+
; GCN: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY2]]
95+
; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY3]], 0, 0, implicit $exec
96+
; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
9697
%0:vgpr(s32) = COPY $vgpr0
9798
%1:vgpr(s32) = COPY $vgpr0
9899
%2:sgpr(s32) = COPY $sgpr0
@@ -116,8 +117,8 @@ body: |
116117
; GCN: liveins: $sgpr0, $vgpr0
117118
; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
118119
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
119-
; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
120-
; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
120+
; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
121+
; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
121122
%0:sgpr(s32) = COPY $sgpr0
122123
%1:vgpr(s32) = COPY $vgpr0
123124
%2:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmad.ftz), %0, %0, %1
@@ -138,8 +139,9 @@ body: |
138139
; GCN: liveins: $sgpr0, $vgpr0
139140
; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
140141
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
141-
; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
142-
; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
142+
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
143+
; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
144+
; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
143145
%0:sgpr(s32) = COPY $sgpr0
144146
%1:vgpr(s32) = COPY $vgpr0
145147
%2:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmad.ftz), %0, %1, %0
@@ -160,8 +162,9 @@ body: |
160162
; GCN: liveins: $sgpr0, $vgpr0
161163
; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
162164
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
163-
; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY1]], 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec
164-
; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
165+
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
166+
; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, [[COPY2]], 0, 0, implicit $exec
167+
; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
165168
%0:sgpr(s32) = COPY $sgpr0
166169
%1:vgpr(s32) = COPY $vgpr0
167170
%2:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmad.ftz), %1, %0, %0
@@ -181,8 +184,9 @@ body: |
181184
; GCN-LABEL: name: fmad_ftz_s32_vsss
182185
; GCN: liveins: $sgpr0, $vgpr0
183186
; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
184-
; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec
185-
; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
187+
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
188+
; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
189+
; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
186190
%0:sgpr(s32) = COPY $sgpr0
187191
%1:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmad.ftz), %0, %0, %0
188192
S_ENDPGM 0, implicit %1

llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -137,8 +137,8 @@ define amdgpu_kernel void @test_fold_canonicalize_fma_value_f32(float addrspace(
137137
}
138138

139139
; GCN-LABEL: test_fold_canonicalize_fmad_ftz_value_f32:
140-
; GCN: s_mov_b32 [[SGPR:s[0-9]+]], 0x41700000
141-
; GCN: v_mad_f32 [[V:v[0-9]+]], v{{[0-9]+}}, [[SGPR]], [[SGPR]]
140+
; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0x41700000
141+
; GCN: v_mac_f32_e32 [[V]], v{{[0-9]+}}, v{{[0-9]+$}}
142142
; GCN-NOT: v_mul
143143
; GCN-NOT: v_max
144144
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.f16.ll

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,7 @@
55
declare half @llvm.amdgcn.fmad.ftz.f16(half %a, half %b, half %c)
66

77
; GCN-LABEL: {{^}}mad_f16:
8-
; GFX8: v_ma{{[dc]}}_f16
9-
; GFX9: v_mad_legacy_f16
8+
; GCN: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+$}}
109
define amdgpu_kernel void @mad_f16(
1110
half addrspace(1)* %r,
1211
half addrspace(1)* %a,
@@ -34,9 +33,7 @@ define amdgpu_kernel void @mad_f16_imm_a(
3433
}
3534

3635
; GCN-LABEL: {{^}}mad_f16_imm_b:
37-
; GCN: s_movk_i32 [[KB:s[0-9]+]], 0x4800
38-
; GFX8: v_mad_f16 {{v[0-9]+}}, {{v[0-9]+}}, [[KB]],
39-
; GFX9: v_mad_legacy_f16 {{v[0-9]+}}, {{v[0-9]+}}, [[KB]],
36+
; GCN: v_mac_f16_e32 {{v[0-9]+}}, 0x4800, {{v[0-9]+$}}
4037
define amdgpu_kernel void @mad_f16_imm_b(
4138
half addrspace(1)* %r,
4239
half addrspace(1)* %a,

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ define amdgpu_kernel void @mad_f32_imm_a(
3535

3636
; GCN-LABEL: {{^}}mad_f32_imm_b:
3737
; GCN: v_mov_b32_e32 [[KB:v[0-9]+]], 0x41000000
38-
; GCN: v_ma{{[dc]}}_f32 {{v[0-9]+}}, {{[vs][0-9]+}}, [[KB]],
38+
; GCN: v_mac_f32_e32 {{v[0-9]+}}, {{[s][0-9]+}}, [[KB]]
3939
define amdgpu_kernel void @mad_f32_imm_b(
4040
float addrspace(1)* %r,
4141
float addrspace(1)* %a,
@@ -48,8 +48,11 @@ define amdgpu_kernel void @mad_f32_imm_b(
4848
}
4949

5050
; GCN-LABEL: {{^}}mad_f32_imm_c:
51-
; GCN: v_mov_b32_e32 [[KC:v[0-9]+]], 0x41000000
52-
; GCN: v_ma{{[dc]}}_f32 {{v[0-9]+}}, {{[vs][0-9]+}}, {{v[0-9]+}}, [[KC]]{{$}}
51+
; GCN: v_mov_b32_e32 [[C:v[0-9]+]], 0x41000000
52+
; GCN: s_load_dword [[A:s[0-9]+]]
53+
; GCN: s_load_dword [[B:s[0-9]+]]
54+
; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]]
55+
; GCN: v_mac_f32_e32 [[C]], {{s[0-9]+}}, [[VB]]{{$}}
5356
define amdgpu_kernel void @mad_f32_imm_c(
5457
float addrspace(1)* %r,
5558
float addrspace(1)* %a,

0 commit comments

Comments
 (0)