AMDGPU: Use V_MAC_F32 for fmad.ftz

arsenm · arsenm · commit 200b20639ac2 · 2020-03-10T14:41:06.000-07:00
This avoids regressions in a future patch. I'm confused by the gfx9
usage of legacy_mad. Was this a pointless instruction rename, or
does it use fmul_legacy handling? Why is regular mac available in that case?
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -848,20 +848,29 @@ def : GCNPat <
 // VOP2 Patterns
 //===----------------------------------------------------------------------===//
 
-multiclass FMADPat <ValueType vt, Instruction inst> {
-  def : GCNPat <
-    (vt (fmad (VOP3NoMods vt:$src0),
-              (VOP3NoMods vt:$src1),
-              (VOP3NoMods vt:$src2))),
+// TODO: Check only no src2 mods?
+class FMADPat <ValueType vt, Instruction inst, SDPatternOperator node>
+  : GCNPat <(vt (node (vt (VOP3NoMods vt:$src0)),
+                      (vt (VOP3NoMods vt:$src1)),
+                      (vt (VOP3NoMods vt:$src2)))),
     (inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
           SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
-  >;
+>;
+
+
+// Prefer mac form when there are no modifiers.
+let AddedComplexity = 9 in {
+def : FMADPat <f32, V_MAC_F32_e64, fmad>;
+def : FMADPat <f32, V_MAC_F32_e64, AMDGPUfmad_ftz>;
+
+let SubtargetPredicate = Has16BitInsts in {
+def : FMADPat <f16, V_MAC_F16_e64, fmad>;
+def : FMADPat <f16, V_MAC_F16_e64, AMDGPUfmad_ftz>;
 }
 
-defm : FMADPat <f16, V_MAC_F16_e64>;
-defm : FMADPat <f32, V_MAC_F32_e64>;
+}
 
-class FMADModsPat<Instruction inst, SDPatternOperator mad_opr, ValueType Ty>
+class FMADModsPat<ValueType Ty, Instruction inst, SDPatternOperator mad_opr>
   : GCNPat<
   (Ty (mad_opr (Ty (VOP3Mods Ty:$src0, i32:$src0_mod)),
                (Ty (VOP3Mods Ty:$src1, i32:$src1_mod)),
@@ -870,9 +879,8 @@ class FMADModsPat<Instruction inst, SDPatternOperator mad_opr, ValueType Ty>
   $src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
 >;
 
-// FIXME: This should select to V_MAC_F32
-def : FMADModsPat<V_MAD_F32, AMDGPUfmad_ftz, f32>;
-def : FMADModsPat<V_MAD_F16, AMDGPUfmad_ftz, f16> {
+def : FMADModsPat<f32, V_MAD_F32, AMDGPUfmad_ftz>;
+def : FMADModsPat<f16, V_MAD_F16, AMDGPUfmad_ftz> {
   let SubtargetPredicate = Has16BitInsts;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fmad.ftz.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fmad.ftz.mir
@@ -19,8 +19,8 @@ body: |
     ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
+    ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
     %2:vgpr(s32) = COPY $vgpr2
@@ -43,8 +43,8 @@ body: |
     ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
     ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
+    ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = COPY $vgpr0
     %2:vgpr(s32) = COPY $vgpr1
@@ -67,8 +67,8 @@ body: |
     ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GCN: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
     ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
+    ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:sgpr(s32) = COPY $sgpr0
     %2:vgpr(s32) = COPY $vgpr1
@@ -91,8 +91,9 @@ body: |
     ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GCN: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
+    ; GCN: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY2]]
+    ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY3]], 0, 0, implicit $exec
+    ; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr0
     %2:sgpr(s32) = COPY $sgpr0
@@ -116,8 +117,8 @@ body: |
     ; GCN: liveins: $sgpr0, $vgpr0
     ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
     ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
+    ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = COPY $vgpr0
     %2:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmad.ftz), %0, %0, %1
@@ -138,8 +139,9 @@ body: |
     ; GCN: liveins: $sgpr0, $vgpr0
     ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
     ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
+    ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+    ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = COPY $vgpr0
     %2:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmad.ftz), %0, %1, %0
@@ -160,8 +162,9 @@ body: |
     ; GCN: liveins: $sgpr0, $vgpr0
     ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
     ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY1]], 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
+    ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+    ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = COPY $vgpr0
     %2:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmad.ftz), %1, %0, %0
@@ -181,8 +184,9 @@ body: |
     ; GCN-LABEL: name: fmad_ftz_s32_vsss
     ; GCN: liveins: $sgpr0, $vgpr0
     ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
+    ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+    ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmad.ftz), %0, %0, %0
     S_ENDPGM 0, implicit %1
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
@@ -137,8 +137,8 @@ define amdgpu_kernel void @test_fold_canonicalize_fma_value_f32(float addrspace(
 }
 
 ; GCN-LABEL: test_fold_canonicalize_fmad_ftz_value_f32:
-; GCN: s_mov_b32 [[SGPR:s[0-9]+]], 0x41700000
-; GCN: v_mad_f32 [[V:v[0-9]+]], v{{[0-9]+}}, [[SGPR]], [[SGPR]]
+; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0x41700000
+; GCN: v_mac_f32_e32 [[V]], v{{[0-9]+}}, v{{[0-9]+$}}
 ; GCN-NOT: v_mul
 ; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.f16.ll
@@ -5,8 +5,7 @@
 declare half @llvm.amdgcn.fmad.ftz.f16(half %a, half %b, half %c)
 
 ; GCN-LABEL: {{^}}mad_f16:
-; GFX8: v_ma{{[dc]}}_f16
-; GFX9: v_mad_legacy_f16
+; GCN: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+$}}
 define amdgpu_kernel void @mad_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
@@ -34,9 +33,7 @@ define amdgpu_kernel void @mad_f16_imm_a(
 }
 
 ; GCN-LABEL: {{^}}mad_f16_imm_b:
-; GCN:  s_movk_i32 [[KB:s[0-9]+]], 0x4800
-; GFX8: v_mad_f16 {{v[0-9]+}}, {{v[0-9]+}}, [[KB]],
-; GFX9: v_mad_legacy_f16 {{v[0-9]+}}, {{v[0-9]+}}, [[KB]],
+; GCN: v_mac_f16_e32 {{v[0-9]+}}, 0x4800, {{v[0-9]+$}}
 define amdgpu_kernel void @mad_f16_imm_b(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll
@@ -35,7 +35,7 @@ define amdgpu_kernel void @mad_f32_imm_a(
 
 ; GCN-LABEL: {{^}}mad_f32_imm_b:
 ; GCN: v_mov_b32_e32 [[KB:v[0-9]+]], 0x41000000
-; GCN:  v_ma{{[dc]}}_f32 {{v[0-9]+}}, {{[vs][0-9]+}}, [[KB]],
+; GCN: v_mac_f32_e32 {{v[0-9]+}}, {{[s][0-9]+}}, [[KB]]
 define amdgpu_kernel void @mad_f32_imm_b(
     float addrspace(1)* %r,
     float addrspace(1)* %a,
@@ -48,8 +48,11 @@ define amdgpu_kernel void @mad_f32_imm_b(
 }
 
 ; GCN-LABEL: {{^}}mad_f32_imm_c:
-; GCN: v_mov_b32_e32 [[KC:v[0-9]+]], 0x41000000
-; GCN:  v_ma{{[dc]}}_f32 {{v[0-9]+}}, {{[vs][0-9]+}}, {{v[0-9]+}}, [[KC]]{{$}}
+; GCN: v_mov_b32_e32 [[C:v[0-9]+]], 0x41000000
+; GCN: s_load_dword [[A:s[0-9]+]]
+; GCN: s_load_dword [[B:s[0-9]+]]
+; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]]
+; GCN: v_mac_f32_e32 [[C]], {{s[0-9]+}}, [[VB]]{{$}}
 define amdgpu_kernel void @mad_f32_imm_c(
     float addrspace(1)* %r,
     float addrspace(1)* %a,