From d85f641e5cf9c777ed4d91f1d08e95098536d1bc Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 4 Sep 2024 22:19:17 +0400 Subject: [PATCH 1/6] AMDGPU: Fix incorrectly selecting fp8/bf8 conversion intrinsics Trying to codegen these on targets without the instructions should fail to select. Not sure if all the predicates are correct. We had a fake predicate, disconnected from any subtarget feature, which was always true. Fixes: SWDEV-482274 --- llvm/lib/Target/AMDGPU/AMDGPU.td | 11 +- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 1 + llvm/lib/Target/AMDGPU/VOP1Instructions.td | 11 +- .../AMDGPU/llvm.amdgcn.cvt.f32.fp8.err.ll | 100 ++++++++++++++++++ 4 files changed, 117 insertions(+), 6 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f32.fp8.err.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index dc94edf85586f..389c91eafc66e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -685,6 +685,13 @@ def FeatureFP8ConversionInsts : SubtargetFeature<"fp8-conversion-insts", "Has fp8 and bf8 conversion instructions" >; +def FeatureCvtFP8VOP1Bug : SubtargetFeature<"cvt-fp8-vop1-bug", + "HasCvtFP8Vop1Bug", + "true", + "FP8/BF8 VOP1 form of conversion to F32 is unreliable", + [FeatureFP8ConversionInsts] +>; + def FeaturePkFmacF16Inst : SubtargetFeature<"pk-fmac-f16-inst", "HasPkFmacF16Inst", "true", @@ -1438,7 +1445,7 @@ def FeatureISAVersion9_4_Common : FeatureSet< FeaturePackedFP32Ops, FeatureMAIInsts, FeatureFP8Insts, - FeatureFP8ConversionInsts, + FeatureFP8ConversionInsts, FeatureCvtFP8VOP1Bug, FeaturePkFmacF16Inst, FeatureAtomicFaddRtnInsts, FeatureAtomicFaddNoRtnInsts, @@ -1648,7 +1655,7 @@ def FeatureISAVersion12 : FeatureSet< FeatureFlatAtomicFaddF32Inst, FeatureImageInsts, FeatureExtendedImageInsts, - FeatureFP8ConversionInsts, + FeatureFP8ConversionInsts, FeatureCvtFP8VOP1Bug, FeaturePackedTID, FeatureVcmpxPermlaneHazard, FeatureSALUFloatInsts, diff --git 
a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index e6b7342d5fffc..276f0c3977f62 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -158,6 +158,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasMAIInsts = false; bool HasFP8Insts = false; bool HasFP8ConversionInsts = false; + bool HasCvtFP8Vop1Bug = false; bool HasPkFmacF16Inst = false; bool HasAtomicFMinFMaxF32GlobalInsts = false; bool HasAtomicFMinFMaxF64GlobalInsts = false; diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 871a7c3c2579e..75d4bb162e16f 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -582,8 +582,8 @@ class Cvt_F32_F8_Pat; -let SubtargetPredicate = isGFX9Only in { -let OtherPredicates = [HasCvtFP8VOP1Bug] in { +let SubtargetPredicate = HasFP8ConversionInsts in { +let OtherPredicates = [HasCvtFP8VOP1Bug, HasSDWA] in { def : GCNPat<(f32 (int_amdgcn_cvt_f32_fp8 i32:$src, 0)), (V_CVT_F32_FP8_sdwa 0, $src, 0, 0, 0)>; def : GCNPat<(f32 (int_amdgcn_cvt_f32_bf8 i32:$src, 0)), @@ -597,11 +597,14 @@ let OtherPredicates = [HasNoCvtFP8VOP1Bug] in { (V_CVT_F32_BF8_e32 $src)>; } +let OtherPredicates = [HasSDWA] in { foreach Index = [1, 2, 3] in { def : Cvt_F32_F8_Pat; def : Cvt_F32_F8_Pat; } -} // End SubtargetPredicate = isGFX9Only +} // End OtherPredicates = [HasSDWA] + +} // End SubtargetPredicate = HasFP8ConversionInsts class Cvt_PK_F32_F8_Pat : GCNPat< @@ -611,7 +614,7 @@ class Cvt_PK_F32_F8_Pat; -let SubtargetPredicate = isGFX9Only in { +let SubtargetPredicate = HasFP8ConversionInsts, OtherPredicates = [HasSDWA] in { foreach Index = [0, -1] in { def : Cvt_PK_F32_F8_Pat; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f32.fp8.err.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f32.fp8.err.ll new file mode 100644 index 0000000000000..29812993d541e --- /dev/null +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f32.fp8.err.ll @@ -0,0 +1,100 @@ +; RUN: split-file %s %t + +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 -filetype=null %t/fp8-byte0-err.ll 2>&1 | FileCheck -check-prefix=ERR-FP8-BYTE0-ERR %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 -filetype=null %t/fp8-byte1-err.ll 2>&1 | FileCheck -check-prefix=ERR-FP8-BYTE1-ERR %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 -filetype=null %t/bf8-byte0-err.ll 2>&1 | FileCheck -check-prefix=ERR-BF8-BYTE0-ERR %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 -filetype=null %t/bf8-byte1-err.ll 2>&1 | FileCheck -check-prefix=ERR-BF8-BYTE1-ERR %s + +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -filetype=null %t/fp8-byte0-err.ll 2>&1 | FileCheck -check-prefix=ERR-FP8-BYTE0-ERR %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -filetype=null %t/fp8-byte1-err.ll 2>&1 | FileCheck -check-prefix=ERR-FP8-BYTE1-ERR %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -filetype=null %t/bf8-byte0-err.ll 2>&1 | FileCheck -check-prefix=ERR-BF8-BYTE0-ERR %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -filetype=null %t/bf8-byte1-err.ll 2>&1 | FileCheck -check-prefix=ERR-BF8-BYTE1-ERR %s + + +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx908 -filetype=null %t/fp8-byte0-err.ll 2>&1 | FileCheck -check-prefix=ERR-FP8-BYTE0-ERR-GISEL %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx908 -filetype=null %t/fp8-byte1-err.ll 2>&1 | FileCheck -check-prefix=ERR-FP8-BYTE1-ERR-GISEL %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx908 -filetype=null %t/bf8-byte0-err.ll 2>&1 | FileCheck -check-prefix=ERR-BF8-BYTE0-ERR-GISEL %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx908 -filetype=null %t/bf8-byte1-err.ll 2>&1 | FileCheck -check-prefix=ERR-BF8-BYTE1-ERR-GISEL %s + +; 
RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -filetype=null %t/fp8-byte0-err.ll 2>&1 | FileCheck -check-prefix=ERR-FP8-BYTE0-ERR-GISEL %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -filetype=null %t/fp8-byte1-err.ll 2>&1 | FileCheck -check-prefix=ERR-FP8-BYTE1-ERR-GISEL %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -filetype=null %t/bf8-byte0-err.ll 2>&1 | FileCheck -check-prefix=ERR-BF8-BYTE0-ERR-GISEL %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -filetype=null %t/bf8-byte1-err.ll 2>&1 | FileCheck -check-prefix=ERR-BF8-BYTE1-ERR-GISEL %s + + + +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 -filetype=null %t/pk-fp8-word0-err.ll 2>&1 | FileCheck -check-prefix=ERR-PK-FP8-WORD0-ERR %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 -filetype=null %t/pk-fp8-word1-err.ll 2>&1 | FileCheck -check-prefix=ERR-PK-FP8-WORD1-ERR %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 -filetype=null %t/pk-bf8-word0-err.ll 2>&1 | FileCheck -check-prefix=ERR-PK-BF8-WORD0-ERR %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 -filetype=null %t/pk-bf8-word1-err.ll 2>&1 | FileCheck -check-prefix=ERR-PK-BF8-WORD1-ERR %s + +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -filetype=null %t/pk-fp8-word0-err.ll 2>&1 | FileCheck -check-prefix=ERR-PK-FP8-WORD0-ERR %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -filetype=null %t/pk-fp8-word1-err.ll 2>&1 | FileCheck -check-prefix=ERR-PK-FP8-WORD1-ERR %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -filetype=null %t/pk-bf8-word0-err.ll 2>&1 | FileCheck -check-prefix=ERR-PK-BF8-WORD0-ERR %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -filetype=null %t/pk-bf8-word1-err.ll 2>&1 | FileCheck -check-prefix=ERR-PK-BF8-WORD1-ERR %s + + +;--- fp8-byte0-err.ll +; ERR-FP8-BYTE0-ERR: LLVM ERROR: 
Cannot select: intrinsic %llvm.amdgcn.cvt.f32.fp8 +; ERR-FP8-BYTE0-ERR-GISEL: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.cvt.f32.fp8), %{{[0-9]+}}:vgpr(s32), 0 + +define float @test_cvt_f32_fp8_byte0(i32 %a) { + %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 0) + ret float %ret +} + +;--- fp8-byte1-err.ll +; ERR-FP8-BYTE1-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.cvt.f32.fp8 +; ERR-FP8-BYTE1-ERR-GISEL: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.cvt.f32.fp8), %{{[0-9]+}}:vgpr(s32), 1 +define float @test_cvt_f32_fp8_byte1(i32 %a) { + %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1) + ret float %ret +} + +;--- bf8-byte0-err.ll +; ERR-BF8-BYTE0-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.cvt.f32.bf8 +; ERR-BF8-BYTE0-ERR-GISEL: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.cvt.f32.bf8), %{{[0-9]+}}:vgpr(s32), 0 +define float @test_cvt_f32_bf8_byte0(i32 %a) { + %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0) + ret float %ret +} + +;--- bf8-byte1-err.ll +; ERR-BF8-BYTE1-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.cvt.f32.bf8 +; ERR-BF8-BYTE1-ERR-GISEL: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.cvt.f32.bf8), %{{[0-9]+}}:vgpr(s32), 1 +define float @test_cvt_f32_bf8_byte1(i32 %a) { + %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 1) + ret float %ret +} + +;--- pk-fp8-word0-err.ll +; ERR-PK-FP8-WORD0-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.cvt.pk.f32.fp8 +; ERR-PK-FP8-WORD0-ERR-GISEL: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.cvt.pk.f32.fp8), %{{[0-9]+}}:vgpr(s32), 0 +define <2 x float> @test_cvt_pk_f32_fp8_word0(i32 %a) { + %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 false) + ret <2 x float> %ret +} + +;--- 
pk-fp8-word1-err.ll +; ERR-PK-FP8-WORD1-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.cvt.pk.f32.fp8 +; ERR-PK-FP8-WORD1-ERR-GISEL: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.cvt.pk.f32.fp8), %{{[0-9]+}}:vgpr(s32), 1 +define <2 x float> @test_cvt_pk_f32_fp8_word1(i32 %a) { + %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true) + ret <2 x float> %ret +} + +;--- pk-bf8-word0-err.ll +; ERR-PK-BF8-WORD0-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.cvt.pk.f32.bf8 +; ERR-PK-BF8-WORD0-ERR-GISEL: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.cvt.pk.f32.bf8), %{{[0-9]+}}:vgpr(s32), 0 +define <2 x float> @test_cvt_pk_f32_bf8_word0(i32 %a) { + %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false) + ret <2 x float> %ret +} + +;--- pk-bf8-word1-err.ll +; ERR-PK-BF8-WORD1-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.cvt.pk.f32.bf8 +; ERR-PK-BF8-WORD1-ERR-GISEL: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.cvt.pk.f32.bf8), %{{[0-9]+}}:vgpr(s32), 1 +define <2 x float> @test_cvt_pk_f32_bf8_word1(i32 %a) { + %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 true) + ret <2 x float> %ret +} From 7f8fd29d11c3b189f2be583acbcf94d5d09ea29b Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sun, 15 Sep 2024 15:26:42 +0400 Subject: [PATCH 2/6] Formatting --- llvm/lib/Target/AMDGPU/AMDGPU.td | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 389c91eafc66e..b3dade8a98c89 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1445,7 +1445,8 @@ def FeatureISAVersion9_4_Common : FeatureSet< FeaturePackedFP32Ops, FeatureMAIInsts, FeatureFP8Insts, - FeatureFP8ConversionInsts, FeatureCvtFP8VOP1Bug, + FeatureFP8ConversionInsts, + FeatureCvtFP8VOP1Bug, 
FeaturePkFmacF16Inst, FeatureAtomicFaddRtnInsts, FeatureAtomicFaddNoRtnInsts, @@ -1655,7 +1656,8 @@ def FeatureISAVersion12 : FeatureSet< FeatureFlatAtomicFaddF32Inst, FeatureImageInsts, FeatureExtendedImageInsts, - FeatureFP8ConversionInsts, FeatureCvtFP8VOP1Bug, + FeatureFP8ConversionInsts, + FeatureCvtFP8VOP1Bug, FeaturePackedTID, FeatureVcmpxPermlaneHazard, FeatureSALUFloatInsts, From cb93a8915bb6afd6ecc8ef650809d450cbb9dda7 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 24 Sep 2024 19:38:39 +0400 Subject: [PATCH 3/6] Remove from gfx12 --- llvm/lib/Target/AMDGPU/AMDGPU.td | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index b3dade8a98c89..25117544d6a84 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1657,7 +1657,6 @@ def FeatureISAVersion12 : FeatureSet< FeatureImageInsts, FeatureExtendedImageInsts, FeatureFP8ConversionInsts, - FeatureCvtFP8VOP1Bug, FeaturePackedTID, FeatureVcmpxPermlaneHazard, FeatureSALUFloatInsts, From 5c72cf0c7d58699b6960f49a6dfee81893630198 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 24 Sep 2024 19:52:30 +0400 Subject: [PATCH 4/6] Remove HasSDWA OtherPredicate --- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 75d4bb162e16f..18a548f7ba723 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -583,7 +583,7 @@ class Cvt_F32_F8_Pat; let SubtargetPredicate = HasFP8ConversionInsts in { -let OtherPredicates = [HasCvtFP8VOP1Bug, HasSDWA] in { +let OtherPredicates = [HasCvtFP8VOP1Bug] in { def : GCNPat<(f32 (int_amdgcn_cvt_f32_fp8 i32:$src, 0)), (V_CVT_F32_FP8_sdwa 0, $src, 0, 0, 0)>; def : GCNPat<(f32 (int_amdgcn_cvt_f32_bf8 i32:$src, 0)), From e5063d75e037660a3752de1922f211bfae9b4be5 Mon Sep 17 00:00:00 2001 
From: Matt Arsenault Date: Tue, 24 Sep 2024 22:39:23 +0400 Subject: [PATCH 5/6] Fix hasCvtFP8VOP1Bug predicate --- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 276f0c3977f62..1945812609316 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1353,7 +1353,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasSplitBarriers() const { return getGeneration() >= GFX12; } // \returns true if FP8/BF8 VOP1 form of conversion to F32 is unreliable. - bool hasCvtFP8VOP1Bug() const { return true; } + bool hasCvtFP8VOP1Bug() const { return HasCvtFP8Vop1Bug; } // \returns true if CSUB (a.k.a. SUB_CLAMP on GFX12) atomics support a // no-return form. From cc9a3b9e75442e4e811f73eb66ceed50d100fe14 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 24 Sep 2024 22:56:26 +0400 Subject: [PATCH 6/6] Avoid selecting e32 case on gfx12; not sure how to best check for requiring the op_sel form --- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 18a548f7ba723..be98d201a64a7 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -590,7 +590,7 @@ let OtherPredicates = [HasCvtFP8VOP1Bug] in { (V_CVT_F32_BF8_sdwa 0, $src, 0, 0, 0)>; } -let OtherPredicates = [HasNoCvtFP8VOP1Bug] in { +let OtherPredicates = [HasNoCvtFP8VOP1Bug, HasSDWA] in { // FIXME: HasSDWA is a substitute for !gfx12 def : GCNPat<(f32 (int_amdgcn_cvt_f32_fp8 i32:$src, 0)), (V_CVT_F32_FP8_e32 $src)>; def : GCNPat<(f32 (int_amdgcn_cvt_f32_bf8 i32:$src, 0)),