[AMDGPU][MLIR] Replace gfx940 and gfx941 with gfx942 in MLIR

ritter-x2a · ritter-x2a · commit 656adb4e5ad4 · 2025-02-05T05:50:12.000-05:00
gfx940 and gfx941 are no longer supported. This is one of a series of
PRs to remove them from the code base.

For SWDEV-512631
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -602,7 +602,7 @@ def AMDGPU_MFMAOp :
     order (that is, v[0] will go to arg[7:0], v[1] to arg[15:8] and so on).
 
     The negateA, negateB, and negateC flags are only supported for double-precision
-    operations on gfx940+.
+    operations on gfx942+.
   }];
   let assemblyFormat = [{
     $sourceA `*` $sourceB `+` $destC
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -382,11 +382,11 @@ def ROCDL_mfma_f32_16x16x4bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x4bf16.1k">
 def ROCDL_mfma_f32_4x4x4bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.4x4x4bf16.1k">;
 def ROCDL_mfma_f32_32x32x8bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x8bf16.1k">;
 def ROCDL_mfma_f32_16x16x16bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x16bf16.1k">;
-// Note: in gfx940, unlike in gfx90a, the f64 xdlops use the "blgp" argument as a
-// NEG bitfield. See IntrinsicsAMDGPU.td for more info.
+// Note: in gfx942, unlike in gfx90a, the f64 xdlops use the "blgp" argument as
+// a NEG bitfield. See IntrinsicsAMDGPU.td for more info.
 def ROCDL_mfma_f64_16x16x4f64 : ROCDL_Mfma_IntrOp<"mfma.f64.16x16x4f64">;
 def ROCDL_mfma_f64_4x4x4f64 : ROCDL_Mfma_IntrOp<"mfma.f64.4x4x4f64">;
-// New in gfx940.
+// New in gfx942.
 def ROCDL_mfma_i32_16x16x32_i8 : ROCDL_Mfma_IntrOp<"mfma.i32.16x16x32.i8">;
 def ROCDL_mfma_i32_32x32x16_i8 : ROCDL_Mfma_IntrOp<"mfma.i32.32x32x16.i8">;
 def ROCDL_mfma_f32_16x16x8_xf32 : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x8.xf32">;
@@ -409,7 +409,7 @@ def ROCDL_mfma_f32_32x32x16_f16 : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x16.f16">;
 def ROCDL_mfma_scale_f32_16x16x128_f8f6f4 : ROCDL_Mfma_OO_IntrOp<"mfma.scale.f32.16x16x128.f8f6f4", [0,1]>;
 def ROCDL_mfma_scale_f32_32x32x64_f8f6f4 : ROCDL_Mfma_OO_IntrOp<"mfma.scale.f32.32x32x64.f8f6f4", [0,1]>;
 
-// 2:4 Sparsity ops (GFX940)
+// 2:4 Sparsity ops (GFX942)
 def ROCDL_smfmac_f32_16x16x32_f16 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x32.f16">;
 def ROCDL_smfmac_f32_32x32x16_f16 : ROCDL_Mfma_IntrOp<"smfmac.f32.32x32x16.f16">;
 def ROCDL_smfmac_f32_16x16x32_bf16 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x32.bf16">;
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -80,7 +80,7 @@ namespace {
 // Define commonly used chipsets versions for convenience.
 constexpr Chipset kGfx908 = Chipset(9, 0, 8);
 constexpr Chipset kGfx90a = Chipset(9, 0, 0xa);
-constexpr Chipset kGfx940 = Chipset(9, 4, 0);
+constexpr Chipset kGfx942 = Chipset(9, 4, 2);
 
 /// Define lowering patterns for raw buffer ops
 template <typename GpuOp, typename Intrinsic>
@@ -483,7 +483,7 @@ static std::optional<StringRef> mfmaOpToIntrinsic(MFMAOp mfma,
     destElem = destType.getElementType();
 
   if (sourceElem.isF32() && destElem.isF32()) {
-    if (mfma.getReducePrecision() && chipset >= kGfx940) {
+    if (mfma.getReducePrecision() && chipset >= kGfx942) {
       if (m == 32 && n == 32 && k == 4 && b == 1)
         return ROCDL::mfma_f32_32x32x4_xf32::getOperationName();
       if (m == 16 && n == 16 && k == 8 && b == 1)
@@ -551,9 +551,9 @@ static std::optional<StringRef> mfmaOpToIntrinsic(MFMAOp mfma,
       return ROCDL::mfma_i32_32x32x8i8::getOperationName();
     if (m == 16 && n == 16 && k == 16 && b == 1)
       return ROCDL::mfma_i32_16x16x16i8::getOperationName();
-    if (m == 32 && n == 32 && k == 16 && b == 1 && chipset >= kGfx940)
+    if (m == 32 && n == 32 && k == 16 && b == 1 && chipset >= kGfx942)
       return ROCDL::mfma_i32_32x32x16_i8::getOperationName();
-    if (m == 16 && n == 16 && k == 32 && b == 1 && chipset >= kGfx940)
+    if (m == 16 && n == 16 && k == 32 && b == 1 && chipset >= kGfx942)
       return ROCDL::mfma_i32_16x16x32_i8::getOperationName();
   }
 
@@ -565,7 +565,7 @@ static std::optional<StringRef> mfmaOpToIntrinsic(MFMAOp mfma,
   }
 
   if (isa<Float8E5M2FNUZType>(sourceElem) && destElem.isF32() &&
-      chipset >= kGfx940) {
+      chipset >= kGfx942) {
     // Known to be correct because there are no scalar f8 instructions and
     // because a length mismatch will have been caught by the verifier.
     Type sourceBElem =
@@ -585,7 +585,7 @@ static std::optional<StringRef> mfmaOpToIntrinsic(MFMAOp mfma,
   }
 
   if (isa<Float8E4M3FNUZType>(sourceElem) && destElem.isF32() &&
-      chipset >= kGfx940) {
+      chipset >= kGfx942) {
     Type sourceBElem =
         cast<VectorType>(mfma.getSourceB().getType()).getElementType();
     if (m == 16 && n == 16 && k == 32 && b == 1) {
@@ -653,8 +653,8 @@ struct MFMAOpLowering : public ConvertOpToLLVMPattern<MFMAOp> {
       return op->emitOpError("MFMA only supported on gfx908+");
     uint32_t getBlgpField = static_cast<uint32_t>(op.getBlgp());
     if (op.getNegateA() || op.getNegateB() || op.getNegateC()) {
-      if (chipset < kGfx940)
-        return op.emitOpError("negation unsupported on older than gfx940");
+      if (chipset < kGfx942)
+        return op.emitOpError("negation unsupported on older than gfx942");
       getBlgpField |=
           op.getNegateA() | (op.getNegateB() << 1) | (op.getNegateC() << 2);
     }
@@ -775,7 +775,7 @@ LogicalResult ExtPackedFp8OpLowering::matchAndRewrite(
     ExtPackedFp8Op op, ExtPackedFp8OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
   Location loc = op.getLoc();
-  if (chipset.majorVersion != 9 || chipset < kGfx940)
+  if (chipset.majorVersion != 9 || chipset < kGfx942)
     return rewriter.notifyMatchFailure(
         loc, "Fp8 conversion instructions are not available on target "
              "architecture and their emulation is not implemented");
@@ -819,7 +819,7 @@ LogicalResult PackedTrunc2xFp8OpLowering::matchAndRewrite(
     PackedTrunc2xFp8Op op, PackedTrunc2xFp8OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
   Location loc = op.getLoc();
-  if (chipset.majorVersion != 9 || chipset < kGfx940)
+  if (chipset.majorVersion != 9 || chipset < kGfx942)
     return rewriter.notifyMatchFailure(
         loc, "Fp8 conversion instructions are not available on target "
              "architecture and their emulation is not implemented");
@@ -856,7 +856,7 @@ LogicalResult PackedStochRoundFp8OpLowering::matchAndRewrite(
     PackedStochRoundFp8Op op, PackedStochRoundFp8OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
   Location loc = op.getLoc();
-  if (chipset.majorVersion != 9 || chipset < kGfx940)
+  if (chipset.majorVersion != 9 || chipset < kGfx942)
     return rewriter.notifyMatchFailure(
         loc, "Fp8 conversion instructions are not available on target "
              "architecture and their emulation is not implemented");
diff --git a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
@@ -381,7 +381,7 @@ void ArithToAMDGPUConversionPass::runOnOperation() {
   }
 
   bool convertFP8Arithmetic =
-      maybeChipset->majorVersion == 9 && *maybeChipset >= Chipset(9, 4, 0);
+      maybeChipset->majorVersion == 9 && *maybeChipset >= Chipset(9, 4, 2);
   arith::populateArithToAMDGPUConversionPatterns(
       patterns, convertFP8Arithmetic, saturateFP8Truncf, allowPackedF16Rtz,
       *maybeChipset);
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp
@@ -179,7 +179,7 @@ void mlir::amdgpu::populateAmdgpuEmulateAtomicsPatterns(
   }
   // gfx9 has no to a very limited support for floating-point min and max.
   if (chipset.majorVersion == 9) {
-    if (chipset >= Chipset(9, 0, 0xa) && chipset != Chipset(9, 4, 1)) {
+    if (chipset >= Chipset(9, 0, 0xa)) {
       // gfx90a supports f64 max (and min, but we don't have a min wrapper right
       // now) but all other types need to be emulated.
       target.addDynamicallyLegalOp<RawBufferAtomicFmaxOp>(
@@ -189,12 +189,6 @@ void mlir::amdgpu::populateAmdgpuEmulateAtomicsPatterns(
     } else {
       target.addIllegalOp<RawBufferAtomicFmaxOp>();
     }
-    if (chipset == Chipset(9, 4, 1)) {
-      // gfx941 requires non-CAS atomics to be implemented with CAS loops.
-      // The workaround here mirrors HIP and OpenMP.
-      target.addIllegalOp<RawBufferAtomicFaddOp, RawBufferAtomicFmaxOp,
-                          RawBufferAtomicSmaxOp, RawBufferAtomicUminOp>();
-    }
   }
   patterns.add<
       RawBufferAtomicByCasPattern<RawBufferAtomicFaddOp, arith::AddFOp>,
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/8-bit-floats.mlir b/mlir/test/Conversion/AMDGPUToROCDL/8-bit-floats.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx940 | FileCheck %s
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx942 | FileCheck %s
 
 // CHECK-LABEL: func @ext_scalar
 // CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %{{.+}} : f8E5M2FNUZ to i8
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/mfma.mlir b/mlir/test/Conversion/AMDGPUToROCDL/mfma.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx940 -cse | FileCheck %s
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx942 -cse | FileCheck %s
 func.func @mfma_to_rocdl(%arg0 : f32, %arg1 : vector<32xf32>,
                     %arg2 : vector<16xf32>, %arg3 : vector<4xf32>,
                     %arg4 : vector<4xf16>, %arg5 : vector<4xi8>,
diff --git a/mlir/test/Conversion/ArithToAMDGPU/8-bit-float-saturation.mlir b/mlir/test/Conversion/ArithToAMDGPU/8-bit-float-saturation.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt --split-input-file %s \
-// RUN:   --pass-pipeline='builtin.module(func.func(convert-arith-to-amdgpu{chipset=gfx940 saturate-fp8-truncf=true}))' \
+// RUN:   --pass-pipeline='builtin.module(func.func(convert-arith-to-amdgpu{chipset=gfx942 saturate-fp8-truncf=true}))' \
 // RUN:   | FileCheck %s
 
 // CHECK-LABEL: func.func @scalar_trunc
diff --git a/mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir b/mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt --split-input-file %s -convert-arith-to-amdgpu="chipset=gfx940" | FileCheck %s
+// RUN: mlir-opt --split-input-file %s -convert-arith-to-amdgpu="chipset=gfx942" | FileCheck %s
 
 // CHECK-LABEL: func.func @scalar_ext
 // CHECK-SAME: ([[V:%.+]]: f8E5M2FNUZ)
diff --git a/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp b/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp
@@ -19,11 +19,11 @@ TEST(ChipsetTest, Parsing) {
   EXPECT_EQ(chipset->minorVersion, 0u);
   EXPECT_EQ(chipset->steppingVersion, 0xau);
 
-  chipset = Chipset::parse("gfx940");
+  chipset = Chipset::parse("gfx942");
   ASSERT_TRUE(succeeded(chipset));
   EXPECT_EQ(chipset->majorVersion, 9u);
   EXPECT_EQ(chipset->minorVersion, 4u);
-  EXPECT_EQ(chipset->steppingVersion, 0u);
+  EXPECT_EQ(chipset->steppingVersion, 2u);
 
   chipset = Chipset::parse("gfx1103");
   ASSERT_TRUE(succeeded(chipset));
@@ -36,30 +36,26 @@ TEST(ChipsetTest, ParsingInvalid) {
   EXPECT_TRUE(failed(Chipset::parse("navi33")));
   EXPECT_TRUE(failed(Chipset::parse("rdna2")));
   EXPECT_TRUE(failed(Chipset::parse("sm_80")));
-  EXPECT_TRUE(failed(Chipset::parse("GFX940")));
-  EXPECT_TRUE(failed(Chipset::parse("Gfx940")));
+  EXPECT_TRUE(failed(Chipset::parse("GFX942")));
+  EXPECT_TRUE(failed(Chipset::parse("Gfx942")));
   EXPECT_TRUE(failed(Chipset::parse("gfx9")));
-  EXPECT_TRUE(failed(Chipset::parse("gfx_940")));
-  EXPECT_TRUE(failed(Chipset::parse("gfx940_")));
+  EXPECT_TRUE(failed(Chipset::parse("gfx_942")));
+  EXPECT_TRUE(failed(Chipset::parse("gfx942_")));
   EXPECT_TRUE(failed(Chipset::parse("gfxmeow")));
   EXPECT_TRUE(failed(Chipset::parse("gfx1fff")));
 }
 
 TEST(ChipsetTest, Comparison) {
-  EXPECT_EQ(Chipset(9, 4, 0), Chipset(9, 4, 0));
-  EXPECT_NE(Chipset(9, 4, 0), Chipset(9, 4, 2));
+  EXPECT_EQ(Chipset(9, 4, 2), Chipset(9, 4, 2));
   EXPECT_NE(Chipset(9, 0, 0), Chipset(10, 0, 0));
 
   EXPECT_LT(Chipset(9, 0, 0), Chipset(10, 0, 0));
   EXPECT_LT(Chipset(9, 0, 0), Chipset(9, 4, 2));
-  EXPECT_LE(Chipset(9, 4, 1), Chipset(9, 4, 1));
   EXPECT_FALSE(Chipset(9, 4, 2) < Chipset(9, 4, 2));
-  EXPECT_FALSE(Chipset(9, 4, 2) < Chipset(9, 4, 0));
 
   EXPECT_GT(Chipset(9, 0, 0xa), Chipset(9, 0, 8));
   EXPECT_GE(Chipset(9, 0, 0xa), Chipset(9, 0, 0xa));
-  EXPECT_FALSE(Chipset(9, 4, 1) >= Chipset(9, 4, 2));
-  EXPECT_FALSE(Chipset(9, 0, 0xa) >= Chipset(9, 4, 0));
+  EXPECT_FALSE(Chipset(9, 0, 0xa) >= Chipset(9, 4, 2));
 }
 
 } // namespace

Original file line number	Diff line number	Diff line change
`@@ -381,7 +381,7 @@ void ArithToAMDGPUConversionPass::runOnOperation() {`
`381`	`381`	`}`
`382`	`382`
`383`	`383`	`bool convertFP8Arithmetic =`
`384`		`- maybeChipset->majorVersion == 9 && *maybeChipset >= Chipset(9, 4, 0);`
	`384`	`+ maybeChipset->majorVersion == 9 && *maybeChipset >= Chipset(9, 4, 2);`
`385`	`385`	`arith::populateArithToAMDGPUConversionPatterns(`
`386`	`386`	`patterns, convertFP8Arithmetic, saturateFP8Truncf, allowPackedF16Rtz,`
`387`	`387`	`*maybeChipset);`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx940 \| FileCheck %s`
	`1`	`+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx942 \| FileCheck %s`
`2`	`2`
`3`	`3`	`// CHECK-LABEL: func @ext_scalar`
`4`	`4`	`// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %{{.+}} : f8E5M2FNUZ to i8`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-// RUN: mlir-opt --split-input-file %s -convert-arith-to-amdgpu="chipset=gfx940" \| FileCheck %s`
	`1`	`+// RUN: mlir-opt --split-input-file %s -convert-arith-to-amdgpu="chipset=gfx942" \| FileCheck %s`
`2`	`2`
`3`	`3`	`// CHECK-LABEL: func.func @scalar_ext`
`4`	`4`	`// CHECK-SAME: ([[V:%.+]]: f8E5M2FNUZ)`