From 1262acf4ecc9f55d0699705c7810bbf84d3da09e Mon Sep 17 00:00:00 2001 From: Tom Tromey Date: Tue, 18 Nov 2025 15:59:36 -0700 Subject: [PATCH 01/15] Introduce DwarfUnit::addBlock helper method (#168446) This patch is just a small cleanup that unifies the various spots that add a DWARF expression to the output. --- llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp | 86 ++++++----------------- llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h | 3 + 2 files changed, 24 insertions(+), 65 deletions(-) diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 1666a0e36b39a..e1a231e02aeb2 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -441,6 +441,15 @@ void DwarfUnit::addBlock(DIE &Die, dwarf::Attribute Attribute, addBlock(Die, Attribute, Block->BestForm(), Block); } +void DwarfUnit::addBlock(DIE &Die, dwarf::Attribute Attribute, + const DIExpression *Expr) { + DIELoc *Loc = new (DIEValueAllocator) DIELoc; + DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); + DwarfExpr.setMemoryLocationKind(); + DwarfExpr.addExpression(Expr); + addBlock(Die, Attribute, DwarfExpr.finalize()); +} + void DwarfUnit::addSourceLine(DIE &Die, unsigned Line, unsigned Column, const DIFile *File) { if (Line == 0) @@ -824,27 +833,14 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIStringType *STy) { if (auto *VarDIE = getDIE(Var)) addDIEEntry(Buffer, dwarf::DW_AT_string_length, *VarDIE); } else if (DIExpression *Expr = STy->getStringLengthExp()) { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - // This is to describe the memory location of the - // length of a Fortran deferred length string, so - // lock it down as such. 
- DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(Expr); - addBlock(Buffer, dwarf::DW_AT_string_length, DwarfExpr.finalize()); + addBlock(Buffer, dwarf::DW_AT_string_length, Expr); } else { uint64_t Size = STy->getSizeInBits() >> 3; addUInt(Buffer, dwarf::DW_AT_byte_size, std::nullopt, Size); } if (DIExpression *Expr = STy->getStringLocationExp()) { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - // This is to describe the memory location of the - // string, so lock it down as such. - DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(Expr); - addBlock(Buffer, dwarf::DW_AT_data_location, DwarfExpr.finalize()); + addBlock(Buffer, dwarf::DW_AT_data_location, Expr); } if (STy->getEncoding()) { @@ -1207,11 +1203,7 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) { addDIEEntry(Buffer, dwarf::DW_AT_bit_size, *VarDIE); } else if (auto *Exp = dyn_cast_or_null(CTy->getRawSizeInBits())) { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(Exp); - addBlock(Buffer, dwarf::DW_AT_bit_size, DwarfExpr.finalize()); + addBlock(Buffer, dwarf::DW_AT_bit_size, Exp); } else { uint64_t Size = CTy->getSizeInBits() >> 3; // Add size if non-zero (derived types might be zero-sized.) 
@@ -1607,11 +1599,7 @@ void DwarfUnit::constructSubrangeDIE(DIE &DW_Subrange, const DISubrangeType *SR, if (auto *VarDIE = getDIE(BV)) addDIEEntry(DW_Subrange, Attr, *VarDIE); } else if (auto *BE = dyn_cast_if_present(Bound)) { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(BE); - addBlock(DW_Subrange, Attr, DwarfExpr.finalize()); + addBlock(DW_Subrange, Attr, BE); } else if (auto *BI = dyn_cast_if_present(Bound)) { if (Attr == dwarf::DW_AT_GNU_bias) { if (BI->getSExtValue() != 0) @@ -1649,11 +1637,7 @@ void DwarfUnit::constructSubrangeDIE(DIE &Buffer, const DISubrange *SR) { if (auto *VarDIE = getDIE(BV)) addDIEEntry(DW_Subrange, Attr, *VarDIE); } else if (auto *BE = dyn_cast_if_present(Bound)) { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(BE); - addBlock(DW_Subrange, Attr, DwarfExpr.finalize()); + addBlock(DW_Subrange, Attr, BE); } else if (auto *BI = dyn_cast_if_present(Bound)) { if (Attr == dwarf::DW_AT_count) { if (BI->getSExtValue() != -1) @@ -1699,11 +1683,7 @@ void DwarfUnit::constructGenericSubrangeDIE(DIE &Buffer, addSInt(DwGenericSubrange, Attr, dwarf::DW_FORM_sdata, BE->getElement(1)); } else { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(BE); - addBlock(DwGenericSubrange, Attr, DwarfExpr.finalize()); + addBlock(DwGenericSubrange, Attr, BE); } } }; @@ -1770,44 +1750,28 @@ void DwarfUnit::constructArrayTypeDIE(DIE &Buffer, const DICompositeType *CTy) { if (auto *VarDIE = getDIE(Var)) addDIEEntry(Buffer, dwarf::DW_AT_data_location, *VarDIE); } else if (DIExpression *Expr = CTy->getDataLocationExp()) { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - 
DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(Expr); - addBlock(Buffer, dwarf::DW_AT_data_location, DwarfExpr.finalize()); + addBlock(Buffer, dwarf::DW_AT_data_location, Expr); } if (DIVariable *Var = CTy->getAssociated()) { if (auto *VarDIE = getDIE(Var)) addDIEEntry(Buffer, dwarf::DW_AT_associated, *VarDIE); } else if (DIExpression *Expr = CTy->getAssociatedExp()) { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(Expr); - addBlock(Buffer, dwarf::DW_AT_associated, DwarfExpr.finalize()); + addBlock(Buffer, dwarf::DW_AT_associated, Expr); } if (DIVariable *Var = CTy->getAllocated()) { if (auto *VarDIE = getDIE(Var)) addDIEEntry(Buffer, dwarf::DW_AT_allocated, *VarDIE); } else if (DIExpression *Expr = CTy->getAllocatedExp()) { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(Expr); - addBlock(Buffer, dwarf::DW_AT_allocated, DwarfExpr.finalize()); + addBlock(Buffer, dwarf::DW_AT_allocated, Expr); } if (auto *RankConst = CTy->getRankConst()) { addSInt(Buffer, dwarf::DW_AT_rank, dwarf::DW_FORM_sdata, RankConst->getSExtValue()); } else if (auto *RankExpr = CTy->getRankExp()) { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(RankExpr); - addBlock(Buffer, dwarf::DW_AT_rank, DwarfExpr.finalize()); + addBlock(Buffer, dwarf::DW_AT_rank, RankExpr); } if (auto *BitStride = CTy->getBitStrideConst()) { @@ -1917,11 +1881,7 @@ DIE &DwarfUnit::constructMemberDIE(DIE &Buffer, const DIDerivedType *DT) { if (auto *VarDIE = getDIE(Var)) addDIEEntry(MemberDie, dwarf::DW_AT_bit_size, *VarDIE); } else if (auto *Exp = dyn_cast(DT->getRawSizeInBits())) { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression 
DwarfExpr(*Asm, getCU(), *Loc); - DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(Exp); - addBlock(MemberDie, dwarf::DW_AT_bit_size, DwarfExpr.finalize()); + addBlock(MemberDie, dwarf::DW_AT_bit_size, Exp); } else { Size = DT->getSizeInBits(); FieldSize = DD->getBaseTypeSize(DT); @@ -1945,11 +1905,7 @@ DIE &DwarfUnit::constructMemberDIE(DIE &Buffer, const DIDerivedType *DT) { } else if (auto *Expr = dyn_cast_or_null(DT->getRawOffsetInBits())) { if (!Asm->TM.Options.DebugStrictDwarf || DD->getDwarfVersion() >= 6) { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(Expr); - addBlock(MemberDie, dwarf::DW_AT_data_bit_offset, DwarfExpr.finalize()); + addBlock(MemberDie, dwarf::DW_AT_data_bit_offset, Expr); } } else { uint32_t AlignInBytes = DT->getAlignInBytes(); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h index 9c0b68b315b50..740e5a80ca619 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h @@ -216,6 +216,9 @@ class DwarfUnit : public DIEUnit { void addBlock(DIE &Die, dwarf::Attribute Attribute, dwarf::Form Form, DIEBlock *Block); + /// Add an expression as block data. + void addBlock(DIE &Die, dwarf::Attribute Attribute, const DIExpression *Expr); + /// Add location information to specified debug information entry. void addSourceLine(DIE &Die, unsigned Line, unsigned Column, const DIFile *File); From 0a96b240fcb715c082ab9b4cab6fddae02065602 Mon Sep 17 00:00:00 2001 From: Razvan Lupusoru Date: Tue, 18 Nov 2025 16:04:11 -0800 Subject: [PATCH 02/15] [mlir][acc][flang] Introduce OpenACC interfaces for globals (#168614) Introduce two new OpenACC operation interfaces for identifying global variables and their address computations: - `GlobalVariableOpInterface`: Identifies operations that define global variables. 
Provides an `isConstant()` method to query whether the global is constant. - `AddressOfGlobalOpInterface`: Identifies operations that compute the address of a global variable. Provides a `getSymbol()` method to retrieve the symbol reference. This is being done in preparation for `ACCImplicitDeclare` pass which will automatically ensure that `acc declare` is applied to globals when needed. The following operations now implement these interfaces: - `memref::GlobalOp` implements `GlobalVariableOpInterface` - `memref::GetGlobalOp` implements `AddressOfGlobalOpInterface` - `fir::GlobalOp` implements `GlobalVariableOpInterface` - `fir::AddrOfOp` implements `AddressOfGlobalOpInterface` --- .../OpenACC/Support/FIROpenACCOpsInterfaces.h | 14 +++ .../Support/FIROpenACCOpsInterfaces.cpp | 9 ++ .../Support/RegisterOpenACCExtensions.cpp | 3 + .../Dialect/OpenACC/OpenACCOpsInterfaces.td | 31 ++++++ mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp | 23 +++++ mlir/unittests/Dialect/OpenACC/CMakeLists.txt | 1 + .../OpenACC/OpenACCOpsInterfacesTest.cpp | 95 +++++++++++++++++++ 7 files changed, 176 insertions(+) create mode 100644 mlir/unittests/Dialect/OpenACC/OpenACCOpsInterfacesTest.cpp diff --git a/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h b/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h index 7afe97aac57e8..bf87654979cc9 100644 --- a/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h +++ b/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h @@ -16,7 +16,9 @@ #include "mlir/Dialect/OpenACC/OpenACC.h" namespace fir { +class AddrOfOp; class DeclareOp; +class GlobalOp; } // namespace fir namespace hlfir { @@ -53,6 +55,18 @@ struct PartialEntityAccessModel bool isCompleteView(mlir::Operation *op) const; }; +struct AddressOfGlobalModel + : public mlir::acc::AddressOfGlobalOpInterface::ExternalModel< + AddressOfGlobalModel, fir::AddrOfOp> { + mlir::SymbolRefAttr getSymbol(mlir::Operation *op) const; 
+}; + +struct GlobalVariableModel + : public mlir::acc::GlobalVariableOpInterface::ExternalModel< + GlobalVariableModel, fir::GlobalOp> { + bool isConstant(mlir::Operation *op) const; +}; + } // namespace fir::acc #endif // FLANG_OPTIMIZER_OPENACC_FIROPENACC_OPS_INTERFACES_H_ diff --git a/flang/lib/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.cpp b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.cpp index c1734be5185f4..11fbaf2dc2bb8 100644 --- a/flang/lib/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.cpp +++ b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.cpp @@ -59,4 +59,13 @@ bool PartialEntityAccessModel::isCompleteView( return !getBaseEntity(op); } +mlir::SymbolRefAttr AddressOfGlobalModel::getSymbol(mlir::Operation *op) const { + return mlir::cast(op).getSymbolAttr(); +} + +bool GlobalVariableModel::isConstant(mlir::Operation *op) const { + auto globalOp = mlir::cast(op); + return globalOp.getConstant().has_value(); +} + } // namespace fir::acc diff --git a/flang/lib/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.cpp b/flang/lib/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.cpp index d71c40dfac03c..5c7f9985d41ca 100644 --- a/flang/lib/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.cpp +++ b/flang/lib/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.cpp @@ -49,6 +49,9 @@ void registerOpenACCExtensions(mlir::DialectRegistry ®istry) { PartialEntityAccessModel>(*ctx); fir::DeclareOp::attachInterface>( *ctx); + + fir::AddrOfOp::attachInterface(*ctx); + fir::GlobalOp::attachInterface(*ctx); }); // Register HLFIR operation interfaces diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td index 054c13a88a552..6b0c84d31d1ba 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td @@ -44,4 +44,35 @@ def PartialEntityAccessOpInterface : 
OpInterface<"PartialEntityAccessOpInterface ]; } +def AddressOfGlobalOpInterface : OpInterface<"AddressOfGlobalOpInterface"> { + let cppNamespace = "::mlir::acc"; + + let description = [{ + An interface for operations that compute the address of a global variable + or symbol. + }]; + + let methods = [ + InterfaceMethod<"Get the symbol reference to the global", "::mlir::SymbolRefAttr", + "getSymbol", (ins)>, + ]; +} + +def GlobalVariableOpInterface : OpInterface<"GlobalVariableOpInterface"> { + let cppNamespace = "::mlir::acc"; + + let description = [{ + An interface for operations that define global variables. This interface + provides a uniform way to query properties of global variables across + different dialects. + }]; + + let methods = [ + InterfaceMethod<"Check if the global variable is constant", "bool", + "isConstant", (ins), [{ + return false; + }]>, + ]; +} + #endif // OPENACC_OPS_INTERFACES diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp index 8c9c137b8aebb..5749e6ded73ba 100644 --- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp +++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp @@ -211,6 +211,24 @@ struct LLVMPointerPointerLikeModel Type getElementType(Type pointer) const { return Type(); } }; +struct MemrefAddressOfGlobalModel + : public AddressOfGlobalOpInterface::ExternalModel< + MemrefAddressOfGlobalModel, memref::GetGlobalOp> { + SymbolRefAttr getSymbol(Operation *op) const { + auto getGlobalOp = cast(op); + return getGlobalOp.getNameAttr(); + } +}; + +struct MemrefGlobalVariableModel + : public GlobalVariableOpInterface::ExternalModel { + bool isConstant(Operation *op) const { + auto globalOp = cast(op); + return globalOp.getConstant(); + } +}; + /// Helper function for any of the times we need to modify an ArrayAttr based on /// a device type list. 
Returns a new ArrayAttr with all of the /// existingDeviceTypes, plus the effective new ones(or an added none if hte new @@ -302,6 +320,11 @@ void OpenACCDialect::initialize() { MemRefPointerLikeModel>(*getContext()); LLVM::LLVMPointerType::attachInterface( *getContext()); + + // Attach operation interfaces + memref::GetGlobalOp::attachInterface( + *getContext()); + memref::GlobalOp::attachInterface(*getContext()); } //===----------------------------------------------------------------------===// diff --git a/mlir/unittests/Dialect/OpenACC/CMakeLists.txt b/mlir/unittests/Dialect/OpenACC/CMakeLists.txt index 177c8680b0040..c8c2bb96b0539 100644 --- a/mlir/unittests/Dialect/OpenACC/CMakeLists.txt +++ b/mlir/unittests/Dialect/OpenACC/CMakeLists.txt @@ -1,5 +1,6 @@ add_mlir_unittest(MLIROpenACCTests OpenACCOpsTest.cpp + OpenACCOpsInterfacesTest.cpp OpenACCUtilsTest.cpp ) mlir_target_link_libraries(MLIROpenACCTests diff --git a/mlir/unittests/Dialect/OpenACC/OpenACCOpsInterfacesTest.cpp b/mlir/unittests/Dialect/OpenACC/OpenACCOpsInterfacesTest.cpp new file mode 100644 index 0000000000000..261f5c513ea24 --- /dev/null +++ b/mlir/unittests/Dialect/OpenACC/OpenACCOpsInterfacesTest.cpp @@ -0,0 +1,95 @@ +//===- OpenACCOpsInterfacesTest.cpp - Unit tests for OpenACC interfaces --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/OpenACC/OpenACC.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/OwningOpRef.h" +#include "gtest/gtest.h" + +using namespace mlir; +using namespace mlir::acc; + +//===----------------------------------------------------------------------===// +// Test Fixture +//===----------------------------------------------------------------------===// + +class OpenACCOpsInterfacesTest : public ::testing::Test { +protected: + OpenACCOpsInterfacesTest() + : context(), builder(&context), loc(UnknownLoc::get(&context)) { + context.loadDialect(); + } + + MLIRContext context; + OpBuilder builder; + Location loc; +}; + +//===----------------------------------------------------------------------===// +// GlobalVariableOpInterface Tests +//===----------------------------------------------------------------------===// + +TEST_F(OpenACCOpsInterfacesTest, GlobalVariableOpInterfaceNonConstant) { + // Test that a non-constant global returns false for isConstant() + + auto memrefType = MemRefType::get({10}, builder.getF32Type()); + OwningOpRef globalOp = memref::GlobalOp::create( + builder, loc, + /*sym_name=*/builder.getStringAttr("mutable_global"), + /*sym_visibility=*/builder.getStringAttr("private"), + /*type=*/TypeAttr::get(memrefType), + /*initial_value=*/Attribute(), + /*constant=*/UnitAttr(), + /*alignment=*/IntegerAttr()); + + auto globalVarIface = + dyn_cast(globalOp->getOperation()); + ASSERT_TRUE(globalVarIface != nullptr); + EXPECT_FALSE(globalVarIface.isConstant()); +} + +TEST_F(OpenACCOpsInterfacesTest, GlobalVariableOpInterfaceConstant) { + // Test that a constant global returns true for isConstant() + + auto memrefType = MemRefType::get({5}, builder.getI32Type()); + OwningOpRef 
constantGlobalOp = memref::GlobalOp::create( + builder, loc, + /*sym_name=*/builder.getStringAttr("constant_global"), + /*sym_visibility=*/builder.getStringAttr("public"), + /*type=*/TypeAttr::get(memrefType), + /*initial_value=*/Attribute(), + /*constant=*/builder.getUnitAttr(), + /*alignment=*/IntegerAttr()); + + auto globalVarIface = + dyn_cast(constantGlobalOp->getOperation()); + ASSERT_TRUE(globalVarIface != nullptr); + EXPECT_TRUE(globalVarIface.isConstant()); +} + +//===----------------------------------------------------------------------===// +// AddressOfGlobalOpInterface Tests +//===----------------------------------------------------------------------===// + +TEST_F(OpenACCOpsInterfacesTest, AddressOfGlobalOpInterfaceGetSymbol) { + // Test that getSymbol() returns the correct symbol reference + + auto memrefType = MemRefType::get({5}, builder.getI32Type()); + const auto *symbolName = "test_global_symbol"; + + OwningOpRef getGlobalOp = memref::GetGlobalOp::create( + builder, loc, memrefType, FlatSymbolRefAttr::get(&context, symbolName)); + + auto addrOfGlobalIface = + dyn_cast(getGlobalOp->getOperation()); + ASSERT_TRUE(addrOfGlobalIface != nullptr); + EXPECT_EQ(addrOfGlobalIface.getSymbol().getLeafReference(), symbolName); +} From 411c75210e2326f7d6926ae4a303e05c1d0eab9d Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Wed, 19 Nov 2025 11:19:04 +1100 Subject: [PATCH 03/15] [orc-rt] Fix typos in file comments. 
--- orc-rt/lib/executor/TaskDispatcher.cpp | 4 ++-- orc-rt/lib/executor/ThreadPoolTaskDispatcher.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/orc-rt/lib/executor/TaskDispatcher.cpp b/orc-rt/lib/executor/TaskDispatcher.cpp index 5f34627fb5150..9e42a66c2ea94 100644 --- a/orc-rt/lib/executor/TaskDispatcher.cpp +++ b/orc-rt/lib/executor/TaskDispatcher.cpp @@ -1,4 +1,4 @@ -//===- TaskDispatch.cpp ---------------------------------------------------===// +//===- TaskDispatcher.cpp -------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// Contains the implementation of APIs in the orc-rt/TaskDispatch.h header. +// Contains the implementation of APIs in the orc-rt/TaskDispatcher.h header. // //===----------------------------------------------------------------------===// diff --git a/orc-rt/lib/executor/ThreadPoolTaskDispatcher.cpp b/orc-rt/lib/executor/ThreadPoolTaskDispatcher.cpp index d6d301302220d..4bf7e5df69654 100644 --- a/orc-rt/lib/executor/ThreadPoolTaskDispatcher.cpp +++ b/orc-rt/lib/executor/ThreadPoolTaskDispatcher.cpp @@ -1,4 +1,4 @@ -//===- ThreadPoolTaskDispatch.cpp -----------------------------------------===// +//===- ThreadPoolTaskDispatcher.cpp ---------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// Contains the implementation of APIs in the orc-rt/ThreadPoolTaskDispatch.h +// Contains the implementation of APIs in the orc-rt/ThreadPoolTaskDispatcher.h // header. 
// //===----------------------------------------------------------------------===// From 651785a5bacb9bba2c9dbcbb6e21e28135937129 Mon Sep 17 00:00:00 2001 From: Pranav Kant Date: Tue, 18 Nov 2025 16:21:47 -0800 Subject: [PATCH 04/15] Fix #168367 (#168635) --- utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index b027d82d98177..643f8ab03f724 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -2874,6 +2874,10 @@ llvm_target_lib_list = [lib for lib in [ ["-gen-subtarget"], "lib/Target/NVPTX/NVPTXGenSubtargetInfo.inc", ), + ( + ["-gen-sd-node-info"], + "lib/Target/NVPTX/NVPTXGenSDNodeInfo.inc", + ), ], }, { From 7819071c41273e603d1fe1f3e8ab0b11c356a899 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Tue, 18 Nov 2025 16:40:26 -0800 Subject: [PATCH 05/15] workflows/release-binaries: Drop install-ninja action (#167070) ninja is already installed by default on Linux and macOS. 
--- .github/workflows/release-binaries.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml index 8b6656834cc06..64f371e9f8db8 100644 --- a/.github/workflows/release-binaries.yml +++ b/.github/workflows/release-binaries.yml @@ -188,9 +188,6 @@ jobs: with: ref: ${{ needs.prepare.outputs.ref }} - - name: Install Ninja - uses: llvm/actions/install-ninja@5dd955034a6742a2e21d82bf165fcb1050ae7b49 # main - - name: Set Build Prefix id: setup-stage shell: bash From c32d2ee4659170d281d0d89a1d396267e36fc7da Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Tue, 18 Nov 2025 16:46:30 -0800 Subject: [PATCH 06/15] [NFC][TableGen] Adopt CodeGenHelpers in CodeGenMapTable (#168592) Adopt `IfDefEmitter` and `NamespaceEmitter` in CodeGenMapTable.cpp --- llvm/utils/TableGen/CodeGenMapTable.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/llvm/utils/TableGen/CodeGenMapTable.cpp b/llvm/utils/TableGen/CodeGenMapTable.cpp index e5025784d304d..35ec495b93ba2 100644 --- a/llvm/utils/TableGen/CodeGenMapTable.cpp +++ b/llvm/utils/TableGen/CodeGenMapTable.cpp @@ -80,6 +80,7 @@ #include "TableGenBackends.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/TableGen/CodeGenHelpers.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" @@ -549,9 +550,8 @@ void llvm::EmitMapTable(const RecordKeeper &Records, raw_ostream &OS) { if (InstrMapVec.empty()) return; - OS << "#ifdef GET_INSTRMAP_INFO\n"; - OS << "#undef GET_INSTRMAP_INFO\n"; - OS << "namespace llvm::" << NameSpace << " {\n\n"; + IfDefEmitter IfDef(OS, "GET_INSTRMAP_INFO"); + NamespaceEmitter NS(OS, ("llvm::" + NameSpace).str()); // Emit coulumn field names and their values as enums. emitEnums(OS, Records); @@ -574,6 +574,4 @@ void llvm::EmitMapTable(const RecordKeeper &Records, raw_ostream &OS) { // Emit map tables and the functions to query them. 
IMap.emitTablesWithFunc(OS); } - OS << "} // end namespace llvm::" << NameSpace << '\n'; - OS << "#endif // GET_INSTRMAP_INFO\n\n"; } From 88efd0e88b8dafe9dd5bc118895750dd7413f811 Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Tue, 18 Nov 2025 16:53:18 -0800 Subject: [PATCH 07/15] [LTT] Mark as unknown weak function tests. (#167399) We don't have enough information to infer the probability of a weak function pointer being nullptr or not (open question if we could propagate this from the linker) Issue #147390 --- llvm/lib/Transforms/IPO/LowerTypeTests.cpp | 3 +++ llvm/test/Transforms/LowerTypeTests/function-weak.ll | 5 +++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index 94663ff928a0b..fa35eef2c00f5 100644 --- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -1469,6 +1469,9 @@ void LowerTypeTestsModule::replaceWeakDeclarationWithJumpTablePtr( Constant::getNullValue(F->getType())); Value *Select = Builder.CreateSelect(ICmp, JT, Constant::getNullValue(F->getType())); + + if (auto *SI = dyn_cast(Select)) + setExplicitlyUnknownBranchWeightsIfProfiled(*SI, DEBUG_TYPE); // For phi nodes, we need to update the incoming value for all operands // with the same predecessor. 
if (PN) diff --git a/llvm/test/Transforms/LowerTypeTests/function-weak.ll b/llvm/test/Transforms/LowerTypeTests/function-weak.ll index 4ea03b6c2c1fa..dbbe8fa4a0a9a 100644 --- a/llvm/test/Transforms/LowerTypeTests/function-weak.ll +++ b/llvm/test/Transforms/LowerTypeTests/function-weak.ll @@ -32,10 +32,10 @@ target triple = "x86_64-unknown-linux-gnu" declare !type !0 extern_weak void @f() ; CHECK: define zeroext i1 @check_f() -define zeroext i1 @check_f() { +define zeroext i1 @check_f() !prof !{!"function_entry_count", i32 10} { entry: ; CHECK: [[CMP:%.*]] = icmp ne ptr @f, null -; CHECK: [[SEL:%.*]] = select i1 [[CMP]], ptr @[[JT:.*]], ptr null +; CHECK: [[SEL:%.*]] = select i1 [[CMP]], ptr @[[JT:.*]], ptr null, !prof ![[SELPROF:[0-9]+]] ; CHECK: [[PTI:%.*]] = ptrtoint ptr [[SEL]] to i1 ; CHECK: ret i1 [[PTI]] ret i1 ptrtoint (ptr @f to i1) @@ -165,3 +165,4 @@ define i1 @foo(ptr %p) { ; CHECK-NEXT: } !0 = !{i32 0, !"typeid1"} +; CHECK: ![[SELPROF]] = !{!"unknown", !"lowertypetests"} \ No newline at end of file From 9a15556d6d6b207084bea8f02381b0459624a006 Mon Sep 17 00:00:00 2001 From: Scott Manley Date: Tue, 18 Nov 2025 18:56:24 -0600 Subject: [PATCH 08/15] [OpenACC] add cl::values to ACCImplicitRoutineOptions (#168601) Add the cl::values to the pass options so an assert is not reached when trying to generate a reproducer e.g. "unknown data value for option" --- mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td b/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td index 970d9304d8289..cad78df2fbb0b 100644 --- a/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td @@ -97,7 +97,14 @@ def ACCImplicitRoutine : Pass<"acc-implicit-routine", "mlir::ModuleOp"> { "mlir::acc::DeviceType::None", "Target device type for implicit routine generation. 
" - "Ensures that `acc routine` device_type clauses are " - "properly considered not just default clauses."> + "properly considered not just default clauses.", + [{::llvm::cl::values( + clEnumValN(mlir::acc::DeviceType::None, "none", "none"), + clEnumValN(mlir::acc::DeviceType::Host, "host", "host"), + clEnumValN(mlir::acc::DeviceType::Multicore, "multicore", "multicore"), + clEnumValN(mlir::acc::DeviceType::Nvidia, "nvidia", "nvidia"), + clEnumValN(mlir::acc::DeviceType::Radeon, "radeon", "radeon")) + }]> ]; } From 522177c959ed7ec99a237387ef41aa1e250410e8 Mon Sep 17 00:00:00 2001 From: Pradeep Kumar Date: Wed, 19 Nov 2025 06:50:54 +0530 Subject: [PATCH 09/15] [NVPTX] Add a few more missing fence intrinsics (#166352) This commit adds the below fence intrinsics: - llvm.nvvm.fence.acquire.sync_restrict.space.cluster.scope.cluster - llvm.nvvm.fence.release.sync_restrict.space.cta.scope.cluster - llvm.nvvm.fence.mbarrier_init.release.cluster - llvm.nvvm.fence.proxy.async_generic.acquire.sync_restrict.space.cluster.scope.cluster - llvm.nvvm.fence.proxy.async_generic.release.sync_restrict.space.cta.scope.cluster - llvm.nvvm.fence.proxy.alias - llvm.nvvm.fence.proxy.async - llvm.nvvm.fence.proxy.async.global - llvm.nvvm.fence.proxy.async.shared_cluster - llvm.nvvm.fence.proxy.async.shared_cta For more information, please refer to the [PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-membar) --- llvm/docs/NVPTXUsage.rst | 106 ++++++++++++++++++ llvm/include/llvm/IR/IntrinsicsNVVM.td | 58 +++++++--- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 35 ++++++ .../CodeGen/NVPTX/fence-proxy-sm90-ptx86.ll | 27 +++++ llvm/test/CodeGen/NVPTX/fence-proxy-sm90.ll | 51 +++++++++ .../NVPTX/fence-proxy-tensormap-invalid.ll | 8 ++ llvm/test/CodeGen/NVPTX/fence-proxy.ll | 15 +++ llvm/test/CodeGen/NVPTX/op-fence.ll | 17 +++ llvm/test/CodeGen/NVPTX/thread-fence.ll | 31 +++++ 9 files changed, 335 insertions(+), 13 deletions(-) 
create mode 100644 llvm/test/CodeGen/NVPTX/fence-proxy-sm90-ptx86.ll create mode 100644 llvm/test/CodeGen/NVPTX/fence-proxy-sm90.ll create mode 100644 llvm/test/CodeGen/NVPTX/fence-proxy-tensormap-invalid.ll create mode 100644 llvm/test/CodeGen/NVPTX/fence-proxy.ll create mode 100644 llvm/test/CodeGen/NVPTX/op-fence.ll create mode 100644 llvm/test/CodeGen/NVPTX/thread-fence.ll diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index f2b168f6cb0e3..5f7fb00889655 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -796,6 +796,112 @@ every time. For more information, refer PTX ISA Membar/Fences ------------- +'``llvm.nvvm.fence.acquire/release.sync_restrict.*``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare void @llvm.nvvm.fence.acquire.sync_restrict.space.cluster.scope.cluster() + declare void @llvm.nvvm.fence.release.sync_restrict.space.cta.scope.cluster() + +Overview: +""""""""" + +The `nvvm.fence.{semantics}.sync_restrict.*` intrinsics restrict the class of memory +operations for which the fence instruction provides the memory ordering guarantees. +When `.sync_restrict` is restricted to `shared_cta`, then memory semantics must +be `release` and the effect of the fence operation only applies to operations +performed on objects in `shared_cta` space. Likewise, when `sync_restrict` is +restricted to `shared_cluster`, then memory semantics must be `acquire` and the +effect of the fence operation only applies to operations performed on objects in +`shared_cluster` memory space. The scope for both operations is `cluster`. For more details, +please refer to the `PTX ISA <https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-membar>`__ + +'``llvm.nvvm.fence.mbarrier_init.release.cluster``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. 
code-block:: llvm + + declare void @llvm.nvvm.fence.mbarrier_init.release.cluster() + +Overview: +""""""""" + +`nvvm.fence.mbarrier_init.release.cluster` intrinsic restricts the class of +memory operations for which the fence instruction provides the memory ordering +guarantees. The `mbarrier_init` modifier restricts the synchronizing effect to +the prior `mbarrier_init` operation executed by the same thread on mbarrier objects +in `shared_cta` memory space. For more details, please refer to the `PTX ISA <https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-membar>`__ + +'``llvm.nvvm.fence.proxy.async_generic.acquire/release.sync_restrict``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare void @llvm.nvvm.fence.proxy.async_generic.acquire.sync_restrict.space.cluster.scope.cluster() + declare void @llvm.nvvm.fence.proxy.async_generic.release.sync_restrict.space.cta.scope.cluster() + +Overview: +""""""""" + +`nvvm.fence.proxy.async_generic.{semantics}.sync_restrict` intrinsics are used to establish +ordering between a prior memory access performed via the `async proxy` +and a subsequent memory access performed via the generic proxy. +``nvvm.fence.proxy.async_generic.release.sync_restrict`` can form a release +sequence that synchronizes with an acquire sequence that contains the +``nvvm.fence.proxy.async_generic.acquire.sync_restrict`` proxy fence. When +`.sync_restrict` is restricted to `shared_cta`, then memory semantics must +be `release` and the effect of the fence operation only applies to operations +performed on objects in `shared_cta` space. Likewise, when `sync_restrict` is +restricted to `shared_cluster`, then memory semantics must be `acquire` and the +effect of the fence operation only applies to operations performed on objects in +`shared_cluster` memory space. The scope for both operations is `cluster`. 
+For more details, please refer to the `PTX ISA `__ + +'``llvm.nvvm.fence.proxy.<proxykind>``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare void @llvm.nvvm.fence.proxy.alias() + declare void @llvm.nvvm.fence.proxy.async() + declare void @llvm.nvvm.fence.proxy.async.global() + declare void @llvm.nvvm.fence.proxy.async.shared_cluster() + declare void @llvm.nvvm.fence.proxy.async.shared_cta() + +Overview: +""""""""" + +`nvvm.fence.proxy.{proxykind}` intrinsics represent a fence with bi-directional +proxy ordering that is established between the memory accesses performed via the +`generic proxy__` +and the proxy specified by `proxykind`. A `bi-directional proxy` ordering between +two proxykinds establishes two `uni-directional` proxy orderings: one from the +first proxykind to the second proxykind and the other from the second proxykind +to the first proxykind. + +`alias` proxykind refers to memory accesses performed using virtually aliased +addresses to the same memory location. + +`async` proxykind specifies that the memory ordering is established between the +`async proxy` and the `generic proxy`. The memory ordering is limited only to +operations performed on objects in the state space specified (`generic`, `global`, +`shared_cluster`, `shared_cta`). If no state space is specified, then the memory +ordering applies to all state spaces.
For more details, please refer the +`PTX ISA `__ + '``llvm.nvvm.fence.proxy.tensormap_generic.*``' ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 1b485dc8ccd1e..c8e776e6499df 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -1746,33 +1746,65 @@ let TargetPrefix = "nvvm" in { def int_nvvm_barrier_cluster_wait_aligned : Intrinsic<[]>; } - // - // Membar - // - let IntrProperties = [IntrNoCallback] in { +// +// Membar / Fence +// +let IntrProperties = [IntrNoCallback] in { def int_nvvm_membar_cta : NVVMBuiltin, Intrinsic<[]>; def int_nvvm_membar_gl : NVVMBuiltin, Intrinsic<[]>; def int_nvvm_membar_sys : NVVMBuiltin, Intrinsic<[]>; def int_nvvm_fence_sc_cluster : Intrinsic<[]>; - } - // - // Proxy fence (uni-directional) - // + // Operation fence + def int_nvvm_fence_mbarrier_init_release_cluster: Intrinsic<[], [], [], + "llvm.nvvm.fence.mbarrier_init.release.cluster">; + + // Thread fence + def int_nvvm_fence_acquire_sync_restrict_space_cluster_scope_cluster : + Intrinsic<[], [], [], + "llvm.nvvm.fence.acquire.sync_restrict.space.cluster.scope.cluster">; + + def int_nvvm_fence_release_sync_restrict_space_cta_scope_cluster : + Intrinsic<[], [], [], + "llvm.nvvm.fence.release.sync_restrict.space.cta.scope.cluster">; + +// +// Proxy fence (uni-directional) +// + + def int_nvvm_fence_proxy_async_generic_acquire_sync_restrict_space_cluster_scope_cluster : + Intrinsic<[], [], [], + "llvm.nvvm.fence.proxy.async_generic.acquire.sync_restrict.space.cluster.scope.cluster">; + + def int_nvvm_fence_proxy_async_generic_release_sync_restrict_space_cta_scope_cluster : + Intrinsic<[], [], [], + "llvm.nvvm.fence.proxy.async_generic.release.sync_restrict.space.cta.scope.cluster">; + foreach scope = ["cta", "cluster", "gpu", "sys"] in { def int_nvvm_fence_proxy_tensormap_generic_release_ # scope : - Intrinsic<[], [], [IntrNoCallback], + 
Intrinsic<[], [], [], "llvm.nvvm.fence.proxy.tensormap_generic.release." # scope>; // The imm-arg 'size' can only be 128. def int_nvvm_fence_proxy_tensormap_generic_acquire_ # scope : - Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], - [IntrNoCallback, IntrArgMemOnly, ImmArg>, - Range, 128, 129>], - "llvm.nvvm.fence.proxy.tensormap_generic.acquire." # scope>; + Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], [], + "llvm.nvvm.fence.proxy.tensormap_generic.acquire." # scope> { + let IntrProperties = [IntrNoCallback, IntrArgMemOnly, + ImmArg>, Range, 128, 129>]; + } } +// +// Proxy fence (bi-directional) +// + foreach proxykind = ["alias", "async", "async.global", "async.shared_cta", + "async.shared_cluster"] in { + defvar Intr = IntrinsicName<"llvm.nvvm.fence.proxy." # proxykind>; + def Intr.record_name: Intrinsic<[], [], [], Intr.intr_name>; + } +} + // // Async Copy // diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index bcdb46eca9744..cd7bc37942ca4 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -364,7 +364,42 @@ def INT_FENCE_SC_CLUSTER: NullaryInst<"fence.sc.cluster", int_nvvm_fence_sc_cluster>, Requires<[hasPTX<78>, hasSM<90>]>; +def INT_FENCE_MBARRIER_INIT_RELEASE_CLUSTER: + NullaryInst<"fence.mbarrier_init.release.cluster", + int_nvvm_fence_mbarrier_init_release_cluster>, + Requires<[hasPTX<80>, hasSM<90>]>; + +let Predicates = [hasPTX<86>, hasSM<90>] in { +def INT_FENCE_ACQUIRE_SYNC_RESTRICT_CLUSTER_CLUSTER: + NullaryInst<"fence.acquire.sync_restrict::shared::cluster.cluster", + int_nvvm_fence_acquire_sync_restrict_space_cluster_scope_cluster>; + +def INT_FENCE_RELEASE_SYNC_RESTRICT_CTA_CLUSTER: + NullaryInst<"fence.release.sync_restrict::shared::cta.cluster", + int_nvvm_fence_release_sync_restrict_space_cta_scope_cluster>; +} + // Proxy fence (uni-directional) +let Predicates = [hasPTX<86>, hasSM<90>] in { +def 
INT_NVVM_FENCE_PROXY_ASYNC_GENERIC_ACQUIRE_SYNC_RESTRICT_SPACE_CLUSTER_SCOPE_CLUSTER: + NullaryInst<"fence.proxy.async::generic.acquire.sync_restrict::shared::cluster.cluster", + int_nvvm_fence_proxy_async_generic_acquire_sync_restrict_space_cluster_scope_cluster>; + +def INT_NVVM_FENCE_PROXY_ASYNC_GENERIC_RELEASE_SYNC_RESTRICT_SPACE_CTA_SCOPE_CLUSTER: + NullaryInst<"fence.proxy.async::generic.release.sync_restrict::shared::cta.cluster", + int_nvvm_fence_proxy_async_generic_release_sync_restrict_space_cta_scope_cluster>; +} + +// Proxy fence (bi-directional) +foreach proxykind = ["alias", "async", "async.global", "async.shared_cta", + "async.shared_cluster"] in { + defvar Preds = !if(!eq(proxykind, "alias"), [hasPTX<75>, hasSM<70>], + [hasPTX<80>, hasSM<90>]); + defvar Intr = IntrinsicName<"llvm.nvvm.fence.proxy." # proxykind>; + def : NullaryInst<"fence.proxy." # !subst("_", "::", proxykind), + !cast(Intr.record_name)>, Requires; +} + class FENCE_PROXY_TENSORMAP_GENERIC_RELEASE : NullaryInst<"fence.proxy.tensormap::generic.release." 
# Scope, Intr>, Requires<[hasPTX<83>, hasSM<90>]>; diff --git a/llvm/test/CodeGen/NVPTX/fence-proxy-sm90-ptx86.ll b/llvm/test/CodeGen/NVPTX/fence-proxy-sm90-ptx86.ll new file mode 100644 index 0000000000000..d46408e31752f --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/fence-proxy-sm90-ptx86.ll @@ -0,0 +1,27 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-sm_90 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 | %ptxas-verify -arch=sm_90 %} + +define void @test_nvvm_fence_proxy_async_generic_acquire_sync_restrict() { +; CHECK-LABEL: test_nvvm_fence_proxy_async_generic_acquire_sync_restrict( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.proxy.async::generic.acquire.sync_restrict::shared::cluster.cluster; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.proxy.async_generic.acquire.sync_restrict.space.cluster.scope.cluster() + ret void +} + +define void @test_nvvm_fence_proxy_async_generic_release_sync_restrict() { +; CHECK-LABEL: test_nvvm_fence_proxy_async_generic_release_sync_restrict( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.proxy.async::generic.release.sync_restrict::shared::cta.cluster; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.proxy.async_generic.release.sync_restrict.space.cta.scope.cluster() + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/fence-proxy-sm90.ll b/llvm/test/CodeGen/NVPTX/fence-proxy-sm90.ll new file mode 100644 index 0000000000000..896c624602a60 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/fence-proxy-sm90.ll @@ -0,0 +1,51 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-sm_90 && 
ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify -arch=sm_90 %} + +define void @test_nvvm_fence_proxy_async() { +; CHECK-LABEL: test_nvvm_fence_proxy_async( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.proxy.async; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.proxy.async() + ret void +} + +define void @test_nvvm_fence_proxy_async_global() { +; CHECK-LABEL: test_nvvm_fence_proxy_async_global( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.proxy.async.global; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.proxy.async.global() + ret void +} + +define void @test_nvvm_fence_proxy_async_shared_cluster() { +; CHECK-LABEL: test_nvvm_fence_proxy_async_shared_cluster( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.proxy.async.shared::cluster; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.proxy.async.shared_cluster() + ret void +} + +define void @test_nvvm_fence_proxy_async_shared_cta() { +; CHECK-LABEL: test_nvvm_fence_proxy_async_shared_cta( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.proxy.async.shared::cta; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.proxy.async.shared_cta() + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/fence-proxy-tensormap-invalid.ll b/llvm/test/CodeGen/NVPTX/fence-proxy-tensormap-invalid.ll new file mode 100644 index 0000000000000..ab35e4fb396d6 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/fence-proxy-tensormap-invalid.ll @@ -0,0 +1,8 @@ +; RUN: not llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx83 -o /dev/null 2>&1 | FileCheck %s + +define void @test_fence_proxy_tensormap_generic_acquire(ptr addrspace(0) %addr) { + ; CHECK: immarg value 130 out of range [128, 129) + call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cta(ptr addrspace(0) %addr, i32 130); + + ret void +} diff --git 
a/llvm/test/CodeGen/NVPTX/fence-proxy.ll b/llvm/test/CodeGen/NVPTX/fence-proxy.ll new file mode 100644 index 0000000000000..cb5679e68944d --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/fence-proxy.ll @@ -0,0 +1,15 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx75 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-sm_70 && ptxas-isa-7.5 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx75 | %ptxas-verify -arch=sm_70 %} + +define void @test_nvvm_fence_proxy_alias() { +; CHECK-LABEL: test_nvvm_fence_proxy_alias( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.proxy.alias; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.proxy.alias() + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/op-fence.ll b/llvm/test/CodeGen/NVPTX/op-fence.ll new file mode 100644 index 0000000000000..629b702742afb --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/op-fence.ll @@ -0,0 +1,17 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify -arch=sm_90 %} + +; CHECK-LABEL: test_fence_mbarrier_init +define void @test_fence_mbarrier_init() { +; CHECK-LABEL: test_fence_mbarrier_init( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.mbarrier_init.release.cluster; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.mbarrier_init.release.cluster(); + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/thread-fence.ll b/llvm/test/CodeGen/NVPTX/thread-fence.ll new file mode 100644 index 0000000000000..185461bd183d0 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/thread-fence.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-sm_90 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 | %ptxas-verify -arch=sm_90 %} + +; CHECK-LABEL: test_fence_acquire +define void @test_fence_acquire() { +; CHECK-LABEL: test_fence_acquire( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.acquire.sync_restrict::shared::cluster.cluster; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.acquire.sync_restrict.space.cluster.scope.cluster(); + + ret void +} + +; CHECK-LABEL: test_fence_release +define void @test_fence_release() { +; CHECK-LABEL: test_fence_release( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.release.sync_restrict::shared::cta.cluster; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.release.sync_restrict.space.cta.scope.cluster(); + + ret void +} From bfb953926c3d5021e3ac6ddbf22fd98f002da208 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 18 Nov 2025 20:22:07 -0500 Subject: [PATCH 10/15] TableGen: Support target specialized pseudoinstructions (#159880) Allow a target to steal the definition of a generic pseudoinstruction and remap the operands. This works by defining a new instruction, which will simply swap out the emitted entry in the InstrInfo table. This is intended to eliminate the C++ half of the implementation of PointerLikeRegClass. With RegClassByHwMode, the remaining usecase for PointerLikeRegClass are the common codegen pseudoinstructions. Every target maintains its own copy of the generic pseudo operand definitions anyway, so we can stub out the register operands with an appropriate class instead of waiting for runtime resolution. In the future we could probably take this a bit further. 
For example, there is a similar problem for ADJCALLSTACKUP/DOWN since they depend on target register definitions for the stack pointer register. --- llvm/include/llvm/Target/Target.td | 93 ++++++++++++++++ .../TableGen/target-specialized-pseudos.td | 101 ++++++++++++++++++ llvm/utils/TableGen/Common/CodeGenTarget.cpp | 12 ++- llvm/utils/TableGen/InstrInfoEmitter.cpp | 37 +++++++ 4 files changed, 242 insertions(+), 1 deletion(-) create mode 100644 llvm/test/TableGen/target-specialized-pseudos.td diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td index db99885121ec1..6abde996e6dc8 100644 --- a/llvm/include/llvm/Target/Target.td +++ b/llvm/include/llvm/Target/Target.td @@ -1579,6 +1579,99 @@ def CONVERGENCECTRL_GLUE : StandardPseudoInstruction { } } +/// Allow a target to replace the instruction definition of a +/// StandardPseudoInstruction. A target should only define one +/// instance of this per instruction. +/// +/// This is intended to allow targets to specify the register class +/// used for pointers. It should not be used to change the fundamental +/// operand structure (e.g., this should not add or remove operands, +/// or change the operand types). 
+class TargetSpecializedStandardPseudoInstruction< + StandardPseudoInstruction base_inst> : Instruction { + + StandardPseudoInstruction Instruction = base_inst; + let OutOperandList = base_inst.OutOperandList; + let InOperandList = base_inst.InOperandList; + + // TODO: Copy everything + let usesCustomInserter = base_inst.usesCustomInserter; + let hasSideEffects = base_inst.hasSideEffects; + let mayLoad = base_inst.mayLoad; + let mayStore = base_inst.mayStore; + let isTerminator = base_inst.isTerminator; + let isBranch = base_inst.isBranch; + let isIndirectBranch = base_inst.isIndirectBranch; + let isEHScopeReturn = base_inst.isEHScopeReturn; + let isReturn = base_inst.isReturn; + let isCall = base_inst.isCall; + let hasCtrlDep = base_inst.hasCtrlDep; + let isReMaterializable = base_inst.isReMaterializable; + let isMeta = base_inst.isMeta; + let Size = base_inst.Size; + let isAsCheapAsAMove = base_inst.isAsCheapAsAMove; + let isPseudo = true; + let hasNoSchedulingInfo = true; + let isNotDuplicable = base_inst.isNotDuplicable; + let isConvergent = base_inst.isConvergent; + let hasExtraSrcRegAllocReq = base_inst.hasExtraSrcRegAllocReq; + let hasExtraDefRegAllocReq = base_inst.hasExtraDefRegAllocReq; +} + +// All pseudo instructions which need a pointer register class, which +// should be specialized by a target. +defvar PseudosWithPtrOps = [ + LOAD_STACK_GUARD, + PREALLOCATED_ARG, + PATCHABLE_EVENT_CALL, + PATCHABLE_TYPED_EVENT_CALL +]; + + +/// Replace PointerLikeRegClass operands in OperandList with new_rc. +class RemapPointerOperandList { + // Collect the set of names so we can query and rewrite them. + list op_names = !foreach(i, !range(!size(OperandList)), + !getdagname(OperandList, i)); + + // Beautiful language. This would be a lot easier if !getdagarg + // didn't require a specific type. 
We can't just collect a list of + // the operand values and reconstruct the dag, since there isn't a + // common base class for all the field kinds used in + // pseudoinstruction definitions; therefore everything must be + // maintained as a dag, so use a foldl. Additionally, ? doesn't + // evaluate as false so we get even more noise. + dag ret = + !foldl(OperandList, op_names, acc, name, + !cond( + !initialized(!getdagarg(OperandList, name)) + : !setdagarg(acc, name, new_rc), + !initialized(!getdagarg(OperandList, name)) : acc, + !initialized(!getdagarg(OperandList, name)) : acc + ) + ); +} + +/// Define an override for a pseudoinstruction which uses a pointer +/// register class, specialized to the target's pointer type. +class RemapPointerOperands : + TargetSpecializedStandardPseudoInstruction { + let OutOperandList = + RemapPointerOperandList.ret; + let InOperandList = + RemapPointerOperandList.ret; +} + +/// Helper to replace all pseudoinstructions using pointers to a +/// target register class. Most targets should use this. +multiclass RemapAllTargetPseudoPointerOperands< + RegisterClassLike default_ptr_rc> { + foreach inst = PseudosWithPtrOps in { + def : RemapPointerOperands; + } +} + // Generic opcodes used in GlobalISel. 
include "llvm/Target/GenericOpcodes.td" diff --git a/llvm/test/TableGen/target-specialized-pseudos.td b/llvm/test/TableGen/target-specialized-pseudos.td new file mode 100644 index 0000000000000..99c63f3ec29d9 --- /dev/null +++ b/llvm/test/TableGen/target-specialized-pseudos.td @@ -0,0 +1,101 @@ +// RUN: llvm-tblgen -gen-instr-info -I %p/../../include %s -DONECASE -o - | FileCheck -check-prefixes=CHECK,ONECASE %s +// RUN: llvm-tblgen -gen-instr-info -I %p/../../include %s -DALLCASES -o - | FileCheck -check-prefixes=CHECK,ALLCASES %s +// RUN: not llvm-tblgen -gen-instr-info -I %p/../../include %s -DERROR -o /dev/null 2>&1 | FileCheck -check-prefix=ERROR %s + +// CHECK: namespace llvm::MyTarget { +// CHECK: enum { +// CHECK: LOAD_STACK_GUARD = [[LOAD_STACK_GUARD_OPCODE:[0-9]+]], +// CHECK: PREALLOCATED_ARG = [[PREALLOCATED_ARG_OPCODE:[0-9]+]], +// CHECK: PATCHABLE_EVENT_CALL = [[PATCHABLE_EVENT_CALL_OPCODE:[0-9]+]], +// CHECK: PATCHABLE_TYPED_EVENT_CALL = [[PATCHABLE_TYPED_EVENT_CALL_OPCODE:[0-9]+]], + +// Make sure no enum entry is emitted for MY_LOAD_STACK_GUARD +// CHECK: G_UBFX = [[G_UBFX_OPCODE:[0-9]+]], +// CHECK-NEXT: MY_MOV = [[MY_MOV_OPCODE:[0-9]+]], +// CHECK-NEXT: INSTRUCTION_LIST_END = [[INSTR_LIST_END_OPCODE:[0-9]+]] + + +// CHECK: extern const MyTargetInstrTable MyTargetDescs = { +// CHECK-NEXT: { +// CHECK-NEXT: { [[MY_MOV_OPCODE]], 2, 1, 2, 0, 0, 0, {{[0-9]+}}, MyTargetImpOpBase + 0, 0|(1ULL< + : Register { + let Namespace = "MyTarget"; +} + +class MyClass types, dag registers> + : RegisterClass<"MyTarget", types, size, registers> { + let Size = size; +} + +def X0 : MyReg<"x0">; +def X1 : MyReg<"x1">; +def XRegs : RegisterClass<"MyTarget", [i64], 64, (add X0, X1)>; + + +class TestInstruction : Instruction { + let Size = 2; + let Namespace = "MyTarget"; + let hasSideEffects = false; +} + +#ifdef ONECASE + +// Example setting the pointer register class manually +def MY_LOAD_STACK_GUARD : + TargetSpecializedStandardPseudoInstruction { + let Namespace = 
"MyTarget"; + let OutOperandList = (outs XRegs:$dst); +} + +#endif + +#ifdef ALLCASES + +defm my_remaps : RemapAllTargetPseudoPointerOperands; + +#endif + + +#ifdef ERROR + +def MY_LOAD_STACK_GUARD_0 : TargetSpecializedStandardPseudoInstruction; + +// ERROR: :[[@LINE+1]]:5: error: multiple overrides of 'LOAD_STACK_GUARD' defined +def MY_LOAD_STACK_GUARD_1 : TargetSpecializedStandardPseudoInstruction; + +#endif + +def MY_MOV : TestInstruction { + let OutOperandList = (outs XRegs:$dst); + let InOperandList = (ins XRegs:$src); + let AsmString = "my_mov $dst, $src"; +} + + +def MyTargetISA : InstrInfo; +def MyTarget : Target { let InstructionSet = MyTargetISA; } diff --git a/llvm/utils/TableGen/Common/CodeGenTarget.cpp b/llvm/utils/TableGen/Common/CodeGenTarget.cpp index c0daac127f71a..9f56e38fde4b5 100644 --- a/llvm/utils/TableGen/Common/CodeGenTarget.cpp +++ b/llvm/utils/TableGen/Common/CodeGenTarget.cpp @@ -283,15 +283,25 @@ void CodeGenTarget::ComputeInstrsByEnum() const { assert(EndOfPredefines == getNumFixedInstructions() && "Missing generic opcode"); + unsigned SkippedInsts = 0; + for (const auto &[_, CGIUp] : InstMap) { const CodeGenInstruction *CGI = CGIUp.get(); if (CGI->Namespace != "TargetOpcode") { + + if (CGI->TheDef->isSubClassOf( + "TargetSpecializedStandardPseudoInstruction")) { + ++SkippedInsts; + continue; + } + InstrsByEnum.push_back(CGI); NumPseudoInstructions += CGI->TheDef->getValueAsBit("isPseudo"); } } - assert(InstrsByEnum.size() == InstMap.size() && "Missing predefined instr"); + assert(InstrsByEnum.size() + SkippedInsts == InstMap.size() && + "Missing predefined instr"); // All of the instructions are now in random order based on the map iteration. 
llvm::sort( diff --git a/llvm/utils/TableGen/InstrInfoEmitter.cpp b/llvm/utils/TableGen/InstrInfoEmitter.cpp index 32994c12aa98b..d46c9d811753a 100644 --- a/llvm/utils/TableGen/InstrInfoEmitter.cpp +++ b/llvm/utils/TableGen/InstrInfoEmitter.cpp @@ -72,6 +72,13 @@ class InstrInfoEmitter { using OperandInfoListTy = std::vector; using OperandInfoMapTy = std::map; + DenseMap + TargetSpecializedPseudoInsts; + + /// Compute mapping of opcodes which should have their definitions overridden + /// by a target version. + void buildTargetSpecializedPseudoInstsMap(); + /// Generate member functions in the target-specific GenInstrInfo class. /// /// This method is used to custom expand TIIPredicate definitions. @@ -216,6 +223,10 @@ InstrInfoEmitter::CollectOperandInfo(OperandInfoListTy &OperandInfoList, const CodeGenTarget &Target = CDP.getTargetInfo(); unsigned Offset = 0; for (const CodeGenInstruction *Inst : Target.getInstructions()) { + auto OverrideEntry = TargetSpecializedPseudoInsts.find(Inst); + if (OverrideEntry != TargetSpecializedPseudoInsts.end()) + Inst = OverrideEntry->second; + OperandInfoTy OperandInfo = GetOperandInfo(*Inst); if (OperandInfoMap.try_emplace(OperandInfo, Offset).second) { OperandInfoList.push_back(OperandInfo); @@ -859,6 +870,25 @@ void InstrInfoEmitter::emitTIIHelperMethods(raw_ostream &OS, } } +void InstrInfoEmitter::buildTargetSpecializedPseudoInstsMap() { + ArrayRef SpecializedInsts = Records.getAllDerivedDefinitions( + "TargetSpecializedStandardPseudoInstruction"); + const CodeGenTarget &Target = CDP.getTargetInfo(); + + for (const Record *SpecializedRec : SpecializedInsts) { + const CodeGenInstruction &SpecializedInst = + Target.getInstruction(SpecializedRec); + const Record *BaseInstRec = SpecializedRec->getValueAsDef("Instruction"); + + const CodeGenInstruction &BaseInst = Target.getInstruction(BaseInstRec); + + if (!TargetSpecializedPseudoInsts.insert({&BaseInst, &SpecializedInst}) + .second) + PrintFatalError(SpecializedRec, "multiple 
overrides of '" + + BaseInst.getName() + "' defined"); + } +} + //===----------------------------------------------------------------------===// // Main Output. //===----------------------------------------------------------------------===// @@ -881,6 +911,8 @@ void InstrInfoEmitter::run(raw_ostream &OS) { // Collect all of the operand info records. Timer.startTimer("Collect operand info"); + buildTargetSpecializedPseudoInstsMap(); + OperandInfoListTy OperandInfoList; OperandInfoMapTy OperandInfoMap; unsigned OperandInfoSize = @@ -963,6 +995,11 @@ void InstrInfoEmitter::run(raw_ostream &OS) { for (const CodeGenInstruction *Inst : reverse(NumberedInstructions)) { // Keep a list of the instruction names. InstrNames.add(Inst->getName()); + + auto OverrideEntry = TargetSpecializedPseudoInsts.find(Inst); + if (OverrideEntry != TargetSpecializedPseudoInsts.end()) + Inst = OverrideEntry->second; + // Emit the record into the table. emitRecord(*Inst, --Num, InstrInfo, EmittedLists, OperandInfoMap, OS); } From 961940e1a7c9b4bbe0ae54c2ea4bdc69308947d6 Mon Sep 17 00:00:00 2001 From: Shih-Po Hung Date: Wed, 19 Nov 2025 09:51:12 +0800 Subject: [PATCH 11/15] [TTI] Use MemIntrinsicCostAttributes for getMaskedMemoryOpCost (#168029) - Split from #165532. This is a step toward a unified interface for masked/gather-scatter/strided/expand-compress cost modeling. - Replace the ad-hoc parameter list with a single attributes object. API change: ``` - InstructionCost getMaskedMemoryOpCost(Opcode, Src, Alignment, - AddressSpace, CostKind); + InstructionCost getMaskedMemoryOpCost(MemIntrinsicCostAttributes, + CostKind); ``` Notes: - NFCI intended: callers populate MemIntrinsicCostAttributes with the same information as before. - Follow-up: migrate gather/scatter, strided, and expand/compress cost queries to the same attributes-based entry point. 
--- .../llvm/Analysis/TargetTransformInfo.h | 28 ++++++++++++++++++- .../llvm/Analysis/TargetTransformInfoImpl.h | 3 +- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 24 +++++++++------- llvm/lib/Analysis/TargetTransformInfo.cpp | 5 ++-- .../AArch64/AArch64TargetTransformInfo.cpp | 8 +++--- .../AArch64/AArch64TargetTransformInfo.h | 3 +- .../lib/Target/ARM/ARMTargetTransformInfo.cpp | 14 ++++++---- llvm/lib/Target/ARM/ARMTargetTransformInfo.h | 3 +- .../Hexagon/HexagonTargetTransformInfo.cpp | 6 ++-- .../Hexagon/HexagonTargetTransformInfo.h | 3 +- .../Target/RISCV/RISCVTargetTransformInfo.cpp | 12 +++++--- .../Target/RISCV/RISCVTargetTransformInfo.h | 3 +- .../lib/Target/X86/X86TargetTransformInfo.cpp | 19 +++++++++---- llvm/lib/Target/X86/X86TargetTransformInfo.h | 3 +- .../Transforms/Vectorize/LoopVectorize.cpp | 6 ++-- .../Transforms/Vectorize/SLPVectorizer.cpp | 17 ++++++----- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 12 ++++++-- 17 files changed, 107 insertions(+), 62 deletions(-) diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 0f17312b03827..a65e4667ab76c 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -123,6 +123,32 @@ struct HardwareLoopInfo { LLVM_ABI bool canAnalyze(LoopInfo &LI); }; +/// Information for memory intrinsic cost model. +class MemIntrinsicCostAttributes { + /// Vector type of the data to be loaded or stored. + Type *DataTy = nullptr; + + /// ID of the memory intrinsic. + Intrinsic::ID IID; + + /// Address space of the pointer. + unsigned AddressSpace = 0; + + /// Alignment of single element. 
+ Align Alignment; + +public: + LLVM_ABI MemIntrinsicCostAttributes(Intrinsic::ID Id, Type *DataTy, + Align Alignment, unsigned AddressSpace) + : DataTy(DataTy), IID(Id), AddressSpace(AddressSpace), + Alignment(Alignment) {} + + Intrinsic::ID getID() const { return IID; } + Type *getDataType() const { return DataTy; } + unsigned getAddressSpace() const { return AddressSpace; } + Align getAlignment() const { return Alignment; } +}; + class IntrinsicCostAttributes { const IntrinsicInst *II = nullptr; Type *RetTy = nullptr; @@ -1556,7 +1582,7 @@ class TargetTransformInfo { /// \return The cost of masked Load and Store instructions. LLVM_ABI InstructionCost getMaskedMemoryOpCost( - unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, + const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const; /// \return The cost of Gather or Scatter operation diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index aacb88d2f9684..d8e35748f53e5 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -842,8 +842,7 @@ class TargetTransformInfoImplBase { } virtual InstructionCost - getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, + getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const { return 1; } diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 944e1714e8f98..cb389ae74ef46 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1558,9 +1558,13 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { } InstructionCost - getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, - unsigned AddressSpace, + getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, 
TTI::TargetCostKind CostKind) const override { + Type *DataTy = MICA.getDataType(); + Align Alignment = MICA.getAlignment(); + unsigned Opcode = MICA.getID() == Intrinsic::masked_load + ? Instruction::Load + : Instruction::Store; // TODO: Pass on AddressSpace when we have test coverage. return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false, CostKind); @@ -1617,10 +1621,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { // Firstly, the cost of load/store operation. InstructionCost Cost; - if (UseMaskForCond || UseMaskForGaps) - Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment, - AddressSpace, CostKind); - else + if (UseMaskForCond || UseMaskForGaps) { + unsigned IID = Opcode == Instruction::Load ? Intrinsic::masked_load + : Intrinsic::masked_store; + Cost = thisT()->getMaskedMemoryOpCost( + {IID, VecTy, Alignment, AddressSpace}, CostKind); + } else Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind); @@ -2403,14 +2409,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { case Intrinsic::masked_store: { Type *Ty = Tys[0]; Align TyAlign = thisT()->DL.getABITypeAlign(Ty); - return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0, - CostKind); + return thisT()->getMaskedMemoryOpCost({IID, Ty, TyAlign, 0}, CostKind); } case Intrinsic::masked_load: { Type *Ty = RetTy; Align TyAlign = thisT()->DL.getABITypeAlign(Ty); - return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0, - CostKind); + return thisT()->getMaskedMemoryOpCost({IID, Ty, TyAlign, 0}, CostKind); } case Intrinsic::experimental_vp_strided_store: { auto *Ty = cast(ICA.getArgTypes()[0]); diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 0426ac7e62fab..45369f0ffe137 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1183,10 +1183,9 @@ InstructionCost 
TargetTransformInfo::getMemoryOpCost( } InstructionCost TargetTransformInfo::getMaskedMemoryOpCost( - unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, + const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const { - InstructionCost Cost = TTIImpl->getMaskedMemoryOpCost(Opcode, Src, Alignment, - AddressSpace, CostKind); + InstructionCost Cost = TTIImpl->getMaskedMemoryOpCost(MICA, CostKind); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index bf195ca210e9b..0bae00bafee3c 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -4730,12 +4730,12 @@ bool AArch64TTIImpl::prefersVectorizedAddressing() const { } InstructionCost -AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, - Align Alignment, unsigned AddressSpace, +AArch64TTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const { + Type *Src = MICA.getDataType(); + if (useNeonVector(Src)) - return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, - CostKind); + return BaseT::getMaskedMemoryOpCost(MICA, CostKind); auto LT = getTypeLegalizationCost(Src); if (!LT.first.isValid()) return InstructionCost::getInvalid(); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index d189f563f99a1..6cc4987428567 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -188,8 +188,7 @@ class AArch64TTIImpl final : public BasicTTIImplBase { unsigned Opcode2) const; InstructionCost - getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, + getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, 
TTI::TargetCostKind CostKind) const override; InstructionCost diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 24f58a68c345d..d12b802fe234f 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -1631,20 +1631,22 @@ InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, } InstructionCost -ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, +ARMTTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const { + unsigned IID = MICA.getID(); + Type *Src = MICA.getDataType(); + Align Alignment = MICA.getAlignment(); + unsigned AddressSpace = MICA.getAddressSpace(); if (ST->hasMVEIntegerOps()) { - if (Opcode == Instruction::Load && + if (IID == Intrinsic::masked_load && isLegalMaskedLoad(Src, Alignment, AddressSpace)) return ST->getMVEVectorCostFactor(CostKind); - if (Opcode == Instruction::Store && + if (IID == Intrinsic::masked_store && isLegalMaskedStore(Src, Alignment, AddressSpace)) return ST->getMVEVectorCostFactor(CostKind); } if (!isa(Src)) - return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, - CostKind); + return BaseT::getMaskedMemoryOpCost(MICA, CostKind); // Scalar cost, which is currently very high due to the efficiency of the // generated code. 
return cast(Src)->getNumElements() * 8; diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index 0810c5532ed91..919a6fc9fd0b0 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -275,8 +275,7 @@ class ARMTTIImpl final : public BasicTTIImplBase { const Instruction *I = nullptr) const override; InstructionCost - getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, + getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override; InstructionCost getInterleavedMemoryOpCost( diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp index e925e041eb64e..8f3f0cc8abb01 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -224,11 +224,9 @@ InstructionCost HexagonTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, } InstructionCost -HexagonTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, - Align Alignment, unsigned AddressSpace, +HexagonTTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const { - return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, - CostKind); + return BaseT::getMaskedMemoryOpCost(MICA, CostKind); } InstructionCost diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h index cec2bf9656ffc..e95b5a10b76a7 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -120,8 +120,7 @@ class HexagonTTIImpl final : public BasicTTIImplBase { TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I = nullptr) const override; InstructionCost - getMaskedMemoryOpCost(unsigned Opcode, Type 
*Src, Align Alignment, - unsigned AddressSpace, + getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override; InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index dca6e9cffebb0..1a1a93a9cb178 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1008,13 +1008,17 @@ InstructionCost RISCVTTIImpl::getScalarizationOverhead( } InstructionCost -RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, +RISCVTTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const { + unsigned Opcode = MICA.getID() == Intrinsic::masked_load ? Instruction::Load + : Instruction::Store; + Type *Src = MICA.getDataType(); + Align Alignment = MICA.getAlignment(); + unsigned AddressSpace = MICA.getAddressSpace(); + if (!isLegalMaskedLoadStore(Src, Alignment) || CostKind != TTI::TCK_RecipThroughput) - return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, - CostKind); + return BaseT::getMaskedMemoryOpCost(MICA, CostKind); return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind); } diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 6886e8964e29e..39c1173e2986c 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -144,8 +144,7 @@ class RISCVTTIImpl final : public BasicTTIImplBase { bool shouldConsiderVectorizationRegPressure() const override { return true; } InstructionCost - getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, + getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) 
const override; InstructionCost diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 0b1430e373fc7..4b77bf925b2ba 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -5411,9 +5411,14 @@ InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, } InstructionCost -X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment, - unsigned AddressSpace, +X86TTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const { + unsigned Opcode = MICA.getID() == Intrinsic::masked_load ? Instruction::Load + : Instruction::Store; + Type *SrcTy = MICA.getDataType(); + Align Alignment = MICA.getAlignment(); + unsigned AddressSpace = MICA.getAddressSpace(); + bool IsLoad = (Instruction::Load == Opcode); bool IsStore = (Instruction::Store == Opcode); @@ -6647,10 +6652,12 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512( LegalVT.getVectorNumElements()); InstructionCost MemOpCost; bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps; - if (UseMaskedMemOp) - MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment, - AddressSpace, CostKind); - else + if (UseMaskedMemOp) { + unsigned IID = Opcode == Instruction::Load ? 
Intrinsic::masked_load + : Intrinsic::masked_store; + MemOpCost = getMaskedMemoryOpCost( + {IID, SingleMemOpTy, Alignment, AddressSpace}, CostKind); + } else MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace, CostKind); diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index de5e1c297b1e4..df1393ce16ca1 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -183,8 +183,7 @@ class X86TTIImpl final : public BasicTTIImplBase { TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I = nullptr) const override; InstructionCost - getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, + getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override; InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index c680b6fca84cd..aa52f9e2a53ca 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5251,8 +5251,10 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, const Align Alignment = getLoadStoreAlignment(I); InstructionCost Cost = 0; if (Legal->isMaskRequired(I)) { - Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, - CostKind); + unsigned IID = I->getOpcode() == Instruction::Load + ? 
Intrinsic::masked_load + : Intrinsic::masked_store; + Cost += TTI.getMaskedMemoryOpCost({IID, VectorTy, Alignment, AS}, CostKind); } else { TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0)); Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index deb8ee2d88055..e33ff724ccdd5 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -6904,9 +6904,10 @@ static bool isMaskedLoadCompress( ScalarLoadsCost; InstructionCost LoadCost = 0; if (IsMasked) { - LoadCost = - TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment, - LI->getPointerAddressSpace(), CostKind); + LoadCost = TTI.getMaskedMemoryOpCost({Intrinsic::masked_load, LoadVecTy, + CommonAlignment, + LI->getPointerAddressSpace()}, + CostKind); } else { LoadCost = TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment, @@ -7305,8 +7306,9 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( break; case LoadsState::CompressVectorize: VecLdCost += TTI.getMaskedMemoryOpCost( - Instruction::Load, SubVecTy, CommonAlignment, - LI0->getPointerAddressSpace(), CostKind) + + {Intrinsic::masked_load, SubVecTy, CommonAlignment, + LI0->getPointerAddressSpace()}, + CostKind) + VectorGEPCost + ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, SubVecTy, {}, CostKind); @@ -15102,8 +15104,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, CommonAlignment, LI0->getPointerAddressSpace(), CostKind); } else if (IsMasked) { VecLdCost = TTI->getMaskedMemoryOpCost( - Instruction::Load, LoadVecTy, CommonAlignment, - LI0->getPointerAddressSpace(), CostKind); + {Intrinsic::masked_load, LoadVecTy, CommonAlignment, + LI0->getPointerAddressSpace()}, + CostKind); // TODO: include this cost into CommonCost. 
VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 94657f5d39390..e89e91b959926 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -3592,8 +3592,10 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, InstructionCost Cost = 0; if (IsMasked) { + unsigned IID = isa(this) ? Intrinsic::masked_load + : Intrinsic::masked_store; Cost += - Ctx.TTI.getMaskedMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind); + Ctx.TTI.getMaskedMemoryOpCost({IID, Ty, Alignment, AS}, Ctx.CostKind); } else { TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo( isa(this) ? getOperand(0) @@ -3711,8 +3713,10 @@ InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF, Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF); unsigned AS = cast(Ctx.Types.inferScalarType(getAddr())) ->getAddressSpace(); + // FIXME: getMaskedMemoryOpCost assumes masked_* intrinsics. + // After migrating to getMemIntrinsicInstrCost, switch this to vp_load. InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost( - Instruction::Load, Ty, Alignment, AS, Ctx.CostKind); + {Intrinsic::masked_load, Ty, Alignment, AS}, Ctx.CostKind); if (!Reverse) return Cost; @@ -3820,8 +3824,10 @@ InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF, Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF); unsigned AS = cast(Ctx.Types.inferScalarType(getAddr())) ->getAddressSpace(); + // FIXME: getMaskedMemoryOpCost assumes masked_* intrinsics. + // After migrating to getMemIntrinsicInstrCost, switch this to vp_store. 
InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost( - Instruction::Store, Ty, Alignment, AS, Ctx.CostKind); + {Intrinsic::masked_store, Ty, Alignment, AS}, Ctx.CostKind); if (!Reverse) return Cost; From a3ab11007ba277fb8a126d8199925f4ce184e195 Mon Sep 17 00:00:00 2001 From: Jie Fu Date: Wed, 19 Nov 2025 10:04:32 +0800 Subject: [PATCH 12/15] [TableGen] Silence a warning (NFC) /llvm-project/llvm/utils/TableGen/Common/CodeGenTarget.cpp:286:12: error: variable 'SkippedInsts' set but not used [-Werror,-Wunused-but-set-variable] unsigned SkippedInsts = 0; ^ 1 error generated. --- llvm/utils/TableGen/Common/CodeGenTarget.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/TableGen/Common/CodeGenTarget.cpp b/llvm/utils/TableGen/Common/CodeGenTarget.cpp index 9f56e38fde4b5..e080ca0aa0b31 100644 --- a/llvm/utils/TableGen/Common/CodeGenTarget.cpp +++ b/llvm/utils/TableGen/Common/CodeGenTarget.cpp @@ -283,7 +283,7 @@ void CodeGenTarget::ComputeInstrsByEnum() const { assert(EndOfPredefines == getNumFixedInstructions() && "Missing generic opcode"); - unsigned SkippedInsts = 0; + [[maybe_unused]] unsigned SkippedInsts = 0; for (const auto &[_, CGIUp] : InstMap) { const CodeGenInstruction *CGI = CGIUp.get(); From b4aa3d3ae334fea392f62df9693fab07142443ae Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Tue, 18 Nov 2025 21:37:56 -0500 Subject: [PATCH 13/15] [NFC] Check operand type instead of opcode (#168641) A follow-up of #168458. 
--- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 2 +- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 34 +++++++++-------------- 2 files changed, 14 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index ca98b80787fb4..a87f9f274a4d3 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1423,7 +1423,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, /// \returns true if the target has packed f32 instructions that only read 32 /// bits from a scalar operand (SGPR or literal) and replicates the bits to /// both channels. - bool hasPKF32InstsReplicatingLow32BitsOfScalarInput() const { + bool hasPKF32InstsReplicatingLower32BitsOfScalarInput() const { return getGeneration() == GFX12 && GFX1250Insts; } diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 289bf1a563ffc..2df9267cde1f2 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -766,29 +766,21 @@ static void appendFoldCandidate(SmallVectorImpl &FoldList, FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp)); } -// Returns true if the instruction is a packed f32 instruction that only reads -// 32 bits from a scalar operand (SGPR or literal) and replicates the bits to -// both channels. -static bool -isPKF32InstrReplicatingLow32BitsOfScalarInput(const GCNSubtarget *ST, - MachineInstr *MI) { - if (!ST->hasPKF32InstsReplicatingLow32BitsOfScalarInput()) +// Returns true if the instruction is a packed F32 instruction and the +// corresponding scalar operand reads 32 bits and replicates the bits to both +// channels. 
+static bool isPKF32InstrReplicatesLower32BitsOfScalarOperand( + const GCNSubtarget *ST, MachineInstr *MI, unsigned OpNo) { + if (!ST->hasPKF32InstsReplicatingLower32BitsOfScalarInput()) return false; - switch (MI->getOpcode()) { - case AMDGPU::V_PK_ADD_F32: - case AMDGPU::V_PK_MUL_F32: - case AMDGPU::V_PK_FMA_F32: - return true; - default: - return false; - } - llvm_unreachable("unknown instruction"); + const MCOperandInfo &OpDesc = MI->getDesc().operands()[OpNo]; + return OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32; } // Packed FP32 instructions only read 32 bits from a scalar operand (SGPR or // literal) and replicates the bits to both channels. Therefore, if the hi and // lo are not same, we can't fold it. -static bool checkImmOpForPKF32InstrReplicatingLow32BitsOfScalarInput( +static bool checkImmOpForPKF32InstrReplicatesLower32BitsOfScalarOperand( const FoldableDef &OpToFold) { assert(OpToFold.isImm() && "Expected immediate operand"); uint64_t ImmVal = OpToFold.getEffectiveImmVal().value(); @@ -953,8 +945,8 @@ bool SIFoldOperandsImpl::tryAddToFoldList( // Special case for PK_F32 instructions if we are trying to fold an imm to // src0 or src1. 
if (OpToFold.isImm() && - isPKF32InstrReplicatingLow32BitsOfScalarInput(ST, MI) && - !checkImmOpForPKF32InstrReplicatingLow32BitsOfScalarInput(OpToFold)) + isPKF32InstrReplicatesLower32BitsOfScalarOperand(ST, MI, OpNo) && + !checkImmOpForPKF32InstrReplicatesLower32BitsOfScalarOperand(OpToFold)) return false; appendFoldCandidate(FoldList, MI, OpNo, OpToFold); @@ -1172,8 +1164,8 @@ bool SIFoldOperandsImpl::tryToFoldACImm( return false; if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) { - if (isPKF32InstrReplicatingLow32BitsOfScalarInput(ST, UseMI) && - !checkImmOpForPKF32InstrReplicatingLow32BitsOfScalarInput(OpToFold)) + if (isPKF32InstrReplicatesLower32BitsOfScalarOperand(ST, UseMI, UseOpIdx) && + !checkImmOpForPKF32InstrReplicatesLower32BitsOfScalarOperand(OpToFold)) return false; appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold); return true; From 52a58a4193935f60df70eb45f8ec7c61f142ac3b Mon Sep 17 00:00:00 2001 From: Shoreshen <372660931@qq.com> Date: Wed, 19 Nov 2025 11:06:00 +0800 Subject: [PATCH 14/15] [AMDGPU] Adding instruction specific features (#167809) --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 22 ++-- clang/test/CodeGen/link-builtin-bitcode.c | 8 +- .../test/CodeGenOpenCL/amdgpu-cluster-dims.cl | 4 +- .../CodeGenOpenCL/amdgpu-enqueue-kernel.cl | 8 +- clang/test/CodeGenOpenCL/amdgpu-features.cl | 98 ++++++++-------- ...eadonly-features-written-with-no-target.cl | 6 +- .../CodeGenOpenCL/builtins-amdgcn-fiji.cl | 86 ++++++++++++++ clang/test/CodeGenOpenCL/builtins-amdgcn.cl | 67 ----------- clang/test/OpenMP/amdgcn-attributes.cpp | 4 +- .../test/Lower/OpenMP/target_cpu_features.f90 | 4 +- llvm/lib/Target/AMDGPU/AMDGPU.td | 108 ++++++++++++++++-- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 21 ++++ llvm/lib/Target/AMDGPU/VOP1Instructions.td | 14 +-- llvm/lib/Target/AMDGPU/VOP2Instructions.td | 2 +- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 22 ++-- llvm/lib/TargetParser/TargetParser.cpp | 52 ++++++++- 
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll | 1 - 17 files changed, 356 insertions(+), 171 deletions(-) create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-fiji.cl diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 2b6fcb1fd479b..81e684a04a03d 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -122,13 +122,13 @@ BUILTIN(__builtin_amdgcn_frexp_exp, "id", "nc") BUILTIN(__builtin_amdgcn_frexp_expf, "if", "nc") BUILTIN(__builtin_amdgcn_fract, "dd", "nc") BUILTIN(__builtin_amdgcn_fractf, "ff", "nc") -BUILTIN(__builtin_amdgcn_lerp, "UiUiUiUi", "nc") +TARGET_BUILTIN(__builtin_amdgcn_lerp, "UiUiUiUi", "nc", "lerp-inst") BUILTIN(__builtin_amdgcn_class, "bdi", "nc") BUILTIN(__builtin_amdgcn_classf, "bfi", "nc") -BUILTIN(__builtin_amdgcn_cubeid, "ffff", "nc") -BUILTIN(__builtin_amdgcn_cubesc, "ffff", "nc") -BUILTIN(__builtin_amdgcn_cubetc, "ffff", "nc") -BUILTIN(__builtin_amdgcn_cubema, "ffff", "nc") +TARGET_BUILTIN(__builtin_amdgcn_cubeid, "ffff", "nc", "cube-insts") +TARGET_BUILTIN(__builtin_amdgcn_cubesc, "ffff", "nc", "cube-insts") +TARGET_BUILTIN(__builtin_amdgcn_cubetc, "ffff", "nc", "cube-insts") +TARGET_BUILTIN(__builtin_amdgcn_cubema, "ffff", "nc", "cube-insts") BUILTIN(__builtin_amdgcn_s_sleep, "vIi", "n") BUILTIN(__builtin_amdgcn_s_incperflevel, "vIi", "n") BUILTIN(__builtin_amdgcn_s_decperflevel, "vIi", "n") @@ -149,17 +149,17 @@ BUILTIN(__builtin_amdgcn_alignbyte, "UiUiUiUi", "nc") BUILTIN(__builtin_amdgcn_ubfe, "UiUiUiUi", "nc") BUILTIN(__builtin_amdgcn_sbfe, "UiUiUiUi", "nc") BUILTIN(__builtin_amdgcn_cvt_pkrtz, "E2hff", "nc") -BUILTIN(__builtin_amdgcn_cvt_pknorm_i16, "E2sff", "nc") -BUILTIN(__builtin_amdgcn_cvt_pknorm_u16, "E2Usff", "nc") +TARGET_BUILTIN(__builtin_amdgcn_cvt_pknorm_i16, "E2sff", "nc", "cvt-pknorm-vop2-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_pknorm_u16, "E2Usff", "nc", "cvt-pknorm-vop2-insts") 
BUILTIN(__builtin_amdgcn_cvt_pk_i16, "E2sii", "nc") BUILTIN(__builtin_amdgcn_cvt_pk_u16, "E2UsUiUi", "nc") BUILTIN(__builtin_amdgcn_cvt_pk_u8_f32, "UifUiUi", "nc") BUILTIN(__builtin_amdgcn_cvt_off_f32_i4, "fi", "nc") -BUILTIN(__builtin_amdgcn_sad_u8, "UiUiUiUi", "nc") BUILTIN(__builtin_amdgcn_msad_u8, "UiUiUiUi", "nc") -BUILTIN(__builtin_amdgcn_sad_hi_u8, "UiUiUiUi", "nc") -BUILTIN(__builtin_amdgcn_sad_u16, "UiUiUiUi", "nc") -BUILTIN(__builtin_amdgcn_qsad_pk_u16_u8, "WUiWUiUiWUi", "nc") +TARGET_BUILTIN(__builtin_amdgcn_sad_u8, "UiUiUiUi", "nc", "sad-insts") +TARGET_BUILTIN(__builtin_amdgcn_sad_hi_u8, "UiUiUiUi", "nc", "sad-insts") +TARGET_BUILTIN(__builtin_amdgcn_sad_u16, "UiUiUiUi", "nc", "sad-insts") +TARGET_BUILTIN(__builtin_amdgcn_qsad_pk_u16_u8, "WUiWUiUiWUi", "nc", "qsad-insts") BUILTIN(__builtin_amdgcn_mqsad_pk_u16_u8, "WUiWUiUiWUi", "nc") BUILTIN(__builtin_amdgcn_mqsad_u32_u8, "V4UiWUiUiV4Ui", "nc") diff --git a/clang/test/CodeGen/link-builtin-bitcode.c b/clang/test/CodeGen/link-builtin-bitcode.c index 9a5b6de3c3b38..f6e45bf573705 100644 --- a/clang/test/CodeGen/link-builtin-bitcode.c +++ b/clang/test/CodeGen/link-builtin-bitcode.c @@ -43,7 +43,7 @@ int bar() { return no_attr() + attr_in_target() + attr_not_in_target() + attr_in // CHECK-LABEL: @attr_incompatible // CHECK-SAME: () #[[ATTR_INCOMPATIBLE:[0-9]+]] { -// CHECK: attributes #[[ATTR_BAR]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } -// CHECK: attributes #[[ATTR_COMPATIBLE]] = { {{.*}} "target-cpu"="gfx90a" 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gws,+image-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+vmem-to-lds-load-insts,+wavefrontsize64" } -// CHECK: attributes #[[ATTR_EXTEND]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+extended-image-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gws,+image-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+vmem-to-lds-load-insts,+wavefrontsize64" } -// CHECK: attributes #[[ATTR_INCOMPATIBLE]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx90a-insts,+gws,+image-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+vmem-to-lds-load-insts,+wavefrontsize64,-gfx9-insts" } +// CHECK: attributes #[[ATTR_BAR]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+lerp-inst,+mai-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" } +// CHECK: attributes #[[ATTR_COMPATIBLE]] = { {{.*}} "target-cpu"="gfx90a" 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+vmem-to-lds-load-insts,+wavefrontsize64" } +// CHECK: attributes #[[ATTR_EXTEND]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+extended-image-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+vmem-to-lds-load-insts,+wavefrontsize64" } +// CHECK: attributes #[[ATTR_INCOMPATIBLE]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx90a-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+vmem-to-lds-load-insts,+wavefrontsize64,-gfx9-insts" } diff --git a/clang/test/CodeGenOpenCL/amdgpu-cluster-dims.cl b/clang/test/CodeGenOpenCL/amdgpu-cluster-dims.cl index 14fbeb24a96c2..c5656c49c4761 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-cluster-dims.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-cluster-dims.cl @@ -26,8 +26,8 @@ kernel void foo(global int *p) { *p = 1; } // CHECK-NEXT: ret void // //. 
-// CHECK: attributes #[[ATTR0]] = { convergent norecurse nounwind "amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1250" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" "uniform-work-group-size"="false" } -// CHECK: attributes #[[ATTR1]] = { alwaysinline convergent norecurse nounwind "amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1250" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" } +// CHECK: attributes #[[ATTR0]] = { convergent norecurse 
nounwind "amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1250" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+sad-insts,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" "uniform-work-group-size"="false" } +// CHECK: attributes #[[ATTR1]] = { alwaysinline convergent norecurse nounwind "amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1250" 
"target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+sad-insts,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" } // CHECK: attributes #[[ATTR2]] = { convergent nounwind } //. // CHECK: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} diff --git a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl index e9adac23a6530..2cbc9787a04b0 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl @@ -816,12 +816,12 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU: attributes #[[ATTR10]] = { convergent nounwind } //. 
// GFX900: attributes #[[ATTR0:[0-9]+]] = { "objc_arc_inert" } -// GFX900: attributes #[[ATTR1]] = { convergent norecurse nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" } -// GFX900: attributes #[[ATTR2]] = { convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" "uniform-work-group-size"="false" } -// GFX900: attributes #[[ATTR3]] = { alwaysinline convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" } +// GFX900: attributes #[[ATTR1]] = { convergent norecurse nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64,-sram-ecc" } +// GFX900: attributes #[[ATTR2]] = { convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" 
"target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64,-sram-ecc" "uniform-work-group-size"="false" } +// GFX900: attributes #[[ATTR3]] = { alwaysinline convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64,-sram-ecc" } // GFX900: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } // GFX900: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } -// GFX900: attributes #[[ATTR6]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" } +// GFX900: attributes #[[ATTR6]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64,-sram-ecc" } // GFX900: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind willreturn } // GFX900: attributes #[[ATTR8]] = { convergent nounwind } // GFX900: attributes #[[ATTR9]] = { nounwind } diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl index 9bd096f3fcbc7..bd162b40b8e47 100644 --- 
a/clang/test/CodeGenOpenCL/amdgpu-features.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -63,55 +63,55 @@ // NOCPU-WAVE32: "target-features"="+wavefrontsize32" // NOCPU-WAVE64: "target-features"="+wavefrontsize64" -// GFX600: "target-features"="+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+s-memtime-inst,+wavefrontsize64 -// GFX601: "target-features"="+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+s-memtime-inst,+wavefrontsize64 -// GFX602: "target-features"="+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+s-memtime-inst,+wavefrontsize64 -// GFX700: "target-features"="+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+s-memtime-inst,+wavefrontsize64" -// GFX701: "target-features"="+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+s-memtime-inst,+wavefrontsize64" -// GFX702: "target-features"="+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+s-memtime-inst,+wavefrontsize64" -// GFX703: "target-features"="+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+s-memtime-inst,+wavefrontsize64" -// GFX704: "target-features"="+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+s-memtime-inst,+wavefrontsize64" -// GFX705: "target-features"="+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+s-memtime-inst,+wavefrontsize64" -// GFX801: "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" -// GFX802: "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" -// GFX803: "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" -// GFX805: "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" -// GFX810: "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" -// 
GFX900: "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" -// GFX902: "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" -// GFX904: "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" -// GFX906: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" -// GFX908: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" -// GFX909: "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" -// GFX90A: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" -// GFX90C: "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" -// GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" -// GFX9_4_Generic: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" -// GFX950: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" -// GFX1010: "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" -// GFX1011: "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" -// GFX1012: 
"target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" -// GFX1013: "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" -// GFX1030: "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" -// GFX1031: "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" -// GFX1032: "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" -// GFX1033: "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" -// GFX1034: 
"target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" -// GFX1035: "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" -// GFX1036: "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" -// GFX1100: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1101: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1102: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1103: 
"target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1150: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1151: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1152: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1153: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1201: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1250: "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" -// GFX1251: "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" +// GFX600: 
"target-features"="+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+cube-insts,+cvt-pknorm-vop2-insts,+lerp-inst,+s-memtime-inst,+sad-insts,+wavefrontsize64" +// GFX601: "target-features"="+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+cube-insts,+cvt-pknorm-vop2-insts,+lerp-inst,+s-memtime-inst,+sad-insts,+wavefrontsize64" +// GFX602: "target-features"="+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+cube-insts,+cvt-pknorm-vop2-insts,+lerp-inst,+s-memtime-inst,+sad-insts,+wavefrontsize64" +// GFX700: "target-features"="+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+lerp-inst,+qsad-insts,+s-memtime-inst,+sad-insts,+wavefrontsize64" +// GFX701: "target-features"="+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+lerp-inst,+qsad-insts,+s-memtime-inst,+sad-insts,+wavefrontsize64" +// GFX702: "target-features"="+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+lerp-inst,+qsad-insts,+s-memtime-inst,+sad-insts,+wavefrontsize64" +// GFX703: "target-features"="+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+lerp-inst,+qsad-insts,+s-memtime-inst,+sad-insts,+wavefrontsize64" +// GFX704: "target-features"="+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+lerp-inst,+qsad-insts,+s-memtime-inst,+sad-insts,+wavefrontsize64" +// GFX705: "target-features"="+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+lerp-inst,+qsad-insts,+s-memtime-inst,+sad-insts,+wavefrontsize64" +// GFX801: "target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" +// GFX802: 
"target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" +// GFX803: "target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" +// GFX805: "target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" +// GFX810: "target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" +// GFX900: "target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" +// GFX902: "target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" +// GFX904: "target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" +// GFX906: "target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" +// GFX908: "target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+lerp-inst,+mai-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" +// GFX909: 
"target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" +// GFX90A: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+lerp-inst,+mai-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" +// GFX90C: "target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" +// GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+lerp-inst,+mai-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64,+xf32-insts" +// GFX9_4_Generic: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+lerp-inst,+mai-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" +// GFX950: 
"target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+lerp-inst,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" +// GFX1010: "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize32" +// GFX1011: "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize32" +// GFX1012: "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize32" +// GFX1013: 
"target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize32" +// GFX1030: "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize32" +// GFX1031: "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize32" +// GFX1032: "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize32" +// GFX1033: "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize32" +// GFX1034: 
"target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize32" +// GFX1035: "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize32" +// GFX1036: "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize32" +// GFX1100: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+sad-insts,+wavefrontsize32" +// GFX1101: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+sad-insts,+wavefrontsize32" +// GFX1102: 
"target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+sad-insts,+wavefrontsize32" +// GFX1103: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+sad-insts,+wavefrontsize32" +// GFX1150: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+sad-insts,+wavefrontsize32" +// GFX1151: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+sad-insts,+wavefrontsize32" +// GFX1152: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+sad-insts,+wavefrontsize32" +// GFX1153: 
"target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+sad-insts,+wavefrontsize32" +// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-global-pk-add-bf16-inst,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+sad-insts,+wavefrontsize32" +// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-global-pk-add-bf16-inst,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+sad-insts,+wavefrontsize32" +// GFX1250: 
"target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+sad-insts,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" +// GFX1251: "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+sad-insts,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" -// GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64" +// GFX1103-W64: 
"target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+sad-insts,+wavefrontsize64" kernel void test() {} diff --git a/clang/test/CodeGenOpenCL/amdgpu-readonly-features-written-with-no-target.cl b/clang/test/CodeGenOpenCL/amdgpu-readonly-features-written-with-no-target.cl index 1a0a30ca0b51e..2d50ce7cab2e0 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-readonly-features-written-with-no-target.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-readonly-features-written-with-no-target.cl @@ -11,6 +11,6 @@ __attribute__((target("gws,image-insts,vmem-to-lds-load-insts"))) void test() {} // NOCPU: "target-features"="+gws,+image-insts,+vmem-to-lds-load-insts" -// GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" -// GFX1100: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1200: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32 +// GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+lerp-inst,+mai-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64,+xf32-insts" +// GFX1100: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+sad-insts,+wavefrontsize32" +// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-global-pk-add-bf16-inst,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+sad-insts,+wavefrontsize32" diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-fiji.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-fiji.cl new file mode 100644 index 0000000000000..2178718f90d5a --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-fiji.cl @@ -0,0 +1,86 @@ +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu fiji -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefixes=CHECK %s + + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +typedef unsigned long ulong; +typedef unsigned int uint; +typedef unsigned short ushort; +typedef half __attribute__((ext_vector_type(2))) half2; +typedef short __attribute__((ext_vector_type(2))) short2; +typedef ushort __attribute__((ext_vector_type(2))) ushort2; +typedef uint __attribute__((ext_vector_type(4))) uint4; + +// CHECK-LABEL: @test_lerp +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.lerp +void test_lerp(global int* out, int a, int b, int c) +{ + *out = __builtin_amdgcn_lerp(a, b, c); +} + +// CHECK-LABEL: @test_cubeid( +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.cubeid(float %a, float %b, float %c) +void test_cubeid(global float* out, float a, float b, float c) { + *out = __builtin_amdgcn_cubeid(a, b, c); +} + +// CHECK-LABEL: @test_cubesc( +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.cubesc(float %a, float %b, float %c) +void test_cubesc(global float* out, float a, float b, float c) { + *out = __builtin_amdgcn_cubesc(a, b, c); +} + +// CHECK-LABEL: @test_cubetc( +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.cubetc(float %a, float %b, float %c) +void test_cubetc(global float* out, float a, float b, float c) { + *out = __builtin_amdgcn_cubetc(a, b, c); +} + +// CHECK-LABEL: @test_cubema( +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.cubema(float %a, float %b, float %c) +void test_cubema(global float* out, float a, float b, float c) { + *out = __builtin_amdgcn_cubema(a, b, c); +} + +// CHECK-LABEL: @test_cvt_pknorm_i16( +// CHECK: tail call{{.*}} <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %src0, float %src1) 
+kernel void test_cvt_pknorm_i16(global short2* out, float src0, float src1) { + *out = __builtin_amdgcn_cvt_pknorm_i16(src0, src1); +} + +// CHECK-LABEL: @test_cvt_pknorm_u16( +// CHECK: tail call{{.*}} <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %src0, float %src1) +kernel void test_cvt_pknorm_u16(global ushort2* out, float src0, float src1) { + *out = __builtin_amdgcn_cvt_pknorm_u16(src0, src1); +} + +// CHECK-LABEL: @test_sad_u8( +// CHECK: tail call{{.*}} i32 @llvm.amdgcn.sad.u8(i32 %src0, i32 %src1, i32 %src2) +kernel void test_sad_u8(global uint* out, uint src0, uint src1, uint src2) { + *out = __builtin_amdgcn_sad_u8(src0, src1, src2); +} + +// CHECK-LABEL: test_msad_u8( +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.msad.u8(i32 %src0, i32 %src1, i32 %src2) +kernel void test_msad_u8(global uint* out, uint src0, uint src1, uint src2) { + *out = __builtin_amdgcn_msad_u8(src0, src1, src2); +} + +// CHECK-LABEL: test_sad_hi_u8( +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.sad.hi.u8(i32 %src0, i32 %src1, i32 %src2) +kernel void test_sad_hi_u8(global uint* out, uint src0, uint src1, uint src2) { + *out = __builtin_amdgcn_sad_hi_u8(src0, src1, src2); +} + +// CHECK-LABEL: @test_sad_u16( +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.sad.u16(i32 %src0, i32 %src1, i32 %src2) +kernel void test_sad_u16(global uint* out, uint src0, uint src1, uint src2) { + *out = __builtin_amdgcn_sad_u16(src0, src1, src2); +} + +// CHECK-LABEL: @test_qsad_pk_u16_u8( +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %src0, i32 %src1, i64 %src2) +kernel void test_qsad_pk_u16_u8(global ulong* out, ulong src0, uint src1, ulong src2) { + *out = __builtin_amdgcn_qsad_pk_u16_u8(src0, src1, src2); +} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl index ab0b0b936abdc..b92454de60c78 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -251,13 +251,6 @@ void 
test_fract_f64(global int* out, double a) *out = __builtin_amdgcn_fract(a); } -// CHECK-LABEL: @test_lerp -// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.lerp -void test_lerp(global int* out, int a, int b, int c) -{ - *out = __builtin_amdgcn_lerp(a, b, c); -} - // CHECK-LABEL: @test_sicmp_i32 // CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.icmp.i64.i32(i32 %a, i32 %b, i32 32) void test_sicmp_i32(global ulong* out, int a, int b) @@ -865,30 +858,6 @@ void test_s_setprio() __builtin_amdgcn_s_setprio(3); } -// CHECK-LABEL: @test_cubeid( -// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.cubeid(float %a, float %b, float %c) -void test_cubeid(global float* out, float a, float b, float c) { - *out = __builtin_amdgcn_cubeid(a, b, c); -} - -// CHECK-LABEL: @test_cubesc( -// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.cubesc(float %a, float %b, float %c) -void test_cubesc(global float* out, float a, float b, float c) { - *out = __builtin_amdgcn_cubesc(a, b, c); -} - -// CHECK-LABEL: @test_cubetc( -// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.cubetc(float %a, float %b, float %c) -void test_cubetc(global float* out, float a, float b, float c) { - *out = __builtin_amdgcn_cubetc(a, b, c); -} - -// CHECK-LABEL: @test_cubema( -// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.cubema(float %a, float %b, float %c) -void test_cubema(global float* out, float a, float b, float c) { - *out = __builtin_amdgcn_cubema(a, b, c); -} - // CHECK-LABEL: @test_read_exec( // CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.ballot.i64(i1 true) void test_read_exec(global ulong* out) { @@ -1139,18 +1108,6 @@ kernel void test_cvt_pkrtz(global half2* out, float src0, float src1) { *out = __builtin_amdgcn_cvt_pkrtz(src0, src1); } -// CHECK-LABEL: @test_cvt_pknorm_i16( -// CHECK: tail call{{.*}} <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %src0, float %src1) -kernel void test_cvt_pknorm_i16(global short2* out, float src0, float src1) { - *out = __builtin_amdgcn_cvt_pknorm_i16(src0, src1); -} - -// CHECK-LABEL: 
@test_cvt_pknorm_u16( -// CHECK: tail call{{.*}} <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %src0, float %src1) -kernel void test_cvt_pknorm_u16(global ushort2* out, float src0, float src1) { - *out = __builtin_amdgcn_cvt_pknorm_u16(src0, src1); -} - // CHECK-LABEL: @test_cvt_pk_i16( // CHECK: tail call{{.*}} <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %src0, i32 %src1) kernel void test_cvt_pk_i16(global short2* out, int src0, int src1) { @@ -1169,36 +1126,12 @@ kernel void test_cvt_pk_u8_f32(global uint* out, float src0, uint src1, uint src *out = __builtin_amdgcn_cvt_pk_u8_f32(src0, src1, src2); } -// CHECK-LABEL: @test_sad_u8( -// CHECK: tail call{{.*}} i32 @llvm.amdgcn.sad.u8(i32 %src0, i32 %src1, i32 %src2) -kernel void test_sad_u8(global uint* out, uint src0, uint src1, uint src2) { - *out = __builtin_amdgcn_sad_u8(src0, src1, src2); -} - // CHECK-LABEL: test_msad_u8( // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.msad.u8(i32 %src0, i32 %src1, i32 %src2) kernel void test_msad_u8(global uint* out, uint src0, uint src1, uint src2) { *out = __builtin_amdgcn_msad_u8(src0, src1, src2); } -// CHECK-LABEL: test_sad_hi_u8( -// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.sad.hi.u8(i32 %src0, i32 %src1, i32 %src2) -kernel void test_sad_hi_u8(global uint* out, uint src0, uint src1, uint src2) { - *out = __builtin_amdgcn_sad_hi_u8(src0, src1, src2); -} - -// CHECK-LABEL: @test_sad_u16( -// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.sad.u16(i32 %src0, i32 %src1, i32 %src2) -kernel void test_sad_u16(global uint* out, uint src0, uint src1, uint src2) { - *out = __builtin_amdgcn_sad_u16(src0, src1, src2); -} - -// CHECK-LABEL: @test_qsad_pk_u16_u8( -// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %src0, i32 %src1, i64 %src2) -kernel void test_qsad_pk_u16_u8(global ulong* out, ulong src0, uint src1, ulong src2) { - *out = __builtin_amdgcn_qsad_pk_u16_u8(src0, src1, src2); -} - // CHECK-LABEL: @test_mqsad_pk_u16_u8( // CHECK: {{.*}}call{{.*}} i64 
@llvm.amdgcn.mqsad.pk.u16.u8(i64 %src0, i32 %src1, i64 %src2) kernel void test_mqsad_pk_u16_u8(global ulong* out, ulong src0, uint src1, ulong src2) { diff --git a/clang/test/OpenMP/amdgcn-attributes.cpp b/clang/test/OpenMP/amdgcn-attributes.cpp index 2c9e16a4f5098..03f5c31e3157c 100644 --- a/clang/test/OpenMP/amdgcn-attributes.cpp +++ b/clang/test/OpenMP/amdgcn-attributes.cpp @@ -32,9 +32,9 @@ int callable(int x) { } // DEFAULT: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } -// CPU: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" "uniform-work-group-size"="true" } +// CPU: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" "uniform-work-group-size"="true" } // NOIEEE: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "amdgpu-ieee"="false" "kernel" "no-nans-fp-math"="true" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } // DEFAULT: attributes #2 = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" } -// CPU: attributes #2 = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } +// CPU: attributes #2 = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" } // NOIEEE: attributes #2 = { convergent mustprogress noinline nounwind optnone "amdgpu-ieee"="false" "no-nans-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } diff --git a/flang/test/Lower/OpenMP/target_cpu_features.f90 b/flang/test/Lower/OpenMP/target_cpu_features.f90 index 4532593156eab..341cfc7991d43 100644 --- a/flang/test/Lower/OpenMP/target_cpu_features.f90 +++ b/flang/test/Lower/OpenMP/target_cpu_features.f90 @@ -11,8 +11,8 @@ !AMDGCN-SAME: fir.target_features = #llvm.target_features<["+16-bit-insts", "+ci-insts", !AMDGCN-SAME: "+dl-insts", "+dot1-insts", "+dot10-insts", "+dot2-insts", "+dot3-insts", !AMDGCN-SAME: "+dot4-insts", "+dot5-insts", "+dot6-insts", "+dot7-insts", "+dpp", -!AMDGCN-SAME: "+gfx8-insts", "+gfx9-insts", "+gws", "+image-insts", "+mai-insts", -!AMDGCN-SAME: "+s-memrealtime", "+s-memtime-inst", "+vmem-to-lds-load-insts", "+wavefrontsize64"]> +!AMDGCN-SAME: "+gfx8-insts", "+gfx9-insts", "+gws", "+image-insts", "+lerp-inst", "+mai-insts", +!AMDGCN-SAME: "+qsad-insts", "+s-memrealtime", "+s-memtime-inst", "+sad-insts", "+vmem-to-lds-load-insts", "+wavefrontsize64"]> !NVPTX: module attributes { !NVPTX-SAME: fir.target_cpu = "sm_80" diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index b008354cfd462..5dea64844e64e 100644 --- 
a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -901,6 +901,48 @@ def FeaturePkFmacF16Inst : SubtargetFeature<"pk-fmac-f16-inst", "Has v_pk_fmac_f16 instruction" >; +def FeatureCubeInsts : SubtargetFeature<"cube-insts", + "HasCubeInsts", + "true", + "Has v_cube* instructions" +>; + +def FeatureLerpInst : SubtargetFeature<"lerp-inst", + "HasLerpInst", + "true", + "Has v_lerp_u8 instruction" +>; + +def FeatureSadInsts : SubtargetFeature<"sad-insts", + "HasSadInsts", + "true", + "Has v_sad* instructions" +>; + +def FeatureQsadInsts : SubtargetFeature<"qsad-insts", + "HasQsadInsts", + "true", + "Has v_qsad* instructions" +>; + +def FeatureCvtNormInsts : SubtargetFeature<"cvt-norm-insts", + "HasCvtNormInsts", + "true", + "Has v_cvt_norm* instructions" +>; + +def FeatureCvtPkNormVOP2Insts : SubtargetFeature<"cvt-pknorm-vop2-insts", + "HasCvtPkNormVOP2Insts", + "true", + "Has v_cvt_pk_norm_*f32 instructions/Has v_cvt_pk_norm_*_f16 instructions" +>; + +def FeatureCvtPkNormVOP3Insts : SubtargetFeature<"cvt-pknorm-vop3-insts", + "HasCvtPkNormVOP3Insts", + "true", + "Has v_cvt_pk_norm_*f32 instructions/Has v_cvt_pk_norm_*_f16 instructions" +>; + def FeatureAtomicDsPkAdd16Insts : SubtargetFeature<"atomic-ds-pk-add-16-insts", "HasAtomicDsPkAdd16Insts", "true", @@ -1494,7 +1536,8 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS", FeatureTrigReducedRange, FeatureExtendedImageInsts, FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts, - FeatureVmemWriteVgprInOrder + FeatureVmemWriteVgprInOrder, FeatureCubeInsts, FeatureLerpInst, + FeatureSadInsts, FeatureCvtPkNormVOP2Insts ] >; @@ -1508,7 +1551,8 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS", FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts, 
FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts, - FeatureVmemWriteVgprInOrder + FeatureVmemWriteVgprInOrder, FeatureCubeInsts, FeatureLerpInst, + FeatureSadInsts, FeatureQsadInsts, FeatureCvtPkNormVOP2Insts ] >; @@ -1524,7 +1568,9 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS", FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureFastDenormalF32, FeatureUnalignedBufferAccess, FeatureImageInsts, FeatureGDS, FeatureGWS, - FeatureDefaultComponentZero, FeatureVmemWriteVgprInOrder + FeatureDefaultComponentZero, FeatureVmemWriteVgprInOrder, FeatureCubeInsts, + FeatureLerpInst, FeatureSadInsts, FeatureQsadInsts, + FeatureCvtPkNormVOP2Insts ] >; @@ -1543,7 +1589,10 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK, FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess, FeatureUnalignedDSAccess, FeatureNegativeScratchOffsetBug, FeatureGWS, - FeatureDefaultComponentZero,FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad + FeatureDefaultComponentZero,FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad, + FeatureCubeInsts, FeatureLerpInst, FeatureSadInsts, FeatureQsadInsts, + FeatureCvtNormInsts, FeatureCvtPkNormVOP2Insts, + FeatureCvtPkNormVOP3Insts ] >; @@ -1567,7 +1616,10 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", FeatureDefaultComponentZero, FeatureMaxHardClauseLength63, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts, - FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad + FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad, FeatureCubeInsts, + FeatureLerpInst, FeatureSadInsts, FeatureQsadInsts, + FeatureCvtNormInsts, FeatureCvtPkNormVOP2Insts, + FeatureCvtPkNormVOP3Insts ] >; @@ -1590,7 +1642,9 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11", 
FeatureUnalignedDSAccess, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, FeatureMaxHardClauseLength32, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, - FeatureVmemWriteVgprInOrder + FeatureVmemWriteVgprInOrder, FeatureCubeInsts, FeatureLerpInst, + FeatureSadInsts, FeatureQsadInsts, FeatureCvtNormInsts, + FeatureCvtPkNormVOP2Insts, FeatureCvtPkNormVOP3Insts ] >; @@ -2069,10 +2123,17 @@ def FeatureISAVersion12 : FeatureSet< FeatureMemoryAtomicFAddF32DenormalSupport, FeatureBVHDualAndBVH8Insts, FeatureWaitsBeforeSystemScopeStores, - FeatureD16Writes32BitVgpr + FeatureD16Writes32BitVgpr, + FeatureCubeInsts, + FeatureLerpInst, + FeatureSadInsts, + FeatureQsadInsts, + FeatureCvtNormInsts, + FeatureCvtPkNormVOP2Insts, + FeatureCvtPkNormVOP3Insts ]>; -def FeatureISAVersion12_50 : FeatureSet< +def FeatureISAVersion12_50_Common : FeatureSet< [FeatureGFX12, FeatureGFX1250Insts, FeatureRequiresAlignedVGPRs, @@ -2147,6 +2208,16 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureD16Writes32BitVgpr, ]>; +def FeatureISAVersion12_50 : FeatureSet< + !listconcat(FeatureISAVersion12_50_Common.Features, + [FeatureCubeInsts, + FeatureLerpInst, + FeatureSadInsts, + FeatureQsadInsts, + FeatureCvtNormInsts, + FeatureCvtPkNormVOP2Insts, + FeatureCvtPkNormVOP3Insts])>; + def FeatureISAVersion12_51 : FeatureSet< !listconcat(FeatureISAVersion12_50.Features, [FeatureDPALU_DPP])>; @@ -2816,6 +2887,27 @@ def HasFP8Insts : Predicate<"Subtarget->hasFP8Insts()">, def HasFP8ConversionInsts : Predicate<"Subtarget->hasFP8ConversionInsts()">, AssemblerPredicate<(all_of FeatureFP8ConversionInsts)>; +def HasCubeInsts : Predicate<"Subtarget->hasCubeInsts()">, + AssemblerPredicate<(all_of FeatureCubeInsts)>; + +def HasLerpInst : Predicate<"Subtarget->hasLerpInst()">, + AssemblerPredicate<(all_of FeatureLerpInst)>; + +def HasSadInsts : Predicate<"Subtarget->hasSadInsts()">, + AssemblerPredicate<(all_of FeatureSadInsts)>; + +def HasQsadInsts : 
Predicate<"Subtarget->hasQsadInsts()">, + AssemblerPredicate<(all_of FeatureQsadInsts)>; + +def HasCvtNormInsts : Predicate<"Subtarget->hasCvtNormInsts()">, + AssemblerPredicate<(all_of FeatureCvtNormInsts)>; + +def HasCvtPkNormVOP2Insts : Predicate<"Subtarget->hasCvtPkNormVOP2Insts()">, + AssemblerPredicate<(all_of FeatureCvtPkNormVOP2Insts)>; + +def HasCvtPkNormVOP3Insts : Predicate<"Subtarget->hasCvtPkNormVOP3Insts()">, + AssemblerPredicate<(all_of FeatureCvtPkNormVOP3Insts)>; + def HasFP8E5M3Insts : Predicate<"Subtarget->hasFP8E5M3Insts()">, AssemblerPredicate<(all_of FeatureFP8E5M3Insts)>; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index a87f9f274a4d3..cb27f474d78f3 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -166,6 +166,13 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasMAIInsts = false; bool HasFP8Insts = false; bool HasFP8ConversionInsts = false; + bool HasCubeInsts = false; + bool HasLerpInst = false; + bool HasSadInsts = false; + bool HasQsadInsts = false; + bool HasCvtNormInsts = false; + bool HasCvtPkNormVOP2Insts = false; + bool HasCvtPkNormVOP3Insts = false; bool HasFP8E5M3Insts = false; bool HasCvtFP8Vop1Bug = false; bool HasPkFmacF16Inst = false; @@ -892,6 +899,20 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasFP8ConversionInsts() const { return HasFP8ConversionInsts; } + bool hasCubeInsts() const { return HasCubeInsts; } + + bool hasLerpInst() const { return HasLerpInst; } + + bool hasSadInsts() const { return HasSadInsts; } + + bool hasQsadInsts() const { return HasQsadInsts; } + + bool hasCvtNormInsts() const { return HasCvtNormInsts; } + + bool hasCvtPkNormVOP2Insts() const { return HasCvtPkNormVOP2Insts; } + + bool hasCvtPkNormVOP3Insts() const { return HasCvtPkNormVOP3Insts; } + bool hasFP8E5M3Insts() const { return HasFP8E5M3Insts; } bool hasPkFmacF16Inst() const { diff --git 
a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 85adcab55b742..1d1e95908fce6 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -616,15 +616,15 @@ let SubtargetPredicate = isGFX9Plus in { let isReMaterializable = 1 in defm V_SAT_PK_U8_I16 : VOP1Inst_t16<"v_sat_pk_u8_i16", VOP_I16_I32>; - - let mayRaiseFPException = 0 in { - defm V_CVT_NORM_I16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_norm_i16_f16", - VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16>; - defm V_CVT_NORM_U16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_norm_u16_f16", - VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16>; - } // End mayRaiseFPException = 0 } // End SubtargetPredicate = isGFX9Plus +let mayRaiseFPException = 0, SubtargetPredicate = HasCvtNormInsts in { +defm V_CVT_NORM_I16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_norm_i16_f16", + VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16>; +defm V_CVT_NORM_U16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_norm_u16_f16", + VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16>; +} // End mayRaiseFPException = 0, SubtargetPredicate = HasCvtNormInsts + let SubtargetPredicate = isGFX9Only in { defm V_SCREEN_PARTITION_4SE_B32 : VOP1Inst <"v_screen_partition_4se_b32", VOP_I32_I32>; } // End SubtargetPredicate = isGFX9Only diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index d87d250a034f0..dbb7862ab4ab5 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -971,7 +971,7 @@ defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_I32_I32_I32, int_a } // End IsNeverUniform = 1 defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_F32_F32_I32, any_fldexp>; -let ReadsModeReg = 0, mayRaiseFPException = 0 in { +let 
ReadsModeReg = 0, mayRaiseFPException = 0, SubtargetPredicate = HasCvtPkNormVOP2Insts in { defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_V2I16_F32_F32, AMDGPUpknorm_i16_f32>; defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_V2I16_F32_F32, AMDGPUpknorm_u16_f32>; } diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 05ba76ab489d8..872bde501cd2d 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -185,7 +185,8 @@ defm V_FMA_LEGACY_F32 : VOP3Inst <"v_fma_legacy_f32", defm V_MAD_I32_I24 : VOP3Inst <"v_mad_i32_i24", VOP3_Profile>; defm V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile>; defm V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile, any_fma>, VOPD_Component<0x13, "v_fma_f32">; -defm V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile, int_amdgcn_lerp>; +let SubtargetPredicate = HasLerpInst in + defm V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile, int_amdgcn_lerp>; let SchedRW = [WriteIntMul] in { let SubtargetPredicate = HasMadU32Inst in @@ -258,12 +259,12 @@ defm V_DIV_FMAS_F64 : VOP3Inst <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC>; } // End isCommutable = 1 let isReMaterializable = 1 in { -let mayRaiseFPException = 0 in { +let mayRaiseFPException = 0, SubtargetPredicate = HasCubeInsts in { defm V_CUBEID_F32 : VOP3Inst <"v_cubeid_f32", VOP3_Profile, int_amdgcn_cubeid>; defm V_CUBESC_F32 : VOP3Inst <"v_cubesc_f32", VOP3_Profile, int_amdgcn_cubesc>; defm V_CUBETC_F32 : VOP3Inst <"v_cubetc_f32", VOP3_Profile, int_amdgcn_cubetc>; defm V_CUBEMA_F32 : VOP3Inst <"v_cubema_f32", VOP3_Profile, int_amdgcn_cubema>; -} // End mayRaiseFPException +} // mayRaiseFPException = 0, SubtargetPredicate = HasCubeInsts defm V_BFE_U32 : VOP3Inst <"v_bfe_u32", VOP3_Profile, AMDGPUbfe_u32>; defm V_BFE_I32 : VOP3Inst <"v_bfe_i32", VOP3_Profile, AMDGPUbfe_i32>; @@ -306,12 +307,12 @@ let SubtargetPredicate = HasMinimum3Maximum3F32, 
ReadsModeReg = 0 in { defm V_MAXIMUM3_F32 : VOP3Inst <"v_maximum3_f32", VOP3_Profile, AMDGPUfmaximum3>; } // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 -let isCommutable = 1 in { +let isCommutable = 1, SubtargetPredicate = HasSadInsts in { defm V_SAD_U8 : VOP3Inst <"v_sad_u8", VOP3_Profile>; defm V_SAD_HI_U8 : VOP3Inst <"v_sad_hi_u8", VOP3_Profile>; defm V_SAD_U16 : VOP3Inst <"v_sad_u16", VOP3_Profile>; defm V_SAD_U32 : VOP3Inst <"v_sad_u32", VOP3_Profile>; -} // End isCommutable = 1 +} // End isCommutable = 1, SubtargetPredicate = HasSadInsts defm V_CVT_PK_U8_F32 : VOP3Inst<"v_cvt_pk_u8_f32", VOP3_Profile, int_amdgcn_cvt_pk_u8_f32>; defm V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", DIV_FIXUP_F32_PROF, AMDGPUdiv_fixup>; @@ -424,7 +425,8 @@ def VOPProfileMQSAD : VOP3_Profile { let SubtargetPredicate = isGFX7Plus in { let Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] in { -defm V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile>; +let SubtargetPredicate = HasQsadInsts in + defm V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile>; defm V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOPProfileMQSAD>; } // End Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] } // End SubtargetPredicate = isGFX7Plus @@ -789,9 +791,6 @@ let isCommutable = 1 in { defm V_MAD_I32_I16 : VOP3Inst_t16 <"v_mad_i32_i16", VOP_I32_I16_I16_I32>; } // End isCommutable = 1 -defm V_CVT_PKNORM_I16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_i16_f16", VOP_B32_F16_F16>; -defm V_CVT_PKNORM_U16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_u16_f16", VOP_B32_F16_F16>; - defm V_PACK_B32_F16 : VOP3Inst_t16 <"v_pack_b32_f16", VOP_B32_F16_F16>; let isReMaterializable = 1 in { @@ -996,6 +995,11 @@ def : GCNPat<(DivergentBinFrag (or_oneuse i64:$src0, i64:$src1), i64:$src2), } // End SubtargetPredicate = isGFX9Plus +let SubtargetPredicate = HasCvtPkNormVOP3Insts in { + defm V_CVT_PKNORM_I16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_i16_f16", VOP_B32_F16_F16>; + defm 
V_CVT_PKNORM_U16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_u16_f16", VOP_B32_F16_F16>; +} // end SubtargetPredicate = HasCvtPkNormVOP3Insts + // FIXME: Probably should hardcode clamp bit in pseudo and avoid this. class OpSelBinOpClampPat : GCNPat< diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index 96bef0e574a45..28f3649a840d6 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -447,6 +447,11 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, Features["atomic-fmin-fmax-global-f64"] = true; Features["wavefrontsize32"] = true; Features["clusters"] = true; + Features["cube-insts"] = true; + Features["lerp-inst"] = true; + Features["sad-insts"] = true; + Features["qsad-insts"] = true; + Features["cvt-pknorm-vop2-insts"] = true; break; case GK_GFX1201: case GK_GFX1200: @@ -474,6 +479,11 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, Features["gfx12-insts"] = true; Features["atomic-fadd-rtn-insts"] = true; Features["image-insts"] = true; + Features["cube-insts"] = true; + Features["lerp-inst"] = true; + Features["sad-insts"] = true; + Features["qsad-insts"] = true; + Features["cvt-pknorm-vop2-insts"] = true; Features["fp8-conversion-insts"] = true; Features["atomic-fmin-fmax-global-f32"] = true; break; @@ -503,6 +513,11 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, Features["gfx11-insts"] = true; Features["atomic-fadd-rtn-insts"] = true; Features["image-insts"] = true; + Features["cube-insts"] = true; + Features["lerp-inst"] = true; + Features["sad-insts"] = true; + Features["qsad-insts"] = true; + Features["cvt-pknorm-vop2-insts"] = true; Features["gws"] = true; Features["atomic-fmin-fmax-global-f32"] = true; break; @@ -535,6 +550,11 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, Features["vmem-to-lds-load-insts"] = true; Features["atomic-fmin-fmax-global-f32"] = true; 
Features["atomic-fmin-fmax-global-f64"] = true; + Features["cube-insts"] = true; + Features["lerp-inst"] = true; + Features["sad-insts"] = true; + Features["qsad-insts"] = true; + Features["cvt-pknorm-vop2-insts"] = true; break; case GK_GFX1012: case GK_GFX1011: @@ -562,6 +582,11 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, Features["vmem-to-lds-load-insts"] = true; Features["atomic-fmin-fmax-global-f32"] = true; Features["atomic-fmin-fmax-global-f64"] = true; + Features["cube-insts"] = true; + Features["lerp-inst"] = true; + Features["sad-insts"] = true; + Features["qsad-insts"] = true; + Features["cvt-pknorm-vop2-insts"] = true; break; case GK_GFX950: Features["bitop3-insts"] = true; @@ -615,6 +640,11 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, Features["vmem-to-lds-load-insts"] = true; Features["atomic-fmin-fmax-global-f64"] = true; Features["wavefrontsize64"] = true; + Features["cube-insts"] = true; + Features["lerp-inst"] = true; + Features["sad-insts"] = true; + Features["qsad-insts"] = true; + Features["cvt-pknorm-vop2-insts"] = true; break; case GK_GFX90A: Features["gfx90a-insts"] = true; @@ -659,6 +689,11 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, Features["s-memtime-inst"] = true; Features["gws"] = true; Features["wavefrontsize64"] = true; + Features["cube-insts"] = true; + Features["lerp-inst"] = true; + Features["sad-insts"] = true; + Features["qsad-insts"] = true; + Features["cvt-pknorm-vop2-insts"] = true; break; case GK_GFX705: case GK_GFX704: @@ -667,7 +702,18 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, case GK_GFX701: case GK_GFX700: Features["ci-insts"] = true; - [[fallthrough]]; + Features["cube-insts"] = true; + Features["lerp-inst"] = true; + Features["sad-insts"] = true; + Features["qsad-insts"] = true; + Features["cvt-pknorm-vop2-insts"] = true; + Features["image-insts"] = true; + Features["s-memtime-inst"] = true; + Features["gws"] = true; + 
Features["atomic-fmin-fmax-global-f32"] = true; + Features["atomic-fmin-fmax-global-f64"] = true; + Features["wavefrontsize64"] = true; + break; case GK_GFX602: case GK_GFX601: case GK_GFX600: @@ -677,6 +723,10 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, Features["atomic-fmin-fmax-global-f32"] = true; Features["atomic-fmin-fmax-global-f64"] = true; Features["wavefrontsize64"] = true; + Features["cube-insts"] = true; + Features["lerp-inst"] = true; + Features["sad-insts"] = true; + Features["cvt-pknorm-vop2-insts"] = true; break; case GK_NONE: break; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll index 43c69baaf3e7f..49169eec072b6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll @@ -1,4 +1,3 @@ -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GCN %s declare i32 @llvm.amdgcn.lerp(i32, i32, i32) #0 From 3d3844f71dbbf378fddc88030cd3ce393e06ed50 Mon Sep 17 00:00:00 2001 From: Ron Lieberman Date: Wed, 19 Nov 2025 06:39:43 -0600 Subject: [PATCH 15/15] Revert "[AMDGPU] Adding instruction specific features (#167809)" Per Shore: revert locally, he will reapply This reverts commit 52a58a4193935f60df70eb45f8ec7c61f142ac3b. 
--- clang/include/clang/Basic/BuiltinsAMDGPU.def | 22 ++-- .../CodeGen/amdgpu-builtin-is-invocable.c | 4 +- .../CodeGen/amdgpu-builtin-processor-is.c | 4 +- clang/test/CodeGen/link-builtin-bitcode.c | 8 +- .../test/CodeGenOpenCL/amdgpu-cluster-dims.cl | 4 +- .../CodeGenOpenCL/amdgpu-enqueue-kernel.cl | 8 +- clang/test/CodeGenOpenCL/amdgpu-features.cl | 98 ++++++++-------- ...eadonly-features-written-with-no-target.cl | 6 +- .../CodeGenOpenCL/builtins-amdgcn-fiji.cl | 86 -------------- clang/test/CodeGenOpenCL/builtins-amdgcn.cl | 67 +++++++++++ clang/test/OpenMP/amdgcn-attributes.cpp | 2 + .../test/Lower/OpenMP/target_cpu_features.f90 | 4 +- llvm/lib/Target/AMDGPU/AMDGPU.td | 108 ++---------------- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 21 ---- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 14 +-- llvm/lib/Target/AMDGPU/VOP2Instructions.td | 2 +- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 22 ++-- llvm/lib/TargetParser/TargetParser.cpp | 52 +-------- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll | 1 + revert_patches.txt | 3 + 20 files changed, 178 insertions(+), 358 deletions(-) delete mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-fiji.cl diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 69d7f8e8c3094..654e09c753109 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -122,13 +122,13 @@ BUILTIN(__builtin_amdgcn_frexp_exp, "id", "nc") BUILTIN(__builtin_amdgcn_frexp_expf, "if", "nc") BUILTIN(__builtin_amdgcn_fract, "dd", "nc") BUILTIN(__builtin_amdgcn_fractf, "ff", "nc") -TARGET_BUILTIN(__builtin_amdgcn_lerp, "UiUiUiUi", "nc", "lerp-inst") +BUILTIN(__builtin_amdgcn_lerp, "UiUiUiUi", "nc") BUILTIN(__builtin_amdgcn_class, "bdi", "nc") BUILTIN(__builtin_amdgcn_classf, "bfi", "nc") -TARGET_BUILTIN(__builtin_amdgcn_cubeid, "ffff", "nc", "cube-insts") -TARGET_BUILTIN(__builtin_amdgcn_cubesc, "ffff", "nc", "cube-insts") 
-TARGET_BUILTIN(__builtin_amdgcn_cubetc, "ffff", "nc", "cube-insts") -TARGET_BUILTIN(__builtin_amdgcn_cubema, "ffff", "nc", "cube-insts") +BUILTIN(__builtin_amdgcn_cubeid, "ffff", "nc") +BUILTIN(__builtin_amdgcn_cubesc, "ffff", "nc") +BUILTIN(__builtin_amdgcn_cubetc, "ffff", "nc") +BUILTIN(__builtin_amdgcn_cubema, "ffff", "nc") BUILTIN(__builtin_amdgcn_s_sleep, "vIi", "n") BUILTIN(__builtin_amdgcn_s_incperflevel, "vIi", "n") BUILTIN(__builtin_amdgcn_s_decperflevel, "vIi", "n") @@ -149,17 +149,17 @@ BUILTIN(__builtin_amdgcn_alignbyte, "UiUiUiUi", "nc") BUILTIN(__builtin_amdgcn_ubfe, "UiUiUiUi", "nc") BUILTIN(__builtin_amdgcn_sbfe, "UiUiUiUi", "nc") BUILTIN(__builtin_amdgcn_cvt_pkrtz, "E2hff", "nc") -TARGET_BUILTIN(__builtin_amdgcn_cvt_pknorm_i16, "E2sff", "nc", "cvt-pknorm-vop2-insts") -TARGET_BUILTIN(__builtin_amdgcn_cvt_pknorm_u16, "E2Usff", "nc", "cvt-pknorm-vop2-insts") +BUILTIN(__builtin_amdgcn_cvt_pknorm_i16, "E2sff", "nc") +BUILTIN(__builtin_amdgcn_cvt_pknorm_u16, "E2Usff", "nc") BUILTIN(__builtin_amdgcn_cvt_pk_i16, "E2sii", "nc") BUILTIN(__builtin_amdgcn_cvt_pk_u16, "E2UsUiUi", "nc") BUILTIN(__builtin_amdgcn_cvt_pk_u8_f32, "UifUiUi", "nc") BUILTIN(__builtin_amdgcn_cvt_off_f32_i4, "fi", "nc") +BUILTIN(__builtin_amdgcn_sad_u8, "UiUiUiUi", "nc") BUILTIN(__builtin_amdgcn_msad_u8, "UiUiUiUi", "nc") -TARGET_BUILTIN(__builtin_amdgcn_sad_u8, "UiUiUiUi", "nc", "sad-insts") -TARGET_BUILTIN(__builtin_amdgcn_sad_hi_u8, "UiUiUiUi", "nc", "sad-insts") -TARGET_BUILTIN(__builtin_amdgcn_sad_u16, "UiUiUiUi", "nc", "sad-insts") -TARGET_BUILTIN(__builtin_amdgcn_qsad_pk_u16_u8, "WUiWUiUiWUi", "nc", "qsad-insts") +BUILTIN(__builtin_amdgcn_sad_hi_u8, "UiUiUiUi", "nc") +BUILTIN(__builtin_amdgcn_sad_u16, "UiUiUiUi", "nc") +BUILTIN(__builtin_amdgcn_qsad_pk_u16_u8, "WUiWUiUiWUi", "nc") BUILTIN(__builtin_amdgcn_mqsad_pk_u16_u8, "WUiWUiUiWUi", "nc") BUILTIN(__builtin_amdgcn_mqsad_u32_u8, "V4UiWUiUiV4Ui", "nc") diff --git a/clang/test/CodeGen/amdgpu-builtin-is-invocable.c 
b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c index 5a3395d2e0c55..b33e5ae041aee 100644 --- a/clang/test/CodeGen/amdgpu-builtin-is-invocable.c +++ b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c @@ -42,9 +42,9 @@ void foo() { return __builtin_trap(); } //. -// AMDGCN-GFX900: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" } +// AMDGCN-GFX900: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } //. -// AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize32" } +// AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" } // AMDGCN-GFX1010: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) } //. 
// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64" } diff --git a/clang/test/CodeGen/amdgpu-builtin-processor-is.c b/clang/test/CodeGen/amdgpu-builtin-processor-is.c index 4c55160e5ea6d..8241c98fc3c77 100644 --- a/clang/test/CodeGen/amdgpu-builtin-processor-is.c +++ b/clang/test/CodeGen/amdgpu-builtin-processor-is.c @@ -40,10 +40,10 @@ void foo() { return __builtin_trap(); } //. 
-// AMDGCN-GFX900: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" } +// AMDGCN-GFX900: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } // AMDGCN-GFX900: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) } //. -// AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize32" } +// AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" } //. 
// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64" } // AMDGCNSPIRV: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) } diff --git a/clang/test/CodeGen/link-builtin-bitcode.c b/clang/test/CodeGen/link-builtin-bitcode.c index f6e45bf573705..9a5b6de3c3b38 100644 --- a/clang/test/CodeGen/link-builtin-bitcode.c +++ b/clang/test/CodeGen/link-builtin-bitcode.c @@ -43,7 +43,7 @@ int bar() { return no_attr() + attr_in_target() + attr_not_in_target() + attr_in // CHECK-LABEL: @attr_incompatible // CHECK-SAME: () #[[ATTR_INCOMPATIBLE:[0-9]+]] { -// CHECK: attributes #[[ATTR_BAR]] = { {{.*}} "target-cpu"="gfx90a" 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+lerp-inst,+mai-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" } -// CHECK: attributes #[[ATTR_COMPATIBLE]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+vmem-to-lds-load-insts,+wavefrontsize64" } -// CHECK: attributes #[[ATTR_EXTEND]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+extended-image-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+vmem-to-lds-load-insts,+wavefrontsize64" } -// CHECK: attributes #[[ATTR_INCOMPATIBLE]] = { {{.*}} "target-cpu"="gfx90a" 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx90a-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+vmem-to-lds-load-insts,+wavefrontsize64,-gfx9-insts" } +// CHECK: attributes #[[ATTR_BAR]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } +// CHECK: attributes #[[ATTR_COMPATIBLE]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gws,+image-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+vmem-to-lds-load-insts,+wavefrontsize64" } +// CHECK: attributes #[[ATTR_EXTEND]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+extended-image-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gws,+image-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+vmem-to-lds-load-insts,+wavefrontsize64" } +// CHECK: attributes #[[ATTR_INCOMPATIBLE]] = { {{.*}} "target-cpu"="gfx90a" 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx90a-insts,+gws,+image-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+vmem-to-lds-load-insts,+wavefrontsize64,-gfx9-insts" } diff --git a/clang/test/CodeGenOpenCL/amdgpu-cluster-dims.cl b/clang/test/CodeGenOpenCL/amdgpu-cluster-dims.cl index c5656c49c4761..14fbeb24a96c2 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-cluster-dims.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-cluster-dims.cl @@ -26,8 +26,8 @@ kernel void foo(global int *p) { *p = 1; } // CHECK-NEXT: ret void // //. -// CHECK: attributes #[[ATTR0]] = { convergent norecurse nounwind "amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1250" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+sad-insts,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" "uniform-work-group-size"="false" } -// CHECK: attributes #[[ATTR1]] = { alwaysinline convergent norecurse nounwind "amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-cpu"="gfx1250" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+sad-insts,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" } +// CHECK: attributes #[[ATTR0]] = { convergent norecurse nounwind "amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1250" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" "uniform-work-group-size"="false" } +// CHECK: attributes #[[ATTR1]] = { alwaysinline convergent norecurse nounwind "amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256" "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" "target-cpu"="gfx1250" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" } // CHECK: attributes #[[ATTR2]] = { convergent nounwind } //. // CHECK: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} diff --git a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl index 2cbc9787a04b0..e9adac23a6530 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl @@ -816,12 +816,12 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU: attributes #[[ATTR10]] = { convergent nounwind } //. 
// GFX900: attributes #[[ATTR0:[0-9]+]] = { "objc_arc_inert" } -// GFX900: attributes #[[ATTR1]] = { convergent norecurse nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64,-sram-ecc" } -// GFX900: attributes #[[ATTR2]] = { convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64,-sram-ecc" "uniform-work-group-size"="false" } -// GFX900: attributes #[[ATTR3]] = { alwaysinline convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64,-sram-ecc" } +// GFX900: attributes #[[ATTR1]] = { convergent norecurse nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" } +// GFX900: attributes #[[ATTR2]] = { convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" 
"target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" "uniform-work-group-size"="false" } +// GFX900: attributes #[[ATTR3]] = { alwaysinline convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" } // GFX900: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } // GFX900: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } -// GFX900: attributes #[[ATTR6]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64,-sram-ecc" } +// GFX900: attributes #[[ATTR6]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" } // GFX900: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind willreturn } // GFX900: attributes #[[ATTR8]] = { convergent nounwind } // GFX900: attributes #[[ATTR9]] = { nounwind } diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl index bd162b40b8e47..9bd096f3fcbc7 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-features.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -63,55 +63,55 @@ // NOCPU-WAVE32: 
"target-features"="+wavefrontsize32" // NOCPU-WAVE64: "target-features"="+wavefrontsize64" -// GFX600: "target-features"="+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+cube-insts,+cvt-pknorm-vop2-insts,+lerp-inst,+s-memtime-inst,+sad-insts,+wavefrontsize64" -// GFX601: "target-features"="+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+cube-insts,+cvt-pknorm-vop2-insts,+lerp-inst,+s-memtime-inst,+sad-insts,+wavefrontsize64" -// GFX602: "target-features"="+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+cube-insts,+cvt-pknorm-vop2-insts,+lerp-inst,+s-memtime-inst,+sad-insts,+wavefrontsize64" -// GFX700: "target-features"="+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+lerp-inst,+qsad-insts,+s-memtime-inst,+sad-insts,+wavefrontsize64" -// GFX701: "target-features"="+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+lerp-inst,+qsad-insts,+s-memtime-inst,+sad-insts,+wavefrontsize64" -// GFX702: "target-features"="+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+lerp-inst,+qsad-insts,+s-memtime-inst,+sad-insts,+wavefrontsize64" -// GFX703: "target-features"="+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+lerp-inst,+qsad-insts,+s-memtime-inst,+sad-insts,+wavefrontsize64" -// GFX704: "target-features"="+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+lerp-inst,+qsad-insts,+s-memtime-inst,+sad-insts,+wavefrontsize64" -// GFX705: "target-features"="+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+lerp-inst,+qsad-insts,+s-memtime-inst,+sad-insts,+wavefrontsize64" -// GFX801: 
"target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" -// GFX802: "target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" -// GFX803: "target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" -// GFX805: "target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" -// GFX810: "target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" -// GFX900: "target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" -// GFX902: "target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" -// GFX904: "target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" -// GFX906: "target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" -// GFX908: 
"target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+lerp-inst,+mai-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" -// GFX909: "target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" -// GFX90A: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+lerp-inst,+mai-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" -// GFX90C: "target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" -// GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+lerp-inst,+mai-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64,+xf32-insts" -// GFX9_4_Generic: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+lerp-inst,+mai-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" -// GFX950: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+lerp-inst,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" -// GFX1010: "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize32" -// GFX1011: 
"target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize32" -// GFX1012: "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize32" -// GFX1013: "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize32" -// GFX1030: "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize32" -// GFX1031: "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize32" -// GFX1032: 
"target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize32" -// GFX1033: "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize32" -// GFX1034: "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize32" -// GFX1035: "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize32" -// GFX1036: "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize32" -// GFX1100: 
"target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+sad-insts,+wavefrontsize32" -// GFX1101: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+sad-insts,+wavefrontsize32" -// GFX1102: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+sad-insts,+wavefrontsize32" -// GFX1103: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+sad-insts,+wavefrontsize32" -// GFX1150: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+sad-insts,+wavefrontsize32" -// GFX1151: 
"target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+sad-insts,+wavefrontsize32" -// GFX1152: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+sad-insts,+wavefrontsize32" -// GFX1153: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+sad-insts,+wavefrontsize32" -// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-global-pk-add-bf16-inst,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+sad-insts,+wavefrontsize32" -// GFX1201: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-global-pk-add-bf16-inst,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+sad-insts,+wavefrontsize32" -// GFX1250: "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+sad-insts,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" -// GFX1251: 
"target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+sad-insts,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" +// GFX600: "target-features"="+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+s-memtime-inst,+wavefrontsize64 +// GFX601: "target-features"="+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+s-memtime-inst,+wavefrontsize64 +// GFX602: "target-features"="+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+s-memtime-inst,+wavefrontsize64 +// GFX700: "target-features"="+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+s-memtime-inst,+wavefrontsize64" +// GFX701: "target-features"="+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+s-memtime-inst,+wavefrontsize64" +// GFX702: "target-features"="+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+s-memtime-inst,+wavefrontsize64" +// GFX703: "target-features"="+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+s-memtime-inst,+wavefrontsize64" +// GFX704: "target-features"="+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+s-memtime-inst,+wavefrontsize64" +// GFX705: "target-features"="+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+s-memtime-inst,+wavefrontsize64" +// GFX801: 
"target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" +// GFX802: "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" +// GFX803: "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" +// GFX805: "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" +// GFX810: "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" +// GFX900: "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" +// GFX902: "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" +// GFX904: "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" +// GFX906: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" +// GFX908: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" +// GFX909: "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" +// GFX90A: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" +// GFX90C: 
"target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" +// GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" +// GFX9_4_Generic: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" +// GFX950: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" +// GFX1010: 
"target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" +// GFX1011: "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" +// GFX1012: "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" +// GFX1013: "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" +// GFX1030: "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" +// GFX1031: "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" +// GFX1032: "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" +// GFX1033: 
"target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" +// GFX1034: "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" +// GFX1035: "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" +// GFX1036: "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" +// GFX1100: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1101: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1102: 
"target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1103: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1150: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1151: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1152: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1153: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1200: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1250: "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" +// GFX1251: 
"target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" -// GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+sad-insts,+wavefrontsize64" +// GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64" kernel void test() {} diff --git a/clang/test/CodeGenOpenCL/amdgpu-readonly-features-written-with-no-target.cl b/clang/test/CodeGenOpenCL/amdgpu-readonly-features-written-with-no-target.cl index 2d50ce7cab2e0..1a0a30ca0b51e 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-readonly-features-written-with-no-target.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-readonly-features-written-with-no-target.cl @@ -11,6 +11,6 @@ __attribute__((target("gws,image-insts,vmem-to-lds-load-insts"))) void test() {} // NOCPU: "target-features"="+gws,+image-insts,+vmem-to-lds-load-insts" -// GFX942: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+lerp-inst,+mai-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64,+xf32-insts" -// GFX1100: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+sad-insts,+wavefrontsize32" -// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-global-pk-add-bf16-inst,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+sad-insts,+wavefrontsize32" +// GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" +// GFX1100: 
"target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32 diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-fiji.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-fiji.cl deleted file mode 100644 index 2178718f90d5a..0000000000000 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-fiji.cl +++ /dev/null @@ -1,86 +0,0 @@ -// REQUIRES: amdgpu-registered-target -// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu fiji -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefixes=CHECK %s - - -#pragma OPENCL EXTENSION cl_khr_fp64 : enable - -typedef unsigned long ulong; -typedef unsigned int uint; -typedef unsigned short ushort; -typedef half __attribute__((ext_vector_type(2))) half2; -typedef short __attribute__((ext_vector_type(2))) short2; -typedef ushort __attribute__((ext_vector_type(2))) ushort2; -typedef uint __attribute__((ext_vector_type(4))) uint4; - -// CHECK-LABEL: @test_lerp -// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.lerp -void test_lerp(global int* out, int a, int b, int c) -{ - *out = __builtin_amdgcn_lerp(a, b, c); -} - -// CHECK-LABEL: @test_cubeid( -// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.cubeid(float %a, float %b, float %c) -void test_cubeid(global float* out, float a, float b, float c) { - *out = __builtin_amdgcn_cubeid(a, b, c); -} - -// 
CHECK-LABEL: @test_cubesc( -// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.cubesc(float %a, float %b, float %c) -void test_cubesc(global float* out, float a, float b, float c) { - *out = __builtin_amdgcn_cubesc(a, b, c); -} - -// CHECK-LABEL: @test_cubetc( -// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.cubetc(float %a, float %b, float %c) -void test_cubetc(global float* out, float a, float b, float c) { - *out = __builtin_amdgcn_cubetc(a, b, c); -} - -// CHECK-LABEL: @test_cubema( -// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.cubema(float %a, float %b, float %c) -void test_cubema(global float* out, float a, float b, float c) { - *out = __builtin_amdgcn_cubema(a, b, c); -} - -// CHECK-LABEL: @test_cvt_pknorm_i16( -// CHECK: tail call{{.*}} <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %src0, float %src1) -kernel void test_cvt_pknorm_i16(global short2* out, float src0, float src1) { - *out = __builtin_amdgcn_cvt_pknorm_i16(src0, src1); -} - -// CHECK-LABEL: @test_cvt_pknorm_u16( -// CHECK: tail call{{.*}} <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %src0, float %src1) -kernel void test_cvt_pknorm_u16(global ushort2* out, float src0, float src1) { - *out = __builtin_amdgcn_cvt_pknorm_u16(src0, src1); -} - -// CHECK-LABEL: @test_sad_u8( -// CHECK: tail call{{.*}} i32 @llvm.amdgcn.sad.u8(i32 %src0, i32 %src1, i32 %src2) -kernel void test_sad_u8(global uint* out, uint src0, uint src1, uint src2) { - *out = __builtin_amdgcn_sad_u8(src0, src1, src2); -} - -// CHECK-LABEL: test_msad_u8( -// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.msad.u8(i32 %src0, i32 %src1, i32 %src2) -kernel void test_msad_u8(global uint* out, uint src0, uint src1, uint src2) { - *out = __builtin_amdgcn_msad_u8(src0, src1, src2); -} - -// CHECK-LABEL: test_sad_hi_u8( -// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.sad.hi.u8(i32 %src0, i32 %src1, i32 %src2) -kernel void test_sad_hi_u8(global uint* out, uint src0, uint src1, uint src2) { - *out = __builtin_amdgcn_sad_hi_u8(src0, src1, src2); -} - -// 
CHECK-LABEL: @test_sad_u16( -// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.sad.u16(i32 %src0, i32 %src1, i32 %src2) -kernel void test_sad_u16(global uint* out, uint src0, uint src1, uint src2) { - *out = __builtin_amdgcn_sad_u16(src0, src1, src2); -} - -// CHECK-LABEL: @test_qsad_pk_u16_u8( -// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %src0, i32 %src1, i64 %src2) -kernel void test_qsad_pk_u16_u8(global ulong* out, ulong src0, uint src1, ulong src2) { - *out = __builtin_amdgcn_qsad_pk_u16_u8(src0, src1, src2); -} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl index b92454de60c78..ab0b0b936abdc 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -251,6 +251,13 @@ void test_fract_f64(global int* out, double a) *out = __builtin_amdgcn_fract(a); } +// CHECK-LABEL: @test_lerp +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.lerp +void test_lerp(global int* out, int a, int b, int c) +{ + *out = __builtin_amdgcn_lerp(a, b, c); +} + // CHECK-LABEL: @test_sicmp_i32 // CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.icmp.i64.i32(i32 %a, i32 %b, i32 32) void test_sicmp_i32(global ulong* out, int a, int b) @@ -858,6 +865,30 @@ void test_s_setprio() __builtin_amdgcn_s_setprio(3); } +// CHECK-LABEL: @test_cubeid( +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.cubeid(float %a, float %b, float %c) +void test_cubeid(global float* out, float a, float b, float c) { + *out = __builtin_amdgcn_cubeid(a, b, c); +} + +// CHECK-LABEL: @test_cubesc( +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.cubesc(float %a, float %b, float %c) +void test_cubesc(global float* out, float a, float b, float c) { + *out = __builtin_amdgcn_cubesc(a, b, c); +} + +// CHECK-LABEL: @test_cubetc( +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.cubetc(float %a, float %b, float %c) +void test_cubetc(global float* out, float a, float b, float c) { + *out = __builtin_amdgcn_cubetc(a, b, c); +} + +// 
CHECK-LABEL: @test_cubema( +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.cubema(float %a, float %b, float %c) +void test_cubema(global float* out, float a, float b, float c) { + *out = __builtin_amdgcn_cubema(a, b, c); +} + // CHECK-LABEL: @test_read_exec( // CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.ballot.i64(i1 true) void test_read_exec(global ulong* out) { @@ -1108,6 +1139,18 @@ kernel void test_cvt_pkrtz(global half2* out, float src0, float src1) { *out = __builtin_amdgcn_cvt_pkrtz(src0, src1); } +// CHECK-LABEL: @test_cvt_pknorm_i16( +// CHECK: tail call{{.*}} <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %src0, float %src1) +kernel void test_cvt_pknorm_i16(global short2* out, float src0, float src1) { + *out = __builtin_amdgcn_cvt_pknorm_i16(src0, src1); +} + +// CHECK-LABEL: @test_cvt_pknorm_u16( +// CHECK: tail call{{.*}} <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %src0, float %src1) +kernel void test_cvt_pknorm_u16(global ushort2* out, float src0, float src1) { + *out = __builtin_amdgcn_cvt_pknorm_u16(src0, src1); +} + // CHECK-LABEL: @test_cvt_pk_i16( // CHECK: tail call{{.*}} <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %src0, i32 %src1) kernel void test_cvt_pk_i16(global short2* out, int src0, int src1) { @@ -1126,12 +1169,36 @@ kernel void test_cvt_pk_u8_f32(global uint* out, float src0, uint src1, uint src *out = __builtin_amdgcn_cvt_pk_u8_f32(src0, src1, src2); } +// CHECK-LABEL: @test_sad_u8( +// CHECK: tail call{{.*}} i32 @llvm.amdgcn.sad.u8(i32 %src0, i32 %src1, i32 %src2) +kernel void test_sad_u8(global uint* out, uint src0, uint src1, uint src2) { + *out = __builtin_amdgcn_sad_u8(src0, src1, src2); +} + // CHECK-LABEL: test_msad_u8( // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.msad.u8(i32 %src0, i32 %src1, i32 %src2) kernel void test_msad_u8(global uint* out, uint src0, uint src1, uint src2) { *out = __builtin_amdgcn_msad_u8(src0, src1, src2); } +// CHECK-LABEL: test_sad_hi_u8( +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.sad.hi.u8(i32 %src0, i32 %src1, 
i32 %src2) +kernel void test_sad_hi_u8(global uint* out, uint src0, uint src1, uint src2) { + *out = __builtin_amdgcn_sad_hi_u8(src0, src1, src2); +} + +// CHECK-LABEL: @test_sad_u16( +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.sad.u16(i32 %src0, i32 %src1, i32 %src2) +kernel void test_sad_u16(global uint* out, uint src0, uint src1, uint src2) { + *out = __builtin_amdgcn_sad_u16(src0, src1, src2); +} + +// CHECK-LABEL: @test_qsad_pk_u16_u8( +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %src0, i32 %src1, i64 %src2) +kernel void test_qsad_pk_u16_u8(global ulong* out, ulong src0, uint src1, ulong src2) { + *out = __builtin_amdgcn_qsad_pk_u16_u8(src0, src1, src2); +} + // CHECK-LABEL: @test_mqsad_pk_u16_u8( // CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64 %src0, i32 %src1, i64 %src2) kernel void test_mqsad_pk_u16_u8(global ulong* out, ulong src0, uint src1, ulong src2) { diff --git a/clang/test/OpenMP/amdgcn-attributes.cpp b/clang/test/OpenMP/amdgcn-attributes.cpp index 650d0f441c980..252fe50e7398b 100644 --- a/clang/test/OpenMP/amdgcn-attributes.cpp +++ b/clang/test/OpenMP/amdgcn-attributes.cpp @@ -32,7 +32,9 @@ int callable(int x) { } // DEFAULT: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,65" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } +// CPU: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,65" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" "uniform-work-group-size"="true" } // NOIEEE: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,65" "amdgpu-ieee"="false" 
"kernel" "no-nans-fp-math"="true" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } // DEFAULT: attributes #2 = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// CPU: attributes #2 = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } // NOIEEE: attributes #2 = { convergent mustprogress noinline nounwind optnone "amdgpu-ieee"="false" "no-nans-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } diff --git a/flang/test/Lower/OpenMP/target_cpu_features.f90 b/flang/test/Lower/OpenMP/target_cpu_features.f90 index 341cfc7991d43..4532593156eab 100644 --- a/flang/test/Lower/OpenMP/target_cpu_features.f90 +++ b/flang/test/Lower/OpenMP/target_cpu_features.f90 @@ -11,8 +11,8 @@ !AMDGCN-SAME: fir.target_features = #llvm.target_features<["+16-bit-insts", "+ci-insts", !AMDGCN-SAME: "+dl-insts", "+dot1-insts", "+dot10-insts", "+dot2-insts", "+dot3-insts", !AMDGCN-SAME: "+dot4-insts", "+dot5-insts", "+dot6-insts", "+dot7-insts", "+dpp", -!AMDGCN-SAME: "+gfx8-insts", "+gfx9-insts", "+gws", "+image-insts", "+lerp-inst", "+mai-insts", -!AMDGCN-SAME: "+qsad-insts", "+s-memrealtime", "+s-memtime-inst", "+sad-insts", "+vmem-to-lds-load-insts", "+wavefrontsize64"]> +!AMDGCN-SAME: "+gfx8-insts", "+gfx9-insts", "+gws", "+image-insts", "+mai-insts", +!AMDGCN-SAME: "+s-memrealtime", "+s-memtime-inst", "+vmem-to-lds-load-insts", "+wavefrontsize64"]> !NVPTX: module attributes { !NVPTX-SAME: fir.target_cpu = "sm_80" diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 5dea64844e64e..b008354cfd462 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -901,48 
+901,6 @@ def FeaturePkFmacF16Inst : SubtargetFeature<"pk-fmac-f16-inst", "Has v_pk_fmac_f16 instruction" >; -def FeatureCubeInsts : SubtargetFeature<"cube-insts", - "HasCubeInsts", - "true", - "Has v_cube* instructions" ->; - -def FeatureLerpInst : SubtargetFeature<"lerp-inst", - "HasLerpInst", - "true", - "Has v_lerp_u8 instruction" ->; - -def FeatureSadInsts : SubtargetFeature<"sad-insts", - "HasSadInsts", - "true", - "Has v_sad* instructions" ->; - -def FeatureQsadInsts : SubtargetFeature<"qsad-insts", - "HasQsadInsts", - "true", - "Has v_qsad* instructions" ->; - -def FeatureCvtNormInsts : SubtargetFeature<"cvt-norm-insts", - "HasCvtNormInsts", - "true", - "Has v_cvt_norm* instructions" ->; - -def FeatureCvtPkNormVOP2Insts : SubtargetFeature<"cvt-pknorm-vop2-insts", - "HasCvtPkNormVOP2Insts", - "true", - "Has v_cvt_pk_norm_*f32 instructions/Has v_cvt_pk_norm_*_f16 instructions" ->; - -def FeatureCvtPkNormVOP3Insts : SubtargetFeature<"cvt-pknorm-vop3-insts", - "HasCvtPkNormVOP3Insts", - "true", - "Has v_cvt_pk_norm_*f32 instructions/Has v_cvt_pk_norm_*_f16 instructions" ->; - def FeatureAtomicDsPkAdd16Insts : SubtargetFeature<"atomic-ds-pk-add-16-insts", "HasAtomicDsPkAdd16Insts", "true", @@ -1536,8 +1494,7 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS", FeatureTrigReducedRange, FeatureExtendedImageInsts, FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts, - FeatureVmemWriteVgprInOrder, FeatureCubeInsts, FeatureLerpInst, - FeatureSadInsts, FeatureCvtPkNormVOP2Insts + FeatureVmemWriteVgprInOrder ] >; @@ -1551,8 +1508,7 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS", FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts, - FeatureVmemWriteVgprInOrder, 
FeatureCubeInsts, FeatureLerpInst, - FeatureSadInsts, FeatureQsadInsts, FeatureCvtPkNormVOP2Insts + FeatureVmemWriteVgprInOrder ] >; @@ -1568,9 +1524,7 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS", FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureFastDenormalF32, FeatureUnalignedBufferAccess, FeatureImageInsts, FeatureGDS, FeatureGWS, - FeatureDefaultComponentZero, FeatureVmemWriteVgprInOrder, FeatureCubeInsts, - FeatureLerpInst, FeatureSadInsts, FeatureQsadInsts, - FeatureCvtPkNormVOP2Insts + FeatureDefaultComponentZero, FeatureVmemWriteVgprInOrder ] >; @@ -1589,10 +1543,7 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK, FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess, FeatureUnalignedDSAccess, FeatureNegativeScratchOffsetBug, FeatureGWS, - FeatureDefaultComponentZero,FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad, - FeatureCubeInsts, FeatureLerpInst, FeatureSadInsts, FeatureQsadInsts, - FeatureCvtNormInsts, FeatureCvtPkNormVOP2Insts, - FeatureCvtPkNormVOP3Insts + FeatureDefaultComponentZero,FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad ] >; @@ -1616,10 +1567,7 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", FeatureDefaultComponentZero, FeatureMaxHardClauseLength63, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts, - FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad, FeatureCubeInsts, - FeatureLerpInst, FeatureSadInsts, FeatureQsadInsts, - FeatureCvtNormInsts, FeatureCvtPkNormVOP2Insts, - FeatureCvtPkNormVOP3Insts + FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad ] >; @@ -1642,9 +1590,7 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11", FeatureUnalignedDSAccess, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, 
FeatureMaxHardClauseLength32, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, - FeatureVmemWriteVgprInOrder, FeatureCubeInsts, FeatureLerpInst, - FeatureSadInsts, FeatureQsadInsts, FeatureCvtNormInsts, - FeatureCvtPkNormVOP2Insts, FeatureCvtPkNormVOP3Insts + FeatureVmemWriteVgprInOrder ] >; @@ -2123,17 +2069,10 @@ def FeatureISAVersion12 : FeatureSet< FeatureMemoryAtomicFAddF32DenormalSupport, FeatureBVHDualAndBVH8Insts, FeatureWaitsBeforeSystemScopeStores, - FeatureD16Writes32BitVgpr, - FeatureCubeInsts, - FeatureLerpInst, - FeatureSadInsts, - FeatureQsadInsts, - FeatureCvtNormInsts, - FeatureCvtPkNormVOP2Insts, - FeatureCvtPkNormVOP3Insts + FeatureD16Writes32BitVgpr ]>; -def FeatureISAVersion12_50_Common : FeatureSet< +def FeatureISAVersion12_50 : FeatureSet< [FeatureGFX12, FeatureGFX1250Insts, FeatureRequiresAlignedVGPRs, @@ -2208,16 +2147,6 @@ def FeatureISAVersion12_50_Common : FeatureSet< FeatureD16Writes32BitVgpr, ]>; -def FeatureISAVersion12_50 : FeatureSet< - !listconcat(FeatureISAVersion12_50_Common.Features, - [FeatureCubeInsts, - FeatureLerpInst, - FeatureSadInsts, - FeatureQsadInsts, - FeatureCvtNormInsts, - FeatureCvtPkNormVOP2Insts, - FeatureCvtPkNormVOP3Insts])>; - def FeatureISAVersion12_51 : FeatureSet< !listconcat(FeatureISAVersion12_50.Features, [FeatureDPALU_DPP])>; @@ -2887,27 +2816,6 @@ def HasFP8Insts : Predicate<"Subtarget->hasFP8Insts()">, def HasFP8ConversionInsts : Predicate<"Subtarget->hasFP8ConversionInsts()">, AssemblerPredicate<(all_of FeatureFP8ConversionInsts)>; -def HasCubeInsts : Predicate<"Subtarget->hasCubeInsts()">, - AssemblerPredicate<(all_of FeatureCubeInsts)>; - -def HasLerpInst : Predicate<"Subtarget->hasLerpInst()">, - AssemblerPredicate<(all_of FeatureLerpInst)>; - -def HasSadInsts : Predicate<"Subtarget->hasSadInsts()">, - AssemblerPredicate<(all_of FeatureSadInsts)>; - -def HasQsadInsts : Predicate<"Subtarget->hasQsadInsts()">, - AssemblerPredicate<(all_of FeatureQsadInsts)>; - -def 
HasCvtNormInsts : Predicate<"Subtarget->hasCvtNormInsts()">, - AssemblerPredicate<(all_of FeatureCvtNormInsts)>; - -def HasCvtPkNormVOP2Insts : Predicate<"Subtarget->hasCvtPkNormVOP2Insts()">, - AssemblerPredicate<(all_of FeatureCvtPkNormVOP2Insts)>; - -def HasCvtPkNormVOP3Insts : Predicate<"Subtarget->hasCvtPkNormVOP3Insts()">, - AssemblerPredicate<(all_of FeatureCvtPkNormVOP3Insts)>; - def HasFP8E5M3Insts : Predicate<"Subtarget->hasFP8E5M3Insts()">, AssemblerPredicate<(all_of FeatureFP8E5M3Insts)>; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index cb27f474d78f3..a87f9f274a4d3 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -166,13 +166,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasMAIInsts = false; bool HasFP8Insts = false; bool HasFP8ConversionInsts = false; - bool HasCubeInsts = false; - bool HasLerpInst = false; - bool HasSadInsts = false; - bool HasQsadInsts = false; - bool HasCvtNormInsts = false; - bool HasCvtPkNormVOP2Insts = false; - bool HasCvtPkNormVOP3Insts = false; bool HasFP8E5M3Insts = false; bool HasCvtFP8Vop1Bug = false; bool HasPkFmacF16Inst = false; @@ -899,20 +892,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasFP8ConversionInsts() const { return HasFP8ConversionInsts; } - bool hasCubeInsts() const { return HasCubeInsts; } - - bool hasLerpInst() const { return HasLerpInst; } - - bool hasSadInsts() const { return HasSadInsts; } - - bool hasQsadInsts() const { return HasQsadInsts; } - - bool hasCvtNormInsts() const { return HasCvtNormInsts; } - - bool hasCvtPkNormVOP2Insts() const { return HasCvtPkNormVOP2Insts; } - - bool hasCvtPkNormVOP3Insts() const { return HasCvtPkNormVOP3Insts; } - bool hasFP8E5M3Insts() const { return HasFP8E5M3Insts; } bool hasPkFmacF16Inst() const { diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 
1d1e95908fce6..85adcab55b742 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -616,14 +616,14 @@ let SubtargetPredicate = isGFX9Plus in { let isReMaterializable = 1 in defm V_SAT_PK_U8_I16 : VOP1Inst_t16<"v_sat_pk_u8_i16", VOP_I16_I32>; -} // End SubtargetPredicate = isGFX9Plus -let mayRaiseFPException = 0, SubtargetPredicate = HasCvtNormInsts in { -defm V_CVT_NORM_I16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_norm_i16_f16", - VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16>; -defm V_CVT_NORM_U16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_norm_u16_f16", - VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16>; -} // End mayRaiseFPException = 0, SubtargetPredicate = HasCvtNormInsts + let mayRaiseFPException = 0 in { + defm V_CVT_NORM_I16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_norm_i16_f16", + VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16>; + defm V_CVT_NORM_U16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_norm_u16_f16", + VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16>; + } // End mayRaiseFPException = 0 +} // End SubtargetPredicate = isGFX9Plus let SubtargetPredicate = isGFX9Only in { defm V_SCREEN_PARTITION_4SE_B32 : VOP1Inst <"v_screen_partition_4se_b32", VOP_I32_I32>; diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index dbb7862ab4ab5..d87d250a034f0 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -971,7 +971,7 @@ defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_I32_I32_I32, int_a } // End IsNeverUniform = 1 defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_F32_F32_I32, any_fldexp>; -let ReadsModeReg = 0, mayRaiseFPException = 0, SubtargetPredicate = HasCvtPkNormVOP2Insts in { +let ReadsModeReg = 0, mayRaiseFPException = 0 in { defm 
V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_V2I16_F32_F32, AMDGPUpknorm_i16_f32>; defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_V2I16_F32_F32, AMDGPUpknorm_u16_f32>; } diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 872bde501cd2d..05ba76ab489d8 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -185,8 +185,7 @@ defm V_FMA_LEGACY_F32 : VOP3Inst <"v_fma_legacy_f32", defm V_MAD_I32_I24 : VOP3Inst <"v_mad_i32_i24", VOP3_Profile>; defm V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile>; defm V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile, any_fma>, VOPD_Component<0x13, "v_fma_f32">; -let SubtargetPredicate = HasLerpInst in - defm V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile, int_amdgcn_lerp>; +defm V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile, int_amdgcn_lerp>; let SchedRW = [WriteIntMul] in { let SubtargetPredicate = HasMadU32Inst in @@ -259,12 +258,12 @@ defm V_DIV_FMAS_F64 : VOP3Inst <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC>; } // End isCommutable = 1 let isReMaterializable = 1 in { -let mayRaiseFPException = 0, SubtargetPredicate = HasCubeInsts in { +let mayRaiseFPException = 0 in { defm V_CUBEID_F32 : VOP3Inst <"v_cubeid_f32", VOP3_Profile, int_amdgcn_cubeid>; defm V_CUBESC_F32 : VOP3Inst <"v_cubesc_f32", VOP3_Profile, int_amdgcn_cubesc>; defm V_CUBETC_F32 : VOP3Inst <"v_cubetc_f32", VOP3_Profile, int_amdgcn_cubetc>; defm V_CUBEMA_F32 : VOP3Inst <"v_cubema_f32", VOP3_Profile, int_amdgcn_cubema>; -} // mayRaiseFPException = 0, SubtargetPredicate = HasCubeInsts +} // End mayRaiseFPException defm V_BFE_U32 : VOP3Inst <"v_bfe_u32", VOP3_Profile, AMDGPUbfe_u32>; defm V_BFE_I32 : VOP3Inst <"v_bfe_i32", VOP3_Profile, AMDGPUbfe_i32>; @@ -307,12 +306,12 @@ let SubtargetPredicate = HasMinimum3Maximum3F32, ReadsModeReg = 0 in { defm V_MAXIMUM3_F32 : VOP3Inst <"v_maximum3_f32", VOP3_Profile, AMDGPUfmaximum3>; } 
// End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 -let isCommutable = 1, SubtargetPredicate = HasSadInsts in { +let isCommutable = 1 in { defm V_SAD_U8 : VOP3Inst <"v_sad_u8", VOP3_Profile>; defm V_SAD_HI_U8 : VOP3Inst <"v_sad_hi_u8", VOP3_Profile>; defm V_SAD_U16 : VOP3Inst <"v_sad_u16", VOP3_Profile>; defm V_SAD_U32 : VOP3Inst <"v_sad_u32", VOP3_Profile>; -} // End isCommutable = 1, SubtargetPredicate = HasSadInsts +} // End isCommutable = 1 defm V_CVT_PK_U8_F32 : VOP3Inst<"v_cvt_pk_u8_f32", VOP3_Profile, int_amdgcn_cvt_pk_u8_f32>; defm V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", DIV_FIXUP_F32_PROF, AMDGPUdiv_fixup>; @@ -425,8 +424,7 @@ def VOPProfileMQSAD : VOP3_Profile { let SubtargetPredicate = isGFX7Plus in { let Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] in { -let SubtargetPredicate = HasQsadInsts in - defm V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile>; +defm V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile>; defm V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOPProfileMQSAD>; } // End Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] } // End SubtargetPredicate = isGFX7Plus @@ -791,6 +789,9 @@ let isCommutable = 1 in { defm V_MAD_I32_I16 : VOP3Inst_t16 <"v_mad_i32_i16", VOP_I32_I16_I16_I32>; } // End isCommutable = 1 +defm V_CVT_PKNORM_I16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_i16_f16", VOP_B32_F16_F16>; +defm V_CVT_PKNORM_U16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_u16_f16", VOP_B32_F16_F16>; + defm V_PACK_B32_F16 : VOP3Inst_t16 <"v_pack_b32_f16", VOP_B32_F16_F16>; let isReMaterializable = 1 in { @@ -995,11 +996,6 @@ def : GCNPat<(DivergentBinFrag (or_oneuse i64:$src0, i64:$src1), i64:$src2), } // End SubtargetPredicate = isGFX9Plus -let SubtargetPredicate = HasCvtPkNormVOP3Insts in { - defm V_CVT_PKNORM_I16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_i16_f16", VOP_B32_F16_F16>; - defm V_CVT_PKNORM_U16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_u16_f16", VOP_B32_F16_F16>; -} // end SubtargetPredicate = 
HasCvtPkNormVOP3Insts - // FIXME: Probably should hardcode clamp bit in pseudo and avoid this. class OpSelBinOpClampPat : GCNPat< diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index 28f3649a840d6..96bef0e574a45 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -447,11 +447,6 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, Features["atomic-fmin-fmax-global-f64"] = true; Features["wavefrontsize32"] = true; Features["clusters"] = true; - Features["cube-insts"] = true; - Features["lerp-inst"] = true; - Features["sad-insts"] = true; - Features["qsad-insts"] = true; - Features["cvt-pknorm-vop2-insts"] = true; break; case GK_GFX1201: case GK_GFX1200: @@ -479,11 +474,6 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, Features["gfx12-insts"] = true; Features["atomic-fadd-rtn-insts"] = true; Features["image-insts"] = true; - Features["cube-insts"] = true; - Features["lerp-inst"] = true; - Features["sad-insts"] = true; - Features["qsad-insts"] = true; - Features["cvt-pknorm-vop2-insts"] = true; Features["fp8-conversion-insts"] = true; Features["atomic-fmin-fmax-global-f32"] = true; break; @@ -513,11 +503,6 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, Features["gfx11-insts"] = true; Features["atomic-fadd-rtn-insts"] = true; Features["image-insts"] = true; - Features["cube-insts"] = true; - Features["lerp-inst"] = true; - Features["sad-insts"] = true; - Features["qsad-insts"] = true; - Features["cvt-pknorm-vop2-insts"] = true; Features["gws"] = true; Features["atomic-fmin-fmax-global-f32"] = true; break; @@ -550,11 +535,6 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, Features["vmem-to-lds-load-insts"] = true; Features["atomic-fmin-fmax-global-f32"] = true; Features["atomic-fmin-fmax-global-f64"] = true; - Features["cube-insts"] = true; - Features["lerp-inst"] = true; - Features["sad-insts"] = true; - 
Features["qsad-insts"] = true; - Features["cvt-pknorm-vop2-insts"] = true; break; case GK_GFX1012: case GK_GFX1011: @@ -582,11 +562,6 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, Features["vmem-to-lds-load-insts"] = true; Features["atomic-fmin-fmax-global-f32"] = true; Features["atomic-fmin-fmax-global-f64"] = true; - Features["cube-insts"] = true; - Features["lerp-inst"] = true; - Features["sad-insts"] = true; - Features["qsad-insts"] = true; - Features["cvt-pknorm-vop2-insts"] = true; break; case GK_GFX950: Features["bitop3-insts"] = true; @@ -640,11 +615,6 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, Features["vmem-to-lds-load-insts"] = true; Features["atomic-fmin-fmax-global-f64"] = true; Features["wavefrontsize64"] = true; - Features["cube-insts"] = true; - Features["lerp-inst"] = true; - Features["sad-insts"] = true; - Features["qsad-insts"] = true; - Features["cvt-pknorm-vop2-insts"] = true; break; case GK_GFX90A: Features["gfx90a-insts"] = true; @@ -689,11 +659,6 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, Features["s-memtime-inst"] = true; Features["gws"] = true; Features["wavefrontsize64"] = true; - Features["cube-insts"] = true; - Features["lerp-inst"] = true; - Features["sad-insts"] = true; - Features["qsad-insts"] = true; - Features["cvt-pknorm-vop2-insts"] = true; break; case GK_GFX705: case GK_GFX704: @@ -702,18 +667,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, case GK_GFX701: case GK_GFX700: Features["ci-insts"] = true; - Features["cube-insts"] = true; - Features["lerp-inst"] = true; - Features["sad-insts"] = true; - Features["qsad-insts"] = true; - Features["cvt-pknorm-vop2-insts"] = true; - Features["image-insts"] = true; - Features["s-memtime-inst"] = true; - Features["gws"] = true; - Features["atomic-fmin-fmax-global-f32"] = true; - Features["atomic-fmin-fmax-global-f64"] = true; - Features["wavefrontsize64"] = true; - break; + [[fallthrough]]; case 
GK_GFX602: case GK_GFX601: case GK_GFX600: @@ -723,10 +677,6 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, Features["atomic-fmin-fmax-global-f32"] = true; Features["atomic-fmin-fmax-global-f64"] = true; Features["wavefrontsize64"] = true; - Features["cube-insts"] = true; - Features["lerp-inst"] = true; - Features["sad-insts"] = true; - Features["cvt-pknorm-vop2-insts"] = true; break; case GK_NONE: break; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll index 49169eec072b6..43c69baaf3e7f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll @@ -1,3 +1,4 @@ +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GCN %s declare i32 @llvm.amdgcn.lerp(i32, i32, i32) #0 diff --git a/revert_patches.txt b/revert_patches.txt index 9e465ba90ae6a..a3a76b6ac1e40 100644 --- a/revert_patches.txt +++ b/revert_patches.txt @@ -5,3 +5,6 @@ d57230c7 [AMDGPU][MC] Disallow op_sel in some VOP3P dot instructions (#100485) breaks build of ROCmValidationSuite [C2y] Support WG14 N3457, the __COUNTER__ macro (#162662) --- +Shore will help land downstream +[AMDGPU] Adding instruction specific features (#167809) +---