diff --git a/clang-tools-extra/clang-query/tool/ClangQuery.cpp b/clang-tools-extra/clang-query/tool/ClangQuery.cpp index 0c471def2e140..31c7f12251c90 100644 --- a/clang-tools-extra/clang-query/tool/ClangQuery.cpp +++ b/clang-tools-extra/clang-query/tool/ClangQuery.cpp @@ -110,31 +110,33 @@ int main(int argc, const char **argv) { ClangTool Tool(OptionsParser->getCompilations(), OptionsParser->getSourcePathList()); std::vector> ASTs; - int Status = Tool.buildASTs(ASTs); int ASTStatus = 0; - if (Status == 1) { - // Building ASTs failed. + switch (Tool.buildASTs(ASTs)) { + case 0: + break; + case 1: // Building ASTs failed. return 1; - } else if (Status == 2) { + case 2: ASTStatus |= 1; llvm::errs() << "Failed to build AST for some of the files, " << "results may be incomplete." << "\n"; - } else { - assert(Status == 0 && "Unexpected status returned"); + break; + default: + llvm_unreachable("Unexpected status returned"); } QuerySession QS(ASTs); if (!Commands.empty()) { - for (auto I = Commands.begin(), E = Commands.end(); I != E; ++I) { - QueryRef Q = QueryParser::parse(*I, QS); + for (auto &Command : Commands) { + QueryRef Q = QueryParser::parse(Command, QS); if (!Q->run(llvm::outs(), QS)) return 1; } } else if (!CommandFiles.empty()) { - for (auto I = CommandFiles.begin(), E = CommandFiles.end(); I != E; ++I) { - if (runCommandsInFile(argv[0], *I, QS)) + for (auto &CommandFile : CommandFiles) { + if (runCommandsInFile(argv[0], CommandFile, QS)) return 1; } } else { diff --git a/clang-tools-extra/clangd/ClangdLSPServer.cpp b/clang-tools-extra/clangd/ClangdLSPServer.cpp index 0408b0498488e..15ef89cb34faa 100644 --- a/clang-tools-extra/clangd/ClangdLSPServer.cpp +++ b/clang-tools-extra/clangd/ClangdLSPServer.cpp @@ -592,7 +592,10 @@ void ClangdLSPServer::onInitialize(const InitializeParams &Params, {"codeActionProvider", std::move(CodeActionProvider)}, {"completionProvider", llvm::json::Object{ - {"allCommitCharacters", " \t()[]{}<>:;,+-/*%^&#?.=\"'|"}, + 
{"allCommitCharacters", + {" ", "\t", "(", ")", "[", "]", "{", "}", "<", + ">", ":", ";", ",", "+", "-", "/", "*", "%", + "^", "&", "#", "?", ".", "=", "\"", "'", "|"}}, {"resolveProvider", false}, // We do extra checks, e.g. that > is part of ->. {"triggerCharacters", {".", "<", ">", ":", "\"", "/"}}, diff --git a/clang-tools-extra/clangd/ClangdServer.h b/clang-tools-extra/clangd/ClangdServer.h index 7068cd5eb4217..ae10dba32b58c 100644 --- a/clang-tools-extra/clangd/ClangdServer.h +++ b/clang-tools-extra/clangd/ClangdServer.h @@ -131,7 +131,7 @@ class ClangdServer { bool BuildRecoveryAST = true; /// If true, turn on the `-frecovery-ast-type` clang flag. - bool PreserveRecoveryASTType = false; + bool PreserveRecoveryASTType = true; /// Clangd's workspace root. Relevant for "workspace" operations not bound /// to a particular file. diff --git a/clang-tools-extra/clangd/SemanticHighlighting.cpp b/clang-tools-extra/clangd/SemanticHighlighting.cpp index fb1ef1e326b41..4e66a9bb4e857 100644 --- a/clang-tools-extra/clangd/SemanticHighlighting.cpp +++ b/clang-tools-extra/clangd/SemanticHighlighting.cpp @@ -221,23 +221,51 @@ class HighlightingsBuilder { // the end of the Tokens). TokRef = TokRef.drop_front(Conflicting.size()); } - // Add tokens indicating lines skipped by the preprocessor. - for (const Range &R : AST.getMacros().SkippedRanges) { + const auto &SM = AST.getSourceManager(); + StringRef MainCode = SM.getBuffer(SM.getMainFileID())->getBuffer(); + + // Merge token stream with "inactive line" markers. + std::vector WithInactiveLines; + auto SortedSkippedRanges = AST.getMacros().SkippedRanges; + llvm::sort(SortedSkippedRanges); + auto It = NonConflicting.begin(); + for (const Range &R : SortedSkippedRanges) { // Create one token for each line in the skipped range, so it works // with line-based diffing. 
assert(R.start.line <= R.end.line); for (int Line = R.start.line; Line <= R.end.line; ++Line) { - // Don't bother computing the offset for the end of the line, just use - // zero. The client will treat this highlighting kind specially, and - // highlight the entire line visually (i.e. not just to where the text - // on the line ends, but to the end of the screen). - NonConflicting.push_back({HighlightingKind::InactiveCode, - {Position{Line, 0}, Position{Line, 0}}}); + // Copy tokens before the inactive line + for (; It != NonConflicting.end() && It->R.start.line < Line; ++It) + WithInactiveLines.push_back(std::move(*It)); + // Add a token for the inactive line itself. + auto StartOfLine = positionToOffset(MainCode, Position{Line, 0}); + if (StartOfLine) { + StringRef LineText = + MainCode.drop_front(*StartOfLine).take_until([](char C) { + return C == '\n'; + }); + WithInactiveLines.push_back( + {HighlightingKind::InactiveCode, + {Position{Line, 0}, + Position{Line, static_cast(lspLength(LineText))}}}); + } else { + elog("Failed to convert position to offset: {0}", + StartOfLine.takeError()); + } + + // Skip any other tokens on the inactive line. e.g. + // `#ifndef Foo` is considered as part of an inactive region when Foo is + // defined, and there is a Foo macro token. + // FIXME: we should reduce the scope of the inactive region to not + // include the directive itself. + while (It != NonConflicting.end() && It->R.start.line == Line) + ++It; } } - // Re-sort the tokens because that's what the diffing expects. 
- llvm::sort(NonConflicting); - return NonConflicting; + // Copy tokens after the last inactive line + for (; It != NonConflicting.end(); ++It) + WithInactiveLines.push_back(std::move(*It)); + return WithInactiveLines; } private: @@ -493,9 +521,6 @@ toSemanticTokens(llvm::ArrayRef Tokens) { std::vector Result; const HighlightingToken *Last = nullptr; for (const HighlightingToken &Tok : Tokens) { - // FIXME: support inactive code - we need to provide the actual bounds. - if (Tok.Kind == HighlightingKind::InactiveCode) - continue; Result.emplace_back(); SemanticToken &Out = Result.back(); // deltaStart/deltaLine are relative if possible. diff --git a/clang-tools-extra/clangd/refactor/tweaks/AddUsing.cpp b/clang-tools-extra/clangd/refactor/tweaks/AddUsing.cpp index 9e64ceeeaeade..e4900041671a4 100644 --- a/clang-tools-extra/clangd/refactor/tweaks/AddUsing.cpp +++ b/clang-tools-extra/clangd/refactor/tweaks/AddUsing.cpp @@ -86,6 +86,13 @@ class UsingFinder : public RecursiveASTVisitor { const SourceManager &SM; }; +bool isFullyQualified(const NestedNameSpecifier *NNS) { + if (!NNS) + return false; + return NNS->getKind() == NestedNameSpecifier::Global || + isFullyQualified(NNS->getPrefix()); +} + struct InsertionPointData { // Location to insert the "using" statement. If invalid then the statement // should not be inserted at all (it already exists). @@ -94,6 +101,9 @@ struct InsertionPointData { // insertion point is anchored to, we may need one or more \n to ensure // proper formatting. std::string Suffix; + // Whether using should be fully qualified, even if what the user typed was + // not. This is based on our detection of the local style. + bool AlwaysFullyQualify = false; }; // Finds the best place to insert the "using" statement. 
Returns invalid @@ -118,7 +128,13 @@ findInsertionPoint(const Tweak::Selection &Inputs, SM) .TraverseAST(Inputs.AST->getASTContext()); + bool AlwaysFullyQualify = true; for (auto &U : Usings) { + // Only "upgrade" to fully qualified if all relevant using decls are fully + // qualified. Otherwise trust what the user typed. + if (!isFullyQualified(U->getQualifier())) + AlwaysFullyQualify = false; + if (SM.isBeforeInTranslationUnit(Inputs.Cursor, U->getUsingLoc())) // "Usings" is sorted, so we're done. break; @@ -137,6 +153,7 @@ findInsertionPoint(const Tweak::Selection &Inputs, if (LastUsingLoc.isValid()) { InsertionPointData Out; Out.Loc = LastUsingLoc; + Out.AlwaysFullyQualify = AlwaysFullyQualify; return Out; } @@ -278,6 +295,9 @@ Expected AddUsing::apply(const Selection &Inputs) { std::string UsingText; llvm::raw_string_ostream UsingTextStream(UsingText); UsingTextStream << "using "; + if (InsertionPoint->AlwaysFullyQualify && + !isFullyQualified(QualifierToRemove.getNestedNameSpecifier())) + UsingTextStream << "::"; QualifierToRemove.getNestedNameSpecifier()->print( UsingTextStream, Inputs.AST->getASTContext().getPrintingPolicy()); UsingTextStream << Name << ";" << InsertionPoint->Suffix; diff --git a/clang-tools-extra/clangd/test/initialize-params.test b/clang-tools-extra/clangd/test/initialize-params.test index f0a0f791c2f68..4125c27e4e35a 100644 --- a/clang-tools-extra/clangd/test/initialize-params.test +++ b/clang-tools-extra/clangd/test/initialize-params.test @@ -7,7 +7,35 @@ # CHECK-NEXT: "capabilities": { # CHECK-NEXT: "codeActionProvider": true, # CHECK-NEXT: "completionProvider": { -# CHECK-NEXT: "allCommitCharacters": " \t()[]{}<>:;,+-/*%^&#?.=\"'|", +# CHECK-NEXT: "allCommitCharacters": [ +# CHECK-NEXT: " ", +# CHECK-NEXT: "\t", +# CHECK-NEXT: "(", +# CHECK-NEXT: ")", +# CHECK-NEXT: "[", +# CHECK-NEXT: "]", +# CHECK-NEXT: "{", +# CHECK-NEXT: "}", +# CHECK-NEXT: "<", +# CHECK-NEXT: ">", +# CHECK-NEXT: ":", +# CHECK-NEXT: ";", +# CHECK-NEXT: ",", +# 
CHECK-NEXT: "+", +# CHECK-NEXT: "-", +# CHECK-NEXT: "/", +# CHECK-NEXT: "*", +# CHECK-NEXT: "%", +# CHECK-NEXT: "^", +# CHECK-NEXT: "&", +# CHECK-NEXT: "#", +# CHECK-NEXT: "?", +# CHECK-NEXT: ".", +# CHECK-NEXT: "=", +# CHECK-NEXT: "\"", +# CHECK-NEXT: "'", +# CHECK-NEXT: "|" +# CHECK-NEXT: ], # CHECK-NEXT: "resolveProvider": false, # CHECK-NEXT: "triggerCharacters": [ # CHECK-NEXT: ".", diff --git a/clang-tools-extra/clangd/tool/ClangdMain.cpp b/clang-tools-extra/clangd/tool/ClangdMain.cpp index 57dac600014d5..dcbaa35238226 100644 --- a/clang-tools-extra/clangd/tool/ClangdMain.cpp +++ b/clang-tools-extra/clangd/tool/ClangdMain.cpp @@ -291,9 +291,8 @@ opt RecoveryAST{ opt RecoveryASTType{ "recovery-ast-type", cat(Features), - desc("Preserve the type for recovery AST. Note that " - "this feature is experimental and may lead to crashes"), - init(false), + desc("Preserve the type for recovery AST."), + init(ClangdServer::Options().PreserveRecoveryASTType), Hidden, }; diff --git a/clang-tools-extra/clangd/unittests/SemanticHighlightingTests.cpp b/clang-tools-extra/clangd/unittests/SemanticHighlightingTests.cpp index 06743080166b4..232be6a783803 100644 --- a/clang-tools-extra/clangd/unittests/SemanticHighlightingTests.cpp +++ b/clang-tools-extra/clangd/unittests/SemanticHighlightingTests.cpp @@ -503,11 +503,11 @@ TEST(SemanticHighlighting, GetsCorrectTokens) { #define $Macro[[test]] #undef $Macro[[test]] -$InactiveCode[[]] #ifdef $Macro[[test]] -$InactiveCode[[]] #endif +$InactiveCode[[#ifdef test]] +$InactiveCode[[#endif]] -$InactiveCode[[]] #if defined($Macro[[test]]) -$InactiveCode[[]] #endif +$InactiveCode[[#if defined(test)]] +$InactiveCode[[#endif]] )cpp", R"cpp( struct $Class[[S]] { @@ -614,8 +614,8 @@ TEST(SemanticHighlighting, GetsCorrectTokens) { R"cpp( // Code in the preamble. // Inactive lines get an empty InactiveCode token at the beginning. 
-$InactiveCode[[]] #ifdef $Macro[[test]] -$InactiveCode[[]] #endif +$InactiveCode[[#ifdef test]] +$InactiveCode[[#endif]] // A declaration to cause the preamble to end. int $Variable[[EndPreamble]]; @@ -623,17 +623,17 @@ TEST(SemanticHighlighting, GetsCorrectTokens) { // Code after the preamble. // Code inside inactive blocks does not get regular highlightings // because it's not part of the AST. -$InactiveCode[[]] #ifdef $Macro[[test]] -$InactiveCode[[]] int Inactive2; -$InactiveCode[[]] #endif +$InactiveCode[[#ifdef test]] +$InactiveCode[[int Inactive2;]] +$InactiveCode[[#endif]] #ifndef $Macro[[test]] int $Variable[[Active1]]; #endif -$InactiveCode[[]] #ifdef $Macro[[test]] -$InactiveCode[[]] int Inactive3; -$InactiveCode[[]] #else +$InactiveCode[[#ifdef test]] +$InactiveCode[[int Inactive3;]] +$InactiveCode[[#else]] int $Variable[[Active2]]; #endif )cpp", diff --git a/clang-tools-extra/clangd/unittests/TweakTests.cpp b/clang-tools-extra/clangd/unittests/TweakTests.cpp index b4f135b3efe24..5626b75305993 100644 --- a/clang-tools-extra/clangd/unittests/TweakTests.cpp +++ b/clang-tools-extra/clangd/unittests/TweakTests.cpp @@ -2723,6 +2723,63 @@ namespace foo { void fun(); } void foo::fun() { ff(); +})cpp"}, + // If all other using are fully qualified, add :: + {R"cpp( +#include "test.hpp" + +using ::one::two::cc; +using ::one::two::ee; + +void fun() { + one::two::f^f(); +})cpp", + R"cpp( +#include "test.hpp" + +using ::one::two::cc; +using ::one::two::ff;using ::one::two::ee; + +void fun() { + ff(); +})cpp"}, + // Make sure we don't add :: if it's already there + {R"cpp( +#include "test.hpp" + +using ::one::two::cc; +using ::one::two::ee; + +void fun() { + ::one::two::f^f(); +})cpp", + R"cpp( +#include "test.hpp" + +using ::one::two::cc; +using ::one::two::ff;using ::one::two::ee; + +void fun() { + ff(); +})cpp"}, + // If even one using doesn't start with ::, do not add it + {R"cpp( +#include "test.hpp" + +using ::one::two::cc; +using one::two::ee; + +void fun() { 
+ one::two::f^f(); +})cpp", + R"cpp( +#include "test.hpp" + +using ::one::two::cc; +using one::two::ff;using one::two::ee; + +void fun() { + ff(); })cpp"}}; llvm::StringMap EditedFiles; for (const auto &Case : Cases) { diff --git a/clang/docs/LTOVisibility.rst b/clang/docs/LTOVisibility.rst index 3a60f54e1b907..cdc0b9cc0e19e 100644 --- a/clang/docs/LTOVisibility.rst +++ b/clang/docs/LTOVisibility.rst @@ -35,6 +35,16 @@ other classes receive hidden LTO visibility. Classes with internal linkage (e.g. classes declared in unnamed namespaces) also receive hidden LTO visibility. +During the LTO link, all classes with public LTO visibility will be refined +to hidden LTO visibility when the ``--lto-whole-program-visibility`` lld linker +option is applied (``-plugin-opt=whole-program-visibility`` for gold). This flag +can be used to defer specifying whether classes have hidden LTO visibility until +link time, to allow bitcode objects to be shared by different LTO links. +Due to an implementation limitation, symbols associated with classes with hidden +LTO visibility may still be exported from the binary when using this flag. It is +unsafe to refer to these symbols, and their visibility may be relaxed to hidden +in a future compiler release. + A class defined in a translation unit built without LTO receives public LTO visibility regardless of its object file visibility, linkage or other attributes. diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index c036f66d60bf7..66427f293775b 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -188,6 +188,13 @@ X86 Support in Clang - The x86 intrinsics ``__rorb``, ``__rorw``, ``__rord``, ``__rorq`, ``_rotr``, ``_rotwr`` and ``_lrotr`` may now be used within constant expressions. +- Support for -march=sapphirerapids was added. + +- The -mtune command line option is no longer ignored for X86. This can be used + to request microarchitectural optimizations independent of -march. 
-march= + implies -mtune=. -mtune=generic is the default with no -march or -mtune + specified. + Internal API Changes -------------------- diff --git a/clang/docs/UndefinedBehaviorSanitizer.rst b/clang/docs/UndefinedBehaviorSanitizer.rst index 76676dfce95b4..3345536bf821a 100644 --- a/clang/docs/UndefinedBehaviorSanitizer.rst +++ b/clang/docs/UndefinedBehaviorSanitizer.rst @@ -153,6 +153,8 @@ Available checks are: unsigned overflow in C++. You can use ``-fsanitize=shift-base`` or ``-fsanitize=shift-exponent`` to check only left-hand side or right-hand side of shift operation, respectively. + - ``-fsanitize=unsigned-shift-base``: check that an unsigned left-hand side of + a left shift operation doesn't overflow. - ``-fsanitize=signed-integer-overflow``: Signed integer overflow, where the result of a signed integer computation cannot be represented in its type. This includes all the checks covered by ``-ftrapv``, as well as checks for diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index 7d313a9fa186b..a7e2b747df7b8 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -2063,6 +2063,11 @@ class ASTContext : public RefCountedBase { /// types. bool areCompatibleVectorTypes(QualType FirstVec, QualType SecondVec); + /// Return true if the given types are an SVE builtin and a VectorType that + /// is a fixed-length representation of the SVE builtin for a specific + /// vector-length. + bool areCompatibleSveTypes(QualType FirstType, QualType SecondType); + /// Return true if the type has been explicitly qualified with ObjC ownership. /// A type may be implicitly qualified with ownership under ObjC ARC, and in /// some cases the compiler treats these differently. @@ -2119,10 +2124,6 @@ class ASTContext : public RefCountedBase { return getTypeSizeInCharsIfKnown(QualType(Ty, 0)); } - /// Returns the bitwidth of \p T, an SVE type attributed with - /// 'arm_sve_vector_bits'. 
Should only be called if T->isVLST(). - unsigned getBitwidthForAttributedSveType(const Type *T) const; - /// Return the ABI-specified alignment of a (complete) type \p T, in /// bits. unsigned getTypeAlign(QualType T) const { return getTypeInfo(T).Align; } diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index b976ca2c7303b..63e640186ef41 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -1893,14 +1893,16 @@ class alignas(8) Type : public ExtQualsTypeCommonBase { bool isSizelessType() const; bool isSizelessBuiltinType() const; - /// Determines if this is a vector-length-specific type (VLST), i.e. a - /// sizeless type with the 'arm_sve_vector_bits' attribute applied. - bool isVLST() const; /// Determines if this is a sizeless type supported by the /// 'arm_sve_vector_bits' type attribute, which can be applied to a single /// SVE vector or predicate, excluding tuple types such as svint32x4_t. bool isVLSTBuiltinType() const; + /// Returns the representative type for the element of an SVE builtin type. + /// This is used to represent fixed-length SVE vectors created with the + /// 'arm_sve_vector_bits' type attribute as VectorType. + QualType getSveEltType(const ASTContext &Ctx) const; + /// Types are partitioned into 3 broad categories (C99 6.2.5p1): /// object types, function types, and incomplete types. 
@@ -3236,7 +3238,13 @@ class VectorType : public Type, public llvm::FoldingSetNode { NeonVector, /// is ARM Neon polynomial vector - NeonPolyVector + NeonPolyVector, + + /// is AArch64 SVE fixed-length data vector + SveFixedLengthDataVector, + + /// is AArch64 SVE fixed-length predicate vector + SveFixedLengthPredicateVector }; protected: diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 5211c4499a99f..99dd43a5f4dde 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -2042,6 +2042,8 @@ def ArmSveVectorBits : TypeAttr { let Args = [UnsignedArgument<"NumBits">]; let Documentation = [ArmSveVectorBitsDocs]; let PragmaAttributeSupport = 0; + // Represented as VectorType instead. + let ASTNode = 0; } def ArmMveStrictPolymorphism : TypeAttr, TargetSpecificAttr { @@ -3823,13 +3825,15 @@ def OMPDeclareTargetDecl : InheritableAttr { [ "MT_To", "MT_Link" ]>, EnumArgument<"DevType", "DevTypeTy", [ "host", "nohost", "any" ], - [ "DT_Host", "DT_NoHost", "DT_Any" ]> + [ "DT_Host", "DT_NoHost", "DT_Any" ]>, + UnsignedArgument<"Level"> ]; let AdditionalMembers = [{ void printPrettyPragma(raw_ostream &OS, const PrintingPolicy &Policy) const; static llvm::Optional isDeclareTargetDeclaration(const ValueDecl *VD); static llvm::Optional getDeviceType(const ValueDecl *VD); + static llvm::Optional getLocation(const ValueDecl *VD); }]; } diff --git a/clang/include/clang/Basic/BuiltinsPPC.def b/clang/include/clang/Basic/BuiltinsPPC.def index 566420d5dce94..9a33ba06d82e1 100644 --- a/clang/include/clang/Basic/BuiltinsPPC.def +++ b/clang/include/clang/Basic/BuiltinsPPC.def @@ -305,6 +305,18 @@ BUILTIN(__builtin_altivec_vextractwm, "UiV4Ui", "") BUILTIN(__builtin_altivec_vextractdm, "UiV2ULLi", "") BUILTIN(__builtin_altivec_vextractqm, "UiV1ULLLi", "") +// P10 Vector Divide Extended built-ins. 
+BUILTIN(__builtin_altivec_vdivesw, "V4SiV4SiV4Si", "") +BUILTIN(__builtin_altivec_vdiveuw, "V4UiV4UiV4Ui", "") +BUILTIN(__builtin_altivec_vdivesd, "V2LLiV2LLiV2LLi", "") +BUILTIN(__builtin_altivec_vdiveud, "V2ULLiV2ULLiV2ULLi", "") + +// P10 Vector Multiply High built-ins. +BUILTIN(__builtin_altivec_vmulhsw, "V4SiV4SiV4Si", "") +BUILTIN(__builtin_altivec_vmulhuw, "V4UiV4UiV4Ui", "") +BUILTIN(__builtin_altivec_vmulhsd, "V2LLiV2LLiV2LLi", "") +BUILTIN(__builtin_altivec_vmulhud, "V2ULLiV2ULLiV2ULLi", "") + // P10 Vector Parallel Bits built-ins. BUILTIN(__builtin_altivec_vpdepd, "V2ULLiV2ULLiV2ULLi", "") BUILTIN(__builtin_altivec_vpextd, "V2ULLiV2ULLiV2ULLi", "") diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index cbd9df998e786..8b89aac8d6d5f 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -145,6 +145,7 @@ CODEGENOPT(IncrementalLinkerCompatible, 1, 0) ///< Emit an object file which can ///< linker. CODEGENOPT(MergeAllConstants , 1, 1) ///< Merge identical constants. CODEGENOPT(MergeFunctions , 1, 0) ///< Set when -fmerge-functions is enabled. +CODEGENOPT(HeapProf , 1, 0) ///< Set when -fmemprof is enabled. CODEGENOPT(MSVolatile , 1, 0) ///< Set when /volatile:ms is enabled. CODEGENOPT(NoCommon , 1, 0) ///< Set when -fno-common or C++ is enabled. CODEGENOPT(NoDwarfDirectoryAsm , 1, 0) ///< Set when -fno-dwarf-directory-asm is @@ -325,6 +326,9 @@ ENUM_CODEGENOPT(DebuggerTuning, llvm::DebuggerKind, 2, /// emitted. VALUE_CODEGENOPT(DwarfVersion, 3, 0) +/// Whether to use experimental new variable location tracking. +CODEGENOPT(ValueTrackingVariableLocations, 1, 0) + /// Whether we should emit CodeView debug information. It's possible to emit /// CodeView and DWARF into the same object. 
CODEGENOPT(EmitCodeView, 1, 0) diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td index 9cb06cf5b5e11..77d2e26ba7909 100644 --- a/clang/include/clang/Basic/DiagnosticLexKinds.td +++ b/clang/include/clang/Basic/DiagnosticLexKinds.td @@ -695,6 +695,9 @@ def err_mmap_missing_module_unqualified : Error< "no module named '%0' visible from '%1'">; def err_mmap_missing_module_qualified : Error< "no module named '%0' in '%1'">; +def err_mmap_missing_parent_module: Error< + "no module named '%0' %select{found|in '%2'}1, " + "parent module must be defined before the submodule">; def err_mmap_top_level_inferred_submodule : Error< "only submodules and framework modules may be inferred with wildcard syntax">; def err_mmap_inferred_no_umbrella : Error< diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 7e86a024a094b..bda07c98834d5 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -2966,6 +2966,8 @@ def err_attribute_invalid_size : Error< "vector size not an integral multiple of component size">; def err_attribute_zero_size : Error<"zero %0 size">; def err_attribute_size_too_large : Error<"%0 size too large">; +def err_typecheck_vector_not_convertable_sizeless : Error< + "cannot convert between a fixed-length and a sizeless vector (%0 and %1)">; def err_typecheck_vector_not_convertable_implict_truncation : Error< "cannot convert between %select{scalar|vector}0 type %1 and vector type" " %2 as implicit conversion would cause truncation">; diff --git a/clang/include/clang/Basic/Module.h b/clang/include/clang/Basic/Module.h index 94dd215379661..ac33c7573f35d 100644 --- a/clang/include/clang/Basic/Module.h +++ b/clang/include/clang/Basic/Module.h @@ -62,6 +62,15 @@ struct ASTFileSignature : std::array { explicit operator bool() const { return *this != BaseT({{0}}); } + /// Returns the 
value truncated to the size of an uint64_t. + uint64_t truncatedValue() const { + uint64_t Value = 0; + static_assert(sizeof(*this) >= sizeof(uint64_t), "No need to truncate."); + for (unsigned I = 0; I < sizeof(uint64_t); ++I) + Value |= static_cast((*this)[I]) << (I * 8); + return Value; + } + static ASTFileSignature create(StringRef Bytes) { return create(Bytes.bytes_begin(), Bytes.bytes_end()); } diff --git a/clang/include/clang/Basic/Sanitizers.def b/clang/include/clang/Basic/Sanitizers.def index 2912bdd44b2db..9b8936cc520cb 100644 --- a/clang/include/clang/Basic/Sanitizers.def +++ b/clang/include/clang/Basic/Sanitizers.def @@ -107,6 +107,7 @@ SANITIZER("vptr", Vptr) // IntegerSanitizer SANITIZER("unsigned-integer-overflow", UnsignedIntegerOverflow) +SANITIZER("unsigned-shift-base", UnsignedShiftBase) // DataFlowSanitizer SANITIZER("dataflow", DataFlow) @@ -171,7 +172,8 @@ SANITIZER_GROUP("implicit-conversion", ImplicitConversion, SANITIZER_GROUP("integer", Integer, ImplicitConversion | IntegerDivideByZero | Shift | - SignedIntegerOverflow | UnsignedIntegerOverflow) + SignedIntegerOverflow | UnsignedIntegerOverflow | + UnsignedShiftBase) SANITIZER("local-bounds", LocalBounds) SANITIZER_GROUP("bounds", Bounds, ArrayBounds | LocalBounds) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 7d4e1487adeed..cebbb27609297 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -717,7 +717,7 @@ def emit_ast : Flag<["-"], "emit-ast">, def emit_llvm : Flag<["-"], "emit-llvm">, Flags<[CC1Option]>, Group, HelpText<"Use the LLVM representation for assembler and object files">; def emit_interface_stubs : Flag<["-"], "emit-interface-stubs">, Flags<[CC1Option]>, Group, - HelpText<"Generate Inteface Stub Files.">; + HelpText<"Generate Interface Stub Files.">; def emit_merged_ifs : Flag<["-"], "emit-merged-ifs">, Flags<[CC1Option]>, Group, HelpText<"Generate Interface Stub Files, emit merged 
text not binary.">; @@ -1014,6 +1014,8 @@ defm cxx_static_destructors : OptOutFFlag<"c++-static-destructors", "", def fsymbol_partition_EQ : Joined<["-"], "fsymbol-partition=">, Group, Flags<[CC1Option]>; +defm memprof : OptInFFlag<"memprof", "Enable", "Disable", " heap memory profiling">; + // Begin sanitizer flags. These should all be core options exposed in all driver // modes. let Flags = [CC1Option, CoreOption] in { @@ -1830,7 +1832,7 @@ def fstack_protector : Flag<["-"], "fstack-protector">, Group, "This uses a loose heuristic which considers functions vulnerable if they " "contain a char (or 8bit integer) array or constant sized calls to alloca " ", which are of greater size than ssp-buffer-size (default: 8 bytes). All " - "variable sized calls to alloca are considered vulnerable. A function with" + "variable sized calls to alloca are considered vulnerable. A function with " "a stack protector has a guard value added to the stack frame that is " "checked on function exit. The guard value must be positioned in the " "stack frame such that a buffer overflow from a vulnerable variable will " @@ -2569,6 +2571,8 @@ def mlongcall: Flag<["-"], "mlongcall">, Group; def mno_longcall : Flag<["-"], "mno-longcall">, Group; +def mmma: Flag<["-"], "mmma">, Group; +def mno_mma: Flag<["-"], "mno-mma">, Group; def maix_struct_return : Flag<["-"], "maix-struct-return">, Group, Flags<[CC1Option]>, HelpText<"Return all structs in memory (PPC32 only)">; @@ -3946,6 +3950,9 @@ def fdebug_pass_manager : Flag<["-"], "fdebug-pass-manager">, HelpText<"Prints debug information for the new pass manager">; def fno_debug_pass_manager : Flag<["-"], "fno-debug-pass-manager">, HelpText<"Disables debug printing for the new pass manager">; +def fexperimental_debug_variable_locations : Flag<["-"], + "fexperimental-debug-variable-locations">, + HelpText<"Use experimental new value-tracking variable locations">; // The driver option takes the key as a parameter to the -msign-return-address= // 
and -mbranch-protection= options, but CC1 has a separate option so we // don't have to parse the parameter twice. @@ -4123,8 +4130,7 @@ def frecovery_ast : Flag<["-"], "frecovery-ast">, "encountering semantic errors">; def fno_recovery_ast : Flag<["-"], "fno-recovery-ast">; def frecovery_ast_type : Flag<["-"], "frecovery-ast-type">, - HelpText<"Preserve the type for recovery expressions when possible " - "(experimental)">; + HelpText<"Preserve the type for recovery expressions when possible">; def fno_recovery_ast_type : Flag<["-"], "fno-recovery-ast-type">; let Group = Action_Group in { @@ -4839,6 +4845,8 @@ def _SLASH_TC : CLCompileFlag<"TC">, HelpText<"Treat all source files as C">; def _SLASH_Tp : CLCompileJoinedOrSeparate<"Tp">, HelpText<"Treat as C++ source file">, MetaVarName<"">; def _SLASH_TP : CLCompileFlag<"TP">, HelpText<"Treat all source files as C++">; +def _SLASH_vctoolsdir : CLJoinedOrSeparate<"vctoolsdir">, + HelpText<"Path to the VCToolChain">, MetaVarName<"">; def _SLASH_volatile_iso : Option<["/", "-"], "volatile:iso", KIND_FLAG>, Group<_SLASH_volatile_Group>, Flags<[CLOption, DriverOption]>, HelpText<"Volatile loads and stores have standard semantics">; diff --git a/clang/include/clang/Driver/SanitizerArgs.h b/clang/include/clang/Driver/SanitizerArgs.h index 563d6c3ff9de2..95d6bcf35c786 100644 --- a/clang/include/clang/Driver/SanitizerArgs.h +++ b/clang/include/clang/Driver/SanitizerArgs.h @@ -55,13 +55,15 @@ class SanitizerArgs { bool MinimalRuntime = false; // True if cross-dso CFI support if provided by the system (i.e. Android). bool ImplicitCfiRuntime = false; + bool NeedsHeapProfRt = false; - public: +public: /// Parses the sanitizer arguments from an argument list. 
SanitizerArgs(const ToolChain &TC, const llvm::opt::ArgList &Args); bool needsSharedRt() const { return SharedRuntime; } + bool needsHeapProfRt() const { return NeedsHeapProfRt; } bool needsAsanRt() const { return Sanitizers.has(SanitizerKind::Address); } bool needsHwasanRt() const { return Sanitizers.has(SanitizerKind::HWAddress); diff --git a/clang/include/clang/Sema/Overload.h b/clang/include/clang/Sema/Overload.h index 5023525aa41bd..4f5e497bc2024 100644 --- a/clang/include/clang/Sema/Overload.h +++ b/clang/include/clang/Sema/Overload.h @@ -160,6 +160,9 @@ class Sema; /// Vector conversions ICK_Vector_Conversion, + /// Arm SVE Vector conversions + ICK_SVE_Vector_Conversion, + /// A vector splat from an arithmetic type ICK_Vector_Splat, diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 0194580149085..9b25973ba77ec 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -2157,10 +2157,7 @@ class Sema final { bool RequireCompleteSizedType(SourceLocation Loc, QualType T, unsigned DiagID, const Ts &... Args) { SizelessTypeDiagnoser Diagnoser(DiagID, Args...); - CompleteTypeKind Kind = CompleteTypeKind::Normal; - if (T->isVLST()) - Kind = CompleteTypeKind::AcceptSizeless; - return RequireCompleteType(Loc, T, Kind, Diagnoser); + return RequireCompleteType(Loc, T, CompleteTypeKind::Normal, Diagnoser); } void completeExprArrayBound(Expr *E); @@ -2178,10 +2175,7 @@ class Sema final { bool RequireCompleteSizedExprType(Expr *E, unsigned DiagID, const Ts &... 
Args) { SizelessTypeDiagnoser Diagnoser(DiagID, Args...); - CompleteTypeKind Kind = CompleteTypeKind::Normal; - if (E->getType()->isVLST()) - Kind = CompleteTypeKind::AcceptSizeless; - return RequireCompleteExprType(E, Kind, Diagnoser); + return RequireCompleteExprType(E, CompleteTypeKind::Normal, Diagnoser); } bool RequireLiteralType(SourceLocation Loc, QualType T, @@ -10150,7 +10144,7 @@ class Sema final { private: void *VarDataSharingAttributesStack; /// Number of nested '#pragma omp declare target' directives. - unsigned DeclareTargetNestingLevel = 0; + SmallVector DeclareTargetNesting; /// Initialization of data-sharing attributes stack. void InitDataSharingAttributesStack(); void DestroyDataSharingAttributesStack(); @@ -10410,7 +10404,7 @@ class Sema final { SourceLocation Loc); /// Return true inside OpenMP declare target region. bool isInOpenMPDeclareTargetContext() const { - return DeclareTargetNestingLevel > 0; + return !DeclareTargetNesting.empty(); } /// Return true inside OpenMP target region. bool isInOpenMPTargetExecutionDirective() const; diff --git a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h index d2df24a6e21b7..4907b0757a8a4 100644 --- a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h +++ b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h @@ -177,23 +177,23 @@ class AnalyzerOptions : public RefCountedBase { /// description in a formatted manner. If \p MinLineWidth is set to 0, no line /// breaks are introduced for the description. 
/// - /// Format, depending whether the option name's length is less then - /// \p OptionWidth: + /// Format, depending whether the option name's length is less than + /// \p EntryWidth: /// /// EntryNameDescription /// <---------padding--------->Description /// <---------padding--------->Description /// - /// VeryVeryLongOptionName + /// VeryVeryLongEntryName /// <---------padding--------->Description /// <---------padding--------->Description - /// ^~~~~~~~ InitialPad - /// ^~~~~~~~~~~~~~~~~~~~~~~~~~ EntryWidth + /// ^~~~~~~~~InitialPad + /// ^~~~~~~~~~~~~~~~~~EntryWidth /// ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~MinLineWidth - static void printFormattedEntry( - llvm::raw_ostream &Out, - std::pair EntryDescPair, - size_t EntryWidth, size_t InitialPad, size_t MinLineWidth = 0); + static void printFormattedEntry(llvm::raw_ostream &Out, + std::pair EntryDescPair, + size_t InitialPad, size_t EntryWidth, + size_t MinLineWidth = 0); /// Pairs of checker/package name and enable/disable. std::vector> CheckersAndPackages; diff --git a/clang/include/clang/Tooling/Syntax/Nodes.h b/clang/include/clang/Tooling/Syntax/Nodes.h index 0b3991768008a..a6505c8167eed 100644 --- a/clang/include/clang/Tooling/Syntax/Nodes.h +++ b/clang/include/clang/Tooling/Syntax/Nodes.h @@ -57,6 +57,7 @@ enum class NodeKind : uint16_t { IdExpression, MemberExpression, ThisExpression, + CallExpression, // Statements. UnknownStatement, @@ -98,12 +99,14 @@ enum class NodeKind : uint16_t { ParametersAndQualifiers, MemberPointer, UnqualifiedId, + ParameterDeclarationList, + CallArguments, // Nested Name Specifiers. NestedNameSpecifier, GlobalNameSpecifier, DecltypeNameSpecifier, IdentifierNameSpecifier, - SimpleTemplateNameSpecifier + SimpleTemplateNameSpecifier, }; /// For debugging purposes. raw_ostream &operator<<(raw_ostream &OS, NodeKind K); @@ -111,6 +114,8 @@ raw_ostream &operator<<(raw_ostream &OS, NodeKind K); /// A relation between a parent and child node, e.g. 
'left-hand-side of /// a binary expression'. Used for implementing accessors. /// +/// In general `NodeRole`s should be named the same as their accessors. +/// /// Some roles describe parent/child relations that occur multiple times in /// language grammar. We define only one role to describe all instances of such /// recurring relations. For example, grammar for both "if" and "while" @@ -121,12 +126,6 @@ raw_ostream &operator<<(raw_ostream &OS, NodeKind K); /// opening paren), we define a role for this token and use it across all /// grammar rules with the same requirement. Names of such reusable roles end /// with a ~Token or a ~Keyword suffix. -/// -/// Some roles are assigned only to child nodes of one specific parent syntax -/// node type. Names of such roles start with the name of the parent syntax tree -/// node type. For example, a syntax node with a role -/// BinaryOperatorExpression_leftHandSide can only appear as a child of a -/// BinaryOperatorExpression node. enum class NodeRole : uint8_t { // Roles common to multiple node kinds. /// A node without a parent @@ -141,7 +140,7 @@ enum class NodeRole : uint8_t { IntroducerKeyword, /// A token that represents a literal, e.g. 'nullptr', '1', 'true', etc. LiteralToken, - /// Tokens or Keywords + /// Tokens or Keywords. ArrowToken, ExternKeyword, TemplateKeyword, @@ -149,36 +148,37 @@ enum class NodeRole : uint8_t { /// statement, e.g. loop body for while, for, etc; inner statement for case, /// default, etc. BodyStatement, - List_element, - List_delimiter, + /// List API roles. + ListElement, + ListDelimiter, // Roles specific to particular node kinds. 
- OperatorExpression_operatorToken, - UnaryOperatorExpression_operand, - BinaryOperatorExpression_leftHandSide, - BinaryOperatorExpression_rightHandSide, - CaseStatement_value, - IfStatement_thenStatement, - IfStatement_elseKeyword, - IfStatement_elseStatement, - ReturnStatement_value, - ExpressionStatement_expression, - CompoundStatement_statement, - StaticAssertDeclaration_condition, - StaticAssertDeclaration_message, - SimpleDeclaration_declarator, - TemplateDeclaration_declaration, - ExplicitTemplateInstantiation_declaration, - ArraySubscript_sizeExpression, - TrailingReturnType_declarator, - ParametersAndQualifiers_parameter, - ParametersAndQualifiers_trailingReturn, - IdExpression_id, - IdExpression_qualifier, - ParenExpression_subExpression, - MemberExpression_object, - MemberExpression_accessToken, - MemberExpression_member, + OperatorToken, + Operand, + LeftHandSide, + RightHandSide, + ReturnValue, + CaseValue, + ThenStatement, + ElseKeyword, + ElseStatement, + Expression, + Statement, + Condition, + Message, + Declarator, + Declaration, + Size, + Parameters, + TrailingReturn, + UnqualifiedId, + Qualifier, + SubExpression, + Object, + AccessToken, + Member, + Callee, + Arguments, }; /// For debugging purposes. raw_ostream &operator<<(raw_ostream &OS, NodeRole R); @@ -271,9 +271,9 @@ class NestedNameSpecifier final : public List { static bool classof(const Node *N) { return N->kind() <= NodeKind::NestedNameSpecifier; } - std::vector specifiers(); + std::vector getSpecifiers(); std::vector> - specifiersAndDoubleColons(); + getSpecifiersAndDoubleColons(); }; /// Models an `unqualified-id`. 
C++ [expr.prim.id.unqual] @@ -299,9 +299,9 @@ class IdExpression final : public Expression { static bool classof(const Node *N) { return N->kind() == NodeKind::IdExpression; } - NestedNameSpecifier *qualifier(); - Leaf *templateKeyword(); - UnqualifiedId *unqualifiedId(); + NestedNameSpecifier *getQualifier(); + Leaf *getTemplateKeyword(); + UnqualifiedId *getUnqualifiedId(); }; /// An expression of an unknown kind, i.e. one not currently handled by the @@ -321,7 +321,38 @@ class ThisExpression final : public Expression { static bool classof(const Node *N) { return N->kind() == NodeKind::ThisExpression; } - Leaf *thisKeyword(); + Leaf *getThisKeyword(); +}; + +/// Models arguments of a function call. +/// call-arguments: +/// delimited_list(expression, ',') +/// Note: This construct is a simplification of the grammar rule for +/// `expression-list`, that is used in the definition of `call-expression` +class CallArguments final : public List { +public: + CallArguments() : List(NodeKind::CallArguments) {} + static bool classof(const Node *N) { + return N->kind() <= NodeKind::CallArguments; + } + std::vector getArguments(); + std::vector> getArgumentsAndCommas(); +}; + +/// A function call. C++ [expr.call] +/// call-expression: +/// expression '(' call-arguments ')' +/// e.g `f(1, '2')` or `this->Base::f()` +class CallExpression final : public Expression { +public: + CallExpression() : Expression(NodeKind::CallExpression) {} + static bool classof(const Node *N) { + return N->kind() == NodeKind::CallExpression; + } + Expression *getCallee(); + Leaf *getOpenParen(); + CallArguments *getArguments(); + Leaf *getCloseParen(); }; /// Models a parenthesized expression `(E)`. 
C++ [expr.prim.paren] @@ -332,9 +363,9 @@ class ParenExpression final : public Expression { static bool classof(const Node *N) { return N->kind() == NodeKind::ParenExpression; } - Leaf *openParen(); - Expression *subExpression(); - Leaf *closeParen(); + Leaf *getOpenParen(); + Expression *getSubExpression(); + Leaf *getCloseParen(); }; /// Models a class member access. C++ [expr.ref] @@ -351,10 +382,10 @@ class MemberExpression final : public Expression { static bool classof(const Node *N) { return N->kind() == NodeKind::MemberExpression; } - Expression *object(); - Leaf *accessToken(); - Leaf *templateKeyword(); - IdExpression *member(); + Expression *getObject(); + Leaf *getAccessToken(); + Leaf *getTemplateKeyword(); + IdExpression *getMember(); }; /// Expression for literals. C++ [lex.literal] @@ -373,7 +404,7 @@ class LiteralExpression : public Expression { N->kind() == NodeKind::CharUserDefinedLiteralExpression || N->kind() == NodeKind::StringUserDefinedLiteralExpression; } - Leaf *literalToken(); + Leaf *getLiteralToken(); }; /// Expression for integer literals. C++ [lex.icon] @@ -508,8 +539,8 @@ class UnaryOperatorExpression : public Expression { return N->kind() == NodeKind::PrefixUnaryOperatorExpression || N->kind() == NodeKind::PostfixUnaryOperatorExpression; } - Leaf *operatorToken(); - Expression *operand(); + Leaf *getOperatorToken(); + Expression *getOperand(); }; /// @@ -557,9 +588,9 @@ class BinaryOperatorExpression final : public Expression { static bool classof(const Node *N) { return N->kind() == NodeKind::BinaryOperatorExpression; } - Expression *lhs(); - Leaf *operatorToken(); - Expression *rhs(); + Expression *getLhs(); + Leaf *getOperatorToken(); + Expression *getRhs(); }; /// An abstract node for C++ statements, e.g. 'while', 'if', etc. 
@@ -608,8 +639,8 @@ class SwitchStatement final : public Statement { static bool classof(const Node *N) { return N->kind() == NodeKind::SwitchStatement; } - Leaf *switchKeyword(); - Statement *body(); + Leaf *getSwitchKeyword(); + Statement *getBody(); }; /// case : @@ -619,9 +650,9 @@ class CaseStatement final : public Statement { static bool classof(const Node *N) { return N->kind() == NodeKind::CaseStatement; } - Leaf *caseKeyword(); - Expression *value(); - Statement *body(); + Leaf *getCaseKeyword(); + Expression *getCaseValue(); + Statement *getBody(); }; /// default: @@ -631,8 +662,8 @@ class DefaultStatement final : public Statement { static bool classof(const Node *N) { return N->kind() == NodeKind::DefaultStatement; } - Leaf *defaultKeyword(); - Statement *body(); + Leaf *getDefaultKeyword(); + Statement *getBody(); }; /// if (cond) else @@ -643,10 +674,10 @@ class IfStatement final : public Statement { static bool classof(const Node *N) { return N->kind() == NodeKind::IfStatement; } - Leaf *ifKeyword(); - Statement *thenStatement(); - Leaf *elseKeyword(); - Statement *elseStatement(); + Leaf *getIfKeyword(); + Statement *getThenStatement(); + Leaf *getElseKeyword(); + Statement *getElseStatement(); }; /// for (; ; ) @@ -656,8 +687,8 @@ class ForStatement final : public Statement { static bool classof(const Node *N) { return N->kind() == NodeKind::ForStatement; } - Leaf *forKeyword(); - Statement *body(); + Leaf *getForKeyword(); + Statement *getBody(); }; /// while () @@ -667,8 +698,8 @@ class WhileStatement final : public Statement { static bool classof(const Node *N) { return N->kind() == NodeKind::WhileStatement; } - Leaf *whileKeyword(); - Statement *body(); + Leaf *getWhileKeyword(); + Statement *getBody(); }; /// continue; @@ -678,7 +709,7 @@ class ContinueStatement final : public Statement { static bool classof(const Node *N) { return N->kind() == NodeKind::ContinueStatement; } - Leaf *continueKeyword(); + Leaf *getContinueKeyword(); }; /// break; 
@@ -688,7 +719,7 @@ class BreakStatement final : public Statement { static bool classof(const Node *N) { return N->kind() == NodeKind::BreakStatement; } - Leaf *breakKeyword(); + Leaf *getBreakKeyword(); }; /// return ; @@ -699,8 +730,8 @@ class ReturnStatement final : public Statement { static bool classof(const Node *N) { return N->kind() == NodeKind::ReturnStatement; } - Leaf *returnKeyword(); - Expression *value(); + Leaf *getReturnKeyword(); + Expression *getReturnValue(); }; /// for ( : ) @@ -710,8 +741,8 @@ class RangeBasedForStatement final : public Statement { static bool classof(const Node *N) { return N->kind() == NodeKind::RangeBasedForStatement; } - Leaf *forKeyword(); - Statement *body(); + Leaf *getForKeyword(); + Statement *getBody(); }; /// Expression in a statement position, e.g. functions calls inside compound @@ -722,7 +753,7 @@ class ExpressionStatement final : public Statement { static bool classof(const Node *N) { return N->kind() == NodeKind::ExpressionStatement; } - Expression *expression(); + Expression *getExpression(); }; /// { statement1; statement2; … } @@ -732,10 +763,10 @@ class CompoundStatement final : public Statement { static bool classof(const Node *N) { return N->kind() == NodeKind::CompoundStatement; } - Leaf *lbrace(); + Leaf *getLbrace(); /// FIXME: use custom iterator instead of 'vector'. - std::vector statements(); - Leaf *rbrace(); + std::vector getStatements(); + Leaf *getRbrace(); }; /// A declaration that can appear at the top-level. 
Note that this does *not* @@ -777,8 +808,8 @@ class StaticAssertDeclaration final : public Declaration { static bool classof(const Node *N) { return N->kind() == NodeKind::StaticAssertDeclaration; } - Expression *condition(); - Expression *message(); + Expression *getCondition(); + Expression *getMessage(); }; /// extern declaration @@ -802,7 +833,7 @@ class SimpleDeclaration final : public Declaration { return N->kind() == NodeKind::SimpleDeclaration; } /// FIXME: use custom iterator instead of 'vector'. - std::vector declarators(); + std::vector getDeclarators(); }; /// template @@ -812,8 +843,8 @@ class TemplateDeclaration final : public Declaration { static bool classof(const Node *N) { return N->kind() == NodeKind::TemplateDeclaration; } - Leaf *templateKeyword(); - Declaration *declaration(); + Leaf *getTemplateKeyword(); + Declaration *getDeclaration(); }; /// template @@ -828,9 +859,9 @@ class ExplicitTemplateInstantiation final : public Declaration { static bool classof(const Node *N) { return N->kind() == NodeKind::ExplicitTemplateInstantiation; } - Leaf *templateKeyword(); - Leaf *externKeyword(); - Declaration *declaration(); + Leaf *getTemplateKeyword(); + Leaf *getExternKeyword(); + Declaration *getDeclaration(); }; /// namespace { } @@ -920,8 +951,8 @@ class ParenDeclarator final : public Declarator { static bool classof(const Node *N) { return N->kind() == NodeKind::ParenDeclarator; } - Leaf *lparen(); - Leaf *rparen(); + Leaf *getLparen(); + Leaf *getRparen(); }; /// Array size specified inside a declarator. @@ -935,9 +966,9 @@ class ArraySubscript final : public Tree { return N->kind() == NodeKind::ArraySubscript; } // TODO: add an accessor for the "static" keyword. - Leaf *lbracket(); - Expression *sizeExpression(); - Leaf *rbracket(); + Leaf *getLbracket(); + Expression *getSize(); + Leaf *getRbracket(); }; /// Trailing return type after the parameter list, including the arrow token. 
@@ -949,8 +980,23 @@ class TrailingReturnType final : public Tree { return N->kind() == NodeKind::TrailingReturnType; } // TODO: add accessors for specifiers. - Leaf *arrowToken(); - SimpleDeclarator *declarator(); + Leaf *getArrowToken(); + // FIXME: This should be a `type-id` following the grammar. Fix this once we + // have a representation of `type-id`s. + SimpleDeclarator *getDeclarator(); +}; + +/// Models a `parameter-declaration-list` which appears within +/// `parameters-and-qualifiers`. See C++ [dcl.fct] +class ParameterDeclarationList final : public List { +public: + ParameterDeclarationList() : List(NodeKind::ParameterDeclarationList) {} + static bool classof(const Node *N) { + return N->kind() == NodeKind::ParameterDeclarationList; + } + std::vector getParameterDeclarations(); + std::vector> + getParametersAndCommas(); }; /// Parameter list for a function type and a trailing return type, if the @@ -970,11 +1016,10 @@ class ParametersAndQualifiers final : public Tree { static bool classof(const Node *N) { return N->kind() == NodeKind::ParametersAndQualifiers; } - Leaf *lparen(); - /// FIXME: use custom iterator instead of 'vector'. - std::vector parameters(); - Leaf *rparen(); - TrailingReturnType *trailingReturn(); + Leaf *getLparen(); + ParameterDeclarationList *getParameters(); + Leaf *getRparen(); + TrailingReturnType *getTrailingReturn(); }; /// Member pointer inside a declarator diff --git a/clang/include/clang/Tooling/Syntax/Tree.h b/clang/include/clang/Tooling/Syntax/Tree.h index fcd169cad3ec6..f7f9e6bdc5a09 100644 --- a/clang/include/clang/Tooling/Syntax/Tree.h +++ b/clang/include/clang/Tooling/Syntax/Tree.h @@ -106,9 +106,9 @@ class Node { Node *nextSibling() { return NextSibling; } /// Dumps the structure of a subtree. For debugging and testing purposes. - std::string dump(const Arena &A) const; + std::string dump(const SourceManager &SM) const; /// Dumps the tokens forming this subtree. 
- std::string dumpTokens(const Arena &A) const; + std::string dumpTokens(const SourceManager &SM) const; /// Asserts invariants on this node of the tree and its immediate children. /// Will not recurse into the subtree. No-op if NDEBUG is set. diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index fe3d2f6c95880..59ca0b8c963f7 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -1876,50 +1876,6 @@ TypeInfo ASTContext::getTypeInfo(const Type *T) const { return TI; } -static unsigned getSveVectorWidth(const Type *T) { - // Get the vector size from the 'arm_sve_vector_bits' attribute via the - // AttributedTypeLoc associated with the typedef decl. - if (const auto *TT = T->getAs()) { - const TypedefNameDecl *Typedef = TT->getDecl(); - TypeSourceInfo *TInfo = Typedef->getTypeSourceInfo(); - TypeLoc TL = TInfo->getTypeLoc(); - if (AttributedTypeLoc ATL = TL.getAs()) - if (const auto *Attr = ATL.getAttrAs()) - return Attr->getNumBits(); - } - - llvm_unreachable("bad 'arm_sve_vector_bits' attribute!"); -} - -static unsigned getSvePredWidth(const ASTContext &Context, const Type *T) { - return getSveVectorWidth(T) / Context.getCharWidth(); -} - -unsigned ASTContext::getBitwidthForAttributedSveType(const Type *T) const { - assert(T->isVLST() && - "getBitwidthForAttributedSveType called for non-attributed type!"); - - switch (T->castAs()->getKind()) { - default: - llvm_unreachable("unknown builtin type!"); - case BuiltinType::SveInt8: - case BuiltinType::SveInt16: - case BuiltinType::SveInt32: - case BuiltinType::SveInt64: - case BuiltinType::SveUint8: - case BuiltinType::SveUint16: - case BuiltinType::SveUint32: - case BuiltinType::SveUint64: - case BuiltinType::SveFloat16: - case BuiltinType::SveFloat32: - case BuiltinType::SveFloat64: - case BuiltinType::SveBFloat16: - return getSveVectorWidth(T); - case BuiltinType::SveBool: - return getSvePredWidth(*this, T); - } -} - /// getTypeInfoImpl - Return the size of the 
specified type, in bits. This /// method does not work on incomplete types. /// @@ -1986,6 +1942,13 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const { uint64_t TargetVectorAlign = Target->getMaxVectorAlign(); if (TargetVectorAlign && TargetVectorAlign < Align) Align = TargetVectorAlign; + if (VT->getVectorKind() == VectorType::SveFixedLengthDataVector) + // Adjust the alignment for fixed-length SVE vectors. This is important + // for non-power-of-2 vector lengths. + Align = 128; + else if (VT->getVectorKind() == VectorType::SveFixedLengthPredicateVector) + // Adjust the alignment for fixed-length SVE predicates. + Align = 16; break; } @@ -2331,10 +2294,7 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const { Align = Info.Align; AlignIsRequired = Info.AlignIsRequired; } - if (T->isVLST()) - Width = getBitwidthForAttributedSveType(T); - else - Width = Info.Width; + Width = Info.Width; break; } @@ -8563,6 +8523,31 @@ bool ASTContext::areCompatibleVectorTypes(QualType FirstVec, return false; } +bool ASTContext::areCompatibleSveTypes(QualType FirstType, + QualType SecondType) { + assert(((FirstType->isSizelessBuiltinType() && SecondType->isVectorType()) || + (FirstType->isVectorType() && SecondType->isSizelessBuiltinType())) && + "Expected SVE builtin type and vector type!"); + + auto IsValidCast = [this](QualType FirstType, QualType SecondType) { + if (const auto *BT = FirstType->getAs()) { + if (const auto *VT = SecondType->getAs()) { + // Predicates have the same representation as uint8 so we also have to + // check the kind to make these types incompatible. 
+ if (VT->getVectorKind() == VectorType::SveFixedLengthPredicateVector) + return BT->getKind() == BuiltinType::SveBool; + else if (VT->getVectorKind() == VectorType::SveFixedLengthDataVector) + return VT->getElementType().getCanonicalType() == + FirstType->getSveEltType(*this); + } + } + return false; + }; + + return IsValidCast(FirstType, SecondType) || + IsValidCast(SecondType, FirstType); +} + bool ASTContext::hasDirectOwnershipQualifier(QualType Ty) const { while (true) { // __strong id diff --git a/clang/lib/AST/AttrImpl.cpp b/clang/lib/AST/AttrImpl.cpp index 7818fbb1918bb..7ca58f2b83a2c 100644 --- a/clang/lib/AST/AttrImpl.cpp +++ b/clang/lib/AST/AttrImpl.cpp @@ -136,8 +136,16 @@ llvm::Optional OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(const ValueDecl *VD) { if (!VD->hasAttrs()) return llvm::None; - if (const auto *Attr = VD->getAttr()) - return Attr->getMapType(); + unsigned Level = 0; + const OMPDeclareTargetDeclAttr *FoundAttr = nullptr; + for (const auto *Attr : VD->specific_attrs()) { + if (Level < Attr->getLevel()) { + Level = Attr->getLevel(); + FoundAttr = Attr; + } + } + if (FoundAttr) + return FoundAttr->getMapType(); return llvm::None; } @@ -146,8 +154,34 @@ llvm::Optional OMPDeclareTargetDeclAttr::getDeviceType(const ValueDecl *VD) { if (!VD->hasAttrs()) return llvm::None; - if (const auto *Attr = VD->getAttr()) - return Attr->getDevType(); + unsigned Level = 0; + const OMPDeclareTargetDeclAttr *FoundAttr = nullptr; + for (const auto *Attr : VD->specific_attrs()) { + if (Level < Attr->getLevel()) { + Level = Attr->getLevel(); + FoundAttr = Attr; + } + } + if (FoundAttr) + return FoundAttr->getDevType(); + + return llvm::None; +} + +llvm::Optional +OMPDeclareTargetDeclAttr::getLocation(const ValueDecl *VD) { + if (!VD->hasAttrs()) + return llvm::None; + unsigned Level = 0; + const OMPDeclareTargetDeclAttr *FoundAttr = nullptr; + for (const auto *Attr : VD->specific_attrs()) { + if (Level < Attr->getLevel()) { + Level = Attr->getLevel(); + 
FoundAttr = Attr; + } + } + if (FoundAttr) + return FoundAttr->getRange().getBegin(); return llvm::None; } diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp index da1eadd9d931d..f4314d0bd9614 100644 --- a/clang/lib/AST/DeclBase.cpp +++ b/clang/lib/AST/DeclBase.cpp @@ -1487,6 +1487,13 @@ static bool shouldBeHidden(NamedDecl *D) { if (FD->isFunctionTemplateSpecialization()) return true; + // Hide destructors that are invalid. There should always be one destructor, + // but if it is an invalid decl, another one is created. We need to hide the + // invalid one from places that expect exactly one destructor, like the + // serialization code. + if (isa(D) && D->isInvalidDecl()) + return true; + return false; } diff --git a/clang/lib/AST/JSONNodeDumper.cpp b/clang/lib/AST/JSONNodeDumper.cpp index 4bd00ece86ab8..a9136a903b988 100644 --- a/clang/lib/AST/JSONNodeDumper.cpp +++ b/clang/lib/AST/JSONNodeDumper.cpp @@ -616,6 +616,12 @@ void JSONNodeDumper::VisitVectorType(const VectorType *VT) { case VectorType::NeonPolyVector: JOS.attribute("vectorKind", "neon poly"); break; + case VectorType::SveFixedLengthDataVector: + JOS.attribute("vectorKind", "fixed-length sve data vector"); + break; + case VectorType::SveFixedLengthPredicateVector: + JOS.attribute("vectorKind", "fixed-length sve predicate vector"); + break; } } diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp index 47a7e431faf8c..16c4c3736a4a3 100644 --- a/clang/lib/AST/TextNodeDumper.cpp +++ b/clang/lib/AST/TextNodeDumper.cpp @@ -1408,6 +1408,12 @@ void TextNodeDumper::VisitVectorType(const VectorType *T) { case VectorType::NeonPolyVector: OS << " neon poly"; break; + case VectorType::SveFixedLengthDataVector: + OS << " fixed-length sve data vector"; + break; + case VectorType::SveFixedLengthPredicateVector: + OS << " fixed-length sve predicate vector"; + break; } OS << " " << T->getNumElements(); } diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index 
d2755b55075d4..3754a515f115a 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -2313,11 +2313,42 @@ bool Type::isVLSTBuiltinType() const { return false; } -bool Type::isVLST() const { - if (!isVLSTBuiltinType()) - return false; +QualType Type::getSveEltType(const ASTContext &Ctx) const { + assert(isVLSTBuiltinType() && "unsupported type!"); - return hasAttr(attr::ArmSveVectorBits); + const BuiltinType *BTy = getAs(); + switch (BTy->getKind()) { + default: + llvm_unreachable("Unknown builtin SVE type!"); + case BuiltinType::SveInt8: + return Ctx.SignedCharTy; + case BuiltinType::SveUint8: + case BuiltinType::SveBool: + // Represent predicates as i8 rather than i1 to avoid any layout issues. + // The type is bitcasted to a scalable predicate type when casting between + // scalable and fixed-length vectors. + return Ctx.UnsignedCharTy; + case BuiltinType::SveInt16: + return Ctx.ShortTy; + case BuiltinType::SveUint16: + return Ctx.UnsignedShortTy; + case BuiltinType::SveInt32: + return Ctx.IntTy; + case BuiltinType::SveUint32: + return Ctx.UnsignedIntTy; + case BuiltinType::SveInt64: + return Ctx.LongTy; + case BuiltinType::SveUint64: + return Ctx.UnsignedLongTy; + case BuiltinType::SveFloat16: + return Ctx.Float16Ty; + case BuiltinType::SveBFloat16: + return Ctx.BFloat16Ty; + case BuiltinType::SveFloat32: + return Ctx.FloatTy; + case BuiltinType::SveFloat64: + return Ctx.DoubleTy; + } } bool QualType::isPODType(const ASTContext &Context) const { diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp index ab82b5d338c6b..28bbbcf0e139e 100644 --- a/clang/lib/AST/TypePrinter.cpp +++ b/clang/lib/AST/TypePrinter.cpp @@ -663,6 +663,24 @@ void TypePrinter::printVectorBefore(const VectorType *T, raw_ostream &OS) { printBefore(T->getElementType(), OS); break; } + case VectorType::SveFixedLengthDataVector: + case VectorType::SveFixedLengthPredicateVector: + // FIXME: We prefer to print the size directly here, but have no way + // to get the 
size of the type. + OS << "__attribute__((__arm_sve_vector_bits__("; + + if (T->getVectorKind() == VectorType::SveFixedLengthPredicateVector) + // Predicates take a bit per byte of the vector size, multiply by 8 to + // get the number of bits passed to the attribute. + OS << T->getNumElements() * 8; + else + OS << T->getNumElements(); + + OS << " * sizeof("; + print(T->getElementType(), OS, StringRef()); + // Multiply by 8 for the number of bits. + OS << ") * 8))) "; + printBefore(T->getElementType(), OS); } } @@ -710,6 +728,24 @@ void TypePrinter::printDependentVectorBefore( printBefore(T->getElementType(), OS); break; } + case VectorType::SveFixedLengthDataVector: + case VectorType::SveFixedLengthPredicateVector: + // FIXME: We prefer to print the size directly here, but have no way + // to get the size of the type. + OS << "__attribute__((__arm_sve_vector_bits__("; + if (T->getSizeExpr()) { + T->getSizeExpr()->printPretty(OS, nullptr, Policy); + if (T->getVectorKind() == VectorType::SveFixedLengthPredicateVector) + // Predicates take a bit per byte of the vector size, multiply by 8 to + // get the number of bits passed to the attribute. + OS << " * 8"; + OS << " * sizeof("; + print(T->getElementType(), OS, StringRef()); + // Multiply by 8 for the number of bits. 
+ OS << ") * 8"; + } + OS << "))) "; + printBefore(T->getElementType(), OS); } } @@ -1651,9 +1687,6 @@ void TypePrinter::printAttributedAfter(const AttributedType *T, case attr::ArmMveStrictPolymorphism: OS << "__clang_arm_mve_strict_polymorphism"; break; - case attr::ArmSveVectorBits: - OS << "arm_sve_vector_bits"; - break; } OS << "))"; } diff --git a/clang/lib/Basic/Targets.cpp b/clang/lib/Basic/Targets.cpp index ec53be2269497..5894afd69a827 100644 --- a/clang/lib/Basic/Targets.cpp +++ b/clang/lib/Basic/Targets.cpp @@ -452,6 +452,8 @@ TargetInfo *AllocateTarget(const llvm::Triple &Triple, switch (os) { case llvm::Triple::Linux: return new LinuxTargetInfo(Triple, Opts); + case llvm::Triple::ZOS: + return new ZOSTargetInfo(Triple, Opts); default: return new SystemZTargetInfo(Triple, Opts); } diff --git a/clang/lib/Basic/Targets/OSTargets.h b/clang/lib/Basic/Targets/OSTargets.h index a2c0fd42f26d8..9c206fc7e6a42 100644 --- a/clang/lib/Basic/Targets/OSTargets.h +++ b/clang/lib/Basic/Targets/OSTargets.h @@ -728,6 +728,55 @@ class AIXTargetInfo : public OSTargetInfo { bool defaultsToAIXPowerAlignment() const override { return true; } }; +// z/OS target +template +class LLVM_LIBRARY_VISIBILITY ZOSTargetInfo : public OSTargetInfo { +protected: + void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple, + MacroBuilder &Builder) const override { + // FIXME: _LONG_LONG should not be defined under -std=c89. + Builder.defineMacro("_LONG_LONG"); + Builder.defineMacro("_OPEN_DEFAULT"); + // _UNIX03_WITHDRAWN is required to build libcxx. + Builder.defineMacro("_UNIX03_WITHDRAWN"); + Builder.defineMacro("__370__"); + Builder.defineMacro("__BFP__"); + // FIXME: __BOOL__ should not be defined under -std=c89. 
+ Builder.defineMacro("__BOOL__"); + Builder.defineMacro("__LONGNAME__"); + Builder.defineMacro("__MVS__"); + Builder.defineMacro("__THW_370__"); + Builder.defineMacro("__THW_BIG_ENDIAN__"); + Builder.defineMacro("__TOS_390__"); + Builder.defineMacro("__TOS_MVS__"); + Builder.defineMacro("__XPLINK__"); + + if (this->PointerWidth == 64) + Builder.defineMacro("__64BIT__"); + + if (Opts.CPlusPlus) { + Builder.defineMacro("__DLL__"); + // _XOPEN_SOURCE=600 is required to build libcxx. + Builder.defineMacro("_XOPEN_SOURCE", "600"); + } + + if (Opts.GNUMode) { + Builder.defineMacro("_MI_BUILTIN"); + Builder.defineMacro("_EXT"); + } + + if (Opts.CPlusPlus && Opts.WChar) { + // Macro __wchar_t is defined so that the wchar_t data + // type is not declared as a typedef in system headers. + Builder.defineMacro("__wchar_t"); + } + } + +public: + ZOSTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts) + : OSTargetInfo(Triple, Opts) {} +}; + void addWindowsDefines(const llvm::Triple &Triple, const LangOptions &Opts, MacroBuilder &Builder); diff --git a/clang/lib/Basic/Targets/PPC.cpp b/clang/lib/Basic/Targets/PPC.cpp index c5ad4a5a2d270..13db564d360d5 100644 --- a/clang/lib/Basic/Targets/PPC.cpp +++ b/clang/lib/Basic/Targets/PPC.cpp @@ -64,6 +64,8 @@ bool PPCTargetInfo::handleTargetFeatures(std::vector &Features, FloatABI = SoftFloat; } else if (Feature == "+paired-vector-memops") { PairedVectorMemops = true; + } else if (Feature == "+mma") { + HasMMA = true; } // TODO: Finish this list and add an assert that we've handled them // all. 
@@ -184,6 +186,8 @@ void PPCTargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__FLOAT128__"); if (HasP9Vector) Builder.defineMacro("__POWER9_VECTOR__"); + if (HasMMA) + Builder.defineMacro("__MMA__"); if (HasP10Vector) Builder.defineMacro("__POWER10_VECTOR__"); @@ -221,6 +225,7 @@ void PPCTargetInfo::getTargetDefines(const LangOptions &Opts, // - float128 // - power9-vector // - paired-vector-memops +// - mma // - power10-vector // then go ahead and error since the customer has expressed an incompatible // set of options. @@ -244,6 +249,7 @@ static bool ppcUserFeaturesCheck(DiagnosticsEngine &Diags, Found |= FindVSXSubfeature("+float128", "-mfloat128"); Found |= FindVSXSubfeature("+power9-vector", "-mpower9-vector"); Found |= FindVSXSubfeature("+paired-vector-memops", "-mpaired-vector-memops"); + Found |= FindVSXSubfeature("+mma", "-mmma"); Found |= FindVSXSubfeature("+power10-vector", "-mpower10-vector"); // Return false if any vsx subfeatures was found. @@ -345,6 +351,7 @@ void PPCTargetInfo::addP10SpecificFeatures( llvm::StringMap &Features) const { Features["htm"] = false; // HTM was removed for P10. 
Features["paired-vector-memops"] = true; + Features["mma"] = true; Features["power10-vector"] = true; Features["pcrelative-memops"] = true; return; @@ -373,6 +380,7 @@ bool PPCTargetInfo::hasFeature(StringRef Feature) const { .Case("power10-vector", HasP10Vector) .Case("pcrelative-memops", HasPCRelativeMemops) .Case("spe", HasSPE) + .Case("mma", HasMMA) .Default(false); } @@ -389,6 +397,7 @@ void PPCTargetInfo::setFeatureEnabled(llvm::StringMap &Features, .Case("paired-vector-memops", true) .Case("power10-vector", true) .Case("float128", true) + .Case("mma", true) .Default(false); if (FeatureHasVSX) Features["vsx"] = Features["altivec"] = true; @@ -406,13 +415,14 @@ void PPCTargetInfo::setFeatureEnabled(llvm::StringMap &Features, if ((Name == "altivec") || (Name == "vsx")) Features["vsx"] = Features["direct-move"] = Features["power8-vector"] = Features["float128"] = Features["power9-vector"] = - Features["paired-vector-memops"] = Features["power10-vector"] = - false; + Features["paired-vector-memops"] = Features["mma"] = + Features["power10-vector"] = false; if (Name == "power8-vector") Features["power9-vector"] = Features["paired-vector-memops"] = - Features["power10-vector"] = false; + Features["mma"] = Features["power10-vector"] = false; else if (Name == "power9-vector") - Features["paired-vector-memops"] = Features["power10-vector"] = false; + Features["paired-vector-memops"] = Features["mma"] = + Features["power10-vector"] = false; if (Name == "pcrel") Features["pcrelative-memops"] = false; else diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h index 88523279a6eed..bca06a7a802dd 100644 --- a/clang/lib/Basic/Targets/PPC.h +++ b/clang/lib/Basic/Targets/PPC.h @@ -58,6 +58,7 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo { // Target cpu features. 
bool HasAltivec = false; + bool HasMMA = false; bool HasVSX = false; bool HasP8Vector = false; bool HasP8Crypto = false; diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index 64c7ce9182c9f..b829dfac74fbf 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -459,6 +459,7 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts, case CK_IcelakeClient: case CK_IcelakeServer: case CK_Tigerlake: + case CK_SapphireRapids: // FIXME: Historically, we defined this legacy name, it would be nice to // remove it at some point. We've never exposed fine-grained names for // recent primary x86 CPUs, and we should keep it that way. @@ -1269,6 +1270,7 @@ Optional X86TargetInfo::getCPUCacheLineSize() const { case CK_Cooperlake: case CK_Cannonlake: case CK_Tigerlake: + case CK_SapphireRapids: case CK_IcelakeClient: case CK_IcelakeServer: case CK_KNL: diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h index 853c4e6212229..25dc9458c25a6 100644 --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -306,6 +306,9 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo { } bool isValidTuneCPUName(StringRef Name) const override { + if (Name == "generic") + return true; + // Allow 32-bit only CPUs regardless of 64-bit mode unlike isValidCPUName. // NOTE: gcc rejects 32-bit mtune CPUs in 64-bit mode. But being lenient // since mtune was ignored by clang for so long. 
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 8f28deccc3a01..97b29ae434ed2 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -69,6 +69,7 @@ #include "llvm/Transforms/Instrumentation/BoundsChecking.h" #include "llvm/Transforms/Instrumentation/GCOVProfiler.h" #include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h" +#include "llvm/Transforms/Instrumentation/HeapProfiler.h" #include "llvm/Transforms/Instrumentation/InstrProfiling.h" #include "llvm/Transforms/Instrumentation/MemorySanitizer.h" #include "llvm/Transforms/Instrumentation/SanitizerCoverage.h" @@ -269,6 +270,12 @@ static bool asanUseGlobalsGC(const Triple &T, const CodeGenOptions &CGOpts) { return false; } +static void addHeapProfilerPasses(const PassManagerBuilder &Builder, + legacy::PassManagerBase &PM) { + PM.add(createHeapProfilerFunctionPass()); + PM.add(createModuleHeapProfilerLegacyPassPass()); +} + static void addAddressSanitizerPasses(const PassManagerBuilder &Builder, legacy::PassManagerBase &PM) { const PassManagerBuilderWrapper &BuilderWrapper = @@ -522,6 +529,8 @@ static void initTargetOptions(DiagnosticsEngine &Diags, Options.EmitAddrsig = CodeGenOpts.Addrsig; Options.ForceDwarfFrameSection = CodeGenOpts.ForceDwarfFrameSection; Options.EmitCallSiteInfo = CodeGenOpts.EmitCallSiteInfo; + Options.ValueTrackingVariableLocations = + CodeGenOpts.ValueTrackingVariableLocations; Options.XRayOmitFunctionIndex = CodeGenOpts.XRayOmitFunctionIndex; Options.MCOptions.SplitDwarfFile = CodeGenOpts.SplitDwarfFile; @@ -662,6 +671,13 @@ void EmitAssemblyHelper::CreatePasses(legacy::PassManager &MPM, if (LangOpts.Coroutines) addCoroutinePassesToExtensionPoints(PMBuilder); + if (CodeGenOpts.HeapProf) { + PMBuilder.addExtension(PassManagerBuilder::EP_OptimizerLast, + addHeapProfilerPasses); + PMBuilder.addExtension(PassManagerBuilder::EP_EnabledOnOptLevel0, + addHeapProfilerPasses); + } + if 
(LangOpts.Sanitize.has(SanitizerKind::LocalBounds)) { PMBuilder.addExtension(PassManagerBuilder::EP_ScalarOptimizerLate, addBoundsCheckingPass); @@ -1403,6 +1419,11 @@ void EmitAssemblyHelper::EmitAssemblyWithNewPassManager( } } + if (CodeGenOpts.HeapProf) { + MPM.addPass(createModuleToFunctionPassAdaptor(HeapProfilerPass())); + MPM.addPass(ModuleHeapProfilerPass()); + } + if (LangOpts.Sanitize.has(SanitizerKind::HWAddress)) { bool Recover = CodeGenOpts.SanitizeRecover.has(SanitizerKind::HWAddress); MPM.addPass(HWAddressSanitizerPass( diff --git a/clang/lib/CodeGen/CGAtomic.cpp b/clang/lib/CodeGen/CGAtomic.cpp index d7720a23dd720..a640cb7b5a6ec 100644 --- a/clang/lib/CodeGen/CGAtomic.cpp +++ b/clang/lib/CodeGen/CGAtomic.cpp @@ -119,7 +119,7 @@ namespace { ValueTy = lvalue.getType(); ValueSizeInBits = C.getTypeSize(ValueTy); AtomicTy = ValueTy = CGF.getContext().getExtVectorType( - lvalue.getType(), cast( + lvalue.getType(), cast( lvalue.getExtVectorAddress().getElementType()) ->getNumElements()); AtomicSizeInBits = C.getTypeSize(AtomicTy); diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 40ea23aa9f93a..b6453116daff1 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -4567,11 +4567,11 @@ Value *CodeGenFunction::EmitTargetBuiltinExpr(unsigned BuiltinID, getTarget().getTriple().getArch()); } -static llvm::VectorType *GetNeonType(CodeGenFunction *CGF, - NeonTypeFlags TypeFlags, - bool HasLegalHalfType = true, - bool V1Ty = false, - bool AllowBFloatArgsAndRet = true) { +static llvm::FixedVectorType *GetNeonType(CodeGenFunction *CGF, + NeonTypeFlags TypeFlags, + bool HasLegalHalfType = true, + bool V1Ty = false, + bool AllowBFloatArgsAndRet = true) { int IsQuad = TypeFlags.isQuad(); switch (TypeFlags.getEltType()) { case NeonTypeFlags::Int8: @@ -5627,8 +5627,8 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( const bool AllowBFloatArgsAndRet = getTargetHooks().getABIInfo().allowBFloatArgsAndRet(); 
- llvm::VectorType *VTy = GetNeonType(this, Type, HasLegalHalfType, false, - AllowBFloatArgsAndRet); + llvm::FixedVectorType *VTy = + GetNeonType(this, Type, HasLegalHalfType, false, AllowBFloatArgsAndRet); llvm::Type *Ty = VTy; if (!Ty) return nullptr; @@ -5669,8 +5669,8 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( return EmitNeonCall(CGM.getIntrinsic(Intrinsic::fabs, Ty), Ops, "vabs"); return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Ty), Ops, "vabs"); case NEON::BI__builtin_neon_vaddhn_v: { - llvm::VectorType *SrcTy = - llvm::VectorType::getExtendedElementVectorType(VTy); + llvm::FixedVectorType *SrcTy = + llvm::FixedVectorType::getExtendedElementVectorType(VTy); // %sum = add <4 x i32> %lhs, %rhs Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy); @@ -5942,14 +5942,16 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]); } case NEON::BI__builtin_neon_vmovl_v: { - llvm::Type *DTy =llvm::VectorType::getTruncatedElementVectorType(VTy); + llvm::FixedVectorType *DTy = + llvm::FixedVectorType::getTruncatedElementVectorType(VTy); Ops[0] = Builder.CreateBitCast(Ops[0], DTy); if (Usgn) return Builder.CreateZExt(Ops[0], Ty, "vmovl"); return Builder.CreateSExt(Ops[0], Ty, "vmovl"); } case NEON::BI__builtin_neon_vmovn_v: { - llvm::Type *QTy = llvm::VectorType::getExtendedElementVectorType(VTy); + llvm::FixedVectorType *QTy = + llvm::FixedVectorType::getExtendedElementVectorType(VTy); Ops[0] = Builder.CreateBitCast(Ops[0], QTy); return Builder.CreateTrunc(Ops[0], Ty, "vmovn"); } @@ -5995,7 +5997,7 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( case NEON::BI__builtin_neon_vqdmulh_lane_v: case NEON::BI__builtin_neon_vqrdmulhq_lane_v: case NEON::BI__builtin_neon_vqrdmulh_lane_v: { - auto *RTy = cast(Ty); + auto *RTy = cast(Ty); if (BuiltinID == NEON::BI__builtin_neon_vqdmulhq_lane_v || BuiltinID == NEON::BI__builtin_neon_vqrdmulhq_lane_v) RTy = llvm::FixedVectorType::get(RTy->getElementType(), @@ -6044,7 
+6046,8 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( return Builder.CreateShl(Builder.CreateBitCast(Ops[0],Ty), Ops[1], "vshl_n"); case NEON::BI__builtin_neon_vshll_n_v: { - llvm::Type *SrcTy = llvm::VectorType::getTruncatedElementVectorType(VTy); + llvm::FixedVectorType *SrcTy = + llvm::FixedVectorType::getTruncatedElementVectorType(VTy); Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy); if (Usgn) Ops[0] = Builder.CreateZExt(Ops[0], VTy); @@ -6054,7 +6057,8 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( return Builder.CreateShl(Ops[0], Ops[1], "vshll_n"); } case NEON::BI__builtin_neon_vshrn_n_v: { - llvm::Type *SrcTy = llvm::VectorType::getExtendedElementVectorType(VTy); + llvm::FixedVectorType *SrcTy = + llvm::FixedVectorType::getExtendedElementVectorType(VTy); Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy); Ops[1] = EmitNeonShiftVector(Ops[1], SrcTy, false); if (Usgn) @@ -6103,8 +6107,8 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, ""); } case NEON::BI__builtin_neon_vsubhn_v: { - llvm::VectorType *SrcTy = - llvm::VectorType::getExtendedElementVectorType(VTy); + llvm::FixedVectorType *SrcTy = + llvm::FixedVectorType::getExtendedElementVectorType(VTy); // %sum = add <4 x i32> %lhs, %rhs Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy); @@ -6243,28 +6247,10 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( case NEON::BI__builtin_neon_vbfdot_v: case NEON::BI__builtin_neon_vbfdotq_v: { llvm::Type *InputTy = - llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8); + llvm::FixedVectorType::get(BFloatTy, Ty->getPrimitiveSizeInBits() / 16); llvm::Type *Tys[2] = { Ty, InputTy }; return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfdot"); } - case NEON::BI__builtin_neon_vbfmmlaq_v: { - llvm::Type *InputTy = - llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8); - llvm::Type *Tys[2] = { Ty, InputTy }; - return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, 
"vbfmmla"); - } - case NEON::BI__builtin_neon_vbfmlalbq_v: { - llvm::Type *InputTy = - llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8); - llvm::Type *Tys[2] = { Ty, InputTy }; - return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfmlalb"); - } - case NEON::BI__builtin_neon_vbfmlaltq_v: { - llvm::Type *InputTy = - llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8); - llvm::Type *Tys[2] = { Ty, InputTy }; - return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfmlalt"); - } case NEON::BI__builtin_neon___a32_vcvt_bf16_v: { llvm::Type *Tys[1] = { Ty }; Function *F = CGM.getIntrinsic(Int, Tys); @@ -6316,7 +6302,7 @@ static Value *packTBLDVectorList(CodeGenFunction &CGF, ArrayRef Ops, // Build a vector containing sequential number like (0, 1, 2, ..., 15) SmallVector Indices; - llvm::VectorType *TblTy = cast(Ops[0]->getType()); + auto *TblTy = cast(Ops[0]->getType()); for (unsigned i = 0, e = TblTy->getNumElements(); i != e; ++i) { Indices.push_back(2*i); Indices.push_back(2*i+1); @@ -7157,10 +7143,9 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID, bool usgn = Type.isUnsigned(); bool rightShift = false; - llvm::VectorType *VTy = GetNeonType(this, Type, - getTarget().hasLegalHalfType(), - false, - getTarget().hasBFloat16Type()); + llvm::FixedVectorType *VTy = + GetNeonType(this, Type, getTarget().hasLegalHalfType(), false, + getTarget().hasBFloat16Type()); llvm::Type *Ty = VTy; if (!Ty) return nullptr; @@ -7368,7 +7353,7 @@ static llvm::Value *VectorUnzip(CGBuilderTy &Builder, llvm::Value *V, bool Odd) // or odds, as desired). 
SmallVector Indices; unsigned InputElements = - cast(V->getType())->getNumElements(); + cast(V->getType())->getNumElements(); for (unsigned i = 0; i < InputElements; i += 2) Indices.push_back(i + Odd); return Builder.CreateShuffleVector(V, llvm::UndefValue::get(V->getType()), @@ -7381,7 +7366,7 @@ static llvm::Value *VectorZip(CGBuilderTy &Builder, llvm::Value *V0, assert(V0->getType() == V1->getType() && "Can't zip different vector types"); SmallVector Indices; unsigned InputElements = - cast(V0->getType())->getNumElements(); + cast(V0->getType())->getNumElements(); for (unsigned i = 0; i < InputElements; i++) { Indices.push_back(i); Indices.push_back(i + InputElements); @@ -7577,7 +7562,7 @@ static Value *EmitAArch64TblBuiltinExpr(CodeGenFunction &CGF, unsigned BuiltinID // Determine the type of this overloaded NEON intrinsic. NeonTypeFlags Type = Result->getZExtValue(); - llvm::VectorType *Ty = GetNeonType(&CGF, Type); + llvm::FixedVectorType *Ty = GetNeonType(&CGF, Type); if (!Ty) return nullptr; @@ -9779,7 +9764,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, } } - llvm::VectorType *VTy = GetNeonType(this, Type); + llvm::FixedVectorType *VTy = GetNeonType(this, Type); llvm::Type *Ty = VTy; if (!Ty) return nullptr; @@ -9840,13 +9825,13 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "fmla"); } case NEON::BI__builtin_neon_vfma_laneq_v: { - llvm::VectorType *VTy = cast(Ty); + auto *VTy = cast(Ty); // v1f64 fma should be mapped to Neon scalar f64 fma if (VTy && VTy->getElementType() == DoubleTy) { Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy); Ops[1] = Builder.CreateBitCast(Ops[1], DoubleTy); - llvm::Type *VTy = GetNeonType(this, - NeonTypeFlags(NeonTypeFlags::Float64, false, true)); + llvm::FixedVectorType *VTy = + GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, true)); Ops[2] = Builder.CreateBitCast(Ops[2], VTy); Ops[2] = 
Builder.CreateExtractElement(Ops[2], Ops[3], "extract"); Value *Result; @@ -10214,8 +10199,8 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, if (BuiltinID == NEON::BI__builtin_neon_vmul_laneq_v) Quad = true; Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy); - llvm::Type *VTy = GetNeonType(this, - NeonTypeFlags(NeonTypeFlags::Float64, false, Quad)); + llvm::FixedVectorType *VTy = + GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, Quad)); Ops[1] = Builder.CreateBitCast(Ops[1], VTy); Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract"); Value *Result = Builder.CreateFMul(Ops[0], Ops[1]); @@ -11087,7 +11072,8 @@ static Value *EmitX86MaskedStore(CodeGenFunction &CGF, ArrayRef Ops, llvm::PointerType::getUnqual(Ops[1]->getType())); Value *MaskVec = getMaskVecValue( - CGF, Ops[2], cast(Ops[1]->getType())->getNumElements()); + CGF, Ops[2], + cast(Ops[1]->getType())->getNumElements()); return CGF.Builder.CreateMaskedStore(Ops[1], Ptr, Alignment, MaskVec); } @@ -11099,7 +11085,8 @@ static Value *EmitX86MaskedLoad(CodeGenFunction &CGF, ArrayRef Ops, llvm::PointerType::getUnqual(Ops[1]->getType())); Value *MaskVec = getMaskVecValue( - CGF, Ops[2], cast(Ops[1]->getType())->getNumElements()); + CGF, Ops[2], + cast(Ops[1]->getType())->getNumElements()); return CGF.Builder.CreateMaskedLoad(Ptr, Alignment, MaskVec, Ops[1]); } @@ -11113,7 +11100,8 @@ static Value *EmitX86ExpandLoad(CodeGenFunction &CGF, Value *Ptr = CGF.Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(PtrTy)); - Value *MaskVec = getMaskVecValue(CGF, Ops[2], ResultTy->getNumElements()); + Value *MaskVec = getMaskVecValue( + CGF, Ops[2], cast(ResultTy)->getNumElements()); llvm::Function *F = CGF.CGM.getIntrinsic(Intrinsic::masked_expandload, ResultTy); @@ -11123,7 +11111,7 @@ static Value *EmitX86ExpandLoad(CodeGenFunction &CGF, static Value *EmitX86CompressExpand(CodeGenFunction &CGF, ArrayRef Ops, bool IsCompress) { - auto *ResultTy = cast(Ops[1]->getType()); 
+ auto *ResultTy = cast(Ops[1]->getType()); Value *MaskVec = getMaskVecValue(CGF, Ops[2], ResultTy->getNumElements()); @@ -11135,7 +11123,7 @@ static Value *EmitX86CompressExpand(CodeGenFunction &CGF, static Value *EmitX86CompressStore(CodeGenFunction &CGF, ArrayRef Ops) { - auto *ResultTy = cast(Ops[1]->getType()); + auto *ResultTy = cast(Ops[1]->getType()); llvm::Type *PtrTy = ResultTy->getElementType(); // Cast the pointer to element type. @@ -11171,7 +11159,7 @@ static Value *EmitX86FunnelShift(CodeGenFunction &CGF, Value *Op0, Value *Op1, // Funnel shifts amounts are treated as modulo and types are all power-of-2 so // we only care about the lowest log2 bits anyway. if (Amt->getType() != Ty) { - unsigned NumElts = cast(Ty)->getNumElements(); + unsigned NumElts = cast(Ty)->getNumElements(); Amt = CGF.Builder.CreateIntCast(Amt, Ty->getScalarType(), false); Amt = CGF.Builder.CreateVectorSplat(NumElts, Amt); } @@ -11230,7 +11218,7 @@ static Value *EmitX86Select(CodeGenFunction &CGF, return Op0; Mask = getMaskVecValue( - CGF, Mask, cast(Op0->getType())->getNumElements()); + CGF, Mask, cast(Op0->getType())->getNumElements()); return CGF.Builder.CreateSelect(Mask, Op0, Op1); } @@ -11277,7 +11265,7 @@ static Value *EmitX86MaskedCompare(CodeGenFunction &CGF, unsigned CC, assert((Ops.size() == 2 || Ops.size() == 4) && "Unexpected number of arguments"); unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); Value *Cmp; if (CC == 3) { @@ -11554,7 +11542,8 @@ static Value *EmitX86Ternlog(CodeGenFunction &CGF, bool ZeroMask, static Value *EmitX86SExtMask(CodeGenFunction &CGF, Value *Op, llvm::Type *DstTy) { - unsigned NumberOfElements = cast(DstTy)->getNumElements(); + unsigned NumberOfElements = + cast(DstTy)->getNumElements(); Value *Mask = getMaskVecValue(CGF, Op, NumberOfElements); return CGF.Builder.CreateSExt(Mask, DstTy, "vpmovm2"); } @@ -11590,11 +11579,12 @@ static Value *EmitX86CvtF16ToFloatExpr(CodeGenFunction 
&CGF, return CGF.Builder.CreateCall(F, {Ops[0], Ops[1], Ops[2], Ops[3]}); } - unsigned NumDstElts = cast(DstTy)->getNumElements(); + unsigned NumDstElts = cast(DstTy)->getNumElements(); Value *Src = Ops[0]; // Extract the subvector. - if (NumDstElts != cast(Src->getType())->getNumElements()) { + if (NumDstElts != + cast(Src->getType())->getNumElements()) { assert(NumDstElts == 4 && "Unexpected vector size"); Src = CGF.Builder.CreateShuffleVector(Src, UndefValue::get(Src->getType()), ArrayRef{0, 1, 2, 3}); @@ -11893,7 +11883,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_vec_ext_v8si: case X86::BI__builtin_ia32_vec_ext_v4di: { unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); uint64_t Index = cast(Ops[1])->getZExtValue(); Index &= NumElts - 1; // These builtins exist so we can ensure the index is an ICE and in range. @@ -11909,7 +11899,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_vec_set_v8si: case X86::BI__builtin_ia32_vec_set_v4di: { unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); unsigned Index = cast(Ops[2])->getZExtValue(); Index &= NumElts - 1; // These builtins exist so we can ensure the index is an ICE and in range. 
@@ -12335,9 +12325,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, break; } - unsigned MinElts = - std::min(cast(Ops[0]->getType())->getNumElements(), - cast(Ops[2]->getType())->getNumElements()); + unsigned MinElts = std::min( + cast(Ops[0]->getType())->getNumElements(), + cast(Ops[2]->getType())->getNumElements()); Ops[3] = getMaskVecValue(*this, Ops[3], MinElts); Function *Intr = CGM.getIntrinsic(IID); return Builder.CreateCall(Intr, Ops); @@ -12444,9 +12434,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, break; } - unsigned MinElts = - std::min(cast(Ops[2]->getType())->getNumElements(), - cast(Ops[3]->getType())->getNumElements()); + unsigned MinElts = std::min( + cast(Ops[2]->getType())->getNumElements(), + cast(Ops[3]->getType())->getNumElements()); Ops[1] = getMaskVecValue(*this, Ops[1], MinElts); Function *Intr = CGM.getIntrinsic(IID); return Builder.CreateCall(Intr, Ops); @@ -12468,10 +12458,10 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_extracti64x2_256_mask: case X86::BI__builtin_ia32_extractf64x2_512_mask: case X86::BI__builtin_ia32_extracti64x2_512_mask: { - auto *DstTy = cast(ConvertType(E->getType())); + auto *DstTy = cast(ConvertType(E->getType())); unsigned NumElts = DstTy->getNumElements(); unsigned SrcNumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); unsigned SubVectors = SrcNumElts / NumElts; unsigned Index = cast(Ops[1])->getZExtValue(); assert(llvm::isPowerOf2_32(SubVectors) && "Expected power of 2 subvectors"); @@ -12509,9 +12499,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_insertf64x2_512: case X86::BI__builtin_ia32_inserti64x2_512: { unsigned DstNumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); unsigned SrcNumElts = - cast(Ops[1]->getType())->getNumElements(); + cast(Ops[1]->getType())->getNumElements(); 
unsigned SubVectors = DstNumElts / SrcNumElts; unsigned Index = cast(Ops[2])->getZExtValue(); assert(llvm::isPowerOf2_32(SubVectors) && "Expected power of 2 subvectors"); @@ -12576,7 +12566,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_pblendd128: case X86::BI__builtin_ia32_pblendd256: { unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); unsigned Imm = cast(Ops[2])->getZExtValue(); int Indices[16]; @@ -12593,7 +12583,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_pshuflw256: case X86::BI__builtin_ia32_pshuflw512: { uint32_t Imm = cast(Ops[1])->getZExtValue(); - auto *Ty = cast(Ops[0]->getType()); + auto *Ty = cast(Ops[0]->getType()); unsigned NumElts = Ty->getNumElements(); // Splat the 8-bits of immediate 4 times to help the loop wrap around. @@ -12617,7 +12607,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_pshufhw256: case X86::BI__builtin_ia32_pshufhw512: { uint32_t Imm = cast(Ops[1])->getZExtValue(); - auto *Ty = cast(Ops[0]->getType()); + auto *Ty = cast(Ops[0]->getType()); unsigned NumElts = Ty->getNumElements(); // Splat the 8-bits of immediate 4 times to help the loop wrap around. 
@@ -12647,7 +12637,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_vpermilpd512: case X86::BI__builtin_ia32_vpermilps512: { uint32_t Imm = cast(Ops[1])->getZExtValue(); - auto *Ty = cast(Ops[0]->getType()); + auto *Ty = cast(Ops[0]->getType()); unsigned NumElts = Ty->getNumElements(); unsigned NumLanes = Ty->getPrimitiveSizeInBits() / 128; unsigned NumLaneElts = NumElts / NumLanes; @@ -12674,7 +12664,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_shufps256: case X86::BI__builtin_ia32_shufps512: { uint32_t Imm = cast(Ops[2])->getZExtValue(); - auto *Ty = cast(Ops[0]->getType()); + auto *Ty = cast(Ops[0]->getType()); unsigned NumElts = Ty->getNumElements(); unsigned NumLanes = Ty->getPrimitiveSizeInBits() / 128; unsigned NumLaneElts = NumElts / NumLanes; @@ -12702,7 +12692,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_permdi512: case X86::BI__builtin_ia32_permdf512: { unsigned Imm = cast(Ops[1])->getZExtValue(); - auto *Ty = cast(Ops[0]->getType()); + auto *Ty = cast(Ops[0]->getType()); unsigned NumElts = Ty->getNumElements(); // These intrinsics operate on 256-bit lanes of four 64-bit elements. @@ -12721,7 +12711,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, unsigned ShiftVal = cast(Ops[2])->getZExtValue() & 0xff; unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); assert(NumElts % 16 == 0); // If palignr is shifting the pair of vectors more than the size of two @@ -12759,7 +12749,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_alignq256: case X86::BI__builtin_ia32_alignq512: { unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); unsigned ShiftVal = cast(Ops[2])->getZExtValue() & 0xff; // Mask the shift amount to width of two vectors. 
@@ -12782,7 +12772,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_shuf_i32x4: case X86::BI__builtin_ia32_shuf_i64x2: { unsigned Imm = cast(Ops[2])->getZExtValue(); - auto *Ty = cast(Ops[0]->getType()); + auto *Ty = cast(Ops[0]->getType()); unsigned NumElts = Ty->getNumElements(); unsigned NumLanes = Ty->getPrimitiveSizeInBits() == 512 ? 4 : 2; unsigned NumLaneElts = NumElts / NumLanes; @@ -12809,7 +12799,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_permti256: { unsigned Imm = cast(Ops[2])->getZExtValue(); unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); // This takes a very simple approach since there are two lanes and a // shuffle can have 2 inputs. So we reserve the first input for the first @@ -12847,7 +12837,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_pslldqi256_byteshift: case X86::BI__builtin_ia32_pslldqi512_byteshift: { unsigned ShiftVal = cast(Ops[1])->getZExtValue() & 0xff; - auto *ResultType = cast(Ops[0]->getType()); + auto *ResultType = cast(Ops[0]->getType()); // Builtin type is vXi64 so multiply by 8 to get bytes. unsigned NumElts = ResultType->getNumElements() * 8; @@ -12877,7 +12867,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_psrldqi256_byteshift: case X86::BI__builtin_ia32_psrldqi512_byteshift: { unsigned ShiftVal = cast(Ops[1])->getZExtValue() & 0xff; - auto *ResultType = cast(Ops[0]->getType()); + auto *ResultType = cast(Ops[0]->getType()); // Builtin type is vXi64 so multiply by 8 to get bytes. 
unsigned NumElts = ResultType->getNumElements() * 8; @@ -13524,7 +13514,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_fpclasspd256_mask: case X86::BI__builtin_ia32_fpclasspd512_mask: { unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); Value *MaskIn = Ops[2]; Ops.erase(&Ops[2]); @@ -13562,7 +13552,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_vp2intersect_d_256: case X86::BI__builtin_ia32_vp2intersect_d_128: { unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); Intrinsic::ID ID; switch (BuiltinID) { @@ -13621,7 +13611,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_vpshufbitqmb256_mask: case X86::BI__builtin_ia32_vpshufbitqmb512_mask: { unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); Value *MaskIn = Ops[2]; Ops.erase(&Ops[2]); @@ -13768,7 +13758,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, Function *Intr = CGM.getIntrinsic(IID); if (IsMaskFCmp) { unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); Ops[3] = getMaskVecValue(*this, Ops[3], NumElts); Value *Cmp = Builder.CreateCall(Intr, Ops); return EmitX86MaskedCompareResult(*this, Cmp, NumElts, nullptr); @@ -13783,7 +13773,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, // We ignore SAE if strict FP is disabled. We only keep precise // exception behavior under strict FP. 
unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); Value *Cmp; if (IsSignaling) Cmp = Builder.CreateFCmpS(Pred, Ops[0], Ops[1]); @@ -13841,7 +13831,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_cvtneps2bf16_128_mask: { Ops[2] = getMaskVecValue( *this, Ops[2], - cast(Ops[0]->getType())->getNumElements()); + cast(Ops[0]->getType())->getNumElements()); Intrinsic::ID IID = Intrinsic::x86_avx512bf16_mask_cvtneps2bf16_128; return Builder.CreateCall(CGM.getIntrinsic(IID), Ops); } diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index 6bcbdc03dcd1a..11ce9b4b45c06 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -727,23 +727,39 @@ llvm::DIType *CGDebugInfo::CreateType(const BuiltinType *BT) { case BuiltinType::Id: \ return getOrCreateStructPtrType("opencl_" #ExtType, Id##Ty); #include "clang/Basic/OpenCLExtensionTypes.def" - // TODO: real support for SVE types requires more infrastructure - // to be added first. The types have a variable length and are - // represented in debug info as types whose length depends on a - // target-specific pseudo register. -#define SVE_TYPE(Name, Id, SingletonId) \ - case BuiltinType::Id: + +#define SVE_TYPE(Name, Id, SingletonId) case BuiltinType::Id: #include "clang/Basic/AArch64SVEACLETypes.def" - { - unsigned DiagID = CGM.getDiags().getCustomDiagID( - DiagnosticsEngine::Error, - "cannot yet generate debug info for SVE type '%0'"); - auto Name = BT->getName(CGM.getContext().getPrintingPolicy()); - CGM.getDiags().Report(DiagID) << Name; - // Return something safe. 
- return CreateType(cast(CGM.getContext().IntTy)); - } + { + ASTContext::BuiltinVectorTypeInfo Info = + CGM.getContext().getBuiltinVectorTypeInfo(BT); + unsigned NumElemsPerVG = (Info.EC.Min * Info.NumVectors) / 2; + + // Debuggers can't extract 1bit from a vector, so will display a + // bitpattern for svbool_t instead. + if (Info.ElementType == CGM.getContext().BoolTy) { + NumElemsPerVG /= 8; + Info.ElementType = CGM.getContext().UnsignedCharTy; + } + auto *LowerBound = + llvm::ConstantAsMetadata::get(llvm::ConstantInt::getSigned( + llvm::Type::getInt64Ty(CGM.getLLVMContext()), 0)); + SmallVector Expr( + {llvm::dwarf::DW_OP_constu, NumElemsPerVG, llvm::dwarf::DW_OP_bregx, + /* AArch64::VG */ 46, 0, llvm::dwarf::DW_OP_mul, + llvm::dwarf::DW_OP_constu, 1, llvm::dwarf::DW_OP_minus}); + auto *UpperBound = DBuilder.createExpression(Expr); + + llvm::Metadata *Subscript = DBuilder.getOrCreateSubrange( + /*count*/ nullptr, LowerBound, UpperBound, /*stride*/ nullptr); + llvm::DINodeArray SubscriptArray = DBuilder.getOrCreateArray(Subscript); + llvm::DIType *ElemTy = + getOrCreateType(Info.ElementType, TheCU->getFile()); + auto Align = getTypeAlignIfRequired(BT, CGM.getContext()); + return DBuilder.createVectorType(/*Size*/ 0, Align, ElemTy, + SubscriptArray); + } case BuiltinType::UChar: case BuiltinType::Char_U: Encoding = llvm::dwarf::DW_ATE_unsigned_char; @@ -2267,6 +2283,25 @@ static bool hasExplicitMemberDefinition(CXXRecordDecl::method_iterator I, return false; } +static bool canUseCtorHoming(const CXXRecordDecl *RD) { + // Constructor homing can be used for classes that have at least one + // constructor and have no trivial or constexpr constructors. + // Skip this optimization if the class or any of its methods are marked + // dllimport. 
+ if (RD->isLambda() || RD->hasConstexprNonCopyMoveConstructor() || + isClassOrMethodDLLImport(RD)) + return false; + + if (RD->ctors().empty()) + return false; + + for (const auto *Ctor : RD->ctors()) + if (Ctor->isTrivial() && !Ctor->isCopyOrMoveConstructor()) + return false; + + return true; +} + static bool shouldOmitDefinition(codegenoptions::DebugInfoKind DebugKind, bool DebugTypeExtRefs, const RecordDecl *RD, const LangOptions &LangOpts) { @@ -2301,23 +2336,6 @@ static bool shouldOmitDefinition(codegenoptions::DebugInfoKind DebugKind, !isClassOrMethodDLLImport(CXXDecl)) return true; - // In constructor debug mode, only emit debug info for a class when its - // constructor is emitted. Skip this optimization if the class or any of - // its methods are marked dllimport. - // - // This applies to classes that don't have any trivial constructors and have - // at least one constructor. - if (DebugKind == codegenoptions::DebugInfoConstructor && - !CXXDecl->isLambda() && !CXXDecl->hasConstexprNonCopyMoveConstructor() && - !isClassOrMethodDLLImport(CXXDecl)) { - if (CXXDecl->ctors().empty()) - return false; - for (const auto *Ctor : CXXDecl->ctors()) - if (Ctor->isTrivial() && !Ctor->isCopyOrMoveConstructor()) - return false; - return true; - } - TemplateSpecializationKind Spec = TSK_Undeclared; if (const auto *SD = dyn_cast(RD)) Spec = SD->getSpecializationKind(); @@ -2327,6 +2345,12 @@ static bool shouldOmitDefinition(codegenoptions::DebugInfoKind DebugKind, CXXDecl->method_end())) return true; + // In constructor homing mode, only emit complete debug info for a class + // when its constructor is emitted. + if ((DebugKind == codegenoptions::DebugInfoConstructor) && + canUseCtorHoming(CXXDecl)) + return true; + return false; } @@ -2552,12 +2576,11 @@ llvm::DIModule *CGDebugInfo::getOrCreateModuleRef(ASTSourceDescriptor Mod, // We use the lower 64 bits for debug info. 
uint64_t Signature = 0; - if (const auto &ModSig = Mod.getSignature()) { - for (unsigned I = 0; I != sizeof(Signature); ++I) - Signature |= (uint64_t)ModSig[I] << (I * 8); - } else { + if (const auto &ModSig = Mod.getSignature()) + Signature = ModSig.truncatedValue(); + else Signature = ~1ULL; - } + llvm::DIBuilder DIB(CGM.getModule()); SmallString<0> PCM; if (!llvm::sys::path::is_absolute(Mod.getASTFile())) diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index 4a29dbf7f8953..db7a0751c4b19 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -1684,7 +1684,7 @@ llvm::Value *CodeGenFunction::EmitLoadOfScalar(Address Addr, bool Volatile, if (Ty->isVectorType()) { const llvm::Type *EltTy = Addr.getElementType(); - const auto *VTy = cast(EltTy); + const auto *VTy = cast(EltTy); // Handle vectors of size 3 like size 4 for better performance. if (VTy->getNumElements() == 3) { @@ -1769,8 +1769,9 @@ static Address MaybeConvertMatrixAddress(Address Addr, CodeGenFunction &CGF, auto *VectorTy = dyn_cast( cast(Addr.getPointer()->getType())->getElementType()); if (VectorTy && !IsVector) { - auto *ArrayTy = llvm::ArrayType::get(VectorTy->getElementType(), - VectorTy->getNumElements()); + auto *ArrayTy = llvm::ArrayType::get( + VectorTy->getElementType(), + cast(VectorTy)->getNumElements()); return Address(CGF.Builder.CreateElementBitCast(Addr, ArrayTy)); } @@ -1801,7 +1802,7 @@ void CodeGenFunction::EmitStoreOfScalar(llvm::Value *Value, Address Addr, llvm::Type *SrcTy = Value->getType(); auto *VecTy = dyn_cast(SrcTy); // Handle vec3 special. - if (VecTy && VecTy->getNumElements() == 3) { + if (VecTy && cast(VecTy)->getNumElements() == 3) { // Our source is a vec3, do a shuffle vector to make it a vec4. 
Value = Builder.CreateShuffleVector(Value, llvm::UndefValue::get(VecTy), ArrayRef{0, 1, 2, -1}, @@ -2226,7 +2227,7 @@ void CodeGenFunction::EmitStoreThroughExtVectorComponentLValue(RValue Src, if (const VectorType *VTy = Dst.getType()->getAs()) { unsigned NumSrcElts = VTy->getNumElements(); unsigned NumDstElts = - cast(Vec->getType())->getNumElements(); + cast(Vec->getType())->getNumElements(); if (NumDstElts == NumSrcElts) { // Use shuffle vector is the src and destination are the same number of // elements and restore the vector mask since it is on the side it will be diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index ff997bc53ddfc..28da97382321d 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -32,6 +32,7 @@ #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/FixedPointBuilder.h" #include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalVariable.h" @@ -356,11 +357,6 @@ class ScalarExprEmitter /// and an integer. Value *EmitFixedPointConversion(Value *Src, QualType SrcTy, QualType DstTy, SourceLocation Loc); - Value *EmitFixedPointConversion(Value *Src, - llvm::FixedPointSemantics &SrcFixedSema, - llvm::FixedPointSemantics &DstFixedSema, - SourceLocation Loc, - bool DstIsInteger = false); /// Emit a conversion from the specified complex type to the specified /// destination type, where the destination type is an LLVM scalar type. 
@@ -1324,7 +1320,7 @@ Value *ScalarExprEmitter::EmitScalarConversion(Value *Src, QualType SrcType, "Splatted expr doesn't match with vector element type?"); // Splat the element across to all elements - unsigned NumElements = cast(DstTy)->getNumElements(); + unsigned NumElements = cast(DstTy)->getNumElements(); return Builder.CreateVectorSplat(NumElements, Src, "splat"); } @@ -1447,91 +1443,17 @@ Value *ScalarExprEmitter::EmitFixedPointConversion(Value *Src, QualType SrcTy, SourceLocation Loc) { auto SrcFPSema = CGF.getContext().getFixedPointSemantics(SrcTy); auto DstFPSema = CGF.getContext().getFixedPointSemantics(DstTy); - return EmitFixedPointConversion(Src, SrcFPSema, DstFPSema, Loc, - DstTy->isIntegerType()); -} - -Value *ScalarExprEmitter::EmitFixedPointConversion( - Value *Src, llvm::FixedPointSemantics &SrcFPSema, - llvm::FixedPointSemantics &DstFPSema, - SourceLocation Loc, bool DstIsInteger) { - using llvm::APFixedPoint; - using llvm::APInt; - using llvm::ConstantInt; - using llvm::Value; - - unsigned SrcWidth = SrcFPSema.getWidth(); - unsigned DstWidth = DstFPSema.getWidth(); - unsigned SrcScale = SrcFPSema.getScale(); - unsigned DstScale = DstFPSema.getScale(); - bool SrcIsSigned = SrcFPSema.isSigned(); - bool DstIsSigned = DstFPSema.isSigned(); - - llvm::Type *DstIntTy = Builder.getIntNTy(DstWidth); - - Value *Result = Src; - unsigned ResultWidth = SrcWidth; - - // Downscale. - if (DstScale < SrcScale) { - // When converting to integers, we round towards zero. For negative numbers, - // right shifting rounds towards negative infinity. In this case, we can - // just round up before shifting. 
- if (DstIsInteger && SrcIsSigned) { - Value *Zero = llvm::Constant::getNullValue(Result->getType()); - Value *IsNegative = Builder.CreateICmpSLT(Result, Zero); - Value *LowBits = ConstantInt::get( - CGF.getLLVMContext(), APInt::getLowBitsSet(ResultWidth, SrcScale)); - Value *Rounded = Builder.CreateAdd(Result, LowBits); - Result = Builder.CreateSelect(IsNegative, Rounded, Result); - } - - Result = SrcIsSigned - ? Builder.CreateAShr(Result, SrcScale - DstScale, "downscale") - : Builder.CreateLShr(Result, SrcScale - DstScale, "downscale"); - } - - if (!DstFPSema.isSaturated()) { - // Resize. - Result = Builder.CreateIntCast(Result, DstIntTy, SrcIsSigned, "resize"); - - // Upscale. - if (DstScale > SrcScale) - Result = Builder.CreateShl(Result, DstScale - SrcScale, "upscale"); - } else { - // Adjust the number of fractional bits. - if (DstScale > SrcScale) { - // Compare to DstWidth to prevent resizing twice. - ResultWidth = std::max(SrcWidth + DstScale - SrcScale, DstWidth); - llvm::Type *UpscaledTy = Builder.getIntNTy(ResultWidth); - Result = Builder.CreateIntCast(Result, UpscaledTy, SrcIsSigned, "resize"); - Result = Builder.CreateShl(Result, DstScale - SrcScale, "upscale"); - } - - // Handle saturation. - bool LessIntBits = DstFPSema.getIntegralBits() < SrcFPSema.getIntegralBits(); - if (LessIntBits) { - Value *Max = ConstantInt::get( - CGF.getLLVMContext(), - APFixedPoint::getMax(DstFPSema).getValue().extOrTrunc(ResultWidth)); - Value *TooHigh = SrcIsSigned ? Builder.CreateICmpSGT(Result, Max) - : Builder.CreateICmpUGT(Result, Max); - Result = Builder.CreateSelect(TooHigh, Max, Result, "satmax"); - } - // Cannot overflow min to dest type if src is unsigned since all fixed - // point types can cover the unsigned min of 0. 
- if (SrcIsSigned && (LessIntBits || !DstIsSigned)) { - Value *Min = ConstantInt::get( - CGF.getLLVMContext(), - APFixedPoint::getMin(DstFPSema).getValue().extOrTrunc(ResultWidth)); - Value *TooLow = Builder.CreateICmpSLT(Result, Min); - Result = Builder.CreateSelect(TooLow, Min, Result, "satmin"); - } - - // Resize the integer part to get the final destination size. - if (ResultWidth != DstWidth) - Result = Builder.CreateIntCast(Result, DstIntTy, SrcIsSigned, "resize"); - } + llvm::FixedPointBuilder FPBuilder(Builder); + llvm::Value *Result; + if (DstTy->isIntegerType()) + Result = FPBuilder.CreateFixedToInteger(Src, SrcFPSema, + DstFPSema.getWidth(), + DstFPSema.isSigned()); + else if (SrcTy->isIntegerType()) + Result = FPBuilder.CreateIntegerToFixed(Src, SrcFPSema.isSigned(), + DstFPSema); + else + Result = FPBuilder.CreateFixedToFixed(Src, SrcFPSema, DstFPSema); return Result; } @@ -1631,12 +1553,12 @@ Value *ScalarExprEmitter::VisitShuffleVectorExpr(ShuffleVectorExpr *E) { Value *RHS = CGF.EmitScalarExpr(E->getExpr(1)); Value *Mask; - llvm::VectorType *LTy = cast(LHS->getType()); + auto *LTy = cast(LHS->getType()); unsigned LHSElts = LTy->getNumElements(); Mask = RHS; - llvm::VectorType *MTy = cast(Mask->getType()); + auto *MTy = cast(Mask->getType()); // Mask off the high bits of each shuffle index. Value *MaskBits = @@ -1841,7 +1763,7 @@ Value *ScalarExprEmitter::VisitInitListExpr(InitListExpr *E) { return Visit(E->getInit(0)); } - unsigned ResElts = VType->getNumElements(); + unsigned ResElts = cast(VType)->getNumElements(); // Loop over initializers collecting the Value for each, and remembering // whether the source was swizzle (ExtVectorElementExpr). 
This will allow @@ -1865,7 +1787,8 @@ Value *ScalarExprEmitter::VisitInitListExpr(InitListExpr *E) { if (isa(IE)) { llvm::ExtractElementInst *EI = cast(Init); - if (EI->getVectorOperandType()->getNumElements() == ResElts) { + if (cast(EI->getVectorOperandType()) + ->getNumElements() == ResElts) { llvm::ConstantInt *C = cast(EI->getIndexOperand()); Value *LHS = nullptr, *RHS = nullptr; if (CurIdx == 0) { @@ -1903,7 +1826,7 @@ Value *ScalarExprEmitter::VisitInitListExpr(InitListExpr *E) { continue; } - unsigned InitElts = VVT->getNumElements(); + unsigned InitElts = cast(VVT)->getNumElements(); // If the initializer is an ExtVecEltExpr (a swizzle), and the swizzle's // input is the same width as the vector being constructed, generate an @@ -1912,7 +1835,7 @@ Value *ScalarExprEmitter::VisitInitListExpr(InitListExpr *E) { if (isa(IE)) { llvm::ShuffleVectorInst *SVI = cast(Init); Value *SVOp = SVI->getOperand(0); - llvm::VectorType *OpTy = cast(SVOp->getType()); + auto *OpTy = cast(SVOp->getType()); if (OpTy->getNumElements() == ResElts) { for (unsigned j = 0; j != CurIdx; ++j) { @@ -2271,7 +2194,7 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) { llvm::Type *DstTy = ConvertType(DestTy); Value *Elt = Visit(const_cast(E)); // Splat the element across to all elements - unsigned NumElements = cast(DstTy)->getNumElements(); + unsigned NumElements = cast(DstTy)->getNumElements(); return Builder.CreateVectorSplat(NumElements, Elt, "splat"); } @@ -2691,12 +2614,9 @@ ScalarExprEmitter::EmitScalarPrePostIncDec(const UnaryOperator *E, LValue LV, // Now, convert from our invented integer literal to the type of the unary // op. This will upscale and saturate if necessary. This value can become // undef in some cases. 
- auto SrcSema = - llvm::FixedPointSemantics::GetIntegerSemantics( - value->getType()->getScalarSizeInBits(), /*IsSigned=*/true); + llvm::FixedPointBuilder FPBuilder(Builder); auto DstSema = CGF.getContext().getFixedPointSemantics(Info.Ty); - Info.RHS = EmitFixedPointConversion(Info.RHS, SrcSema, DstSema, - E->getExprLoc()); + Info.RHS = FPBuilder.CreateIntegerToFixed(Info.RHS, true, DstSema); value = EmitFixedPointBinOp(Info); // Objective-C pointer types. @@ -3666,91 +3586,52 @@ Value *ScalarExprEmitter::EmitFixedPointBinOp(const BinOpInfo &op) { auto ResultFixedSema = Ctx.getFixedPointSemantics(ResultTy); auto CommonFixedSema = LHSFixedSema.getCommonSemantics(RHSFixedSema); - // Convert the operands to the full precision type. - Value *FullLHS = EmitFixedPointConversion(LHS, LHSFixedSema, CommonFixedSema, - op.E->getExprLoc()); - Value *FullRHS = EmitFixedPointConversion(RHS, RHSFixedSema, CommonFixedSema, - op.E->getExprLoc()); - // Perform the actual operation. Value *Result; + llvm::FixedPointBuilder FPBuilder(Builder); switch (op.Opcode) { case BO_AddAssign: - case BO_Add: { - if (CommonFixedSema.isSaturated()) { - llvm::Intrinsic::ID IID = CommonFixedSema.isSigned() - ? llvm::Intrinsic::sadd_sat - : llvm::Intrinsic::uadd_sat; - Result = Builder.CreateBinaryIntrinsic(IID, FullLHS, FullRHS); - } else { - Result = Builder.CreateAdd(FullLHS, FullRHS); - } + case BO_Add: + Result = FPBuilder.CreateAdd(LHS, LHSFixedSema, RHS, RHSFixedSema); break; - } case BO_SubAssign: - case BO_Sub: { - if (CommonFixedSema.isSaturated()) { - llvm::Intrinsic::ID IID = CommonFixedSema.isSigned() - ? 
llvm::Intrinsic::ssub_sat - : llvm::Intrinsic::usub_sat; - Result = Builder.CreateBinaryIntrinsic(IID, FullLHS, FullRHS); - } else { - Result = Builder.CreateSub(FullLHS, FullRHS); - } + case BO_Sub: + Result = FPBuilder.CreateSub(LHS, LHSFixedSema, RHS, RHSFixedSema); break; - } case BO_MulAssign: - case BO_Mul: { - llvm::Intrinsic::ID IID; - if (CommonFixedSema.isSaturated()) - IID = CommonFixedSema.isSigned() ? llvm::Intrinsic::smul_fix_sat - : llvm::Intrinsic::umul_fix_sat; - else - IID = CommonFixedSema.isSigned() ? llvm::Intrinsic::smul_fix - : llvm::Intrinsic::umul_fix; - Result = Builder.CreateIntrinsic(IID, {FullLHS->getType()}, - {FullLHS, FullRHS, Builder.getInt32(CommonFixedSema.getScale())}); + case BO_Mul: + Result = FPBuilder.CreateMul(LHS, LHSFixedSema, RHS, RHSFixedSema); break; - } case BO_DivAssign: - case BO_Div: { - llvm::Intrinsic::ID IID; - if (CommonFixedSema.isSaturated()) - IID = CommonFixedSema.isSigned() ? llvm::Intrinsic::sdiv_fix_sat - : llvm::Intrinsic::udiv_fix_sat; - else - IID = CommonFixedSema.isSigned() ? llvm::Intrinsic::sdiv_fix - : llvm::Intrinsic::udiv_fix; - Result = Builder.CreateIntrinsic(IID, {FullLHS->getType()}, - {FullLHS, FullRHS, Builder.getInt32(CommonFixedSema.getScale())}); - break; - } + case BO_Div: + Result = FPBuilder.CreateDiv(LHS, LHSFixedSema, RHS, RHSFixedSema); + break; + case BO_ShlAssign: + case BO_Shl: + Result = FPBuilder.CreateShl(LHS, LHSFixedSema, RHS); + break; + case BO_ShrAssign: + case BO_Shr: + Result = FPBuilder.CreateShr(LHS, LHSFixedSema, RHS); + break; case BO_LT: - return CommonFixedSema.isSigned() ? Builder.CreateICmpSLT(FullLHS, FullRHS) - : Builder.CreateICmpULT(FullLHS, FullRHS); + return FPBuilder.CreateLT(LHS, LHSFixedSema, RHS, RHSFixedSema); case BO_GT: - return CommonFixedSema.isSigned() ? 
Builder.CreateICmpSGT(FullLHS, FullRHS) - : Builder.CreateICmpUGT(FullLHS, FullRHS); + return FPBuilder.CreateGT(LHS, LHSFixedSema, RHS, RHSFixedSema); case BO_LE: - return CommonFixedSema.isSigned() ? Builder.CreateICmpSLE(FullLHS, FullRHS) - : Builder.CreateICmpULE(FullLHS, FullRHS); + return FPBuilder.CreateLE(LHS, LHSFixedSema, RHS, RHSFixedSema); case BO_GE: - return CommonFixedSema.isSigned() ? Builder.CreateICmpSGE(FullLHS, FullRHS) - : Builder.CreateICmpUGE(FullLHS, FullRHS); + return FPBuilder.CreateGE(LHS, LHSFixedSema, RHS, RHSFixedSema); case BO_EQ: // For equality operations, we assume any padding bits on unsigned types are // zero'd out. They could be overwritten through non-saturating operations // that cause overflow, but this leads to undefined behavior. - return Builder.CreateICmpEQ(FullLHS, FullRHS); + return FPBuilder.CreateEQ(LHS, LHSFixedSema, RHS, RHSFixedSema); case BO_NE: - return Builder.CreateICmpNE(FullLHS, FullRHS); - case BO_Shl: - case BO_Shr: + return FPBuilder.CreateNE(LHS, LHSFixedSema, RHS, RHSFixedSema); case BO_Cmp: case BO_LAnd: case BO_LOr: - case BO_ShlAssign: - case BO_ShrAssign: llvm_unreachable("Found unimplemented fixed point binary operation"); case BO_PtrMemD: case BO_PtrMemI: @@ -3767,9 +3648,12 @@ Value *ScalarExprEmitter::EmitFixedPointBinOp(const BinOpInfo &op) { llvm_unreachable("Found unsupported binary operation for fixed point types."); } + bool IsShift = BinaryOperator::isShiftOp(op.Opcode) || + BinaryOperator::isShiftAssignOp(op.Opcode); // Convert to the result type. - return EmitFixedPointConversion(Result, CommonFixedSema, ResultFixedSema, - op.E->getExprLoc()); + return FPBuilder.CreateFixedToFixed(Result, IsShift ? 
LHSFixedSema + : CommonFixedSema, + ResultFixedSema); } Value *ScalarExprEmitter::EmitSub(const BinOpInfo &op) { @@ -3896,16 +3780,24 @@ Value *ScalarExprEmitter::ConstrainShiftValue(Value *LHS, Value *RHS, } Value *ScalarExprEmitter::EmitShl(const BinOpInfo &Ops) { + // TODO: This misses out on the sanitizer check below. + if (Ops.isFixedPointOp()) + return EmitFixedPointBinOp(Ops); + // LLVM requires the LHS and RHS to be the same type: promote or truncate the // RHS to the same size as the LHS. Value *RHS = Ops.RHS; if (Ops.LHS->getType() != RHS->getType()) RHS = Builder.CreateIntCast(RHS, Ops.LHS->getType(), false, "sh_prom"); - bool SanitizeBase = CGF.SanOpts.has(SanitizerKind::ShiftBase) && - Ops.Ty->hasSignedIntegerRepresentation() && - !CGF.getLangOpts().isSignedOverflowDefined() && - !CGF.getLangOpts().CPlusPlus20; + bool SanitizeSignedBase = CGF.SanOpts.has(SanitizerKind::ShiftBase) && + Ops.Ty->hasSignedIntegerRepresentation() && + !CGF.getLangOpts().isSignedOverflowDefined() && + !CGF.getLangOpts().CPlusPlus20; + bool SanitizeUnsignedBase = + CGF.SanOpts.has(SanitizerKind::UnsignedShiftBase) && + Ops.Ty->hasUnsignedIntegerRepresentation(); + bool SanitizeBase = SanitizeSignedBase || SanitizeUnsignedBase; bool SanitizeExponent = CGF.SanOpts.has(SanitizerKind::ShiftExponent); // OpenCL 6.3j: shift values are effectively % word size of LHS. if (CGF.getLangOpts().OpenCL) @@ -3938,11 +3830,12 @@ Value *ScalarExprEmitter::EmitShl(const BinOpInfo &Ops) { Ops.LHS, Builder.CreateSub(PromotedWidthMinusOne, RHS, "shl.zeros", /*NUW*/ true, /*NSW*/ true), "shl.check"); - if (CGF.getLangOpts().CPlusPlus) { + if (SanitizeUnsignedBase || CGF.getLangOpts().CPlusPlus) { // In C99, we are not permitted to shift a 1 bit into the sign bit. // Under C++11's rules, shifting a 1 bit into the sign bit is // OK, but shifting a 1 bit out of it is not. (C89 and C++03 don't // define signed left shifts, so we use the C99 and C++11 rules there). 
+ // Unsigned shifts can always shift into the top bit. llvm::Value *One = llvm::ConstantInt::get(BitsShiftedOff->getType(), 1); BitsShiftedOff = Builder.CreateLShr(BitsShiftedOff, One); } @@ -3952,7 +3845,9 @@ Value *ScalarExprEmitter::EmitShl(const BinOpInfo &Ops) { llvm::PHINode *BaseCheck = Builder.CreatePHI(ValidBase->getType(), 2); BaseCheck->addIncoming(Builder.getTrue(), Orig); BaseCheck->addIncoming(ValidBase, CheckShiftBase); - Checks.push_back(std::make_pair(BaseCheck, SanitizerKind::ShiftBase)); + Checks.push_back(std::make_pair( + BaseCheck, SanitizeSignedBase ? SanitizerKind::ShiftBase + : SanitizerKind::UnsignedShiftBase)); } assert(!Checks.empty()); @@ -3963,6 +3858,10 @@ Value *ScalarExprEmitter::EmitShl(const BinOpInfo &Ops) { } Value *ScalarExprEmitter::EmitShr(const BinOpInfo &Ops) { + // TODO: This misses out on the sanitizer check below. + if (Ops.isFixedPointOp()) + return EmitFixedPointBinOp(Ops); + // LLVM requires the LHS and RHS to be the same type: promote or truncate the // RHS to the same size as the LHS. Value *RHS = Ops.RHS; @@ -4517,7 +4416,7 @@ VisitAbstractConditionalOperator(const AbstractConditionalOperator *E) { llvm::Value *RHS = Visit(rhsExpr); llvm::Type *condType = ConvertType(condExpr->getType()); - llvm::VectorType *vecTy = cast(condType); + auto *vecTy = cast(condType); unsigned numElem = vecTy->getNumElements(); llvm::Type *elemType = vecTy->getElementType(); @@ -4737,10 +4636,14 @@ Value *ScalarExprEmitter::VisitAsTypeExpr(AsTypeExpr *E) { llvm::Type *DstTy = ConvertType(E->getType()); llvm::Type *SrcTy = Src->getType(); - unsigned NumElementsSrc = isa(SrcTy) ? - cast(SrcTy)->getNumElements() : 0; - unsigned NumElementsDst = isa(DstTy) ? - cast(DstTy)->getNumElements() : 0; + unsigned NumElementsSrc = + isa(SrcTy) + ? cast(SrcTy)->getNumElements() + : 0; + unsigned NumElementsDst = + isa(DstTy) + ? 
cast(DstTy)->getNumElements() + : 0; // Going from vec3 to non-vec3 is a special case and requires a shuffle // vector to get a vec4, then a bitcast if the target type is different. diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index df0426a76cc65..257343f678388 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -1888,9 +1888,11 @@ void CodeGenModule::setNonAliasAttributes(GlobalDecl GD, // We know that GetCPUAndFeaturesAttributes will always have the // newest set, since it has the newest possible FunctionDecl, so the // new ones should replace the old. - F->removeFnAttr("target-cpu"); - F->removeFnAttr("target-features"); - F->removeFnAttr("tune-cpu"); + llvm::AttrBuilder RemoveAttrs; + RemoveAttrs.addAttribute("target-cpu"); + RemoveAttrs.addAttribute("target-features"); + RemoveAttrs.addAttribute("tune-cpu"); + F->removeAttributes(llvm::AttributeList::FunctionIndex, RemoveAttrs); F->addAttributes(llvm::AttributeList::FunctionIndex, Attrs); } } diff --git a/clang/lib/CodeGen/CoverageMappingGen.cpp b/clang/lib/CodeGen/CoverageMappingGen.cpp index 8277804d27c0e..0227137713425 100644 --- a/clang/lib/CodeGen/CoverageMappingGen.cpp +++ b/clang/lib/CodeGen/CoverageMappingGen.cpp @@ -868,13 +868,22 @@ struct CounterCoverageMappingBuilder /// Find a valid gap range between \p AfterLoc and \p BeforeLoc. Optional findGapAreaBetween(SourceLocation AfterLoc, SourceLocation BeforeLoc) { - AfterLoc = SM.getExpansionLoc(AfterLoc); - BeforeLoc = SM.getExpansionLoc(BeforeLoc); + // If the start and end locations of the gap are both within the same macro + // file, the range may not be in source order. + if (AfterLoc.isMacroID() || BeforeLoc.isMacroID()) + return None; if (!SM.isWrittenInSameFile(AfterLoc, BeforeLoc)) return None; return {{AfterLoc, BeforeLoc}}; } + /// Find the source range after \p AfterStmt and before \p BeforeStmt. 
+ Optional findGapAreaBetween(const Stmt *AfterStmt, + const Stmt *BeforeStmt) { + return findGapAreaBetween(getPreciseTokenLocEnd(getEnd(AfterStmt)), + getStart(BeforeStmt)); + } + /// Emit a gap region between \p StartLoc and \p EndLoc with the given count. void fillGapAreaWithCount(SourceLocation StartLoc, SourceLocation EndLoc, Counter Count) { @@ -1039,8 +1048,7 @@ struct CounterCoverageMappingBuilder adjustForOutOfOrderTraversal(getEnd(S)); // The body count applies to the area immediately after the increment. - auto Gap = findGapAreaBetween(getPreciseTokenLocEnd(S->getRParenLoc()), - getStart(S->getBody())); + auto Gap = findGapAreaBetween(S->getCond(), S->getBody()); if (Gap) fillGapAreaWithCount(Gap->getBegin(), Gap->getEnd(), BodyCount); @@ -1257,8 +1265,7 @@ struct CounterCoverageMappingBuilder propagateCounts(ParentCount, S->getCond()); // The 'then' count applies to the area immediately after the condition. - auto Gap = findGapAreaBetween(getPreciseTokenLocEnd(S->getRParenLoc()), - getStart(S->getThen())); + auto Gap = findGapAreaBetween(S->getCond(), S->getThen()); if (Gap) fillGapAreaWithCount(Gap->getBegin(), Gap->getEnd(), ThenCount); @@ -1268,8 +1275,7 @@ struct CounterCoverageMappingBuilder Counter ElseCount = subtractCounters(ParentCount, ThenCount); if (const Stmt *Else = S->getElse()) { // The 'else' count applies to the area immediately after the 'then'. 
- Gap = findGapAreaBetween(getPreciseTokenLocEnd(getEnd(S->getThen())), - getStart(Else)); + Gap = findGapAreaBetween(S->getThen(), Else); if (Gap) fillGapAreaWithCount(Gap->getBegin(), Gap->getEnd(), ElseCount); extendRegion(Else); diff --git a/clang/lib/CodeGen/ObjectFilePCHContainerOperations.cpp b/clang/lib/CodeGen/ObjectFilePCHContainerOperations.cpp index 0c7e5f4598f88..04bd6680e31c5 100644 --- a/clang/lib/CodeGen/ObjectFilePCHContainerOperations.cpp +++ b/clang/lib/CodeGen/ObjectFilePCHContainerOperations.cpp @@ -250,10 +250,10 @@ class PCHContainerGenerator : public ASTConsumer { // PCH files don't have a signature field in the control block, // but LLVM detects DWO CUs by looking for a non-zero DWO id. // We use the lower 64 bits for debug info. + uint64_t Signature = - Buffer->Signature - ? (uint64_t)Buffer->Signature[1] << 32 | Buffer->Signature[0] - : ~1ULL; + Buffer->Signature ? Buffer->Signature.truncatedValue() : ~1ULL; + Builder->getModuleDebugInfo()->setDwoId(Signature); // Finalize the Builder. diff --git a/clang/lib/CodeGen/SwiftCallingConv.cpp b/clang/lib/CodeGen/SwiftCallingConv.cpp index 3d7421ac2e16c..cbbe208426f73 100644 --- a/clang/lib/CodeGen/SwiftCallingConv.cpp +++ b/clang/lib/CodeGen/SwiftCallingConv.cpp @@ -320,9 +320,12 @@ void SwiftAggLowering::addEntry(llvm::Type *type, // If we have a vector type, split it. 
if (auto vecTy = dyn_cast_or_null(type)) { auto eltTy = vecTy->getElementType(); - CharUnits eltSize = (end - begin) / vecTy->getNumElements(); + CharUnits eltSize = + (end - begin) / cast(vecTy)->getNumElements(); assert(eltSize == getTypeStoreSize(CGM, eltTy)); - for (unsigned i = 0, e = vecTy->getNumElements(); i != e; ++i) { + for (unsigned i = 0, + e = cast(vecTy)->getNumElements(); + i != e; ++i) { addEntry(eltTy, begin, begin + eltSize); begin += eltSize; } @@ -674,8 +677,9 @@ bool swiftcall::isLegalIntegerType(CodeGenModule &CGM, bool swiftcall::isLegalVectorType(CodeGenModule &CGM, CharUnits vectorSize, llvm::VectorType *vectorTy) { - return isLegalVectorType(CGM, vectorSize, vectorTy->getElementType(), - vectorTy->getNumElements()); + return isLegalVectorType( + CGM, vectorSize, vectorTy->getElementType(), + cast(vectorTy)->getNumElements()); } bool swiftcall::isLegalVectorType(CodeGenModule &CGM, CharUnits vectorSize, @@ -688,7 +692,7 @@ bool swiftcall::isLegalVectorType(CodeGenModule &CGM, CharUnits vectorSize, std::pair swiftcall::splitLegalVectorType(CodeGenModule &CGM, CharUnits vectorSize, llvm::VectorType *vectorTy) { - auto numElts = vectorTy->getNumElements(); + auto numElts = cast(vectorTy)->getNumElements(); auto eltTy = vectorTy->getElementType(); // Try to split the vector type in half. @@ -710,7 +714,7 @@ void swiftcall::legalizeVectorType(CodeGenModule &CGM, CharUnits origVectorSize, } // Try to split the vector into legal subvectors. 
- auto numElts = origVectorTy->getNumElements(); + auto numElts = cast(origVectorTy)->getNumElements(); auto eltTy = origVectorTy->getElementType(); assert(numElts != 1); diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 836219453b682..4292cb1dd7181 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -73,6 +73,7 @@ #include "llvm/Option/Option.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/ExitCodes.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/Host.h" @@ -89,7 +90,6 @@ #include #if LLVM_ON_UNIX #include // getpid -#include // EX_IOERR #endif using namespace clang::driver; diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp index 8c49e92b2c0f9..cce0eb557a9c6 100644 --- a/clang/lib/Driver/SanitizerArgs.cpp +++ b/clang/lib/Driver/SanitizerArgs.cpp @@ -866,6 +866,9 @@ SanitizerArgs::SanitizerArgs(const ToolChain &TC, LinkCXXRuntimes) || D.CCCIsCXX(); + NeedsHeapProfRt = + Args.hasFlag(options::OPT_fmemprof, options::OPT_fno_memprof, false); + // Finally, initialize the set of available and recoverable sanitizers. Sanitizers.Mask |= Kinds; RecoverableSanitizers.Mask |= RecoverableKinds; diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index b1cdc2b5e3f6c..faf0b84963926 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -1074,14 +1074,14 @@ SanitizerMask ToolChain::getSupportedSanitizers() const { // Return sanitizers which don't require runtime support and are not // platform dependent. 
- SanitizerMask Res = (SanitizerKind::Undefined & ~SanitizerKind::Vptr & - ~SanitizerKind::Function) | - (SanitizerKind::CFI & ~SanitizerKind::CFIICall) | - SanitizerKind::CFICastStrict | - SanitizerKind::FloatDivideByZero | - SanitizerKind::UnsignedIntegerOverflow | - SanitizerKind::ImplicitConversion | - SanitizerKind::Nullability | SanitizerKind::LocalBounds; + SanitizerMask Res = + (SanitizerKind::Undefined & ~SanitizerKind::Vptr & + ~SanitizerKind::Function) | + (SanitizerKind::CFI & ~SanitizerKind::CFIICall) | + SanitizerKind::CFICastStrict | SanitizerKind::FloatDivideByZero | + SanitizerKind::UnsignedIntegerOverflow | + SanitizerKind::UnsignedShiftBase | SanitizerKind::ImplicitConversion | + SanitizerKind::Nullability | SanitizerKind::LocalBounds; if (getTriple().getArch() == llvm::Triple::x86 || getTriple().getArch() == llvm::Triple::x86_64 || getTriple().getArch() == llvm::Triple::arm || getTriple().isWasm() || diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 064acb6de4b17..0b5e9fac93e8f 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -2116,20 +2116,27 @@ void Clang::AddX86TargetArgs(const ArgList &Args, } // Handle -mtune. - // FIXME: We should default to "generic" unless -march is set to match gcc. + + // Default to "generic" unless -march is present. + std::string TuneCPU; + if (!Args.hasArg(clang::driver::options::OPT_march_EQ)) + TuneCPU = "generic"; + + // Override based on -mtune. if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mtune_EQ)) { StringRef Name = A->getValue(); - if (Name == "native") + if (Name == "native") { Name = llvm::sys::getHostCPUName(); + if (!Name.empty()) + TuneCPU = std::string(Name); + } else + TuneCPU = std::string(Name); + } - // Ignore generic either from getHostCPUName or from command line. - // FIXME: We need to support this eventually but isValidCPUName and the - // backend aren't ready for it yet. 
- if (Name != "generic") { - CmdArgs.push_back("-tune-cpu"); - CmdArgs.push_back(Args.MakeArgString(Name)); - } + if (!TuneCPU.empty()) { + CmdArgs.push_back("-tune-cpu"); + CmdArgs.push_back(Args.MakeArgString(TuneCPU)); } } @@ -4358,6 +4365,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, if (Args.getLastArg(options::OPT_save_temps_EQ)) Args.AddLastArg(CmdArgs, options::OPT_save_temps_EQ); + if (Args.hasFlag(options::OPT_fmemprof, options::OPT_fno_memprof, false)) + Args.AddLastArg(CmdArgs, options::OPT_fmemprof); + // Embed-bitcode option. // Only white-listed flags below are allowed to be embedded. if (C.getDriver().embedBitcodeInObject() && !C.getDriver().isUsingLTO() && @@ -5734,7 +5744,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, LanguageStandard = llvm::StringSwitch(StdArg->getValue()) .Case("c++14", "-std=c++14") .Case("c++17", "-std=c++17") - .Case("c++latest", "-std=c++2a") + .Case("c++latest", "-std=c++20") .Default(""); if (LanguageStandard.empty()) D.Diag(clang::diag::warn_drv_unused_argument) @@ -5801,7 +5811,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, // FIXME: Find a better way to determine whether the language has modules // support by default, or just assume that all languages do. 
bool HaveModules = - Std && (Std->containsValue("c++2a") || Std->containsValue("c++latest")); + Std && (Std->containsValue("c++2a") || Std->containsValue("c++20") || + Std->containsValue("c++latest")); RenderModulesOptions(C, D, Args, Input, Output, CmdArgs, HaveModules); if (Args.hasFlag(options::OPT_fpch_validate_input_files_content, @@ -7698,6 +7709,7 @@ void SPIRVTranslator::ConstructJob(Compilation &C, const JobAction &JA, TranslatorArgs.push_back(Output.getFilename()); if (getToolChain().getTriple().isSYCLDeviceEnvironment()) { TranslatorArgs.push_back("-spirv-max-version=1.1"); + TranslatorArgs.push_back("-spirv-debug-info-version=legacy"); if (C.getArgs().hasArg(options::OPT_fsycl_esimd)) TranslatorArgs.push_back("-spirv-allow-unknown-intrinsics"); diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index c935a44b4cf02..26acea20dceb5 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -694,6 +694,11 @@ collectSanitizerRuntimes(const ToolChain &TC, const ArgList &Args, if (!Args.hasArg(options::OPT_shared) && !TC.getTriple().isAndroid()) HelperStaticRuntimes.push_back("asan-preinit"); } + if (SanArgs.needsHeapProfRt() && SanArgs.linkRuntimes()) { + SharedRuntimes.push_back("heapprof"); + if (!Args.hasArg(options::OPT_shared) && !TC.getTriple().isAndroid()) + HelperStaticRuntimes.push_back("heapprof-preinit"); + } if (SanArgs.needsUbsanRt() && SanArgs.linkRuntimes()) { if (SanArgs.requiresMinimalRuntime()) SharedRuntimes.push_back("ubsan_minimal"); @@ -729,6 +734,13 @@ collectSanitizerRuntimes(const ToolChain &TC, const ArgList &Args, StaticRuntimes.push_back("asan_cxx"); } + if (!SanArgs.needsSharedRt() && SanArgs.needsHeapProfRt() && + SanArgs.linkRuntimes()) { + StaticRuntimes.push_back("heapprof"); + if (SanArgs.linkCXXRuntimes()) + StaticRuntimes.push_back("heapprof_cxx"); + } + if (!SanArgs.needsSharedRt() && SanArgs.needsHwasanRt() && 
SanArgs.linkRuntimes()) { StaticRuntimes.push_back("hwasan"); if (SanArgs.linkCXXRuntimes()) diff --git a/clang/lib/Driver/ToolChains/MSVC.cpp b/clang/lib/Driver/ToolChains/MSVC.cpp index 0394bf25a7532..764eb0c965e05 100644 --- a/clang/lib/Driver/ToolChains/MSVC.cpp +++ b/clang/lib/Driver/ToolChains/MSVC.cpp @@ -66,6 +66,20 @@ using namespace llvm::opt; static bool getSystemRegistryString(const char *keyPath, const char *valueName, std::string &value, std::string *phValue); +// Check command line arguments to try and find a toolchain. +static bool +findVCToolChainViaCommandLine(const ArgList &Args, std::string &Path, + MSVCToolChain::ToolsetLayout &VSLayout) { + // Don't validate the input; trust the value supplied by the user. + // The primary motivation is to prevent unnecessary file and registry access. + if (Arg *A = Args.getLastArg(options::OPT__SLASH_vctoolsdir)) { + Path = A->getValue(); + VSLayout = MSVCToolChain::ToolsetLayout::VS2017OrNewer; + return true; + } + return false; +} + // Check various environment variables to try and find a toolchain. static bool findVCToolChainViaEnvironment(std::string &Path, MSVCToolChain::ToolsetLayout &VSLayout) { @@ -807,11 +821,12 @@ MSVCToolChain::MSVCToolChain(const Driver &D, const llvm::Triple &Triple, if (getDriver().getInstalledDir() != getDriver().Dir) getProgramPaths().push_back(getDriver().Dir); - // Check the environment first, since that's probably the user telling us - // what they want to use. - // Failing that, just try to find the newest Visual Studio version we can - // and use its default VC toolchain. - findVCToolChainViaEnvironment(VCToolChainPath, VSLayout) || + // Check the command line first, that's the user explicitly telling us what to + // use. Check the environment next, in case we're being invoked from a VS + // command prompt. Failing that, just try to find the newest Visual Studio + // version we can and use its default VC toolchain. 
+ findVCToolChainViaCommandLine(Args, VCToolChainPath, VSLayout) || + findVCToolChainViaEnvironment(VCToolChainPath, VSLayout) || findVCToolChainViaSetupConfig(VCToolChainPath, VSLayout) || findVCToolChainViaRegistry(VCToolChainPath, VSLayout); } @@ -1323,15 +1338,18 @@ void MSVCToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs, return; // Honor %INCLUDE%. It should know essential search paths with vcvarsall.bat. - if (llvm::Optional cl_include_dir = - llvm::sys::Process::GetEnv("INCLUDE")) { - SmallVector Dirs; - StringRef(*cl_include_dir) - .split(Dirs, ";", /*MaxSplit=*/-1, /*KeepEmpty=*/false); - for (StringRef Dir : Dirs) - addSystemInclude(DriverArgs, CC1Args, Dir); - if (!Dirs.empty()) - return; + // Skip if the user expressly set a vctoolsdir + if (!DriverArgs.getLastArg(options::OPT__SLASH_vctoolsdir)) { + if (llvm::Optional cl_include_dir = + llvm::sys::Process::GetEnv("INCLUDE")) { + SmallVector Dirs; + StringRef(*cl_include_dir) + .split(Dirs, ";", /*MaxSplit=*/-1, /*KeepEmpty=*/false); + for (StringRef Dir : Dirs) + addSystemInclude(DriverArgs, CC1Args, Dir); + if (!Dirs.empty()) + return; + } } // When built with access to the proper Windows APIs, try to actually find diff --git a/clang/lib/Driver/ToolChains/OpenBSD.cpp b/clang/lib/Driver/ToolChains/OpenBSD.cpp index 4f2d04058d249..1177fba965628 100644 --- a/clang/lib/Driver/ToolChains/OpenBSD.cpp +++ b/clang/lib/Driver/ToolChains/OpenBSD.cpp @@ -313,15 +313,6 @@ std::string OpenBSD::getCompilerRT(const ArgList &Args, return std::string(Path.str()); } -void OpenBSD::addClangTargetOptions(const ArgList &DriverArgs, - ArgStringList &CC1Args, - Action::OffloadKind) const { - // Support for .init_array is still new (Aug 2016). 
- if (!DriverArgs.hasFlag(options::OPT_fuse_init_array, - options::OPT_fno_use_init_array, false)) - CC1Args.push_back("-fno-use-init-array"); -} - Tool *OpenBSD::buildAssembler() const { return new tools::openbsd::Assembler(*this); } diff --git a/clang/lib/Driver/ToolChains/OpenBSD.h b/clang/lib/Driver/ToolChains/OpenBSD.h index 09595faf9d6bb..5f9b259bf8613 100644 --- a/clang/lib/Driver/ToolChains/OpenBSD.h +++ b/clang/lib/Driver/ToolChains/OpenBSD.h @@ -86,11 +86,6 @@ class LLVM_LIBRARY_VISIBILITY OpenBSD : public Generic_ELF { SanitizerMask getSupportedSanitizers() const override; - void - addClangTargetOptions(const llvm::opt::ArgList &DriverArgs, - llvm::opt::ArgStringList &CC1Args, - Action::OffloadKind DeviceOffloadKind) const override; - protected: Tool *buildAssembler() const override; Tool *buildLinker() const override; diff --git a/clang/lib/Driver/ToolChains/SYCL.cpp b/clang/lib/Driver/ToolChains/SYCL.cpp index f3b58dd3d50f6..81297fe3ebe24 100644 --- a/clang/lib/Driver/ToolChains/SYCL.cpp +++ b/clang/lib/Driver/ToolChains/SYCL.cpp @@ -42,6 +42,7 @@ const char *SYCL::Linker::constructLLVMSpirvCommand(Compilation &C, } else { CmdArgs.push_back("-spirv-max-version=1.1"); CmdArgs.push_back("-spirv-ext=+all"); + CmdArgs.push_back("-spirv-debug-info-version=legacy"); if (C.getArgs().hasArg(options::OPT_fsycl_esimd)) CmdArgs.push_back("-spirv-allow-unknown-intrinsics"); CmdArgs.push_back("-o"); diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 4df22778c516e..392ba0f30f105 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -843,6 +843,9 @@ static bool ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, InputKind IK, llvm::is_contained(DebugEntryValueArchs, T.getArch())) Opts.EmitCallSiteInfo = true; + Opts.ValueTrackingVariableLocations = + Args.hasArg(OPT_fexperimental_debug_variable_locations); + Opts.DisableO0ImplyOptNone = 
Args.hasArg(OPT_disable_O0_optnone); Opts.DisableRedZone = Args.hasArg(OPT_disable_red_zone); Opts.IndirectTlsSegRefs = Args.hasArg(OPT_mno_tls_direct_seg_refs); @@ -1038,6 +1041,8 @@ static bool ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, InputKind IK, Opts.ThinLinkBitcodeFile = std::string(Args.getLastArgValue(OPT_fthin_link_bitcode_EQ)); + Opts.HeapProf = Args.hasArg(OPT_fmemprof); + Opts.MSVolatile = Args.hasArg(OPT_fms_volatile); Opts.VectorizeLoop = Args.hasArg(OPT_vectorize_loops); @@ -2962,8 +2967,8 @@ static void ParseLangArgs(LangOptions &Opts, ArgList &Args, InputKind IK, // Recovery AST still heavily relies on dependent-type machinery. Opts.RecoveryAST = Args.hasFlag(OPT_frecovery_ast, OPT_fno_recovery_ast, Opts.CPlusPlus); - Opts.RecoveryASTType = - Args.hasFlag(OPT_frecovery_ast_type, OPT_fno_recovery_ast_type, false); + Opts.RecoveryASTType = Args.hasFlag( + OPT_frecovery_ast_type, OPT_fno_recovery_ast_type, Opts.CPlusPlus); Opts.HeinousExtensions = Args.hasArg(OPT_fheinous_gnu_extensions); Opts.AccessControl = !Args.hasArg(OPT_fno_access_control); Opts.ElideConstructors = !Args.hasArg(OPT_fno_elide_constructors); @@ -3906,7 +3911,7 @@ std::string CompilerInvocation::getModuleHash() const { // Extend the signature with the target options. 
code = hash_combine(code, TargetOpts->Triple, TargetOpts->CPU, - TargetOpts->ABI); + TargetOpts->TuneCPU, TargetOpts->ABI); for (const auto &FeatureAsWritten : TargetOpts->FeaturesAsWritten) code = hash_combine(code, FeatureAsWritten); diff --git a/clang/lib/Frontend/FrontendActions.cpp b/clang/lib/Frontend/FrontendActions.cpp index 711e7336c8203..77a88f696abcd 100644 --- a/clang/lib/Frontend/FrontendActions.cpp +++ b/clang/lib/Frontend/FrontendActions.cpp @@ -561,6 +561,7 @@ namespace { Out.indent(2) << "Target options:\n"; Out.indent(4) << " Triple: " << TargetOpts.Triple << "\n"; Out.indent(4) << " CPU: " << TargetOpts.CPU << "\n"; + Out.indent(4) << " TuneCPU: " << TargetOpts.TuneCPU << "\n"; Out.indent(4) << " ABI: " << TargetOpts.ABI << "\n"; if (!TargetOpts.FeaturesAsWritten.empty()) { diff --git a/clang/lib/Headers/altivec.h b/clang/lib/Headers/altivec.h index b1e70f6c41bbc..6583b0f22a162 100644 --- a/clang/lib/Headers/altivec.h +++ b/clang/lib/Headers/altivec.h @@ -3288,6 +3288,30 @@ static __inline__ vector double __ATTRS_o_ai vec_div(vector double __a, } #endif +/* vec_dive */ + +#ifdef __POWER10_VECTOR__ +static __inline__ vector signed int __ATTRS_o_ai +vec_dive(vector signed int __a, vector signed int __b) { + return __builtin_altivec_vdivesw(__a, __b); +} + +static __inline__ vector unsigned int __ATTRS_o_ai +vec_dive(vector unsigned int __a, vector unsigned int __b) { + return __builtin_altivec_vdiveuw(__a, __b); +} + +static __inline__ vector signed long long __ATTRS_o_ai +vec_dive(vector signed long long __a, vector signed long long __b) { + return __builtin_altivec_vdivesd(__a, __b); +} + +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_dive(vector unsigned long long __a, vector unsigned long long __b) { + return __builtin_altivec_vdiveud(__a, __b); +} +#endif + /* vec_dss */ #define vec_dss __builtin_altivec_dss @@ -5737,6 +5761,30 @@ vec_vmuleuh(vector unsigned short __a, vector unsigned short __b) { #endif } +/* vec_mulh */ + 
+#ifdef __POWER10_VECTOR__ +static __inline__ vector signed int __ATTRS_o_ai +vec_mulh(vector signed int __a, vector signed int __b) { + return __builtin_altivec_vmulhsw(__a, __b); +} + +static __inline__ vector unsigned int __ATTRS_o_ai +vec_mulh(vector unsigned int __a, vector unsigned int __b) { + return __builtin_altivec_vmulhuw(__a, __b); +} + +static __inline__ vector signed long long __ATTRS_o_ai +vec_mulh(vector signed long long __a, vector signed long long __b) { + return __builtin_altivec_vmulhsd(__a, __b); +} + +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_mulh(vector unsigned long long __a, vector unsigned long long __b) { + return __builtin_altivec_vmulhud(__a, __b); +} +#endif + /* vec_mulo */ static __inline__ vector short __ATTRS_o_ai vec_mulo(vector signed char __a, diff --git a/clang/lib/Lex/ModuleMap.cpp b/clang/lib/Lex/ModuleMap.cpp index bcdc5b8062a02..12da5a85da373 100644 --- a/clang/lib/Lex/ModuleMap.cpp +++ b/clang/lib/Lex/ModuleMap.cpp @@ -1903,18 +1903,16 @@ void ModuleMapParser::parseModuleDecl() { continue; } - if (ActiveModule) { - Diags.Report(Id[I].second, diag::err_mmap_missing_module_qualified) - << Id[I].first - << ActiveModule->getTopLevelModule()->getFullModuleName(); - } else { - Diags.Report(Id[I].second, diag::err_mmap_expected_module_name); - } + Diags.Report(Id[I].second, diag::err_mmap_missing_parent_module) + << Id[I].first << (ActiveModule != nullptr) + << (ActiveModule + ? 
ActiveModule->getTopLevelModule()->getFullModuleName() + : ""); HadError = true; - return; } - if (ModuleMapFile != Map.getContainingModuleMapFile(TopLevelModule)) { + if (TopLevelModule && + ModuleMapFile != Map.getContainingModuleMapFile(TopLevelModule)) { assert(ModuleMapFile != Map.getModuleMapFileForUniquing(TopLevelModule) && "submodule defined in same file as 'module *' that allowed its " "top-level module"); diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index cf33e86f6618b..11d9cd1badba0 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -8038,7 +8038,7 @@ void Sema::CheckVariableDeclarationType(VarDecl *NewVD) { return; } - if (!NewVD->hasLocalStorage() && T->isSizelessType() && !T->isVLST()) { + if (!NewVD->hasLocalStorage() && T->isSizelessType()) { Diag(NewVD->getLocation(), diag::err_sizeless_nonlocal) << T; NewVD->setInvalidDecl(); return; diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 142b23751b9af..db3a80ae876ac 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -9034,6 +9034,14 @@ Sema::CheckAssignmentConstraints(QualType LHSType, ExprResult &RHS, } } + // Allow assignments between fixed-length and sizeless SVE vectors. + if (((LHSType->isSizelessBuiltinType() && RHSType->isVectorType()) || + (LHSType->isVectorType() && RHSType->isSizelessBuiltinType())) && + Context.areCompatibleSveTypes(LHSType, RHSType)) { + Kind = CK_BitCast; + return Compatible; + } + return Incompatible; } @@ -9924,6 +9932,22 @@ QualType Sema::CheckVectorOperands(ExprResult &LHS, ExprResult &RHS, // Okay, the expression is invalid. + // Returns true if the operands are SVE VLA and VLS types. 
+ auto IsSveConversion = [](QualType FirstType, QualType SecondType) { + const VectorType *VecType = SecondType->getAs(); + return FirstType->isSizelessBuiltinType() && VecType && + (VecType->getVectorKind() == VectorType::SveFixedLengthDataVector || + VecType->getVectorKind() == + VectorType::SveFixedLengthPredicateVector); + }; + + // If there's a sizeless and fixed-length operand, diagnose that. + if (IsSveConversion(LHSType, RHSType) || IsSveConversion(RHSType, LHSType)) { + Diag(Loc, diag::err_typecheck_vector_not_convertable_sizeless) + << LHSType << RHSType; + return QualType(); + } + // If there's a non-vector, non-real operand, diagnose that. if ((!RHSVecType && !RHSType->isRealType()) || (!LHSVecType && !LHSType->isRealType())) { diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index 3f1568185ecb8..4dc6fd8f72042 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -4332,6 +4332,12 @@ Sema::PerformImplicitConversion(Expr *From, QualType ToType, VK_RValue, /*BasePath=*/nullptr, CCK).get(); break; + case ICK_SVE_Vector_Conversion: + From = ImpCastExprToType(From, ToType, CK_BitCast, VK_RValue, + /*BasePath=*/nullptr, CCK) + .get(); + break; + case ICK_Vector_Splat: { // Vector splat from any arithmetic type to a vector. 
Expr *Elem = prepareVectorSplat(ToType, From).get(); diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 53917ef98acdf..7b62c841b48a2 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -2417,17 +2417,20 @@ bool Sema::isOpenMPGlobalCapturedDecl(ValueDecl *D, unsigned Level, if (const auto *VD = dyn_cast(D)) { if (!VD->hasLocalStorage()) { + if (isInOpenMPTargetExecutionDirective()) + return true; DSAStackTy::DSAVarData TopDVar = DSAStack->getTopDSA(D, /*FromParent=*/false); unsigned NumLevels = getOpenMPCaptureLevels(DSAStack->getDirective(Level)); if (Level == 0) return (NumLevels == CaptureLevel + 1) && TopDVar.CKind != OMPC_shared; - DSAStackTy::DSAVarData DVar = DSAStack->getImplicitDSA(D, Level - 1); - return DVar.CKind != OMPC_shared || - isOpenMPGlobalCapturedDecl( - D, Level - 1, - getOpenMPCaptureLevels(DSAStack->getDirective(Level - 1)) - 1); + do { + --Level; + DSAStackTy::DSAVarData DVar = DSAStack->getImplicitDSA(D, Level); + if (DVar.CKind != OMPC_shared) + return true; + } while (Level >= 0); } } return true; @@ -2473,7 +2476,7 @@ void Sema::finalizeOpenMPDelayedAnalysis(const FunctionDecl *Caller, StringRef HostDevTy = getOpenMPSimpleClauseTypeName(OMPC_device_type, OMPC_DEVICE_TYPE_host); Diag(Loc, diag::err_omp_wrong_device_function_call) << HostDevTy << 0; - Diag(FD->getAttr()->getLocation(), + Diag(*OMPDeclareTargetDeclAttr::getLocation(FD), diag::note_omp_marked_device_type_here) << HostDevTy; return; @@ -2484,7 +2487,7 @@ void Sema::finalizeOpenMPDelayedAnalysis(const FunctionDecl *Caller, StringRef NoHostDevTy = getOpenMPSimpleClauseTypeName( OMPC_device_type, OMPC_DEVICE_TYPE_nohost); Diag(Loc, diag::err_omp_wrong_device_function_call) << NoHostDevTy << 1; - Diag(FD->getAttr()->getLocation(), + Diag(*OMPDeclareTargetDeclAttr::getLocation(FD), diag::note_omp_marked_device_type_here) << NoHostDevTy; } @@ -18483,14 +18486,14 @@ bool 
Sema::ActOnStartOpenMPDeclareTargetDirective(SourceLocation Loc) { Diag(Loc, diag::err_omp_region_not_file_context); return false; } - ++DeclareTargetNestingLevel; + DeclareTargetNesting.push_back(Loc); return true; } void Sema::ActOnFinishOpenMPDeclareTargetDirective() { - assert(DeclareTargetNestingLevel > 0 && + assert(!DeclareTargetNesting.empty() && "Unexpected ActOnFinishOpenMPDeclareTargetDirective"); - --DeclareTargetNestingLevel; + DeclareTargetNesting.pop_back(); } NamedDecl * @@ -18543,19 +18546,25 @@ void Sema::ActOnOpenMPDeclareTargetName( (ND->isUsed(/*CheckUsedAttr=*/false) || ND->isReferenced())) Diag(Loc, diag::warn_omp_declare_target_after_first_use); + auto *VD = cast(ND); Optional DevTy = - OMPDeclareTargetDeclAttr::getDeviceType(cast(ND)); - if (DevTy.hasValue() && *DevTy != DT) { + OMPDeclareTargetDeclAttr::getDeviceType(VD); + Optional AttrLoc = OMPDeclareTargetDeclAttr::getLocation(VD); + if (DevTy.hasValue() && *DevTy != DT && + (DeclareTargetNesting.empty() || + *AttrLoc != DeclareTargetNesting.back())) { Diag(Loc, diag::err_omp_device_type_mismatch) << OMPDeclareTargetDeclAttr::ConvertDevTypeTyToStr(DT) << OMPDeclareTargetDeclAttr::ConvertDevTypeTyToStr(*DevTy); return; } Optional Res = - OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(cast(ND)); - if (!Res) { - auto *A = OMPDeclareTargetDeclAttr::CreateImplicit(Context, MT, DT, - SourceRange(Loc, Loc)); + OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD); + if (!Res || (!DeclareTargetNesting.empty() && + *AttrLoc == DeclareTargetNesting.back())) { + auto *A = OMPDeclareTargetDeclAttr::CreateImplicit( + Context, MT, DT, DeclareTargetNesting.size() + 1, + SourceRange(Loc, Loc)); ND->addAttr(A); if (ASTMutationListener *ML = Context.getASTMutationListener()) ML->DeclarationMarkedOpenMPDeclareTarget(ND, A); @@ -18647,7 +18656,9 @@ void Sema::checkDeclIsAllowedInOpenMPTarget(Expr *E, Decl *D, isa(D)) { auto *A = OMPDeclareTargetDeclAttr::CreateImplicit( Context, 
OMPDeclareTargetDeclAttr::MT_To, - OMPDeclareTargetDeclAttr::DT_Any, SourceRange(IdLoc, IdLoc)); + OMPDeclareTargetDeclAttr::DT_Any, DeclareTargetNesting.size(), + SourceRange(DeclareTargetNesting.back(), + DeclareTargetNesting.back())); D->addAttr(A); if (ASTMutationListener *ML = Context.getASTMutationListener()) ML->DeclarationMarkedOpenMPDeclareTarget(D, A); diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index dc0098964be43..ec7c41e8ed099 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -137,6 +137,7 @@ ImplicitConversionRank clang::GetConversionRank(ImplicitConversionKind Kind) { ICR_Conversion, ICR_Conversion, ICR_Conversion, + ICR_Conversion, ICR_OCL_Scalar_Widening, ICR_Complex_Real_Conversion, ICR_Conversion, @@ -174,6 +175,7 @@ static const char* GetImplicitConversionName(ImplicitConversionKind Kind) { "Compatible-types conversion", "Derived-to-base conversion", "Vector conversion", + "SVE Vector conversion", "Vector splat", "Complex-real conversion", "Block Pointer conversion", @@ -1650,6 +1652,12 @@ static bool IsVectorConversion(Sema &S, QualType FromType, } } + if ((ToType->isSizelessBuiltinType() || FromType->isSizelessBuiltinType()) && + S.Context.areCompatibleSveTypes(FromType, ToType)) { + ICK = ICK_SVE_Vector_Conversion; + return true; + } + // We can perform the conversion between vector types in the following cases: // 1)vector types are equivalent AltiVec and GCC vector types // 2)lax vector conversions are permitted and the vector types are of the @@ -4104,6 +4112,20 @@ CompareStandardConversionSequences(Sema &S, SourceLocation Loc, : ImplicitConversionSequence::Worse; } + if (SCS1.Second == ICK_SVE_Vector_Conversion && + SCS2.Second == ICK_SVE_Vector_Conversion) { + bool SCS1IsCompatibleSVEVectorConversion = + S.Context.areCompatibleSveTypes(SCS1.getFromType(), SCS1.getToType(2)); + bool SCS2IsCompatibleSVEVectorConversion = + 
S.Context.areCompatibleSveTypes(SCS2.getFromType(), SCS2.getToType(2)); + + if (SCS1IsCompatibleSVEVectorConversion != + SCS2IsCompatibleSVEVectorConversion) + return SCS1IsCompatibleSVEVectorConversion + ? ImplicitConversionSequence::Better + : ImplicitConversionSequence::Worse; + } + return ImplicitConversionSequence::Indistinguishable; } @@ -5524,6 +5546,7 @@ static bool CheckConvertedConstantConversions(Sema &S, case ICK_Compatible_Conversion: case ICK_Derived_To_Base: case ICK_Vector_Conversion: + case ICK_SVE_Vector_Conversion: case ICK_Vector_Splat: case ICK_Complex_Real: case ICK_Block_Pointer_Conversion: diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index 98a708dec8f8f..a9526671fd5a3 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -2149,6 +2149,13 @@ Decl *TemplateDeclInstantiator::VisitFunctionDecl( // typedef (C++ [dcl.typedef]p4). if (Previous.isSingleTagDecl()) Previous.clear(); + + // Filter out previous declarations that don't match the scope. The only + // effect this has is to remove declarations found in inline namespaces + // for friend declarations with unqualified names. 
+ SemaRef.FilterLookupForScope(Previous, DC, /*Scope*/ nullptr, + /*ConsiderLinkage*/ true, + QualifierLoc.hasQualifier()); } SemaRef.CheckFunctionDeclaration(/*Scope*/ nullptr, Function, Previous, diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 12c4808b6a34d..2063425f46b16 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -2351,7 +2351,7 @@ QualType Sema::BuildArrayType(QualType T, ArrayType::ArraySizeModifier ASM, return QualType(); } - if (T->isSizelessType() && !T->isVLST()) { + if (T->isSizelessType()) { Diag(Loc, diag::err_array_incomplete_or_sizeless_type) << 1 << T; return QualType(); } @@ -7893,14 +7893,10 @@ static void HandleNeonVectorTypeAttr(QualType &CurType, const ParsedAttr &Attr, /// HandleArmSveVectorBitsTypeAttr - The "arm_sve_vector_bits" attribute is /// used to create fixed-length versions of sizeless SVE types defined by /// the ACLE, such as svint32_t and svbool_t. -static void HandleArmSveVectorBitsTypeAttr(TypeProcessingState &State, - QualType &CurType, - ParsedAttr &Attr) { - Sema &S = State.getSema(); - ASTContext &Ctx = S.Context; - +static void HandleArmSveVectorBitsTypeAttr(QualType &CurType, ParsedAttr &Attr, + Sema &S) { // Target must have SVE. - if (!Ctx.getTargetInfo().hasFeature("sve")) { + if (!S.Context.getTargetInfo().hasFeature("sve")) { S.Diag(Attr.getLoc(), diag::err_attribute_unsupported) << Attr; Attr.setInvalid(); return; @@ -7945,8 +7941,18 @@ static void HandleArmSveVectorBitsTypeAttr(TypeProcessingState &State, return; } - auto *A = ::new (Ctx) ArmSveVectorBitsAttr(Ctx, Attr, VecSize); - CurType = State.getAttributedType(A, CurType, CurType); + const auto *BT = CurType->castAs(); + + QualType EltType = CurType->getSveEltType(S.Context); + unsigned TypeSize = S.Context.getTypeSize(EltType); + VectorType::VectorKind VecKind = VectorType::SveFixedLengthDataVector; + if (BT->getKind() == BuiltinType::SveBool) { + // Predicates are represented as i8. 
+ VecSize /= S.Context.getCharWidth() * S.Context.getCharWidth(); + VecKind = VectorType::SveFixedLengthPredicateVector; + } else + VecSize /= TypeSize; + CurType = S.Context.getVectorType(EltType, VecSize, VecKind); } static void HandleArmMveStrictPolymorphismAttr(TypeProcessingState &State, @@ -8218,7 +8224,7 @@ static void processTypeAttrs(TypeProcessingState &state, QualType &type, attr.setUsedAsTypeAttr(); break; case ParsedAttr::AT_ArmSveVectorBits: - HandleArmSveVectorBitsTypeAttr(state, type, attr); + HandleArmSveVectorBitsTypeAttr(type, attr, state.getSema()); attr.setUsedAsTypeAttr(); break; case ParsedAttr::AT_ArmMveStrictPolymorphism: { diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 5e0836d870645..9d95fa9b447ee 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -390,8 +390,10 @@ static bool checkTargetOptions(const TargetOptions &TargetOpts, // We can tolerate different CPUs in many cases, notably when one CPU // supports a strict superset of another. When allowing compatible // differences skip this check. 
- if (!AllowCompatibleDifferences) + if (!AllowCompatibleDifferences) { CHECK_TARGET_OPT(CPU, "target CPU"); + CHECK_TARGET_OPT(TuneCPU, "tune CPU"); + } #undef CHECK_TARGET_OPT @@ -5779,6 +5781,7 @@ bool ASTReader::ParseTargetOptions(const RecordData &Record, bool Complain, TargetOptions TargetOpts; TargetOpts.Triple = ReadString(Record, Idx); TargetOpts.CPU = ReadString(Record, Idx); + TargetOpts.TuneCPU = ReadString(Record, Idx); TargetOpts.ABI = ReadString(Record, Idx); for (unsigned N = Record[Idx++]; N; --N) { TargetOpts.FeaturesAsWritten.push_back(ReadString(Record, Idx)); diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index c7a009d1e50dc..47b378f5727b4 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -4666,12 +4666,11 @@ void ASTDeclReader::UpdateDecl(Decl *D, } case UPD_DECL_MARKED_OPENMP_DECLARETARGET: { - OMPDeclareTargetDeclAttr::MapTypeTy MapType = - static_cast(Record.readInt()); - OMPDeclareTargetDeclAttr::DevTypeTy DevType = - static_cast(Record.readInt()); + auto MapType = Record.readEnum(); + auto DevType = Record.readEnum(); + unsigned Level = Record.readInt(); D->addAttr(OMPDeclareTargetDeclAttr::CreateImplicit( - Reader.getContext(), MapType, DevType, readSourceRange(), + Reader.getContext(), MapType, DevType, Level, readSourceRange(), AttributeCommonInfo::AS_Pragma)); break; } diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 21dd8534a6f78..9a72108cb02c2 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -1272,6 +1272,7 @@ void ASTWriter::WriteControlBlock(Preprocessor &PP, ASTContext &Context, const TargetOptions &TargetOpts = Target.getTargetOpts(); AddString(TargetOpts.Triple, Record); AddString(TargetOpts.CPU, Record); + AddString(TargetOpts.TuneCPU, Record); AddString(TargetOpts.ABI, Record); 
Record.push_back(TargetOpts.FeaturesAsWritten.size()); for (unsigned I = 0, N = TargetOpts.FeaturesAsWritten.size(); I != N; ++I) { diff --git a/clang/lib/StaticAnalyzer/Checkers/CastValueChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/CastValueChecker.cpp index 56c6f8d02e0f6..131c1345af997 100644 --- a/clang/lib/StaticAnalyzer/Checkers/CastValueChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/CastValueChecker.cpp @@ -106,7 +106,7 @@ static const NoteTag *getNoteTag(CheckerContext &C, QualType CastToTy, const Expr *Object, bool CastSucceeds, bool IsKnownCast) { std::string CastToName = - CastInfo ? CastInfo->to()->getPointeeCXXRecordDecl()->getNameAsString() + CastInfo ? CastInfo->to()->getAsCXXRecordDecl()->getNameAsString() : CastToTy->getPointeeCXXRecordDecl()->getNameAsString(); Object = Object->IgnoreParenImpCasts(); @@ -135,6 +135,47 @@ static const NoteTag *getNoteTag(CheckerContext &C, /*IsPrunable=*/true); } +static const NoteTag *getNoteTag(CheckerContext &C, + SmallVector CastToTyVec, + const Expr *Object, + bool IsKnownCast) { + Object = Object->IgnoreParenImpCasts(); + + return C.getNoteTag( + [=]() -> std::string { + SmallString<128> Msg; + llvm::raw_svector_ostream Out(Msg); + + if (!IsKnownCast) + Out << "Assuming "; + + if (const auto *DRE = dyn_cast(Object)) { + Out << '\'' << DRE->getDecl()->getNameAsString() << '\''; + } else if (const auto *ME = dyn_cast(Object)) { + Out << (IsKnownCast ? "Field '" : "field '") + << ME->getMemberDecl()->getNameAsString() << '\''; + } else { + Out << (IsKnownCast ? "The object" : "the object"); + } + Out << " is"; + + bool First = true; + for (QualType CastToTy: CastToTyVec) { + std::string CastToName = + CastToTy->getAsCXXRecordDecl() ? + CastToTy->getAsCXXRecordDecl()->getNameAsString() : + CastToTy->getPointeeCXXRecordDecl()->getNameAsString(); + Out << ' ' << ((CastToTyVec.size() == 1) ? "not" : + (First ? 
"neither" : "nor")) << " a '" << CastToName + << '\''; + First = false; + } + + return std::string(Out.str()); + }, + /*IsPrunable=*/true); +} + //===----------------------------------------------------------------------===// // Main logic to evaluate a cast. //===----------------------------------------------------------------------===// @@ -220,40 +261,76 @@ static void addInstanceOfTransition(const CallEvent &Call, bool IsInstanceOf) { const FunctionDecl *FD = Call.getDecl()->getAsFunction(); QualType CastFromTy = Call.parameters()[0]->getType(); - QualType CastToTy = FD->getTemplateSpecializationArgs()->get(0).getAsType(); - if (CastFromTy->isPointerType()) - CastToTy = C.getASTContext().getPointerType(CastToTy); - else if (CastFromTy->isReferenceType()) - CastToTy = alignReferenceTypes(CastToTy, CastFromTy, C.getASTContext()); - else - return; + SmallVector CastToTyVec; + for (unsigned idx = 0; idx < FD->getTemplateSpecializationArgs()->size() - 1; + ++idx) { + TemplateArgument CastToTempArg = + FD->getTemplateSpecializationArgs()->get(idx); + switch (CastToTempArg.getKind()) { + default: + return; + case TemplateArgument::Type: + CastToTyVec.push_back(CastToTempArg.getAsType()); + break; + case TemplateArgument::Pack: + for (TemplateArgument ArgInPack: CastToTempArg.pack_elements()) + CastToTyVec.push_back(ArgInPack.getAsType()); + break; + } + } const MemRegion *MR = DV.getAsRegion(); - const DynamicCastInfo *CastInfo = - getDynamicCastInfo(State, MR, CastFromTy, CastToTy); + if (MR && CastFromTy->isReferenceType()) + MR = State->getSVal(DV.castAs()).getAsRegion(); + + bool Success = false; + bool IsAnyKnown = false; + for (QualType CastToTy: CastToTyVec) { + if (CastFromTy->isPointerType()) + CastToTy = C.getASTContext().getPointerType(CastToTy); + else if (CastFromTy->isReferenceType()) + CastToTy = alignReferenceTypes(CastToTy, CastFromTy, C.getASTContext()); + else + return; - bool CastSucceeds; - if (CastInfo) - CastSucceeds = IsInstanceOf && 
CastInfo->succeeds(); - else - CastSucceeds = IsInstanceOf || CastFromTy == CastToTy; + const DynamicCastInfo *CastInfo = + getDynamicCastInfo(State, MR, CastFromTy, CastToTy); - if (isInfeasibleCast(CastInfo, CastSucceeds)) { - C.generateSink(State, C.getPredecessor()); - return; + bool CastSucceeds; + if (CastInfo) + CastSucceeds = IsInstanceOf && CastInfo->succeeds(); + else + CastSucceeds = IsInstanceOf || CastFromTy == CastToTy; + + // Store the type and the cast information. + bool IsKnownCast = CastInfo || CastFromTy == CastToTy; + IsAnyKnown = IsAnyKnown || IsKnownCast; + ProgramStateRef NewState = State; + if (!IsKnownCast) + NewState = setDynamicTypeAndCastInfo(State, MR, CastFromTy, CastToTy, + IsInstanceOf); + + if (CastSucceeds) { + Success = true; + C.addTransition( + NewState->BindExpr(Call.getOriginExpr(), C.getLocationContext(), + C.getSValBuilder().makeTruthVal(true)), + getNoteTag(C, CastInfo, CastToTy, Call.getArgExpr(0), true, + IsKnownCast)); + if (IsKnownCast) + return; + } else if (CastInfo && CastInfo->succeeds()) { + C.generateSink(NewState, C.getPredecessor()); + return; + } } - // Store the type and the cast information. 
- bool IsKnownCast = CastInfo || CastFromTy == CastToTy; - if (!IsKnownCast) - State = setDynamicTypeAndCastInfo(State, MR, CastFromTy, CastToTy, - IsInstanceOf); - - C.addTransition( - State->BindExpr(Call.getOriginExpr(), C.getLocationContext(), - C.getSValBuilder().makeTruthVal(CastSucceeds)), - getNoteTag(C, CastInfo, CastToTy, Call.getArgExpr(0), CastSucceeds, - IsKnownCast)); + if (!Success) { + C.addTransition( + State->BindExpr(Call.getOriginExpr(), C.getLocationContext(), + C.getSValBuilder().makeTruthVal(false)), + getNoteTag(C, CastToTyVec, Call.getArgExpr(0), IsAnyKnown)); + } } //===----------------------------------------------------------------------===// @@ -402,8 +479,9 @@ bool CastValueChecker::evalCall(const CallEvent &Call, QualType ParamT = Call.parameters()[0]->getType(); QualType ResultT = Call.getResultType(); if (!(ParamT->isPointerType() && ResultT->isPointerType()) && - !(ParamT->isReferenceType() && ResultT->isReferenceType())) + !(ParamT->isReferenceType() && ResultT->isReferenceType())) { return false; + } DV = Call.getArgSVal(0).getAs(); break; diff --git a/clang/lib/StaticAnalyzer/Checkers/SmartPtrModeling.cpp b/clang/lib/StaticAnalyzer/Checkers/SmartPtrModeling.cpp index 0b084accbfbe8..c405ef12433a7 100644 --- a/clang/lib/StaticAnalyzer/Checkers/SmartPtrModeling.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/SmartPtrModeling.cpp @@ -37,7 +37,7 @@ namespace { class SmartPtrModeling : public Checker { - bool isNullAfterMoveMethod(const CallEvent &Call) const; + bool isAssignOpMethod(const CallEvent &Call) const; public: // Whether the checker should model for null dereferences of smart pointers. 
@@ -57,6 +57,7 @@ class SmartPtrModeling void handleRelease(const CallEvent &Call, CheckerContext &C) const; void handleSwap(const CallEvent &Call, CheckerContext &C) const; void handleGet(const CallEvent &Call, CheckerContext &C) const; + bool handleAssignOp(const CallEvent &Call, CheckerContext &C) const; using SmartPtrMethodHandlerFn = void (SmartPtrModeling::*)(const CallEvent &Call, CheckerContext &) const; @@ -123,7 +124,7 @@ static ProgramStateRef updateSwappedRegion(ProgramStateRef State, return State; } -bool SmartPtrModeling::isNullAfterMoveMethod(const CallEvent &Call) const { +bool SmartPtrModeling::isAssignOpMethod(const CallEvent &Call) const { // TODO: Update CallDescription to support anonymous calls? // TODO: Handle other methods, such as .get() or .release(). // But once we do, we'd need a visitor to explain null dereferences @@ -134,12 +135,11 @@ bool SmartPtrModeling::isNullAfterMoveMethod(const CallEvent &Call) const { bool SmartPtrModeling::evalCall(const CallEvent &Call, CheckerContext &C) const { - ProgramStateRef State = C.getState(); if (!smartptr::isStdSmartPtrCall(Call)) return false; - if (isNullAfterMoveMethod(Call)) { + if (isAssignOpMethod(Call)) { const MemRegion *ThisR = cast(&Call)->getCXXThisVal().getAsRegion(); @@ -206,6 +206,9 @@ bool SmartPtrModeling::evalCall(const CallEvent &Call, return true; } + if (handleAssignOp(Call, C)) + return true; + const SmartPtrMethodHandlerFn *Handler = SmartPtrMethodHandlers.lookup(Call); if (!Handler) return false; @@ -374,6 +377,87 @@ void SmartPtrModeling::handleGet(const CallEvent &Call, C.addTransition(State); } +bool SmartPtrModeling::handleAssignOp(const CallEvent &Call, + CheckerContext &C) const { + ProgramStateRef State = C.getState(); + const auto *OC = dyn_cast(&Call); + if (!OC) + return false; + OverloadedOperatorKind OOK = OC->getOverloadedOperator(); + if (OOK != OO_Equal) + return false; + const MemRegion *ThisRegion = OC->getCXXThisVal().getAsRegion(); + if (!ThisRegion) + 
return false; + + const MemRegion *OtherSmartPtrRegion = OC->getArgSVal(0).getAsRegion(); + // In case of 'nullptr' or '0' assigned + if (!OtherSmartPtrRegion) { + bool AssignedNull = Call.getArgSVal(0).isZeroConstant(); + if (!AssignedNull) + return false; + auto NullVal = C.getSValBuilder().makeNull(); + State = State->set(ThisRegion, NullVal); + C.addTransition(State, C.getNoteTag([ThisRegion](PathSensitiveBugReport &BR, + llvm::raw_ostream &OS) { + if (&BR.getBugType() != smartptr::getNullDereferenceBugType() || + !BR.isInteresting(ThisRegion)) + return; + OS << "Smart pointer "; + ThisRegion->printPretty(OS); + OS << " is assigned to null"; + })); + return true; + } + + const auto *OtherInnerPtr = State->get(OtherSmartPtrRegion); + if (OtherInnerPtr) { + State = State->set(ThisRegion, *OtherInnerPtr); + auto NullVal = C.getSValBuilder().makeNull(); + State = State->set(OtherSmartPtrRegion, NullVal); + bool IsArgValNull = OtherInnerPtr->isZeroConstant(); + + C.addTransition( + State, + C.getNoteTag([ThisRegion, OtherSmartPtrRegion, IsArgValNull]( + PathSensitiveBugReport &BR, llvm::raw_ostream &OS) { + if (&BR.getBugType() != smartptr::getNullDereferenceBugType()) + return; + if (BR.isInteresting(OtherSmartPtrRegion)) { + OS << "Smart pointer "; + OtherSmartPtrRegion->printPretty(OS); + OS << " is null after being moved to "; + ThisRegion->printPretty(OS); + } + if (BR.isInteresting(ThisRegion) && IsArgValNull) { + OS << "Null pointer value move-assigned to "; + ThisRegion->printPretty(OS); + BR.markInteresting(OtherSmartPtrRegion); + } + })); + return true; + } else { + // In case we dont know anything about value we are moving from + // remove the entry from map for which smart pointer got moved to. 
+ auto NullVal = C.getSValBuilder().makeNull(); + State = State->remove(ThisRegion); + State = State->set(OtherSmartPtrRegion, NullVal); + C.addTransition(State, C.getNoteTag([OtherSmartPtrRegion, + ThisRegion](PathSensitiveBugReport &BR, + llvm::raw_ostream &OS) { + if (&BR.getBugType() != smartptr::getNullDereferenceBugType() || + !BR.isInteresting(OtherSmartPtrRegion)) + return; + OS << "Smart pointer "; + OtherSmartPtrRegion->printPretty(OS); + OS << " is null after; previous value moved to "; + ThisRegion->printPretty(OS); + })); + return true; + } + return false; +} + void ento::registerSmartPtrModeling(CheckerManager &Mgr) { auto *Checker = Mgr.registerChecker(); Checker->ModelSmartPtrDereference = diff --git a/clang/lib/StaticAnalyzer/Core/AnalyzerOptions.cpp b/clang/lib/StaticAnalyzer/Core/AnalyzerOptions.cpp index 01ac2bc83bb6b..8cd7f75e4e389 100644 --- a/clang/lib/StaticAnalyzer/Core/AnalyzerOptions.cpp +++ b/clang/lib/StaticAnalyzer/Core/AnalyzerOptions.cpp @@ -40,7 +40,7 @@ void AnalyzerOptions::printFormattedEntry( const size_t PadForDesc = InitialPad + EntryWidth; FOut.PadToColumn(InitialPad) << EntryDescPair.first; - // If the buffer's length is greater then PadForDesc, print a newline. + // If the buffer's length is greater than PadForDesc, print a newline. if (FOut.getColumn() > PadForDesc) FOut << '\n'; diff --git a/clang/lib/StaticAnalyzer/Core/DynamicType.cpp b/clang/lib/StaticAnalyzer/Core/DynamicType.cpp index e9b64fd79614d..9ed915aafcab0 100644 --- a/clang/lib/StaticAnalyzer/Core/DynamicType.cpp +++ b/clang/lib/StaticAnalyzer/Core/DynamicType.cpp @@ -65,6 +65,13 @@ const DynamicTypeInfo *getRawDynamicTypeInfo(ProgramStateRef State, return State->get(MR); } +static void unbox(QualType &Ty) { + // FIXME: Why are we being fed references to pointers in the first place? 
+ while (Ty->isReferenceType() || Ty->isPointerType()) + Ty = Ty->getPointeeType(); + Ty = Ty.getCanonicalType().getUnqualifiedType(); +} + const DynamicCastInfo *getDynamicCastInfo(ProgramStateRef State, const MemRegion *MR, QualType CastFromTy, @@ -73,6 +80,9 @@ const DynamicCastInfo *getDynamicCastInfo(ProgramStateRef State, if (!Lookup) return nullptr; + unbox(CastFromTy); + unbox(CastToTy); + for (const DynamicCastInfo &Cast : *Lookup) if (Cast.equals(CastFromTy, CastToTy)) return &Cast; @@ -112,6 +122,9 @@ ProgramStateRef setDynamicTypeAndCastInfo(ProgramStateRef State, State = State->set(MR, CastToTy); } + unbox(CastFromTy); + unbox(CastToTy); + DynamicCastInfo::CastResult ResultKind = CastSucceeds ? DynamicCastInfo::CastResult::Success : DynamicCastInfo::CastResult::Failure; diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp index 27b3e7ddb44e2..a4b11b5e8a961 100644 --- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp @@ -3154,8 +3154,9 @@ void ExprEngine::ViewGraph(bool trim) { #ifndef NDEBUG std::string Filename = DumpGraph(trim); llvm::DisplayGraph(Filename, false, llvm::GraphProgram::DOT); -#endif +#else llvm::errs() << "Warning: viewing graph requires assertions" << "\n"; +#endif } @@ -3163,8 +3164,9 @@ void ExprEngine::ViewGraph(ArrayRef Nodes) { #ifndef NDEBUG std::string Filename = DumpGraph(Nodes); llvm::DisplayGraph(Filename, false, llvm::GraphProgram::DOT); -#endif +#else llvm::errs() << "Warning: viewing graph requires assertions" << "\n"; +#endif } std::string ExprEngine::DumpGraph(bool trim, StringRef Filename) { @@ -3201,15 +3203,17 @@ std::string ExprEngine::DumpGraph(ArrayRef Nodes, if (!TrimmedG.get()) { llvm::errs() << "warning: Trimmed ExplodedGraph is empty.\n"; + return ""; } else { return llvm::WriteGraph(TrimmedG.get(), "TrimmedExprEngine", /*ShortNames=*/false, /*Title=*/"Trimmed Exploded Graph", 
/*Filename=*/std::string(Filename)); } -#endif +#else llvm::errs() << "Warning: dumping graph requires assertions" << "\n"; return ""; +#endif } void *ProgramStateTrait::GDMIndex() { diff --git a/clang/lib/Tooling/Inclusions/HeaderIncludes.cpp b/clang/lib/Tooling/Inclusions/HeaderIncludes.cpp index b65d7f0c1a395..e0368975ea3ed 100644 --- a/clang/lib/Tooling/Inclusions/HeaderIncludes.cpp +++ b/clang/lib/Tooling/Inclusions/HeaderIncludes.cpp @@ -12,6 +12,7 @@ #include "clang/Lex/Lexer.h" #include "llvm/ADT/Optional.h" #include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/Path.h" namespace clang { namespace tooling { @@ -174,12 +175,22 @@ inline StringRef trimInclude(StringRef IncludeName) { const char IncludeRegexPattern[] = R"(^[\t\ ]*#[\t\ ]*(import|include)[^"<]*(["<][^">]*[">]))"; +// The filename of Path excluding extension. +// Used to match implementation with headers, this differs from sys::path::stem: +// - in names with multiple dots (foo.cu.cc) it terminates at the *first* +// - an empty stem is never returned: /foo/.bar.x => .bar +// - we don't bother to handle . and .. 
specially +StringRef matchingStem(llvm::StringRef Path) { + StringRef Name = llvm::sys::path::filename(Path); + return Name.substr(0, Name.find('.', 1)); +} + } // anonymous namespace IncludeCategoryManager::IncludeCategoryManager(const IncludeStyle &Style, StringRef FileName) : Style(Style), FileName(FileName) { - FileStem = llvm::sys::path::stem(FileName); + FileStem = matchingStem(FileName); for (const auto &Category : Style.IncludeCategories) CategoryRegexs.emplace_back(Category.Regex, llvm::Regex::IgnoreCase); IsMainFile = FileName.endswith(".c") || FileName.endswith(".cc") || @@ -222,8 +233,7 @@ int IncludeCategoryManager::getSortIncludePriority(StringRef IncludeName, bool IncludeCategoryManager::isMainHeader(StringRef IncludeName) const { if (!IncludeName.startswith("\"")) return false; - StringRef HeaderStem = - llvm::sys::path::stem(IncludeName.drop_front(1).drop_back(1)); + StringRef HeaderStem = matchingStem(IncludeName.drop_front(1).drop_back(1)); if (FileStem.startswith(HeaderStem) || FileStem.startswith_lower(HeaderStem)) { llvm::Regex MainIncludeRegex(HeaderStem.str() + Style.IncludeIsMainRegex, diff --git a/clang/lib/Tooling/Syntax/BuildTree.cpp b/clang/lib/Tooling/Syntax/BuildTree.cpp index 3ab52ce5b7b4b..b07e9c3faf9d5 100644 --- a/clang/lib/Tooling/Syntax/BuildTree.cpp +++ b/clang/lib/Tooling/Syntax/BuildTree.cpp @@ -184,10 +184,11 @@ static syntax::NodeKind getOperatorNodeKind(const CXXOperatorCallExpr &E) { case OO_Array_New: case OO_Array_Delete: case OO_Coawait: - case OO_Call: case OO_Subscript: case OO_Arrow: return syntax::NodeKind::UnknownExpression; + case OO_Call: + return syntax::NodeKind::CallExpression; case OO_Conditional: // not overloadable case NUM_OVERLOADED_OPERATORS: case OO_None: @@ -546,7 +547,7 @@ class syntax::TreeBuilder { R += std::string( formatv("- '{0}' covers '{1}'+{2} tokens\n", It->second->kind(), It->first->text(A.sourceManager()), CoveredTokens)); - R += It->second->dump(A); + R += 
It->second->dump(A.sourceManager()); } return R; } @@ -661,7 +662,7 @@ class BuildTreeVisitor : public RecursiveASTVisitor { Builder.markChildToken(S->getLBracLoc(), NodeRole::OpenParen); for (auto *Child : S->body()) - Builder.markStmtChild(Child, NodeRole::CompoundStatement_statement); + Builder.markStmtChild(Child, NodeRole::Statement); Builder.markChildToken(S->getRBracLoc(), NodeRole::CloseParen); Builder.foldNode(Builder.getStmtRange(S), @@ -819,7 +820,7 @@ class BuildTreeVisitor : public RecursiveASTVisitor { } syntax::NameSpecifier * - BuildNameSpecifier(const NestedNameSpecifierLoc &NNSLoc) { + buildNameSpecifier(const NestedNameSpecifierLoc &NNSLoc) { assert(NNSLoc.hasQualifier()); auto NameSpecifierTokens = Builder.getRange(getLocalSourceRange(NNSLoc)).drop_back(); @@ -869,11 +870,11 @@ class BuildTreeVisitor : public RecursiveASTVisitor { if (!QualifierLoc) return true; for (auto it = QualifierLoc; it; it = it.getPrefix()) { - auto *NS = BuildNameSpecifier(it); + auto *NS = buildNameSpecifier(it); if (!NS) return false; - Builder.markChild(NS, syntax::NodeRole::List_element); - Builder.markChildToken(it.getEndLoc(), syntax::NodeRole::List_delimiter); + Builder.markChild(NS, syntax::NodeRole::ListElement); + Builder.markChildToken(it.getEndLoc(), syntax::NodeRole::ListDelimiter); } Builder.foldNode(Builder.getRange(QualifierLoc.getSourceRange()), new (allocator()) syntax::NestedNameSpecifier, @@ -886,7 +887,7 @@ class BuildTreeVisitor : public RecursiveASTVisitor { SourceRange UnqualifiedIdLoc, ASTPtr From) { if (QualifierLoc) { - Builder.markChild(QualifierLoc, syntax::NodeRole::IdExpression_qualifier); + Builder.markChild(QualifierLoc, syntax::NodeRole::Qualifier); if (TemplateKeywordLoc.isValid()) Builder.markChildToken(TemplateKeywordLoc, syntax::NodeRole::TemplateKeyword); @@ -895,7 +896,7 @@ class BuildTreeVisitor : public RecursiveASTVisitor { auto *TheUnqualifiedId = new (allocator()) syntax::UnqualifiedId; 
Builder.foldNode(Builder.getRange(UnqualifiedIdLoc), TheUnqualifiedId, nullptr); - Builder.markChild(TheUnqualifiedId, syntax::NodeRole::IdExpression_id); + Builder.markChild(TheUnqualifiedId, syntax::NodeRole::UnqualifiedId); auto IdExpressionBeginLoc = QualifierLoc ? QualifierLoc.getBeginLoc() : UnqualifiedIdLoc.getBegin(); @@ -922,13 +923,10 @@ class BuildTreeVisitor : public RecursiveASTVisitor { S->getQualifierLoc(), S->getTemplateKeywordLoc(), SourceRange(S->getMemberLoc(), S->getEndLoc()), nullptr); - Builder.markChild(TheIdExpression, - syntax::NodeRole::MemberExpression_member); + Builder.markChild(TheIdExpression, syntax::NodeRole::Member); - Builder.markExprChild(S->getBase(), - syntax::NodeRole::MemberExpression_object); - Builder.markChildToken(S->getOperatorLoc(), - syntax::NodeRole::MemberExpression_accessToken); + Builder.markExprChild(S->getBase(), syntax::NodeRole::Object); + Builder.markChildToken(S->getOperatorLoc(), syntax::NodeRole::AccessToken); Builder.foldNode(Builder.getExprRange(S), new (allocator()) syntax::MemberExpression, S); @@ -962,8 +960,7 @@ class BuildTreeVisitor : public RecursiveASTVisitor { bool WalkUpFromParenExpr(ParenExpr *S) { Builder.markChildToken(S->getLParen(), syntax::NodeRole::OpenParen); - Builder.markExprChild(S->getSubExpr(), - syntax::NodeRole::ParenExpression_subExpression); + Builder.markExprChild(S->getSubExpr(), syntax::NodeRole::SubExpression); Builder.markChildToken(S->getRParen(), syntax::NodeRole::CloseParen); Builder.foldNode(Builder.getExprRange(S), new (allocator()) syntax::ParenExpression, S); @@ -1014,9 +1011,8 @@ class BuildTreeVisitor : public RecursiveASTVisitor { bool WalkUpFromUnaryOperator(UnaryOperator *S) { Builder.markChildToken(S->getOperatorLoc(), - syntax::NodeRole::OperatorExpression_operatorToken); - Builder.markExprChild(S->getSubExpr(), - syntax::NodeRole::UnaryOperatorExpression_operand); + syntax::NodeRole::OperatorToken); + Builder.markExprChild(S->getSubExpr(), 
syntax::NodeRole::Operand); if (S->isPostfix()) Builder.foldNode(Builder.getExprRange(S), @@ -1031,17 +1027,53 @@ class BuildTreeVisitor : public RecursiveASTVisitor { } bool WalkUpFromBinaryOperator(BinaryOperator *S) { - Builder.markExprChild( - S->getLHS(), syntax::NodeRole::BinaryOperatorExpression_leftHandSide); + Builder.markExprChild(S->getLHS(), syntax::NodeRole::LeftHandSide); Builder.markChildToken(S->getOperatorLoc(), - syntax::NodeRole::OperatorExpression_operatorToken); - Builder.markExprChild( - S->getRHS(), syntax::NodeRole::BinaryOperatorExpression_rightHandSide); + syntax::NodeRole::OperatorToken); + Builder.markExprChild(S->getRHS(), syntax::NodeRole::RightHandSide); Builder.foldNode(Builder.getExprRange(S), new (allocator()) syntax::BinaryOperatorExpression, S); return true; } + syntax::CallArguments *buildCallArguments(CallExpr::arg_range Args) { + for (const auto &Arg : Args) { + Builder.markExprChild(Arg, syntax::NodeRole::ListElement); + const auto *DelimiterToken = + std::next(Builder.findToken(Arg->getEndLoc())); + if (DelimiterToken->kind() == clang::tok::TokenKind::comma) + Builder.markChildToken(DelimiterToken, syntax::NodeRole::ListDelimiter); + } + + auto *Arguments = new (allocator()) syntax::CallArguments; + if (!Args.empty()) + Builder.foldNode(Builder.getRange((*Args.begin())->getBeginLoc(), + (*(Args.end() - 1))->getEndLoc()), + Arguments, nullptr); + + return Arguments; + } + + bool WalkUpFromCallExpr(CallExpr *S) { + Builder.markExprChild(S->getCallee(), syntax::NodeRole::Callee); + + const auto *LParenToken = + std::next(Builder.findToken(S->getCallee()->getEndLoc())); + // FIXME: Assert that `LParenToken` is indeed a `l_paren` once we have fixed + // the test on decltype desctructors. 
+ if (LParenToken->kind() == clang::tok::l_paren) + Builder.markChildToken(LParenToken, syntax::NodeRole::OpenParen); + + Builder.markChild(buildCallArguments(S->arguments()), + syntax::NodeRole::Arguments); + + Builder.markChildToken(S->getRParenLoc(), syntax::NodeRole::CloseParen); + + Builder.foldNode(Builder.getRange(S->getSourceRange()), + new (allocator()) syntax::CallExpression, S); + return true; + } + bool TraverseCXXOperatorCallExpr(CXXOperatorCallExpr *S) { // To construct a syntax tree of the same shape for calls to built-in and // user-defined operators, ignore the `DeclRefExpr` that refers to the @@ -1068,40 +1100,51 @@ class BuildTreeVisitor : public RecursiveASTVisitor { bool WalkUpFromCXXOperatorCallExpr(CXXOperatorCallExpr *S) { switch (getOperatorNodeKind(*S)) { case syntax::NodeKind::BinaryOperatorExpression: - Builder.markExprChild( - S->getArg(0), - syntax::NodeRole::BinaryOperatorExpression_leftHandSide); - Builder.markChildToken( - S->getOperatorLoc(), - syntax::NodeRole::OperatorExpression_operatorToken); - Builder.markExprChild( - S->getArg(1), - syntax::NodeRole::BinaryOperatorExpression_rightHandSide); + Builder.markExprChild(S->getArg(0), syntax::NodeRole::LeftHandSide); + Builder.markChildToken(S->getOperatorLoc(), + syntax::NodeRole::OperatorToken); + Builder.markExprChild(S->getArg(1), syntax::NodeRole::RightHandSide); Builder.foldNode(Builder.getExprRange(S), new (allocator()) syntax::BinaryOperatorExpression, S); return true; case syntax::NodeKind::PrefixUnaryOperatorExpression: - Builder.markChildToken( - S->getOperatorLoc(), - syntax::NodeRole::OperatorExpression_operatorToken); - Builder.markExprChild(S->getArg(0), - syntax::NodeRole::UnaryOperatorExpression_operand); + Builder.markChildToken(S->getOperatorLoc(), + syntax::NodeRole::OperatorToken); + Builder.markExprChild(S->getArg(0), syntax::NodeRole::Operand); Builder.foldNode(Builder.getExprRange(S), new (allocator()) syntax::PrefixUnaryOperatorExpression, S); return true; 
case syntax::NodeKind::PostfixUnaryOperatorExpression: - Builder.markChildToken( - S->getOperatorLoc(), - syntax::NodeRole::OperatorExpression_operatorToken); - Builder.markExprChild(S->getArg(0), - syntax::NodeRole::UnaryOperatorExpression_operand); + Builder.markChildToken(S->getOperatorLoc(), + syntax::NodeRole::OperatorToken); + Builder.markExprChild(S->getArg(0), syntax::NodeRole::Operand); Builder.foldNode(Builder.getExprRange(S), new (allocator()) syntax::PostfixUnaryOperatorExpression, S); return true; + case syntax::NodeKind::CallExpression: { + Builder.markExprChild(S->getArg(0), syntax::NodeRole::Callee); + + const auto *LParenToken = + std::next(Builder.findToken(S->getArg(0)->getEndLoc())); + // FIXME: Assert that `LParenToken` is indeed a `l_paren` once we have + // fixed the test on decltype desctructors. + if (LParenToken->kind() == clang::tok::l_paren) + Builder.markChildToken(LParenToken, syntax::NodeRole::OpenParen); + + Builder.markChild(buildCallArguments(CallExpr::arg_range( + S->arg_begin() + 1, S->arg_end())), + syntax::NodeRole::Arguments); + + Builder.markChildToken(S->getRParenLoc(), syntax::NodeRole::CloseParen); + + Builder.foldNode(Builder.getRange(S->getSourceRange()), + new (allocator()) syntax::CallExpression, S); + return true; + } case syntax::NodeKind::UnknownExpression: - return RecursiveASTVisitor::WalkUpFromCXXOperatorCallExpr(S); + return WalkUpFromExpr(S); default: llvm_unreachable("getOperatorNodeKind() does not return this value"); } @@ -1139,19 +1182,35 @@ class BuildTreeVisitor : public RecursiveASTVisitor { // Declarator chunks, they are produced by type locs and some clang::Decls. 
bool WalkUpFromArrayTypeLoc(ArrayTypeLoc L) { Builder.markChildToken(L.getLBracketLoc(), syntax::NodeRole::OpenParen); - Builder.markExprChild(L.getSizeExpr(), - syntax::NodeRole::ArraySubscript_sizeExpression); + Builder.markExprChild(L.getSizeExpr(), syntax::NodeRole::Size); Builder.markChildToken(L.getRBracketLoc(), syntax::NodeRole::CloseParen); Builder.foldNode(Builder.getRange(L.getLBracketLoc(), L.getRBracketLoc()), new (allocator()) syntax::ArraySubscript, L); return true; } + syntax::ParameterDeclarationList * + buildParameterDeclarationList(ArrayRef Params) { + for (auto *P : Params) { + Builder.markChild(P, syntax::NodeRole::ListElement); + const auto *DelimiterToken = std::next(Builder.findToken(P->getEndLoc())); + if (DelimiterToken->kind() == clang::tok::TokenKind::comma) + Builder.markChildToken(DelimiterToken, syntax::NodeRole::ListDelimiter); + } + auto *Parameters = new (allocator()) syntax::ParameterDeclarationList; + if (!Params.empty()) + Builder.foldNode(Builder.getRange(Params.front()->getBeginLoc(), + Params.back()->getEndLoc()), + Parameters, nullptr); + return Parameters; + } + bool WalkUpFromFunctionTypeLoc(FunctionTypeLoc L) { Builder.markChildToken(L.getLParenLoc(), syntax::NodeRole::OpenParen); - for (auto *P : L.getParams()) { - Builder.markChild(P, syntax::NodeRole::ParametersAndQualifiers_parameter); - } + + Builder.markChild(buildParameterDeclarationList(L.getParams()), + syntax::NodeRole::Parameters); + Builder.markChildToken(L.getRParenLoc(), syntax::NodeRole::CloseParen); Builder.foldNode(Builder.getRange(L.getLParenLoc(), L.getEndLoc()), new (allocator()) syntax::ParametersAndQualifiers, L); @@ -1162,10 +1221,9 @@ class BuildTreeVisitor : public RecursiveASTVisitor { if (!L.getTypePtr()->hasTrailingReturn()) return WalkUpFromFunctionTypeLoc(L); - auto *TrailingReturnTokens = BuildTrailingReturn(L); + auto *TrailingReturnTokens = buildTrailingReturn(L); // Finish building the node for parameters. 
- Builder.markChild(TrailingReturnTokens, - syntax::NodeRole::ParametersAndQualifiers_trailingReturn); + Builder.markChild(TrailingReturnTokens, syntax::NodeRole::TrailingReturn); return WalkUpFromFunctionTypeLoc(L); } @@ -1213,7 +1271,7 @@ class BuildTreeVisitor : public RecursiveASTVisitor { bool WalkUpFromCaseStmt(CaseStmt *S) { Builder.markChildToken(S->getKeywordLoc(), syntax::NodeRole::IntroducerKeyword); - Builder.markExprChild(S->getLHS(), syntax::NodeRole::CaseStatement_value); + Builder.markExprChild(S->getLHS(), syntax::NodeRole::CaseValue); Builder.markStmtChild(S->getSubStmt(), syntax::NodeRole::BodyStatement); Builder.foldNode(Builder.getStmtRange(S), new (allocator()) syntax::CaseStatement, S); @@ -1231,12 +1289,9 @@ class BuildTreeVisitor : public RecursiveASTVisitor { bool WalkUpFromIfStmt(IfStmt *S) { Builder.markChildToken(S->getIfLoc(), syntax::NodeRole::IntroducerKeyword); - Builder.markStmtChild(S->getThen(), - syntax::NodeRole::IfStatement_thenStatement); - Builder.markChildToken(S->getElseLoc(), - syntax::NodeRole::IfStatement_elseKeyword); - Builder.markStmtChild(S->getElse(), - syntax::NodeRole::IfStatement_elseStatement); + Builder.markStmtChild(S->getThen(), syntax::NodeRole::ThenStatement); + Builder.markChildToken(S->getElseLoc(), syntax::NodeRole::ElseKeyword); + Builder.markStmtChild(S->getElse(), syntax::NodeRole::ElseStatement); Builder.foldNode(Builder.getStmtRange(S), new (allocator()) syntax::IfStatement, S); return true; @@ -1278,8 +1333,7 @@ class BuildTreeVisitor : public RecursiveASTVisitor { bool WalkUpFromReturnStmt(ReturnStmt *S) { Builder.markChildToken(S->getReturnLoc(), syntax::NodeRole::IntroducerKeyword); - Builder.markExprChild(S->getRetValue(), - syntax::NodeRole::ReturnStatement_value); + Builder.markExprChild(S->getRetValue(), syntax::NodeRole::ReturnValue); Builder.foldNode(Builder.getStmtRange(S), new (allocator()) syntax::ReturnStatement, S); return true; @@ -1300,10 +1354,8 @@ class BuildTreeVisitor : public 
RecursiveASTVisitor { } bool WalkUpFromStaticAssertDecl(StaticAssertDecl *S) { - Builder.markExprChild(S->getAssertExpr(), - syntax::NodeRole::StaticAssertDeclaration_condition); - Builder.markExprChild(S->getMessage(), - syntax::NodeRole::StaticAssertDeclaration_message); + Builder.markExprChild(S->getAssertExpr(), syntax::NodeRole::Condition); + Builder.markExprChild(S->getMessage(), syntax::NodeRole::Message); Builder.foldNode(Builder.getDeclarationRange(S), new (allocator()) syntax::StaticAssertDeclaration, S); return true; @@ -1396,7 +1448,7 @@ class BuildTreeVisitor : public RecursiveASTVisitor { if (Range.getBegin().isValid()) { auto *N = new (allocator()) syntax::SimpleDeclarator; Builder.foldNode(Builder.getRange(Range), N, nullptr); - Builder.markChild(N, syntax::NodeRole::SimpleDeclaration_declarator); + Builder.markChild(N, syntax::NodeRole::Declarator); } if (Builder.isResponsibleForCreatingDeclaration(D)) { @@ -1407,7 +1459,7 @@ class BuildTreeVisitor : public RecursiveASTVisitor { } /// Returns the range of the built node. 
- syntax::TrailingReturnType *BuildTrailingReturn(FunctionProtoTypeLoc L) { + syntax::TrailingReturnType *buildTrailingReturn(FunctionProtoTypeLoc L) { assert(L.getTypePtr()->hasTrailingReturn()); auto ReturnedType = L.getReturnLoc(); @@ -1430,8 +1482,7 @@ class BuildTreeVisitor : public RecursiveASTVisitor { auto Tokens = llvm::makeArrayRef(Arrow, Return.end()); Builder.markChildToken(Arrow, syntax::NodeRole::ArrowToken); if (ReturnDeclarator) - Builder.markChild(ReturnDeclarator, - syntax::NodeRole::TrailingReturnType_declarator); + Builder.markChild(ReturnDeclarator, syntax::NodeRole::Declarator); auto *R = new (allocator()) syntax::TrailingReturnType; Builder.foldNode(Tokens, R, L); return R; @@ -1445,9 +1496,7 @@ class BuildTreeVisitor : public RecursiveASTVisitor { assert(TemplateKW && TemplateKW->kind() == tok::kw_template); Builder.markChildToken(ExternKW, syntax::NodeRole::ExternKeyword); Builder.markChildToken(TemplateKW, syntax::NodeRole::IntroducerKeyword); - Builder.markChild( - InnerDeclaration, - syntax::NodeRole::ExplicitTemplateInstantiation_declaration); + Builder.markChild(InnerDeclaration, syntax::NodeRole::Declaration); Builder.foldNode( Range, new (allocator()) syntax::ExplicitTemplateInstantiation, From); } @@ -1460,7 +1509,7 @@ class BuildTreeVisitor : public RecursiveASTVisitor { auto *N = new (allocator()) syntax::TemplateDeclaration; Builder.foldNode(Range, N, From); - Builder.markChild(N, syntax::NodeRole::TemplateDeclaration_declaration); + Builder.markChild(N, syntax::NodeRole::Declaration); return N; } @@ -1512,7 +1561,7 @@ void syntax::TreeBuilder::markStmtChild(Stmt *Child, NodeRole Role) { if (Expr *ChildExpr = dyn_cast(Child)) { // This is an expression in a statement position, consume the trailing // semicolon and form an 'ExpressionStatement' node. 
- markExprChild(ChildExpr, NodeRole::ExpressionStatement_expression); + markExprChild(ChildExpr, NodeRole::Expression); ChildNode = new (allocator()) syntax::ExpressionStatement; // (!) 'getStmtRange()' ensures this covers a trailing semicolon. Pending.foldChildren(Arena, getStmtRange(Child), ChildNode); diff --git a/clang/lib/Tooling/Syntax/Nodes.cpp b/clang/lib/Tooling/Syntax/Nodes.cpp index ecd4c4dac7288..6102c45a08e4d 100644 --- a/clang/lib/Tooling/Syntax/Nodes.cpp +++ b/clang/lib/Tooling/Syntax/Nodes.cpp @@ -52,6 +52,8 @@ raw_ostream &syntax::operator<<(raw_ostream &OS, NodeKind K) { return OS << "UnqualifiedId"; case NodeKind::IdExpression: return OS << "IdExpression"; + case NodeKind::CallExpression: + return OS << "CallExpression"; case NodeKind::UnknownStatement: return OS << "UnknownStatement"; case NodeKind::DeclarationStatement: @@ -130,6 +132,10 @@ raw_ostream &syntax::operator<<(raw_ostream &OS, NodeKind K) { return OS << "NestedNameSpecifier"; case NodeKind::MemberExpression: return OS << "MemberExpression"; + case NodeKind::CallArguments: + return OS << "CallArguments"; + case NodeKind::ParameterDeclarationList: + return OS << "ParameterDeclarationList"; } llvm_unreachable("unknown node kind"); } @@ -156,69 +162,70 @@ raw_ostream &syntax::operator<<(raw_ostream &OS, NodeRole R) { return OS << "TemplateKeyword"; case syntax::NodeRole::BodyStatement: return OS << "BodyStatement"; - case syntax::NodeRole::List_element: - return OS << "List_element"; - case syntax::NodeRole::List_delimiter: - return OS << "List_delimiter"; - case syntax::NodeRole::CaseStatement_value: - return OS << "CaseStatement_value"; - case syntax::NodeRole::IfStatement_thenStatement: - return OS << "IfStatement_thenStatement"; - case syntax::NodeRole::IfStatement_elseKeyword: - return OS << "IfStatement_elseKeyword"; - case syntax::NodeRole::IfStatement_elseStatement: - return OS << "IfStatement_elseStatement"; - case syntax::NodeRole::OperatorExpression_operatorToken: - return OS 
<< "OperatorExpression_operatorToken"; - case syntax::NodeRole::UnaryOperatorExpression_operand: - return OS << "UnaryOperatorExpression_operand"; - case syntax::NodeRole::BinaryOperatorExpression_leftHandSide: - return OS << "BinaryOperatorExpression_leftHandSide"; - case syntax::NodeRole::BinaryOperatorExpression_rightHandSide: - return OS << "BinaryOperatorExpression_rightHandSide"; - case syntax::NodeRole::ReturnStatement_value: - return OS << "ReturnStatement_value"; - case syntax::NodeRole::ExpressionStatement_expression: - return OS << "ExpressionStatement_expression"; - case syntax::NodeRole::CompoundStatement_statement: - return OS << "CompoundStatement_statement"; - case syntax::NodeRole::StaticAssertDeclaration_condition: - return OS << "StaticAssertDeclaration_condition"; - case syntax::NodeRole::StaticAssertDeclaration_message: - return OS << "StaticAssertDeclaration_message"; - case syntax::NodeRole::SimpleDeclaration_declarator: - return OS << "SimpleDeclaration_declarator"; - case syntax::NodeRole::TemplateDeclaration_declaration: - return OS << "TemplateDeclaration_declaration"; - case syntax::NodeRole::ExplicitTemplateInstantiation_declaration: - return OS << "ExplicitTemplateInstantiation_declaration"; - case syntax::NodeRole::ArraySubscript_sizeExpression: - return OS << "ArraySubscript_sizeExpression"; - case syntax::NodeRole::TrailingReturnType_declarator: - return OS << "TrailingReturnType_declarator"; - case syntax::NodeRole::ParametersAndQualifiers_parameter: - return OS << "ParametersAndQualifiers_parameter"; - case syntax::NodeRole::ParametersAndQualifiers_trailingReturn: - return OS << "ParametersAndQualifiers_trailingReturn"; - case syntax::NodeRole::IdExpression_id: - return OS << "IdExpression_id"; - case syntax::NodeRole::IdExpression_qualifier: - return OS << "IdExpression_qualifier"; - case syntax::NodeRole::ParenExpression_subExpression: - return OS << "ParenExpression_subExpression"; - case 
syntax::NodeRole::MemberExpression_object: - return OS << "MemberExpression_object"; - case syntax::NodeRole::MemberExpression_accessToken: - return OS << "MemberExpression_accessToken"; - case syntax::NodeRole::MemberExpression_member: - return OS << "MemberExpression_member"; + case syntax::NodeRole::ListElement: + return OS << "ListElement"; + case syntax::NodeRole::ListDelimiter: + return OS << "ListDelimiter"; + case syntax::NodeRole::CaseValue: + return OS << "CaseValue"; + case syntax::NodeRole::ReturnValue: + return OS << "ReturnValue"; + case syntax::NodeRole::ThenStatement: + return OS << "ThenStatement"; + case syntax::NodeRole::ElseKeyword: + return OS << "ElseKeyword"; + case syntax::NodeRole::ElseStatement: + return OS << "ElseStatement"; + case syntax::NodeRole::OperatorToken: + return OS << "OperatorToken"; + case syntax::NodeRole::Operand: + return OS << "Operand"; + case syntax::NodeRole::LeftHandSide: + return OS << "LeftHandSide"; + case syntax::NodeRole::RightHandSide: + return OS << "RightHandSide"; + case syntax::NodeRole::Expression: + return OS << "Expression"; + case syntax::NodeRole::Statement: + return OS << "Statement"; + case syntax::NodeRole::Condition: + return OS << "Condition"; + case syntax::NodeRole::Message: + return OS << "Message"; + case syntax::NodeRole::Declarator: + return OS << "Declarator"; + case syntax::NodeRole::Declaration: + return OS << "Declaration"; + case syntax::NodeRole::Size: + return OS << "Size"; + case syntax::NodeRole::Parameters: + return OS << "Parameters"; + case syntax::NodeRole::TrailingReturn: + return OS << "TrailingReturn"; + case syntax::NodeRole::UnqualifiedId: + return OS << "UnqualifiedId"; + case syntax::NodeRole::Qualifier: + return OS << "Qualifier"; + case syntax::NodeRole::SubExpression: + return OS << "SubExpression"; + case syntax::NodeRole::Object: + return OS << "Object"; + case syntax::NodeRole::AccessToken: + return OS << "AccessToken"; + case syntax::NodeRole::Member: + return OS 
<< "Member"; + case syntax::NodeRole::Callee: + return OS << "Callee"; + case syntax::NodeRole::Arguments: + return OS << "Arguments"; } llvm_unreachable("invalid role"); } // We could have an interator in list to not pay memory costs of temporary // vector -std::vector syntax::NestedNameSpecifier::specifiers() { +std::vector +syntax::NestedNameSpecifier::getSpecifiers() { auto specifiersAsNodes = getElementsAsNodes(); std::vector Children; for (const auto &element : specifiersAsNodes) { @@ -228,7 +235,7 @@ std::vector syntax::NestedNameSpecifier::specifiers() { } std::vector> -syntax::NestedNameSpecifier::specifiersAndDoubleColons() { +syntax::NestedNameSpecifier::getSpecifiersAndDoubleColons() { auto specifiersAsNodesAndDoubleColons = getElementsAsNodesAndDelimiters(); std::vector> Children; @@ -240,308 +247,359 @@ syntax::NestedNameSpecifier::specifiersAndDoubleColons() { return Children; } -syntax::Expression *syntax::MemberExpression::object() { - return cast_or_null( - findChild(syntax::NodeRole::MemberExpression_object)); +std::vector syntax::CallArguments::getArguments() { + auto ArgumentsAsNodes = getElementsAsNodes(); + std::vector Children; + for (const auto &ArgumentAsNode : ArgumentsAsNodes) { + Children.push_back(llvm::cast(ArgumentAsNode)); + } + return Children; +} + +std::vector> +syntax::CallArguments::getArgumentsAndCommas() { + auto ArgumentsAsNodesAndCommas = getElementsAsNodesAndDelimiters(); + std::vector> Children; + for (const auto &ArgumentAsNodeAndComma : ArgumentsAsNodesAndCommas) { + Children.push_back( + {llvm::cast(ArgumentAsNodeAndComma.element), + ArgumentAsNodeAndComma.delimiter}); + } + return Children; +} + +std::vector +syntax::ParameterDeclarationList::getParameterDeclarations() { + auto ParametersAsNodes = getElementsAsNodes(); + std::vector Children; + for (const auto &ParameterAsNode : ParametersAsNodes) { + Children.push_back(llvm::cast(ParameterAsNode)); + } + return Children; +} + +std::vector> 
+syntax::ParameterDeclarationList::getParametersAndCommas() { + auto ParametersAsNodesAndCommas = getElementsAsNodesAndDelimiters(); + std::vector> + Children; + for (const auto &ParameterAsNodeAndComma : ParametersAsNodesAndCommas) { + Children.push_back( + {llvm::cast(ParameterAsNodeAndComma.element), + ParameterAsNodeAndComma.delimiter}); + } + return Children; +} + +syntax::Expression *syntax::MemberExpression::getObject() { + return cast_or_null(findChild(syntax::NodeRole::Object)); } -syntax::Leaf *syntax::MemberExpression::templateKeyword() { +syntax::Leaf *syntax::MemberExpression::getTemplateKeyword() { return llvm::cast_or_null( findChild(syntax::NodeRole::TemplateKeyword)); } -syntax::Leaf *syntax::MemberExpression::accessToken() { +syntax::Leaf *syntax::MemberExpression::getAccessToken() { return llvm::cast_or_null( - findChild(syntax::NodeRole::MemberExpression_accessToken)); + findChild(syntax::NodeRole::AccessToken)); } -syntax::IdExpression *syntax::MemberExpression::member() { +syntax::IdExpression *syntax::MemberExpression::getMember() { return cast_or_null( - findChild(syntax::NodeRole::MemberExpression_member)); + findChild(syntax::NodeRole::Member)); } -syntax::NestedNameSpecifier *syntax::IdExpression::qualifier() { +syntax::NestedNameSpecifier *syntax::IdExpression::getQualifier() { return cast_or_null( - findChild(syntax::NodeRole::IdExpression_qualifier)); + findChild(syntax::NodeRole::Qualifier)); } -syntax::Leaf *syntax::IdExpression::templateKeyword() { +syntax::Leaf *syntax::IdExpression::getTemplateKeyword() { return llvm::cast_or_null( findChild(syntax::NodeRole::TemplateKeyword)); } -syntax::UnqualifiedId *syntax::IdExpression::unqualifiedId() { +syntax::UnqualifiedId *syntax::IdExpression::getUnqualifiedId() { return cast_or_null( - findChild(syntax::NodeRole::IdExpression_id)); + findChild(syntax::NodeRole::UnqualifiedId)); } -syntax::Leaf *syntax::ParenExpression::openParen() { +syntax::Leaf 
*syntax::ParenExpression::getOpenParen() { return cast_or_null(findChild(syntax::NodeRole::OpenParen)); } -syntax::Expression *syntax::ParenExpression::subExpression() { +syntax::Expression *syntax::ParenExpression::getSubExpression() { return cast_or_null( - findChild(syntax::NodeRole::ParenExpression_subExpression)); + findChild(syntax::NodeRole::SubExpression)); } -syntax::Leaf *syntax::ParenExpression::closeParen() { +syntax::Leaf *syntax::ParenExpression::getCloseParen() { return cast_or_null(findChild(syntax::NodeRole::CloseParen)); } -syntax::Leaf *syntax::ThisExpression::thisKeyword() { +syntax::Leaf *syntax::ThisExpression::getThisKeyword() { return cast_or_null( findChild(syntax::NodeRole::IntroducerKeyword)); } -syntax::Leaf *syntax::LiteralExpression::literalToken() { +syntax::Leaf *syntax::LiteralExpression::getLiteralToken() { return cast_or_null(findChild(syntax::NodeRole::LiteralToken)); } -syntax::Expression *syntax::BinaryOperatorExpression::lhs() { +syntax::Expression *syntax::BinaryOperatorExpression::getLhs() { return cast_or_null( - findChild(syntax::NodeRole::BinaryOperatorExpression_leftHandSide)); + findChild(syntax::NodeRole::LeftHandSide)); } -syntax::Leaf *syntax::UnaryOperatorExpression::operatorToken() { - return cast_or_null( - findChild(syntax::NodeRole::OperatorExpression_operatorToken)); +syntax::Leaf *syntax::UnaryOperatorExpression::getOperatorToken() { + return cast_or_null(findChild(syntax::NodeRole::OperatorToken)); } -syntax::Expression *syntax::UnaryOperatorExpression::operand() { - return cast_or_null( - findChild(syntax::NodeRole::UnaryOperatorExpression_operand)); +syntax::Expression *syntax::UnaryOperatorExpression::getOperand() { + return cast_or_null(findChild(syntax::NodeRole::Operand)); } -syntax::Leaf *syntax::BinaryOperatorExpression::operatorToken() { - return cast_or_null( - findChild(syntax::NodeRole::OperatorExpression_operatorToken)); +syntax::Leaf *syntax::BinaryOperatorExpression::getOperatorToken() { + 
return cast_or_null(findChild(syntax::NodeRole::OperatorToken)); } -syntax::Expression *syntax::BinaryOperatorExpression::rhs() { +syntax::Expression *syntax::BinaryOperatorExpression::getRhs() { return cast_or_null( - findChild(syntax::NodeRole::BinaryOperatorExpression_rightHandSide)); + findChild(syntax::NodeRole::RightHandSide)); +} + +syntax::Expression *syntax::CallExpression::getCallee() { + return cast_or_null(findChild(syntax::NodeRole::Callee)); +} + +syntax::Leaf *syntax::CallExpression::getOpenParen() { + return cast_or_null(findChild(syntax::NodeRole::OpenParen)); +} + +syntax::CallArguments *syntax::CallExpression::getArguments() { + return cast_or_null( + findChild(syntax::NodeRole::Arguments)); } -syntax::Leaf *syntax::SwitchStatement::switchKeyword() { +syntax::Leaf *syntax::CallExpression::getCloseParen() { + return cast_or_null(findChild(syntax::NodeRole::CloseParen)); +} + +syntax::Leaf *syntax::SwitchStatement::getSwitchKeyword() { return cast_or_null( findChild(syntax::NodeRole::IntroducerKeyword)); } -syntax::Statement *syntax::SwitchStatement::body() { +syntax::Statement *syntax::SwitchStatement::getBody() { return cast_or_null( findChild(syntax::NodeRole::BodyStatement)); } -syntax::Leaf *syntax::CaseStatement::caseKeyword() { +syntax::Leaf *syntax::CaseStatement::getCaseKeyword() { return cast_or_null( findChild(syntax::NodeRole::IntroducerKeyword)); } -syntax::Expression *syntax::CaseStatement::value() { +syntax::Expression *syntax::CaseStatement::getCaseValue() { return cast_or_null( - findChild(syntax::NodeRole::CaseStatement_value)); + findChild(syntax::NodeRole::CaseValue)); } -syntax::Statement *syntax::CaseStatement::body() { +syntax::Statement *syntax::CaseStatement::getBody() { return cast_or_null( findChild(syntax::NodeRole::BodyStatement)); } -syntax::Leaf *syntax::DefaultStatement::defaultKeyword() { +syntax::Leaf *syntax::DefaultStatement::getDefaultKeyword() { return cast_or_null( 
findChild(syntax::NodeRole::IntroducerKeyword)); } -syntax::Statement *syntax::DefaultStatement::body() { +syntax::Statement *syntax::DefaultStatement::getBody() { return cast_or_null( findChild(syntax::NodeRole::BodyStatement)); } -syntax::Leaf *syntax::IfStatement::ifKeyword() { +syntax::Leaf *syntax::IfStatement::getIfKeyword() { return cast_or_null( findChild(syntax::NodeRole::IntroducerKeyword)); } -syntax::Statement *syntax::IfStatement::thenStatement() { +syntax::Statement *syntax::IfStatement::getThenStatement() { return cast_or_null( - findChild(syntax::NodeRole::IfStatement_thenStatement)); + findChild(syntax::NodeRole::ThenStatement)); } -syntax::Leaf *syntax::IfStatement::elseKeyword() { - return cast_or_null( - findChild(syntax::NodeRole::IfStatement_elseKeyword)); +syntax::Leaf *syntax::IfStatement::getElseKeyword() { + return cast_or_null(findChild(syntax::NodeRole::ElseKeyword)); } -syntax::Statement *syntax::IfStatement::elseStatement() { +syntax::Statement *syntax::IfStatement::getElseStatement() { return cast_or_null( - findChild(syntax::NodeRole::IfStatement_elseStatement)); + findChild(syntax::NodeRole::ElseStatement)); } -syntax::Leaf *syntax::ForStatement::forKeyword() { +syntax::Leaf *syntax::ForStatement::getForKeyword() { return cast_or_null( findChild(syntax::NodeRole::IntroducerKeyword)); } -syntax::Statement *syntax::ForStatement::body() { +syntax::Statement *syntax::ForStatement::getBody() { return cast_or_null( findChild(syntax::NodeRole::BodyStatement)); } -syntax::Leaf *syntax::WhileStatement::whileKeyword() { +syntax::Leaf *syntax::WhileStatement::getWhileKeyword() { return cast_or_null( findChild(syntax::NodeRole::IntroducerKeyword)); } -syntax::Statement *syntax::WhileStatement::body() { +syntax::Statement *syntax::WhileStatement::getBody() { return cast_or_null( findChild(syntax::NodeRole::BodyStatement)); } -syntax::Leaf *syntax::ContinueStatement::continueKeyword() { +syntax::Leaf 
*syntax::ContinueStatement::getContinueKeyword() { return cast_or_null( findChild(syntax::NodeRole::IntroducerKeyword)); } -syntax::Leaf *syntax::BreakStatement::breakKeyword() { +syntax::Leaf *syntax::BreakStatement::getBreakKeyword() { return cast_or_null( findChild(syntax::NodeRole::IntroducerKeyword)); } -syntax::Leaf *syntax::ReturnStatement::returnKeyword() { +syntax::Leaf *syntax::ReturnStatement::getReturnKeyword() { return cast_or_null( findChild(syntax::NodeRole::IntroducerKeyword)); } -syntax::Expression *syntax::ReturnStatement::value() { +syntax::Expression *syntax::ReturnStatement::getReturnValue() { return cast_or_null( - findChild(syntax::NodeRole::ReturnStatement_value)); + findChild(syntax::NodeRole::ReturnValue)); } -syntax::Leaf *syntax::RangeBasedForStatement::forKeyword() { +syntax::Leaf *syntax::RangeBasedForStatement::getForKeyword() { return cast_or_null( findChild(syntax::NodeRole::IntroducerKeyword)); } -syntax::Statement *syntax::RangeBasedForStatement::body() { +syntax::Statement *syntax::RangeBasedForStatement::getBody() { return cast_or_null( findChild(syntax::NodeRole::BodyStatement)); } -syntax::Expression *syntax::ExpressionStatement::expression() { +syntax::Expression *syntax::ExpressionStatement::getExpression() { return cast_or_null( - findChild(syntax::NodeRole::ExpressionStatement_expression)); + findChild(syntax::NodeRole::Expression)); } -syntax::Leaf *syntax::CompoundStatement::lbrace() { +syntax::Leaf *syntax::CompoundStatement::getLbrace() { return cast_or_null(findChild(syntax::NodeRole::OpenParen)); } -std::vector syntax::CompoundStatement::statements() { +std::vector syntax::CompoundStatement::getStatements() { std::vector Children; for (auto *C = firstChild(); C; C = C->nextSibling()) { - assert(C->role() == syntax::NodeRole::CompoundStatement_statement); + assert(C->role() == syntax::NodeRole::Statement); Children.push_back(cast(C)); } return Children; } -syntax::Leaf *syntax::CompoundStatement::rbrace() { 
+syntax::Leaf *syntax::CompoundStatement::getRbrace() { return cast_or_null(findChild(syntax::NodeRole::CloseParen)); } -syntax::Expression *syntax::StaticAssertDeclaration::condition() { +syntax::Expression *syntax::StaticAssertDeclaration::getCondition() { return cast_or_null( - findChild(syntax::NodeRole::StaticAssertDeclaration_condition)); + findChild(syntax::NodeRole::Condition)); } -syntax::Expression *syntax::StaticAssertDeclaration::message() { - return cast_or_null( - findChild(syntax::NodeRole::StaticAssertDeclaration_message)); +syntax::Expression *syntax::StaticAssertDeclaration::getMessage() { + return cast_or_null(findChild(syntax::NodeRole::Message)); } std::vector -syntax::SimpleDeclaration::declarators() { +syntax::SimpleDeclaration::getDeclarators() { std::vector Children; for (auto *C = firstChild(); C; C = C->nextSibling()) { - if (C->role() == syntax::NodeRole::SimpleDeclaration_declarator) + if (C->role() == syntax::NodeRole::Declarator) Children.push_back(cast(C)); } return Children; } -syntax::Leaf *syntax::TemplateDeclaration::templateKeyword() { +syntax::Leaf *syntax::TemplateDeclaration::getTemplateKeyword() { return cast_or_null( findChild(syntax::NodeRole::IntroducerKeyword)); } -syntax::Declaration *syntax::TemplateDeclaration::declaration() { +syntax::Declaration *syntax::TemplateDeclaration::getDeclaration() { return cast_or_null( - findChild(syntax::NodeRole::TemplateDeclaration_declaration)); + findChild(syntax::NodeRole::Declaration)); } -syntax::Leaf *syntax::ExplicitTemplateInstantiation::templateKeyword() { +syntax::Leaf *syntax::ExplicitTemplateInstantiation::getTemplateKeyword() { return cast_or_null( findChild(syntax::NodeRole::IntroducerKeyword)); } -syntax::Leaf *syntax::ExplicitTemplateInstantiation::externKeyword() { +syntax::Leaf *syntax::ExplicitTemplateInstantiation::getExternKeyword() { return cast_or_null(findChild(syntax::NodeRole::ExternKeyword)); } -syntax::Declaration 
*syntax::ExplicitTemplateInstantiation::declaration() { +syntax::Declaration *syntax::ExplicitTemplateInstantiation::getDeclaration() { return cast_or_null( - findChild(syntax::NodeRole::ExplicitTemplateInstantiation_declaration)); + findChild(syntax::NodeRole::Declaration)); } -syntax::Leaf *syntax::ParenDeclarator::lparen() { +syntax::Leaf *syntax::ParenDeclarator::getLparen() { return cast_or_null(findChild(syntax::NodeRole::OpenParen)); } -syntax::Leaf *syntax::ParenDeclarator::rparen() { +syntax::Leaf *syntax::ParenDeclarator::getRparen() { return cast_or_null(findChild(syntax::NodeRole::CloseParen)); } -syntax::Leaf *syntax::ArraySubscript::lbracket() { +syntax::Leaf *syntax::ArraySubscript::getLbracket() { return cast_or_null(findChild(syntax::NodeRole::OpenParen)); } -syntax::Expression *syntax::ArraySubscript::sizeExpression() { - return cast_or_null( - findChild(syntax::NodeRole::ArraySubscript_sizeExpression)); +syntax::Expression *syntax::ArraySubscript::getSize() { + return cast_or_null(findChild(syntax::NodeRole::Size)); } -syntax::Leaf *syntax::ArraySubscript::rbracket() { +syntax::Leaf *syntax::ArraySubscript::getRbracket() { return cast_or_null(findChild(syntax::NodeRole::CloseParen)); } -syntax::Leaf *syntax::TrailingReturnType::arrowToken() { +syntax::Leaf *syntax::TrailingReturnType::getArrowToken() { return cast_or_null(findChild(syntax::NodeRole::ArrowToken)); } -syntax::SimpleDeclarator *syntax::TrailingReturnType::declarator() { +syntax::SimpleDeclarator *syntax::TrailingReturnType::getDeclarator() { return cast_or_null( - findChild(syntax::NodeRole::TrailingReturnType_declarator)); + findChild(syntax::NodeRole::Declarator)); } -syntax::Leaf *syntax::ParametersAndQualifiers::lparen() { +syntax::Leaf *syntax::ParametersAndQualifiers::getLparen() { return cast_or_null(findChild(syntax::NodeRole::OpenParen)); } -std::vector -syntax::ParametersAndQualifiers::parameters() { - std::vector Children; - for (auto *C = firstChild(); C; C = 
C->nextSibling()) { - if (C->role() == syntax::NodeRole::ParametersAndQualifiers_parameter) - Children.push_back(cast(C)); - } - return Children; +syntax::ParameterDeclarationList * +syntax::ParametersAndQualifiers::getParameters() { + return cast_or_null( + findChild(syntax::NodeRole::Parameters)); } -syntax::Leaf *syntax::ParametersAndQualifiers::rparen() { +syntax::Leaf *syntax::ParametersAndQualifiers::getRparen() { return cast_or_null(findChild(syntax::NodeRole::CloseParen)); } -syntax::TrailingReturnType *syntax::ParametersAndQualifiers::trailingReturn() { +syntax::TrailingReturnType * +syntax::ParametersAndQualifiers::getTrailingReturn() { return cast_or_null( - findChild(syntax::NodeRole::ParametersAndQualifiers_trailingReturn)); + findChild(syntax::NodeRole::TrailingReturn)); } diff --git a/clang/lib/Tooling/Syntax/Tree.cpp b/clang/lib/Tooling/Syntax/Tree.cpp index 70e3c8e02783a..2cef806937bfc 100644 --- a/clang/lib/Tooling/Syntax/Tree.cpp +++ b/clang/lib/Tooling/Syntax/Tree.cpp @@ -133,46 +133,45 @@ void syntax::Tree::replaceChildRangeLowLevel(Node *BeforeBegin, Node *End, } namespace { -static void dumpTokens(raw_ostream &OS, ArrayRef Tokens, - const SourceManager &SM) { - assert(!Tokens.empty()); - bool First = true; - for (const auto &T : Tokens) { - if (!First) - OS << " "; - else - First = false; - // Handle 'eof' separately, calling text() on it produces an empty string. - if (T.kind() == tok::eof) { - OS << ""; - continue; - } - OS << T.text(SM); - } +static void dumpLeaf(raw_ostream &OS, const syntax::Leaf *L, + const SourceManager &SM) { + assert(L); + const auto *Token = L->token(); + assert(Token); + // Handle 'eof' separately, calling text() on it produces an empty string. 
+ if (Token->kind() == tok::eof) + OS << ""; + else + OS << Token->text(SM); } -static void dumpTree(raw_ostream &OS, const syntax::Node *N, - const syntax::Arena &A, std::vector IndentMask) { - std::string Marks; - if (!N->isOriginal()) - Marks += "M"; - if (N->role() == syntax::NodeRole::Detached) - Marks += "*"; // FIXME: find a nice way to print other roles. - if (!N->canModify()) - Marks += "I"; - if (!Marks.empty()) - OS << Marks << ": "; - - if (auto *L = dyn_cast(N)) { - dumpTokens(OS, *L->token(), A.sourceManager()); +static void dumpNode(raw_ostream &OS, const syntax::Node *N, + const SourceManager &SM, std::vector IndentMask) { + auto dumpExtraInfo = [&OS](const syntax::Node *N) { + if (N->role() != syntax::NodeRole::Unknown) + OS << " " << N->role(); + if (!N->isOriginal()) + OS << " synthesized"; + if (!N->canModify()) + OS << " unmodifiable"; + }; + + assert(N); + if (const auto *L = dyn_cast(N)) { + OS << "'"; + dumpLeaf(OS, L, SM); + OS << "'"; + dumpExtraInfo(N); OS << "\n"; return; } - auto *T = cast(N); - OS << T->kind() << "\n"; + const auto *T = cast(N); + OS << T->kind(); + dumpExtraInfo(N); + OS << "\n"; - for (auto It = T->firstChild(); It != nullptr; It = It->nextSibling()) { + for (const auto *It = T->firstChild(); It; It = It->nextSibling()) { for (bool Filled : IndentMask) { if (Filled) OS << "| "; @@ -186,28 +185,27 @@ static void dumpTree(raw_ostream &OS, const syntax::Node *N, OS << "|-"; IndentMask.push_back(true); } - dumpTree(OS, It, A, IndentMask); + dumpNode(OS, It, SM, IndentMask); IndentMask.pop_back(); } } } // namespace -std::string syntax::Node::dump(const Arena &A) const { +std::string syntax::Node::dump(const SourceManager &SM) const { std::string Str; llvm::raw_string_ostream OS(Str); - dumpTree(OS, this, A, /*IndentMask=*/{}); + dumpNode(OS, this, SM, /*IndentMask=*/{}); return std::move(OS.str()); } -std::string syntax::Node::dumpTokens(const Arena &A) const { +std::string syntax::Node::dumpTokens(const SourceManager 
&SM) const { std::string Storage; llvm::raw_string_ostream OS(Storage); traverse(this, [&](const syntax::Node *N) { - auto *L = dyn_cast(N); - if (!L) - return; - ::dumpTokens(OS, *L->token(), A.sourceManager()); - OS << " "; + if (const auto *L = dyn_cast(N)) { + dumpLeaf(OS, L, SM); + OS << " "; + } }); return OS.str(); } @@ -278,14 +276,14 @@ syntax::List::getElementsAsNodesAndDelimiters() { syntax::Node *elementWithoutDelimiter = nullptr; for (auto *C = firstChild(); C; C = C->nextSibling()) { switch (C->role()) { - case syntax::NodeRole::List_element: { + case syntax::NodeRole::ListElement: { if (elementWithoutDelimiter) { children.push_back({elementWithoutDelimiter, nullptr}); } elementWithoutDelimiter = C; break; } - case syntax::NodeRole::List_delimiter: { + case syntax::NodeRole::ListDelimiter: { children.push_back({elementWithoutDelimiter, cast(C)}); elementWithoutDelimiter = nullptr; break; @@ -323,14 +321,14 @@ std::vector syntax::List::getElementsAsNodes() { syntax::Node *elementWithoutDelimiter = nullptr; for (auto *C = firstChild(); C; C = C->nextSibling()) { switch (C->role()) { - case syntax::NodeRole::List_element: { + case syntax::NodeRole::ListElement: { if (elementWithoutDelimiter) { children.push_back(elementWithoutDelimiter); } elementWithoutDelimiter = C; break; } - case syntax::NodeRole::List_delimiter: { + case syntax::NodeRole::ListDelimiter: { children.push_back(elementWithoutDelimiter); elementWithoutDelimiter = nullptr; break; @@ -361,6 +359,9 @@ clang::tok::TokenKind syntax::List::getDelimiterTokenKind() { switch (this->kind()) { case NodeKind::NestedNameSpecifier: return clang::tok::coloncolon; + case NodeKind::CallArguments: + case NodeKind::ParametersAndQualifiers: + return clang::tok::comma; default: llvm_unreachable("This is not a subclass of List, thus " "getDelimiterTokenKind() cannot be called"); @@ -371,6 +372,9 @@ syntax::List::TerminationKind syntax::List::getTerminationKind() { switch (this->kind()) { case 
NodeKind::NestedNameSpecifier: return TerminationKind::Terminated; + case NodeKind::CallArguments: + case NodeKind::ParametersAndQualifiers: + return TerminationKind::Separated; default: llvm_unreachable("This is not a subclass of List, thus " "getTerminationKind() cannot be called"); @@ -381,6 +385,10 @@ bool syntax::List::canBeEmpty() { switch (this->kind()) { case NodeKind::NestedNameSpecifier: return false; + case NodeKind::CallArguments: + return true; + case NodeKind::ParametersAndQualifiers: + return true; default: llvm_unreachable("This is not a subclass of List, thus canBeEmpty() " "cannot be called"); diff --git a/clang/runtime/CMakeLists.txt b/clang/runtime/CMakeLists.txt index e20cc26f60af8..61bbbf8faedd5 100644 --- a/clang/runtime/CMakeLists.txt +++ b/clang/runtime/CMakeLists.txt @@ -75,6 +75,7 @@ if(LLVM_BUILD_EXTERNAL_COMPILER_RT AND EXISTS ${COMPILER_RT_SRC_ROOT}/) CMAKE_ARGS ${CLANG_COMPILER_RT_CMAKE_ARGS} -DCMAKE_C_COMPILER=${LLVM_RUNTIME_OUTPUT_INTDIR}/clang -DCMAKE_CXX_COMPILER=${LLVM_RUNTIME_OUTPUT_INTDIR}/clang++ + -DCMAKE_ASM_COMPILER=${LLVM_RUNTIME_OUTPUT_INTDIR}/clang -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM} -DLLVM_CONFIG_PATH=${LLVM_RUNTIME_OUTPUT_INTDIR}/llvm-config diff --git a/clang/test/AST/dump.cpp b/clang/test/AST/dump.cpp index 06af15f070899..bbd388cbf0957 100644 --- a/clang/test/AST/dump.cpp +++ b/clang/test/AST/dump.cpp @@ -86,4 +86,4 @@ int bar() { // CHECK-NEXT: | `-ReturnStmt {{.+}} // CHECK-NEXT: | `-ImplicitCastExpr {{.+}} 'int' // CHECK-NEXT: | `-DeclRefExpr {{.+}} 'int' lvalue Var {{.+}} 'f' 'int' -// CHECK-NEXT: `-OMPDeclareTargetDeclAttr {{.+}} <> Implicit MT_To +// CHECK-NEXT: `-OMPDeclareTargetDeclAttr {{.+}} Implicit MT_To DT_Any 1 diff --git a/clang/test/Analysis/Inputs/llvm.h b/clang/test/Analysis/Inputs/llvm.h index c9d66ba2374d3..b80567bcb5863 100644 --- a/clang/test/Analysis/Inputs/llvm.h +++ b/clang/test/Analysis/Inputs/llvm.h @@ -19,11 +19,19 @@ const X 
*dyn_cast_or_null(Y *Value); template const X *dyn_cast_or_null(Y &Value); -template -bool isa(Y Value); - -template -bool isa_and_nonnull(Y Value); +template inline bool isa(const Y &Val); + +template +inline bool isa(const Y &Val) { + return isa(Val) || isa(Val); +} + +template +inline bool isa_and_nonnull(const Y &Val) { + if (!Val) + return false; + return isa(Val); +} template std::unique_ptr cast(std::unique_ptr &&Value); diff --git a/clang/test/Analysis/Inputs/system-header-simulator-cxx.h b/clang/test/Analysis/Inputs/system-header-simulator-cxx.h index a0759479bfebf..f2b148cbc692b 100644 --- a/clang/test/Analysis/Inputs/system-header-simulator-cxx.h +++ b/clang/test/Analysis/Inputs/system-header-simulator-cxx.h @@ -970,6 +970,7 @@ class unique_ptr { T *operator->() const noexcept; operator bool() const noexcept; unique_ptr &operator=(unique_ptr &&p) noexcept; + unique_ptr &operator=(nullptr_t) noexcept; }; // TODO :: Once the deleter parameter is added update with additional template parameter. 
diff --git a/clang/test/Analysis/cast-value-logic.cpp b/clang/test/Analysis/cast-value-logic.cpp index 1411ede92e366..52a94f24fba67 100644 --- a/clang/test/Analysis/cast-value-logic.cpp +++ b/clang/test/Analysis/cast-value-logic.cpp @@ -19,6 +19,8 @@ struct Shape { virtual double area(); }; class Triangle : public Shape {}; +class Rectangle : public Shape {}; +class Hexagon : public Shape {}; class Circle : public Shape { public: ~Circle(); @@ -39,6 +41,23 @@ void test_regions_isa(const Shape *A, const Shape *B) { clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}} } +void test_regions_isa_variadic(const Shape *A, const Shape *B) { + if (isa(A) && + !isa(B)) + clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}} +} + +void test_regions_isa_and_nonnull(const Shape *A, const Shape *B) { + if (isa_and_nonnull(A) && !isa_and_nonnull(B)) + clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}} +} + +void test_regions_isa_and_nonnull_variadic(const Shape *A, const Shape *B) { + if (isa_and_nonnull(A) && + !isa_and_nonnull(B)) + clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}} +} + namespace test_cast { void evalLogic(const Shape *S) { const Circle *C = cast(S); diff --git a/clang/test/Analysis/cast-value-notes.cpp b/clang/test/Analysis/cast-value-notes.cpp index eb5d1b3d3fe27..a09586309fb41 100644 --- a/clang/test/Analysis/cast-value-notes.cpp +++ b/clang/test/Analysis/cast-value-notes.cpp @@ -13,6 +13,8 @@ struct Shape { const T *getAs() const; }; class Triangle : public Shape {}; +class Rectangle : public Shape {}; +class Hexagon : public Shape {}; class Circle : public Shape {}; } // namespace clang @@ -27,7 +29,6 @@ void evalReferences(const Shape &S) { } void evalNonNullParamNonNullReturnReference(const Shape &S) { - // Unmodeled cast from reference to pointer. 
const auto *C = dyn_cast_or_null(S); // expected-note@-1 {{'C' initialized here}} @@ -43,13 +44,37 @@ void evalNonNullParamNonNullReturnReference(const Shape &S) { return; } + if (dyn_cast_or_null(C)) { + // expected-note@-1 {{Assuming 'C' is not a 'Rectangle'}} + // expected-note@-2 {{Taking false branch}} + return; + } + + if (dyn_cast_or_null(C)) { + // expected-note@-1 {{Assuming 'C' is not a 'Hexagon'}} + // expected-note@-2 {{Taking false branch}} + return; + } + if (isa(C)) { // expected-note@-1 {{'C' is not a 'Triangle'}} // expected-note@-2 {{Taking false branch}} return; } - if (isa(C)) { + if (isa(C)) { + // expected-note@-1 {{'C' is neither a 'Triangle' nor a 'Rectangle'}} + // expected-note@-2 {{Taking false branch}} + return; + } + + if (isa(C)) { + // expected-note@-1 {{'C' is neither a 'Triangle' nor a 'Rectangle' nor a 'Hexagon'}} + // expected-note@-2 {{Taking false branch}} + return; + } + + if (isa(C)) { // expected-note@-1 {{'C' is a 'Circle'}} // expected-note@-2 {{Taking true branch}} @@ -65,22 +90,57 @@ void evalNonNullParamNonNullReturn(const Shape *S) { // expected-note@-1 {{'S' is a 'Circle'}} // expected-note@-2 {{'C' initialized here}} - if (!isa(C)) { - // expected-note@-1 {{Assuming 'C' is a 'Triangle'}} + if (!dyn_cast_or_null(C)) { + // expected-note@-1 {{'C' is a 'Circle'}} // expected-note@-2 {{Taking false branch}} return; } - if (!isa(C)) { - // expected-note@-1 {{'C' is a 'Triangle'}} + if (dyn_cast_or_null(C)) { + // expected-note@-1 {{Assuming 'C' is not a 'Triangle'}} // expected-note@-2 {{Taking false branch}} return; } - (void)(1 / !C); - // expected-note@-1 {{'C' is non-null}} - // expected-note@-2 {{Division by zero}} - // expected-warning@-3 {{Division by zero}} + if (dyn_cast_or_null(C)) { + // expected-note@-1 {{Assuming 'C' is not a 'Rectangle'}} + // expected-note@-2 {{Taking false branch}} + return; + } + + if (dyn_cast_or_null(C)) { + // expected-note@-1 {{Assuming 'C' is not a 'Hexagon'}} + // expected-note@-2 
{{Taking false branch}} + return; + } + + if (isa(C)) { + // expected-note@-1 {{'C' is not a 'Triangle'}} + // expected-note@-2 {{Taking false branch}} + return; + } + + if (isa(C)) { + // expected-note@-1 {{'C' is neither a 'Triangle' nor a 'Rectangle'}} + // expected-note@-2 {{Taking false branch}} + return; + } + + if (isa(C)) { + // expected-note@-1 {{'C' is neither a 'Triangle' nor a 'Rectangle' nor a 'Hexagon'}} + // expected-note@-2 {{Taking false branch}} + return; + } + + if (isa(C)) { + // expected-note@-1 {{'C' is a 'Circle'}} + // expected-note@-2 {{Taking true branch}} + + (void)(1 / !C); + // expected-note@-1 {{'C' is non-null}} + // expected-note@-2 {{Division by zero}} + // expected-warning@-3 {{Division by zero}} + } } void evalNonNullParamNullReturn(const Shape *S) { diff --git a/clang/test/Analysis/cast-value-state-dump.cpp b/clang/test/Analysis/cast-value-state-dump.cpp index 3dffb78767cf4..3e6a40cf1319b 100644 --- a/clang/test/Analysis/cast-value-state-dump.cpp +++ b/clang/test/Analysis/cast-value-state-dump.cpp @@ -35,8 +35,8 @@ void evalNonNullParamNonNullReturn(const Shape *S) { // CHECK-NEXT: ], // CHECK-NEXT: "dynamic_casts": [ // CHECK: { "region": "SymRegion{reg_$0}", "casts": [ - // CHECK-NEXT: { "from": "const struct clang::Shape *", "to": "const class clang::Circle *", "kind": "success" }, - // CHECK-NEXT: { "from": "const struct clang::Shape *", "to": "const class clang::Square *", "kind": "fail" } + // CHECK-NEXT: { "from": "struct clang::Shape", "to": "class clang::Circle", "kind": "success" }, + // CHECK-NEXT: { "from": "struct clang::Shape", "to": "class clang::Square", "kind": "fail" } // CHECK-NEXT: ] } (void)(1 / !C); diff --git a/clang/test/Analysis/smart-ptr-text-output.cpp b/clang/test/Analysis/smart-ptr-text-output.cpp index 9af6f251e01d2..d63cd9b805f87 100644 --- a/clang/test/Analysis/smart-ptr-text-output.cpp +++ b/clang/test/Analysis/smart-ptr-text-output.cpp @@ -80,7 +80,7 @@ void derefOnSwappedNullPtr() { void 
derefOnStdSwappedNullPtr() { std::unique_ptr P; // expected-note {{Default constructed smart pointer 'P' is null}} std::unique_ptr PNull; // expected-note {{Default constructed smart pointer 'PNull' is null}} - std::swap(P, PNull); // expected-note@Inputs/system-header-simulator-cxx.h:978 {{Swapped null smart pointer 'PNull' with smart pointer 'P'}} + std::swap(P, PNull); // expected-note@Inputs/system-header-simulator-cxx.h:979 {{Swapped null smart pointer 'PNull' with smart pointer 'P'}} // expected-note@-1 {{Calling 'swap'}} // expected-note@-2 {{Returning from 'swap'}} P->foo(); // expected-warning {{Dereference of null smart pointer 'P' [alpha.cplusplus.SmartPtr]}} @@ -109,14 +109,6 @@ void noNoteTagsForNonInterestingRegion() { // expected-note@-1{{Dereference of null smart pointer 'P'}} } -void noNoteTagsForNonMatchingBugType() { - std::unique_ptr P; // No note. - std::unique_ptr P1; // No note. - P1 = std::move(P); // expected-note {{Smart pointer 'P' of type 'std::unique_ptr' is reset to null when moved from}} - P->foo(); // expected-warning {{Dereference of null smart pointer 'P' of type 'std::unique_ptr' [cplusplus.Move]}} - // expected-note@-1 {{Dereference of null smart pointer 'P' of type 'std::unique_ptr'}} -} - void derefOnRawPtrFromGetOnNullPtr() { std::unique_ptr P; // FIXME: add note "Default constructed smart pointer 'P' is null" P.get()->foo(); // expected-warning {{Called C++ object pointer is null [core.CallAndMessage]}} @@ -131,3 +123,50 @@ void derefOnRawPtrFromGetOnValidPtr() { void derefOnRawPtrFromGetOnUnknownPtr(std::unique_ptr P) { P.get()->foo(); // No warning. } + +void derefOnMovedFromValidPtr() { + std::unique_ptr PToMove(new A()); // expected-note {{Smart pointer 'PToMove' is constructed}} + // FIXME: above note should go away once we fix marking region not interested. 
+ std::unique_ptr P; + P = std::move(PToMove); // expected-note {{Smart pointer 'PToMove' is null after being moved to 'P'}} + PToMove->foo(); // expected-warning {{Dereference of null smart pointer 'PToMove' [alpha.cplusplus.SmartPtr]}} + // expected-note@-1 {{Dereference of null smart pointer 'PToMove'}} +} + +void derefOnMovedToNullPtr() { + std::unique_ptr PToMove(new A()); + std::unique_ptr P; + P = std::move(PToMove); // No note. + P->foo(); // No warning. +} + +void derefOnNullPtrGotMovedFromValidPtr() { + std::unique_ptr P(new A()); // expected-note {{Smart pointer 'P' is constructed}} + // FIXME: above note should go away once we fix marking region not interested. + std::unique_ptr PToMove; // expected-note {{Default constructed smart pointer 'PToMove' is null}} + P = std::move(PToMove); // expected-note {{Null pointer value move-assigned to 'P'}} + P->foo(); // expected-warning {{Dereference of null smart pointer 'P' [alpha.cplusplus.SmartPtr]}} + // expected-note@-1 {{Dereference of null smart pointer 'P'}} +} + +void derefOnMovedUnknownPtr(std::unique_ptr PToMove) { + std::unique_ptr P; + P = std::move(PToMove); // expected-note {{Smart pointer 'PToMove' is null after; previous value moved to 'P'}} + PToMove->foo(); // expected-warning {{Dereference of null smart pointer 'PToMove' [alpha.cplusplus.SmartPtr]}} + // expected-note@-1 {{Dereference of null smart pointer 'PToMove'}} +} + +void derefOnAssignedNullPtrToNullSmartPtr() { + std::unique_ptr P; // expected-note {{Default constructed smart pointer 'P' is null}} + P = nullptr; // expected-note {{Smart pointer 'P' is assigned to null}} + P->foo(); // expected-warning {{Dereference of null smart pointer 'P' [alpha.cplusplus.SmartPtr]}} + // expected-note@-1 {{Dereference of null smart pointer 'P'}} +} + +void derefOnAssignedZeroToNullSmartPtr() { + std::unique_ptr P(new A()); // expected-note {{Smart pointer 'P' is constructed}} + // FIXME: above note should go away once we fix marking region not 
interested. + P = 0; // expected-note {{Smart pointer 'P' is assigned to null}} + P->foo(); // expected-warning {{Dereference of null smart pointer 'P' [alpha.cplusplus.SmartPtr]}} + // expected-note@-1 {{Dereference of null smart pointer 'P'}} +} diff --git a/clang/test/Analysis/smart-ptr.cpp b/clang/test/Analysis/smart-ptr.cpp index 17f6718c66057..1403cd6492b2b 100644 --- a/clang/test/Analysis/smart-ptr.cpp +++ b/clang/test/Analysis/smart-ptr.cpp @@ -216,8 +216,7 @@ void derefAfterAssignment() { std::unique_ptr P; std::unique_ptr Q; Q = std::move(P); - // TODO: Fix test with expecting warning after '=' operator overloading modeling. - Q->foo(); // no-warning + Q->foo(); // expected-warning {{Dereference of null smart pointer 'Q' [alpha.cplusplus.SmartPtr]}} } } @@ -276,3 +275,61 @@ void derefOnRawPtrFromMultipleGetOnUnknownPtr(std::unique_ptr P) { Y->foo(); // expected-warning {{Called C++ object pointer is null [core.CallAndMessage]}} } } + +void derefOnMovedFromValidPtr() { + std::unique_ptr PToMove(new A()); + std::unique_ptr P; + P = std::move(PToMove); + PToMove->foo(); // expected-warning {{Dereference of null smart pointer 'PToMove' [alpha.cplusplus.SmartPtr]}} +} + +void derefOnMovedToNullPtr() { + std::unique_ptr PToMove(new A()); + std::unique_ptr P; + P = std::move(PToMove); // No note. + P->foo(); // No warning. +} + +void derefOnNullPtrGotMovedFromValidPtr() { + std::unique_ptr P(new A()); + std::unique_ptr PToMove; + P = std::move(PToMove); + P->foo(); // expected-warning {{Dereference of null smart pointer 'P' [alpha.cplusplus.SmartPtr]}} +} + +void derefOnMovedFromUnknownPtr(std::unique_ptr PToMove) { + std::unique_ptr P; + P = std::move(PToMove); + P->foo(); // No warning. 
+} + +void derefOnMovedUnknownPtr(std::unique_ptr PToMove) { + std::unique_ptr P; + P = std::move(PToMove); + PToMove->foo(); // expected-warning {{Dereference of null smart pointer 'PToMove' [alpha.cplusplus.SmartPtr]}} +} + +void derefOnAssignedNullPtrToNullSmartPtr() { + std::unique_ptr P; + P = nullptr; + P->foo(); // expected-warning {{Dereference of null smart pointer 'P' [alpha.cplusplus.SmartPtr]}} +} + +void derefOnAssignedZeroToNullSmartPtr() { + std::unique_ptr P(new A()); + P = 0; + P->foo(); // expected-warning {{Dereference of null smart pointer 'P' [alpha.cplusplus.SmartPtr]}} +} + +void derefOnAssignedNullToUnknowSmartPtr(std::unique_ptr P) { + P = nullptr; + P->foo(); // expected-warning {{Dereference of null smart pointer 'P' [alpha.cplusplus.SmartPtr]}} +} + +std::unique_ptr &&returnRValRefOfUniquePtr(); + +void drefOnAssignedNullFromMethodPtrValidSmartPtr() { + std::unique_ptr P(new A()); + P = returnRValRefOfUniquePtr(); + P->foo(); // No warning. +} diff --git a/clang/test/CXX/temp/temp.constr/temp.constr.order/function-templates.cpp b/clang/test/CXX/temp/temp.constr/temp.constr.order/function-templates.cpp index 8ae6308714451..b2fe2cd52006d 100644 --- a/clang/test/CXX/temp/temp.constr/temp.constr.order/function-templates.cpp +++ b/clang/test/CXX/temp/temp.constr/temp.constr.order/function-templates.cpp @@ -67,7 +67,8 @@ void f() { } // expected-note@-1 {{candidate function [with T = long long, U = int]}} static_assert(sizeof(f())); -// expected-error@-1 {{call to 'f' is ambiguous}} +// expected-error@-1 {{call to 'f' is ambiguous}} \ + expected-error@-1 {{invalid application of 'sizeof' to an incomplete type 'void'}} template concept C3 = true; diff --git a/clang/test/CodeGen/aarch64-bf16-dotprod-intrinsics.c b/clang/test/CodeGen/aarch64-bf16-dotprod-intrinsics.c index 22e1396787cec..b9b5013bf6bdb 100644 --- a/clang/test/CodeGen/aarch64-bf16-dotprod-intrinsics.c +++ b/clang/test/CodeGen/aarch64-bf16-dotprod-intrinsics.c @@ -1,146 +1,138 @@ 
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // RUN: %clang_cc1 -triple aarch64-arm-none-eabi -target-feature +neon -target-feature +bf16 \ // RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -mem2reg -instcombine | FileCheck %s #include -// CHECK-LABEL: test_vbfdot_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %0 = bitcast <4 x bfloat> %a to <8 x i8> -// CHECK-NEXT %1 = bitcast <4 x bfloat> %b to <8 x i8> -// CHECK-NEXT %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1) -// CHECK-NEXT ret <2 x float> %vbfdot1.i +// CHECK-LABEL: @test_vbfdot_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[B:%.*]]) [[ATTR3:#.*]] +// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]] +// float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) { return vbfdot_f32(r, a, b); } -// CHECK-LABEL: test_vbfdotq_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8> -// CHECK-NEXT %1 = bitcast <8 x bfloat> %b to <16 x i8> -// CHECK-NEXT %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) -// CHECK-NEXT ret <4 x float> %vbfdot1.i +// CHECK-LABEL: @test_vbfdotq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]] +// float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){ return vbfdotq_f32(r, a, b); } -// CHECK-LABEL: test_vbfdot_lane_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %0 = bitcast <4 x bfloat> %b to <2 x float> -// CHECK-NEXT %lane = shufflevector <2 x float> %0, <2 x float> undef, <2 x i32> zeroinitializer -// CHECK-NEXT %1 = bitcast <4 x bfloat> %a 
to <8 x i8> -// CHECK-NEXT %2 = bitcast <2 x float> %lane to <8 x i8> -// CHECK-NEXT %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2) -// CHECK-NEXT ret <2 x float> %vbfdot1.i +// CHECK-LABEL: @test_vbfdot_lane_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat> +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]]) [[ATTR3]] +// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]] +// float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){ return vbfdot_lane_f32(r, a, b, 0); } -// CHECK-LABEL: test_vbfdotq_laneq_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %0 = bitcast <8 x bfloat> %b to <4 x float> -// CHECK-NEXT %lane = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> -// CHECK-NEXT %1 = bitcast <8 x bfloat> %a to <16 x i8> -// CHECK-NEXT %2 = bitcast <4 x float> %lane to <16 x i8> -// CHECK-NEXT %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2) -// CHECK-NEXT ret <4 x float> %vbfdot1.i +// CHECK-LABEL: @test_vbfdotq_laneq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> undef, <4 x i32> +// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat> +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]] +// float32x4_t 
test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfdotq_laneq_f32(r, a, b, 3); } -// CHECK-LABEL: test_vbfdot_laneq_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %0 = bitcast <8 x bfloat> %b to <4 x float> -// CHECK-NEXT %lane = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> -// CHECK-NEXT %1 = bitcast <4 x bfloat> %a to <8 x i8> -// CHECK-NEXT %2 = bitcast <2 x float> %lane to <8 x i8> -// CHECK-NEXT %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2) -// CHECK-NEXT ret <2 x float> %vbfdot1.i +// CHECK-LABEL: @test_vbfdot_laneq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> undef, <2 x i32> +// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat> +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]]) [[ATTR3]] +// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]] +// float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) { return vbfdot_laneq_f32(r, a, b, 3); } -// CHECK-LABEL: test_vbfdotq_lane_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %0 = bitcast <4 x bfloat> %b to <2 x float> -// CHECK-NEXT %lane = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> zeroinitializer -// CHECK-NEXT %1 = bitcast <8 x bfloat> %a to <16 x i8> -// CHECK-NEXT %2 = bitcast <4 x float> %lane to <16 x i8> -// CHECK-NEXT %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2) -// CHECK-NEXT ret <4 x float> %vbfdot1.i +// CHECK-LABEL: @test_vbfdotq_lane_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x 
float> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat> +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]] +// float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { return vbfdotq_lane_f32(r, a, b, 0); } -// CHECK-LABEL: test_vbfmmlaq_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8> -// CHECK-NEXT %1 = bitcast <8 x bfloat> %b to <16 x i8> -// CHECK-NEXT %vbfmmla1.i = tail call <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) -// CHECK-NEXT ret <4 x float> %vbfmmla1.i +// CHECK-LABEL: @test_vbfmmlaq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VBFMMLAQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMMLAQ_V3_I]] +// float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmmlaq_f32(r, a, b); } -// CHECK-LABEL: test_vbfmlalbq_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8> -// CHECK-NEXT %1 = bitcast <8 x bfloat> %b to <16 x i8> -// CHECK-NEXT %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) -// CHECK-NEXT ret <4 x float> %vbfmlalb1.i +// CHECK-LABEL: @test_vbfmlalbq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]] +// float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmlalbq_f32(r, a, b); } -// CHECK-LABEL: test_vbfmlaltq_f32 -// 
CHECK-NEXT: entry: -// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8> -// CHECK-NEXT %1 = bitcast <8 x bfloat> %b to <16 x i8> -// CHECK-NEXT %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) -// CHECK-NEXT ret <4 x float> %vbfmlalt1.i +// CHECK-LABEL: @test_vbfmlaltq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]] +// float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmlaltq_f32(r, a, b); } -// CHECK-LABEL: test_vbfmlalbq_lane_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer -// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8> -// CHECK-NEXT %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> -// CHECK-NEXT %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) -// CHECK-NEXT ret <4 x float> %vbfmlalb1.i +// CHECK-LABEL: @test_vbfmlalbq_lane_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]] +// float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { return vbfmlalbq_lane_f32(r, a, b, 0); } -// CHECK-LABEL: test_vbfmlalbq_laneq_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> -// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8> -// CHECK-NEXT %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> -// 
CHECK-NEXT %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) -// CHECK-NEXT ret <4 x float> %vbfmlalb1.i +// CHECK-LABEL: @test_vbfmlalbq_laneq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <8 x bfloat> [[B:%.*]], <8 x bfloat> undef, <8 x i32> +// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]] +// float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmlalbq_laneq_f32(r, a, b, 3); } -// CHECK-LABEL: test_vbfmlaltq_lane_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer -// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8> -// CHECK-NEXT %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> -// CHECK-NEXT %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) -// CHECK-NEXT ret <4 x float> %vbfmlalt1.i +// CHECK-LABEL: @test_vbfmlaltq_lane_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]] +// float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { return vbfmlaltq_lane_f32(r, a, b, 0); } -// CHECK-LABEL: test_vbfmlaltq_laneq_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> -// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8> -// CHECK-NEXT %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> -// CHECK-NEXT 
%vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) -// CHECK-NEXT ret <4 x float> %vbfmlalt1.i +// CHECK-LABEL: @test_vbfmlaltq_laneq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <8 x bfloat> [[B:%.*]], <8 x bfloat> undef, <8 x i32> +// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]] +// float32x4_t test_vbfmlaltq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmlaltq_laneq_f32(r, a, b, 3); } diff --git a/clang/test/CodeGen/aarch64-debug-sve-vector-types.c b/clang/test/CodeGen/aarch64-debug-sve-vector-types.c new file mode 100644 index 0000000000000..4325e3f44747b --- /dev/null +++ b/clang/test/CodeGen/aarch64-debug-sve-vector-types.c @@ -0,0 +1,71 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve \ +// RUN: -emit-llvm -o - %s -debug-info-kind=limited 2>&1 | FileCheck %s + +void test_locals(void) { + // CHECK-DAG: name: "__SVBool_t",{{.*}}, baseType: ![[CT1:[0-9]+]] + // CHECK-DAG: ![[CT1]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTYU8:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS1_64:[0-9]+]]) + // CHECK-DAG: ![[ELTTYU8]] = !DIBasicType(name: "unsigned char", size: 8, encoding: DW_ATE_unsigned_char) + // CHECK-DAG: ![[ELTS1_64]] = !{![[REALELTS1_64:[0-9]+]]} + // CHECK-DAG: ![[REALELTS1_64]] = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 1, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus)) + __SVBool_t b8; + + // CHECK-DAG: name: "__SVInt8_t",{{.*}}, baseType: ![[CT8:[0-9]+]] + // CHECK-DAG: ![[CT8]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTYS8:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS8:[0-9]+]]) + // CHECK-DAG: ![[ELTTYS8]] = !DIBasicType(name: "signed char", size: 8, 
encoding: DW_ATE_signed_char) + // CHECK-DAG: ![[ELTS8]] = !{![[REALELTS8:[0-9]+]]} + // CHECK-DAG: ![[REALELTS8]] = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 8, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus)) + __SVInt8_t s8; + + // CHECK-DAG: name: "__SVUint8_t",{{.*}}, baseType: ![[CT8:[0-9]+]] + // CHECK-DAG: ![[CT8]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTYU8]], flags: DIFlagVector, elements: ![[ELTS8]]) + __SVUint8_t u8; + + // CHECK-DAG: name: "__SVInt16_t",{{.*}}, baseType: ![[CT16:[0-9]+]] + // CHECK-DAG: ![[CT16]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY16:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS16:[0-9]+]]) + // CHECK-DAG: ![[ELTTY16]] = !DIBasicType(name: "short", size: 16, encoding: DW_ATE_signed) + // CHECK-DAG: ![[ELTS16]] = !{![[REALELTS16:[0-9]+]]} + // CHECK-DAG: ![[REALELTS16]] = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 4, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus)) + __SVInt16_t s16; + + // CHECK-DAG: name: "__SVUint16_t",{{.*}}, baseType: ![[CT16:[0-9]+]] + // CHECK-DAG: ![[CT16]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY16:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS16]]) + // CHECK-DAG: ![[ELTTY16]] = !DIBasicType(name: "unsigned short", size: 16, encoding: DW_ATE_unsigned) + __SVUint16_t u16; + + // CHECK-DAG: name: "__SVInt32_t",{{.*}}, baseType: ![[CT32:[0-9]+]] + // CHECK-DAG: ![[CT32]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY32:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS32:[0-9]+]]) + // CHECK-DAG: ![[ELTTY32]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) + // CHECK-DAG: ![[ELTS32]] = !{![[REALELTS32:[0-9]+]]} + // CHECK-DAG: ![[REALELTS32]] = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 2, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus)) + __SVInt32_t s32; + + // CHECK-DAG: name: "__SVUint32_t",{{.*}}, 
baseType: ![[CT32:[0-9]+]] + // CHECK-DAG: ![[CT32]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY32:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS32]]) + // CHECK-DAG: ![[ELTTY32]] = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned) + __SVUint32_t u32; + + // CHECK-DAG: name: "__SVInt64_t",{{.*}}, baseType: ![[CT64:[0-9]+]] + // CHECK-DAG: ![[CT64]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY64:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS1_64]]) + // CHECK-DAG: ![[ELTTY64]] = !DIBasicType(name: "long int", size: 64, encoding: DW_ATE_signed) + __SVInt64_t s64; + + // CHECK-DAG: name: "__SVUint64_t",{{.*}}, baseType: ![[CT64:[0-9]+]] + // CHECK-DAG: ![[CT64]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY64:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS1_64]]) + // CHECK-DAG: ![[ELTTY64]] = !DIBasicType(name: "long unsigned int", size: 64, encoding: DW_ATE_unsigned) + __SVUint64_t u64; + + // CHECK: name: "__SVFloat16_t",{{.*}}, baseType: ![[CT16:[0-9]+]] + // CHECK-DAG: ![[CT16]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY16:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS16]]) + // CHECK-DAG: ![[ELTTY16]] = !DIBasicType(name: "__fp16", size: 16, encoding: DW_ATE_float) + __SVFloat16_t f16; + + // CHECK: name: "__SVFloat32_t",{{.*}}, baseType: ![[CT32:[0-9]+]] + // CHECK-DAG: ![[CT32]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY32:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS32]]) + // CHECK-DAG: ![[ELTTY32]] = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float) + __SVFloat32_t f32; + + // CHECK: name: "__SVFloat64_t",{{.*}}, baseType: ![[CT64:[0-9]+]] + // CHECK-DAG: ![[CT64]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY64:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS1_64]]) + // CHECK-DAG: ![[ELTTY64]] = !DIBasicType(name: "double", size: 64, encoding: DW_ATE_float) + __SVFloat64_t f64; +} diff --git 
a/clang/test/CodeGen/aarch64-debug-sve-vectorx2-types.c b/clang/test/CodeGen/aarch64-debug-sve-vectorx2-types.c new file mode 100644 index 0000000000000..0d874c0b557cd --- /dev/null +++ b/clang/test/CodeGen/aarch64-debug-sve-vectorx2-types.c @@ -0,0 +1,67 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve \ +// RUN: -emit-llvm -o - %s -debug-info-kind=limited 2>&1 | FileCheck %s + +void test_locals(void) { + // CHECK-DAG: name: "__clang_svint8x2_t",{{.*}}, baseType: ![[CT8:[0-9]+]] + // CHECK-DAG: ![[CT8]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY8:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS8x2:[0-9]+]]) + // CHECK-DAG: ![[ELTTY8]] = !DIBasicType(name: "signed char", size: 8, encoding: DW_ATE_signed_char) + // CHECK-DAG: ![[ELTS8x2]] = !{![[REALELTS8x2:[0-9]+]]} + // CHECK-DAG: ![[REALELTS8x2]] = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 16, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus)) + __clang_svint8x2_t s8; + + // CHECK-DAG: name: "__clang_svuint8x2_t",{{.*}}, baseType: ![[CT8:[0-9]+]] + // CHECK-DAG: ![[CT8]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY8:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS8x2]]) + // CHECK-DAG: ![[ELTTY8]] = !DIBasicType(name: "unsigned char", size: 8, encoding: DW_ATE_unsigned_char) + __clang_svuint8x2_t u8; + + // CHECK-DAG: name: "__clang_svint16x2_t",{{.*}}, baseType: ![[CT16:[0-9]+]] + // CHECK-DAG: ![[CT16]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY16:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS16x2:[0-9]+]]) + // CHECK-DAG: ![[ELTTY16]] = !DIBasicType(name: "short", size: 16, encoding: DW_ATE_signed) + // CHECK-DAG: ![[ELTS16x2]] = !{![[REALELTS16x2:[0-9]+]]} + // CHECK-DAG: ![[REALELTS16x2]] = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 8, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus)) + __clang_svint16x2_t s16; + + // CHECK-DAG: name: 
"__clang_svuint16x2_t",{{.*}}, baseType: ![[CT16:[0-9]+]] + // CHECK-DAG: ![[CT16]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY16:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS16x2]]) + // CHECK-DAG: ![[ELTTY16]] = !DIBasicType(name: "unsigned short", size: 16, encoding: DW_ATE_unsigned) + __clang_svuint16x2_t u16; + + // CHECK-DAG: name: "__clang_svint32x2_t",{{.*}}, baseType: ![[CT32:[0-9]+]] + // CHECK-DAG: ![[CT32]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY32:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS32x2:[0-9]+]]) + // CHECK-DAG: ![[ELTTY32]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) + // CHECK-DAG: ![[ELTS32x2]] = !{![[REALELTS32x2:[0-9]+]]} + // CHECK-DAG: ![[REALELTS32x2]] = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 4, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus)) + __clang_svint32x2_t s32; + + // CHECK-DAG: name: "__clang_svuint32x2_t",{{.*}}, baseType: ![[CT32:[0-9]+]] + // CHECK-DAG: ![[CT32]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY32:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS32x2]]) + // CHECK-DAG: ![[ELTTY32]] = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned) + __clang_svuint32x2_t u32; + + // CHECK-DAG: name: "__clang_svint64x2_t",{{.*}}, baseType: ![[CT64:[0-9]+]] + // CHECK-DAG: ![[CT64]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY64:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS1x2_64:[0-9]+]]) + // CHECK-DAG: ![[ELTTY64]] = !DIBasicType(name: "long int", size: 64, encoding: DW_ATE_signed) + // CHECK-DAG: ![[ELTS1x2_64]] = !{![[REALELTS1x2_64:[0-9]+]]} + // CHECK-DAG: ![[REALELTS1x2_64]] = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 2, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus)) + __clang_svint64x2_t s64; + + // CHECK-DAG: name: "__clang_svuint64x2_t",{{.*}}, baseType: ![[CT64:[0-9]+]] + // CHECK-DAG: ![[CT64]] = 
!DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY64:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS1x2_64]]) + // CHECK-DAG: ![[ELTTY64]] = !DIBasicType(name: "long unsigned int", size: 64, encoding: DW_ATE_unsigned) + __clang_svuint64x2_t u64; + + // CHECK: name: "__clang_svfloat16x2_t",{{.*}}, baseType: ![[CT16:[0-9]+]] + // CHECK-DAG: ![[CT16]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY16:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS16x2]]) + // CHECK-DAG: ![[ELTTY16]] = !DIBasicType(name: "__fp16", size: 16, encoding: DW_ATE_float) + __clang_svfloat16x2_t f16; + + // CHECK: name: "__clang_svfloat32x2_t",{{.*}}, baseType: ![[CT32:[0-9]+]] + // CHECK-DAG: ![[CT32]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY32:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS32x2]]) + // CHECK-DAG: ![[ELTTY32]] = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float) + __clang_svfloat32x2_t f32; + + // CHECK: name: "__clang_svfloat64x2_t",{{.*}}, baseType: ![[CT64:[0-9]+]] + // CHECK-DAG: ![[CT64]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY64:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS1x2_64]]) + // CHECK-DAG: ![[ELTTY64]] = !DIBasicType(name: "double", size: 64, encoding: DW_ATE_float) + __clang_svfloat64x2_t f64; +} diff --git a/clang/test/CodeGen/aarch64-debug-sve-vectorx3-types.c b/clang/test/CodeGen/aarch64-debug-sve-vectorx3-types.c new file mode 100644 index 0000000000000..c5dde7d1295d1 --- /dev/null +++ b/clang/test/CodeGen/aarch64-debug-sve-vectorx3-types.c @@ -0,0 +1,67 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve \ +// RUN: -emit-llvm -o - %s -debug-info-kind=limited 2>&1 | FileCheck %s + +void test_locals(void) { + // CHECK-DAG: name: "__clang_svint8x3_t",{{.*}}, baseType: ![[CT8:[0-9]+]] + // CHECK-DAG: ![[CT8]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY8:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS8x3:[0-9]+]]) + // CHECK-DAG: 
![[ELTTY8]] = !DIBasicType(name: "signed char", size: 8, encoding: DW_ATE_signed_char) + // CHECK-DAG: ![[ELTS8x3]] = !{![[REALELTS8x3:[0-9]+]]} + // CHECK-DAG: ![[REALELTS8x3]] = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 24, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus)) + __clang_svint8x3_t s8; + + // CHECK-DAG: name: "__clang_svuint8x3_t",{{.*}}, baseType: ![[CT8:[0-9]+]] + // CHECK-DAG: ![[CT8]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY8:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS8x3]]) + // CHECK-DAG: ![[ELTTY8]] = !DIBasicType(name: "unsigned char", size: 8, encoding: DW_ATE_unsigned_char) + __clang_svuint8x3_t u8; + + // CHECK-DAG: name: "__clang_svint16x3_t",{{.*}}, baseType: ![[CT16:[0-9]+]] + // CHECK-DAG: ![[CT16]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY16:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS16x3:[0-9]+]]) + // CHECK-DAG: ![[ELTTY16]] = !DIBasicType(name: "short", size: 16, encoding: DW_ATE_signed) + // CHECK-DAG: ![[ELTS16x3]] = !{![[REALELTS16x3:[0-9]+]]} + // CHECK-DAG: ![[REALELTS16x3]] = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 12, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus)) + __clang_svint16x3_t s16; + + // CHECK-DAG: name: "__clang_svuint16x3_t",{{.*}}, baseType: ![[CT16:[0-9]+]] + // CHECK-DAG: ![[CT16]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY16:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS16x3]]) + // CHECK-DAG: ![[ELTTY16]] = !DIBasicType(name: "unsigned short", size: 16, encoding: DW_ATE_unsigned) + __clang_svuint16x3_t u16; + + // CHECK-DAG: name: "__clang_svint32x3_t",{{.*}}, baseType: ![[CT32:[0-9]+]] + // CHECK-DAG: ![[CT32]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY32:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS32x3:[0-9]+]]) + // CHECK-DAG: ![[ELTTY32]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) + // CHECK-DAG: ![[ELTS32x3]] = 
!{![[REALELTS32x3:[0-9]+]]} + // CHECK-DAG: ![[REALELTS32x3]] = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 6, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus)) + __clang_svint32x3_t s32; + + // CHECK-DAG: name: "__clang_svuint32x3_t",{{.*}}, baseType: ![[CT32:[0-9]+]] + // CHECK-DAG: ![[CT32]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY32:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS32x3]]) + // CHECK-DAG: ![[ELTTY32]] = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned) + __clang_svuint32x3_t u32; + + // CHECK-DAG: name: "__clang_svint64x3_t",{{.*}}, baseType: ![[CT64:[0-9]+]] + // CHECK-DAG: ![[CT64]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY64:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS1x3_64:[0-9]+]]) + // CHECK-DAG: ![[ELTTY64]] = !DIBasicType(name: "long int", size: 64, encoding: DW_ATE_signed) + // CHECK-DAG: ![[ELTS1x3_64]] = !{![[REALELTS1x3_64:[0-9]+]]} + // CHECK-DAG: ![[REALELTS1x3_64]] = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 3, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus)) + __clang_svint64x3_t s64; + + // CHECK-DAG: name: "__clang_svuint64x3_t",{{.*}}, baseType: ![[CT64:[0-9]+]] + // CHECK-DAG: ![[CT64]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY64:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS1x3_64]]) + // CHECK-DAG: ![[ELTTY64]] = !DIBasicType(name: "long unsigned int", size: 64, encoding: DW_ATE_unsigned) + __clang_svuint64x3_t u64; + + // CHECK: name: "__clang_svfloat16x3_t",{{.*}}, baseType: ![[CT16:[0-9]+]] + // CHECK-DAG: ![[CT16]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY16:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS16x3]]) + // CHECK-DAG: ![[ELTTY16]] = !DIBasicType(name: "__fp16", size: 16, encoding: DW_ATE_float) + __clang_svfloat16x3_t f16; + + // CHECK: name: "__clang_svfloat32x3_t",{{.*}}, baseType: ![[CT32:[0-9]+]] + // CHECK-DAG: ![[CT32]] = 
!DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY32:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS32x3]]) + // CHECK-DAG: ![[ELTTY32]] = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float) + __clang_svfloat32x3_t f32; + + // CHECK: name: "__clang_svfloat64x3_t",{{.*}}, baseType: ![[CT64:[0-9]+]] + // CHECK-DAG: ![[CT64]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY64:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS1x3_64]]) + // CHECK-DAG: ![[ELTTY64]] = !DIBasicType(name: "double", size: 64, encoding: DW_ATE_float) + __clang_svfloat64x3_t f64; +} diff --git a/clang/test/CodeGen/aarch64-debug-sve-vectorx4-types.c b/clang/test/CodeGen/aarch64-debug-sve-vectorx4-types.c new file mode 100644 index 0000000000000..90a266c53f907 --- /dev/null +++ b/clang/test/CodeGen/aarch64-debug-sve-vectorx4-types.c @@ -0,0 +1,67 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve \ +// RUN: -emit-llvm -o - %s -debug-info-kind=limited 2>&1 | FileCheck %s + +void test_locals(void) { + // CHECK-DAG: name: "__clang_svint8x4_t",{{.*}}, baseType: ![[CT8:[0-9]+]] + // CHECK-DAG: ![[CT8]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY8:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS8x4:[0-9]+]]) + // CHECK-DAG: ![[ELTTY8]] = !DIBasicType(name: "signed char", size: 8, encoding: DW_ATE_signed_char) + // CHECK-DAG: ![[ELTS8x4]] = !{![[REALELTS8x4:[0-9]+]]} + // CHECK-DAG: ![[REALELTS8x4]] = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 32, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus)) + __clang_svint8x4_t s8; + + // CHECK-DAG: name: "__clang_svuint8x4_t",{{.*}}, baseType: ![[CT8:[0-9]+]] + // CHECK-DAG: ![[CT8]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY8:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS8x4]]) + // CHECK-DAG: ![[ELTTY8]] = !DIBasicType(name: "unsigned char", size: 8, encoding: DW_ATE_unsigned_char) + __clang_svuint8x4_t u8; + + // CHECK-DAG: name: 
"__clang_svint16x4_t",{{.*}}, baseType: ![[CT16:[0-9]+]] + // CHECK-DAG: ![[CT16]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY16:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS16x4:[0-9]+]]) + // CHECK-DAG: ![[ELTTY16]] = !DIBasicType(name: "short", size: 16, encoding: DW_ATE_signed) + // CHECK-DAG: ![[ELTS16x4]] = !{![[REALELTS16x4:[0-9]+]]} + // CHECK-DAG: ![[REALELTS16x4]] = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 16, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus)) + __clang_svint16x4_t s16; + + // CHECK-DAG: name: "__clang_svuint16x4_t",{{.*}}, baseType: ![[CT16:[0-9]+]] + // CHECK-DAG: ![[CT16]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY16:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS16x4]]) + // CHECK-DAG: ![[ELTTY16]] = !DIBasicType(name: "unsigned short", size: 16, encoding: DW_ATE_unsigned) + __clang_svuint16x4_t u16; + + // CHECK-DAG: name: "__clang_svint32x4_t",{{.*}}, baseType: ![[CT32:[0-9]+]] + // CHECK-DAG: ![[CT32]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY32:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS32x4:[0-9]+]]) + // CHECK-DAG: ![[ELTTY32]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) + // CHECK-DAG: ![[ELTS32x4]] = !{![[REALELTS32x4:[0-9]+]]} + // CHECK-DAG: ![[REALELTS32x4]] = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 8, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus)) + __clang_svint32x4_t s32; + + // CHECK-DAG: name: "__clang_svuint32x4_t",{{.*}}, baseType: ![[CT32:[0-9]+]] + // CHECK-DAG: ![[CT32]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY32:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS32x4]]) + // CHECK-DAG: ![[ELTTY32]] = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned) + __clang_svuint32x4_t u32; + + // CHECK-DAG: name: "__clang_svint64x4_t",{{.*}}, baseType: ![[CT64:[0-9]+]] + // CHECK-DAG: ![[CT64]] = !DICompositeType(tag: 
DW_TAG_array_type, baseType: ![[ELTTY64:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS1x4_64:[0-9]+]]) + // CHECK-DAG: ![[ELTTY64]] = !DIBasicType(name: "long int", size: 64, encoding: DW_ATE_signed) + // CHECK-DAG: ![[ELTS1x4_64]] = !{![[REALELTS1x4_64:[0-9]+]]} + // CHECK-DAG: ![[REALELTS1x4_64]] = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 4, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus)) + __clang_svint64x4_t s64; + + // CHECK-DAG: name: "__clang_svuint64x4_t",{{.*}}, baseType: ![[CT64:[0-9]+]] + // CHECK-DAG: ![[CT64]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY64:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS1x4_64]]) + // CHECK-DAG: ![[ELTTY64]] = !DIBasicType(name: "long unsigned int", size: 64, encoding: DW_ATE_unsigned) + __clang_svuint64x4_t u64; + + // CHECK: name: "__clang_svfloat16x4_t",{{.*}}, baseType: ![[CT16:[0-9]+]] + // CHECK-DAG: ![[CT16]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY16:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS16x4]]) + // CHECK-DAG: ![[ELTTY16]] = !DIBasicType(name: "__fp16", size: 16, encoding: DW_ATE_float) + __clang_svfloat16x4_t f16; + + // CHECK: name: "__clang_svfloat32x4_t",{{.*}}, baseType: ![[CT32:[0-9]+]] + // CHECK-DAG: ![[CT32]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY32:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS32x4]]) + // CHECK-DAG: ![[ELTTY32]] = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float) + __clang_svfloat32x4_t f32; + + // CHECK: name: "__clang_svfloat64x4_t",{{.*}}, baseType: ![[CT64:[0-9]+]] + // CHECK-DAG: ![[CT64]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY64:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS1x4_64]]) + // CHECK-DAG: ![[ELTTY64]] = !DIBasicType(name: "double", size: 64, encoding: DW_ATE_float) + __clang_svfloat64x4_t f64; +} diff --git a/clang/test/CodeGen/aarch64-sve.c b/clang/test/CodeGen/aarch64-sve.c index d21af74319f99..ebcf334f11d69 100644 
--- a/clang/test/CodeGen/aarch64-sve.c +++ b/clang/test/CodeGen/aarch64-sve.c @@ -1,22 +1,6 @@ -// RUN: not %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve \ -// RUN: -emit-llvm -o - %s -debug-info-kind=limited 2>&1 | FileCheck %s -check-prefix=CHECK-DEBUG // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve \ // RUN: -emit-llvm -o - %s 2>&1 | FileCheck %s -check-prefix=CHECK -// CHECK-DEBUG: cannot yet generate debug info for SVE type '__SVInt8_t' -// CHECK-DEBUG: cannot yet generate debug info for SVE type '__SVInt16_t' -// CHECK-DEBUG: cannot yet generate debug info for SVE type '__SVInt32_t' -// CHECK-DEBUG: cannot yet generate debug info for SVE type '__SVInt64_t' -// CHECK-DEBUG: cannot yet generate debug info for SVE type '__SVUint8_t' -// CHECK-DEBUG: cannot yet generate debug info for SVE type '__SVUint16_t' -// CHECK-DEBUG: cannot yet generate debug info for SVE type '__SVUint32_t' -// CHECK-DEBUG: cannot yet generate debug info for SVE type '__SVUint64_t' -// CHECK-DEBUG: cannot yet generate debug info for SVE type '__SVFloat16_t' -// CHECK-DEBUG: cannot yet generate debug info for SVE type '__SVFloat32_t' -// CHECK-DEBUG: cannot yet generate debug info for SVE type '__SVFloat64_t' -// CHECK-DEBUG: cannot yet generate debug info for SVE type '__SVBFloat16_t' -// CHECK-DEBUG: cannot yet generate debug info for SVE type '__SVBool_t' - // CHECK: @ptr = global * null, align 8 // CHECK: %s8 = alloca , align 16 // CHECK: %s16 = alloca , align 16 diff --git a/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c b/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c index 0eb130a377bdd..a14889599f782 100644 --- a/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c +++ b/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c @@ -12,10 +12,8 @@ // CHECK-LABEL: @test_vbfdot_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <8 x i8> -// 
CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> [[R:%.*]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #3 -// CHECK-NEXT: ret <2 x float> [[VBFDOT1_I]] +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[B:%.*]]) [[ATTR3:#.*]] +// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]] // float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) { return vbfdot_f32(r, a, b); @@ -23,10 +21,8 @@ float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) { // CHECK-LABEL: @test_vbfdotq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3 -// CHECK-NEXT: ret <4 x float> [[VBFDOT1_I]] +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]] // float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){ return vbfdotq_f32(r, a, b); @@ -36,10 +32,9 @@ float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){ // CHECK-NEXT: entry: // CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float> // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> undef, <2 x i32> zeroinitializer -// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> [[R:%.*]], <8 x i8> [[TMP0]], <8 x i8> [[DOTCAST1]]) #3 -// CHECK-NEXT: ret <2 x float> 
[[VBFDOT1_I]] +// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat> +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]]) [[ATTR3]] +// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]] // float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){ return vbfdot_lane_f32(r, a, b, 0); @@ -49,10 +44,9 @@ float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){ // CHECK-NEXT: entry: // CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float> // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> undef, <4 x i32> -// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[DOTCAST1]]) #3 -// CHECK-NEXT: ret <4 x float> [[VBFDOT1_I]] +// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat> +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]] // float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfdotq_laneq_f32(r, a, b, 3); @@ -62,10 +56,9 @@ float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b // CHECK-NEXT: entry: // CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float> // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> undef, <2 x i32> -// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <2 
x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> [[R:%.*]], <8 x i8> [[TMP0]], <8 x i8> [[DOTCAST1]]) #3 -// CHECK-NEXT: ret <2 x float> [[VBFDOT1_I]] +// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat> +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]]) [[ATTR3]] +// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]] // float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) { return vbfdot_laneq_f32(r, a, b, 3); @@ -75,10 +68,9 @@ float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) // CHECK-NEXT: entry: // CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float> // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> undef, <4 x i32> zeroinitializer -// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[DOTCAST1]]) #3 -// CHECK-NEXT: ret <4 x float> [[VBFDOT1_I]] +// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat> +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]] // float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { return vbfdotq_lane_f32(r, a, b, 0); @@ -86,10 +78,8 @@ float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) // CHECK-LABEL: @test_vbfmmlaq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: 
[[VBFMMLA1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3 -// CHECK-NEXT: ret <4 x float> [[VBFMMLA1_I]] +// CHECK-NEXT: [[VBFMMLAQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmmla(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMMLAQ_V3_I]] // float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmmlaq_f32(r, a, b); @@ -97,10 +87,8 @@ float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { // CHECK-LABEL: @test_vbfmlalbq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALB1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3 -// CHECK-NEXT: ret <4 x float> [[VBFMLALB1_I]] +// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]] // float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmlalbq_f32(r, a, b); @@ -108,10 +96,8 @@ float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { // CHECK-LABEL: @test_vbfmlaltq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3 -// CHECK-NEXT: ret <4 x float> [[VBFMLALT1_I]] +// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> 
[[B:%.*]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]] // float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmlaltq_f32(r, a, b); @@ -120,10 +106,8 @@ float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { // CHECK-LABEL: @test_vbfmlalbq_lane_f32( // CHECK-NEXT: entry: // CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> undef, <8 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALB1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3 -// CHECK-NEXT: ret <4 x float> [[VBFMLALB1_I]] +// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]] // float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { return vbfmlalbq_lane_f32(r, a, b, 0); @@ -132,10 +116,8 @@ float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t // CHECK-LABEL: @test_vbfmlalbq_laneq_f32( // CHECK-NEXT: entry: // CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <8 x bfloat> [[B:%.*]], <8 x bfloat> undef, <8 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALB1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3 -// CHECK-NEXT: ret <4 x float> [[VBFMLALB1_I]] +// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> 
[[VBFMLALBQ_V3_I]] // float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmlalbq_laneq_f32(r, a, b, 3); @@ -144,10 +126,8 @@ float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t // CHECK-LABEL: @test_vbfmlaltq_lane_f32( // CHECK-NEXT: entry: // CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> undef, <8 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3 -// CHECK-NEXT: ret <4 x float> [[VBFMLALT1_I]] +// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]] // float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { return vbfmlaltq_lane_f32(r, a, b, 0); @@ -156,10 +136,8 @@ float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t // CHECK-LABEL: @test_vbfmlaltq_laneq_f32( // CHECK-NEXT: entry: // CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <8 x bfloat> [[B:%.*]], <8 x bfloat> undef, <8 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3 -// CHECK-NEXT: ret <4 x float> [[VBFMLALT1_I]] +// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]] // float32x4_t 
test_vbfmlaltq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmlaltq_laneq_f32(r, a, b, 3); diff --git a/clang/test/CodeGen/attr-target-mv.c b/clang/test/CodeGen/attr-target-mv.c index 89b219c25fecc..57b266403dfc7 100644 --- a/clang/test/CodeGen/attr-target-mv.c +++ b/clang/test/CodeGen/attr-target-mv.c @@ -11,6 +11,7 @@ int __attribute__((target("arch=icelake-client"))) foo(void) {return 6;} int __attribute__((target("arch=icelake-server"))) foo(void) {return 7;} int __attribute__((target("arch=cooperlake"))) foo(void) {return 8;} int __attribute__((target("arch=tigerlake"))) foo(void) {return 9;} +int __attribute__((target("arch=sapphirerapids"))) foo(void) {return 10;} int __attribute__((target("default"))) foo(void) { return 2; } int bar() { @@ -91,6 +92,8 @@ __attribute__((target("avx,sse4.2"), used)) inline void foo_used2(int i, double // LINUX: ret i32 8 // LINUX: define i32 @foo.arch_tigerlake() // LINUX: ret i32 9 +// LINUX: define i32 @foo.arch_sapphirerapids() +// LINUX: ret i32 10 // LINUX: define i32 @foo() // LINUX: ret i32 2 // LINUX: define i32 @bar() diff --git a/clang/test/CodeGen/builtins-ppc-p10vector.c b/clang/test/CodeGen/builtins-ppc-p10vector.c index fe3e678a57948..16e468b623184 100644 --- a/clang/test/CodeGen/builtins-ppc-p10vector.c +++ b/clang/test/CodeGen/builtins-ppc-p10vector.c @@ -61,6 +61,54 @@ vector unsigned long long test_vec_div_ull(void) { return vec_div(vulla, vullb); } +vector signed int test_vec_dive_si(void) { + // CHECK: @llvm.ppc.altivec.vdivesw(<4 x i32> %{{.+}}, <4 x i32> %{{.+}}) + // CHECK-NEXT: ret <4 x i32> + return vec_dive(vsia, vsib); +} + +vector unsigned int test_vec_dive_ui(void) { + // CHECK: @llvm.ppc.altivec.vdiveuw(<4 x i32> %{{.+}}, <4 x i32> %{{.+}}) + // CHECK-NEXT: ret <4 x i32> + return vec_dive(vuia, vuib); +} + +vector signed long long test_vec_dive_sll(void) { + // CHECK: @llvm.ppc.altivec.vdivesd(<2 x i64> %{{.+}}, <2 x i64> %{{.+}}) + // CHECK-NEXT: ret <2 x i64> + return 
vec_dive(vslla, vsllb); +} + +vector unsigned long long test_vec_dive_ull(void) { + // CHECK: @llvm.ppc.altivec.vdiveud(<2 x i64> %{{.+}}, <2 x i64> %{{.+}}) + // CHECK-NEXT: ret <2 x i64> + return vec_dive(vulla, vullb); +} + +vector signed int test_vec_mulh_si(void) { + // CHECK: @llvm.ppc.altivec.vmulhsw(<4 x i32> %{{.+}}, <4 x i32> %{{.+}}) + // CHECK-NEXT: ret <4 x i32> + return vec_mulh(vsia, vsib); +} + +vector unsigned int test_vec_mulh_ui(void) { + // CHECK: @llvm.ppc.altivec.vmulhuw(<4 x i32> %{{.+}}, <4 x i32> %{{.+}}) + // CHECK-NEXT: ret <4 x i32> + return vec_mulh(vuia, vuib); +} + +vector signed long long test_vec_mulh_sll(void) { + // CHECK: @llvm.ppc.altivec.vmulhsd(<2 x i64> %{{.+}}, <2 x i64> %{{.+}}) + // CHECK-NEXT: ret <2 x i64> + return vec_mulh(vslla, vsllb); +} + +vector unsigned long long test_vec_mulh_ull(void) { + // CHECK: @llvm.ppc.altivec.vmulhud(<2 x i64> %{{.+}}, <2 x i64> %{{.+}}) + // CHECK-NEXT: ret <2 x i64> + return vec_mulh(vulla, vullb); +} + vector signed int test_vec_mod_si(void) { // CHECK: srem <4 x i32> // CHECK-NEXT: ret <4 x i32> diff --git a/clang/test/CodeGen/target-builtin-noerror.c b/clang/test/CodeGen/target-builtin-noerror.c index 339e5b15c88d7..42164303e4a6b 100644 --- a/clang/test/CodeGen/target-builtin-noerror.c +++ b/clang/test/CodeGen/target-builtin-noerror.c @@ -122,6 +122,7 @@ void verifycpustrings() { (void)__builtin_cpu_is("skylake-avx512"); (void)__builtin_cpu_is("slm"); (void)__builtin_cpu_is("tigerlake"); + (void)__builtin_cpu_is("sapphirerapids"); (void)__builtin_cpu_is("tremont"); (void)__builtin_cpu_is("westmere"); (void)__builtin_cpu_is("znver1"); diff --git a/clang/test/CodeGen/unsigned-shift-base.c b/clang/test/CodeGen/unsigned-shift-base.c new file mode 100644 index 0000000000000..2260005512f00 --- /dev/null +++ b/clang/test/CodeGen/unsigned-shift-base.c @@ -0,0 +1,28 @@ +// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -fsanitize=unsigned-shift-base,shift-exponent %s -emit-llvm -o - | 
FileCheck %s + +// CHECK-LABEL: lsh_overflow( +unsigned lsh_overflow(unsigned a, unsigned b) { + // CHECK: %[[RHS_INBOUNDS:.*]] = icmp ule i32 %[[RHS:.*]], 31 + // CHECK-NEXT: br i1 %[[RHS_INBOUNDS]], label %[[CHECK_BB:.*]], label %[[CONT_BB:.*]], + + // CHECK: [[CHECK_BB]]: + // CHECK-NEXT: %[[SHIFTED_OUT_WIDTH:.*]] = sub nuw nsw i32 31, %[[RHS]] + // CHECK-NEXT: %[[SHIFTED_OUT:.*]] = lshr i32 %[[LHS:.*]], %[[SHIFTED_OUT_WIDTH]] + + // CHECK-NEXT: %[[SHIFTED_OUT_NOT_SIGN:.*]] = lshr i32 %[[SHIFTED_OUT]], 1 + + // CHECK-NEXT: %[[NO_OVERFLOW:.*]] = icmp eq i32 %[[SHIFTED_OUT_NOT_SIGN]], 0 + // CHECK-NEXT: br label %[[CONT_BB]] + + // CHECK: [[CONT_BB]]: + // CHECK-NEXT: %[[VALID_BASE:.*]] = phi i1 [ true, {{.*}} ], [ %[[NO_OVERFLOW]], %[[CHECK_BB]] ] + // CHECK-NEXT: %[[VALID:.*]] = and i1 %[[RHS_INBOUNDS]], %[[VALID_BASE]] + // CHECK-NEXT: br i1 %[[VALID]] + + // CHECK: call void @__ubsan_handle_shift_out_of_bounds + // CHECK-NOT: call void @__ubsan_handle_shift_out_of_bounds + + // CHECK: %[[RET:.*]] = shl i32 %[[LHS]], %[[RHS]] + // CHECK-NEXT: ret i32 %[[RET]] + return a << b; +} diff --git a/clang/test/CodeGenCXX/debug-info-template-explicit-specialization.cpp b/clang/test/CodeGenCXX/debug-info-template-explicit-specialization.cpp index 4e41c4092bf4e..b756674f54c40 100644 --- a/clang/test/CodeGenCXX/debug-info-template-explicit-specialization.cpp +++ b/clang/test/CodeGenCXX/debug-info-template-explicit-specialization.cpp @@ -1,5 +1,8 @@ // RUN: %clang_cc1 -emit-llvm -triple %itanium_abi_triple -debug-info-kind=limited %s -o - | FileCheck %s +// Make sure this still works with constructor homing. +// RUN: %clang_cc1 -emit-llvm -triple %itanium_abi_triple -debug-info-kind=constructor %s -o - | FileCheck %s + // Run again with -gline-tables-only or -gline-directives-only and verify we don't crash. We won't output // type info at all. 
// RUN: %clang_cc1 -emit-llvm -triple %itanium_abi_triple -debug-info-kind=line-tables-only %s -o - | FileCheck %s -check-prefix LINES-ONLY diff --git a/clang/test/CoverageMapping/if.cpp b/clang/test/CoverageMapping/if.cpp index 5cbc974aaf041..8ffc09d29a3c7 100644 --- a/clang/test/CoverageMapping/if.cpp +++ b/clang/test/CoverageMapping/if.cpp @@ -44,3 +44,10 @@ int main() { // CHECK: File 0, [[@LINE]]:12 -> {{[0-9]+}}:2 = return 0; } + +#define FOO true + +// CHECK-LABEL: _Z7ternaryv: +void ternary() { + true ? FOO : FOO; // CHECK-NOT: Gap,{{.*}}, [[@LINE]]:8 -> +} diff --git a/clang/test/CoverageMapping/macro-expressions.cpp b/clang/test/CoverageMapping/macro-expressions.cpp index 6ac82523fef8d..60afc5238b9eb 100644 --- a/clang/test/CoverageMapping/macro-expressions.cpp +++ b/clang/test/CoverageMapping/macro-expressions.cpp @@ -57,8 +57,7 @@ void foo(int i) { // CHECK: File 0, [[@LINE+1]]:10 -> [[@LINE+1]]:12 = #1 if (0) {} - // CHECK-NEXT: Expansion,File 0, [[@LINE+3]]:7 -> [[@LINE+3]]:11 = #0 - // CHECK-NEXT: Gap,File 0, [[@LINE+2]]:15 -> [[@LINE+2]]:16 = #2 + // CHECK-NEXT: Expansion,File 0, [[@LINE+2]]:7 -> [[@LINE+2]]:11 = #0 // CHECK-NEXT: File 0, [[@LINE+1]]:16 -> [[@LINE+1]]:18 = #2 if (EXPR(i)) {} // CHECK-NEXT: Expansion,File 0, [[@LINE+2]]:9 -> [[@LINE+2]]:14 = (#0 + #3) @@ -71,8 +70,7 @@ void foo(int i) { for (ASSIGN(DECL(int, j), 0); LT(j, i); INC(j)) {} // CHECK-NEXT: Expansion,File 0, [[@LINE+1]]:3 -> [[@LINE+1]]:9 = #0 ASSIGN(DECL(int, k), 0); - // CHECK-NEXT: Expansion,File 0, [[@LINE+4]]:10 -> [[@LINE+4]]:12 = (#0 + #5) - // CHECK-NEXT: Gap,File 0, [[@LINE+3]]:19 -> [[@LINE+3]]:20 = #5 + // CHECK-NEXT: Expansion,File 0, [[@LINE+3]]:10 -> [[@LINE+3]]:12 = (#0 + #5) // CHECK-NEXT: File 0, [[@LINE+2]]:20 -> [[@LINE+2]]:31 = #5 // CHECK-NEXT: Expansion,File 0, [[@LINE+1]]:22 -> [[@LINE+1]]:25 = #5 while (LT(k, i)) { INC(k); } @@ -84,7 +82,6 @@ void foo(int i) { // CHECK: File 0, [[@LINE+1]]:42 -> [[@LINE+1]]:44 = #7 for (DECL(int, j) : ARR(int, 1, 2, 
3)) {} - // CHECK-NEXT: Gap,File 0, [[@LINE+3]]:12 -> [[@LINE+3]]:14 = #8 // CHECK-NEXT: Expansion,File 0, [[@LINE+2]]:14 -> [[@LINE+2]]:20 = #0 // CHECK-NEXT: Expansion,File 0, [[@LINE+1]]:23 -> [[@LINE+1]]:29 = #0 (void)(i ? PRIo64 : PRIu64); diff --git a/clang/test/CoverageMapping/macroparams2.c b/clang/test/CoverageMapping/macroparams2.c index cd29e3c4f190a..30ce25d9accda 100644 --- a/clang/test/CoverageMapping/macroparams2.c +++ b/clang/test/CoverageMapping/macroparams2.c @@ -6,12 +6,11 @@ struct S { int i, j; }; -// CHECK: File 0, [[@LINE+1]]:12 -> [[@LINE+11]]:2 = #0 +// CHECK: File 0, [[@LINE+1]]:12 -> [[@LINE+10]]:2 = #0 int main() { struct S arr[32] = { 0 }; int n = 0; - // CHECK-NEXT: Expansion,File 0, [[@LINE+3]]:7 -> [[@LINE+3]]:12 = #0 - // CHECK-NEXT: Gap,File 0, [[@LINE+2]]:33 -> [[@LINE+2]]:34 = #1 + // CHECK-NEXT: Expansion,File 0, [[@LINE+2]]:7 -> [[@LINE+2]]:12 = #0 // CHECK-NEXT: File 0, [[@LINE+1]]:34 -> [[@LINE+3]]:4 = #1 if (MACRO(arr[n].j, arr[n].i)) { n = 1; diff --git a/clang/test/CoverageMapping/macros.c b/clang/test/CoverageMapping/macros.c index 81ef2d9b7521a..83e2029be5612 100644 --- a/clang/test/CoverageMapping/macros.c +++ b/clang/test/CoverageMapping/macros.c @@ -38,32 +38,29 @@ void func3() { // CHECK-NEXT: File 0, [[@LINE]]:14 -> [[@LINE+3]]:2 = #0 // CHECK-NEXT: File 2, 4:17 -> 4:22 = #0 // CHECK-NEXT: func4 -void func4() { // CHECK-NEXT: File 0, [[@LINE]]:14 -> [[@LINE+7]]:2 = #0 +void func4() { // CHECK-NEXT: File 0, [[@LINE]]:14 -> [[@LINE+6]]:2 = #0 int i = 0; while (i++ < 10) // CHECK-NEXT: File 0, [[@LINE]]:10 -> [[@LINE]]:18 = (#0 + #1) - if (i < 5) // CHECK: File 0, [[@LINE]]:5 -> [[@LINE+3]]:14 = #1 + if (i < 5) // CHECK: File 0, [[@LINE]]:5 -> [[@LINE+2]]:14 = #1 // CHECK-NEXT: File 0, [[@LINE-1]]:9 -> [[@LINE-1]]:14 = #1 - // CHECK-NEXT: Gap,File 0, [[@LINE-2]]:15 -> [[@LINE+1]]:7 = #2 MACRO_2; // CHECK-NEXT: Expansion,File 0, [[@LINE]]:7 -> [[@LINE]]:14 = #2 } // CHECK-NEXT: File 1, 4:17 -> 4:22 = #2 // CHECK-NOT: 
File 1 // CHECK-NEXT: func5 -void func5() { // CHECK-NEXT: File 0, [[@LINE]]:14 -> [[@LINE+5]]:2 = #0 +void func5() { // CHECK-NEXT: File 0, [[@LINE]]:14 -> [[@LINE+4]]:2 = #0 int i = 0; if (i > 5) // CHECK-NEXT: File 0, [[@LINE]]:7 -> [[@LINE]]:12 = #0 - // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:13 -> [[@LINE+1]]:5 = #1 MACRO_3; // CHECK-NEXT: Expansion,File 0, [[@LINE]]:5 -> [[@LINE]]:12 = #1 } // CHECK-NEXT: Expansion,File 1, 6:17 -> 6:24 = #1 // CHECK-NEXT: File 2, 4:17 -> 4:22 = #1 // CHECK-NEXT: func6 -void func6(unsigned count) { // CHECK-NEXT: File 0, [[@LINE]]:28 -> [[@LINE+5]]:2 = #0 -begin: // CHECK-NEXT: File 0, [[@LINE]]:1 -> [[@LINE+4]]:2 = #1 +void func6(unsigned count) { // CHECK-NEXT: File 0, [[@LINE]]:28 -> [[@LINE+4]]:2 = #0 +begin: // CHECK-NEXT: File 0, [[@LINE]]:1 -> [[@LINE+3]]:2 = #1 if (count--) // CHECK-NEXT: File 0, [[@LINE]]:9 -> [[@LINE]]:16 = #1 - // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:17 -> [[@LINE+1]]:9 = #2 GOTO begin; // CHECK-NEXT: File 0, [[@LINE]]:9 -> [[@LINE]]:19 = #2 } // CHECK-NEXT: Expansion,File 0, [[@LINE-2]]:9 -> [[@LINE-2]]:13 = #2 diff --git a/clang/test/CoverageMapping/macroscopes.cpp b/clang/test/CoverageMapping/macroscopes.cpp index c600574fb1e3c..62f5dbe77981f 100644 --- a/clang/test/CoverageMapping/macroscopes.cpp +++ b/clang/test/CoverageMapping/macroscopes.cpp @@ -61,15 +61,13 @@ int main() { starts_a_scope ends_a_scope - // CHECK-NEXT: Expansion,File 0, [[@LINE+3]]:3 -> [[@LINE+3]]:17 = #0 - // CHECK-NEXT: Gap,File 0, [[@LINE+2]]:3 -> [[@LINE+3]]:5 = #8 + // CHECK-NEXT: Expansion,File 0, [[@LINE+2]]:3 -> [[@LINE+2]]:17 = #0 // CHECK-NEXT: Expansion,File 0, [[@LINE+2]]:5 -> [[@LINE+2]]:16 = #8 starts_a_while simple_stmt; x = 0; - // CHECK-NEXT: Expansion,File 0, [[@LINE+5]]:3 -> [[@LINE+5]]:17 = #0 - // CHECK-NEXT: Gap,File 0, [[@LINE+4]]:3 -> [[@LINE+4]]:18 = #9 + // CHECK-NEXT: Expansion,File 0, [[@LINE+4]]:3 -> [[@LINE+4]]:17 = #0 // CHECK-NEXT: File 0, [[@LINE+3]]:18 -> [[@LINE+5]]:15 = #9 // CHECK-NEXT: 
Expansion,File 0, [[@LINE+3]]:5 -> [[@LINE+3]]:16 = #9 // CHECK-NEXT: Expansion,File 0, [[@LINE+3]]:3 -> [[@LINE+3]]:15 = #9 diff --git a/clang/test/CoverageMapping/moremacros.c b/clang/test/CoverageMapping/moremacros.c index 91657fc76d8b9..ed89dcafd6723 100644 --- a/clang/test/CoverageMapping/moremacros.c +++ b/clang/test/CoverageMapping/moremacros.c @@ -9,18 +9,16 @@ int main(int argc, const char *argv[]) { // CHECK-NEXT: File 0, [[@LINE+1]]:7 -> [[@LINE+1]]:12 = #0 if (!argc) {} // CHECK: File 0, [[@LINE]]:14 -> [[@LINE]]:16 = #1 - // CHECK-NEXT: File 0, [[@LINE+4]]:7 -> [[@LINE+4]]:12 = #0 - // CHECK-NEXT: Gap,File 0, [[@LINE+3]]:13 -> [[@LINE+3]]:14 = #2 + // CHECK-NEXT: File 0, [[@LINE+3]]:7 -> [[@LINE+3]]:12 = #0 // CHECK-NEXT: Expansion,File 0, [[@LINE+2]]:14 -> [[@LINE+2]]:19 = #2 // CHECK-NEXT: File 0, [[@LINE+1]]:19 -> [[@LINE+4]]:8 = #2 if (!argc) LBRAC return 0; // CHECK-NEXT: Expansion,File 0, [[@LINE+1]]:3 -> [[@LINE+1]]:8 = #2 - RBRAC // CHECK-NEXT: [[@LINE]]:8 -> [[@LINE+7]]:3 = (#0 - #2) + RBRAC // CHECK-NEXT: [[@LINE]]:8 -> [[@LINE+6]]:3 = (#0 - #2) - // CHECK-NEXT: File 0, [[@LINE+5]]:3 -> [[@LINE+16]]:2 = (#0 - #2) - // CHECK-NEXT: File 0, [[@LINE+4]]:7 -> [[@LINE+4]]:12 = (#0 - #2) - // CHECK-NEXT: Gap,File 0, [[@LINE+3]]:13 -> [[@LINE+3]]:14 = #3 + // CHECK-NEXT: File 0, [[@LINE+4]]:3 -> [[@LINE+15]]:2 = (#0 - #2) + // CHECK-NEXT: File 0, [[@LINE+3]]:7 -> [[@LINE+3]]:12 = (#0 - #2) // CHECK-NEXT: Expansion,File 0, [[@LINE+2]]:14 -> [[@LINE+2]]:19 = #3 // CHECK-NEXT: File 0, [[@LINE+1]]:19 -> [[@LINE+3]]:4 = #3 if (!argc) LBRAC diff --git a/clang/test/Driver/XRay/xray-instrument-os.c b/clang/test/Driver/XRay/xray-instrument-os.c index ba97328b54a67..3a0397208326f 100644 --- a/clang/test/Driver/XRay/xray-instrument-os.c +++ b/clang/test/Driver/XRay/xray-instrument-os.c @@ -1,4 +1,4 @@ // RUN: not %clang -o /dev/null -v -fxray-instrument -c %s -// XFAIL: -linux-, -freebsd, -darwin, -macos +// XFAIL: -linux-, -freebsd, x86_64-apple-darwin, 
x86_64-apple-macos // REQUIRES: amd64 || x86_64 || x86_64h || arm || aarch64 || arm64 typedef int a; diff --git a/clang/test/Driver/cl-options.c b/clang/test/Driver/cl-options.c index bc4cab0f949f2..89dbdebbaf69d 100644 --- a/clang/test/Driver/cl-options.c +++ b/clang/test/Driver/cl-options.c @@ -573,7 +573,7 @@ // STDCXX17: -std=c++17 // RUN: %clang_cl -fmsc-version=1900 -TP -std:c++latest -### -- %s 2>&1 | FileCheck -check-prefix=STDCXXLATEST %s -// STDCXXLATEST: -std=c++2a +// STDCXXLATEST: -std=c++20 // RUN: env CL="/Gy" %clang_cl -### -- %s 2>&1 | FileCheck -check-prefix=ENV-CL %s // ENV-CL: "-ffunction-sections" @@ -682,4 +682,13 @@ // CLANG-NOT: "--dependent-lib=libcmt" // CLANG-NOT: "-vectorize-slp" +// Validate that the default triple is used when an empty tools dir is specified +// RUN: %clang_cl -vctoolsdir "" -### -- %s 2>&1 | FileCheck %s --check-prefix VCTOOLSDIR +// VCTOOLSDIR: "-triple" "{{[a-zA-Z0-9_-]*}}-pc-windows-msvc19.11.0" + +// Validate that built-in include paths are based on the supplied path +// RUN: %clang_cl -vctoolsdir "/fake" -### -- %s 2>&1 | FileCheck %s --check-prefix FAKEDIR +// FAKEDIR: "-internal-isystem" "/fake{{/|\\\\}}include" +// FAKEDIR: "-internal-isystem" "/fake{{/|\\\\}}atlmfc{{/|\\\\}}include" + void f() { } diff --git a/clang/test/Driver/debug-var-experimental-switch.c b/clang/test/Driver/debug-var-experimental-switch.c new file mode 100644 index 0000000000000..9c7a782e9e2bb --- /dev/null +++ b/clang/test/Driver/debug-var-experimental-switch.c @@ -0,0 +1,2 @@ +// RUN: %clang -Xclang -fexperimental-debug-variable-locations -fsyntax-only -disable-llvm-passes %s +int main() {} diff --git a/clang/test/Driver/fmemprof.cpp b/clang/test/Driver/fmemprof.cpp new file mode 100644 index 0000000000000..049067803e2b4 --- /dev/null +++ b/clang/test/Driver/fmemprof.cpp @@ -0,0 +1,6 @@ +// RUN: %clangxx -target x86_64-linux-gnu -fmemprof %s -### 2>&1 | FileCheck %s +// RUN: %clangxx -target x86_64-linux-gnu -fmemprof -fno-memprof 
%s -### 2>&1 | FileCheck %s --check-prefix=OFF +// CHECK: "-cc1" {{.*}} "-fmemprof" +// CHECK: ld{{.*}}libclang_rt.heapprof{{.*}}libclang_rt.heapprof_cxx +// OFF-NOT: "-fmemprof" +// OFF-NOT: libclang_rt.heapprof diff --git a/clang/test/Driver/fsanitize.c b/clang/test/Driver/fsanitize.c index cfefd3fb632cb..bad519fcef24b 100644 --- a/clang/test/Driver/fsanitize.c +++ b/clang/test/Driver/fsanitize.c @@ -32,7 +32,7 @@ // CHECK-COVERAGE-WIN64: "--dependent-lib={{[^"]*}}ubsan_standalone-x86_64.lib" // RUN: %clang -target %itanium_abi_triple -fsanitize=integer %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-INTEGER -implicit-check-not="-fsanitize-address-use-after-scope" -// CHECK-INTEGER: "-fsanitize={{((signed-integer-overflow|unsigned-integer-overflow|integer-divide-by-zero|shift-base|shift-exponent|implicit-unsigned-integer-truncation|implicit-signed-integer-truncation|implicit-integer-sign-change),?){8}"}} +// CHECK-INTEGER: "-fsanitize={{((signed-integer-overflow|unsigned-integer-overflow|integer-divide-by-zero|shift-base|shift-exponent|implicit-unsigned-integer-truncation|implicit-signed-integer-truncation|implicit-integer-sign-change|unsigned-shift-base),?){9}"}} // RUN: %clang -fsanitize=implicit-conversion %s -### 2>&1 | FileCheck %s --check-prefixes=CHECK-implicit-conversion,CHECK-implicit-conversion-RECOVER // RUN: %clang -fsanitize=implicit-conversion -fsanitize-recover=implicit-conversion %s -### 2>&1 | FileCheck %s --check-prefixes=CHECK-implicit-conversion,CHECK-implicit-conversion-RECOVER diff --git a/clang/test/Driver/modules.cpp b/clang/test/Driver/modules.cpp index 4f4e3a414002d..87b6cc640cb0d 100644 --- a/clang/test/Driver/modules.cpp +++ b/clang/test/Driver/modules.cpp @@ -22,6 +22,7 @@ // Check use of a .pcm file in another compilation. 
// // RUN: %clang -std=c++2a -fmodule-file=%t/module.pcm -Dexport= %s -S -o %t/module.o -v 2>&1 | FileCheck %s --check-prefix=CHECK-USE +// RUN: %clang -std=c++20 -fmodule-file=%t/module.pcm -Dexport= %s -S -o %t/module.o -v 2>&1 | FileCheck %s --check-prefix=CHECK-USE // // CHECK-USE: -cc1 // CHECK-USE-SAME: {{-emit-obj|-S}} diff --git a/clang/test/Driver/openbsd.c b/clang/test/Driver/openbsd.c index e17d05dc76da3..cee4539eaca2f 100644 --- a/clang/test/Driver/openbsd.c +++ b/clang/test/Driver/openbsd.c @@ -122,8 +122,3 @@ // RUN: %clang -target powerpc-unknown-openbsd -### -c %s 2>&1 \ // RUN: | FileCheck -check-prefix=CHECK-POWERPC-SECUREPLT %s // CHECK-POWERPC-SECUREPLT: "-target-feature" "+secure-plt" - -// Check -fno-init-array -// RUN: %clang -no-canonical-prefixes -target i686-pc-openbsd %s -### 2>&1 \ -// RUN: | FileCheck --check-prefix=CHECK-CTORS %s -// CHECK-CTORS: "-fno-use-init-array" diff --git a/clang/test/Driver/ppc-dependent-options.cpp b/clang/test/Driver/ppc-dependent-options.cpp index 1c1a0c38cdcb0..65c40e9ce70f6 100644 --- a/clang/test/Driver/ppc-dependent-options.cpp +++ b/clang/test/Driver/ppc-dependent-options.cpp @@ -58,6 +58,14 @@ // RUN: -mcpu=power10 -std=c++11 -mno-vsx -mpaired-vector-memops %s 2>&1 | \ // RUN: FileCheck %s -check-prefix=CHECK-NVSX-PAIRED-VEC-MEMOPS +// RUN: not %clang -target powerpc64le-unknown-unknown -fsyntax-only \ +// RUN: -mcpu=power10 -std=c++11 -mno-vsx -mmma %s 2>&1 | \ +// RUN: FileCheck %s -check-prefix=CHECK-NVSX-MMA + +// RUN: not %clang -target powerpc64le-unknown-unknown -fsyntax-only \ +// RUN: -mcpu=future -std=c++11 -mno-vsx -mmma %s 2>&1 | \ +// RUN: FileCheck %s -check-prefix=CHECK-NVSX-MMA + // RUN: not %clang -target powerpc64le-unknown-unknown -fsyntax-only \ // RUN: -mcpu=power9 -std=c++11 -mno-vsx -mfloat128 -mpower9-vector %s 2>&1 | \ // RUN: FileCheck %s -check-prefix=CHECK-NVSX-MULTI @@ -103,5 +111,6 @@ static_assert(false, "Neither enabled"); // CHECK-NVSX-PAIRED-VEC-MEMOPS: error: option 
'-mpaired-vector-memops' cannot be specified with '-mno-vsx' // CHECK-NVSX-MULTI: error: option '-mfloat128' cannot be specified with '-mno-vsx' // CHECK-NVSX-MULTI: error: option '-mpower9-vector' cannot be specified with '-mno-vsx' +// CHECK-NVSX-MMA: error: option '-mmma' cannot be specified with '-mno-vsx' // CHECK-NVSX: Neither enabled // CHECK-VSX: VSX enabled diff --git a/clang/test/Driver/sycl-offload-intelfpga.cpp b/clang/test/Driver/sycl-offload-intelfpga.cpp index de589affbc5e7..71d5c2bc76051 100644 --- a/clang/test/Driver/sycl-offload-intelfpga.cpp +++ b/clang/test/Driver/sycl-offload-intelfpga.cpp @@ -32,7 +32,7 @@ // CHK-FPGA-LINK-NOT: clang-offload-bundler{{.*}} // CHK-FPGA-LINK: llvm-link{{.*}} "[[OUTPUT1]]" "-o" "[[OUTPUT2_1:.+\.bc]]" // CHK-FPGA-LINK: sycl-post-link{{.*}} "-ir-output-only" "-spec-const=default" "-o" "[[OUTPUT2:.+\.bc]]" "[[OUTPUT2_1]]" -// CHK-FPGA-LINK: llvm-spirv{{.*}} "-o" "[[OUTPUT3:.+\.spv]]" "-spirv-max-version=1.1" "-spirv-ext=+all,-SPV_INTEL_usm_storage_classes" "[[OUTPUT2]]" +// CHK-FPGA-LINK: llvm-spirv{{.*}} "-o" "[[OUTPUT3:.+\.spv]]" "-spirv-max-version=1.1" "-spirv-debug-info-version=legacy" "-spirv-ext=+all,-SPV_INTEL_usm_storage_classes" "[[OUTPUT2]]" // CHK-FPGA-EARLY: aoc{{.*}} "-o" "[[OUTPUT4:.+\.aocr]]" "[[OUTPUT3]]" "-sycl" "-rtl" // CHK-FPGA-IMAGE: aoc{{.*}} "-o" "[[OUTPUT5:.+\.aocx]]" "[[OUTPUT3]]" "-sycl" // CHK-FPGA-LINK: llvm-ar{{.*}} "cr" "libfoo.a" "[[INPUT]]" @@ -58,7 +58,7 @@ // CHK-FPGA-LINK-WIN-NOT: clang-offload-bundler{{.*}} // CHK-FPGA-LINK-WIN: llvm-link{{.*}} "[[OUTPUT1]]" "-o" "[[OUTPUT2_1:.+\.bc]]" // CHK-FPGA-LINK-WIN: sycl-post-link{{.*}} "-ir-output-only" "-spec-const=default" "-o" "[[OUTPUT2:.+\.bc]]" "[[OUTPUT2_1]]" -// CHK-FPGA-LINK-WIN: llvm-spirv{{.*}} "-o" "[[OUTPUT3:.+\.spv]]" "-spirv-max-version=1.1" "-spirv-ext=+all,-SPV_INTEL_usm_storage_classes" "[[OUTPUT2]]" +// CHK-FPGA-LINK-WIN: llvm-spirv{{.*}} "-o" "[[OUTPUT3:.+\.spv]]" "-spirv-max-version=1.1" 
"-spirv-debug-info-version=legacy" "-spirv-ext=+all,-SPV_INTEL_usm_storage_classes" "[[OUTPUT2]]" // CHK-FPGA-LINK-WIN: aoc{{.*}} "-o" "[[OUTPUT5:.+\.aocr]]" "[[OUTPUT3]]" "-sycl" "-rtl" // CHK-FPGA-LINK-WIN: lib.exe{{.*}} "[[INPUT]]" {{.*}} "-OUT:libfoo.lib" @@ -112,7 +112,7 @@ // CHK-FPGA: clang-offload-bundler{{.*}} "-type=o" "-targets=host-x86_64-unknown-linux-gnu,sycl-spir64_fpga-unknown-unknown-sycldevice" {{.*}} "-outputs=[[FINALLINK2:.+\.o]],[[OUTPUT1:.+\.o]]" "-unbundle" // CHK-FPGA: llvm-no-spir-kernel{{.*}} "[[OUTPUT1]]" "-o" "[[OUTPUT3:.+\.o]]" // CHK-FPGA: llvm-link{{.*}} "[[OUTPUT3]]" "-o" "[[OUTPUT4:.+\.bc]]" -// CHK-FPGA: llvm-spirv{{.*}} "-o" "[[OUTPUT5:.+\.spv]]" "-spirv-max-version=1.1" "-spirv-ext=+all,-SPV_INTEL_usm_storage_classes" "[[OUTPUT4]]" +// CHK-FPGA: llvm-spirv{{.*}} "-o" "[[OUTPUT5:.+\.spv]]" "-spirv-max-version=1.1" "-spirv-debug-info-version=legacy" "-spirv-ext=+all,-SPV_INTEL_usm_storage_classes" "[[OUTPUT4]]" // CHK-FPGA: clang-offload-wrapper{{.*}} "-o=[[OUTPUT6:.+\.bc]]" "-host=x86_64-unknown-linux-gnu" "-target=spir64_fpga" "-kind=sycl" "[[OUTPUT5]]" // CHK-FPGA: llc{{.*}} "-filetype=obj" "-o" "[[FINALLINK3:.+\.o]]" "[[OUTPUT6]]" // CHK-FPGA: clang-offload-bundler{{.*}} "-type=aoo" "-targets=host-x86_64-unknown-linux-gnu" {{.*}} "-outputs=[[FINALLINK4:.+\.txt]]" "-unbundle" diff --git a/clang/test/Driver/sycl-offload.c b/clang/test/Driver/sycl-offload.c index ab6a7c9d90e1b..c743f3d8a9996 100644 --- a/clang/test/Driver/sycl-offload.c +++ b/clang/test/Driver/sycl-offload.c @@ -669,10 +669,10 @@ // CHK-TOOLS-AOT: clang{{.*}} "-fsycl-is-device" {{.*}} "-o" "[[OUTPUT1:.+\.bc]]" // CHK-TOOLS-AOT: llvm-link{{.*}} "[[OUTPUT1]]" "-o" "[[OUTPUT2:.+\.bc]]" // CHK-TOOLS-AOT: sycl-post-link{{.*}} "-o" "[[OUTPUT2_1:.+\.bc]]" "[[OUTPUT2]]" -// CHK-TOOLS-CPU: llvm-spirv{{.*}} "-o" "[[OUTPUT3:.+\.spv]]" "-spirv-max-version=1.1" "-spirv-ext=+all,-SPV_INTEL_usm_storage_classes" "[[OUTPUT2_1]]" -// CHK-TOOLS-GEN: llvm-spirv{{.*}} "-o" 
"[[OUTPUT3:.+\.spv]]" "-spirv-max-version=1.1" "-spirv-ext=+all,-SPV_INTEL_usm_storage_classes" "[[OUTPUT2_1]]" -// CHK-TOOLS-FPGA-USM-DISABLE: llvm-spirv{{.*}} "-o" "[[OUTPUT3:.+\.spv]]" "-spirv-max-version=1.1" "-spirv-ext=+all,-SPV_INTEL_usm_storage_classes" "[[OUTPUT2_1]]" -// CHK-TOOLS-FPGA-USM-ENABLE: llvm-spirv{{.*}} "-o" "[[OUTPUT3:.+\.spv]]" "-spirv-max-version=1.1" "-spirv-ext=+all" "[[OUTPUT2_1]]" +// CHK-TOOLS-CPU: llvm-spirv{{.*}} "-o" "[[OUTPUT3:.+\.spv]]" "-spirv-max-version=1.1" "-spirv-debug-info-version=legacy" "-spirv-ext=+all,-SPV_INTEL_usm_storage_classes" "[[OUTPUT2_1]]" +// CHK-TOOLS-GEN: llvm-spirv{{.*}} "-o" "[[OUTPUT3:.+\.spv]]" "-spirv-max-version=1.1" "-spirv-debug-info-version=legacy" "-spirv-ext=+all,-SPV_INTEL_usm_storage_classes" "[[OUTPUT2_1]]" +// CHK-TOOLS-FPGA-USM-DISABLE: llvm-spirv{{.*}} "-o" "[[OUTPUT3:.+\.spv]]" "-spirv-max-version=1.1" "-spirv-debug-info-version=legacy" "-spirv-ext=+all,-SPV_INTEL_usm_storage_classes" "[[OUTPUT2_1]]" +// CHK-TOOLS-FPGA-USM-ENABLE: llvm-spirv{{.*}} "-o" "[[OUTPUT3:.+\.spv]]" "-spirv-max-version=1.1" "-spirv-debug-info-version=legacy" "-spirv-ext=+all" "[[OUTPUT2_1]]" // CHK-TOOLS-FPGA: aoc{{.*}} "-o" "[[OUTPUT4:.+\.aocx]]" "[[OUTPUT3]]" // CHK-TOOLS-GEN: ocloc{{.*}} "-output" "[[OUTPUT4:.+\.out]]" {{.*}} "[[OUTPUT3]]" // CHK-TOOLS-CPU: opencl-aot{{.*}} "-o=[[OUTPUT4:.+\.out]]" {{.*}} "[[OUTPUT3]]" diff --git a/clang/test/Driver/x86-march.c b/clang/test/Driver/x86-march.c index 87fdf624a9698..57882a5bf637f 100644 --- a/clang/test/Driver/x86-march.c +++ b/clang/test/Driver/x86-march.c @@ -104,6 +104,10 @@ // RUN: | FileCheck %s -check-prefix=tremont // tremont: "-target-cpu" "tremont" // +// RUN: %clang -target x86_64-unknown-unknown -c -### %s -march=sapphirerapids 2>&1 \ +// RUN: | FileCheck %s -check-prefix=sapphirerapids +// sapphirerapids: "-target-cpu" "sapphirerapids" +// // RUN: %clang -target x86_64-unknown-unknown -c -### %s -march=k8 2>&1 \ // RUN: | FileCheck %s -check-prefix=k8 // 
k8: "-target-cpu" "k8" diff --git a/clang/test/Driver/x86-mtune.c b/clang/test/Driver/x86-mtune.c index 731c580afc48b..a313412b6ab25 100644 --- a/clang/test/Driver/x86-mtune.c +++ b/clang/test/Driver/x86-mtune.c @@ -1,5 +1,14 @@ // Ensure we support the -mtune flag. -// + +// Default mtune should be generic. +// RUN: %clang -target x86_64-unknown-unknown -c -### %s 2>&1 \ +// RUN: | FileCheck %s -check-prefix=notune +// notune: "-tune-cpu" "generic" + +// RUN: %clang -target x86_64-unknown-unknown -c -### %s -mtune=generic 2>&1 \ +// RUN: | FileCheck %s -check-prefix=generic +// generic: "-tune-cpu" "generic" + // RUN: %clang -target x86_64-unknown-unknown -c -### %s -mtune=nocona 2>&1 \ // RUN: | FileCheck %s -check-prefix=nocona // nocona: "-tune-cpu" "nocona" @@ -18,3 +27,16 @@ // RUN: | FileCheck %s -check-prefix=athlon // athlon: "-tune-cpu" "athlon" +// Check interaction between march and mtune. + +// -march should remove default mtune generic. +// RUN: %clang -target x86_64-unknown-unknown -c -### %s -march=core2 2>&1 \ +// RUN: | FileCheck %s -check-prefix=marchcore2 +// marchcore2: "-target-cpu" "core2" +// marchcore2-NOT: "-tune-cpu" + +// -march should remove default mtune generic. 
+// RUN: %clang -target x86_64-unknown-unknown -c -### %s -march=core2 -mtune=nehalem 2>&1 \ +// RUN: | FileCheck %s -check-prefix=marchmtune +// marchmtune: "-target-cpu" "core2" +// marchmtune: "-tune-cpu" "nehalem" diff --git a/clang/test/Frontend/fixed_point_add.c b/clang/test/Frontend/fixed_point_add.c index 15132cfb712a0..d01989c5eab0f 100644 --- a/clang/test/Frontend/fixed_point_add.c +++ b/clang/test/Frontend/fixed_point_add.c @@ -444,11 +444,10 @@ void sat_sassasas() { // UNSIGNED-NEXT: entry: // UNSIGNED-NEXT: [[TMP0:%.*]] = load i16, i16* @usa, align 2 // UNSIGNED-NEXT: [[TMP1:%.*]] = load i16, i16* @usa_sat, align 2 -// UNSIGNED-NEXT: [[RESIZE:%.*]] = trunc i16 [[TMP0]] to i15 -// UNSIGNED-NEXT: [[RESIZE1:%.*]] = trunc i16 [[TMP1]] to i15 -// UNSIGNED-NEXT: [[TMP2:%.*]] = call i15 @llvm.uadd.sat.i15(i15 [[RESIZE]], i15 [[RESIZE1]]) -// UNSIGNED-NEXT: [[RESIZE2:%.*]] = zext i15 [[TMP2]] to i16 -// UNSIGNED-NEXT: store i16 [[RESIZE2]], i16* @usa_sat, align 2 +// UNSIGNED-NEXT: [[TMP2:%.*]] = call i16 @llvm.sadd.sat.i16(i16 [[TMP0]], i16 [[TMP1]]) +// UNSIGNED-NEXT: [[RESIZE:%.*]] = trunc i16 [[TMP2]] to i15 +// UNSIGNED-NEXT: [[RESIZE1:%.*]] = zext i15 [[RESIZE]] to i16 +// UNSIGNED-NEXT: store i16 [[RESIZE1]], i16* @usa_sat, align 2 // UNSIGNED-NEXT: ret void // void sat_usasusausas() { @@ -469,11 +468,11 @@ void sat_usasusausas() { // UNSIGNED-NEXT: entry: // UNSIGNED-NEXT: [[TMP0:%.*]] = load i32, i32* @ua, align 4 // UNSIGNED-NEXT: [[TMP1:%.*]] = load i16, i16* @usa_sat, align 2 -// UNSIGNED-NEXT: [[RESIZE:%.*]] = trunc i32 [[TMP0]] to i31 -// UNSIGNED-NEXT: [[RESIZE1:%.*]] = zext i16 [[TMP1]] to i31 -// UNSIGNED-NEXT: [[UPSCALE:%.*]] = shl i31 [[RESIZE1]], 8 -// UNSIGNED-NEXT: [[TMP2:%.*]] = call i31 @llvm.uadd.sat.i31(i31 [[RESIZE]], i31 [[UPSCALE]]) -// UNSIGNED-NEXT: [[RESIZE2:%.*]] = zext i31 [[TMP2]] to i32 +// UNSIGNED-NEXT: [[RESIZE:%.*]] = zext i16 [[TMP1]] to i32 +// UNSIGNED-NEXT: [[UPSCALE:%.*]] = shl i32 [[RESIZE]], 8 +// UNSIGNED-NEXT: 
[[TMP2:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[TMP0]], i32 [[UPSCALE]]) +// UNSIGNED-NEXT: [[RESIZE1:%.*]] = trunc i32 [[TMP2]] to i31 +// UNSIGNED-NEXT: [[RESIZE2:%.*]] = zext i31 [[RESIZE1]] to i32 // UNSIGNED-NEXT: store i32 [[RESIZE2]], i32* @ua_sat, align 4 // UNSIGNED-NEXT: ret void // @@ -533,11 +532,10 @@ void sat_sassasui() { // UNSIGNED-NEXT: entry: // UNSIGNED-NEXT: [[TMP0:%.*]] = load i16, i16* @uf_sat, align 2 // UNSIGNED-NEXT: [[TMP1:%.*]] = load i16, i16* @uf_sat, align 2 -// UNSIGNED-NEXT: [[RESIZE:%.*]] = trunc i16 [[TMP0]] to i15 -// UNSIGNED-NEXT: [[RESIZE1:%.*]] = trunc i16 [[TMP1]] to i15 -// UNSIGNED-NEXT: [[TMP2:%.*]] = call i15 @llvm.uadd.sat.i15(i15 [[RESIZE]], i15 [[RESIZE1]]) -// UNSIGNED-NEXT: [[RESIZE2:%.*]] = zext i15 [[TMP2]] to i16 -// UNSIGNED-NEXT: store i16 [[RESIZE2]], i16* @uf_sat, align 2 +// UNSIGNED-NEXT: [[TMP2:%.*]] = call i16 @llvm.sadd.sat.i16(i16 [[TMP0]], i16 [[TMP1]]) +// UNSIGNED-NEXT: [[RESIZE:%.*]] = trunc i16 [[TMP2]] to i15 +// UNSIGNED-NEXT: [[RESIZE1:%.*]] = zext i15 [[RESIZE]] to i16 +// UNSIGNED-NEXT: store i16 [[RESIZE1]], i16* @uf_sat, align 2 // UNSIGNED-NEXT: ret void // void sat_ufsufsufs() { diff --git a/clang/test/Frontend/fixed_point_compound.c b/clang/test/Frontend/fixed_point_compound.c index 4a44d0ae95a28..897ba2e22636d 100644 --- a/clang/test/Frontend/fixed_point_compound.c +++ b/clang/test/Frontend/fixed_point_compound.c @@ -567,3 +567,50 @@ void div_csa() { c /= sa; } + +// CHECK-LABEL: @shft_ai( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @i, align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @a, align 4 +// CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[TMP1]], [[TMP0]] +// CHECK-NEXT: store i32 [[TMP2]], i32* @a, align 4 +// CHECK-NEXT: ret void +// +void shft_ai() { + a <<= i; +} + +// SIGNED-LABEL: @shft_sufi( +// SIGNED-NEXT: entry: +// SIGNED-NEXT: [[TMP0:%.*]] = load i32, i32* @i, align 4 +// SIGNED-NEXT: [[TMP1:%.*]] = load i16, i16* @suf, align 2 +// SIGNED-NEXT: 
[[TMP2:%.*]] = trunc i32 [[TMP0]] to i16 +// SIGNED-NEXT: [[TMP3:%.*]] = call i16 @llvm.ushl.sat.i16(i16 [[TMP1]], i16 [[TMP2]]) +// SIGNED-NEXT: store i16 [[TMP3]], i16* @suf, align 2 +// SIGNED-NEXT: ret void +// +// UNSIGNED-LABEL: @shft_sufi( +// UNSIGNED-NEXT: entry: +// UNSIGNED-NEXT: [[TMP0:%.*]] = load i32, i32* @i, align 4 +// UNSIGNED-NEXT: [[TMP1:%.*]] = load i16, i16* @suf, align 2 +// UNSIGNED-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP0]] to i16 +// UNSIGNED-NEXT: [[TMP3:%.*]] = call i16 @llvm.sshl.sat.i16(i16 [[TMP1]], i16 [[TMP2]]) +// UNSIGNED-NEXT: store i16 [[TMP3]], i16* @suf, align 2 +// UNSIGNED-NEXT: ret void +// +void shft_sufi() { + suf <<= i; +} + +// CHECK-LABEL: @shft_ulai( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @i, align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i64, i64* @ula, align 8 +// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP0]] to i64 +// CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP1]], [[TMP2]] +// CHECK-NEXT: store i64 [[TMP3]], i64* @ula, align 8 +// CHECK-NEXT: ret void +// +void shft_ulai() { + ula >>= i; +} diff --git a/clang/test/Frontend/fixed_point_div.c b/clang/test/Frontend/fixed_point_div.c index b77a13cafb3f6..d54ba3bf48b0d 100644 --- a/clang/test/Frontend/fixed_point_div.c +++ b/clang/test/Frontend/fixed_point_div.c @@ -252,7 +252,7 @@ void sdiv_aaaaa() { // UNSIGNED-NEXT: entry: // UNSIGNED-NEXT: [[TMP0:%.*]] = load i16, i16* @usa, align 2 // UNSIGNED-NEXT: [[TMP1:%.*]] = load i16, i16* @usa, align 2 -// UNSIGNED-NEXT: [[TMP2:%.*]] = call i16 @llvm.udiv.fix.i16(i16 [[TMP0]], i16 [[TMP1]], i32 7) +// UNSIGNED-NEXT: [[TMP2:%.*]] = call i16 @llvm.sdiv.fix.i16(i16 [[TMP0]], i16 [[TMP1]], i32 7) // UNSIGNED-NEXT: store i16 [[TMP2]], i16* @usa, align 2 // UNSIGNED-NEXT: ret void // @@ -276,7 +276,7 @@ void udiv_usausausa() { // UNSIGNED-NEXT: [[TMP1:%.*]] = load i32, i32* @ua, align 4 // UNSIGNED-NEXT: [[RESIZE:%.*]] = zext i16 [[TMP0]] to i32 // UNSIGNED-NEXT: [[UPSCALE:%.*]] = shl i32 [[RESIZE]], 8 -// 
UNSIGNED-NEXT: [[TMP2:%.*]] = call i32 @llvm.udiv.fix.i32(i32 [[UPSCALE]], i32 [[TMP1]], i32 15) +// UNSIGNED-NEXT: [[TMP2:%.*]] = call i32 @llvm.sdiv.fix.i32(i32 [[UPSCALE]], i32 [[TMP1]], i32 15) // UNSIGNED-NEXT: store i32 [[TMP2]], i32* @ua, align 4 // UNSIGNED-NEXT: ret void // @@ -298,7 +298,7 @@ void udiv_uausaua() { // UNSIGNED-NEXT: [[TMP0:%.*]] = load i16, i16* @usa, align 2 // UNSIGNED-NEXT: [[TMP1:%.*]] = load i8, i8* @usf, align 1 // UNSIGNED-NEXT: [[RESIZE:%.*]] = zext i8 [[TMP1]] to i16 -// UNSIGNED-NEXT: [[TMP2:%.*]] = call i16 @llvm.udiv.fix.i16(i16 [[TMP0]], i16 [[RESIZE]], i32 7) +// UNSIGNED-NEXT: [[TMP2:%.*]] = call i16 @llvm.sdiv.fix.i16(i16 [[TMP0]], i16 [[RESIZE]], i32 7) // UNSIGNED-NEXT: store i16 [[TMP2]], i16* @usa, align 2 // UNSIGNED-NEXT: ret void // @@ -326,7 +326,7 @@ void udiv_usausausf() { // UNSIGNED-NEXT: [[RESIZE:%.*]] = zext i16 [[TMP0]] to i24 // UNSIGNED-NEXT: [[UPSCALE:%.*]] = shl i24 [[RESIZE]], 8 // UNSIGNED-NEXT: [[RESIZE1:%.*]] = zext i16 [[TMP1]] to i24 -// UNSIGNED-NEXT: [[TMP2:%.*]] = call i24 @llvm.udiv.fix.i24(i24 [[UPSCALE]], i24 [[RESIZE1]], i32 15) +// UNSIGNED-NEXT: [[TMP2:%.*]] = call i24 @llvm.sdiv.fix.i24(i24 [[UPSCALE]], i24 [[RESIZE1]], i32 15) // UNSIGNED-NEXT: [[DOWNSCALE:%.*]] = lshr i24 [[TMP2]], 8 // UNSIGNED-NEXT: [[RESIZE2:%.*]] = trunc i24 [[DOWNSCALE]] to i16 // UNSIGNED-NEXT: store i16 [[RESIZE2]], i16* @usa, align 2 @@ -544,11 +544,10 @@ void sat_sassasas() { // UNSIGNED-NEXT: entry: // UNSIGNED-NEXT: [[TMP0:%.*]] = load i16, i16* @usa, align 2 // UNSIGNED-NEXT: [[TMP1:%.*]] = load i16, i16* @usa_sat, align 2 -// UNSIGNED-NEXT: [[RESIZE:%.*]] = trunc i16 [[TMP0]] to i15 -// UNSIGNED-NEXT: [[RESIZE1:%.*]] = trunc i16 [[TMP1]] to i15 -// UNSIGNED-NEXT: [[TMP2:%.*]] = call i15 @llvm.udiv.fix.sat.i15(i15 [[RESIZE]], i15 [[RESIZE1]], i32 7) -// UNSIGNED-NEXT: [[RESIZE2:%.*]] = zext i15 [[TMP2]] to i16 -// UNSIGNED-NEXT: store i16 [[RESIZE2]], i16* @usa_sat, align 2 +// UNSIGNED-NEXT: [[TMP2:%.*]] = 
call i16 @llvm.sdiv.fix.sat.i16(i16 [[TMP0]], i16 [[TMP1]], i32 7) +// UNSIGNED-NEXT: [[RESIZE:%.*]] = trunc i16 [[TMP2]] to i15 +// UNSIGNED-NEXT: [[RESIZE1:%.*]] = zext i15 [[RESIZE]] to i16 +// UNSIGNED-NEXT: store i16 [[RESIZE1]], i16* @usa_sat, align 2 // UNSIGNED-NEXT: ret void // void sat_usasusausas() { @@ -569,11 +568,11 @@ void sat_usasusausas() { // UNSIGNED-NEXT: entry: // UNSIGNED-NEXT: [[TMP0:%.*]] = load i32, i32* @ua, align 4 // UNSIGNED-NEXT: [[TMP1:%.*]] = load i16, i16* @usa_sat, align 2 -// UNSIGNED-NEXT: [[RESIZE:%.*]] = trunc i32 [[TMP0]] to i31 -// UNSIGNED-NEXT: [[RESIZE1:%.*]] = zext i16 [[TMP1]] to i31 -// UNSIGNED-NEXT: [[UPSCALE:%.*]] = shl i31 [[RESIZE1]], 8 -// UNSIGNED-NEXT: [[TMP2:%.*]] = call i31 @llvm.udiv.fix.sat.i31(i31 [[RESIZE]], i31 [[UPSCALE]], i32 15) -// UNSIGNED-NEXT: [[RESIZE2:%.*]] = zext i31 [[TMP2]] to i32 +// UNSIGNED-NEXT: [[RESIZE:%.*]] = zext i16 [[TMP1]] to i32 +// UNSIGNED-NEXT: [[UPSCALE:%.*]] = shl i32 [[RESIZE]], 8 +// UNSIGNED-NEXT: [[TMP2:%.*]] = call i32 @llvm.sdiv.fix.sat.i32(i32 [[TMP0]], i32 [[UPSCALE]], i32 15) +// UNSIGNED-NEXT: [[RESIZE1:%.*]] = trunc i32 [[TMP2]] to i31 +// UNSIGNED-NEXT: [[RESIZE2:%.*]] = zext i31 [[RESIZE1]] to i32 // UNSIGNED-NEXT: store i32 [[RESIZE2]], i32* @ua_sat, align 4 // UNSIGNED-NEXT: ret void // @@ -633,11 +632,10 @@ void sat_sassasui() { // UNSIGNED-NEXT: entry: // UNSIGNED-NEXT: [[TMP0:%.*]] = load i16, i16* @uf_sat, align 2 // UNSIGNED-NEXT: [[TMP1:%.*]] = load i16, i16* @uf_sat, align 2 -// UNSIGNED-NEXT: [[RESIZE:%.*]] = trunc i16 [[TMP0]] to i15 -// UNSIGNED-NEXT: [[RESIZE1:%.*]] = trunc i16 [[TMP1]] to i15 -// UNSIGNED-NEXT: [[TMP2:%.*]] = call i15 @llvm.udiv.fix.sat.i15(i15 [[RESIZE]], i15 [[RESIZE1]], i32 15) -// UNSIGNED-NEXT: [[RESIZE2:%.*]] = zext i15 [[TMP2]] to i16 -// UNSIGNED-NEXT: store i16 [[RESIZE2]], i16* @uf_sat, align 2 +// UNSIGNED-NEXT: [[TMP2:%.*]] = call i16 @llvm.sdiv.fix.sat.i16(i16 [[TMP0]], i16 [[TMP1]], i32 15) +// UNSIGNED-NEXT: 
[[RESIZE:%.*]] = trunc i16 [[TMP2]] to i15 +// UNSIGNED-NEXT: [[RESIZE1:%.*]] = zext i15 [[RESIZE]] to i16 +// UNSIGNED-NEXT: store i16 [[RESIZE1]], i16* @uf_sat, align 2 // UNSIGNED-NEXT: ret void // void sat_ufsufsufs() { diff --git a/clang/test/Frontend/fixed_point_mul.c b/clang/test/Frontend/fixed_point_mul.c index 777c35c52d4a3..eeb80dd08d94d 100644 --- a/clang/test/Frontend/fixed_point_mul.c +++ b/clang/test/Frontend/fixed_point_mul.c @@ -252,7 +252,7 @@ void smul_aaaaa() { // UNSIGNED-NEXT: entry: // UNSIGNED-NEXT: [[TMP0:%.*]] = load i16, i16* @usa, align 2 // UNSIGNED-NEXT: [[TMP1:%.*]] = load i16, i16* @usa, align 2 -// UNSIGNED-NEXT: [[TMP2:%.*]] = call i16 @llvm.umul.fix.i16(i16 [[TMP0]], i16 [[TMP1]], i32 7) +// UNSIGNED-NEXT: [[TMP2:%.*]] = call i16 @llvm.smul.fix.i16(i16 [[TMP0]], i16 [[TMP1]], i32 7) // UNSIGNED-NEXT: store i16 [[TMP2]], i16* @usa, align 2 // UNSIGNED-NEXT: ret void // @@ -276,7 +276,7 @@ void umul_usausausa() { // UNSIGNED-NEXT: [[TMP1:%.*]] = load i32, i32* @ua, align 4 // UNSIGNED-NEXT: [[RESIZE:%.*]] = zext i16 [[TMP0]] to i32 // UNSIGNED-NEXT: [[UPSCALE:%.*]] = shl i32 [[RESIZE]], 8 -// UNSIGNED-NEXT: [[TMP2:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[UPSCALE]], i32 [[TMP1]], i32 15) +// UNSIGNED-NEXT: [[TMP2:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[UPSCALE]], i32 [[TMP1]], i32 15) // UNSIGNED-NEXT: store i32 [[TMP2]], i32* @ua, align 4 // UNSIGNED-NEXT: ret void // @@ -298,7 +298,7 @@ void umul_uausaua() { // UNSIGNED-NEXT: [[TMP0:%.*]] = load i16, i16* @usa, align 2 // UNSIGNED-NEXT: [[TMP1:%.*]] = load i8, i8* @usf, align 1 // UNSIGNED-NEXT: [[RESIZE:%.*]] = zext i8 [[TMP1]] to i16 -// UNSIGNED-NEXT: [[TMP2:%.*]] = call i16 @llvm.umul.fix.i16(i16 [[TMP0]], i16 [[RESIZE]], i32 7) +// UNSIGNED-NEXT: [[TMP2:%.*]] = call i16 @llvm.smul.fix.i16(i16 [[TMP0]], i16 [[RESIZE]], i32 7) // UNSIGNED-NEXT: store i16 [[TMP2]], i16* @usa, align 2 // UNSIGNED-NEXT: ret void // @@ -326,7 +326,7 @@ void umul_usausausf() { // UNSIGNED-NEXT: 
[[RESIZE:%.*]] = zext i16 [[TMP0]] to i24 // UNSIGNED-NEXT: [[UPSCALE:%.*]] = shl i24 [[RESIZE]], 8 // UNSIGNED-NEXT: [[RESIZE1:%.*]] = zext i16 [[TMP1]] to i24 -// UNSIGNED-NEXT: [[TMP2:%.*]] = call i24 @llvm.umul.fix.i24(i24 [[UPSCALE]], i24 [[RESIZE1]], i32 15) +// UNSIGNED-NEXT: [[TMP2:%.*]] = call i24 @llvm.smul.fix.i24(i24 [[UPSCALE]], i24 [[RESIZE1]], i32 15) // UNSIGNED-NEXT: [[DOWNSCALE:%.*]] = lshr i24 [[TMP2]], 8 // UNSIGNED-NEXT: [[RESIZE2:%.*]] = trunc i24 [[DOWNSCALE]] to i16 // UNSIGNED-NEXT: store i16 [[RESIZE2]], i16* @usa, align 2 @@ -544,11 +544,10 @@ void sat_sassasas() { // UNSIGNED-NEXT: entry: // UNSIGNED-NEXT: [[TMP0:%.*]] = load i16, i16* @usa, align 2 // UNSIGNED-NEXT: [[TMP1:%.*]] = load i16, i16* @usa_sat, align 2 -// UNSIGNED-NEXT: [[RESIZE:%.*]] = trunc i16 [[TMP0]] to i15 -// UNSIGNED-NEXT: [[RESIZE1:%.*]] = trunc i16 [[TMP1]] to i15 -// UNSIGNED-NEXT: [[TMP2:%.*]] = call i15 @llvm.umul.fix.sat.i15(i15 [[RESIZE]], i15 [[RESIZE1]], i32 7) -// UNSIGNED-NEXT: [[RESIZE2:%.*]] = zext i15 [[TMP2]] to i16 -// UNSIGNED-NEXT: store i16 [[RESIZE2]], i16* @usa_sat, align 2 +// UNSIGNED-NEXT: [[TMP2:%.*]] = call i16 @llvm.smul.fix.sat.i16(i16 [[TMP0]], i16 [[TMP1]], i32 7) +// UNSIGNED-NEXT: [[RESIZE:%.*]] = trunc i16 [[TMP2]] to i15 +// UNSIGNED-NEXT: [[RESIZE1:%.*]] = zext i15 [[RESIZE]] to i16 +// UNSIGNED-NEXT: store i16 [[RESIZE1]], i16* @usa_sat, align 2 // UNSIGNED-NEXT: ret void // void sat_usasusausas() { @@ -569,11 +568,11 @@ void sat_usasusausas() { // UNSIGNED-NEXT: entry: // UNSIGNED-NEXT: [[TMP0:%.*]] = load i32, i32* @ua, align 4 // UNSIGNED-NEXT: [[TMP1:%.*]] = load i16, i16* @usa_sat, align 2 -// UNSIGNED-NEXT: [[RESIZE:%.*]] = trunc i32 [[TMP0]] to i31 -// UNSIGNED-NEXT: [[RESIZE1:%.*]] = zext i16 [[TMP1]] to i31 -// UNSIGNED-NEXT: [[UPSCALE:%.*]] = shl i31 [[RESIZE1]], 8 -// UNSIGNED-NEXT: [[TMP2:%.*]] = call i31 @llvm.umul.fix.sat.i31(i31 [[RESIZE]], i31 [[UPSCALE]], i32 15) -// UNSIGNED-NEXT: [[RESIZE2:%.*]] = zext i31 
[[TMP2]] to i32 +// UNSIGNED-NEXT: [[RESIZE:%.*]] = zext i16 [[TMP1]] to i32 +// UNSIGNED-NEXT: [[UPSCALE:%.*]] = shl i32 [[RESIZE]], 8 +// UNSIGNED-NEXT: [[TMP2:%.*]] = call i32 @llvm.smul.fix.sat.i32(i32 [[TMP0]], i32 [[UPSCALE]], i32 15) +// UNSIGNED-NEXT: [[RESIZE1:%.*]] = trunc i32 [[TMP2]] to i31 +// UNSIGNED-NEXT: [[RESIZE2:%.*]] = zext i31 [[RESIZE1]] to i32 // UNSIGNED-NEXT: store i32 [[RESIZE2]], i32* @ua_sat, align 4 // UNSIGNED-NEXT: ret void // @@ -633,11 +632,10 @@ void sat_sassasui() { // UNSIGNED-NEXT: entry: // UNSIGNED-NEXT: [[TMP0:%.*]] = load i16, i16* @uf_sat, align 2 // UNSIGNED-NEXT: [[TMP1:%.*]] = load i16, i16* @uf_sat, align 2 -// UNSIGNED-NEXT: [[RESIZE:%.*]] = trunc i16 [[TMP0]] to i15 -// UNSIGNED-NEXT: [[RESIZE1:%.*]] = trunc i16 [[TMP1]] to i15 -// UNSIGNED-NEXT: [[TMP2:%.*]] = call i15 @llvm.umul.fix.sat.i15(i15 [[RESIZE]], i15 [[RESIZE1]], i32 15) -// UNSIGNED-NEXT: [[RESIZE2:%.*]] = zext i15 [[TMP2]] to i16 -// UNSIGNED-NEXT: store i16 [[RESIZE2]], i16* @uf_sat, align 2 +// UNSIGNED-NEXT: [[TMP2:%.*]] = call i16 @llvm.smul.fix.sat.i16(i16 [[TMP0]], i16 [[TMP1]], i32 15) +// UNSIGNED-NEXT: [[RESIZE:%.*]] = trunc i16 [[TMP2]] to i15 +// UNSIGNED-NEXT: [[RESIZE1:%.*]] = zext i15 [[RESIZE]] to i16 +// UNSIGNED-NEXT: store i16 [[RESIZE1]], i16* @uf_sat, align 2 // UNSIGNED-NEXT: ret void // void sat_ufsufsufs() { diff --git a/clang/test/Frontend/fixed_point_shift.c b/clang/test/Frontend/fixed_point_shift.c index cbd4f38ab8b05..a3d758798dfa9 100644 --- a/clang/test/Frontend/fixed_point_shift.c +++ b/clang/test/Frontend/fixed_point_shift.c @@ -1,37 +1,580 @@ -// RUN: %clang_cc1 -ffixed-point -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED -// RUN: %clang_cc1 -ffixed-point -fpadding-on-unsigned-fixed-point -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,UNSIGNED - -short _Accum sa_const1 = 1.0hk << 2; // CHECK-DAG: @sa_const1 = {{.*}}global i16 512 -short _Accum sa_const2 = 0.5hk << 2; // CHECK-DAG: 
@sa_const2 = {{.*}}global i16 256 -short _Accum sa_const3 = 10.0hk >> 3; // CHECK-DAG: @sa_const3 = {{.*}}global i16 160 -short _Accum sa_const4 = 0.0546875hk << 8; // CHECK-DAG: @sa_const4 = {{.*}}global i16 1792 -short _Accum sa_const5 = -1.0hk << 2; // CHECK-DAG: @sa_const5 = {{.*}}global i16 -512 -short _Accum sa_const6 = -255.0hk >> 8; // CHECK-DAG: @sa_const6 = {{.*}}global i16 -128 - -_Fract f_const1 = -1.0r >> 5; // CHECK-DAG: @f_const1 = {{.*}}global i16 -1024 -_Fract f_const2 = 0.0052490234375r >> 3; // CHECK-DAG: @f_const2 = {{.*}}global i16 21 -_Fract f_const3 = -0.0001r << 5; // CHECK-DAG: @f_const3 = {{.*}}global i16 -96 -_Fract f_const4 = -0.75r >> 15; // CHECK-DAG: @f_const4 = {{.*}}global i16 -1 -_Fract f_const5 = 0.078216552734375r << 3; // CHECK-DAG: @f_const5 = {{.*}}global i16 20504 - -unsigned _Fract uf_const1 = 0.375ur >> 13; -// SIGNED-DAG: @uf_const1 = {{.*}}global i16 3 -// UNSIGNED-DAG: @uf_const1 = {{.*}}global i16 1 -unsigned _Fract uf_const2 = 0.0546875ur << 3; -// SIGNED-DAG: @uf_const2 = {{.*}}global i16 28672 -// UNSIGNED-DAG: @uf_const2 = {{.*}}global i16 14336 - -_Sat short _Accum ssa_const1 = (_Sat short _Accum)31.875hk << 4; // CHECK-DAG: @ssa_const1 = {{.*}}global i16 32767 -_Sat short _Accum ssa_const2 = (_Sat short _Accum) - 1.0hk << 8; // CHECK-DAG: @ssa_const2 = {{.*}}global i16 -32768 -_Sat short _Accum ssa_const3 = (_Sat short _Accum)128.0hk << 8; // CHECK-DAG: @ssa_const3 = {{.*}}global i16 32767 -_Sat short _Fract ssf_const1 = (_Sat short _Fract) - 0.5hr << 3; // CHECK-DAG: @ssf_const1 = {{.*}}global i8 -128 - -_Sat unsigned _Fract suf_const1 = (_Sat unsigned _Fract)0.5r << 1; -// SIGNED-DAG: @suf_const1 = {{.*}}global i16 -1 -// UNSIGNED-DAG: @suf_const1 = {{.*}}global i16 32767 -_Sat unsigned _Fract suf_const2 = (_Sat unsigned _Fract)0.25r << 1; -// SIGNED-DAG: @suf_const2 = {{.*}}global i16 -32768 -// UNSIGNED-DAG: @suf_const2 = {{.*}}global i16 16384 -_Sat unsigned _Accum sua_const2 = (_Sat unsigned _Accum)128.0uk 
<< 10; -// SIGNED-DAG: @sua_const2 = {{.*}}global i32 -1 -// UNSIGNED-DAG: @sua_const2 = {{.*}}global i32 2147483647 +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED +// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -fpadding-on-unsigned-fixed-point -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,UNSIGNED + +short _Accum sa; +_Accum a; +long _Accum la; + +short _Fract sf; +_Fract f; +long _Fract lf; + +unsigned short _Accum usa; +unsigned _Accum ua; +unsigned long _Accum ula; + +unsigned short _Fract usf; +unsigned _Fract uf; +unsigned long _Fract ulf; + +_Sat short _Accum sa_sat; +_Sat _Accum a_sat; + +_Sat short _Fract sf_sat; +_Sat _Fract f_sat; + +_Sat unsigned short _Accum usa_sat; +_Sat unsigned _Accum ua_sat; + +_Sat unsigned short _Fract usf_sat; +_Sat unsigned _Fract uf_sat; + +int i; +unsigned u; + + +// CHECK-LABEL: @sleft_sasai( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* @sa, align 2 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @i, align 4 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// CHECK-NEXT: [[TMP3:%.*]] = shl i16 [[TMP0]], [[TMP2]] +// CHECK-NEXT: store i16 [[TMP3]], i16* @sa, align 2 +// CHECK-NEXT: ret void +// +void sleft_sasai() { + sa = sa << i; +} + +// CHECK-LABEL: @sleft_aai( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @a, align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @i, align 4 +// CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[TMP0]], [[TMP1]] +// CHECK-NEXT: store i32 [[TMP2]], i32* @a, align 4 +// CHECK-NEXT: ret void +// +void sleft_aai() { + a = a << i; +} + +// CHECK-LABEL: @sleft_lalai( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i64, i64* @la, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @i, align 4 +// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to 
i64 +// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP0]], [[TMP2]] +// CHECK-NEXT: store i64 [[TMP3]], i64* @la, align 8 +// CHECK-NEXT: ret void +// +void sleft_lalai() { + la = la << i; +} + +// CHECK-LABEL: @sleft_sfsfi( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* @sf, align 1 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @i, align 4 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 +// CHECK-NEXT: [[TMP3:%.*]] = shl i8 [[TMP0]], [[TMP2]] +// CHECK-NEXT: store i8 [[TMP3]], i8* @sf, align 1 +// CHECK-NEXT: ret void +// +void sleft_sfsfi() { + sf = sf << i; +} + +// CHECK-LABEL: @sleft_ffi( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* @f, align 2 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @i, align 4 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// CHECK-NEXT: [[TMP3:%.*]] = shl i16 [[TMP0]], [[TMP2]] +// CHECK-NEXT: store i16 [[TMP3]], i16* @f, align 2 +// CHECK-NEXT: ret void +// +void sleft_ffi() { + f = f << i; +} + +// CHECK-LABEL: @sleft_lflfi( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @lf, align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @i, align 4 +// CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[TMP0]], [[TMP1]] +// CHECK-NEXT: store i32 [[TMP2]], i32* @lf, align 4 +// CHECK-NEXT: ret void +// +void sleft_lflfi() { + lf = lf << i; +} + +// CHECK-LABEL: @sleft_aau( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @a, align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @u, align 4 +// CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[TMP0]], [[TMP1]] +// CHECK-NEXT: store i32 [[TMP2]], i32* @a, align 4 +// CHECK-NEXT: ret void +// +void sleft_aau() { + a = a << u; +} + +// CHECK-LABEL: @sleft_ffu( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* @f, align 2 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @u, align 4 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// CHECK-NEXT: [[TMP3:%.*]] = shl i16 [[TMP0]], [[TMP2]] +// CHECK-NEXT: 
store i16 [[TMP3]], i16* @f, align 2 +// CHECK-NEXT: ret void +// +void sleft_ffu() { + f = f << u; +} + + +// CHECK-LABEL: @uleft_usausai( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* @usa, align 2 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @i, align 4 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// CHECK-NEXT: [[TMP3:%.*]] = shl i16 [[TMP0]], [[TMP2]] +// CHECK-NEXT: store i16 [[TMP3]], i16* @usa, align 2 +// CHECK-NEXT: ret void +// +void uleft_usausai() { + usa = usa << i; +} + +// CHECK-LABEL: @uleft_uauai( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @ua, align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @i, align 4 +// CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[TMP0]], [[TMP1]] +// CHECK-NEXT: store i32 [[TMP2]], i32* @ua, align 4 +// CHECK-NEXT: ret void +// +void uleft_uauai() { + ua = ua << i; +} + +// CHECK-LABEL: @uleft_ulaulai( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i64, i64* @ula, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @i, align 4 +// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP0]], [[TMP2]] +// CHECK-NEXT: store i64 [[TMP3]], i64* @ula, align 8 +// CHECK-NEXT: ret void +// +void uleft_ulaulai() { + ula = ula << i; +} + +// CHECK-LABEL: @uleft_usfusfi( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* @usf, align 1 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @i, align 4 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 +// CHECK-NEXT: [[TMP3:%.*]] = shl i8 [[TMP0]], [[TMP2]] +// CHECK-NEXT: store i8 [[TMP3]], i8* @usf, align 1 +// CHECK-NEXT: ret void +// +void uleft_usfusfi() { + usf = usf << i; +} + +// CHECK-LABEL: @uleft_ufufi( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* @uf, align 2 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @i, align 4 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// CHECK-NEXT: [[TMP3:%.*]] = shl i16 [[TMP0]], 
[[TMP2]] +// CHECK-NEXT: store i16 [[TMP3]], i16* @uf, align 2 +// CHECK-NEXT: ret void +// +void uleft_ufufi() { + uf = uf << i; +} + +// CHECK-LABEL: @uleft_ulfulfi( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @ulf, align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @i, align 4 +// CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[TMP0]], [[TMP1]] +// CHECK-NEXT: store i32 [[TMP2]], i32* @ulf, align 4 +// CHECK-NEXT: ret void +// +void uleft_ulfulfi() { + ulf = ulf << i; +} + +// CHECK-LABEL: @uleft_uauau( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @ua, align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @u, align 4 +// CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[TMP0]], [[TMP1]] +// CHECK-NEXT: store i32 [[TMP2]], i32* @ua, align 4 +// CHECK-NEXT: ret void +// +void uleft_uauau() { + ua = ua << u; +} + +// CHECK-LABEL: @uleft_ufufu( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* @uf, align 2 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @u, align 4 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// CHECK-NEXT: [[TMP3:%.*]] = shl i16 [[TMP0]], [[TMP2]] +// CHECK-NEXT: store i16 [[TMP3]], i16* @uf, align 2 +// CHECK-NEXT: ret void +// +void uleft_ufufu() { + uf = uf << u; +} + + +// CHECK-LABEL: @sright_sasai( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* @sa, align 2 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @i, align 4 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// CHECK-NEXT: [[TMP3:%.*]] = ashr i16 [[TMP0]], [[TMP2]] +// CHECK-NEXT: store i16 [[TMP3]], i16* @sa, align 2 +// CHECK-NEXT: ret void +// +void sright_sasai() { + sa = sa >> i; +} + +// CHECK-LABEL: @sright_aai( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @a, align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @i, align 4 +// CHECK-NEXT: [[TMP2:%.*]] = ashr i32 [[TMP0]], [[TMP1]] +// CHECK-NEXT: store i32 [[TMP2]], i32* @a, align 4 +// CHECK-NEXT: ret void +// +void 
sright_aai() { + a = a >> i; +} + +// CHECK-LABEL: @sright_lalai( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i64, i64* @la, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @i, align 4 +// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +// CHECK-NEXT: [[TMP3:%.*]] = ashr i64 [[TMP0]], [[TMP2]] +// CHECK-NEXT: store i64 [[TMP3]], i64* @la, align 8 +// CHECK-NEXT: ret void +// +void sright_lalai() { + la = la >> i; +} + +// CHECK-LABEL: @sright_sfsfi( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* @sf, align 1 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @i, align 4 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 +// CHECK-NEXT: [[TMP3:%.*]] = ashr i8 [[TMP0]], [[TMP2]] +// CHECK-NEXT: store i8 [[TMP3]], i8* @sf, align 1 +// CHECK-NEXT: ret void +// +void sright_sfsfi() { + sf = sf >> i; +} + +// CHECK-LABEL: @sright_ffi( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* @f, align 2 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @i, align 4 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// CHECK-NEXT: [[TMP3:%.*]] = ashr i16 [[TMP0]], [[TMP2]] +// CHECK-NEXT: store i16 [[TMP3]], i16* @f, align 2 +// CHECK-NEXT: ret void +// +void sright_ffi() { + f = f >> i; +} + +// CHECK-LABEL: @sright_lflfi( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @lf, align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @i, align 4 +// CHECK-NEXT: [[TMP2:%.*]] = ashr i32 [[TMP0]], [[TMP1]] +// CHECK-NEXT: store i32 [[TMP2]], i32* @lf, align 4 +// CHECK-NEXT: ret void +// +void sright_lflfi() { + lf = lf >> i; +} + +// CHECK-LABEL: @sright_aau( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @a, align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @u, align 4 +// CHECK-NEXT: [[TMP2:%.*]] = ashr i32 [[TMP0]], [[TMP1]] +// CHECK-NEXT: store i32 [[TMP2]], i32* @a, align 4 +// CHECK-NEXT: ret void +// +void sright_aau() { + a = a >> u; +} + +// CHECK-LABEL: 
@sright_ffu( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* @f, align 2 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @u, align 4 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// CHECK-NEXT: [[TMP3:%.*]] = ashr i16 [[TMP0]], [[TMP2]] +// CHECK-NEXT: store i16 [[TMP3]], i16* @f, align 2 +// CHECK-NEXT: ret void +// +void sright_ffu() { + f = f >> u; +} + + +// CHECK-LABEL: @uright_usausai( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* @usa, align 2 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @i, align 4 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// CHECK-NEXT: [[TMP3:%.*]] = lshr i16 [[TMP0]], [[TMP2]] +// CHECK-NEXT: store i16 [[TMP3]], i16* @usa, align 2 +// CHECK-NEXT: ret void +// +void uright_usausai() { + usa = usa >> i; +} + +// CHECK-LABEL: @uright_uauai( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @ua, align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @i, align 4 +// CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP0]], [[TMP1]] +// CHECK-NEXT: store i32 [[TMP2]], i32* @ua, align 4 +// CHECK-NEXT: ret void +// +void uright_uauai() { + ua = ua >> i; +} + +// CHECK-LABEL: @uright_ulaulai( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i64, i64* @ula, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @i, align 4 +// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +// CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP0]], [[TMP2]] +// CHECK-NEXT: store i64 [[TMP3]], i64* @ula, align 8 +// CHECK-NEXT: ret void +// +void uright_ulaulai() { + ula = ula >> i; +} + +// CHECK-LABEL: @uright_usfusfi( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* @usf, align 1 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @i, align 4 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 +// CHECK-NEXT: [[TMP3:%.*]] = lshr i8 [[TMP0]], [[TMP2]] +// CHECK-NEXT: store i8 [[TMP3]], i8* @usf, align 1 +// CHECK-NEXT: ret void +// +void uright_usfusfi() { + usf = 
usf >> i; +} + +// CHECK-LABEL: @uright_ufufi( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* @uf, align 2 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @i, align 4 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// CHECK-NEXT: [[TMP3:%.*]] = lshr i16 [[TMP0]], [[TMP2]] +// CHECK-NEXT: store i16 [[TMP3]], i16* @uf, align 2 +// CHECK-NEXT: ret void +// +void uright_ufufi() { + uf = uf >> i; +} + +// CHECK-LABEL: @uright_ulfulfi( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @ulf, align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @i, align 4 +// CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP0]], [[TMP1]] +// CHECK-NEXT: store i32 [[TMP2]], i32* @ulf, align 4 +// CHECK-NEXT: ret void +// +void uright_ulfulfi() { + ulf = ulf >> i; +} + +// CHECK-LABEL: @uright_uauau( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @ua, align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @u, align 4 +// CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP0]], [[TMP1]] +// CHECK-NEXT: store i32 [[TMP2]], i32* @ua, align 4 +// CHECK-NEXT: ret void +// +void uright_uauau() { + ua = ua >> u; +} + +// CHECK-LABEL: @uright_ufufu( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* @uf, align 2 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @u, align 4 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// CHECK-NEXT: [[TMP3:%.*]] = lshr i16 [[TMP0]], [[TMP2]] +// CHECK-NEXT: store i16 [[TMP3]], i16* @uf, align 2 +// CHECK-NEXT: ret void +// +void uright_ufufu() { + uf = uf >> u; +} + + +// CHECK-LABEL: @satleft_sassasi( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* @sa_sat, align 2 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @i, align 4 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.sshl.sat.i16(i16 [[TMP0]], i16 [[TMP2]]) +// CHECK-NEXT: store i16 [[TMP3]], i16* @sa_sat, align 2 +// CHECK-NEXT: ret void +// +void 
satleft_sassasi() { + sa_sat = sa_sat << i; +} + +// CHECK-LABEL: @satleft_asasi( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @a_sat, align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @i, align 4 +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.sshl.sat.i32(i32 [[TMP0]], i32 [[TMP1]]) +// CHECK-NEXT: store i32 [[TMP2]], i32* @a_sat, align 4 +// CHECK-NEXT: ret void +// +void satleft_asasi() { + a_sat = a_sat << i; +} + +// CHECK-LABEL: @satleft_sfssfsi( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* @sf_sat, align 1 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @i, align 4 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 +// CHECK-NEXT: [[TMP3:%.*]] = call i8 @llvm.sshl.sat.i8(i8 [[TMP0]], i8 [[TMP2]]) +// CHECK-NEXT: store i8 [[TMP3]], i8* @sf_sat, align 1 +// CHECK-NEXT: ret void +// +void satleft_sfssfsi() { + sf_sat = sf_sat << i; +} + +// CHECK-LABEL: @satleft_fsfsi( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* @f_sat, align 2 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @i, align 4 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.sshl.sat.i16(i16 [[TMP0]], i16 [[TMP2]]) +// CHECK-NEXT: store i16 [[TMP3]], i16* @f_sat, align 2 +// CHECK-NEXT: ret void +// +void satleft_fsfsi() { + f_sat = f_sat << i; +} + +// SIGNED-LABEL: @satleft_usasusasi( +// SIGNED-NEXT: entry: +// SIGNED-NEXT: [[TMP0:%.*]] = load i16, i16* @usa_sat, align 2 +// SIGNED-NEXT: [[TMP1:%.*]] = load i32, i32* @i, align 4 +// SIGNED-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// SIGNED-NEXT: [[TMP3:%.*]] = call i16 @llvm.ushl.sat.i16(i16 [[TMP0]], i16 [[TMP2]]) +// SIGNED-NEXT: store i16 [[TMP3]], i16* @usa_sat, align 2 +// SIGNED-NEXT: ret void +// +// UNSIGNED-LABEL: @satleft_usasusasi( +// UNSIGNED-NEXT: entry: +// UNSIGNED-NEXT: [[TMP0:%.*]] = load i16, i16* @usa_sat, align 2 +// UNSIGNED-NEXT: [[TMP1:%.*]] = load i32, i32* @i, align 4 +// UNSIGNED-NEXT: 
[[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// UNSIGNED-NEXT: [[TMP3:%.*]] = call i16 @llvm.sshl.sat.i16(i16 [[TMP0]], i16 [[TMP2]]) +// UNSIGNED-NEXT: store i16 [[TMP3]], i16* @usa_sat, align 2 +// UNSIGNED-NEXT: ret void +// +void satleft_usasusasi() { + usa_sat = usa_sat << i; +} + +// SIGNED-LABEL: @satleft_uasuasi( +// SIGNED-NEXT: entry: +// SIGNED-NEXT: [[TMP0:%.*]] = load i32, i32* @ua_sat, align 4 +// SIGNED-NEXT: [[TMP1:%.*]] = load i32, i32* @i, align 4 +// SIGNED-NEXT: [[TMP2:%.*]] = call i32 @llvm.ushl.sat.i32(i32 [[TMP0]], i32 [[TMP1]]) +// SIGNED-NEXT: store i32 [[TMP2]], i32* @ua_sat, align 4 +// SIGNED-NEXT: ret void +// +// UNSIGNED-LABEL: @satleft_uasuasi( +// UNSIGNED-NEXT: entry: +// UNSIGNED-NEXT: [[TMP0:%.*]] = load i32, i32* @ua_sat, align 4 +// UNSIGNED-NEXT: [[TMP1:%.*]] = load i32, i32* @i, align 4 +// UNSIGNED-NEXT: [[TMP2:%.*]] = call i32 @llvm.sshl.sat.i32(i32 [[TMP0]], i32 [[TMP1]]) +// UNSIGNED-NEXT: store i32 [[TMP2]], i32* @ua_sat, align 4 +// UNSIGNED-NEXT: ret void +// +void satleft_uasuasi() { + ua_sat = ua_sat << i; +} + +// SIGNED-LABEL: @satleft_usfsusfsi( +// SIGNED-NEXT: entry: +// SIGNED-NEXT: [[TMP0:%.*]] = load i8, i8* @usf_sat, align 1 +// SIGNED-NEXT: [[TMP1:%.*]] = load i32, i32* @i, align 4 +// SIGNED-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 +// SIGNED-NEXT: [[TMP3:%.*]] = call i8 @llvm.ushl.sat.i8(i8 [[TMP0]], i8 [[TMP2]]) +// SIGNED-NEXT: store i8 [[TMP3]], i8* @usf_sat, align 1 +// SIGNED-NEXT: ret void +// +// UNSIGNED-LABEL: @satleft_usfsusfsi( +// UNSIGNED-NEXT: entry: +// UNSIGNED-NEXT: [[TMP0:%.*]] = load i8, i8* @usf_sat, align 1 +// UNSIGNED-NEXT: [[TMP1:%.*]] = load i32, i32* @i, align 4 +// UNSIGNED-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 +// UNSIGNED-NEXT: [[TMP3:%.*]] = call i8 @llvm.sshl.sat.i8(i8 [[TMP0]], i8 [[TMP2]]) +// UNSIGNED-NEXT: store i8 [[TMP3]], i8* @usf_sat, align 1 +// UNSIGNED-NEXT: ret void +// +void satleft_usfsusfsi() { + usf_sat = usf_sat << i; +} + +// SIGNED-LABEL: 
@satleft_ufsufsi( +// SIGNED-NEXT: entry: +// SIGNED-NEXT: [[TMP0:%.*]] = load i16, i16* @uf_sat, align 2 +// SIGNED-NEXT: [[TMP1:%.*]] = load i32, i32* @i, align 4 +// SIGNED-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// SIGNED-NEXT: [[TMP3:%.*]] = call i16 @llvm.ushl.sat.i16(i16 [[TMP0]], i16 [[TMP2]]) +// SIGNED-NEXT: store i16 [[TMP3]], i16* @uf_sat, align 2 +// SIGNED-NEXT: ret void +// +// UNSIGNED-LABEL: @satleft_ufsufsi( +// UNSIGNED-NEXT: entry: +// UNSIGNED-NEXT: [[TMP0:%.*]] = load i16, i16* @uf_sat, align 2 +// UNSIGNED-NEXT: [[TMP1:%.*]] = load i32, i32* @i, align 4 +// UNSIGNED-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// UNSIGNED-NEXT: [[TMP3:%.*]] = call i16 @llvm.sshl.sat.i16(i16 [[TMP0]], i16 [[TMP2]]) +// UNSIGNED-NEXT: store i16 [[TMP3]], i16* @uf_sat, align 2 +// UNSIGNED-NEXT: ret void +// +void satleft_ufsufsi() { + uf_sat = uf_sat << i; +} diff --git a/clang/test/Frontend/fixed_point_shift_const.c b/clang/test/Frontend/fixed_point_shift_const.c new file mode 100644 index 0000000000000..10860efd188b7 --- /dev/null +++ b/clang/test/Frontend/fixed_point_shift_const.c @@ -0,0 +1,52 @@ +// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED +// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -fpadding-on-unsigned-fixed-point -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,UNSIGNED + +short _Accum sa_const1 = 1.0hk << 2; +// CHECK-DAG: @sa_const1 = {{.*}}global i16 512 +short _Accum sa_const2 = 0.5hk << 2; +// CHECK-DAG: @sa_const2 = {{.*}}global i16 256 +short _Accum sa_const3 = 10.0hk >> 3; +// CHECK-DAG: @sa_const3 = {{.*}}global i16 160 +short _Accum sa_const4 = 0.0546875hk << 8; +// CHECK-DAG: @sa_const4 = {{.*}}global i16 1792 +short _Accum sa_const5 = -1.0hk << 2; +// CHECK-DAG: @sa_const5 = {{.*}}global i16 -512 +short _Accum sa_const6 = -255.0hk >> 8; +// CHECK-DAG: @sa_const6 = {{.*}}global i16 -128 + +_Fract f_const1 = 
-1.0r >> 5; +// CHECK-DAG: @f_const1 = {{.*}}global i16 -1024 +_Fract f_const2 = 0.0052490234375r >> 3; +// CHECK-DAG: @f_const2 = {{.*}}global i16 21 +_Fract f_const3 = -0.0001r << 5; +// CHECK-DAG: @f_const3 = {{.*}}global i16 -96 +_Fract f_const4 = -0.75r >> 15; +// CHECK-DAG: @f_const4 = {{.*}}global i16 -1 +_Fract f_const5 = 0.078216552734375r << 3; +// CHECK-DAG: @f_const5 = {{.*}}global i16 20504 + +unsigned _Fract uf_const1 = 0.375ur >> 13; +// SIGNED-DAG: @uf_const1 = {{.*}}global i16 3 +// UNSIGNED-DAG: @uf_const1 = {{.*}}global i16 1 +unsigned _Fract uf_const2 = 0.0546875ur << 3; +// SIGNED-DAG: @uf_const2 = {{.*}}global i16 28672 +// UNSIGNED-DAG: @uf_const2 = {{.*}}global i16 14336 + +_Sat short _Accum ssa_const1 = (_Sat short _Accum)31.875hk << 4; +// CHECK-DAG: @ssa_const1 = {{.*}}global i16 32767 +_Sat short _Accum ssa_const2 = (_Sat short _Accum) - 1.0hk << 8; +// CHECK-DAG: @ssa_const2 = {{.*}}global i16 -32768 +_Sat short _Accum ssa_const3 = (_Sat short _Accum)128.0hk << 8; +// CHECK-DAG: @ssa_const3 = {{.*}}global i16 32767 +_Sat short _Fract ssf_const1 = (_Sat short _Fract) - 0.5hr << 3; +// CHECK-DAG: @ssf_const1 = {{.*}}global i8 -128 + +_Sat unsigned _Fract suf_const1 = (_Sat unsigned _Fract)0.5r << 1; +// SIGNED-DAG: @suf_const1 = {{.*}}global i16 -1 +// UNSIGNED-DAG: @suf_const1 = {{.*}}global i16 32767 +_Sat unsigned _Fract suf_const2 = (_Sat unsigned _Fract)0.25r << 1; +// SIGNED-DAG: @suf_const2 = {{.*}}global i16 -32768 +// UNSIGNED-DAG: @suf_const2 = {{.*}}global i16 16384 +_Sat unsigned _Accum sua_const2 = (_Sat unsigned _Accum)128.0uk << 10; +// SIGNED-DAG: @sua_const2 = {{.*}}global i32 -1 +// UNSIGNED-DAG: @sua_const2 = {{.*}}global i32 2147483647 diff --git a/clang/test/Frontend/fixed_point_sub.c b/clang/test/Frontend/fixed_point_sub.c index 4d07b4a522572..6446d76fbaa56 100644 --- a/clang/test/Frontend/fixed_point_sub.c +++ b/clang/test/Frontend/fixed_point_sub.c @@ -444,11 +444,12 @@ void sat_sassasas() { // UNSIGNED-NEXT: 
entry: // UNSIGNED-NEXT: [[TMP0:%.*]] = load i16, i16* @usa, align 2 // UNSIGNED-NEXT: [[TMP1:%.*]] = load i16, i16* @usa_sat, align 2 -// UNSIGNED-NEXT: [[RESIZE:%.*]] = trunc i16 [[TMP0]] to i15 -// UNSIGNED-NEXT: [[RESIZE1:%.*]] = trunc i16 [[TMP1]] to i15 -// UNSIGNED-NEXT: [[TMP2:%.*]] = call i15 @llvm.usub.sat.i15(i15 [[RESIZE]], i15 [[RESIZE1]]) -// UNSIGNED-NEXT: [[RESIZE2:%.*]] = zext i15 [[TMP2]] to i16 -// UNSIGNED-NEXT: store i16 [[RESIZE2]], i16* @usa_sat, align 2 +// UNSIGNED-NEXT: [[TMP2:%.*]] = call i16 @llvm.ssub.sat.i16(i16 [[TMP0]], i16 [[TMP1]]) +// UNSIGNED-NEXT: [[TMP3:%.*]] = icmp slt i16 [[TMP2]], 0 +// UNSIGNED-NEXT: [[SATMIN:%.*]] = select i1 [[TMP3]], i16 0, i16 [[TMP2]] +// UNSIGNED-NEXT: [[RESIZE:%.*]] = trunc i16 [[SATMIN]] to i15 +// UNSIGNED-NEXT: [[RESIZE1:%.*]] = zext i15 [[RESIZE]] to i16 +// UNSIGNED-NEXT: store i16 [[RESIZE1]], i16* @usa_sat, align 2 // UNSIGNED-NEXT: ret void // void sat_usasusausas() { @@ -469,11 +470,13 @@ void sat_usasusausas() { // UNSIGNED-NEXT: entry: // UNSIGNED-NEXT: [[TMP0:%.*]] = load i32, i32* @ua, align 4 // UNSIGNED-NEXT: [[TMP1:%.*]] = load i16, i16* @usa_sat, align 2 -// UNSIGNED-NEXT: [[RESIZE:%.*]] = trunc i32 [[TMP0]] to i31 -// UNSIGNED-NEXT: [[RESIZE1:%.*]] = zext i16 [[TMP1]] to i31 -// UNSIGNED-NEXT: [[UPSCALE:%.*]] = shl i31 [[RESIZE1]], 8 -// UNSIGNED-NEXT: [[TMP2:%.*]] = call i31 @llvm.usub.sat.i31(i31 [[RESIZE]], i31 [[UPSCALE]]) -// UNSIGNED-NEXT: [[RESIZE2:%.*]] = zext i31 [[TMP2]] to i32 +// UNSIGNED-NEXT: [[RESIZE:%.*]] = zext i16 [[TMP1]] to i32 +// UNSIGNED-NEXT: [[UPSCALE:%.*]] = shl i32 [[RESIZE]], 8 +// UNSIGNED-NEXT: [[TMP2:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[TMP0]], i32 [[UPSCALE]]) +// UNSIGNED-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP2]], 0 +// UNSIGNED-NEXT: [[SATMIN:%.*]] = select i1 [[TMP3]], i32 0, i32 [[TMP2]] +// UNSIGNED-NEXT: [[RESIZE1:%.*]] = trunc i32 [[SATMIN]] to i31 +// UNSIGNED-NEXT: [[RESIZE2:%.*]] = zext i31 [[RESIZE1]] to i32 // UNSIGNED-NEXT: store 
i32 [[RESIZE2]], i32* @ua_sat, align 4 // UNSIGNED-NEXT: ret void // @@ -533,11 +536,12 @@ void sat_sassasui() { // UNSIGNED-NEXT: entry: // UNSIGNED-NEXT: [[TMP0:%.*]] = load i16, i16* @uf_sat, align 2 // UNSIGNED-NEXT: [[TMP1:%.*]] = load i16, i16* @uf_sat, align 2 -// UNSIGNED-NEXT: [[RESIZE:%.*]] = trunc i16 [[TMP0]] to i15 -// UNSIGNED-NEXT: [[RESIZE1:%.*]] = trunc i16 [[TMP1]] to i15 -// UNSIGNED-NEXT: [[TMP2:%.*]] = call i15 @llvm.usub.sat.i15(i15 [[RESIZE]], i15 [[RESIZE1]]) -// UNSIGNED-NEXT: [[RESIZE2:%.*]] = zext i15 [[TMP2]] to i16 -// UNSIGNED-NEXT: store i16 [[RESIZE2]], i16* @uf_sat, align 2 +// UNSIGNED-NEXT: [[TMP2:%.*]] = call i16 @llvm.ssub.sat.i16(i16 [[TMP0]], i16 [[TMP1]]) +// UNSIGNED-NEXT: [[TMP3:%.*]] = icmp slt i16 [[TMP2]], 0 +// UNSIGNED-NEXT: [[SATMIN:%.*]] = select i1 [[TMP3]], i16 0, i16 [[TMP2]] +// UNSIGNED-NEXT: [[RESIZE:%.*]] = trunc i16 [[SATMIN]] to i15 +// UNSIGNED-NEXT: [[RESIZE1:%.*]] = zext i15 [[RESIZE]] to i16 +// UNSIGNED-NEXT: store i16 [[RESIZE1]], i16* @uf_sat, align 2 // UNSIGNED-NEXT: ret void // void sat_ufsufsufs() { diff --git a/clang/test/Frontend/fixed_point_unary.c b/clang/test/Frontend/fixed_point_unary.c index 84c6654fe7aff..849e38a94bc48 100644 --- a/clang/test/Frontend/fixed_point_unary.c +++ b/clang/test/Frontend/fixed_point_unary.c @@ -148,9 +148,9 @@ void inc_slf() { // UNSIGNED-LABEL: @inc_sua( // UNSIGNED-NEXT: entry: // UNSIGNED-NEXT: [[TMP0:%.*]] = load i32, i32* @sua, align 4 -// UNSIGNED-NEXT: [[RESIZE:%.*]] = trunc i32 [[TMP0]] to i31 -// UNSIGNED-NEXT: [[TMP1:%.*]] = call i31 @llvm.uadd.sat.i31(i31 [[RESIZE]], i31 32768) -// UNSIGNED-NEXT: [[RESIZE1:%.*]] = zext i31 [[TMP1]] to i32 +// UNSIGNED-NEXT: [[TMP1:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[TMP0]], i32 32768) +// UNSIGNED-NEXT: [[RESIZE:%.*]] = trunc i32 [[TMP1]] to i31 +// UNSIGNED-NEXT: [[RESIZE1:%.*]] = zext i31 [[RESIZE]] to i32 // UNSIGNED-NEXT: store i32 [[RESIZE1]], i32* @sua, align 4 // UNSIGNED-NEXT: ret void // @@ -168,9 +168,9 
@@ void inc_sua() { // UNSIGNED-LABEL: @inc_susa( // UNSIGNED-NEXT: entry: // UNSIGNED-NEXT: [[TMP0:%.*]] = load i16, i16* @susa, align 2 -// UNSIGNED-NEXT: [[RESIZE:%.*]] = trunc i16 [[TMP0]] to i15 -// UNSIGNED-NEXT: [[TMP1:%.*]] = call i15 @llvm.uadd.sat.i15(i15 [[RESIZE]], i15 128) -// UNSIGNED-NEXT: [[RESIZE1:%.*]] = zext i15 [[TMP1]] to i16 +// UNSIGNED-NEXT: [[TMP1:%.*]] = call i16 @llvm.sadd.sat.i16(i16 [[TMP0]], i16 128) +// UNSIGNED-NEXT: [[RESIZE:%.*]] = trunc i16 [[TMP1]] to i15 +// UNSIGNED-NEXT: [[RESIZE1:%.*]] = zext i15 [[RESIZE]] to i16 // UNSIGNED-NEXT: store i16 [[RESIZE1]], i16* @susa, align 2 // UNSIGNED-NEXT: ret void // @@ -188,9 +188,9 @@ void inc_susa() { // UNSIGNED-LABEL: @inc_suf( // UNSIGNED-NEXT: entry: // UNSIGNED-NEXT: [[TMP0:%.*]] = load i16, i16* @suf, align 2 -// UNSIGNED-NEXT: [[RESIZE:%.*]] = trunc i16 [[TMP0]] to i15 -// UNSIGNED-NEXT: [[TMP1:%.*]] = call i15 @llvm.uadd.sat.i15(i15 [[RESIZE]], i15 -1) -// UNSIGNED-NEXT: [[RESIZE1:%.*]] = zext i15 [[TMP1]] to i16 +// UNSIGNED-NEXT: [[TMP1:%.*]] = call i16 @llvm.sadd.sat.i16(i16 [[TMP0]], i16 32767) +// UNSIGNED-NEXT: [[RESIZE:%.*]] = trunc i16 [[TMP1]] to i15 +// UNSIGNED-NEXT: [[RESIZE1:%.*]] = zext i15 [[RESIZE]] to i16 // UNSIGNED-NEXT: store i16 [[RESIZE1]], i16* @suf, align 2 // UNSIGNED-NEXT: ret void // @@ -329,9 +329,11 @@ void dec_slf() { // UNSIGNED-LABEL: @dec_sua( // UNSIGNED-NEXT: entry: // UNSIGNED-NEXT: [[TMP0:%.*]] = load i32, i32* @sua, align 4 -// UNSIGNED-NEXT: [[RESIZE:%.*]] = trunc i32 [[TMP0]] to i31 -// UNSIGNED-NEXT: [[TMP1:%.*]] = call i31 @llvm.usub.sat.i31(i31 [[RESIZE]], i31 32768) -// UNSIGNED-NEXT: [[RESIZE1:%.*]] = zext i31 [[TMP1]] to i32 +// UNSIGNED-NEXT: [[TMP1:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[TMP0]], i32 32768) +// UNSIGNED-NEXT: [[TMP2:%.*]] = icmp slt i32 [[TMP1]], 0 +// UNSIGNED-NEXT: [[SATMIN:%.*]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]] +// UNSIGNED-NEXT: [[RESIZE:%.*]] = trunc i32 [[SATMIN]] to i31 +// UNSIGNED-NEXT: 
[[RESIZE1:%.*]] = zext i31 [[RESIZE]] to i32 // UNSIGNED-NEXT: store i32 [[RESIZE1]], i32* @sua, align 4 // UNSIGNED-NEXT: ret void // @@ -349,9 +351,11 @@ void dec_sua() { // UNSIGNED-LABEL: @dec_susa( // UNSIGNED-NEXT: entry: // UNSIGNED-NEXT: [[TMP0:%.*]] = load i16, i16* @susa, align 2 -// UNSIGNED-NEXT: [[RESIZE:%.*]] = trunc i16 [[TMP0]] to i15 -// UNSIGNED-NEXT: [[TMP1:%.*]] = call i15 @llvm.usub.sat.i15(i15 [[RESIZE]], i15 128) -// UNSIGNED-NEXT: [[RESIZE1:%.*]] = zext i15 [[TMP1]] to i16 +// UNSIGNED-NEXT: [[TMP1:%.*]] = call i16 @llvm.ssub.sat.i16(i16 [[TMP0]], i16 128) +// UNSIGNED-NEXT: [[TMP2:%.*]] = icmp slt i16 [[TMP1]], 0 +// UNSIGNED-NEXT: [[SATMIN:%.*]] = select i1 [[TMP2]], i16 0, i16 [[TMP1]] +// UNSIGNED-NEXT: [[RESIZE:%.*]] = trunc i16 [[SATMIN]] to i15 +// UNSIGNED-NEXT: [[RESIZE1:%.*]] = zext i15 [[RESIZE]] to i16 // UNSIGNED-NEXT: store i16 [[RESIZE1]], i16* @susa, align 2 // UNSIGNED-NEXT: ret void // @@ -369,9 +373,11 @@ void dec_susa() { // UNSIGNED-LABEL: @dec_suf( // UNSIGNED-NEXT: entry: // UNSIGNED-NEXT: [[TMP0:%.*]] = load i16, i16* @suf, align 2 -// UNSIGNED-NEXT: [[RESIZE:%.*]] = trunc i16 [[TMP0]] to i15 -// UNSIGNED-NEXT: [[TMP1:%.*]] = call i15 @llvm.usub.sat.i15(i15 [[RESIZE]], i15 -1) -// UNSIGNED-NEXT: [[RESIZE1:%.*]] = zext i15 [[TMP1]] to i16 +// UNSIGNED-NEXT: [[TMP1:%.*]] = call i16 @llvm.ssub.sat.i16(i16 [[TMP0]], i16 32767) +// UNSIGNED-NEXT: [[TMP2:%.*]] = icmp slt i16 [[TMP1]], 0 +// UNSIGNED-NEXT: [[SATMIN:%.*]] = select i1 [[TMP2]], i16 0, i16 [[TMP1]] +// UNSIGNED-NEXT: [[RESIZE:%.*]] = trunc i16 [[SATMIN]] to i15 +// UNSIGNED-NEXT: [[RESIZE1:%.*]] = zext i15 [[RESIZE]] to i16 // UNSIGNED-NEXT: store i16 [[RESIZE1]], i16* @suf, align 2 // UNSIGNED-NEXT: ret void // @@ -456,9 +462,11 @@ void neg_sf() { // UNSIGNED-LABEL: @neg_susa( // UNSIGNED-NEXT: entry: // UNSIGNED-NEXT: [[TMP0:%.*]] = load i16, i16* @susa, align 2 -// UNSIGNED-NEXT: [[RESIZE:%.*]] = trunc i16 [[TMP0]] to i15 -// UNSIGNED-NEXT: [[TMP1:%.*]] = 
call i15 @llvm.usub.sat.i15(i15 0, i15 [[RESIZE]]) -// UNSIGNED-NEXT: [[RESIZE1:%.*]] = zext i15 [[TMP1]] to i16 +// UNSIGNED-NEXT: [[TMP1:%.*]] = call i16 @llvm.ssub.sat.i16(i16 0, i16 [[TMP0]]) +// UNSIGNED-NEXT: [[TMP2:%.*]] = icmp slt i16 [[TMP1]], 0 +// UNSIGNED-NEXT: [[SATMIN:%.*]] = select i1 [[TMP2]], i16 0, i16 [[TMP1]] +// UNSIGNED-NEXT: [[RESIZE:%.*]] = trunc i16 [[SATMIN]] to i15 +// UNSIGNED-NEXT: [[RESIZE1:%.*]] = zext i15 [[RESIZE]] to i16 // UNSIGNED-NEXT: store i16 [[RESIZE1]], i16* @susa, align 2 // UNSIGNED-NEXT: ret void // @@ -476,9 +484,11 @@ void neg_susa() { // UNSIGNED-LABEL: @neg_suf( // UNSIGNED-NEXT: entry: // UNSIGNED-NEXT: [[TMP0:%.*]] = load i16, i16* @suf, align 2 -// UNSIGNED-NEXT: [[RESIZE:%.*]] = trunc i16 [[TMP0]] to i15 -// UNSIGNED-NEXT: [[TMP1:%.*]] = call i15 @llvm.usub.sat.i15(i15 0, i15 [[RESIZE]]) -// UNSIGNED-NEXT: [[RESIZE1:%.*]] = zext i15 [[TMP1]] to i16 +// UNSIGNED-NEXT: [[TMP1:%.*]] = call i16 @llvm.ssub.sat.i16(i16 0, i16 [[TMP0]]) +// UNSIGNED-NEXT: [[TMP2:%.*]] = icmp slt i16 [[TMP1]], 0 +// UNSIGNED-NEXT: [[SATMIN:%.*]] = select i1 [[TMP2]], i16 0, i16 [[TMP1]] +// UNSIGNED-NEXT: [[RESIZE:%.*]] = trunc i16 [[SATMIN]] to i15 +// UNSIGNED-NEXT: [[RESIZE1:%.*]] = zext i15 [[RESIZE]] to i16 // UNSIGNED-NEXT: store i16 [[RESIZE1]], i16* @suf, align 2 // UNSIGNED-NEXT: ret void // diff --git a/clang/test/Misc/target-invalid-cpu-note.c b/clang/test/Misc/target-invalid-cpu-note.c index 803b784bc0ec2..546ab6341f97f 100644 --- a/clang/test/Misc/target-invalid-cpu-note.c +++ b/clang/test/Misc/target-invalid-cpu-note.c @@ -21,7 +21,7 @@ // X86-SAME: nocona, core2, penryn, bonnell, atom, silvermont, slm, goldmont, goldmont-plus, tremont, // X86-SAME: nehalem, corei7, westmere, sandybridge, corei7-avx, ivybridge, // X86-SAME: core-avx-i, haswell, core-avx2, broadwell, skylake, skylake-avx512, -// X86-SAME: skx, cascadelake, cooperlake, cannonlake, icelake-client, icelake-server, tigerlake, knl, knm, lakemont, k6, k6-2, k6-3, 
+// X86-SAME: skx, cascadelake, cooperlake, cannonlake, icelake-client, icelake-server, tigerlake, sapphirerapids, knl, knm, lakemont, k6, k6-2, k6-3, // X86-SAME: athlon, athlon-tbird, athlon-xp, athlon-mp, athlon-4, k8, athlon64, // X86-SAME: athlon-fx, opteron, k8-sse3, athlon64-sse3, opteron-sse3, amdfam10, // X86-SAME: barcelona, btver1, btver2, bdver1, bdver2, bdver3, bdver4, znver1, znver2, @@ -33,7 +33,7 @@ // X86_64-SAME: atom, silvermont, slm, goldmont, goldmont-plus, tremont, nehalem, corei7, westmere, // X86_64-SAME: sandybridge, corei7-avx, ivybridge, core-avx-i, haswell, // X86_64-SAME: core-avx2, broadwell, skylake, skylake-avx512, skx, cascadelake, cooperlake, cannonlake, -// X86_64-SAME: icelake-client, icelake-server, tigerlake, knl, knm, k8, athlon64, athlon-fx, opteron, k8-sse3, +// X86_64-SAME: icelake-client, icelake-server, tigerlake, sapphirerapids, knl, knm, k8, athlon64, athlon-fx, opteron, k8-sse3, // X86_64-SAME: athlon64-sse3, opteron-sse3, amdfam10, barcelona, btver1, // X86_64-SAME: btver2, bdver1, bdver2, bdver3, bdver4, znver1, znver2, x86-64 @@ -45,7 +45,7 @@ // TUNE_X86-SAME: nocona, core2, penryn, bonnell, atom, silvermont, slm, goldmont, goldmont-plus, tremont, // TUNE_X86-SAME: nehalem, corei7, westmere, sandybridge, corei7-avx, ivybridge, // TUNE_X86-SAME: core-avx-i, haswell, core-avx2, broadwell, skylake, skylake-avx512, -// TUNE_X86-SAME: skx, cascadelake, cooperlake, cannonlake, icelake-client, icelake-server, tigerlake, knl, knm, lakemont, k6, k6-2, k6-3, +// TUNE_X86-SAME: skx, cascadelake, cooperlake, cannonlake, icelake-client, icelake-server, tigerlake, sapphirerapids, knl, knm, lakemont, k6, k6-2, k6-3, // TUNE_X86-SAME: athlon, athlon-tbird, athlon-xp, athlon-mp, athlon-4, k8, athlon64, // TUNE_X86-SAME: athlon-fx, opteron, k8-sse3, athlon64-sse3, opteron-sse3, amdfam10, // TUNE_X86-SAME: barcelona, btver1, btver2, bdver1, bdver2, bdver3, bdver4, znver1, znver2, @@ -59,7 +59,7 @@ // TUNE_X86_64-SAME: nocona, core2, 
penryn, bonnell, atom, silvermont, slm, goldmont, goldmont-plus, tremont, // TUNE_X86_64-SAME: nehalem, corei7, westmere, sandybridge, corei7-avx, ivybridge, // TUNE_X86_64-SAME: core-avx-i, haswell, core-avx2, broadwell, skylake, skylake-avx512, -// TUNE_X86_64-SAME: skx, cascadelake, cooperlake, cannonlake, icelake-client, icelake-server, tigerlake, knl, knm, lakemont, k6, k6-2, k6-3, +// TUNE_X86_64-SAME: skx, cascadelake, cooperlake, cannonlake, icelake-client, icelake-server, tigerlake, sapphirerapids, knl, knm, lakemont, k6, k6-2, k6-3, // TUNE_X86_64-SAME: athlon, athlon-tbird, athlon-xp, athlon-mp, athlon-4, k8, athlon64, // TUNE_X86_64-SAME: athlon-fx, opteron, k8-sse3, athlon64-sse3, opteron-sse3, amdfam10, // TUNE_X86_64-SAME: barcelona, btver1, btver2, bdver1, bdver2, bdver3, bdver4, znver1, znver2, diff --git a/clang/test/Modules/Inputs/DebugDwoId.h b/clang/test/Modules/Inputs/DebugDwoId.h new file mode 100644 index 0000000000000..242e4c7f5116d --- /dev/null +++ b/clang/test/Modules/Inputs/DebugDwoId.h @@ -0,0 +1,4 @@ +#ifndef DEBUG_DWO_ID_H +#define DEBUG_DWO_ID_H +struct Dummy {}; +#endif diff --git a/clang/test/Modules/Inputs/module.map b/clang/test/Modules/Inputs/module.map index ed220e667f055..e7cb4b27bc08b 100644 --- a/clang/test/Modules/Inputs/module.map +++ b/clang/test/Modules/Inputs/module.map @@ -357,6 +357,10 @@ module DebugObjCImport { } } +module DebugDwoId { + header "DebugDwoId.h" +} + module ImportNameInDir { header "ImportNameInDir.h" export * diff --git a/clang/test/Modules/ModuleDebugInfoDwoId.cpp b/clang/test/Modules/ModuleDebugInfoDwoId.cpp new file mode 100644 index 0000000000000..5d6ad3c594cf4 --- /dev/null +++ b/clang/test/Modules/ModuleDebugInfoDwoId.cpp @@ -0,0 +1,22 @@ +// Tests that dwoIds in modules match the dwoIDs in the main file. 
+ +// REQUIRES: asserts + +// RUN: rm -rf %t.cache +// RUN: %clang_cc1 -triple %itanium_abi_triple -x objective-c++ -std=c++11 -debugger-tuning=lldb -debug-info-kind=limited -fmodules -fmodule-format=obj -dwarf-ext-refs -fimplicit-module-maps -fmodules-cache-path=%t.cache %s -I %S/Inputs -emit-llvm -o %t.ll -mllvm -debug-only=pchcontainer 2> %t.mod-out +// RUN: cat %t.ll %t.mod-out | FileCheck %s +// RUN: cat %t.ll | FileCheck --check-prefix=CHECK-REALIDS %s +// RUN: cat %t.mod-out | FileCheck --check-prefix=CHECK-REALIDS %s + +@import DebugDwoId; + +Dummy d; + +// Find the emitted dwoID for DebugInfoId and compare it against the one in the PCM. +// CHECK: DebugDwoId-{{[A-Z0-9]+}}.pcm +// CHECK-SAME: dwoId: [[DWOID:[0-9]+]] +// CHECK: dwoId: [[DWOID]] +// CHECK-NEXT: !DIFile(filename: "DebugDwoId" + +// Make sure the dwo IDs are real IDs and not fallback values (~1ULL). +// CHECK-REALIDS-NOT: dwoId: 18446744073709551615 diff --git a/clang/test/Modules/diagnostics.modulemap b/clang/test/Modules/diagnostics.modulemap index 01aa0b66a406f..c12fef50c38ed 100644 --- a/clang/test/Modules/diagnostics.modulemap +++ b/clang/test/Modules/diagnostics.modulemap @@ -28,3 +28,9 @@ module header_attr { header "quux.h" { size 1 mtime 2 } header "no_attrs.h" {} } + +// CHECK: diagnostics.modulemap:[[@LINE+1]]:8: error: no module named 'unknown' found, parent module must be defined before the submodule +module unknown.submodule {} +module known_top_level {} +// CHECK: diagnostics.modulemap:[[@LINE+1]]:24: error: no module named 'unknown' in 'known_top_level', parent module must be defined before the submodule +module known_top_level.unknown.submodule {} diff --git a/clang/test/Modules/module_file_info.m b/clang/test/Modules/module_file_info.m index 677eff8e8ef58..da4ea1ca0d951 100644 --- a/clang/test/Modules/module_file_info.m +++ b/clang/test/Modules/module_file_info.m @@ -28,6 +28,7 @@ // CHECK: Target options: // CHECK: Triple: // CHECK: CPU: +// CHECK: TuneCPU: // CHECK: ABI: // 
CHECK: Header search options: diff --git a/clang/test/OpenMP/atomic_ast_print.cpp b/clang/test/OpenMP/atomic_ast_print.cpp index 1b55c56ad17f9..67ed8428031e2 100644 --- a/clang/test/OpenMP/atomic_ast_print.cpp +++ b/clang/test/OpenMP/atomic_ast_print.cpp @@ -1,10 +1,10 @@ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -ast-print %s | FileCheck %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -ast-print %s | FileCheck %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s // expected-no-diagnostics #ifndef HEADER diff --git a/clang/test/OpenMP/atomic_capture_codegen.cpp b/clang/test/OpenMP/atomic_capture_codegen.cpp index d7b8748bbc242..047ec3616afcc 100644 --- a/clang/test/OpenMP/atomic_capture_codegen.cpp +++ b/clang/test/OpenMP/atomic_capture_codegen.cpp @@ -1,10 +1,11 @@ -// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -target-cpu core2 -fopenmp -fopenmp-version=50 -x c -emit-llvm %s -o - | FileCheck %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c -triple x86_64-apple-darwin10 -target-cpu core2 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp 
-fopenmp-version=50 -x c -triple x86_64-apple-darwin10 -target-cpu core2 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -target-cpu core2 -fopenmp-simd -fopenmp-version=50 -x c -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c -triple x86_64-apple-darwin10 -target-cpu core2 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c -triple x86_64-apple-darwin10 -target-cpu core2 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -target-cpu core2 -fopenmp -x c -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c -triple x86_64-apple-darwin10 -target-cpu core2 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c -triple x86_64-apple-darwin10 -target-cpu core2 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s + +// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -target-cpu core2 -fopenmp-simd -x c -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c -triple x86_64-apple-darwin10 -target-cpu core2 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c -triple x86_64-apple-darwin10 -target-cpu core2 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s // SIMD-ONLY0-NOT: {{__kmpc|__tgt}} // expected-no-diagnostics #ifndef HEADER diff --git a/clang/test/OpenMP/atomic_messages.c b/clang/test/OpenMP/atomic_messages.c index 053771b5b25dc..8f754aa4ff7fd 100644 --- a/clang/test/OpenMP/atomic_messages.c +++ b/clang/test/OpenMP/atomic_messages.c @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -ferror-limit 100 %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -ferror-limit 100 %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp 
-ferror-limit 100 %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -ferror-limit 100 %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -ferror-limit 100 %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -ferror-limit 100 %s -Wuninitialized void xxx(int argc) { int x; // expected-note {{initialize the variable 'x' to silence this warning}} diff --git a/clang/test/OpenMP/atomic_messages.cpp b/clang/test/OpenMP/atomic_messages.cpp index fc29a2ceae174..f781e664e5aea 100644 --- a/clang/test/OpenMP/atomic_messages.cpp +++ b/clang/test/OpenMP/atomic_messages.cpp @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -ferror-limit 150 %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -ferror-limit 150 %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -ferror-limit 150 %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -ferror-limit 150 %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -ferror-limit 150 %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -ferror-limit 150 %s -Wuninitialized int foo() { L1: diff --git a/clang/test/OpenMP/atomic_read_codegen.c b/clang/test/OpenMP/atomic_read_codegen.c index 94e212f724477..211ddca3449db 100644 --- a/clang/test/OpenMP/atomic_read_codegen.c +++ b/clang/test/OpenMP/atomic_read_codegen.c @@ -1,10 +1,10 @@ -// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -target-cpu core2 -fopenmp -fopenmp-version=50 -x c -emit-llvm %s -o - | FileCheck %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c -triple x86_64-apple-darwin10 -target-cpu core2 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c -triple x86_64-apple-darwin10 -target-cpu core2 -include-pch %t -verify %s -emit-llvm -o - | 
FileCheck %s +// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -target-cpu core2 -fopenmp -x c -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c -triple x86_64-apple-darwin10 -target-cpu core2 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c -triple x86_64-apple-darwin10 -target-cpu core2 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -target-cpu core2 -fopenmp-simd -fopenmp-version=50 -x c -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c -triple x86_64-apple-darwin10 -target-cpu core2 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c -triple x86_64-apple-darwin10 -target-cpu core2 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -target-cpu core2 -fopenmp-simd -x c -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c -triple x86_64-apple-darwin10 -target-cpu core2 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c -triple x86_64-apple-darwin10 -target-cpu core2 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s // SIMD-ONLY0-NOT: {{__kmpc|__tgt}} // expected-no-diagnostics // REQUIRES: x86-registered-target diff --git a/clang/test/OpenMP/atomic_update_codegen.cpp b/clang/test/OpenMP/atomic_update_codegen.cpp index a2b6f70540aa3..f8c45c5959be5 100644 --- a/clang/test/OpenMP/atomic_update_codegen.cpp +++ b/clang/test/OpenMP/atomic_update_codegen.cpp @@ -1,10 +1,10 @@ -// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -target-cpu core2 -fopenmp -fopenmp-version=50 -x c -emit-llvm %s -o - | FileCheck %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c -triple x86_64-apple-darwin10 -target-cpu core2 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c -triple 
x86_64-apple-darwin10 -target-cpu core2 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -target-cpu core2 -fopenmp -x c -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c -triple x86_64-apple-darwin10 -target-cpu core2 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c -triple x86_64-apple-darwin10 -target-cpu core2 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -target-cpu core2 -fopenmp-simd -fopenmp-version=50 -x c -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c -triple x86_64-apple-darwin10 -target-cpu core2 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c -triple x86_64-apple-darwin10 -target-cpu core2 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -target-cpu core2 -fopenmp-simd -x c -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c -triple x86_64-apple-darwin10 -target-cpu core2 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c -triple x86_64-apple-darwin10 -target-cpu core2 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s // SIMD-ONLY0-NOT: {{__kmpc|__tgt}} // expected-no-diagnostics #ifndef HEADER diff --git a/clang/test/OpenMP/atomic_write_codegen.c b/clang/test/OpenMP/atomic_write_codegen.c index 3cbaf2752448b..f3b3acfbee3f0 100644 --- a/clang/test/OpenMP/atomic_write_codegen.c +++ b/clang/test/OpenMP/atomic_write_codegen.c @@ -1,10 +1,10 @@ -// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -target-cpu core2 -fopenmp -fopenmp-version=50 -x c -emit-llvm %s -o - | FileCheck %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c -triple x86_64-apple-darwin10 -target-cpu core2 -emit-pch -o %t %s -// RUN: 
%clang_cc1 -fopenmp -fopenmp-version=50 -x c -triple x86_64-apple-darwin10 -target-cpu core2 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -target-cpu core2 -fopenmp -x c -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c -triple x86_64-apple-darwin10 -target-cpu core2 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c -triple x86_64-apple-darwin10 -target-cpu core2 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -target-cpu core2 -fopenmp-simd -fopenmp-version=50 -x c -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c -triple x86_64-apple-darwin10 -target-cpu core2 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c -triple x86_64-apple-darwin10 -target-cpu core2 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -target-cpu core2 -fopenmp-simd -x c -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c -triple x86_64-apple-darwin10 -target-cpu core2 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c -triple x86_64-apple-darwin10 -target-cpu core2 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s // SIMD-ONLY0-NOT: {{__kmpc|__tgt}} // expected-no-diagnostics // REQUIRES: x86-registered-target diff --git a/clang/test/OpenMP/declare_target_ast_print.cpp b/clang/test/OpenMP/declare_target_ast_print.cpp index 14115e6a3ad6a..c086f85261476 100644 --- a/clang/test/OpenMP/declare_target_ast_print.cpp +++ b/clang/test/OpenMP/declare_target_ast_print.cpp @@ -2,9 +2,9 @@ // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -I %S/Inputs -emit-pch -o %t %s // RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -I %S/Inputs -verify %s -ast-print | 
FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -I %S/Inputs -ast-print %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -I %S/Inputs -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -I %S/Inputs -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 +// RUN: %clang_cc1 -verify -fopenmp -I %S/Inputs -ast-print %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -I %S/Inputs -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -I %S/Inputs -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 // RUN: %clang_cc1 -verify -fopenmp-simd -I %S/Inputs -ast-print %s | FileCheck %s // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -I %S/Inputs -emit-pch -o %t %s @@ -241,6 +241,28 @@ int baz() { return 1; } // CHECK: void cba(); // CHECK: #pragma omp end declare target +#pragma omp declare target +int abc1() { return 1; } +#pragma omp declare target to(abc1) device_type(nohost) +#pragma omp end declare target + +// CHECK-NEXT: #pragma omp declare target +// CHECK-NEXT: #pragma omp declare target device_type(nohost) +// CHECK-NEXT: int abc1() { +// CHECK-NEXT: return 1; +// CHECK-NEXT: } +// CHECK-NEXT: #pragma omp end declare target + +#pragma omp declare target +int inner_link; +#pragma omp declare target link(inner_link) +#pragma omp end declare target + +// CHECK-NEXT: #pragma omp declare target +// CHECK-NEXT: #pragma omp declare target link +// CHECK-NEXT: int inner_link; +// CHECK-NEXT: #pragma omp end declare target + int main (int argc, char **argv) { foo(); foo_c(); @@ -254,4 +276,5 @@ int main (int argc, char **argv) { // CHECK: #pragma omp declare target // CHECK-NEXT: int ts = 1; // CHECK-NEXT: #pragma omp end declare target + #endif diff --git 
a/clang/test/OpenMP/declare_target_codegen.cpp b/clang/test/OpenMP/declare_target_codegen.cpp index 0cd725ac5665f..98e06b1eee9fa 100644 --- a/clang/test/OpenMP/declare_target_codegen.cpp +++ b/clang/test/OpenMP/declare_target_codegen.cpp @@ -3,15 +3,15 @@ // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -o - -DLOAD | FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - -fopenmp-version=50 -DOMP5 | FileCheck %s --check-prefix HOST5 -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc -fopenmp-version=50 -DOMP5 -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-version=50 -DOMP5 | FileCheck %s --check-prefix DEV5 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - -DOMP5 | FileCheck %s --check-prefix HOST5 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc -DOMP5 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -DOMP5 | FileCheck %s --check-prefix DEV5 -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - -fopenmp-version=50 -DOMP5 | FileCheck %s --check-prefix KMPC-ONLY +// 
RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - -DOMP5 | FileCheck %s --check-prefix KMPC-ONLY -// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - -fopenmp-version=50 -DOMP5 | FileCheck %s --check-prefix SIMD-ONLY -// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc -fopenmp-version=50 -DOMP5 -// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-version=50 -DOMP5 | FileCheck %s --check-prefix SIMD-ONLY +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - -DOMP5 | FileCheck %s --check-prefix SIMD-ONLY +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc -DOMP5 +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -DOMP5 | FileCheck %s --check-prefix SIMD-ONLY // RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc -fopenmp-version=45 // RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-version=45 | FileCheck %s --check-prefix SIMD-ONLY diff --git a/clang/test/OpenMP/declare_target_messages.cpp b/clang/test/OpenMP/declare_target_messages.cpp index 7287a6682f0bb..17a60ce2eb59f 100644 --- a/clang/test/OpenMP/declare_target_messages.cpp +++ 
b/clang/test/OpenMP/declare_target_messages.cpp @@ -1,9 +1,9 @@ // RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -fnoopenmp-use-tls -ferror-limit 100 -o - %s -// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify=expected,omp5,host5 -fopenmp -fopenmp-version=50 -fnoopenmp-use-tls -ferror-limit 100 -o - %s -// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify=expected,omp5,dev5 -fopenmp -fopenmp-is-device -fopenmp-targets=x86_64-apple-macos10.7.0 -aux-triple x86_64-apple-macos10.7.0 -fopenmp-version=50 -fnoopenmp-use-tls -ferror-limit 100 -o - %s +// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify=expected,omp5,host5 -fopenmp -fnoopenmp-use-tls -ferror-limit 100 -o - %s +// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify=expected,omp5,dev5 -fopenmp -fopenmp-is-device -fopenmp-targets=x86_64-apple-macos10.7.0 -aux-triple x86_64-apple-macos10.7.0 -fnoopenmp-use-tls -ferror-limit 100 -o - %s -// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify=expected,omp5,host5 -fopenmp-simd -fopenmp-version=50 -fnoopenmp-use-tls -ferror-limit 100 -o - %s -// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify=expected,omp5,host5 -fopenmp-simd -fopenmp-is-device -fopenmp-version=50 -fnoopenmp-use-tls -ferror-limit 100 -o - %s +// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify=expected,omp5,host5 -fopenmp-simd -fnoopenmp-use-tls -ferror-limit 100 -o - %s +// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify=expected,omp5,host5 -fopenmp-simd -fopenmp-is-device -fnoopenmp-use-tls -ferror-limit 100 -o - %s // RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify=expected,omp45 -fopenmp-version=45 -fopenmp-simd -fnoopenmp-use-tls -ferror-limit 100 -o - %s #pragma omp end declare target // expected-error {{unexpected OpenMP directive '#pragma omp end declare target'}} diff --git a/clang/test/OpenMP/declare_variant_device_isa_codegen_1.c 
b/clang/test/OpenMP/declare_variant_device_isa_codegen_1.c index 76a3eedeae301..029270ab84866 100644 --- a/clang/test/OpenMP/declare_variant_device_isa_codegen_1.c +++ b/clang/test/OpenMP/declare_variant_device_isa_codegen_1.c @@ -1,10 +1,18 @@ -// RUN: %clang_cc1 -verify -fopenmp -x c -triple %itanium_abi_triple -emit-llvm %s -o - -fopenmp-version=50 | FileCheck %s --check-prefix=GENERIC -// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple %itanium_abi_triple -fexceptions -fcxx-exceptions -emit-pch -o %t -fopenmp-version=50 %s -// RUN: %clang_cc1 -fopenmp -x c++ -triple %itanium_abi_triple -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -fopenmp-version=50 | FileCheck %s --check-prefix=GENERIC +// RUN: %clang_cc1 -verify -fopenmp -x c -triple %itanium_abi_triple -emit-llvm %s -o - -fopenmp-version=45 | FileCheck %s --check-prefix=GENERIC +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple %itanium_abi_triple -fexceptions -fcxx-exceptions -emit-pch -o %t -fopenmp-version=45 %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple %itanium_abi_triple -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -fopenmp-version=45 | FileCheck %s --check-prefix=GENERIC -// RUN: %clang_cc1 -target-feature +avx512f -verify -fopenmp -x c -triple %itanium_abi_triple -emit-llvm %s -o - -fopenmp-version=50 | FileCheck %s --check-prefix=WITHFEATURE -// RUN: %clang_cc1 -target-feature +avx512f -fopenmp -x c++ -std=c++11 -triple %itanium_abi_triple -fexceptions -fcxx-exceptions -emit-pch -o %t -fopenmp-version=50 %s -// RUN: %clang_cc1 -target-feature +avx512f -fopenmp -x c++ -triple %itanium_abi_triple -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -fopenmp-version=50 | FileCheck %s --check-prefix=WITHFEATURE +// RUN: %clang_cc1 -target-feature +avx512f -verify -fopenmp -x c -triple %itanium_abi_triple -emit-llvm %s -o - -fopenmp-version=45 | FileCheck %s --check-prefix=WITHFEATURE +// 
RUN: %clang_cc1 -target-feature +avx512f -fopenmp -x c++ -std=c++11 -triple %itanium_abi_triple -fexceptions -fcxx-exceptions -emit-pch -o %t -fopenmp-version=45 %s +// RUN: %clang_cc1 -target-feature +avx512f -fopenmp -x c++ -triple %itanium_abi_triple -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -fopenmp-version=45 | FileCheck %s --check-prefix=WITHFEATURE + +// RUN: %clang_cc1 -verify -fopenmp -x c -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck %s --check-prefix=GENERIC +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple %itanium_abi_triple -fexceptions -fcxx-exceptions -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple %itanium_abi_triple -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=GENERIC + +// RUN: %clang_cc1 -target-feature +avx512f -verify -fopenmp -x c -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck %s --check-prefix=WITHFEATURE +// RUN: %clang_cc1 -target-feature +avx512f -fopenmp -x c++ -std=c++11 -triple %itanium_abi_triple -fexceptions -fcxx-exceptions -emit-pch -o %t %s +// RUN: %clang_cc1 -target-feature +avx512f -fopenmp -x c++ -triple %itanium_abi_triple -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=WITHFEATURE // expected-no-diagnostics diff --git a/clang/test/OpenMP/declare_variant_device_kind_codegen.cpp b/clang/test/OpenMP/declare_variant_device_kind_codegen.cpp index 99f33741960e8..e18b356579dd3 100644 --- a/clang/test/OpenMP/declare_variant_device_kind_codegen.cpp +++ b/clang/test/OpenMP/declare_variant_device_kind_codegen.cpp @@ -1,22 +1,62 @@ // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux -emit-llvm %s -fexceptions -fcxx-exceptions -o - -fsanitize-address-use-after-scope -DHOST | FileCheck %s -// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-linux -fexceptions -fcxx-exceptions 
-emit-pch -o %t -fopenmp-version=50 %s -DHOST -// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-linux -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -fopenmp-version=50 -DHOST | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-linux -fexceptions -fcxx-exceptions -emit-pch -o %t -fopenmp-version=45 %s -DHOST +// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-linux -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -fopenmp-version=45 -DHOST | FileCheck %s // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple aarch64-unknown-linux -emit-llvm %s -fexceptions -fcxx-exceptions -o - -fsanitize-address-use-after-scope -DHOST | FileCheck %s -// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple aarch64-unknown-linux -fexceptions -fcxx-exceptions -emit-pch -o %t -fopenmp-version=50 %s -DHOST -// RUN: %clang_cc1 -fopenmp -x c++ -triple aarch64-unknown-linux -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -fopenmp-version=50 -DHOST | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple aarch64-unknown-linux -fexceptions -fcxx-exceptions -emit-pch -o %t -fopenmp-version=45 %s -DHOST +// RUN: %clang_cc1 -fopenmp -x c++ -triple aarch64-unknown-linux -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -fopenmp-version=45 -DHOST | FileCheck %s // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple ppc64le-unknown-linux -emit-llvm %s -fexceptions -fcxx-exceptions -o - -fsanitize-address-use-after-scope -DHOST | FileCheck %s -// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple ppc64le-unknown-linux -fexceptions -fcxx-exceptions -emit-pch -o %t -fopenmp-version=50 %s -DHOST -// RUN: %clang_cc1 -fopenmp -x c++ -triple ppc64le-unknown-linux -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -fopenmp-version=50 -DHOST | FileCheck %s +// RUN: %clang_cc1 
-fopenmp -x c++ -std=c++11 -triple ppc64le-unknown-linux -fexceptions -fcxx-exceptions -emit-pch -o %t -fopenmp-version=45 %s -DHOST +// RUN: %clang_cc1 -fopenmp -x c++ -triple ppc64le-unknown-linux -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -fopenmp-version=45 -DHOST | FileCheck %s // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux -emit-llvm %s -fexceptions -fcxx-exceptions -o - -fsanitize-address-use-after-scope -DCPU | FileCheck %s -// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-linux -fexceptions -fcxx-exceptions -emit-pch -o %t -fopenmp-version=50 %s -DCPU -// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-linux -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -fopenmp-version=50 -DCPU | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-linux -fexceptions -fcxx-exceptions -emit-pch -o %t -fopenmp-version=45 %s -DCPU +// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-linux -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -fopenmp-version=45 -DCPU | FileCheck %s // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple aarch64-unknown-linux -emit-llvm %s -fexceptions -fcxx-exceptions -o - -fsanitize-address-use-after-scope -DCPU | FileCheck %s -// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple aarch64-unknown-linux -fexceptions -fcxx-exceptions -emit-pch -o %t -fopenmp-version=50 %s -DCPU -// RUN: %clang_cc1 -fopenmp -x c++ -triple aarch64-unknown-linux -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -fopenmp-version=50 -DCPU | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple aarch64-unknown-linux -fexceptions -fcxx-exceptions -emit-pch -o %t -fopenmp-version=45 %s -DCPU +// RUN: %clang_cc1 -fopenmp -x c++ -triple aarch64-unknown-linux -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s 
-emit-llvm -o - -fopenmp-version=45 -DCPU | FileCheck %s // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple ppc64le-unknown-linux -emit-llvm %s -fexceptions -fcxx-exceptions -o - -fsanitize-address-use-after-scope -DCPU | FileCheck %s -// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple ppc64le-unknown-linux -fexceptions -fcxx-exceptions -emit-pch -o %t -fopenmp-version=50 %s -DCPU -// RUN: %clang_cc1 -fopenmp -x c++ -triple ppc64le-unknown-linux -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -fopenmp-version=50 -DCPU | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple ppc64le-unknown-linux -fexceptions -fcxx-exceptions -emit-pch -o %t -fopenmp-version=45 %s -DCPU +// RUN: %clang_cc1 -fopenmp -x c++ -triple ppc64le-unknown-linux -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -fopenmp-version=45 -DCPU | FileCheck %s + +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-linux -fopenmp-targets=x86_64-unknown-linux -emit-llvm-bc %s -o %t-host.bc -DCPU +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-linux -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-host.bc -o - -DCPU | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-linux -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-host.bc -emit-pch -o %t -DCPU +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-linux -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-host.bc -include-pch %t -o - -DCPU | FileCheck %s + +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple ppc64le-unknown-linux -fopenmp-targets=ppc64le-unknown-linux -emit-llvm-bc %s -o %t-host.bc -DCPU +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple ppc64le-unknown-linux -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path 
%t-host.bc -o - -DCPU | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple ppc64le-unknown-linux -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-host.bc -emit-pch -o %t -DCPU +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple ppc64le-unknown-linux -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-host.bc -include-pch %t -o - -DCPU | FileCheck %s + +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-linux -fopenmp-targets=x86_64-unknown-linux -emit-llvm-bc %s -o %t-host.bc -DNOHOST +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-linux -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-host.bc -o - -DNOHOST | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-linux -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-host.bc -emit-pch -o %t -DNOHOST +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-linux -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-host.bc -include-pch %t -o - -DNOHOST | FileCheck %s + +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple ppc64le-unknown-linux -fopenmp-targets=ppc64le-unknown-linux -emit-llvm-bc %s -o %t-host.bc -DNOHOST +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple ppc64le-unknown-linux -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-host.bc -o - -DNOHOST | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple ppc64le-unknown-linux -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-host.bc -emit-pch -o %t -DNOHOST +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple ppc64le-unknown-linux -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-host.bc -include-pch %t -o - -DNOHOST | FileCheck %s + +// RUN: %clang_cc1 -verify 
-fopenmp -x c++ -triple x86_64-unknown-linux -emit-llvm %s -fexceptions -fcxx-exceptions -o - -fsanitize-address-use-after-scope -DHOST | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-linux -fexceptions -fcxx-exceptions -emit-pch -o %t %s -DHOST +// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-linux -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -DHOST | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple aarch64-unknown-linux -emit-llvm %s -fexceptions -fcxx-exceptions -o - -fsanitize-address-use-after-scope -DHOST | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple aarch64-unknown-linux -fexceptions -fcxx-exceptions -emit-pch -o %t %s -DHOST +// RUN: %clang_cc1 -fopenmp -x c++ -triple aarch64-unknown-linux -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -DHOST | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple ppc64le-unknown-linux -emit-llvm %s -fexceptions -fcxx-exceptions -o - -fsanitize-address-use-after-scope -DHOST | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple ppc64le-unknown-linux -fexceptions -fcxx-exceptions -emit-pch -o %t %s -DHOST +// RUN: %clang_cc1 -fopenmp -x c++ -triple ppc64le-unknown-linux -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -DHOST | FileCheck %s + +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux -emit-llvm %s -fexceptions -fcxx-exceptions -o - -fsanitize-address-use-after-scope -DCPU | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-linux -fexceptions -fcxx-exceptions -emit-pch -o %t %s -DCPU +// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-linux -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -DCPU | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple aarch64-unknown-linux -emit-llvm %s 
-fexceptions -fcxx-exceptions -o - -fsanitize-address-use-after-scope -DCPU | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple aarch64-unknown-linux -fexceptions -fcxx-exceptions -emit-pch -o %t %s -DCPU +// RUN: %clang_cc1 -fopenmp -x c++ -triple aarch64-unknown-linux -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -DCPU | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple ppc64le-unknown-linux -emit-llvm %s -fexceptions -fcxx-exceptions -o - -fsanitize-address-use-after-scope -DCPU | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple ppc64le-unknown-linux -fexceptions -fcxx-exceptions -emit-pch -o %t %s -DCPU +// RUN: %clang_cc1 -fopenmp -x c++ -triple ppc64le-unknown-linux -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -DCPU | FileCheck %s // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux -fopenmp-targets=x86_64-unknown-linux -emit-llvm-bc %s -o %t-host.bc -DCPU // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-host.bc -o - -DCPU | FileCheck %s diff --git a/clang/test/OpenMP/declare_variant_implementation_vendor_codegen.cpp b/clang/test/OpenMP/declare_variant_implementation_vendor_codegen.cpp index 70e24c027eb82..d4077ce35d813 100644 --- a/clang/test/OpenMP/declare_variant_implementation_vendor_codegen.cpp +++ b/clang/test/OpenMP/declare_variant_implementation_vendor_codegen.cpp @@ -1,6 +1,6 @@ // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple %itanium_abi_triple -emit-llvm %s -fexceptions -fcxx-exceptions -o - -fsanitize-address-use-after-scope -fopenmp-version=45 | FileCheck %s --implicit-check-not='ret i32 {{6|7|8|9|10|12|13|14|15|19|21|22|23|24}}' -// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple %itanium_abi_triple -fexceptions -fcxx-exceptions -emit-pch -o %t -fopenmp-version=50 %s -// RUN: %clang_cc1 -fopenmp -x c++ -triple 
%itanium_abi_triple -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -fopenmp-version=50 | FileCheck %s --implicit-check-not='ret i32 {{6|7|8|9|10|12|13|14|15|19|21|22|23|24}}' +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple %itanium_abi_triple -fexceptions -fcxx-exceptions -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple %itanium_abi_triple -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not='ret i32 {{6|7|8|9|10|12|13|14|15|19|21|22|23|24}}' // expected-no-diagnostics // CHECK-DAG: ret i32 2 diff --git a/clang/test/OpenMP/declare_variant_mixed_codegen.c b/clang/test/OpenMP/declare_variant_mixed_codegen.c index 9eaa35fe3c05b..5516d6793e343 100644 --- a/clang/test/OpenMP/declare_variant_mixed_codegen.c +++ b/clang/test/OpenMP/declare_variant_mixed_codegen.c @@ -1,10 +1,18 @@ // RUN: %clang_cc1 -verify -fopenmp -x c -triple x86_64-unknown-linux -emit-llvm %s -o - | FileCheck %s --check-prefix HOST -// RUN: %clang_cc1 -fopenmp -x c -triple x86_64-unknown-linux -emit-pch -o %t -fopenmp-version=50 %s -// RUN: %clang_cc1 -fopenmp -x c -triple x86_64-unknown-linux -include-pch %t -verify %s -emit-llvm -o - -fopenmp-version=50 | FileCheck %s --check-prefix HOST -// RUN: %clang_cc1 -verify -fopenmp -x c -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -fopenmp-version=50 -// RUN: %clang_cc1 -verify -fopenmp -x c -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-version=50 | FileCheck %s --check-prefix GPU -// RUN: %clang_cc1 -verify -fopenmp -x c -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t -fopenmp-version=50 -// RUN: %clang_cc1 -verify -fopenmp -x c -triple 
nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -o - -fopenmp-version=50 | FileCheck %s --check-prefix GPU +// RUN: %clang_cc1 -fopenmp -x c -triple x86_64-unknown-linux -emit-pch -o %t -fopenmp-version=45 %s +// RUN: %clang_cc1 -fopenmp -x c -triple x86_64-unknown-linux -include-pch %t -verify %s -emit-llvm -o - -fopenmp-version=45 | FileCheck %s --check-prefix HOST +// RUN: %clang_cc1 -verify -fopenmp -x c -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -fopenmp-version=45 +// RUN: %clang_cc1 -verify -fopenmp -x c -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-version=45 | FileCheck %s --check-prefix GPU +// RUN: %clang_cc1 -verify -fopenmp -x c -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t -fopenmp-version=45 +// RUN: %clang_cc1 -verify -fopenmp -x c -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -o - -fopenmp-version=45 | FileCheck %s --check-prefix GPU + +// RUN: %clang_cc1 -verify -fopenmp -x c -triple x86_64-unknown-linux -emit-llvm %s -o - | FileCheck %s --check-prefix HOST +// RUN: %clang_cc1 -fopenmp -x c -triple x86_64-unknown-linux -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c -triple x86_64-unknown-linux -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix HOST +// RUN: %clang_cc1 -verify -fopenmp -x c -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown 
-emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix GPU +// RUN: %clang_cc1 -verify -fopenmp -x c -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t +// RUN: %clang_cc1 -verify -fopenmp -x c -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -o - | FileCheck %s --check-prefix GPU // expected-no-diagnostics #ifndef HEADER @@ -21,7 +29,7 @@ int base(); // HOST-LABEL: define void @foo() // HOST: call i32 @hst(double -1.000000e+00) // HOST: call i32 @hst(double -2.000000e+00) -// HOST: call void [[OFFL:@.+_foo_l28]]() +// HOST: call void [[OFFL:@.+_foo_l36]]() void foo() { base(-1); hst(-2); @@ -36,7 +44,7 @@ void foo() { // HOST: call i32 @hst(double -3.000000e+00) // HOST: call i32 @dev(double -4.000000e+00) -// GPU: define {{.*}}void @__omp_offloading_{{.+}}_foo_l28() +// GPU: define {{.*}}void @__omp_offloading_{{.+}}_foo_l36() // GPU: call i32 @dev(double -3.000000e+00) // GPU: call i32 @dev(double -4.000000e+00) diff --git a/clang/test/OpenMP/declare_variant_mixed_codegen.cpp b/clang/test/OpenMP/declare_variant_mixed_codegen.cpp index ce48ff8e9be7d..d0c3373302f78 100644 --- a/clang/test/OpenMP/declare_variant_mixed_codegen.cpp +++ b/clang/test/OpenMP/declare_variant_mixed_codegen.cpp @@ -1,6 +1,10 @@ +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-linux -emit-llvm %s -fexceptions -fcxx-exceptions -o - -fsanitize-address-use-after-scope | FileCheck %s --implicit-check-not='ret i32 {{6|7|8|10|13|15|19|22|23|24}}' +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-linux -fexceptions -fcxx-exceptions -emit-pch -o %t -fopenmp-version=45 %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-linux -fexceptions -fcxx-exceptions 
-std=c++11 -include-pch %t -verify %s -emit-llvm -o - -fopenmp-version=45 | FileCheck %s --implicit-check-not='ret i32 {{6|7|8|10|13|15|19|22|23|24}}' + // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux -emit-llvm %s -fexceptions -fcxx-exceptions -o - -fsanitize-address-use-after-scope | FileCheck %s --implicit-check-not='ret i32 {{6|7|8|10|13|15|19|22|23|24}}' -// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-linux -fexceptions -fcxx-exceptions -emit-pch -o %t -fopenmp-version=50 %s -// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-linux -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -fopenmp-version=50 | FileCheck %s --implicit-check-not='ret i32 {{6|7|8|10|13|15|19|22|23|24}}' +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-linux -fexceptions -fcxx-exceptions -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-linux -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not='ret i32 {{6|7|8|10|13|15|19|22|23|24}}' // expected-no-diagnostics diff --git a/clang/test/OpenMP/deferred-diags.cpp b/clang/test/OpenMP/deferred-diags.cpp index 98c28aff644ab..037498b7fedbf 100644 --- a/clang/test/OpenMP/deferred-diags.cpp +++ b/clang/test/OpenMP/deferred-diags.cpp @@ -1,6 +1,10 @@ // RUN: %clang_cc1 -triple x86_64 -verify=expected,dev -std=c++11\ // RUN: -verify-ignore-unexpected=note \ -// RUN: -fopenmp -fopenmp-version=50 -o - %s +// RUN: -fopenmp -fopenmp-version=45 -o - %s + +// RUN: %clang_cc1 -triple x86_64 -verify=expected,dev -std=c++11\ +// RUN: -verify-ignore-unexpected=note \ +// RUN: -fopenmp -o - %s // expected-no-diagnostics diff --git a/clang/test/OpenMP/depobj_ast_print.cpp b/clang/test/OpenMP/depobj_ast_print.cpp index b993b4a7bdd4e..3959396f8095c 100644 --- a/clang/test/OpenMP/depobj_ast_print.cpp +++ b/clang/test/OpenMP/depobj_ast_print.cpp @@ -1,10 +1,10 @@ 
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -ast-print %s | FileCheck %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -ast-print %s | FileCheck %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s // expected-no-diagnostics #ifndef HEADER diff --git a/clang/test/OpenMP/depobj_codegen.cpp b/clang/test/OpenMP/depobj_codegen.cpp index bd9b7e0316fb8..202538c31952a 100644 --- a/clang/test/OpenMP/depobj_codegen.cpp +++ b/clang/test/OpenMP/depobj_codegen.cpp @@ -1,10 +1,10 @@ -// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -fopenmp-version=50 -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -fopenmp -triple x86_64-apple-darwin10 -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -triple x86_64-apple-darwin10 -fopenmp-version=50 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp -triple x86_64-apple-darwin10 -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp 
-triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp-simd -fopenmp-version=50 -emit-llvm -o - %s | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -triple x86_64-apple-darwin10 -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -triple x86_64-apple-darwin10 -fopenmp-version=50 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp-simd -emit-llvm -o - %s | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -triple x86_64-apple-darwin10 -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s // SIMD-ONLY0-NOT: {{__kmpc|__tgt}} // expected-no-diagnostics diff --git a/clang/test/OpenMP/depobj_messages.cpp b/clang/test/OpenMP/depobj_messages.cpp index 1b20b1c525e77..f1110265d8bd9 100644 --- a/clang/test/OpenMP/depobj_messages.cpp +++ b/clang/test/OpenMP/depobj_messages.cpp @@ -1,6 +1,6 @@ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -ferror-limit 100 %s -Wuninitialized +// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s -Wuninitialized -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -ferror-limit 100 %s -Wuninitialized +// RUN: %clang_cc1 -verify -fopenmp-simd -ferror-limit 100 %s -Wuninitialized struct S1 { // expected-note 2 {{declared here}} int a; diff --git a/clang/test/OpenMP/distribute_codegen.cpp b/clang/test/OpenMP/distribute_codegen.cpp index 3dce665b90914..fc4d3f19649bb 100644 --- a/clang/test/OpenMP/distribute_codegen.cpp +++ b/clang/test/OpenMP/distribute_codegen.cpp @@ -1,17 +1,25 @@ // Test host codegen. 
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix HCHECK +// RUN: %clang_cc1 -verify -fopenmp-version=45 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix HCHECK +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix HCHECK +// Test host codegen. 
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s // RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix HCHECK // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix HCHECK // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s // RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix HCHECK -// Test host codegen. 
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix HCHECK -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix HCHECK -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix HCHECK + +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -x c++ -triple 
i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// SIMD-ONLY0-NOT: {{__kmpc|__tgt}} // RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s @@ -21,23 +29,15 @@ // RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s // SIMD-ONLY0-NOT: {{__kmpc|__tgt}} -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 
-triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// SIMD-ONLY0-NOT: {{__kmpc|__tgt}} - // Test target codegen - host bc file has to be created first. (no significant differences with host version of target region) -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s -// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s -// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s -// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s -// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path 
%t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s // RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc // RUN: 
%clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s diff --git a/clang/test/OpenMP/distribute_parallel_for_ast_print.cpp b/clang/test/OpenMP/distribute_parallel_for_ast_print.cpp index ec8d595f46cb7..ca265b5f7a58d 100644 --- a/clang/test/OpenMP/distribute_parallel_for_ast_print.cpp +++ b/clang/test/OpenMP/distribute_parallel_for_ast_print.cpp @@ -1,16 +1,16 @@ // RUN: %clang_cc1 -verify -std=c++11 -fopenmp -fopenmp-version=45 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45 // RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s // RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45 -// RUN: %clang_cc1 -verify -std=c++11 -fopenmp -fopenmp-version=50 -ast-print %s -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s -DOMP5 -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 +// RUN: %clang_cc1 -verify -std=c++11 -fopenmp -ast-print %s -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s -DOMP5 +// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 // RUN: %clang_cc1 -verify -std=c++11 -fopenmp-simd -fopenmp-version=45 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45 // 
RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45 -// RUN: %clang_cc1 -verify -std=c++11 -fopenmp-simd -fopenmp-version=50 -ast-print %s -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s -DOMP5 -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 +// RUN: %clang_cc1 -verify -std=c++11 -fopenmp-simd -ast-print %s -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s -DOMP5 +// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 // expected-no-diagnostics #ifndef HEADER diff --git a/clang/test/OpenMP/distribute_parallel_for_lastprivate_messages.cpp b/clang/test/OpenMP/distribute_parallel_for_lastprivate_messages.cpp index 6ff3ad949806c..d534ac58da0a9 100644 --- a/clang/test/OpenMP/distribute_parallel_for_lastprivate_messages.cpp +++ b/clang/test/OpenMP/distribute_parallel_for_lastprivate_messages.cpp @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 %s -Wuninitialized -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 %s -Wuninitialized +// RUN: %clang_cc1 -verify -fopenmp %s -Wuninitialized // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 %s -Wuninitialized -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 %s -Wuninitialized +// RUN: %clang_cc1 -verify -fopenmp-simd %s -Wuninitialized void foo() { } 
diff --git a/clang/test/OpenMP/distribute_parallel_for_messages.cpp b/clang/test/OpenMP/distribute_parallel_for_messages.cpp index 5078bbf0eef2b..e2cade8ae1c19 100644 --- a/clang/test/OpenMP/distribute_parallel_for_messages.cpp +++ b/clang/test/OpenMP/distribute_parallel_for_messages.cpp @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -ferror-limit 100 -std=c++11 -o - %s -fopenmp-version=45 -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -ferror-limit 100 -std=c++11 -o - %s -fopenmp-version=50 -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -ferror-limit 100 -std=c++11 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -ferror-limit 100 -std=c++11 -o - %s -fopenmp-version=45 -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -ferror-limit 100 -std=c++11 -o - %s -fopenmp-version=50 -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -ferror-limit 100 -std=c++11 -o - %s -Wuninitialized void foo() { } diff --git a/clang/test/OpenMP/distribute_parallel_for_reduction_messages.cpp b/clang/test/OpenMP/distribute_parallel_for_reduction_messages.cpp index 828b31e00846a..716842ae250a9 100644 --- a/clang/test/OpenMP/distribute_parallel_for_reduction_messages.cpp +++ b/clang/test/OpenMP/distribute_parallel_for_reduction_messages.cpp @@ -1,16 +1,16 @@ // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -ferror-limit 150 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: 
%clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -ferror-limit 150 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized extern int omp_default_mem_alloc; diff --git a/clang/test/OpenMP/distribute_parallel_for_reduction_task_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_reduction_task_codegen.cpp index 995ded43db3d8..6dc3afe9aa0c8 100644 --- a/clang/test/OpenMP/distribute_parallel_for_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/distribute_parallel_for_reduction_task_codegen.cpp @@ -1,10 +1,10 @@ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-unknown-linux -emit-llvm %s 
-fexceptions -fcxx-exceptions -o - | FileCheck %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -triple x86_64-unknown-linux -fexceptions -fcxx-exceptions -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-unknown-linux -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-linux -fexceptions -fcxx-exceptions -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-linux -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s // SIMD-ONLY0-NOT: {{__kmpc|__tgt}} // expected-no-diagnostics #ifndef HEADER diff --git 
a/clang/test/OpenMP/distribute_parallel_for_simd_ast_print.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_ast_print.cpp index e8785f8c20a09..b807679101de8 100644 --- a/clang/test/OpenMP/distribute_parallel_for_simd_ast_print.cpp +++ b/clang/test/OpenMP/distribute_parallel_for_simd_ast_print.cpp @@ -1,16 +1,16 @@ // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix CHECK --check-prefix OMP45 // RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s // RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix CHECK --check-prefix OMP45 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -ast-print %s -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50 -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s -DOMP5 -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50 +// RUN: %clang_cc1 -verify -fopenmp -ast-print %s -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50 +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s -DOMP5 +// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix CHECK --check-prefix OMP45 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix CHECK --check-prefix 
OMP45 -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -ast-print %s -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50 -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s -DOMP5 -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50 +// RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50 +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s -DOMP5 +// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50 // expected-no-diagnostics #ifndef HEADER diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_if_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_if_codegen.cpp index cb10008f2d8a8..1e6f0c67247f8 100644 --- a/clang/test/OpenMP/distribute_parallel_for_simd_if_codegen.cpp +++ b/clang/test/OpenMP/distribute_parallel_for_simd_if_codegen.cpp @@ -1,16 +1,16 @@ // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix OMP45 // RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple %itanium_abi_triple -emit-pch -o %t %s // RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple %itanium_abi_triple -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix OMP45 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple %itanium_abi_triple -emit-llvm %s -o - | 
FileCheck %s --check-prefix CHECK --check-prefix OMP50 -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple %itanium_abi_triple -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple %itanium_abi_triple -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix OMP50 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix OMP50 +// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple %itanium_abi_triple -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple %itanium_abi_triple -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix OMP50 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple %itanium_abi_triple -emit-pch -o %t %s // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple %itanium_abi_triple -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple %itanium_abi_triple -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 
-fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple %itanium_abi_triple -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple %itanium_abi_triple -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple %itanium_abi_triple -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s // SIMD-ONLY0-NOT: {{__kmpc|__tgt}} // expected-no-diagnostics #ifndef HEADER diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_lastprivate_messages.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_lastprivate_messages.cpp index aeb2ded6fb0c1..504b71904298a 100644 --- a/clang/test/OpenMP/distribute_parallel_for_simd_lastprivate_messages.cpp +++ b/clang/test/OpenMP/distribute_parallel_for_simd_lastprivate_messages.cpp @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 %s -Wno-openmp-mapping -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 %s -Wno-openmp-mapping -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp %s -Wno-openmp-mapping -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 %s -Wno-openmp-mapping -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 %s -Wno-openmp-mapping -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd %s -Wno-openmp-mapping -Wuninitialized extern int omp_default_mem_alloc; void foo() { diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_loop_messages.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_loop_messages.cpp index 
8f7f3c44f7a77..8195ed655f858 100644 --- a/clang/test/OpenMP/distribute_parallel_for_simd_loop_messages.cpp +++ b/clang/test/OpenMP/distribute_parallel_for_simd_loop_messages.cpp @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -fsyntax-only -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp4 %s -Wno-openmp-mapping -Wuninitialized -// RUN: %clang_cc1 -fsyntax-only -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp5 %s -Wno-openmp-mapping -Wuninitialized +// RUN: %clang_cc1 -fsyntax-only -fopenmp -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp5 %s -Wno-openmp-mapping -Wuninitialized // RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp4 %s -Wno-openmp-mapping -Wuninitialized -// RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp5 %s -Wno-openmp-mapping -Wuninitialized +// RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp5 %s -Wno-openmp-mapping -Wuninitialized class S { int a; diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_misc_messages.c b/clang/test/OpenMP/distribute_parallel_for_simd_misc_messages.c index 57c11edda7d81..6cdf7363a2a3e 100644 --- a/clang/test/OpenMP/distribute_parallel_for_simd_misc_messages.c +++ b/clang/test/OpenMP/distribute_parallel_for_simd_misc_messages.c @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -fsyntax-only -fopenmp -fopenmp-version=45 -verify=expected,omp45 %s -Wuninitialized -// RUN: %clang_cc1 -fsyntax-only -fopenmp -fopenmp-version=50 -verify=expected,omp50 %s -Wuninitialized +// RUN: %clang_cc1 -fsyntax-only -fopenmp -verify=expected,omp50 %s -Wuninitialized // RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -fopenmp-version=45 -verify=expected,omp45 %s -Wuninitialized -// RUN: %clang_cc1 -fsyntax-only 
-fopenmp-simd -fopenmp-version=50 -verify=expected,omp50 %s -Wuninitialized +// RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -verify=expected,omp50 %s -Wuninitialized void xxx(int argc) { int x; // expected-note {{initialize the variable 'x' to silence this warning}} diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_reduction_messages.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_reduction_messages.cpp index 84e79c84a2b27..91af8bec917c0 100644 --- a/clang/test/OpenMP/distribute_parallel_for_simd_reduction_messages.cpp +++ b/clang/test/OpenMP/distribute_parallel_for_simd_reduction_messages.cpp @@ -1,16 +1,16 @@ // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -ferror-limit 150 -o - %s -Wno-openmp-mapping -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -std=c++98 -ferror-limit 150 -o - %s -Wno-openmp-mapping -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -std=c++11 -ferror-limit 150 -o - %s -Wno-openmp-mapping -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -ferror-limit 150 -o - %s -Wno-openmp-mapping -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -std=c++98 -ferror-limit 150 -o - %s -Wno-openmp-mapping -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -std=c++11 -ferror-limit 150 -o - %s -Wno-openmp-mapping -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -ferror-limit 150 -o - %s -Wno-openmp-mapping -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -std=c++98 -ferror-limit 150 -o - %s -Wno-openmp-mapping -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -std=c++11 -ferror-limit 150 -o - %s -Wno-openmp-mapping -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -ferror-limit 150 -o - %s -Wno-openmp-mapping -Wuninitialized // RUN: %clang_cc1 
-verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -std=c++98 -ferror-limit 150 -o - %s -Wno-openmp-mapping -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -std=c++11 -ferror-limit 150 -o - %s -Wno-openmp-mapping -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -ferror-limit 150 -o - %s -Wno-openmp-mapping -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -std=c++98 -ferror-limit 150 -o - %s -Wno-openmp-mapping -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -std=c++11 -ferror-limit 150 -o - %s -Wno-openmp-mapping -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -ferror-limit 150 -o - %s -Wno-openmp-mapping -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -std=c++98 -ferror-limit 150 -o - %s -Wno-openmp-mapping -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -std=c++11 -ferror-limit 150 -o - %s -Wno-openmp-mapping -Wuninitialized extern int omp_default_mem_alloc; void xxx(int argc) { diff --git a/clang/test/OpenMP/distribute_simd_ast_print.cpp b/clang/test/OpenMP/distribute_simd_ast_print.cpp index d77b274dea13e..e87453b4cc5d7 100644 --- a/clang/test/OpenMP/distribute_simd_ast_print.cpp +++ b/clang/test/OpenMP/distribute_simd_ast_print.cpp @@ -1,16 +1,16 @@ -// RUN: %clang_cc1 -verify -fopenmp -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45 -// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -ast-print %s -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 
-x c++ -std=c++11 -emit-pch -o %t %s -DOMP5 -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 - -// RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45 -// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45 -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -ast-print %s -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s -DOMP5 -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45 +// RUN: %clang_cc1 -verify -fopenmp -ast-print %s -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s -DOMP5 +// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 + +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -ast-print %s -Wno-openmp-mapping | FileCheck %s 
--check-prefix=CHECK --check-prefix=OMP45 +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45 +// RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s -DOMP5 +// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 // expected-no-diagnostics #ifndef HEADER diff --git a/clang/test/OpenMP/distribute_simd_codegen.cpp b/clang/test/OpenMP/distribute_simd_codegen.cpp index 5de8ac492b884..cc6239a487d11 100644 --- a/clang/test/OpenMP/distribute_simd_codegen.cpp +++ b/clang/test/OpenMP/distribute_simd_codegen.cpp @@ -1,65 +1,65 @@ // Test host codegen. 
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix OMP45 -// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix HCHECK --check-prefix OMP45 -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix HCHECK --check-prefix OMP45 -// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix HCHECK --check-prefix OMP45 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix OMP50 -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s -DOMP5 -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix HCHECK --check-prefix OMP50 -// RUN: %clang_cc1 
-verify -fopenmp -fopenmp-version=50 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - -DOMP5| FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix HCHECK --check-prefix OMP50 -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s -DOMP5 -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix HCHECK --check-prefix OMP50 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix OMP45 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix HCHECK --check-prefix OMP45 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix HCHECK --check-prefix OMP45 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK 
--check-prefix CHECK-32 --check-prefix HCHECK --check-prefix OMP45 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix OMP50 +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s -DOMP5 +// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix HCHECK --check-prefix OMP50 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - -DOMP5| FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix HCHECK --check-prefix OMP50 +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s -DOMP5 +// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix HCHECK --check-prefix OMP50 -// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown 
-fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - -DOMP5 | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s -DOMP5 -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -DOMP5 | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - -DOMP5 | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s -DOMP5 -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -DOMP5 | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown 
-fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - -DOMP5 | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s -DOMP5 +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -DOMP5 | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - -DOMP5 | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s -DOMP5 +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -DOMP5 | FileCheck --check-prefix SIMD-ONLY0 %s // SIMD-ONLY0-NOT: 
{{__kmpc|__tgt}} // Test target codegen - host bc file has to be created first. (no significant differences with host version of target region) -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix OMP45 -// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s -// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix OMP45 -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix OMP45 -// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s -// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix OMP45 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -x c++ -triple 
powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc -DOMP5 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50 -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s -DOMP5 -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc -DOMP5 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50 -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s -DOMP5 -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple 
powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix OMP45 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix OMP45 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix OMP45 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix OMP45 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o 
%t-ppc-host.bc -DOMP5 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50 +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s -DOMP5 +// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc -DOMP5 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50 +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s -DOMP5 +// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50 -// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device 
-fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s -// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s -// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s -// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc -DOMP5 -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -DOMP5 | FileCheck --check-prefix SIMD-ONLY1 %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown 
-fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s -DOMP5 -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - -DOMP5 | FileCheck --check-prefix SIMD-ONLY1 %s -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc -DOMP5 -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -DOMP5 | FileCheck --check-prefix SIMD-ONLY1 %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s -DOMP5 -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - -DOMP5 | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch 
-fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc -DOMP5 +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -DOMP5 | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s -DOMP5 +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple 
powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - -DOMP5 | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc -DOMP5 +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -DOMP5 | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s -DOMP5 +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - -DOMP5 | FileCheck --check-prefix SIMD-ONLY1 %s // SIMD-ONLY1-NOT: {{__kmpc|__tgt}} // expected-no-diagnostics diff --git a/clang/test/OpenMP/distribute_simd_if_messages.cpp b/clang/test/OpenMP/distribute_simd_if_messages.cpp index 1579fb646ecb7..b1d0b34864fa3 100644 --- a/clang/test/OpenMP/distribute_simd_if_messages.cpp +++ b/clang/test/OpenMP/distribute_simd_if_messages.cpp @@ -1,6 +1,6 @@ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 %s -Wuninitialized +// RUN: %clang_cc1 -verify -fopenmp %s -Wuninitialized -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 %s -Wuninitialized +// RUN: %clang_cc1 -verify -fopenmp-simd %s -Wuninitialized void foo() { } diff --git a/clang/test/OpenMP/distribute_simd_lastprivate_messages.cpp b/clang/test/OpenMP/distribute_simd_lastprivate_messages.cpp index 8929b885d1120..8e471890c0193 100644 --- a/clang/test/OpenMP/distribute_simd_lastprivate_messages.cpp +++ 
b/clang/test/OpenMP/distribute_simd_lastprivate_messages.cpp @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-version=45 -fopenmp %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-version=50 -fopenmp %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-version=45 -fopenmp-simd %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-version=50 -fopenmp-simd %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd %s -Wuninitialized extern int omp_default_mem_alloc; void foo() { diff --git a/clang/test/OpenMP/distribute_simd_loop_messages.cpp b/clang/test/OpenMP/distribute_simd_loop_messages.cpp index 5838ac7592a2a..5a55f9569b8dc 100644 --- a/clang/test/OpenMP/distribute_simd_loop_messages.cpp +++ b/clang/test/OpenMP/distribute_simd_loop_messages.cpp @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -fsyntax-only -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp4 %s -Wuninitialized -// RUN: %clang_cc1 -fsyntax-only -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp5 %s -Wuninitialized +// RUN: %clang_cc1 -fsyntax-only -fopenmp -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp5 %s -Wuninitialized // RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp4 %s -Wuninitialized -// RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp5 %s -Wuninitialized +// RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp5 %s -Wuninitialized class S5 { int a; diff --git a/clang/test/OpenMP/distribute_simd_misc_messages.c b/clang/test/OpenMP/distribute_simd_misc_messages.c index fa0cdd0e939d3..2403f2f99680f 100644 --- 
a/clang/test/OpenMP/distribute_simd_misc_messages.c +++ b/clang/test/OpenMP/distribute_simd_misc_messages.c @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -fsyntax-only -fopenmp -fopenmp-version=45 -verify=expected,omp45 %s -Wuninitialized -// RUN: %clang_cc1 -fsyntax-only -fopenmp -fopenmp-version=50 -verify=expected,omp50 %s -Wuninitialized +// RUN: %clang_cc1 -fsyntax-only -fopenmp -verify=expected,omp50 %s -Wuninitialized // RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -fopenmp-version=45 -verify=expected,omp45 %s -Wuninitialized -// RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -fopenmp-version=50 -verify=expected,omp50 %s -Wuninitialized +// RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -verify=expected,omp50 %s -Wuninitialized void xxx(int argc) { int x; // expected-note {{initialize the variable 'x' to silence this warning}} diff --git a/clang/test/OpenMP/distribute_simd_reduction_messages.cpp b/clang/test/OpenMP/distribute_simd_reduction_messages.cpp index aba7b7133b0c5..201a4740d1d5c 100644 --- a/clang/test/OpenMP/distribute_simd_reduction_messages.cpp +++ b/clang/test/OpenMP/distribute_simd_reduction_messages.cpp @@ -1,16 +1,16 @@ // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -ferror-limit 150 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 
-verify=expected,omp50 -fopenmp -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -ferror-limit 150 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized extern int omp_default_mem_alloc; void xxx(int argc) { diff --git a/clang/test/OpenMP/flush_codegen.cpp b/clang/test/OpenMP/flush_codegen.cpp index 36ab677a7e820..029166ef5100c 100644 --- a/clang/test/OpenMP/flush_codegen.cpp +++ b/clang/test/OpenMP/flush_codegen.cpp @@ -1,13 +1,13 @@ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t 
-verify %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -fopenmp-enable-irbuilder -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -fopenmp-enable-irbuilder -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -fopenmp-enable-irbuilder -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-enable-irbuilder -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -fopenmp-enable-irbuilder -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-enable-irbuilder -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited 
-std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s // SIMD-ONLY0-NOT: {{__kmpc|__tgt}} // expected-no-diagnostics #ifndef HEADER diff --git a/clang/test/OpenMP/flush_messages.cpp b/clang/test/OpenMP/flush_messages.cpp index 7d20e385bfafa..446d3e963ba4d 100644 --- a/clang/test/OpenMP/flush_messages.cpp +++ b/clang/test/OpenMP/flush_messages.cpp @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -ferror-limit 100 %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -ferror-limit 100 %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -ferror-limit 100 %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -ferror-limit 100 %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -ferror-limit 100 %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -ferror-limit 100 %s -Wuninitialized struct S1 { // expected-note 2 {{declared here}} int a; diff --git a/clang/test/OpenMP/for_ast_print.cpp b/clang/test/OpenMP/for_ast_print.cpp index a1ae55ae7f7bf..15187becf2e8c 100644 --- a/clang/test/OpenMP/for_ast_print.cpp +++ b/clang/test/OpenMP/for_ast_print.cpp @@ -1,10 +1,10 @@ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -ast-print %s | FileCheck %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: 
%clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -ast-print %s | FileCheck %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s // expected-no-diagnostics #ifndef HEADER diff --git a/clang/test/OpenMP/for_codegen.cpp b/clang/test/OpenMP/for_codegen.cpp index 3d235f4770a6b..3c26cb020cdd2 100644 --- a/clang/test/OpenMP/for_codegen.cpp +++ b/clang/test/OpenMP/for_codegen.cpp @@ -1,15 +1,15 @@ // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - -fsanitize-address-use-after-scope | FileCheck %s --check-prefix=CHECK --check-prefix=LIFETIME --check-prefix=OMP45 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - -fsanitize-address-use-after-scope | FileCheck %s --check-prefix=CHECK --check-prefix=LIFETIME --check-prefix=OMP5 -// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t -fopenmp-version=50 %s -// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s 
-emit-llvm -o - -fopenmp-version=50 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP5 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - -fsanitize-address-use-after-scope | FileCheck %s --check-prefix=CHECK --check-prefix=LIFETIME --check-prefix=OMP5 +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK --check-prefix=OMP5 // RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s // RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -fopenmp-version=45 -emit-llvm -o - | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45 // RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -fexceptions -fcxx-exceptions -debug-info-kind=line-tables-only -gno-column-info -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=TERM_DEBUG // RUN: %clang_cc1 -main-file-name for_codegen.cpp %s -o - -emit-llvm -fprofile-instrument=clang -fprofile-instrument-path=for_codegen-test.profraw | FileCheck %s --check-prefix=PROF-INSTR-PATH // RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t -fopenmp-version=50 %s -// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -fopenmp-version=50 | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: 
%clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s // RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s // RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp-simd -fexceptions -fcxx-exceptions -debug-info-kind=line-tables-only -x c++ -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s diff --git a/clang/test/OpenMP/for_collapse_messages.cpp b/clang/test/OpenMP/for_collapse_messages.cpp index d2cfccd13ae53..07974be8da967 100644 --- a/clang/test/OpenMP/for_collapse_messages.cpp +++ b/clang/test/OpenMP/for_collapse_messages.cpp @@ -1,16 +1,16 @@ // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -std=c++98 %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -std=c++11 %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -std=c++98 %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -std=c++11 %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -std=c++98 %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -std=c++11 %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd 
-fopenmp-version=45 %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -std=c++98 %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -std=c++11 %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -std=c++98 %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -std=c++11 %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -std=c++98 %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -std=c++11 %s -Wuninitialized void foo() { } diff --git a/clang/test/OpenMP/for_lastprivate_codegen.cpp b/clang/test/OpenMP/for_lastprivate_codegen.cpp index fbbb6ad6bc3d5..4fc7b2061ae21 100644 --- a/clang/test/OpenMP/for_lastprivate_codegen.cpp +++ b/clang/test/OpenMP/for_lastprivate_codegen.cpp @@ -1,24 +1,24 @@ -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s -// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s -// RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -DOMP5 -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=OMP50 -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -DOMP5 -x c++ -std=c++11 -triple 
x86_64-apple-darwin10 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -DOMP5 -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=CHECK -check-prefix=OMP50 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -DOMP5 -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -DOMP5 -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s - -// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -DOMP5 -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -DOMP5 -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -DOMP5 -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -DOMP5 -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: 
%clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -DOMP5 -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s +// RUN: %clang_cc1 -verify -fopenmp -DOMP5 -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=OMP50 +// RUN: %clang_cc1 -fopenmp -DOMP5 -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -DOMP5 -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=CHECK -check-prefix=OMP50 +// RUN: %clang_cc1 -verify -fopenmp -DOMP5 -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s +// RUN: %clang_cc1 -verify -fopenmp -DOMP5 -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s + +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x 
c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -DOMP5 -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -DOMP5 -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -DOMP5 -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -DOMP5 -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -DOMP5 -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s // SIMD-ONLY0-NOT: {{__kmpc|__tgt}} // expected-no-diagnostics #ifndef HEADER diff --git a/clang/test/OpenMP/for_lastprivate_messages.cpp b/clang/test/OpenMP/for_lastprivate_messages.cpp index 5ad8552a4262f..b728f35d7dc33 100644 --- a/clang/test/OpenMP/for_lastprivate_messages.cpp +++ b/clang/test/OpenMP/for_lastprivate_messages.cpp @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-version=45 -fopenmp %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-version=50 -fopenmp %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-version=45 -fopenmp-simd %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 
-fopenmp-version=50 -fopenmp-simd %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd %s -Wuninitialized extern int omp_default_mem_alloc; void foo() { diff --git a/clang/test/OpenMP/for_loop_messages.cpp b/clang/test/OpenMP/for_loop_messages.cpp index b423bfa0f31fa..a1bc9bd3e045e 100644 --- a/clang/test/OpenMP/for_loop_messages.cpp +++ b/clang/test/OpenMP/for_loop_messages.cpp @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -fsyntax-only -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp4 %s -Wuninitialized -// RUN: %clang_cc1 -fsyntax-only -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp5 %s -Wuninitialized +// RUN: %clang_cc1 -fsyntax-only -fopenmp -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp5 %s -Wuninitialized // RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp4 %s -Wuninitialized -// RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp5 %s -Wuninitialized +// RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp5 %s -Wuninitialized class S { int a; diff --git a/clang/test/OpenMP/for_misc_messages.c b/clang/test/OpenMP/for_misc_messages.c index f03da5879fda0..bb8e55fb5d4b7 100644 --- a/clang/test/OpenMP/for_misc_messages.c +++ b/clang/test/OpenMP/for_misc_messages.c @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -fsyntax-only -fopenmp -fopenmp-version=45 -triple x86_64-unknown-unknown -verify=expected,omp45 %s -Wuninitialized -// RUN: %clang_cc1 -fsyntax-only -fopenmp -fopenmp-version=50 -triple x86_64-unknown-unknown -verify=expected,omp50 %s -Wuninitialized +// RUN: %clang_cc1 -fsyntax-only -fopenmp -triple x86_64-unknown-unknown -verify=expected,omp50 %s -Wuninitialized // RUN: %clang_cc1 -fsyntax-only -fopenmp-simd 
-fopenmp-version=45 -triple x86_64-unknown-unknown -verify=expected,omp45 %s -Wuninitialized -// RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -fopenmp-version=50 -triple x86_64-unknown-unknown -verify=expected,omp50 %s -Wuninitialized +// RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -triple x86_64-unknown-unknown -verify=expected,omp50 %s -Wuninitialized void xxx(int argc) { int x; // expected-note {{initialize the variable 'x' to silence this warning}} diff --git a/clang/test/OpenMP/for_reduction_messages.cpp b/clang/test/OpenMP/for_reduction_messages.cpp index fd9a414b679ed..18bb2c68b824c 100644 --- a/clang/test/OpenMP/for_reduction_messages.cpp +++ b/clang/test/OpenMP/for_reduction_messages.cpp @@ -1,16 +1,16 @@ // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -ferror-limit 150 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -ferror-limit 150 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized 
// RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized extern int omp_default_mem_alloc; void xxx(int argc) { diff --git a/clang/test/OpenMP/for_reduction_task_codegen.cpp b/clang/test/OpenMP/for_reduction_task_codegen.cpp index 0018e109aaed9..4cf4adee66d24 100644 --- a/clang/test/OpenMP/for_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/for_reduction_task_codegen.cpp @@ -1,10 +1,10 @@ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-unknown-linux -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -triple x86_64-unknown-linux -fexceptions -fcxx-exceptions -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-unknown-linux -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-linux -fexceptions -fcxx-exceptions -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-linux -fexceptions 
-fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s // SIMD-ONLY0-NOT: {{__kmpc|__tgt}} // expected-no-diagnostics #ifndef HEADER diff --git a/clang/test/OpenMP/for_scan_codegen.cpp b/clang/test/OpenMP/for_scan_codegen.cpp index 9905e4a67f77b..aba59bb4d0f4d 100644 --- a/clang/test/OpenMP/for_scan_codegen.cpp +++ b/clang/test/OpenMP/for_scan_codegen.cpp @@ -1,10 +1,10 @@ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ 
-std=c++11 -triple x86_64-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s // SIMD-ONLY0-NOT: {{__kmpc|__tgt}} // expected-no-diagnostics #ifndef HEADER diff --git a/clang/test/OpenMP/for_schedule_messages.cpp b/clang/test/OpenMP/for_schedule_messages.cpp index f4e8bb19d2459..9731e48743e6f 100644 --- a/clang/test/OpenMP/for_schedule_messages.cpp +++ b/clang/test/OpenMP/for_schedule_messages.cpp @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd %s -Wuninitialized void foo() { } diff --git 
a/clang/test/OpenMP/for_simd_ast_print.cpp b/clang/test/OpenMP/for_simd_ast_print.cpp index 6e346b97a9321..5334754420cf8 100644 --- a/clang/test/OpenMP/for_simd_ast_print.cpp +++ b/clang/test/OpenMP/for_simd_ast_print.cpp @@ -1,16 +1,16 @@ -// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45 -// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45 -// RUN: %clang_cc1 -verify -fopenmp -ast-print %s -fopenmp-version=50 -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 -// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s -fopenmp-version=50 -DOMP5 -// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -fopenmp-version=50 -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 - -// RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45 -// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45 -// RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s -fopenmp-version=50 -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 -// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s -fopenmp-version=50 -DOMP5 -// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -fopenmp-version=50 -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -ast-print %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 
-include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45 +// RUN: %clang_cc1 -verify -fopenmp -ast-print %s -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s -DOMP5 +// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 + +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -ast-print %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45 +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45 +// RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s -DOMP5 +// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 // expected-no-diagnostics #ifndef HEADER diff --git a/clang/test/OpenMP/for_simd_codegen.cpp b/clang/test/OpenMP/for_simd_codegen.cpp index 5bb9811bcedf4..2d1b0f073e2e7 100644 --- a/clang/test/OpenMP/for_simd_codegen.cpp +++ b/clang/test/OpenMP/for_simd_codegen.cpp @@ -1,20 +1,20 @@ -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm -fexceptions -fcxx-exceptions -o - < %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45 -// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t < %s -// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify -emit-llvm -o - < %s | FileCheck 
%s --check-prefix=CHECK --check-prefix=OMP45 -// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -fexceptions -fcxx-exceptions -debug-info-kind=line-tables-only -x c++ -emit-llvm -o - < %s | FileCheck %s --check-prefix=TERM_DEBUG -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm -fexceptions -fcxx-exceptions -o - < %s -fopenmp-version=50 -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 -// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t < %s -fopenmp-version=50 -DOMP5 -// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify -emit-llvm -o - -fopenmp-version=50 -DOMP5 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 -// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -fexceptions -fcxx-exceptions -debug-info-kind=line-tables-only -x c++ -emit-llvm -o - < %s -fopenmp-version=50 -DOMP5 | FileCheck %s --check-prefix=TERM_DEBUG - -// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple x86_64-unknown-unknown -emit-llvm -fexceptions -fcxx-exceptions -o - < %s | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t < %s -// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify -emit-llvm -o - < %s | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp-simd -fexceptions -fcxx-exceptions -debug-info-kind=line-tables-only -x c++ -emit-llvm -o - < %s | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple x86_64-unknown-unknown -emit-llvm -fexceptions -fcxx-exceptions -o - < %s -fopenmp-version=50 -DOMP5 | 
FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t < %s -fopenmp-version=50 -DOMP5 -// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify -fopenmp-version=50 -DOMP5 -emit-llvm -o - < %s | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp-simd -fexceptions -fcxx-exceptions -debug-info-kind=line-tables-only -x c++ -emit-llvm -o - < %s -fopenmp-version=50 -DOMP5 | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-unknown -emit-llvm -fexceptions -fcxx-exceptions -o - < %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t < %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify -emit-llvm -o - < %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45 +// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -debug-info-kind=line-tables-only -x c++ -emit-llvm -o - < %s | FileCheck %s --check-prefix=TERM_DEBUG +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm -fexceptions -fcxx-exceptions -o - < %s -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t < %s -DOMP5 +// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify 
-emit-llvm -o - -DOMP5 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 +// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -fexceptions -fcxx-exceptions -debug-info-kind=line-tables-only -x c++ -emit-llvm -o - < %s -DOMP5 | FileCheck %s --check-prefix=TERM_DEBUG + +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -x c++ -triple x86_64-unknown-unknown -emit-llvm -fexceptions -fcxx-exceptions -o - < %s | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t < %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify -emit-llvm -o - < %s | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp-simd -fopenmp-version=45 -fexceptions -fcxx-exceptions -debug-info-kind=line-tables-only -x c++ -emit-llvm -o - < %s | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple x86_64-unknown-unknown -emit-llvm -fexceptions -fcxx-exceptions -o - < %s -DOMP5 | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t < %s -DOMP5 +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify -DOMP5 -emit-llvm -o - < %s | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp-simd -fexceptions -fcxx-exceptions -debug-info-kind=line-tables-only -x c++ -emit-llvm -o - < %s -DOMP5 | FileCheck --check-prefix SIMD-ONLY0 %s // SIMD-ONLY0-NOT: {{__kmpc|__tgt}} // expected-no-diagnostics #ifndef HEADER diff --git 
a/clang/test/OpenMP/for_simd_if_messages.cpp b/clang/test/OpenMP/for_simd_if_messages.cpp index 6dd9dd30871e2..76913610e53e7 100644 --- a/clang/test/OpenMP/for_simd_if_messages.cpp +++ b/clang/test/OpenMP/for_simd_if_messages.cpp @@ -1,6 +1,6 @@ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 %s -Wuninitialized +// RUN: %clang_cc1 -verify -fopenmp %s -Wuninitialized -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 %s -Wuninitialized +// RUN: %clang_cc1 -verify -fopenmp-simd %s -Wuninitialized void foo() { } diff --git a/clang/test/OpenMP/for_simd_lastprivate_messages.cpp b/clang/test/OpenMP/for_simd_lastprivate_messages.cpp index 023f56c1f2bd2..35d9dfe2b9542 100644 --- a/clang/test/OpenMP/for_simd_lastprivate_messages.cpp +++ b/clang/test/OpenMP/for_simd_lastprivate_messages.cpp @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-version=45 -fopenmp %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-version=50 -fopenmp %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-version=45 -fopenmp-simd %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-version=50 -fopenmp-simd %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd %s -Wuninitialized extern int omp_default_mem_alloc; void foo() { diff --git a/clang/test/OpenMP/for_simd_loop_messages.cpp b/clang/test/OpenMP/for_simd_loop_messages.cpp index 6c5820ab7c451..1cc5988ea8092 100644 --- a/clang/test/OpenMP/for_simd_loop_messages.cpp +++ b/clang/test/OpenMP/for_simd_loop_messages.cpp @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -fsyntax-only -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp4 %s -Wuninitialized -// RUN: %clang_cc1 -fsyntax-only -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp5 %s -Wuninitialized +// RUN: %clang_cc1 -fsyntax-only 
-fopenmp -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp5 %s -Wuninitialized // RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp4 %s -Wuninitialized -// RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp5 %s -Wuninitialized +// RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp5 %s -Wuninitialized class S { int a; diff --git a/clang/test/OpenMP/for_simd_misc_messages.c b/clang/test/OpenMP/for_simd_misc_messages.c index fb3ed365d02ba..bd0d86baaaf88 100644 --- a/clang/test/OpenMP/for_simd_misc_messages.c +++ b/clang/test/OpenMP/for_simd_misc_messages.c @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -fsyntax-only -fopenmp -fopenmp-version=45 -verify=expected,omp45 %s -Wuninitialized -// RUN: %clang_cc1 -fsyntax-only -fopenmp -fopenmp-version=50 -verify=expected,omp50 %s -Wuninitialized +// RUN: %clang_cc1 -fsyntax-only -fopenmp -verify=expected,omp50 %s -Wuninitialized // RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -fopenmp-version=45 -verify=expected,omp45 -verify %s -Wuninitialized -// RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -fopenmp-version=50 -verify=expected,omp50 -verify %s -Wuninitialized +// RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -verify=expected,omp50 -verify %s -Wuninitialized void xxx(int argc) { int x; // expected-note {{initialize the variable 'x' to silence this warning}} diff --git a/clang/test/OpenMP/for_simd_reduction_messages.cpp b/clang/test/OpenMP/for_simd_reduction_messages.cpp index f61a75951bc68..bd8869e73eec9 100644 --- a/clang/test/OpenMP/for_simd_reduction_messages.cpp +++ b/clang/test/OpenMP/for_simd_reduction_messages.cpp @@ -1,16 +1,16 @@ // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 
-std=c++98 %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -std=c++11 %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -std=c++98 %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -std=c++11 %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -std=c++98 %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -std=c++11 %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -std=c++98 %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -std=c++11 %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -std=c++98 %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -std=c++11 %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -std=c++98 %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -std=c++11 %s -Wuninitialized extern int omp_default_mem_alloc; void xxx(int argc) { diff --git a/clang/test/OpenMP/for_simd_scan_codegen.cpp b/clang/test/OpenMP/for_simd_scan_codegen.cpp index 2c7f53a0aa36c..3a06cf3a8a1d5 100644 --- a/clang/test/OpenMP/for_simd_scan_codegen.cpp +++ b/clang/test/OpenMP/for_simd_scan_codegen.cpp @@ -1,10 +1,10 @@ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s -// RUN: %clang_cc1 
-fopenmp -fopenmp-version=50 -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s // SIMD-ONLY0-NOT: {{__kmpc|__tgt}} // expected-no-diagnostics #ifndef HEADER diff --git a/clang/test/OpenMP/master_taskloop_ast_print.cpp b/clang/test/OpenMP/master_taskloop_ast_print.cpp index dcd53450739fb..4f58d03250de4 100644 --- a/clang/test/OpenMP/master_taskloop_ast_print.cpp +++ b/clang/test/OpenMP/master_taskloop_ast_print.cpp @@ -1,10 +1,10 @@ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -ast-print %s | FileCheck %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t 
%s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -ast-print %s | FileCheck %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s // expected-no-diagnostics #ifndef HEADER diff --git a/clang/test/OpenMP/master_taskloop_codegen.cpp b/clang/test/OpenMP/master_taskloop_codegen.cpp index 9d4c3ea53daa9..37f1344cc596a 100644 --- a/clang/test/OpenMP/master_taskloop_codegen.cpp +++ b/clang/test/OpenMP/master_taskloop_codegen.cpp @@ -1,10 +1,10 @@ -// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -fopenmp-version=50 -x c++ -emit-llvm %s -o - | FileCheck %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -x c++ -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -verify -triple 
x86_64-apple-darwin10 -fopenmp-simd -fopenmp-version=50 -x c++ -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp-simd -x c++ -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s // SIMD-ONLY0-NOT: {{__kmpc|__tgt}} // expected-no-diagnostics #ifndef HEADER diff --git a/clang/test/OpenMP/master_taskloop_lastprivate_messages.cpp b/clang/test/OpenMP/master_taskloop_lastprivate_messages.cpp index 37e1c22a9666c..69954e5e7f34d 100644 --- a/clang/test/OpenMP/master_taskloop_lastprivate_messages.cpp +++ b/clang/test/OpenMP/master_taskloop_lastprivate_messages.cpp @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-version=45 -fopenmp %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-version=50 -fopenmp %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-version=45 -fopenmp-simd %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-version=50 -fopenmp-simd %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd %s -Wuninitialized typedef void **omp_allocator_handle_t; extern const omp_allocator_handle_t omp_null_allocator; diff --git a/clang/test/OpenMP/master_taskloop_loop_messages.cpp b/clang/test/OpenMP/master_taskloop_loop_messages.cpp index 10a60f306a904..e5f65e427adf5 100644 --- 
a/clang/test/OpenMP/master_taskloop_loop_messages.cpp +++ b/clang/test/OpenMP/master_taskloop_loop_messages.cpp @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -fsyntax-only -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp4 %s -Wuninitialized -// RUN: %clang_cc1 -fsyntax-only -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp5 %s -Wuninitialized +// RUN: %clang_cc1 -fsyntax-only -fopenmp -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp5 %s -Wuninitialized // RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp4 %s -Wuninitialized -// RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp5 %s -Wuninitialized +// RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp5 %s -Wuninitialized class S { int a; diff --git a/clang/test/OpenMP/master_taskloop_reduction_messages.cpp b/clang/test/OpenMP/master_taskloop_reduction_messages.cpp index 4fb8c66f47447..b4c6970d27c96 100644 --- a/clang/test/OpenMP/master_taskloop_reduction_messages.cpp +++ b/clang/test/OpenMP/master_taskloop_reduction_messages.cpp @@ -1,16 +1,16 @@ // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -ferror-limit 150 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp 
-fopenmp-version=50 -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -ferror-limit 150 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized typedef void **omp_allocator_handle_t; extern const omp_allocator_handle_t omp_null_allocator; diff --git a/clang/test/OpenMP/master_taskloop_simd_ast_print.cpp b/clang/test/OpenMP/master_taskloop_simd_ast_print.cpp index f5822014b1241..54df1bbccc69f 100644 --- a/clang/test/OpenMP/master_taskloop_simd_ast_print.cpp +++ b/clang/test/OpenMP/master_taskloop_simd_ast_print.cpp @@ -1,16 +1,16 @@ // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -ast-print %s | FileCheck %s --check-prefix CHECK --check-prefix OMP45 // RUN: %clang_cc1 -fopenmp 
-fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s // RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix CHECK --check-prefix OMP45 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -ast-print %s -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50 -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s -DOMP5 -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50 +// RUN: %clang_cc1 -verify -fopenmp -ast-print %s -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50 +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s -DOMP5 +// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -ast-print %s | FileCheck %s --check-prefix CHECK --check-prefix OMP45 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix CHECK --check-prefix OMP45 -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -ast-print %s -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50 -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s -DOMP5 -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50 +// RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50 +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s -DOMP5 +// RUN: 
%clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50 // expected-no-diagnostics #ifndef HEADER diff --git a/clang/test/OpenMP/master_taskloop_simd_codegen.cpp b/clang/test/OpenMP/master_taskloop_simd_codegen.cpp index 194815c059e72..e54aabf1de218 100644 --- a/clang/test/OpenMP/master_taskloop_simd_codegen.cpp +++ b/clang/test/OpenMP/master_taskloop_simd_codegen.cpp @@ -1,16 +1,16 @@ // RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -fopenmp-version=45 -x c++ -emit-llvm %s -o - -femit-all-decls | FileCheck %s --check-prefix CHECK --check-prefix OMP45 // RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s // RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - -femit-all-decls | FileCheck %s --check-prefix CHECK --check-prefix OMP45 -// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -fopenmp-version=50 -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix OMP50 -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix OMP50 +// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix OMP50 +// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix OMP50 // RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp-simd -fopenmp-version=45 -x c++ -emit-llvm %s -o - -femit-all-decls | FileCheck --check-prefix 
SIMD-ONLY0 %s // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - -femit-all-decls | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp-simd -fopenmp-version=50 -x c++ -emit-llvm %s -o - -femit-all-decls | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - -femit-all-decls | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp-simd -x c++ -emit-llvm %s -o - -femit-all-decls | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - -femit-all-decls | FileCheck --check-prefix SIMD-ONLY0 %s // SIMD-ONLY0-NOT: {{__kmpc|__tgt}} // expected-no-diagnostics #ifndef HEADER diff --git a/clang/test/OpenMP/master_taskloop_simd_lastprivate_messages.cpp b/clang/test/OpenMP/master_taskloop_simd_lastprivate_messages.cpp index 13e2cd6c6a087..aa46086ba4d5f 100644 --- a/clang/test/OpenMP/master_taskloop_simd_lastprivate_messages.cpp +++ b/clang/test/OpenMP/master_taskloop_simd_lastprivate_messages.cpp @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-version=45 -fopenmp %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-version=50 -fopenmp %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-version=45 -fopenmp-simd %s -Wuninitialized -// RUN: %clang_cc1 
-verify=expected,omp50 -fopenmp-version=50 -fopenmp-simd %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd %s -Wuninitialized typedef void **omp_allocator_handle_t; extern const omp_allocator_handle_t omp_null_allocator; diff --git a/clang/test/OpenMP/master_taskloop_simd_loop_messages.cpp b/clang/test/OpenMP/master_taskloop_simd_loop_messages.cpp index a4eb21bf24b69..0f4f424c87af2 100644 --- a/clang/test/OpenMP/master_taskloop_simd_loop_messages.cpp +++ b/clang/test/OpenMP/master_taskloop_simd_loop_messages.cpp @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -fsyntax-only -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp4 %s -Wuninitialized -// RUN: %clang_cc1 -fsyntax-only -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp5 %s -Wuninitialized +// RUN: %clang_cc1 -fsyntax-only -fopenmp -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp5 %s -Wuninitialized // RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp4 %s -Wuninitialized -// RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp5 %s -Wuninitialized +// RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp5 %s -Wuninitialized class S { int a; diff --git a/clang/test/OpenMP/master_taskloop_simd_misc_messages.c b/clang/test/OpenMP/master_taskloop_simd_misc_messages.c index 6c7759bdce0d6..3ad50dca4d6e9 100644 --- a/clang/test/OpenMP/master_taskloop_simd_misc_messages.c +++ b/clang/test/OpenMP/master_taskloop_simd_misc_messages.c @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -fsyntax-only -fopenmp -fopenmp-version=45 -verify=expected,omp45 -triple x86_64-unknown-unknown %s -Wuninitialized -// RUN: %clang_cc1 -fsyntax-only -fopenmp -fopenmp-version=50 -verify=expected,omp50 -triple 
x86_64-unknown-unknown %s -Wuninitialized +// RUN: %clang_cc1 -fsyntax-only -fopenmp -verify=expected,omp50 -triple x86_64-unknown-unknown %s -Wuninitialized // RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -fopenmp-version=45 -verify=expected,omp45 -triple x86_64-unknown-unknown %s -Wuninitialized -// RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -fopenmp-version=50 -verify=expected,omp50 -triple x86_64-unknown-unknown %s -Wuninitialized +// RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -verify=expected,omp50 -triple x86_64-unknown-unknown %s -Wuninitialized void xxx(int argc) { int x; // expected-note {{initialize the variable 'x' to silence this warning}} diff --git a/clang/test/OpenMP/master_taskloop_simd_reduction_messages.cpp b/clang/test/OpenMP/master_taskloop_simd_reduction_messages.cpp index 19d375cc75821..ebc861967df11 100644 --- a/clang/test/OpenMP/master_taskloop_simd_reduction_messages.cpp +++ b/clang/test/OpenMP/master_taskloop_simd_reduction_messages.cpp @@ -1,16 +1,16 @@ // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -ferror-limit 150 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -std=c++11 -ferror-limit 
150 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -ferror-limit 150 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized typedef void **omp_allocator_handle_t; extern const omp_allocator_handle_t omp_null_allocator; diff --git a/clang/test/OpenMP/nesting_of_regions.cpp b/clang/test/OpenMP/nesting_of_regions.cpp index f391d21df616d..7edf5ac4e6cd8 100644 --- a/clang/test/OpenMP/nesting_of_regions.cpp +++ b/clang/test/OpenMP/nesting_of_regions.cpp @@ -1,10 +1,10 @@ // RUN: %clang_cc1 -fsyntax-only -fopenmp -fopenmp-version=45 -verify=expected,omp45,omp45warn %s -// RUN: %clang_cc1 -fsyntax-only -fopenmp -fopenmp-version=50 -verify=expected,omp50 %s +// RUN: %clang_cc1 -fsyntax-only -fopenmp -verify=expected,omp50 %s // RUN: %clang_cc1 -fsyntax-only -fopenmp -fopenmp-version=45 -verify=expected,omp45 -Wno-openmp %s // RUN: %clang_cc1 -fsyntax-only -fopenmp -fopenmp-version=45 -verify=expected,omp45 -Wno-source-uses-openmp %s // RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -fopenmp-version=45 
-verify=expected,omp45,omp45warn %s -// RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -fopenmp-version=50 -verify=expected,omp50 %s +// RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -verify=expected,omp50 %s // SIMD-ONLY0-NOT: {{__kmpc|__tgt}} void bar(); diff --git a/clang/test/OpenMP/nvptx_asm_delayed_diags.c b/clang/test/OpenMP/nvptx_asm_delayed_diags.c index 2f82abd7477b9..50155810ffeb1 100644 --- a/clang/test/OpenMP/nvptx_asm_delayed_diags.c +++ b/clang/test/OpenMP/nvptx_asm_delayed_diags.c @@ -1,11 +1,11 @@ +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c -triple nvptx-unknown-unknown -aux-triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -fsyntax-only -Wuninitialized +// RUN: %clang_cc1 -verify -DDIAGS -DIMMEDIATE -fopenmp -fopenmp-version=45 -x c -triple nvptx-unknown-unknown -aux-triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -fsyntax-only -Wuninitialized +// RUN: %clang_cc1 -verify -DDIAGS -DDELAYED -fopenmp -fopenmp-version=45 -x c -triple nvptx-unknown-unknown -aux-triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -fsyntax-only -Wuninitialized // RUN: %clang_cc1 -fopenmp -x c -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c -triple nvptx-unknown-unknown -aux-triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -fsyntax-only -Wuninitialized -// RUN: %clang_cc1 -verify -DDIAGS -DIMMEDIATE -fopenmp -x c -triple nvptx-unknown-unknown -aux-triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda %s 
-fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -fsyntax-only -Wuninitialized -// RUN: %clang_cc1 -verify -DDIAGS -DDELAYED -fopenmp -x c -triple nvptx-unknown-unknown -aux-triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -fsyntax-only -Wuninitialized -// RUN: %clang_cc1 -fopenmp -x c -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -fopenmp-version=50 -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify=expected,omp5 -fopenmp -x c -triple nvptx-unknown-unknown -aux-triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -fopenmp-version=50 %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -fsyntax-only -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp5 -DDIAGS -DOMP5 -DIMMEDIATE -fopenmp -fopenmp-version=50 -x c -triple nvptx-unknown-unknown -aux-triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -fsyntax-only -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp5 -DDIAGS -DOMP5 -DDELAYED -fopenmp -fopenmp-version=50 -x c -triple nvptx-unknown-unknown -aux-triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -fsyntax-only -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp5 -fopenmp -x c -triple nvptx-unknown-unknown -aux-triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -fsyntax-only -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp5 -DDIAGS -DOMP5 -DIMMEDIATE -fopenmp -x c -triple nvptx-unknown-unknown -aux-triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -fsyntax-only -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp5 -DDIAGS -DOMP5 -DDELAYED -fopenmp -x c -triple nvptx-unknown-unknown -aux-triple 
i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -fsyntax-only -Wuninitialized // REQUIRES: x86-registered-target // REQUIRES: nvptx-registered-target diff --git a/clang/test/OpenMP/nvptx_declare_variant_device_kind_codegen.cpp b/clang/test/OpenMP/nvptx_declare_variant_device_kind_codegen.cpp index 9881eab8d656f..10703be929f9b 100644 --- a/clang/test/OpenMP/nvptx_declare_variant_device_kind_codegen.cpp +++ b/clang/test/OpenMP/nvptx_declare_variant_device_kind_codegen.cpp @@ -1,12 +1,22 @@ -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -fopenmp-version=50 -DGPU -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-version=50 -DGPU | FileCheck %s --implicit-check-not='ret i32 {{6|7|9|10|12|14|17|18|20|21|22|23|24|26}}' -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t -fopenmp-version=50 -DGPU -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -o - -fopenmp-version=50 -DGPU | FileCheck %s --implicit-check-not='ret i32 {{6|7|9|10|12|14|17|18|20|21|22|23|24|26}}' - -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -fopenmp-version=50 -DNOHOST -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - 
-fopenmp-version=50 -DNOHOST | FileCheck %s --implicit-check-not='ret i32 {{6|7|9|10|12|14|17|18|20|21|22|23|24|26}}' -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t -fopenmp-version=50 -DNOHOST -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -o - -fopenmp-version=50 -DNOHOST | FileCheck %s --implicit-check-not='ret i32 {{6|7|9|10|12|14|17|18|20|21|22|23|24|26}}' +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -DGPU +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -DGPU | FileCheck %s --implicit-check-not='ret i32 {{6|7|9|10|12|14|17|18|20|21|22|23|24|26}}' +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t -DGPU +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -o - -DGPU | FileCheck %s --implicit-check-not='ret i32 {{6|7|9|10|12|14|17|18|20|21|22|23|24|26}}' + +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -DNOHOST +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple 
nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -DNOHOST | FileCheck %s --implicit-check-not='ret i32 {{6|7|9|10|12|14|17|18|20|21|22|23|24|26}}' +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t -DNOHOST +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -o - -DNOHOST | FileCheck %s --implicit-check-not='ret i32 {{6|7|9|10|12|14|17|18|20|21|22|23|24|26}}' + +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -DGPU +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -DGPU | FileCheck %s --implicit-check-not='ret i32 {{6|7|9|10|12|14|17|18|20|21|22|23|24|26}}' +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t -DGPU +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -o - -DGPU | FileCheck %s --implicit-check-not='ret i32 {{6|7|9|10|12|14|17|18|20|21|22|23|24|26}}' + +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -DNOHOST +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple 
nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -DNOHOST | FileCheck %s --implicit-check-not='ret i32 {{6|7|9|10|12|14|17|18|20|21|22|23|24|26}}' +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t -DNOHOST +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -o - -DNOHOST | FileCheck %s --implicit-check-not='ret i32 {{6|7|9|10|12|14|17|18|20|21|22|23|24|26}}' // expected-no-diagnostics // CHECK-DAG: ret i32 2 diff --git a/clang/test/OpenMP/nvptx_declare_variant_implementation_vendor_codegen.cpp b/clang/test/OpenMP/nvptx_declare_variant_implementation_vendor_codegen.cpp index 4cece48a067bc..82b5da20885cf 100644 --- a/clang/test/OpenMP/nvptx_declare_variant_implementation_vendor_codegen.cpp +++ b/clang/test/OpenMP/nvptx_declare_variant_implementation_vendor_codegen.cpp @@ -1,7 +1,12 @@ -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -fopenmp-version=50 -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-version=50 | FileCheck %s --implicit-check-not='ret i32 {{6|7|8|9|10|12|13|14|15|17|18|19|20|21|22|23|24|26}}' -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t -fopenmp-version=50 -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown 
-aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -o - -fopenmp-version=50 | FileCheck %s --implicit-check-not='ret i32 {{6|7|8|9|10|12|13|14|15|17|18|19|20|21|22|23|24|26}}' +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -fopenmp-version=45 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-version=45 | FileCheck %s --implicit-check-not='ret i32 {{6|7|8|9|10|12|13|14|15|17|18|19|20|21|22|23|24|26}}' +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t -fopenmp-version=45 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -o - -fopenmp-version=45 | FileCheck %s --implicit-check-not='ret i32 {{6|7|8|9|10|12|13|14|15|17|18|19|20|21|22|23|24|26}}' + +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --implicit-check-not='ret i32 {{6|7|8|9|10|12|13|14|15|17|18|19|20|21|22|23|24|26}}' +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t +// RUN: %clang_cc1 -verify -fopenmp -x 
c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -o - | FileCheck %s --implicit-check-not='ret i32 {{6|7|8|9|10|12|13|14|15|17|18|19|20|21|22|23|24|26}}' // expected-no-diagnostics // CHECK-DAG: ret i32 2 diff --git a/clang/test/OpenMP/nvptx_declare_variant_name_mangling.cpp b/clang/test/OpenMP/nvptx_declare_variant_name_mangling.cpp index 6a9ce799d01e4..b4079e44734a9 100644 --- a/clang/test/OpenMP/nvptx_declare_variant_name_mangling.cpp +++ b/clang/test/OpenMP/nvptx_declare_variant_name_mangling.cpp @@ -1,7 +1,7 @@ -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -fopenmp-version=50 -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-version=50 | FileCheck %s --implicit-check-not='call i32 {@_Z3bazv|@_Z3barv}' -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t -fopenmp-version=50 -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -o - -fopenmp-version=50 | FileCheck %s --implicit-check-not='call i32 {@_Z3bazv|@_Z3barv}' +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s 
--implicit-check-not='call i32 {@_Z3bazv|@_Z3barv}' +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -o - | FileCheck %s --implicit-check-not='call i32 {@_Z3bazv|@_Z3barv}' // expected-no-diagnostics // CHECK-DAG: @_Z3barv diff --git a/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp b/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp index 62cbeaa1326a7..4b194a315e0eb 100644 --- a/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp +++ b/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp @@ -7,6 +7,16 @@ // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 
--check-prefix PAR + +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown 
-fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR + // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -29,9 +39,9 @@ int main(int argc, char **argv) { // SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null // SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 40 // SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 -// CHECK-DAG: @__omp_offloading_{{.*}}_main_l20_exec_mode = weak constant i8 0 +// CHECK-DAG: @__omp_offloading_{{.*}}_main_[[LINE:l.+]]_exec_mode = weak constant i8 0 -// CHECK: define weak void @__omp_offloading_{{.*}}_main_l20([10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}}, [10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, i{{64|32}} %{{.+}}, [10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}}) +// CHECK: define weak void @__omp_offloading_{{.*}}_main_[[LINE]]([10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}}, [10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, i{{64|32}} %{{.+}}, [10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}}) // SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], // SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]], // SEQ: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) diff --git a/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp index 029e4a469a698..de8d4e0d234c1 100644 --- 
a/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp @@ -4,13 +4,20 @@ // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 + +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown 
-fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 + // expected-no-diagnostics #ifndef HEADER #define HEADER // Check that the execution mode of all 2 target regions on the gpu is set to SPMD Mode. -// CHECK-DAG: {{@__omp_offloading_.+l26}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l31}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l33}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak constant i8 0 template tx ftemplate(int n) { @@ -48,92 +55,82 @@ int bar(int n){ // CHECK-NOT: define {{.*}}void {{@__omp_offloading_.+template.+l17}} - - - - - - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l26}}( - // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align - // CHECK-NOT: call i8* @__kmpc_data_sharing_push_stack - // CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align - // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align - // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() - // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1, i16 0) - // CHECK: call void @__kmpc_data_sharing_init_stack_spmd - // CHECK: br label {{%?}}[[EXEC:.+]] - // - // CHECK: [[EXEC]] - // CHECK: {{call|invoke}} void [[OP1:@.+]]({{.+}}, {{.+}}, i16* [[AA]]) - // CHECK: br label {{%?}}[[DONE:.+]] - // - // CHECK: [[DONE]] - // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) - // CHECK: br label {{%?}}[[EXIT:.+]] - // - // CHECK: [[EXIT]] - // CHECK: ret void - // CHECK: } - - // CHECK: define internal void [[OP1]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i16* {{[^%]*}}[[ARG:%.+]]) - // CHECK: = alloca i32*, align - // CHECK: = alloca i32*, align - // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align - // CHECK: store i16* [[ARG]], i16** [[AA_ADDR]], align - // CHECK: [[AA:%.+]] = load 
i16*, i16** [[AA_ADDR]], align - // CHECK: [[VAL:%.+]] = load i16, i16* [[AA]], align - // CHECK: store i16 {{%.+}}, i16* [[AA]], align - // CHECK: ret void - // CHECK: } - - - - - - - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l31}}( - // CHECK: [[A_ADDR:%.+]] = alloca i32*, align - // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align - // CHECK: [[B_ADDR:%.+]] = alloca [10 x i32]*, align - // CHECK: store i32* {{%.+}}, i32** [[A_ADDR]], align - // CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align - // CHECK: store [10 x i32]* {{%.+}}, [10 x i32]** [[B_ADDR]], align - // CHECK: [[A:%.+]] = load i32*, i32** [[A_ADDR]], align - // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align - // CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align - // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() - // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1, i16 0) - // CHECK: call void @__kmpc_data_sharing_init_stack_spmd - // CHECK: br label {{%?}}[[EXEC:.+]] - // - // CHECK: [[EXEC]] - // CHECK: {{call|invoke}} void [[OP2:@.+]]({{.+}}, {{.+}}, i32* [[A]], i16* [[AA]], [10 x i32]* [[B]]) - // CHECK: br label {{%?}}[[DONE:.+]] - // - // CHECK: [[DONE]] - // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) - // CHECK: br label {{%?}}[[EXIT:.+]] - // - // CHECK: [[EXIT]] - // CHECK: ret void - // CHECK: } - - // CHECK: define internal void [[OP2]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* {{[^%]*}}[[ARG1:%.+]], i16* {{[^%]*}}[[ARG2:%.+]], [10 x i32]* {{[^%]*}}[[ARG3:%.+]]) - // CHECK: = alloca i32*, align - // CHECK: = alloca i32*, align - // CHECK: [[A_ADDR:%.+]] = alloca i32*, align - // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align - // CHECK: [[B_ADDR:%.+]] = alloca [10 x i32]*, align - // CHECK: store i32* [[ARG1]], i32** [[A_ADDR]], align - // CHECK: store i16* [[ARG2]], i16** [[AA_ADDR]], align - // CHECK: store [10 x i32]* [[ARG3]], [10 x i32]** [[B_ADDR]], align - // 
CHECK: [[A:%.+]] = load i32*, i32** [[A_ADDR]], align - // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align - // CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align - // CHECK: store i32 {{%.+}}, i32* [[A]], align - // CHECK: store i16 {{%.+}}, i16* [[AA]], align - // CHECK: [[ELT:%.+]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B]], - // CHECK: store i32 {{%.+}}, i32* [[ELT]], align - // CHECK: ret void - // CHECK: } +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l33}}( +// CHECK: [[AA_ADDR:%.+]] = alloca i16*, align +// CHECK-NOT: call i8* @__kmpc_data_sharing_push_stack +// CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align +// CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align +// CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1, i16 0) +// CHECK: call void @__kmpc_data_sharing_init_stack_spmd +// CHECK: br label {{%?}}[[EXEC:.+]] +// +// CHECK: [[EXEC]] +// CHECK: {{call|invoke}} void [[OP1:@.+]]({{.+}}, {{.+}}, i16* [[AA]]) +// CHECK: br label {{%?}}[[DONE:.+]] +// +// CHECK: [[DONE]] +// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK: br label {{%?}}[[EXIT:.+]] +// +// CHECK: [[EXIT]] +// CHECK: ret void +// CHECK: } + +// CHECK: define internal void [[OP1]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i16* {{[^%]*}}[[ARG:%.+]]) +// CHECK: = alloca i32*, align +// CHECK: = alloca i32*, align +// CHECK: [[AA_ADDR:%.+]] = alloca i16*, align +// CHECK: store i16* [[ARG]], i16** [[AA_ADDR]], align +// CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align +// CHECK: [[VAL:%.+]] = load i16, i16* [[AA]], align +// CHECK: store i16 {{%.+}}, i16* [[AA]], align +// CHECK: ret void +// CHECK: } + +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l38}}( +// CHECK: [[A_ADDR:%.+]] = alloca i32*, align +// CHECK: [[AA_ADDR:%.+]] = alloca i16*, align +// CHECK: [[B_ADDR:%.+]] 
= alloca [10 x i32]*, align +// CHECK: store i32* {{%.+}}, i32** [[A_ADDR]], align +// CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align +// CHECK: store [10 x i32]* {{%.+}}, [10 x i32]** [[B_ADDR]], align +// CHECK: [[A:%.+]] = load i32*, i32** [[A_ADDR]], align +// CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align +// CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align +// CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1, i16 0) +// CHECK: call void @__kmpc_data_sharing_init_stack_spmd +// CHECK: br label {{%?}}[[EXEC:.+]] +// +// CHECK: [[EXEC]] +// CHECK: {{call|invoke}} void [[OP2:@.+]]({{.+}}, {{.+}}, i32* [[A]], i16* [[AA]], [10 x i32]* [[B]]) +// CHECK: br label {{%?}}[[DONE:.+]] +// +// CHECK: [[DONE]] +// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK: br label {{%?}}[[EXIT:.+]] +// +// CHECK: [[EXIT]] +// CHECK: ret void +// CHECK: } + +// CHECK: define internal void [[OP2]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* {{[^%]*}}[[ARG1:%.+]], i16* {{[^%]*}}[[ARG2:%.+]], [10 x i32]* {{[^%]*}}[[ARG3:%.+]]) +// CHECK: = alloca i32*, align +// CHECK: = alloca i32*, align +// CHECK: [[A_ADDR:%.+]] = alloca i32*, align +// CHECK: [[AA_ADDR:%.+]] = alloca i16*, align +// CHECK: [[B_ADDR:%.+]] = alloca [10 x i32]*, align +// CHECK: store i32* [[ARG1]], i32** [[A_ADDR]], align +// CHECK: store i16* [[ARG2]], i16** [[AA_ADDR]], align +// CHECK: store [10 x i32]* [[ARG3]], [10 x i32]** [[B_ADDR]], align +// CHECK: [[A:%.+]] = load i32*, i32** [[A_ADDR]], align +// CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align +// CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align +// CHECK: store i32 {{%.+}}, i32* [[A]], align +// CHECK: store i16 {{%.+}}, i16* [[AA]], align +// CHECK: [[ELT:%.+]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B]], +// CHECK: store i32 {{%.+}}, i32* [[ELT]], align +// 
CHECK: ret void +// CHECK: } #endif diff --git a/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp index 1aac48198415c..93de4b6397ba1 100644 --- a/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp @@ -4,13 +4,20 @@ // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 + +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device 
-fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 + // expected-no-diagnostics #ifndef HEADER #define HEADER // Check that the execution mode of all 2 target regions on the gpu is set to non-SPMD Mode. -// CHECK-DAG: {{@__omp_offloading_.+l21}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l26}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l28}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l33}}_exec_mode = weak constant i8 0 template tx ftemplate(int n) { @@ -41,67 +48,67 @@ int bar(int n){ return a; } - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l21}}( - // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align - // CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align - // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align - // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() - // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1, i16 0) - // CHECK: call void @__kmpc_data_sharing_init_stack_spmd() - // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @{{.+}}) - // CHECK: store i32 [[GTID]], i32* [[THREADID:%.+]], - // CHECK: call void [[OUTLINED:@.+]](i32* [[THREADID]], i32* %{{.+}}, i16* [[AA]]) - // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) - // CHECK: ret void - // CHECK: } +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l28}}( +// CHECK: [[AA_ADDR:%.+]] = alloca i16*, align +// CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align +// CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align 
+// CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1, i16 0) +// CHECK: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @{{.+}}) +// CHECK: store i32 [[GTID]], i32* [[THREADID:%.+]], +// CHECK: call void [[OUTLINED:@.+]](i32* [[THREADID]], i32* %{{.+}}, i16* [[AA]]) +// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK: ret void +// CHECK: } - // CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i16* {{[^%]*}}[[ARG:%.+]]) - // CHECK: = alloca i32*, align - // CHECK: = alloca i32*, align - // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align - // CHECK: store i16* [[ARG]], i16** [[AA_ADDR]], align - // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align - // CHECK: [[VAL:%.+]] = load i16, i16* [[AA]], align - // CHECK: store i16 {{%.+}}, i16* [[AA]], align - // CHECK: ret void - // CHECK: } +// CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i16* {{[^%]*}}[[ARG:%.+]]) +// CHECK: = alloca i32*, align +// CHECK: = alloca i32*, align +// CHECK: [[AA_ADDR:%.+]] = alloca i16*, align +// CHECK: store i16* [[ARG]], i16** [[AA_ADDR]], align +// CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align +// CHECK: [[VAL:%.+]] = load i16, i16* [[AA]], align +// CHECK: store i16 {{%.+}}, i16* [[AA]], align +// CHECK: ret void +// CHECK: } - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l26}}( - // CHECK: [[A_ADDR:%.+]] = alloca i32*, align - // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align - // CHECK: [[B_ADDR:%.+]] = alloca [10 x i32]*, align - // CHECK: store i32* {{%.+}}, i32** [[A_ADDR]], align - // CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align - // CHECK: store [10 x i32]* {{%.+}}, [10 x i32]** [[B_ADDR]], align - // CHECK: [[A:%.+]] = load i32*, i32** [[A_ADDR]], align - // CHECK: [[AA:%.+]] = 
load i16*, i16** [[AA_ADDR]], align - // CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align - // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() - // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1, i16 0) - // CHECK: call void @__kmpc_data_sharing_init_stack_spmd() - // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @{{.+}}) - // CHECK: store i32 [[GTID]], i32* [[THREADID:%.+]], - // CHECK: call void [[OUTLINED:@.+]](i32* [[THREADID]], i32* %{{.+}}, i32* [[A]], i16* [[AA]], [10 x i32]* [[B]]) - // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) - // CHECK: ret void - // CHECK: } +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l33}}( +// CHECK: [[A_ADDR:%.+]] = alloca i32*, align +// CHECK: [[AA_ADDR:%.+]] = alloca i16*, align +// CHECK: [[B_ADDR:%.+]] = alloca [10 x i32]*, align +// CHECK: store i32* {{%.+}}, i32** [[A_ADDR]], align +// CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align +// CHECK: store [10 x i32]* {{%.+}}, [10 x i32]** [[B_ADDR]], align +// CHECK: [[A:%.+]] = load i32*, i32** [[A_ADDR]], align +// CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align +// CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align +// CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1, i16 0) +// CHECK: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @{{.+}}) +// CHECK: store i32 [[GTID]], i32* [[THREADID:%.+]], +// CHECK: call void [[OUTLINED:@.+]](i32* [[THREADID]], i32* %{{.+}}, i32* [[A]], i16* [[AA]], [10 x i32]* [[B]]) +// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK: ret void +// CHECK: } - // CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* {{[^%]*}}[[ARG1:%.+]], i16* {{[^%]*}}[[ARG2:%.+]], [10 x 
i32]* {{[^%]*}}[[ARG3:%.+]]) - // CHECK: = alloca i32*, align - // CHECK: = alloca i32*, align - // CHECK: [[A_ADDR:%.+]] = alloca i32*, align - // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align - // CHECK: [[B_ADDR:%.+]] = alloca [10 x i32]*, align - // CHECK: store i32* [[ARG1]], i32** [[A_ADDR]], align - // CHECK: store i16* [[ARG2]], i16** [[AA_ADDR]], align - // CHECK: store [10 x i32]* [[ARG3]], [10 x i32]** [[B_ADDR]], align - // CHECK: [[A:%.+]] = load i32*, i32** [[A_ADDR]], align - // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align - // CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align - // CHECK: store i32 {{%.+}}, i32* [[A]], align - // CHECK: store i16 {{%.+}}, i16* [[AA]], align - // CHECK: [[ELT:%.+]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B]], - // CHECK: store i32 {{%.+}}, i32* [[ELT]], align - // CHECK: ret void - // CHECK: } +// CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* {{[^%]*}}[[ARG1:%.+]], i16* {{[^%]*}}[[ARG2:%.+]], [10 x i32]* {{[^%]*}}[[ARG3:%.+]]) +// CHECK: = alloca i32*, align +// CHECK: = alloca i32*, align +// CHECK: [[A_ADDR:%.+]] = alloca i32*, align +// CHECK: [[AA_ADDR:%.+]] = alloca i16*, align +// CHECK: [[B_ADDR:%.+]] = alloca [10 x i32]*, align +// CHECK: store i32* [[ARG1]], i32** [[A_ADDR]], align +// CHECK: store i16* [[ARG2]], i16** [[AA_ADDR]], align +// CHECK: store [10 x i32]* [[ARG3]], [10 x i32]** [[B_ADDR]], align +// CHECK: [[A:%.+]] = load i32*, i32** [[A_ADDR]], align +// CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align +// CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align +// CHECK: store i32 {{%.+}}, i32* [[A]], align +// CHECK: store i16 {{%.+}}, i16* [[AA]], align +// CHECK: [[ELT:%.+]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B]], +// CHECK: store i32 {{%.+}}, i32* [[ELT]], align +// CHECK: ret void +// CHECK: } #endif diff --git a/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp 
b/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp index 4b32f9d87bcd0..778b8d3300dfa 100644 --- a/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp @@ -4,14 +4,21 @@ // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 + +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions 
-fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 + // expected-no-diagnostics #ifndef HEADER #define HEADER // Check that the execution mode of all 3 target regions on the gpu is set to SPMD Mode. -// CHECK-DAG: {{@__omp_offloading_.+l22}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l26}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l31}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l29}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l33}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak constant i8 0 template tx ftemplate(int n) { @@ -46,64 +53,57 @@ int bar(int n){ return a; } - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l22}}( - // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 0) - // CHECK: call void @__kmpc_data_sharing_init_stack_spmd - // CHECK: br label {{%?}}[[EXEC:.+]] - // - // CHECK: [[EXEC]] - // CHECK-NOT: call void @__kmpc_push_proc_bind - // CHECK: {{call|invoke}} void [[OP1:@.+]]( - // CHECK: br label {{%?}}[[DONE:.+]] - // - // CHECK: [[DONE]] - // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) - // CHECK: br label {{%?}}[[EXIT:.+]] - // - // CHECK: [[EXIT]] - // CHECK: ret void - // CHECK: } - - - - - - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l26}}( - // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 0) - // CHECK: call void @__kmpc_data_sharing_init_stack_spmd - // CHECK: br label {{%?}}[[EXEC:.+]] - // - // CHECK: [[EXEC]] - // CHECK-NOT: call void @__kmpc_push_proc_bind - // CHECK: {{call|invoke}} void [[OP1:@.+]]( - // CHECK: br label {{%?}}[[DONE:.+]] - // - // CHECK: [[DONE]] - // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) - // 
CHECK: br label {{%?}}[[EXIT:.+]] - // - // CHECK: [[EXIT]] - // CHECK: ret void - // CHECK: } - - - - - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l31}}( - // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 0) - // CHECK: call void @__kmpc_data_sharing_init_stack_spmd - // CHECK: br label {{%?}}[[EXEC:.+]] - // - // CHECK: [[EXEC]] - // CHECK-NOT: call void @__kmpc_push_proc_bind - // CHECK: {{call|invoke}} void [[OP1:@.+]]( - // CHECK: br label {{%?}}[[DONE:.+]] - // - // CHECK: [[DONE]] - // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) - // CHECK: br label {{%?}}[[EXIT:.+]] - // - // CHECK: [[EXIT]] - // CHECK: ret void - // CHECK: } +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l29}}( +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 0) +// CHECK: call void @__kmpc_data_sharing_init_stack_spmd +// CHECK: br label {{%?}}[[EXEC:.+]] +// +// CHECK: [[EXEC]] +// CHECK-NOT: call void @__kmpc_push_proc_bind +// CHECK: {{call|invoke}} void [[OP1:@.+]]( +// CHECK: br label {{%?}}[[DONE:.+]] +// +// CHECK: [[DONE]] +// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK: br label {{%?}}[[EXIT:.+]] +// +// CHECK: [[EXIT]] +// CHECK: ret void +// CHECK: } + +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l33}}( +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 0) +// CHECK: call void @__kmpc_data_sharing_init_stack_spmd +// CHECK: br label {{%?}}[[EXEC:.+]] +// +// CHECK: [[EXEC]] +// CHECK-NOT: call void @__kmpc_push_proc_bind +// CHECK: {{call|invoke}} void [[OP1:@.+]]( +// CHECK: br label {{%?}}[[DONE:.+]] +// +// CHECK: [[DONE]] +// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK: br label {{%?}}[[EXIT:.+]] +// +// CHECK: [[EXIT]] +// CHECK: ret void +// CHECK: } + +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l38}}( +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 0) +// 
CHECK: call void @__kmpc_data_sharing_init_stack_spmd +// CHECK: br label {{%?}}[[EXEC:.+]] +// +// CHECK: [[EXEC]] +// CHECK-NOT: call void @__kmpc_push_proc_bind +// CHECK: {{call|invoke}} void [[OP1:@.+]]( +// CHECK: br label {{%?}}[[DONE:.+]] +// +// CHECK: [[DONE]] +// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK: br label {{%?}}[[EXIT:.+]] +// +// CHECK: [[EXIT]] +// CHECK: ret void +// CHECK: } #endif diff --git a/clang/test/OpenMP/nvptx_target_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_simd_codegen.cpp index 7a1f01c1f1ad4..168b82057cacf 100644 --- a/clang/test/OpenMP/nvptx_target_simd_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_simd_codegen.cpp @@ -4,15 +4,22 @@ // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 + +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x 
c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 + // expected-no-diagnostics #ifndef HEADER #define HEADER // Check that the execution mode of all 2 target regions on the gpu is set to NonSPMD Mode. -// CHECK-DAG: {{@__omp_offloading_.+l25}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l30}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l35}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l40}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l32}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l37}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l42}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l47}}_exec_mode = weak constant i8 0 #define N 1000 @@ -53,28 +60,28 @@ int bar(int n){ return a; } -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l25}}( +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l32}}( // CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0) // CHECK-NOT: call void @__kmpc_for_static_init // CHECK-NOT: call void @__kmpc_for_static_fini // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) // CHECK: ret void -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l30}}( +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l37}}( // CHECK: call void 
@__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0) // CHECK-NOT: call void @__kmpc_for_static_init // CHECK-NOT: call void @__kmpc_for_static_fini // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) // CHECK: ret void -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l35}}( +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l42}}( // CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0) // CHECK-NOT: call void @__kmpc_for_static_init // CHECK-NOT: call void @__kmpc_for_static_fini // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) // CHECK: ret void -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l40}}( +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l47}}( // CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0) // CHECK-NOT: call void @__kmpc_for_static_init // CHECK-NOT: call void @__kmpc_for_static_fini @@ -82,5 +89,4 @@ int bar(int n){ // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) // CHECK: ret void - #endif diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp index c90a3d2248e84..4fd0f71c5e5db 100644 --- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp @@ -9,17 +9,29 @@ // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | 
FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR + +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix CHECK-DIV64 --check-prefix SEQ +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix CHECK-DIV64 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-DIV32 --check-prefix SEQ +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-DIV32 --check-prefix PAR +// RUN: 
%clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR + // expected-no-diagnostics #ifndef HEADER #define HEADER // Check that the execution mode of all 5 target regions on the gpu is set to SPMD Mode. 
-// CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l44}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l49}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l54}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l62}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l69}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l50}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l56}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l61}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l66}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l74}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l81}}_exec_mode = weak constant i8 0 #define N 1000 #define M 10 @@ -86,7 +98,7 @@ int bar(int n){ // SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4 // SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l38( +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l50( // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0) // CHECK: call void [[PARALLEL:@.+]]( @@ -239,13 +251,13 @@ int bar(int n){ // CHECK: call void @__kmpc_for_static_fini( // CHECK: ret void -// CHECK: define weak void @__omp_offloading_{{.*}}_l62(i[[SZ:64|32]] %{{[^,]+}}, [10 x [10 x i32]]* nonnull align {{[0-9]+}} dereferenceable{{.*}}) +// CHECK: define weak void @__omp_offloading_{{.*}}_l74(i[[SZ:64|32]] %{{[^,]+}}, [10 x [10 x i32]]* nonnull align {{[0-9]+}} dereferenceable{{.*}}) // CHECK: call void [[OUTLINED:@__omp_outlined.*]](i32* %{{.+}}, i32* %{{.+}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, [10 x [10 x i32]]* %{{.*}}) // CHECK: 
define internal void [[OUTLINED]](i32* noalias %{{.*}}, i32* noalias %{{.*}} i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, [10 x [10 x i32]]* nonnull align {{[0-9]+}} dereferenceable{{.*}}) // CHECK-DIV64: div i64 // CHECK-DIV32-NO: div i64 -// CHECK: define weak void @__omp_offloading_{{.*}}_l69(i[[SZ:64|32]] %{{[^,]+}}, [1000 x i32]* nonnull align {{[0-9]+}} dereferenceable{{.*}}, i32* %{{[^)]+}}) +// CHECK: define weak void @__omp_offloading_{{.*}}_l81(i[[SZ:64|32]] %{{[^,]+}}, [1000 x i32]* nonnull align {{[0-9]+}} dereferenceable{{.*}}, i32* %{{[^)]+}}) // CHECK: call void [[OUTLINED:@__omp_outlined.*]](i32* %{{.+}}, i32* %{{.+}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, [1000 x i32]* %{{.*}}, i32* %{{.*}}) // CHECK: define internal void [[OUTLINED]](i32* noalias %{{.*}}, i32* noalias %{{.*}} i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, [1000 x i32]* nonnull align {{[0-9]+}} dereferenceable{{.*}}, i32* %{{.*}}) diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp index 2397b867afc00..fb0bf9c01ff01 100644 --- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp @@ -4,6 +4,13 @@ // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s 
-fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 + +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 + // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -19,9 +26,9 @@ int main(int argc, char **argv) { return 0; } -// CHECK: @__omp_offloading_{{.*}}_main_l16_exec_mode = weak constant i8 0 +// CHECK: @__omp_offloading_{{.*}}_main_[[LINE:l.+]]_exec_mode = weak constant i8 0 -// CHECK: define weak void @__omp_offloading_{{.*}}_main_l16(i{{64|32}} %{{[^,].*}}, i32* nonnull align {{[0-9]+}} dereferenceable{{[^,]*}}, i{{64|32}} %{{[^,)]*}}) +// CHECK: define weak void @__omp_offloading_{{.*}}_main_[[LINE]](i{{64|32}} %{{[^,].*}}, i32* nonnull align {{[0-9]+}} dereferenceable{{[^,]*}}, i{{64|32}} %{{[^,)]*}}) // CHECK: call void @__kmpc_spmd_kernel_init( // CHECK: [[TID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @ // CHECK: call void 
@__kmpc_spmd_kernel_deinit_v2(i16 0) diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp index 50ce394461066..a933c7e021b82 100644 --- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp @@ -7,15 +7,25 @@ // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR + +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ +// RUN: %clang_cc1 -verify -fopenmp 
-fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR + // expected-no-diagnostics #ifndef HEADER #define HEADER // Check that the execution mode of all 4 target regions on the gpu is set to SPMD Mode. 
-// CHECK-DAG: {{@__omp_offloading_.+l33}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l39}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l44}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l43}}_exec_mode = weak constant i8 0 // CHECK-DAG: {{@__omp_offloading_.+l49}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l54}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l59}}_exec_mode = weak constant i8 0 #define N 1000 #define M 10 @@ -71,7 +81,7 @@ int bar(int n){ // SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4 // SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l33( +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l43( // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0) // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp index 091e1f8d0a2b1..8d12c857cb434 100644 --- a/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp @@ -4,15 +4,22 @@ // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions 
-x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 + +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 + // expected-no-diagnostics #ifndef HEADER #define HEADER // Check that the execution mode of all 2 target regions on the gpu is set to NonSPMD Mode. 
-// CHECK-DAG: {{@__omp_offloading_.+l30}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l36}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l41}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l46}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l37}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l43}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l48}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l53}}_exec_mode = weak constant i8 0 #define N 1000 #define M 10 @@ -62,7 +69,7 @@ int bar(int n){ return a; } -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l30( +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l37( // CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0) // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) @@ -70,7 +77,7 @@ int bar(int n){ // CHECK: call void @__kmpc_for_static_fini( // CHECK: ret void -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l36( +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l43( // CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0) // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) @@ -78,7 +85,7 @@ int bar(int n){ // CHECK: call void @__kmpc_for_static_fini( // CHECK: ret void -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l41( +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l48( // CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0) // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) @@ -86,7 +93,7 @@ int bar(int n){ // CHECK: call void @__kmpc_for_static_fini( // CHECK: ret void -// CHECK: define {{.*}}void {{@__omp_offloading_.+}}_l46({{.+}}, i{{32|64}} [[F_IN:%.+]]) +// CHECK: define {{.*}}void {{@__omp_offloading_.+}}_l53({{.+}}, i{{32|64}} [[F_IN:%.+]]) // CHECK: store {{.+}} [[F_IN]], {{.+}}* {{.+}}, // CHECK: call void 
@__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0) // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) diff --git a/clang/test/OpenMP/parallel_ast_print.cpp b/clang/test/OpenMP/parallel_ast_print.cpp index 3285e101134a4..0f309c0add050 100644 --- a/clang/test/OpenMP/parallel_ast_print.cpp +++ b/clang/test/OpenMP/parallel_ast_print.cpp @@ -1,10 +1,10 @@ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -ast-print %s | FileCheck %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -ast-print %s | FileCheck %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s // expected-no-diagnostics #ifndef HEADER diff --git a/clang/test/OpenMP/parallel_default_messages.cpp b/clang/test/OpenMP/parallel_default_messages.cpp index 594ef1e2bb8a2..e9d2946b7686a 100644 --- a/clang/test/OpenMP/parallel_default_messages.cpp +++ b/clang/test/OpenMP/parallel_default_messages.cpp @@ -1,8 +1,6 @@ - - -// RUN: %clang_cc1 -verify=expected,ge40 -fopenmp -ferror-limit 100 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,ge40 -fopenmp-version=45 -fopenmp -ferror-limit 100 -o - %s -Wuninitialized // 
RUN: %clang_cc1 -verify=expected,ge40 -fopenmp-simd -ferror-limit 100 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,ge40 -fopenmp-version=50 -fopenmp -ferror-limit 100 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,ge40 -fopenmp -ferror-limit 100 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,ge40 -fopenmp-version=40 -fopenmp -ferror-limit 100 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify -fopenmp-version=31 -fopenmp -ferror-limit 100 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify -fopenmp-version=30 -fopenmp -ferror-limit 100 -o - %s -Wuninitialized diff --git a/clang/test/OpenMP/parallel_for_ast_print.cpp b/clang/test/OpenMP/parallel_for_ast_print.cpp index 4006d79ff2cdb..bdcae82f90e8c 100644 --- a/clang/test/OpenMP/parallel_for_ast_print.cpp +++ b/clang/test/OpenMP/parallel_for_ast_print.cpp @@ -1,10 +1,10 @@ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -ast-print %s | FileCheck %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -ast-print %s | FileCheck %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s // 
expected-no-diagnostics #ifndef HEADER diff --git a/clang/test/OpenMP/parallel_for_codegen.cpp b/clang/test/OpenMP/parallel_for_codegen.cpp index a17506adad89a..f8cd51e93a65c 100644 --- a/clang/test/OpenMP/parallel_for_codegen.cpp +++ b/clang/test/OpenMP/parallel_for_codegen.cpp @@ -18,12 +18,12 @@ // SIMD-ONLY0-NOT: {{__kmpc|__tgt}} // expected-no-diagnostics -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -DOMP5 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck --check-prefix=OMP5 %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -DOMP5 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -DOMP5 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix=OMP5 %s -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -DOMP5 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -DOMP5 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -DOMP5 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp -DOMP5 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck --check-prefix=OMP5 %s +// RUN: %clang_cc1 -fopenmp -DOMP5 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -DOMP5 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix=OMP5 %s +// RUN: 
%clang_cc1 -verify -fopenmp-simd -DOMP5 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -DOMP5 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -DOMP5 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s #ifndef HEADER #define HEADER diff --git a/clang/test/OpenMP/parallel_for_lastprivate_conditional.cpp b/clang/test/OpenMP/parallel_for_lastprivate_conditional.cpp index 87a2e1b699d1c..f58131cbc534c 100644 --- a/clang/test/OpenMP/parallel_for_lastprivate_conditional.cpp +++ b/clang/test/OpenMP/parallel_for_lastprivate_conditional.cpp @@ -1,10 +1,10 @@ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -DOMP5 -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -DOMP5 -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -DOMP5 -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -DOMP5 -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -DOMP5 -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -DOMP5 -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -DOMP5 -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -DOMP5 -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -DOMP5 
-x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -DOMP5 -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -DOMP5 -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -DOMP5 -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s // SIMD-ONLY0-NOT: {{__kmpc|__tgt}} // expected-no-diagnostics diff --git a/clang/test/OpenMP/parallel_for_lastprivate_messages.cpp b/clang/test/OpenMP/parallel_for_lastprivate_messages.cpp index fbbf826e6c03e..387d1213af5cd 100644 --- a/clang/test/OpenMP/parallel_for_lastprivate_messages.cpp +++ b/clang/test/OpenMP/parallel_for_lastprivate_messages.cpp @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-version=45 -fopenmp %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-version=50 -fopenmp %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-version=45 -fopenmp-simd %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-version=50 -fopenmp-simd %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd %s -Wuninitialized extern int omp_default_mem_alloc; void foo() { diff --git a/clang/test/OpenMP/parallel_for_loop_messages.cpp b/clang/test/OpenMP/parallel_for_loop_messages.cpp index cbcc18419dcc1..0b755f7411748 100644 --- a/clang/test/OpenMP/parallel_for_loop_messages.cpp +++ b/clang/test/OpenMP/parallel_for_loop_messages.cpp @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -fsyntax-only -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp4 %s -Wuninitialized -// RUN: %clang_cc1 -fsyntax-only -fopenmp -fopenmp-version=50 -x c++ 
-std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp5 %s -Wuninitialized +// RUN: %clang_cc1 -fsyntax-only -fopenmp -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp5 %s -Wuninitialized // RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp4 %s -Wuninitialized -// RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp5 %s -Wuninitialized +// RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp5 %s -Wuninitialized class S { int a; diff --git a/clang/test/OpenMP/parallel_for_messages.cpp b/clang/test/OpenMP/parallel_for_messages.cpp index b3cda295fad77..1c28c67b08937 100644 --- a/clang/test/OpenMP/parallel_for_messages.cpp +++ b/clang/test/OpenMP/parallel_for_messages.cpp @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -ferror-limit 100 -std=c++11 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -ferror-limit 100 -std=c++11 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -ferror-limit 100 -std=c++11 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -ferror-limit 100 -std=c++11 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -ferror-limit 100 -std=c++11 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -ferror-limit 100 -std=c++11 -o - %s -Wuninitialized void xxx(int argc) { int x; // expected-note {{initialize the variable 'x' to silence this warning}} diff --git a/clang/test/OpenMP/parallel_for_reduction_messages.cpp b/clang/test/OpenMP/parallel_for_reduction_messages.cpp index e88aeed41d003..b22585a85fbbb 100644 --- a/clang/test/OpenMP/parallel_for_reduction_messages.cpp +++ 
b/clang/test/OpenMP/parallel_for_reduction_messages.cpp @@ -1,10 +1,10 @@ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify -fopenmp -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify -fopenmp-simd -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify -fopenmp-simd -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify -fopenmp-simd -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized extern int omp_default_mem_alloc; void xxx(int argc) { diff --git a/clang/test/OpenMP/parallel_for_reduction_task_codegen.cpp b/clang/test/OpenMP/parallel_for_reduction_task_codegen.cpp index fcee3d645b4ae..2645bdb3ddd8a 100644 --- a/clang/test/OpenMP/parallel_for_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/parallel_for_reduction_task_codegen.cpp @@ -1,10 +1,10 @@ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-unknown-linux -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -triple x86_64-unknown-linux -fexceptions -fcxx-exceptions -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -triple 
x86_64-unknown-linux -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-linux -fexceptions -fcxx-exceptions -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-linux -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s // SIMD-ONLY0-NOT: {{__kmpc|__tgt}} // expected-no-diagnostics #ifndef HEADER diff --git a/clang/test/OpenMP/parallel_for_scan_codegen.cpp b/clang/test/OpenMP/parallel_for_scan_codegen.cpp index f5687ac556979..3f01d3857a471 100644 --- a/clang/test/OpenMP/parallel_for_scan_codegen.cpp +++ b/clang/test/OpenMP/parallel_for_scan_codegen.cpp @@ -1,10 +1,10 @@ -// RUN: %clang_cc1 -verify -fopenmp 
-fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s // SIMD-ONLY0-NOT: {{__kmpc|__tgt}} // expected-no-diagnostics #ifndef HEADER diff --git a/clang/test/OpenMP/parallel_for_simd_ast_print.cpp b/clang/test/OpenMP/parallel_for_simd_ast_print.cpp index 5a079b8ac0fd6..2c063ed81b352 100644 --- a/clang/test/OpenMP/parallel_for_simd_ast_print.cpp +++ b/clang/test/OpenMP/parallel_for_simd_ast_print.cpp @@ -1,16 +1,16 @@ // RUN: %clang_cc1 -verify -fopenmp 
-fopenmp-version=45 -ast-print %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45 // RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s // RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -DOMP5 -ast-print %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -DOMP5 -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -DOMP5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 +// RUN: %clang_cc1 -verify -fopenmp -DOMP5 -ast-print %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 +// RUN: %clang_cc1 -fopenmp -DOMP5 -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -DOMP5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -ast-print %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45 -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -DOMP5 -ast-print %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -DOMP5 -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -DOMP5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 +// RUN: %clang_cc1 -verify -fopenmp-simd -DOMP5 -ast-print %s | FileCheck %s 
--check-prefix=CHECK --check-prefix=OMP50 +// RUN: %clang_cc1 -fopenmp-simd -DOMP5 -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -DOMP5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 // expected-no-diagnostics #ifndef HEADER diff --git a/clang/test/OpenMP/parallel_for_simd_codegen.cpp b/clang/test/OpenMP/parallel_for_simd_codegen.cpp index 715328771ccce..213d745d6e7e5 100644 --- a/clang/test/OpenMP/parallel_for_simd_codegen.cpp +++ b/clang/test/OpenMP/parallel_for_simd_codegen.cpp @@ -1,22 +1,22 @@ // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s --check-prefix=OMP45 --check-prefix=CHECK +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -debug-info-kind=line-tables-only -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=TERM_DEBUG + +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s --check-prefix=OMP50 --check-prefix=CHECK // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s // RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s // RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -fexceptions -fcxx-exceptions 
-debug-info-kind=line-tables-only -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=TERM_DEBUG -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s --check-prefix=OMP50 --check-prefix=CHECK -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -fopenmp-version=50 -fexceptions -fcxx-exceptions -debug-info-kind=line-tables-only -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=TERM_DEBUG - -// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp-simd -fexceptions -fcxx-exceptions -debug-info-kind=line-tables-only -x c++ -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd 
-fopenmp-version=45 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp-simd -fopenmp-version=45 -fexceptions -fcxx-exceptions -debug-info-kind=line-tables-only -x c++ -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp-simd -fopenmp-version=50 -fexceptions -fcxx-exceptions -debug-info-kind=line-tables-only -x c++ -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp-simd -fexceptions -fcxx-exceptions -debug-info-kind=line-tables-only -x c++ -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s // expected-no-diagnostics // 
SIMD-ONLY0-NOT: {{__kmpc|__tgt}} #ifndef HEADER diff --git a/clang/test/OpenMP/parallel_for_simd_lastprivate_messages.cpp b/clang/test/OpenMP/parallel_for_simd_lastprivate_messages.cpp index 4dca240503975..8a6e8249aead5 100644 --- a/clang/test/OpenMP/parallel_for_simd_lastprivate_messages.cpp +++ b/clang/test/OpenMP/parallel_for_simd_lastprivate_messages.cpp @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-version=45 -fopenmp %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-version=50 -fopenmp %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-version=45 -fopenmp-simd %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-version=50 -fopenmp-simd %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd %s -Wuninitialized extern int omp_default_mem_alloc; void foo() { diff --git a/clang/test/OpenMP/parallel_for_simd_loop_messages.cpp b/clang/test/OpenMP/parallel_for_simd_loop_messages.cpp index cbc14ed6663a9..f55453f6e8e15 100644 --- a/clang/test/OpenMP/parallel_for_simd_loop_messages.cpp +++ b/clang/test/OpenMP/parallel_for_simd_loop_messages.cpp @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -fsyntax-only -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp4 %s -Wuninitialized -// RUN: %clang_cc1 -fsyntax-only -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp5 %s -Wuninitialized +// RUN: %clang_cc1 -fsyntax-only -fopenmp -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp5 %s -Wuninitialized // RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp4 %s -Wuninitialized -// RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp5 %s -Wuninitialized +// 
RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp5 %s -Wuninitialized class S { int a; diff --git a/clang/test/OpenMP/parallel_for_simd_misc_messages.c b/clang/test/OpenMP/parallel_for_simd_misc_messages.c index d2b088a29c57a..d39f1b12211ea 100644 --- a/clang/test/OpenMP/parallel_for_simd_misc_messages.c +++ b/clang/test/OpenMP/parallel_for_simd_misc_messages.c @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -fsyntax-only -fopenmp -fopenmp-version=45 -verify=expected,omp45 %s -Wuninitialized -// RUN: %clang_cc1 -fsyntax-only -fopenmp -fopenmp-version=50 -verify=expected,omp50 %s -Wuninitialized +// RUN: %clang_cc1 -fsyntax-only -fopenmp -verify=expected,omp50 %s -Wuninitialized // RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -fopenmp-version=45 -verify=expected,omp45 %s -Wuninitialized -// RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -fopenmp-version=50 -verify=expected,omp50 %s -Wuninitialized +// RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -verify=expected,omp50 %s -Wuninitialized // expected-error@+1 {{unexpected OpenMP directive '#pragma omp parallel for simd'}} #pragma omp parallel for simd diff --git a/clang/test/OpenMP/parallel_for_simd_reduction_messages.cpp b/clang/test/OpenMP/parallel_for_simd_reduction_messages.cpp index f8776713e20e8..d5daa64ca4145 100644 --- a/clang/test/OpenMP/parallel_for_simd_reduction_messages.cpp +++ b/clang/test/OpenMP/parallel_for_simd_reduction_messages.cpp @@ -1,16 +1,16 @@ // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -std=c++98 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -std=c++11 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -std=c++98 -o - %s -Wuninitialized -// 
RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -std=c++11 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -std=c++98 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -std=c++11 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -std=c++98 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -std=c++11 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -std=c++98 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -std=c++11 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -std=c++98 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -std=c++11 -o - %s -Wuninitialized extern int omp_default_mem_alloc; void xxx(int argc) { diff --git a/clang/test/OpenMP/parallel_for_simd_scan_codegen.cpp b/clang/test/OpenMP/parallel_for_simd_scan_codegen.cpp index 42fb9de348649..56f7ea8e50317 100644 --- a/clang/test/OpenMP/parallel_for_simd_scan_codegen.cpp +++ b/clang/test/OpenMP/parallel_for_simd_scan_codegen.cpp @@ -1,10 +1,10 @@ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -std=c++11 -include-pch %t -verify %s 
-emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s // SIMD-ONLY0-NOT: {{__kmpc|__tgt}} // expected-no-diagnostics #ifndef HEADER diff --git a/clang/test/OpenMP/parallel_master_ast_print.cpp b/clang/test/OpenMP/parallel_master_ast_print.cpp index e521488fafc30..c6cba2d2b5e3c 100644 --- a/clang/test/OpenMP/parallel_master_ast_print.cpp +++ b/clang/test/OpenMP/parallel_master_ast_print.cpp @@ -1,10 +1,10 @@ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -ast-print %s | FileCheck %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s +// RUN: %clang_cc1 
-fopenmp -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -ast-print %s | FileCheck %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s // expected-no-diagnostics #ifndef HEADER diff --git a/clang/test/OpenMP/parallel_master_reduction_messages.cpp b/clang/test/OpenMP/parallel_master_reduction_messages.cpp index b74d3fe936e83..2313181fe76c7 100644 --- a/clang/test/OpenMP/parallel_master_reduction_messages.cpp +++ b/clang/test/OpenMP/parallel_master_reduction_messages.cpp @@ -1,16 +1,16 @@ // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -ferror-limit 150 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -std=c++98 -ferror-limit 150 -o - %s 
-Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -ferror-limit 150 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized extern int omp_default_mem_alloc; void xxx(int argc) { diff --git a/clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp b/clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp index ab76987a59c93..b235165e9a55f 100644 --- a/clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp @@ -1,10 +1,10 @@ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-unknown-linux -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -triple x86_64-unknown-linux -fexceptions -fcxx-exceptions -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-unknown-linux -fexceptions -fcxx-exceptions 
-std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-linux -fexceptions -fcxx-exceptions -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-linux -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s // SIMD-ONLY0-NOT: {{__kmpc|__tgt}} // expected-no-diagnostics #ifndef HEADER diff --git a/clang/test/OpenMP/parallel_master_taskloop_ast_print.cpp b/clang/test/OpenMP/parallel_master_taskloop_ast_print.cpp index d58121e86667a..35fdddc0a7bed 100644 --- a/clang/test/OpenMP/parallel_master_taskloop_ast_print.cpp +++ b/clang/test/OpenMP/parallel_master_taskloop_ast_print.cpp @@ -1,10 +1,10 @@ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 
-ast-print %s | FileCheck %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -ast-print %s | FileCheck %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s // expected-no-diagnostics #ifndef HEADER diff --git a/clang/test/OpenMP/parallel_master_taskloop_codegen.cpp b/clang/test/OpenMP/parallel_master_taskloop_codegen.cpp index 219905ed278bb..578712c8f8091 100644 --- a/clang/test/OpenMP/parallel_master_taskloop_codegen.cpp +++ b/clang/test/OpenMP/parallel_master_taskloop_codegen.cpp @@ -1,10 +1,10 @@ -// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -fopenmp-version=50 -x c++ -emit-llvm %s -o - | FileCheck %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -x c++ -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x 
c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp-simd -fopenmp-version=50 -x c++ -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp-simd -x c++ -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s // SIMD-ONLY0-NOT: {{__kmpc|__tgt}} // expected-no-diagnostics #ifndef HEADER diff --git a/clang/test/OpenMP/parallel_master_taskloop_lastprivate_messages.cpp b/clang/test/OpenMP/parallel_master_taskloop_lastprivate_messages.cpp index 1c383937e8052..7d5dfee7e5929 100644 --- a/clang/test/OpenMP/parallel_master_taskloop_lastprivate_messages.cpp +++ b/clang/test/OpenMP/parallel_master_taskloop_lastprivate_messages.cpp @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-version=45 -fopenmp %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-version=50 -fopenmp %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-version=45 -fopenmp-simd %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-version=50 -fopenmp-simd %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd %s -Wuninitialized typedef void **omp_allocator_handle_t; extern const omp_allocator_handle_t omp_null_allocator; diff --git 
a/clang/test/OpenMP/parallel_master_taskloop_loop_messages.cpp b/clang/test/OpenMP/parallel_master_taskloop_loop_messages.cpp index 2f0edc4539127..f94592c96447c 100644 --- a/clang/test/OpenMP/parallel_master_taskloop_loop_messages.cpp +++ b/clang/test/OpenMP/parallel_master_taskloop_loop_messages.cpp @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -fsyntax-only -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp4 %s -Wuninitialized -// RUN: %clang_cc1 -fsyntax-only -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp5 %s -Wuninitialized +// RUN: %clang_cc1 -fsyntax-only -fopenmp -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp5 %s -Wuninitialized // RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp4 %s -Wuninitialized -// RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp5 %s -Wuninitialized +// RUN: %clang_cc1 -fsyntax-only -fopenmp-simd -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify=expected,omp5 %s -Wuninitialized class S { int a; diff --git a/clang/test/OpenMP/parallel_master_taskloop_reduction_messages.cpp b/clang/test/OpenMP/parallel_master_taskloop_reduction_messages.cpp index aa4aa485369a5..c5ad633eac72b 100644 --- a/clang/test/OpenMP/parallel_master_taskloop_reduction_messages.cpp +++ b/clang/test/OpenMP/parallel_master_taskloop_reduction_messages.cpp @@ -1,16 +1,16 @@ // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -ferror-limit 150 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp 
-fopenmp-version=50 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -ferror-limit 150 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -std=c++98 -ferror-limit 150 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -std=c++11 -ferror-limit 150 -o - %s -Wuninitialized typedef void **omp_allocator_handle_t; extern const omp_allocator_handle_t omp_null_allocator; diff --git a/clang/test/OpenMP/target_codegen.cpp b/clang/test/OpenMP/target_codegen.cpp index 55b03aae00a58..c8570bdf6b65f 100644 --- a/clang/test/OpenMP/target_codegen.cpp +++ 
b/clang/test/OpenMP/target_codegen.cpp @@ -706,6 +706,8 @@ int bar(int n){ // CHECK: [[IFEND]] +// OMP45: define internal void @__omp_offloading_{{.+}}_{{.+}}bar{{.+}}_l838(i[[SZ]] %{{.+}}) + // OMP45: define {{.*}}@{{.*}}zee{{.*}} // OMP45: [[LOCAL_THIS:%.+]] = alloca [[S2]]* @@ -803,6 +805,7 @@ int bar(int n){ // CHECK-DAG: load i16, i16* [[REF_AA]] // CHECK-DAG: getelementptr inbounds [10 x i32], [10 x i32]* [[REF_B]], i[[SZ]] 0, i[[SZ]] 2 +// OMP50: define internal void @__omp_offloading_{{.+}}_{{.+}}bar{{.+}}_l838(i[[SZ]] %{{.+}}) // OMP50: define {{.*}}@{{.*}}zee{{.*}} @@ -833,7 +836,11 @@ int bar(int n){ void bar () { #define pragma_target _Pragma("omp target") pragma_target -{} +{ + global = 0; +#pragma omp parallel shared(global) + global = 1; +} } class S2 { diff --git a/clang/test/OpenMP/target_parallel_codegen_registration.cpp b/clang/test/OpenMP/target_parallel_codegen_registration.cpp index 61534d898912d..6a4a8d944cb8c 100644 --- a/clang/test/OpenMP/target_parallel_codegen_registration.cpp +++ b/clang/test/OpenMP/target_parallel_codegen_registration.cpp @@ -40,6 +40,47 @@ // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY2 %s // SIMD-ONLY2-NOT: {{__kmpc|__tgt}} +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple 
i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s + +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// SIMD-ONLY0-NOT: {{__kmpc|__tgt}} + +// Test target parallel codegen - host bc file has to be created first. 
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s -check-prefix=TCHECK +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s -check-prefix=TCHECK +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK + +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s 
-fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// SIMD-ONLY1-NOT: {{__kmpc|__tgt}} + +// Check that no target code is emitted if no omptests flag was provided. 
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK-NTARGET + +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY2 %s +// SIMD-ONLY2-NOT: {{__kmpc|__tgt}} + // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -401,31 +442,31 @@ int bar(int a){ // Check metadata is properly generated: // CHECK: !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 204, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 254, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 270, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 276, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 287, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 293, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 396, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 299, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 293, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 299, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 287, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 229, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 245, i32 
{{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 295, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 311, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 317, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 328, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 334, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 437, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 340, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 334, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 340, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 328, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 270, i32 {{[0-9]+}}} // TCHECK: !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 204, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 254, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 270, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 276, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 287, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 293, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 
[[FILEID]], !"_Z3bari", i32 396, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 299, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 293, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 299, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 287, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 229, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 245, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 295, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 311, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 317, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 328, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 334, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 437, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 340, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 334, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 340, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 328, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 270, i32 {{[0-9]+}}} #endif diff --git a/clang/test/OpenMP/target_parallel_for_codegen_registration.cpp b/clang/test/OpenMP/target_parallel_for_codegen_registration.cpp index 
de44331c8e1af..623635fb3543e 100644 --- a/clang/test/OpenMP/target_parallel_for_codegen_registration.cpp +++ b/clang/test/OpenMP/target_parallel_for_codegen_registration.cpp @@ -40,6 +40,47 @@ // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY2 %s // SIMD-ONLY2-NOT: {{__kmpc|__tgt}} +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s + +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm 
%s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// SIMD-ONLY0-NOT: {{__kmpc|__tgt}} + +// Test target parallel for codegen - host bc file has to be created first. +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s -check-prefix=TCHECK +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s -check-prefix=TCHECK +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown 
-fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK + +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// SIMD-ONLY1-NOT: {{__kmpc|__tgt}} + +// Check that no target code is 
emitted if no omptests flag was provided. +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK-NTARGET + +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY2 %s +// SIMD-ONLY2-NOT: {{__kmpc|__tgt}} + // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -411,31 +452,31 @@ int bar(int a){ // Check metadata is properly generated: // CHECK: !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 204, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 256, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 274, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 281, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 293, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 300, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 405, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 307, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 300, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 307, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 293, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 230, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 
[[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 245, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 297, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 315, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 322, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 334, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 341, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 446, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 348, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 341, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 348, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 334, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 271, i32 {{[0-9]+}}} // TCHECK: !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 204, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 256, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 274, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 281, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 293, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 300, i32 {{[0-9]+}}} 
-// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 405, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 307, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 300, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 307, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 293, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 230, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 245, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 297, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 315, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 322, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 334, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 341, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 446, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 348, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 341, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 348, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 334, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 271, i32 {{[0-9]+}}} #endif diff --git a/clang/test/OpenMP/target_parallel_for_if_messages.cpp b/clang/test/OpenMP/target_parallel_for_if_messages.cpp 
index 99bb13d24f2b8..857490be4d54c 100644 --- a/clang/test/OpenMP/target_parallel_for_if_messages.cpp +++ b/clang/test/OpenMP/target_parallel_for_if_messages.cpp @@ -2,6 +2,9 @@ // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 %s -Wuninitialized +// RUN: %clang_cc1 -verify -fopenmp %s -Wuninitialized +// RUN: %clang_cc1 -verify -fopenmp-simd %s -Wuninitialized + void foo() { } diff --git a/clang/test/OpenMP/target_parallel_for_simd_codegen_registration.cpp b/clang/test/OpenMP/target_parallel_for_simd_codegen_registration.cpp index 824040de6fea2..c2a63a09f26ef 100644 --- a/clang/test/OpenMP/target_parallel_for_simd_codegen_registration.cpp +++ b/clang/test/OpenMP/target_parallel_for_simd_codegen_registration.cpp @@ -40,6 +40,47 @@ // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY2 %s // SIMD-ONLY2-NOT: {{__kmpc|__tgt}} +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s + +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown 
-fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// SIMD-ONLY0-NOT: {{__kmpc|__tgt}} + +// Test target parallel for simd codegen - host bc file has to be created first. 
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s -check-prefix=TCHECK +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s -check-prefix=TCHECK +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK + +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s 
-fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// SIMD-ONLY1-NOT: {{__kmpc|__tgt}} + +// Check that no target code is emitted if no omptests flag was provided. 
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK-NTARGET + +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY2 %s +// SIMD-ONLY2-NOT: {{__kmpc|__tgt}} + // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -411,31 +452,31 @@ int bar(int a){ // Check metadata is properly generated: // CHECK: !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 204, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 256, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 274, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 281, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 293, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 300, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 405, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 307, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 300, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 307, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 293, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 230, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 245, i32 
{{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 297, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 315, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 322, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 334, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 341, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 446, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 348, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 341, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 348, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 334, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 271, i32 {{[0-9]+}}} // TCHECK: !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 204, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 256, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 274, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 281, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 293, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 300, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 
[[FILEID]], !"_Z3bari", i32 405, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 307, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 300, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 307, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 293, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 230, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 245, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 297, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 315, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 322, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 334, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 341, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 446, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 348, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 341, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 348, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 334, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 271, i32 {{[0-9]+}}} #endif diff --git a/clang/test/OpenMP/target_parallel_if_codegen.cpp b/clang/test/OpenMP/target_parallel_if_codegen.cpp index 32b79e4e4b381..91c697fcb4d42 100644 --- 
a/clang/test/OpenMP/target_parallel_if_codegen.cpp +++ b/clang/test/OpenMP/target_parallel_if_codegen.cpp @@ -34,6 +34,41 @@ // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s // SIMD-ONLY1-NOT: {{__kmpc|__tgt}} +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 + +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown 
-fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// SIMD-ONLY0-NOT: {{__kmpc|__tgt}} + +// Test target codegen - host bc file has to be created first. +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64 +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o 
- | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32 +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32 + +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path 
%t-x86-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// SIMD-ONLY1-NOT: {{__kmpc|__tgt}} + // expected-no-diagnostics #ifndef HEADER #define HEADER diff --git a/clang/test/OpenMP/target_simd_codegen_registration.cpp b/clang/test/OpenMP/target_simd_codegen_registration.cpp index f756cb48917c2..eacea9891d5bb 100644 --- a/clang/test/OpenMP/target_simd_codegen_registration.cpp +++ b/clang/test/OpenMP/target_simd_codegen_registration.cpp @@ -40,6 +40,47 @@ // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY2 %s // SIMD-ONLY2-NOT: {{__kmpc|__tgt}} +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s + +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix 
SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// SIMD-ONLY0-NOT: {{__kmpc|__tgt}} + +// Test target simd codegen - host bc file has to be created first. +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s -check-prefix=TCHECK +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o 
%t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s -check-prefix=TCHECK +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK + +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -fopenmp-simd 
-x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// SIMD-ONLY1-NOT: {{__kmpc|__tgt}} + +// Check that no target code is emitted if no omptests flag was provided. +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK-NTARGET + +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY2 %s +// SIMD-ONLY2-NOT: {{__kmpc|__tgt}} + // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -411,31 +452,31 @@ int bar(int a){ // Check metadata is properly generated: // CHECK: !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 204, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 256, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 274, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 281, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 293, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 300, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 405, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 307, i32 
{{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 300, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 307, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 293, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 230, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 245, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 297, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 315, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 322, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 334, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 341, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 446, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 348, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 341, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 348, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 334, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 271, i32 {{[0-9]+}}} // TCHECK: !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 204, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, 
i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 256, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 274, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 281, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 293, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 300, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 405, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 307, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 300, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 307, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 293, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 230, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 245, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 297, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 315, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 322, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 334, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 341, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 446, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 348, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], 
!"_ZN2STILi1000EEC1Ev", i32 341, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 348, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 334, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 271, i32 {{[0-9]+}}} #endif diff --git a/clang/test/OpenMP/target_teams_codegen.cpp b/clang/test/OpenMP/target_teams_codegen.cpp index 985dc42692f86..3e830b29f7f51 100644 --- a/clang/test/OpenMP/target_teams_codegen.cpp +++ b/clang/test/OpenMP/target_teams_codegen.cpp @@ -34,6 +34,41 @@ // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY1 %s // SIMD-ONLY1-NOT: {{__kmpc|__tgt}} +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ 
-triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CHECK --check-prefix CHECK-32 + +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY0 %s +// SIMD-ONLY0-NOT: {{__kmpc|__tgt}} + +// Test target codegen - host bc file has to be created first. 
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix TCHECK --check-prefix TCHECK-64 +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix TCHECK --check-prefix TCHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix TCHECK --check-prefix TCHECK-32 +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix TCHECK --check-prefix TCHECK-32 + +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown 
-fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY1 %s +// SIMD-ONLY1-NOT: {{__kmpc|__tgt}} + // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -477,13 +512,13 @@ int foo(int n) { // CHECK: define internal {{.*}}void [[OMP_OUTLINED4]](i32* noalias %.global_tid., i32* 
noalias %.bound_tid., i[[SZ]] %{{.+}}, [10 x float]* {{.+}}, i[[SZ]] %{{.+}}, float* {{.+}}, [5 x [10 x double]]* {{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, double* {{.+}}, [[TT]]* {{.+}}) // To reduce complexity, we're only going as far as validating the signature of the outlined parallel function. -// CHECK: define {{.*}}void @__omp_offloading_{{.*}}foo{{.*}}_l334(i[[SZ]] %{{.+}}) +// CHECK: define {{.*}}void @__omp_offloading_{{.*}}foo{{.*}}_l369(i[[SZ]] %{{.+}}) // CHECK: define internal void {{@.+}}(i32* {{.+}}, i32* {{.+}}, i[[SZ]] %{{.+}}) -// CHECK: define {{.*}}void @__omp_offloading_{{.*}}foo{{.*}}_l337(i[[SZ]] %{{.+}}) +// CHECK: define {{.*}}void @__omp_offloading_{{.*}}foo{{.*}}_l372(i[[SZ]] %{{.+}}) // CHECK: define internal void {{@.+}}(i32* {{.+}}, i32* {{.+}}, i32* nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %{{.+}}) void bazzzz(int n, int f[n]) { -// CHECK: define internal void @__omp_offloading_{{.+}}bazzzz{{.+}}_l489(i[[SZ]] %{{[^,]+}}) +// CHECK: define internal void @__omp_offloading_{{.+}}bazzzz{{.+}}_l524(i[[SZ]] %{{[^,]+}}) // CHECK: [[VLA:%.+]] = load i[[SZ]], i[[SZ]]* % // CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_teams(%struct.ident_t* @{{.+}}, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i[[SZ]])* @{{.+}} to void (i32*, i32*, ...)*), i[[SZ]] [[VLA]]) #pragma omp target teams private(f) diff --git a/clang/test/OpenMP/target_teams_codegen_registration.cpp b/clang/test/OpenMP/target_teams_codegen_registration.cpp index e29843b117369..1d7fe9f0926f5 100644 --- a/clang/test/OpenMP/target_teams_codegen_registration.cpp +++ b/clang/test/OpenMP/target_teams_codegen_registration.cpp @@ -40,6 +40,47 @@ // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY2 %s // SIMD-ONLY2-NOT: {{__kmpc|__tgt}} +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s + +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: 
%clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// SIMD-ONLY0-NOT: {{__kmpc|__tgt}} + +// Test target teams codegen - host bc file has to be created first. +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s -check-prefix=TCHECK +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device 
-fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s -check-prefix=TCHECK +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK + +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o 
%t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// SIMD-ONLY1-NOT: {{__kmpc|__tgt}} + +// Check that no target code is emitted if no omptests flag was provided. +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK-NTARGET + +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY2 %s +// SIMD-ONLY2-NOT: {{__kmpc|__tgt}} + // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -401,31 +442,31 @@ int bar(int a){ // Check metadata is properly generated: // CHECK: !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 204, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 254, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 270, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 276, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 287, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 293, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 396, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 299, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 293, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], 
i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 299, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 287, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 229, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 245, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 295, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 311, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 317, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 328, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 334, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 437, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 340, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 334, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 340, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 328, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 270, i32 {{[0-9]+}}} // TCHECK: !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 204, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 254, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 270, i32 
{{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 276, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 287, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 293, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 396, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 299, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 293, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 299, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 287, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 229, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 245, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 295, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 311, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 317, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 328, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 334, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 437, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 340, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 334, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 340, i32 {{[0-9]+}}} +// 
TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 328, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 270, i32 {{[0-9]+}}} #endif diff --git a/clang/test/OpenMP/target_teams_distribute_codegen_registration.cpp b/clang/test/OpenMP/target_teams_distribute_codegen_registration.cpp index d260f918b41e9..c2289b486d2e7 100644 --- a/clang/test/OpenMP/target_teams_distribute_codegen_registration.cpp +++ b/clang/test/OpenMP/target_teams_distribute_codegen_registration.cpp @@ -40,6 +40,47 @@ // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY2 %s // SIMD-ONLY2-NOT: {{__kmpc|__tgt}} +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s + +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown 
-fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// SIMD-ONLY0-NOT: {{__kmpc|__tgt}} + +// Test target teams distribute codegen - host bc file has to be created first. +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s -check-prefix=TCHECK +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown 
-fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s -check-prefix=TCHECK +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK + +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu 
-emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// SIMD-ONLY1-NOT: {{__kmpc|__tgt}} + +// Check that no target code is emitted if no omptests flag was provided. +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK-NTARGET + +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY2 %s +// SIMD-ONLY2-NOT: {{__kmpc|__tgt}} + // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -411,31 +452,31 @@ int bar(int a){ // Check metadata is properly generated: // CHECK: !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 204, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 256, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 274, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 281, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 293, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 300, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 405, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 307, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], 
!"_ZN2STILi1000EEC1Ev", i32 300, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 307, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 293, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 230, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 245, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 297, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 315, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 322, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 334, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 341, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 446, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 348, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 341, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 348, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 334, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 271, i32 {{[0-9]+}}} // TCHECK: !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 204, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 256, i32 {{[0-9]+}}} 
-// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 274, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 281, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 293, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 300, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 405, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 307, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 300, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 307, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 293, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 230, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 245, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 297, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 315, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 322, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 334, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 341, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 446, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 348, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 341, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 
[[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 348, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 334, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 271, i32 {{[0-9]+}}} #endif diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen_registration.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen_registration.cpp index 227ca5c8eb91b..16fa64b99e942 100644 --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen_registration.cpp +++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen_registration.cpp @@ -40,6 +40,47 @@ // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY2 %s // SIMD-ONLY2-NOT: {{__kmpc|__tgt}} +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s + +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | 
FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// SIMD-ONLY0-NOT: {{__kmpc|__tgt}} + +// Test target teams distribute parallel for simd codegen - host bc file has to be created first. 
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s -check-prefix=TCHECK +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s -check-prefix=TCHECK +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK + +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s 
-fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// SIMD-ONLY1-NOT: {{__kmpc|__tgt}} + +// Check that no target code is emitted if no omptests flag was provided. 
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK-NTARGET + +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY2 %s +// SIMD-ONLY2-NOT: {{__kmpc|__tgt}} + // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -411,31 +452,31 @@ int bar(int a){ // Check metadata is properly generated: // CHECK: !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 204, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 256, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 274, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 281, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 293, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 300, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 405, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 307, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 300, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 307, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 293, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 230, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 245, i32 
{{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 297, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 315, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 322, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 334, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 341, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 446, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 348, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 341, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 348, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 334, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 271, i32 {{[0-9]+}}} // TCHECK: !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 204, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 256, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 274, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 281, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 293, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 300, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 
[[FILEID]], !"_Z3bari", i32 405, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 307, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 300, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 307, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 293, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 230, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 245, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 297, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 315, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 322, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 334, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 341, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 446, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 348, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 341, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 348, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 334, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 271, i32 {{[0-9]+}}} #endif diff --git a/clang/test/OpenMP/target_teams_distribute_simd_codegen_registration.cpp b/clang/test/OpenMP/target_teams_distribute_simd_codegen_registration.cpp index 
96b72aba60320..d9aaa16640994 100644 --- a/clang/test/OpenMP/target_teams_distribute_simd_codegen_registration.cpp +++ b/clang/test/OpenMP/target_teams_distribute_simd_codegen_registration.cpp @@ -40,6 +40,47 @@ // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY2 %s // SIMD-ONLY2-NOT: {{__kmpc|__tgt}} +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s + +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown 
-fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// SIMD-ONLY0-NOT: {{__kmpc|__tgt}} + +// Test target teams distribute simd codegen - host bc file has to be created first. +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s -check-prefix=TCHECK +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s -check-prefix=TCHECK +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s +// RUN: 
%clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK + +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// 
SIMD-ONLY1-NOT: {{__kmpc|__tgt}} + +// Check that no target code is emitted if no omptests flag was provided. +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK-NTARGET + +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY2 %s +// SIMD-ONLY2-NOT: {{__kmpc|__tgt}} + // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -411,32 +452,32 @@ int bar(int a){ // Check metadata is properly generated: // CHECK: !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 204, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 256, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 274, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 281, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 293, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 300, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 405, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 307, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 300, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 307, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 293, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 230, i32 {{[0-9]+}}} 
+// CHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 245, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 297, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 315, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 322, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 334, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 341, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 446, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 348, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 341, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 348, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 334, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 271, i32 {{[0-9]+}}} // TCHECK: !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 204, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 256, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 274, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 281, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 293, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 
[[FILEID]], !"_ZN2STILi100EEC1Ev", i32 300, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 405, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 307, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 300, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 307, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 293, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 230, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 245, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 297, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 315, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 322, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 334, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 341, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 446, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 348, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 341, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 348, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 334, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 271, i32 {{[0-9]+}}} // TCHECK-DAG: !{!"llvm.loop.vectorize.enable", i1 true} // CHECK-DAG: 
!{!"llvm.loop.vectorize.enable", i1 true} diff --git a/clang/test/PCH/cxx-invalid-destructor.cpp b/clang/test/PCH/cxx-invalid-destructor.cpp new file mode 100644 index 0000000000000..fc89cf1f3dfc1 --- /dev/null +++ b/clang/test/PCH/cxx-invalid-destructor.cpp @@ -0,0 +1,4 @@ +// RUN: %clang_cc1 -x c++ -std=c++11 -emit-pch -o %t %S/cxx-invalid-destructor.h -fallow-pch-with-compiler-errors +// RUN: %clang_cc1 -x c++ -std=c++11 -include-pch %t %S/cxx-invalid-destructor.cpp -fsyntax-only -fno-validate-pch + +Foo f; diff --git a/clang/test/PCH/cxx-invalid-destructor.h b/clang/test/PCH/cxx-invalid-destructor.h new file mode 100644 index 0000000000000..59095a37c203e --- /dev/null +++ b/clang/test/PCH/cxx-invalid-destructor.h @@ -0,0 +1,7 @@ +struct Base { + ~Base(); +}; + +struct Foo : public Base { + ~Base(); +}; diff --git a/clang/test/Preprocessor/init-ppc64.c b/clang/test/Preprocessor/init-ppc64.c index 3550af3b680b9..cffd82fdf3c52 100644 --- a/clang/test/Preprocessor/init-ppc64.c +++ b/clang/test/Preprocessor/init-ppc64.c @@ -628,6 +628,7 @@ // PPCPOWER10:#define _ARCH_PWR7 1 // PPCPOWER10:#define _ARCH_PWR8 1 // PPCPOWER10:#define _ARCH_PWR9 1 +// PPCPOWER10:#define __MMA__ 1 // // RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu future -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCFUTURE %s // @@ -645,6 +646,10 @@ // PPCFUTURE:#define _ARCH_PWR8 1 // PPCFUTURE:#define _ARCH_PWR9 1 // PPCFUTURE:#define _ARCH_PWR_FUTURE 1 +// PPCFUTURE:#define __MMA__ 1 +// +// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-feature +mma -target-cpu power9 -fno-signed-char < /dev/null | FileCheck -check-prefix PPC-MMA %s +// PPC-MMA:#define __MMA__ 1 // // RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-feature +float128 -target-cpu power9 -fno-signed-char < /dev/null | FileCheck -check-prefix PPC-FLOAT128 %s // PPC-FLOAT128:#define __FLOAT128__ 1 diff --git 
a/clang/test/Preprocessor/init-zos.c b/clang/test/Preprocessor/init-zos.c new file mode 100644 index 0000000000000..50c4ed9e539e0 --- /dev/null +++ b/clang/test/Preprocessor/init-zos.c @@ -0,0 +1,22 @@ +// RUN: %clang_cc1 -E -dM -ffreestanding -triple=s390x-none-zos -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix S390X-ZOS %s +// RUN: %clang_cc1 -x c++ -std=gnu++14 -E -dM -ffreestanding -triple=s390x-none-zos -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix S390X-ZOS -check-prefix S390X-ZOS-GNUXX %s + +// S390X-ZOS-GNUXX:#define _EXT 1 +// S390X-ZOS:#define _LONG_LONG 1 +// S390X-ZOS-GNUXX:#define _MI_BUILTIN 1 +// S390X-ZOS:#define _OPEN_DEFAULT 1 +// S390X-ZOS:#define _UNIX03_WITHDRAWN 1 +// S390X-ZOS-GNUXX:#define _XOPEN_SOURCE 600 +// S390X-ZOS:#define __370__ 1 +// S390X-ZOS:#define __64BIT__ 1 +// S390X-ZOS:#define __BFP__ 1 +// S390X-ZOS:#define __BOOL__ 1 +// S390X-ZOS-GNUXX:#define __DLL__ 1 +// S390X-ZOS:#define __LONGNAME__ 1 +// S390X-ZOS:#define __MVS__ 1 +// S390X-ZOS:#define __THW_370__ 1 +// S390X-ZOS:#define __THW_BIG_ENDIAN__ 1 +// S390X-ZOS:#define __TOS_390__ 1 +// S390X-ZOS:#define __TOS_MVS__ 1 +// S390X-ZOS:#define __XPLINK__ 1 +// S390X-ZOS-GNUXX:#define __wchar_t 1 diff --git a/clang/test/Preprocessor/predefined-arch-macros.c b/clang/test/Preprocessor/predefined-arch-macros.c index abab9274ffbb2..5326596fee93c 100644 --- a/clang/test/Preprocessor/predefined-arch-macros.c +++ b/clang/test/Preprocessor/predefined-arch-macros.c @@ -1629,6 +1629,145 @@ // CHECK_TGL_M64: #define __x86_64 1 // CHECK_TGL_M64: #define __x86_64__ 1 +// RUN: %clang -march=sapphirerapids -m32 -E -dM %s -o - 2>&1 \ +// RUN: -target i386-unknown-linux \ +// RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_SPR_M32 +// CHECK_SPR_M32: #define __AES__ 1 +// CHECK_SPR_M32: #define __AMXBF16__ 1 +// CHECK_SPR_M32: #define __AMXINT8__ 1 +// CHECK_SPR_M32: #define __AMXTILE__ 1 +// CHECK_SPR_M32: #define __AVX2__ 1 +// 
CHECK_SPR_M32: #define __AVX512BF16__ 1 +// CHECK_SPR_M32: #define __AVX512BITALG__ 1 +// CHECK_SPR_M32: #define __AVX512BW__ 1 +// CHECK_SPR_M32: #define __AVX512CD__ 1 +// CHECK_SPR_M32: #define __AVX512DQ__ 1 +// CHECK_SPR_M32: #define __AVX512F__ 1 +// CHECK_SPR_M32: #define __AVX512IFMA__ 1 +// CHECK_SPR_M32: #define __AVX512VBMI2__ 1 +// CHECK_SPR_M32: #define __AVX512VBMI__ 1 +// CHECK_SPR_M32: #define __AVX512VL__ 1 +// CHECK_SPR_M32: #define __AVX512VNNI__ 1 +// CHECK_SPR_M32: #define __AVX512VPOPCNTDQ__ 1 +// CHECK_SPR_M32: #define __AVX__ 1 +// CHECK_SPR_M32: #define __BMI2__ 1 +// CHECK_SPR_M32: #define __BMI__ 1 +// CHECK_SPR_M32: #define __CLDEMOTE__ 1 +// CHECK_SPR_M32: #define __CLFLUSHOPT__ 1 +// CHECK_SPR_M32: #define __CLWB__ 1 +// CHECK_SPR_M32: #define __ENQCMD__ 1 +// CHECK_SPR_M32: #define __F16C__ 1 +// CHECK_SPR_M32: #define __FMA__ 1 +// CHECK_SPR_M32: #define __GFNI__ 1 +// CHECK_SPR_M32: #define __INVPCID__ 1 +// CHECK_SPR_M32: #define __LZCNT__ 1 +// CHECK_SPR_M32: #define __MMX__ 1 +// CHECK_SPR_M32: #define __MOVBE__ 1 +// CHECK_SPR_M32: #define __PCLMUL__ 1 +// CHECK_SPR_M32: #define __PCONFIG__ 1 +// CHECK_SPR_M32: #define __PKU__ 1 +// CHECK_SPR_M32: #define __POPCNT__ 1 +// CHECK_SPR_M32: #define __PRFCHW__ 1 +// CHECK_SPR_M32: #define __PTWRITE__ 1 +// CHECK_SPR_M32: #define __RDPID__ 1 +// CHECK_SPR_M32: #define __RDRND__ 1 +// CHECK_SPR_M32: #define __RDSEED__ 1 +// CHECK_SPR_M32: #define __SERIALIZE__ 1 +// CHECK_SPR_M32: #define __SGX__ 1 +// CHECK_SPR_M32: #define __SHA__ 1 +// CHECK_SPR_M32: #define __SHSTK__ 1 +// CHECK_SPR_M32: #define __SSE2__ 1 +// CHECK_SPR_M32: #define __SSE3__ 1 +// CHECK_SPR_M32: #define __SSE4_1__ 1 +// CHECK_SPR_M32: #define __SSE4_2__ 1 +// CHECK_SPR_M32: #define __SSE__ 1 +// CHECK_SPR_M32: #define __SSSE3__ 1 +// CHECK_SPR_M32: #define __TSXLDTRK__ 1 +// CHECK_SPR_M32: #define __VAES__ 1 +// CHECK_SPR_M32: #define __VPCLMULQDQ__ 1 +// CHECK_SPR_M32: #define __WAITPKG__ 1 +// CHECK_SPR_M32: 
#define __WBNOINVD__ 1 +// CHECK_SPR_M32: #define __XSAVEC__ 1 +// CHECK_SPR_M32: #define __XSAVEOPT__ 1 +// CHECK_SPR_M32: #define __XSAVES__ 1 +// CHECK_SPR_M32: #define __XSAVE__ 1 +// CHECK_SPR_M32: #define __corei7 1 +// CHECK_SPR_M32: #define __corei7__ 1 +// CHECK_SPR_M32: #define __i386 1 +// CHECK_SPR_M32: #define __i386__ 1 +// CHECK_SPR_M32: #define __tune_corei7__ 1 +// CHECK_SPR_M32: #define i386 1 + +// RUN: %clang -march=sapphirerapids -m64 -E -dM %s -o - 2>&1 \ +// RUN: -target i386-unknown-linux \ +// RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_SPR_M64 +// CHECK_SPR_M64: #define __AES__ 1 +// CHECK_SPR_M64: #define __AMXBF16__ 1 +// CHECK_SPR_M64: #define __AMXINT8__ 1 +// CHECK_SPR_M64: #define __AMXTILE__ 1 +// CHECK_SPR_M64: #define __AVX2__ 1 +// CHECK_SPR_M64: #define __AVX512BF16__ 1 +// CHECK_SPR_M64: #define __AVX512BITALG__ 1 +// CHECK_SPR_M64: #define __AVX512BW__ 1 +// CHECK_SPR_M64: #define __AVX512CD__ 1 +// CHECK_SPR_M64: #define __AVX512DQ__ 1 +// CHECK_SPR_M64: #define __AVX512F__ 1 +// CHECK_SPR_M64: #define __AVX512IFMA__ 1 +// CHECK_SPR_M64: #define __AVX512VBMI2__ 1 +// CHECK_SPR_M64: #define __AVX512VBMI__ 1 +// CHECK_SPR_M64: #define __AVX512VL__ 1 +// CHECK_SPR_M64: #define __AVX512VNNI__ 1 +// CHECK_SPR_M64: #define __AVX512VPOPCNTDQ__ 1 +// CHECK_SPR_M64: #define __AVX__ 1 +// CHECK_SPR_M64: #define __BMI2__ 1 +// CHECK_SPR_M64: #define __BMI__ 1 +// CHECK_SPR_M64: #define __CLDEMOTE__ 1 +// CHECK_SPR_M64: #define __CLFLUSHOPT__ 1 +// CHECK_SPR_M64: #define __CLWB__ 1 +// CHECK_SPR_M64: #define __ENQCMD__ 1 +// CHECK_SPR_M64: #define __F16C__ 1 +// CHECK_SPR_M64: #define __FMA__ 1 +// CHECK_SPR_M64: #define __GFNI__ 1 +// CHECK_SPR_M64: #define __INVPCID__ 1 +// CHECK_SPR_M64: #define __LZCNT__ 1 +// CHECK_SPR_M64: #define __MMX__ 1 +// CHECK_SPR_M64: #define __MOVBE__ 1 +// CHECK_SPR_M64: #define __PCLMUL__ 1 +// CHECK_SPR_M64: #define __PCONFIG__ 1 +// CHECK_SPR_M64: #define __PKU__ 1 +// CHECK_SPR_M64: 
#define __POPCNT__ 1 +// CHECK_SPR_M64: #define __PRFCHW__ 1 +// CHECK_SPR_M64: #define __PTWRITE__ 1 +// CHECK_SPR_M64: #define __RDPID__ 1 +// CHECK_SPR_M64: #define __RDRND__ 1 +// CHECK_SPR_M64: #define __RDSEED__ 1 +// CHECK_SPR_M64: #define __SERIALIZE__ 1 +// CHECK_SPR_M64: #define __SGX__ 1 +// CHECK_SPR_M64: #define __SHA__ 1 +// CHECK_SPR_M64: #define __SHSTK__ 1 +// CHECK_SPR_M64: #define __SSE2__ 1 +// CHECK_SPR_M64: #define __SSE3__ 1 +// CHECK_SPR_M64: #define __SSE4_1__ 1 +// CHECK_SPR_M64: #define __SSE4_2__ 1 +// CHECK_SPR_M64: #define __SSE__ 1 +// CHECK_SPR_M64: #define __SSSE3__ 1 +// CHECK_SPR_M64: #define __TSXLDTRK__ 1 +// CHECK_SPR_M64: #define __VAES__ 1 +// CHECK_SPR_M64: #define __VPCLMULQDQ__ 1 +// CHECK_SPR_M64: #define __WAITPKG__ 1 +// CHECK_SPR_M64: #define __WBNOINVD__ 1 +// CHECK_SPR_M64: #define __XSAVEC__ 1 +// CHECK_SPR_M64: #define __XSAVEOPT__ 1 +// CHECK_SPR_M64: #define __XSAVES__ 1 +// CHECK_SPR_M64: #define __XSAVE__ 1 +// CHECK_SPR_M64: #define __amd64 1 +// CHECK_SPR_M64: #define __amd64__ 1 +// CHECK_SPR_M64: #define __corei7 1 +// CHECK_SPR_M64: #define __corei7__ 1 +// CHECK_SPR_M64: #define __tune_corei7__ 1 +// CHECK_SPR_M64: #define __x86_64 1 +// CHECK_SPR_M64: #define __x86_64__ 1 + // RUN: %clang -march=atom -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_ATOM_M32 diff --git a/clang/test/Sema/attr-arm-sve-vector-bits.c b/clang/test/Sema/attr-arm-sve-vector-bits.c index 30ae4c66f72e0..f143037fd6114 100644 --- a/clang/test/Sema/attr-arm-sve-vector-bits.c +++ b/clang/test/Sema/attr-arm-sve-vector-bits.c @@ -102,8 +102,11 @@ void f(int c) { svint8_t ss8; void *sel __attribute__((unused)); - sel = c ? ss8 : fs8; // expected-error {{incompatible operand types ('svint8_t' (aka '__SVInt8_t') and 'fixed_int8_t' (aka '__SVInt8_t'))}} - sel = c ? 
fs8 : ss8; // expected-error {{incompatible operand types ('fixed_int8_t' (aka '__SVInt8_t') and 'svint8_t' (aka '__SVInt8_t'))}} + sel = c ? ss8 : fs8; // expected-error {{cannot convert between a fixed-length and a sizeless vector}} + sel = c ? fs8 : ss8; // expected-error {{cannot convert between a fixed-length and a sizeless vector}} + + sel = fs8 + ss8; // expected-error {{cannot convert between a fixed-length and a sizeless vector}} + sel = ss8 + fs8; // expected-error {{cannot convert between a fixed-length and a sizeless vector}} } // --------------------------------------------------------------------------// @@ -192,14 +195,18 @@ TEST_CAST(bfloat16) TEST_CAST(bool) // Test the implicit conversion only applies to valid types -fixed_int8_t to_fixed_int8_t__from_svuint8_t(svuint8_t x) { return x; } // expected-error {{returning 'svuint8_t' (aka '__SVUint8_t') from a function with incompatible result type 'fixed_int8_t' (aka '__SVInt8_t')}} -fixed_bool_t to_fixed_bool_t__from_svint32_t(svint32_t x) { return x; } // expected-error {{returning 'svint32_t' (aka '__SVInt32_t') from a function with incompatible result type 'fixed_bool_t' (aka '__SVBool_t')}} +fixed_int8_t to_fixed_int8_t__from_svuint8_t(svuint8_t x) { return x; } // expected-error-re {{returning 'svuint8_t' (aka '__SVUint8_t') from a function with incompatible result type 'fixed_int8_t' (vector of {{[0-9]+}} 'signed char' values)}} +fixed_bool_t to_fixed_bool_t__from_svint32_t(svint32_t x) { return x; } // expected-error-re {{returning 'svint32_t' (aka '__SVInt32_t') from a function with incompatible result type 'fixed_bool_t' (vector of {{[0-9]+}} 'unsigned char' values)}} + +// Test conversion between predicate and uint8 is invalid, both have the same +// memory representation. 
+fixed_bool_t to_fixed_bool_t__from_svuint8_t(svuint8_t x) { return x; } // expected-error-re {{returning 'svuint8_t' (aka '__SVUint8_t') from a function with incompatible result type 'fixed_bool_t' (vector of {{[0-9]+}} 'unsigned char' values)}} // Test the implicit conversion only applies to fixed-length types typedef signed int vSInt32 __attribute__((__vector_size__(16))); -svint32_t to_svint32_t_from_gnut(vSInt32 x) { return x; } // expected-error {{returning 'vSInt32' (vector of 4 'int' values) from a function with incompatible result type 'svint32_t' (aka '__SVInt32_t')}} +svint32_t to_svint32_t_from_gnut(vSInt32 x) { return x; } // expected-error-re {{returning 'vSInt32' (vector of {{[0-9]+}} 'int' values) from a function with incompatible result type 'svint32_t' (aka '__SVInt32_t')}} -vSInt32 to_gnut_from_svint32_t(svint32_t x) { return x; } // expected-error {{returning 'svint32_t' (aka '__SVInt32_t') from a function with incompatible result type 'vSInt32' (vector of 4 'int' values)}} +vSInt32 to_gnut_from_svint32_t(svint32_t x) { return x; } // expected-error-re {{returning 'svint32_t' (aka '__SVInt32_t') from a function with incompatible result type 'vSInt32' (vector of {{[0-9]+}} 'int' values)}} // --------------------------------------------------------------------------// // Test the scalable and fixed-length types can be used interchangeably diff --git a/clang/test/SemaCXX/abstract.cpp b/clang/test/SemaCXX/abstract.cpp index 1fda21caea49f..c6ee3c6e4b865 100644 --- a/clang/test/SemaCXX/abstract.cpp +++ b/clang/test/SemaCXX/abstract.cpp @@ -279,7 +279,7 @@ namespace pr12658 { virtual void f() = 0; // expected-note {{unimplemented pure virtual method 'f' in 'C'}} }; - void foo( C& c ) {} + void foo(const C& c ) {} void bar( void ) { foo(C(99)); // expected-error {{allocating an object of abstract class type 'pr12658::C'}} diff --git a/clang/test/SemaCXX/attr-arm-sve-vector-bits.cpp b/clang/test/SemaCXX/attr-arm-sve-vector-bits.cpp new file mode 100644 
index 0000000000000..c8ce257ad3265 --- /dev/null +++ b/clang/test/SemaCXX/attr-arm-sve-vector-bits.cpp @@ -0,0 +1,14 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fsyntax-only -verify -std=c++11 -msve-vector-bits=512 -fallow-half-arguments-and-returns %s +// expected-no-diagnostics + +#define N __ARM_FEATURE_SVE_BITS_EXPERIMENTAL + +typedef __SVInt8_t svint8_t; +typedef svint8_t fixed_int8_t __attribute__((arm_sve_vector_bits(N))); + +template struct S { T var; }; + +S s; + +svint8_t to_svint8_t(fixed_int8_t x) { return x; } +fixed_int8_t from_svint8_t(svint8_t x) { return x; } diff --git a/clang/test/SemaCXX/decl-expr-ambiguity.cpp b/clang/test/SemaCXX/decl-expr-ambiguity.cpp index b77e226b5d012..a15ec397b4aed 100644 --- a/clang/test/SemaCXX/decl-expr-ambiguity.cpp +++ b/clang/test/SemaCXX/decl-expr-ambiguity.cpp @@ -12,7 +12,7 @@ void f() { T(a)->m = 7; int(a)++; // expected-error {{assignment to cast is illegal}} __extension__ int(a)++; // expected-error {{assignment to cast is illegal}} - __typeof(int)(a,5)< struct C { static const int n = f(T()); // expected-error {{no matching function}} diff --git a/clang/test/SemaTemplate/friend.cpp b/clang/test/SemaTemplate/friend.cpp index 777682be3f1b8..283c7732ccff1 100644 --- a/clang/test/SemaTemplate/friend.cpp +++ b/clang/test/SemaTemplate/friend.cpp @@ -122,3 +122,22 @@ namespace qualified_friend_finds_nothing { namespace N { void f(int); } B bi; // ok?! 
} + +namespace PR37556 { + inline namespace N { int x1, x2, y1, y2; } // expected-note 2{{previous}} + struct X { + friend void x1(int); + friend void PR37556::x2(int); // expected-error {{different kind}} + }; + template struct Y { + friend void y1(T); + friend void PR37556::y2(T); // expected-error {{different kind}} + }; + template struct Y; + template struct Z { + friend void z1(T); + friend void PR37556::z2(T); // expected-error {{does not match any}} + }; + inline namespace N { int z1, z2; } + template struct Z; +} diff --git a/clang/tools/clang-format/git-clang-format b/clang/tools/clang-format/git-clang-format index f3cd585e7f4a0..e4dc4cbc1dc9e 100755 --- a/clang/tools/clang-format/git-clang-format +++ b/clang/tools/clang-format/git-clang-format @@ -148,7 +148,8 @@ def main(): for filename in changed_lines: print(' %s' % filename) if not changed_lines: - print('no modified files to format') + if opts.verbose >= 0: + print('no modified files to format') return # The computed diff outputs absolute paths, so we must cd before accessing # those files. 
diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp index 9cab6cca0dc42..51391d2216268 100644 --- a/clang/unittests/AST/ASTImporterTest.cpp +++ b/clang/unittests/AST/ASTImporterTest.cpp @@ -5803,6 +5803,7 @@ struct SourceWithCompletedTagList : clang::ExternalASTSource { Record->completeDefinition(); CompletedTags.push_back(Tag); } + using clang::ExternalASTSource::CompleteType; }; TEST_P(ImportWithExternalSource, CompleteRecordBeforeImporting) { diff --git a/clang/unittests/Tooling/HeaderIncludesTest.cpp b/clang/unittests/Tooling/HeaderIncludesTest.cpp index d38104fe40ecd..37007fbfb65e9 100644 --- a/clang/unittests/Tooling/HeaderIncludesTest.cpp +++ b/clang/unittests/Tooling/HeaderIncludesTest.cpp @@ -40,7 +40,7 @@ class HeaderIncludesTest : public ::testing::Test { return *Result; } - const std::string FileName = "fix.cpp"; + std::string FileName = "fix.cpp"; IncludeStyle Style = format::getLLVMStyle().IncludeStyle; }; @@ -102,6 +102,15 @@ TEST_F(HeaderIncludesTest, InsertAfterMainHeader) { Style = format::getGoogleStyle(format::FormatStyle::LanguageKind::LK_Cpp) .IncludeStyle; EXPECT_EQ(Expected, insert(Code, "")); + + FileName = "fix.cu.cpp"; + EXPECT_EQ(Expected, insert(Code, "")); + + FileName = "fix_test.cu.cpp"; + EXPECT_EQ(Expected, insert(Code, "")); + + FileName = "bar.cpp"; + EXPECT_NE(Expected, insert(Code, "")) << "Not main header"; } TEST_F(HeaderIncludesTest, InsertBeforeSystemHeaderLLVM) { diff --git a/clang/unittests/Tooling/Syntax/BuildTreeTest.cpp b/clang/unittests/Tooling/Syntax/BuildTreeTest.cpp index 994dd68028ea6..a07187e22e930 100644 --- a/clang/unittests/Tooling/Syntax/BuildTreeTest.cpp +++ b/clang/unittests/Tooling/Syntax/BuildTreeTest.cpp @@ -24,27 +24,27 @@ int main() {} void foo() {} )cpp", R"txt( -*: TranslationUnit +TranslationUnit Detached |-SimpleDeclaration -| |-int -| |-SimpleDeclarator -| | |-main +| |-'int' +| |-SimpleDeclarator Declarator +| | |-'main' | | `-ParametersAndQualifiers -| 
| |-( -| | `-) +| | |-'(' OpenParen +| | `-')' CloseParen | `-CompoundStatement -| |-{ -| `-} +| |-'{' OpenParen +| `-'}' CloseParen `-SimpleDeclaration - |-void - |-SimpleDeclarator - | |-foo + |-'void' + |-SimpleDeclarator Declarator + | |-'foo' | `-ParametersAndQualifiers - | |-( - | `-) + | |-'(' OpenParen + | `-')' CloseParen `-CompoundStatement - |-{ - `-} + |-'{' OpenParen + `-'}' CloseParen )txt")); } @@ -55,20 +55,20 @@ int a; int b = 42; )cpp", R"txt( -*: TranslationUnit +TranslationUnit Detached |-SimpleDeclaration -| |-int -| |-SimpleDeclarator -| | `-a -| `-; +| |-'int' +| |-SimpleDeclarator Declarator +| | `-'a' +| `-';' `-SimpleDeclaration - |-int - |-SimpleDeclarator - | |-b - | |-= + |-'int' + |-SimpleDeclarator Declarator + | |-'b' + | |-'=' | `-IntegerLiteralExpression - | `-42 - `-; + | `-'42' LiteralToken + `-';' )txt")); } @@ -78,26 +78,27 @@ TEST_P(SyntaxTreeTest, SimpleFunction) { void foo(int a, int b) {} )cpp", R"txt( -*: TranslationUnit +TranslationUnit Detached `-SimpleDeclaration - |-void - |-SimpleDeclarator - | |-foo + |-'void' + |-SimpleDeclarator Declarator + | |-'foo' | `-ParametersAndQualifiers - | |-( - | |-SimpleDeclaration - | | |-int - | | `-SimpleDeclarator - | | `-a - | |-, - | |-SimpleDeclaration - | | |-int - | | `-SimpleDeclarator - | | `-b - | `-) + | |-'(' OpenParen + | |-ParameterDeclarationList Parameters + | | |-SimpleDeclaration ListElement + | | | |-'int' + | | | `-SimpleDeclarator Declarator + | | | `-'a' + | | |-',' ListDelimiter + | | `-SimpleDeclaration ListElement + | | |-'int' + | | `-SimpleDeclarator Declarator + | | `-'b' + | `-')' CloseParen `-CompoundStatement - |-{ - `-} + |-'{' OpenParen + `-'}' CloseParen )txt")); } @@ -110,36 +111,36 @@ void test() { } )cpp", {R"txt( -IfStatement -|-if -|-( +IfStatement Statement +|-'if' IntroducerKeyword +|-'(' |-IntegerLiteralExpression -| `-1 -|-) -`-CompoundStatement - |-{ - `-} +| `-'1' LiteralToken +|-')' +`-CompoundStatement ThenStatement + |-'{' OpenParen + 
`-'}' CloseParen )txt", R"txt( -IfStatement -|-if -|-( +IfStatement Statement +|-'if' IntroducerKeyword +|-'(' |-IntegerLiteralExpression -| `-1 -|-) -|-CompoundStatement -| |-{ -| `-} -|-else -`-IfStatement - |-if - |-( +| `-'1' LiteralToken +|-')' +|-CompoundStatement ThenStatement +| |-'{' OpenParen +| `-'}' CloseParen +|-'else' ElseKeyword +`-IfStatement ElseStatement + |-'if' IntroducerKeyword + |-'(' |-IntegerLiteralExpression - | `-0 - |-) - `-CompoundStatement - |-{ - `-} + | `-'0' LiteralToken + |-')' + `-CompoundStatement ThenStatement + |-'{' OpenParen + `-'}' CloseParen )txt"})); } @@ -151,15 +152,15 @@ void test() { } )cpp", {R"txt( -ForStatement -|-for -|-( -|-; -|-; -|-) -`-CompoundStatement - |-{ - `-} +ForStatement Statement +|-'for' IntroducerKeyword +|-'(' +|-';' +|-';' +|-')' +`-CompoundStatement BodyStatement + |-'{' OpenParen + `-'}' CloseParen )txt"})); } @@ -176,20 +177,20 @@ void test() { } )cpp", {R"txt( -RangeBasedForStatement -|-for -|-( +RangeBasedForStatement Statement +|-'for' IntroducerKeyword +|-'(' |-SimpleDeclaration -| |-int -| |-SimpleDeclarator -| | `-x -| `-: +| |-'int' +| |-SimpleDeclarator Declarator +| | `-'x' +| `-':' |-IdExpression -| `-UnqualifiedId -| `-a -|-) -`-EmptyStatement - `-; +| `-UnqualifiedId UnqualifiedId +| `-'a' +|-')' +`-EmptyStatement BodyStatement + `-';' )txt"})); } @@ -201,15 +202,15 @@ void test() { } )cpp", {R"txt( -DeclarationStatement +DeclarationStatement Statement |-SimpleDeclaration -| |-int -| `-SimpleDeclarator -| |-a -| |-= +| |-'int' +| `-SimpleDeclarator Declarator +| |-'a' +| |-'=' | `-IntegerLiteralExpression -| `-10 -`-; +| `-'10' LiteralToken +`-';' )txt"})); } @@ -224,25 +225,25 @@ void test() { } )cpp", {R"txt( -SwitchStatement -|-switch -|-( +SwitchStatement Statement +|-'switch' IntroducerKeyword +|-'(' |-IntegerLiteralExpression -| `-1 -|-) -`-CompoundStatement - |-{ - |-CaseStatement - | |-case - | |-IntegerLiteralExpression - | | `-0 - | |-: - | `-DefaultStatement - | |-default - 
| |-: - | `-EmptyStatement - | `-; - `-} +| `-'1' LiteralToken +|-')' +`-CompoundStatement BodyStatement + |-'{' OpenParen + |-CaseStatement Statement + | |-'case' IntroducerKeyword + | |-IntegerLiteralExpression CaseValue + | | `-'0' LiteralToken + | |-':' + | `-DefaultStatement BodyStatement + | |-'default' IntroducerKeyword + | |-':' + | `-EmptyStatement BodyStatement + | `-';' + `-'}' CloseParen )txt"})); } @@ -254,21 +255,21 @@ void test() { } )cpp", {R"txt( -WhileStatement -|-while -|-( +WhileStatement Statement +|-'while' IntroducerKeyword +|-'(' |-IntegerLiteralExpression -| `-1 -|-) -`-CompoundStatement - |-{ - |-ContinueStatement - | |-continue - | `-; - |-BreakStatement - | |-break - | `-; - `-} +| `-'1' LiteralToken +|-')' +`-CompoundStatement BodyStatement + |-'{' OpenParen + |-ContinueStatement Statement + | |-'continue' IntroducerKeyword + | `-';' + |-BreakStatement Statement + | |-'break' IntroducerKeyword + | `-';' + `-'}' CloseParen )txt"})); } @@ -283,14 +284,14 @@ int test() { } )cpp", {R"txt( -UnknownStatement -|-foo -|-: +UnknownStatement Statement +|-'foo' +|-':' `-ReturnStatement - |-return - |-IntegerLiteralExpression - | `-100 - `-; + |-'return' IntroducerKeyword + |-IntegerLiteralExpression ReturnValue + | `-'100' LiteralToken + `-';' )txt"})); } @@ -305,48 +306,48 @@ void test() { } )cpp", R"txt( -*: TranslationUnit +TranslationUnit Detached `-SimpleDeclaration - |-void - |-SimpleDeclarator - | |-test + |-'void' + |-SimpleDeclarator Declarator + | |-'test' | `-ParametersAndQualifiers - | |-( - | `-) + | |-'(' OpenParen + | `-')' CloseParen `-CompoundStatement - |-{ - |-ExpressionStatement - | |-UnknownExpression - | | |-IdExpression - | | | `-UnqualifiedId - | | | `-test - | | |-( - | | `-) - | `-; - |-IfStatement - | |-if - | |-( + |-'{' OpenParen + |-ExpressionStatement Statement + | |-CallExpression Expression + | | |-IdExpression Callee + | | | `-UnqualifiedId UnqualifiedId + | | | `-'test' + | | |-'(' OpenParen + | | `-')' 
CloseParen + | `-';' + |-IfStatement Statement + | |-'if' IntroducerKeyword + | |-'(' | |-IntegerLiteralExpression - | | `-1 - | |-) - | |-ExpressionStatement - | | |-UnknownExpression - | | | |-IdExpression - | | | | `-UnqualifiedId - | | | | `-test - | | | |-( - | | | `-) - | | `-; - | |-else - | `-ExpressionStatement - | |-UnknownExpression - | | |-IdExpression - | | | `-UnqualifiedId - | | | `-test - | | |-( - | | `-) - | `-; - `-} + | | `-'1' LiteralToken + | |-')' + | |-ExpressionStatement ThenStatement + | | |-CallExpression Expression + | | | |-IdExpression Callee + | | | | `-UnqualifiedId UnqualifiedId + | | | | `-'test' + | | | |-'(' OpenParen + | | | `-')' CloseParen + | | `-';' + | |-'else' ElseKeyword + | `-ExpressionStatement ElseStatement + | |-CallExpression Expression + | | |-IdExpression Callee + | | | `-UnqualifiedId UnqualifiedId + | | | `-'test' + | | |-'(' OpenParen + | | `-')' CloseParen + | `-';' + `-'}' CloseParen )txt")); } @@ -358,9 +359,9 @@ void test(int a) { } )cpp", {R"txt( -IdExpression -`-UnqualifiedId - `-a +IdExpression Expression +`-UnqualifiedId UnqualifiedId + `-'a' )txt"})); } @@ -378,20 +379,21 @@ void test(X x) { } )cpp", {R"txt( -UnknownExpression -|-IdExpression -| `-UnqualifiedId -| |-operator -| `-+ -|-( -|-IdExpression -| `-UnqualifiedId -| `-x -|-, -|-IdExpression -| `-UnqualifiedId -| `-x -`-) +CallExpression Expression +|-IdExpression Callee +| `-UnqualifiedId UnqualifiedId +| |-'operator' +| `-'+' +|-'(' OpenParen +|-CallArguments Arguments +| |-IdExpression ListElement +| | `-UnqualifiedId UnqualifiedId +| | `-'x' +| |-',' ListDelimiter +| `-IdExpression ListElement +| `-UnqualifiedId UnqualifiedId +| `-'x' +`-')' CloseParen )txt"})); } @@ -409,18 +411,18 @@ void test(X x) { } )cpp", {R"txt( -UnknownExpression -|-MemberExpression -| |-IdExpression -| | `-UnqualifiedId -| | `-x -| |-. 
-| `-IdExpression -| `-UnqualifiedId -| |-operator -| `-int -|-( -`-) +CallExpression Expression +|-MemberExpression Callee +| |-IdExpression Object +| | `-UnqualifiedId UnqualifiedId +| | `-'x' +| |-'.' AccessToken +| `-IdExpression Member +| `-UnqualifiedId UnqualifiedId +| |-'operator' +| `-'int' +|-'(' OpenParen +`-')' CloseParen )txt"})); } @@ -436,16 +438,17 @@ void test() { } )cpp", {R"txt( -UnknownExpression -|-IdExpression -| `-UnqualifiedId -| |-operator -| |-"" -| `-_w -|-( -|-CharacterLiteralExpression -| `-'1' -`-) +CallExpression Expression +|-IdExpression Callee +| `-UnqualifiedId UnqualifiedId +| |-'operator' +| |-'""' +| `-'_w' +|-'(' OpenParen +|-CallArguments Arguments +| `-CharacterLiteralExpression ListElement +| `-''1'' LiteralToken +`-')' CloseParen )txt"})); } @@ -461,18 +464,18 @@ void test(X x) { } )cpp", {R"txt( -UnknownExpression -|-MemberExpression -| |-IdExpression -| | `-UnqualifiedId -| | `-x -| |-. -| `-IdExpression -| `-UnqualifiedId -| |-~ -| `-X -|-( -`-) +CallExpression Expression +|-MemberExpression Callee +| |-IdExpression Object +| | `-UnqualifiedId UnqualifiedId +| | `-'x' +| |-'.' AccessToken +| `-IdExpression Member +| `-UnqualifiedId UnqualifiedId +| |-'~' +| `-'X' +|-'(' OpenParen +`-')' CloseParen )txt"})); } @@ -492,21 +495,21 @@ void test(X x) { } )cpp", {R"txt( -UnknownExpression -|-MemberExpression -| |-IdExpression -| | `-UnqualifiedId -| | `-x -| |-. -| `-IdExpression -| `-UnqualifiedId -| `-~ -|-decltype -|-( -|-x -|-) -|-( -`-) +CallExpression Expression +|-MemberExpression Callee +| |-IdExpression Object +| | `-UnqualifiedId UnqualifiedId +| | `-'x' +| |-'.' 
AccessToken +| `-IdExpression Member +| `-UnqualifiedId UnqualifiedId +| `-'~' +|-'decltype' +|-'(' +|-'x' +|-')' +|-'(' +`-')' CloseParen )txt"})); } @@ -523,15 +526,15 @@ void test() { } )cpp", {R"txt( -UnknownExpression -|-IdExpression -| `-UnqualifiedId -| |-f -| |-< -| |-int -| `-> -|-( -`-) +CallExpression Expression +|-IdExpression Callee +| `-UnqualifiedId UnqualifiedId +| |-'f' +| |-'<' +| |-'int' +| `-'>' +|-'(' OpenParen +`-')' CloseParen )txt"})); } @@ -555,25 +558,25 @@ void test() { {R"txt( SimpleDeclaration |-NestedNameSpecifier -| |-:: -| |-IdentifierNameSpecifier -| | `-n -| `-:: -|-S -`-SimpleDeclarator +| |-'::' ListDelimiter +| |-IdentifierNameSpecifier ListElement +| | `-'n' +| `-'::' ListDelimiter +|-'S' +`-SimpleDeclarator Declarator `-UnknownExpression - `-s1 + `-'s1' )txt", R"txt( SimpleDeclaration |-NestedNameSpecifier -| |-IdentifierNameSpecifier -| | `-n -| `-:: -|-S -`-SimpleDeclarator +| |-IdentifierNameSpecifier ListElement +| | `-'n' +| `-'::' ListDelimiter +|-'S' +`-SimpleDeclarator Declarator `-UnknownExpression - `-s2 + `-'s2' )txt"})); } @@ -595,33 +598,33 @@ void test() { {R"txt( SimpleDeclaration |-NestedNameSpecifier -| |-:: -| |-SimpleTemplateNameSpecifier -| | |-template -| | |-ST -| | |-< -| | |-int -| | `-> -| `-:: -|-S -`-SimpleDeclarator +| |-'::' ListDelimiter +| |-SimpleTemplateNameSpecifier ListElement +| | |-'template' +| | |-'ST' +| | |-'<' +| | |-'int' +| | `-'>' +| `-'::' ListDelimiter +|-'S' +`-SimpleDeclarator Declarator `-UnknownExpression - `-s1 + `-'s1' )txt", R"txt( SimpleDeclaration |-NestedNameSpecifier -| |-:: -| |-SimpleTemplateNameSpecifier -| | |-ST -| | |-< -| | |-int -| | `-> -| `-:: -|-S -`-SimpleDeclarator +| |-'::' ListDelimiter +| |-SimpleTemplateNameSpecifier ListElement +| | |-'ST' +| | |-'<' +| | |-'int' +| | `-'>' +| `-'::' ListDelimiter +|-'S' +`-SimpleDeclarator Declarator `-UnknownExpression - `-s2 + `-'s2' )txt"})); } @@ -639,21 +642,21 @@ void test(S s) { } )cpp", {R"txt( 
-UnknownExpression -|-IdExpression -| |-NestedNameSpecifier -| | |-DecltypeNameSpecifier -| | | |-decltype -| | | |-( +CallExpression Expression +|-IdExpression Callee +| |-NestedNameSpecifier Qualifier +| | |-DecltypeNameSpecifier ListElement +| | | |-'decltype' +| | | |-'(' | | | |-IdExpression -| | | | `-UnqualifiedId -| | | | `-s -| | | `-) -| | `-:: -| `-UnqualifiedId -| `-f -|-( -`-) +| | | | `-UnqualifiedId UnqualifiedId +| | | | `-'s' +| | | `-')' +| | `-'::' ListDelimiter +| `-UnqualifiedId UnqualifiedId +| `-'f' +|-'(' OpenParen +`-')' CloseParen )txt"})); } @@ -673,35 +676,35 @@ void test() { } )cpp", {R"txt( -UnknownExpression -|-IdExpression -| |-NestedNameSpecifier -| | |-IdentifierNameSpecifier -| | | `-S -| | `-:: -| `-UnqualifiedId -| |-f -| |-< -| |-int -| `-> -|-( -`-) +CallExpression Expression +|-IdExpression Callee +| |-NestedNameSpecifier Qualifier +| | |-IdentifierNameSpecifier ListElement +| | | `-'S' +| | `-'::' ListDelimiter +| `-UnqualifiedId UnqualifiedId +| |-'f' +| |-'<' +| |-'int' +| `-'>' +|-'(' OpenParen +`-')' CloseParen )txt", R"txt( -UnknownExpression -|-IdExpression -| |-NestedNameSpecifier -| | |-IdentifierNameSpecifier -| | | `-S -| | `-:: -| |-template -| `-UnqualifiedId -| |-f -| |-< -| |-int -| `-> -|-( -`-) +CallExpression Expression +|-IdExpression Callee +| |-NestedNameSpecifier Qualifier +| | |-IdentifierNameSpecifier ListElement +| | | `-'S' +| | `-'::' ListDelimiter +| |-'template' TemplateKeyword +| `-UnqualifiedId UnqualifiedId +| |-'f' +| |-'<' +| |-'int' +| `-'>' +|-'(' OpenParen +`-')' CloseParen )txt"})); } @@ -723,28 +726,28 @@ void test() { } )cpp", {R"txt( -UnknownExpression -|-IdExpression -| |-NestedNameSpecifier -| | |-:: -| | |-IdentifierNameSpecifier -| | | `-n -| | |-:: -| | |-SimpleTemplateNameSpecifier -| | | |-template -| | | |-ST -| | | |-< -| | | |-int -| | | `-> -| | `-:: -| |-template -| `-UnqualifiedId -| |-f -| |-< -| |-int -| `-> -|-( -`-) +CallExpression Expression +|-IdExpression Callee +| 
|-NestedNameSpecifier Qualifier +| | |-'::' ListDelimiter +| | |-IdentifierNameSpecifier ListElement +| | | `-'n' +| | |-'::' ListDelimiter +| | |-SimpleTemplateNameSpecifier ListElement +| | | |-'template' +| | | |-'ST' +| | | |-'<' +| | | |-'int' +| | | `-'>' +| | `-'::' ListDelimiter +| |-'template' TemplateKeyword +| `-UnqualifiedId UnqualifiedId +| |-'f' +| |-'<' +| |-'int' +| `-'>' +|-'(' OpenParen +`-')' CloseParen )txt"})); } @@ -767,55 +770,55 @@ void test() { } )cpp", {R"txt( -UnknownExpression -|-IdExpression -| |-NestedNameSpecifier -| | |-IdentifierNameSpecifier -| | | `-T -| | |-:: -| | |-SimpleTemplateNameSpecifier -| | | |-template -| | | |-U -| | | |-< -| | | |-int -| | | `-> -| | `-:: -| `-UnqualifiedId -| `-f -|-( -`-) +CallExpression Expression +|-IdExpression Callee +| |-NestedNameSpecifier Qualifier +| | |-IdentifierNameSpecifier ListElement +| | | `-'T' +| | |-'::' ListDelimiter +| | |-SimpleTemplateNameSpecifier ListElement +| | | |-'template' +| | | |-'U' +| | | |-'<' +| | | |-'int' +| | | `-'>' +| | `-'::' ListDelimiter +| `-UnqualifiedId UnqualifiedId +| `-'f' +|-'(' OpenParen +`-')' CloseParen )txt", R"txt( -UnknownExpression -|-IdExpression -| |-NestedNameSpecifier -| | |-IdentifierNameSpecifier -| | | `-T -| | |-:: -| | |-IdentifierNameSpecifier -| | | `-U -| | `-:: -| `-UnqualifiedId -| `-f -|-( -`-) +CallExpression Expression +|-IdExpression Callee +| |-NestedNameSpecifier Qualifier +| | |-IdentifierNameSpecifier ListElement +| | | `-'T' +| | |-'::' ListDelimiter +| | |-IdentifierNameSpecifier ListElement +| | | `-'U' +| | `-'::' ListDelimiter +| `-UnqualifiedId UnqualifiedId +| `-'f' +|-'(' OpenParen +`-')' CloseParen )txt", R"txt( -UnknownExpression -|-IdExpression -| |-NestedNameSpecifier -| | |-IdentifierNameSpecifier -| | | `-T -| | `-:: -| |-template -| `-UnqualifiedId -| |-f -| |-< +CallExpression Expression +|-IdExpression Callee +| |-NestedNameSpecifier Qualifier +| | |-IdentifierNameSpecifier ListElement +| | | `-'T' +| | 
`-'::' ListDelimiter +| |-'template' TemplateKeyword +| `-UnqualifiedId UnqualifiedId +| |-'f' +| |-'<' | |-IntegerLiteralExpression -| | `-0 -| `-> -|-( -`-) +| | `-'0' LiteralToken +| `-'>' +|-'(' OpenParen +`-')' CloseParen )txt"})); } @@ -832,8 +835,8 @@ struct S { }; )cpp", {R"txt( -ThisExpression -`-this +ThisExpression ReturnValue +`-'this' IntroducerKeyword )txt"})); } @@ -851,13 +854,13 @@ struct S { }; )cpp", {R"txt( -MemberExpression -|-ThisExpression -| `-this -|--> -`-IdExpression - `-UnqualifiedId - `-a +MemberExpression Expression +|-ThisExpression Object +| `-'this' IntroducerKeyword +|-'->' AccessToken +`-IdExpression Member + `-UnqualifiedId UnqualifiedId + `-'a' )txt"})); } @@ -875,9 +878,9 @@ struct S { }; )cpp", {R"txt( -IdExpression -`-UnqualifiedId - `-a +IdExpression Expression +`-UnqualifiedId UnqualifiedId + `-'a' )txt"})); } @@ -891,35 +894,35 @@ void test() { } )cpp", {R"txt( -ParenExpression -|-( -|-IntegerLiteralExpression -| `-1 -`-) +ParenExpression Expression +|-'(' OpenParen +|-IntegerLiteralExpression SubExpression +| `-'1' LiteralToken +`-')' CloseParen )txt", R"txt( -ParenExpression -|-( -|-ParenExpression -| |-( -| |-IntegerLiteralExpression -| | `-1 -| `-) -`-) +ParenExpression Expression +|-'(' OpenParen +|-ParenExpression SubExpression +| |-'(' OpenParen +| |-IntegerLiteralExpression SubExpression +| | `-'1' LiteralToken +| `-')' CloseParen +`-')' CloseParen )txt", R"txt( -ParenExpression -|-( -|-BinaryOperatorExpression -| |-IntegerLiteralExpression -| | `-1 -| |-+ -| `-ParenExpression -| |-( -| |-IntegerLiteralExpression -| | `-2 -| `-) -`-) +ParenExpression Expression +|-'(' OpenParen +|-BinaryOperatorExpression SubExpression +| |-IntegerLiteralExpression LeftHandSide +| | `-'1' LiteralToken +| |-'+' OperatorToken +| `-ParenExpression RightHandSide +| |-'(' OpenParen +| |-IntegerLiteralExpression SubExpression +| | `-'2' LiteralToken +| `-')' CloseParen +`-')' CloseParen )txt"})); } @@ -935,8 +938,8 @@ void test() { } 
)cpp", {R"txt( -CharUserDefinedLiteralExpression -`-'2'_c +CharUserDefinedLiteralExpression Expression +`-''2'_c' LiteralToken )txt"})); } @@ -955,8 +958,8 @@ void test() { } )cpp", {R"txt( -StringUserDefinedLiteralExpression -`-"12"_s +StringUserDefinedLiteralExpression Expression +`-'"12"_s' LiteralToken )txt"})); } @@ -978,16 +981,16 @@ void test() { } )cpp", {R"txt( -IntegerUserDefinedLiteralExpression -`-12_i +IntegerUserDefinedLiteralExpression Expression +`-'12_i' LiteralToken )txt", R"txt( -IntegerUserDefinedLiteralExpression -`-12_r +IntegerUserDefinedLiteralExpression Expression +`-'12_r' LiteralToken )txt", R"txt( -IntegerUserDefinedLiteralExpression -`-12_t +IntegerUserDefinedLiteralExpression Expression +`-'12_t' LiteralToken )txt"})); } @@ -1009,16 +1012,16 @@ void test() { } )cpp", {R"txt( -FloatUserDefinedLiteralExpression -`-1.2_f +FloatUserDefinedLiteralExpression Expression +`-'1.2_f' LiteralToken )txt", R"txt( -FloatUserDefinedLiteralExpression -`-1.2_r +FloatUserDefinedLiteralExpression Expression +`-'1.2_r' LiteralToken )txt", R"txt( -FloatUserDefinedLiteralExpression -`-1.2_t +FloatUserDefinedLiteralExpression Expression +`-'1.2_t' LiteralToken )txt"})); } @@ -1034,12 +1037,12 @@ void test() { } )cpp", {R"txt( -IntegerLiteralExpression -`-12ll +IntegerLiteralExpression Expression +`-'12ll' LiteralToken )txt", R"txt( -IntegerLiteralExpression -`-12ull +IntegerLiteralExpression Expression +`-'12ull' LiteralToken )txt"})); } @@ -1054,8 +1057,8 @@ void test() { } )cpp", {R"txt( -IntegerLiteralExpression -`-0b1100 +IntegerLiteralExpression Expression +`-'0b1100' LiteralToken )txt"})); } @@ -1070,8 +1073,8 @@ void test() { } )cpp", {R"txt( -IntegerLiteralExpression -`-1'2'0ull +IntegerLiteralExpression Expression +`-'1'2'0ull' LiteralToken )txt"})); } @@ -1088,28 +1091,28 @@ void test() { } )cpp", {R"txt( -CharacterLiteralExpression -`-'a' +CharacterLiteralExpression Expression +`-''a'' LiteralToken )txt", R"txt( -CharacterLiteralExpression -`-'\n' 
+CharacterLiteralExpression Expression +`-''\n'' LiteralToken )txt", R"txt( -CharacterLiteralExpression -`-'\x20' +CharacterLiteralExpression Expression +`-''\x20'' LiteralToken )txt", R"txt( -CharacterLiteralExpression -`-'\0' +CharacterLiteralExpression Expression +`-''\0'' LiteralToken )txt", R"txt( -CharacterLiteralExpression -`-L'a' +CharacterLiteralExpression Expression +`-'L'a'' LiteralToken )txt", R"txt( -CharacterLiteralExpression -`-L'α' +CharacterLiteralExpression Expression +`-'L'α'' LiteralToken )txt"})); } @@ -1127,20 +1130,20 @@ void test() { } )cpp", {R"txt( -CharacterLiteralExpression -`-u'a' +CharacterLiteralExpression Expression +`-'u'a'' LiteralToken )txt", R"txt( -CharacterLiteralExpression -`-u'構' +CharacterLiteralExpression Expression +`-'u'構'' LiteralToken )txt", R"txt( -CharacterLiteralExpression -`-U'a' +CharacterLiteralExpression Expression +`-'U'a'' LiteralToken )txt", R"txt( -CharacterLiteralExpression -`-U'🌲' +CharacterLiteralExpression Expression +`-'U'🌲'' LiteralToken )txt"})); } @@ -1156,12 +1159,12 @@ void test() { } )cpp", {R"txt( -CharacterLiteralExpression -`-u8'a' +CharacterLiteralExpression Expression +`-'u8'a'' LiteralToken )txt", R"txt( -CharacterLiteralExpression -`-u8'\x7f' +CharacterLiteralExpression Expression +`-'u8'\x7f'' LiteralToken )txt"})); } @@ -1176,20 +1179,20 @@ void test() { } )cpp", {R"txt( -FloatingLiteralExpression -`-1e-2 +FloatingLiteralExpression Expression +`-'1e-2' LiteralToken )txt", R"txt( -FloatingLiteralExpression -`-2. +FloatingLiteralExpression Expression +`-'2.' 
LiteralToken )txt", R"txt( -FloatingLiteralExpression -`-.2 +FloatingLiteralExpression Expression +`-'.2' LiteralToken )txt", R"txt( -FloatingLiteralExpression -`-2.f +FloatingLiteralExpression Expression +`-'2.f' LiteralToken )txt"})); } @@ -1207,20 +1210,20 @@ void test() { } )cpp", {R"txt( -FloatingLiteralExpression -`-0xfp1 +FloatingLiteralExpression Expression +`-'0xfp1' LiteralToken )txt", R"txt( -FloatingLiteralExpression -`-0xf.p1 +FloatingLiteralExpression Expression +`-'0xf.p1' LiteralToken )txt", R"txt( -FloatingLiteralExpression -`-0x.fp1 +FloatingLiteralExpression Expression +`-'0x.fp1' LiteralToken )txt", R"txt( -FloatingLiteralExpression -`-0xf.fp1f +FloatingLiteralExpression Expression +`-'0xf.fp1f' LiteralToken )txt"})); } @@ -1233,12 +1236,12 @@ void test() { } )cpp", {R"txt( -StringLiteralExpression -`-"a\n\0\x20" +StringLiteralExpression Expression +`-'"a\n\0\x20"' LiteralToken )txt", R"txt( -StringLiteralExpression -`-L"αβ" +StringLiteralExpression Expression +`-'L"αβ"' LiteralToken )txt"})); } @@ -1255,16 +1258,16 @@ void test() { } )cpp", {R"txt( -StringLiteralExpression -`-u8"a\x1f\x05" +StringLiteralExpression Expression +`-'u8"a\x1f\x05"' LiteralToken )txt", R"txt( -StringLiteralExpression -`-u"C++抽象構文木" +StringLiteralExpression Expression +`-'u"C++抽象構文木"' LiteralToken )txt", R"txt( -StringLiteralExpression -`-U"📖🌲\n" +StringLiteralExpression Expression +`-'U"📖🌲\n"' LiteralToken )txt"})); } @@ -1282,23 +1285,23 @@ TEST_P(SyntaxTreeTest, StringLiteral_Raw) { " Hello \"Syntax\" \\\"\n" " )SyntaxTree\";\n" "}\n", - "*: TranslationUnit\n" + "TranslationUnit Detached\n" "`-SimpleDeclaration\n" - " |-void\n" - " |-SimpleDeclarator\n" - " | |-test\n" + " |-'void'\n" + " |-SimpleDeclarator Declarator\n" + " | |-'test'\n" " | `-ParametersAndQualifiers\n" - " | |-(\n" - " | `-)\n" + " | |-'(' OpenParen\n" + " | `-')' CloseParen\n" " `-CompoundStatement\n" - " |-{\n" - " |-ExpressionStatement\n" - " | |-StringLiteralExpression\n" - " | | 
`-R\"SyntaxTree(\n" + " |-'{' OpenParen\n" + " |-ExpressionStatement Statement\n" + " | |-StringLiteralExpression Expression\n" + " | | `-'R\"SyntaxTree(\n" " Hello \"Syntax\" \\\"\n" - " )SyntaxTree\"\n" - " | `-;\n" - " `-}\n")); + " )SyntaxTree\"' LiteralToken\n" + " | `-';'\n" + " `-'}' CloseParen\n")); } TEST_P(SyntaxTreeTest, BoolLiteral) { @@ -1313,12 +1316,12 @@ void test() { } )cpp", {R"txt( -BoolLiteralExpression -`-true +BoolLiteralExpression Expression +`-'true' LiteralToken )txt", R"txt( -BoolLiteralExpression -`-false +BoolLiteralExpression Expression +`-'false' LiteralToken )txt"})); } @@ -1333,8 +1336,8 @@ void test() { } )cpp", {R"txt( -CxxNullPtrExpression -`-nullptr +CxxNullPtrExpression Expression +`-'nullptr' LiteralToken )txt"})); } @@ -1347,18 +1350,18 @@ void test(int a) { } )cpp", {R"txt( -PostfixUnaryOperatorExpression -|-IdExpression -| `-UnqualifiedId -| `-a -`-++ +PostfixUnaryOperatorExpression Expression +|-IdExpression Operand +| `-UnqualifiedId UnqualifiedId +| `-'a' +`-'++' OperatorToken )txt", R"txt( -PostfixUnaryOperatorExpression -|-IdExpression -| `-UnqualifiedId -| `-a -`--- +PostfixUnaryOperatorExpression Expression +|-IdExpression Operand +| `-UnqualifiedId UnqualifiedId +| `-'a' +`-'--' OperatorToken )txt"})); } @@ -1377,74 +1380,74 @@ void test(int a, int *ap) { } )cpp", {R"txt( -PrefixUnaryOperatorExpression -|--- -`-IdExpression - `-UnqualifiedId - `-a +PrefixUnaryOperatorExpression Expression +|-'--' OperatorToken +`-IdExpression Operand + `-UnqualifiedId UnqualifiedId + `-'a' )txt", R"txt( -PrefixUnaryOperatorExpression -|-++ -`-IdExpression - `-UnqualifiedId - `-a +PrefixUnaryOperatorExpression Expression +|-'++' OperatorToken +`-IdExpression Operand + `-UnqualifiedId UnqualifiedId + `-'a' )txt", R"txt( -PrefixUnaryOperatorExpression -|-~ -`-IdExpression - `-UnqualifiedId - `-a +PrefixUnaryOperatorExpression Expression +|-'~' OperatorToken +`-IdExpression Operand + `-UnqualifiedId UnqualifiedId + `-'a' )txt", R"txt( 
-PrefixUnaryOperatorExpression -|-- -`-IdExpression - `-UnqualifiedId - `-a +PrefixUnaryOperatorExpression Expression +|-'-' OperatorToken +`-IdExpression Operand + `-UnqualifiedId UnqualifiedId + `-'a' )txt", R"txt( -PrefixUnaryOperatorExpression -|-+ -`-IdExpression - `-UnqualifiedId - `-a +PrefixUnaryOperatorExpression Expression +|-'+' OperatorToken +`-IdExpression Operand + `-UnqualifiedId UnqualifiedId + `-'a' )txt", R"txt( -PrefixUnaryOperatorExpression -|-& -`-IdExpression - `-UnqualifiedId - `-a +PrefixUnaryOperatorExpression Expression +|-'&' OperatorToken +`-IdExpression Operand + `-UnqualifiedId UnqualifiedId + `-'a' )txt", R"txt( -PrefixUnaryOperatorExpression -|-* -`-IdExpression - `-UnqualifiedId - `-ap +PrefixUnaryOperatorExpression Expression +|-'*' OperatorToken +`-IdExpression Operand + `-UnqualifiedId UnqualifiedId + `-'ap' )txt", R"txt( -PrefixUnaryOperatorExpression -|-! -`-IdExpression - `-UnqualifiedId - `-a +PrefixUnaryOperatorExpression Expression +|-'!' OperatorToken +`-IdExpression Operand + `-UnqualifiedId UnqualifiedId + `-'a' )txt", R"txt( -PrefixUnaryOperatorExpression -|-__real -`-IdExpression - `-UnqualifiedId - `-a +PrefixUnaryOperatorExpression Expression +|-'__real' OperatorToken +`-IdExpression Operand + `-UnqualifiedId UnqualifiedId + `-'a' )txt", R"txt( -PrefixUnaryOperatorExpression -|-__imag -`-IdExpression - `-UnqualifiedId - `-a +PrefixUnaryOperatorExpression Expression +|-'__imag' OperatorToken +`-IdExpression Operand + `-UnqualifiedId UnqualifiedId + `-'a' )txt"})); } @@ -1460,18 +1463,18 @@ void test(int a, bool b) { } )cpp", {R"txt( -PrefixUnaryOperatorExpression -|-compl -`-IdExpression - `-UnqualifiedId - `-a +PrefixUnaryOperatorExpression Expression +|-'compl' OperatorToken +`-IdExpression Operand + `-UnqualifiedId UnqualifiedId + `-'a' )txt", R"txt( -PrefixUnaryOperatorExpression -|-not -`-IdExpression - `-UnqualifiedId - `-b +PrefixUnaryOperatorExpression Expression +|-'not' OperatorToken +`-IdExpression Operand 
+ `-UnqualifiedId UnqualifiedId + `-'b' )txt"})); } @@ -1489,63 +1492,63 @@ void test(int a) { } )cpp", {R"txt( -BinaryOperatorExpression -|-IntegerLiteralExpression -| `-1 -|-- -`-IntegerLiteralExpression - `-2 +BinaryOperatorExpression Expression +|-IntegerLiteralExpression LeftHandSide +| `-'1' LiteralToken +|-'-' OperatorToken +`-IntegerLiteralExpression RightHandSide + `-'2' LiteralToken )txt", R"txt( -BinaryOperatorExpression -|-IntegerLiteralExpression -| `-1 -|-== -`-IntegerLiteralExpression - `-2 +BinaryOperatorExpression Expression +|-IntegerLiteralExpression LeftHandSide +| `-'1' LiteralToken +|-'==' OperatorToken +`-IntegerLiteralExpression RightHandSide + `-'2' LiteralToken )txt", R"txt( -BinaryOperatorExpression -|-IdExpression -| `-UnqualifiedId -| `-a -|-= -`-IntegerLiteralExpression - `-1 +BinaryOperatorExpression Expression +|-IdExpression LeftHandSide +| `-UnqualifiedId UnqualifiedId +| `-'a' +|-'=' OperatorToken +`-IntegerLiteralExpression RightHandSide + `-'1' LiteralToken )txt", R"txt( -BinaryOperatorExpression -|-IdExpression -| `-UnqualifiedId -| `-a -|-<<= -`-IntegerLiteralExpression - `-1 +BinaryOperatorExpression Expression +|-IdExpression LeftHandSide +| `-UnqualifiedId UnqualifiedId +| `-'a' +|-'<<=' OperatorToken +`-IntegerLiteralExpression RightHandSide + `-'1' LiteralToken )txt", R"txt( -BinaryOperatorExpression -|-IntegerLiteralExpression -| `-1 -|-|| -`-IntegerLiteralExpression - `-0 +BinaryOperatorExpression Expression +|-IntegerLiteralExpression LeftHandSide +| `-'1' LiteralToken +|-'||' OperatorToken +`-IntegerLiteralExpression RightHandSide + `-'0' LiteralToken )txt", R"txt( -BinaryOperatorExpression -|-IntegerLiteralExpression -| `-1 -|-& -`-IntegerLiteralExpression - `-2 +BinaryOperatorExpression Expression +|-IntegerLiteralExpression LeftHandSide +| `-'1' LiteralToken +|-'&' OperatorToken +`-IntegerLiteralExpression RightHandSide + `-'2' LiteralToken )txt", R"txt( -BinaryOperatorExpression -|-IdExpression -| `-UnqualifiedId 
-| `-a -|-!= -`-IntegerLiteralExpression - `-3 +BinaryOperatorExpression Expression +|-IdExpression LeftHandSide +| `-UnqualifiedId UnqualifiedId +| `-'a' +|-'!=' OperatorToken +`-IntegerLiteralExpression RightHandSide + `-'3' LiteralToken )txt"})); } @@ -1563,37 +1566,37 @@ void test(int a) { } )cpp", {R"txt( -BinaryOperatorExpression -|-BoolLiteralExpression -| `-true -|-|| -`-BoolLiteralExpression - `-false +BinaryOperatorExpression Expression +|-BoolLiteralExpression LeftHandSide +| `-'true' LiteralToken +|-'||' OperatorToken +`-BoolLiteralExpression RightHandSide + `-'false' LiteralToken )txt", R"txt( -BinaryOperatorExpression -|-BoolLiteralExpression -| `-true -|-or -`-BoolLiteralExpression - `-false +BinaryOperatorExpression Expression +|-BoolLiteralExpression LeftHandSide +| `-'true' LiteralToken +|-'or' OperatorToken +`-BoolLiteralExpression RightHandSide + `-'false' LiteralToken )txt", R"txt( -BinaryOperatorExpression -|-IntegerLiteralExpression -| `-1 -|-bitand -`-IntegerLiteralExpression - `-2 +BinaryOperatorExpression Expression +|-IntegerLiteralExpression LeftHandSide +| `-'1' LiteralToken +|-'bitand' OperatorToken +`-IntegerLiteralExpression RightHandSide + `-'2' LiteralToken )txt", R"txt( -BinaryOperatorExpression -|-IdExpression -| `-UnqualifiedId -| `-a -|-xor_eq -`-IntegerLiteralExpression - `-3 +BinaryOperatorExpression Expression +|-IdExpression LeftHandSide +| `-UnqualifiedId UnqualifiedId +| `-'a' +|-'xor_eq' OperatorToken +`-IntegerLiteralExpression RightHandSide + `-'3' LiteralToken )txt"})); } @@ -1605,26 +1608,26 @@ void test() { } )cpp", {R"txt( -BinaryOperatorExpression -|-ParenExpression -| |-( -| |-BinaryOperatorExpression -| | |-IntegerLiteralExpression -| | | `-1 -| | |-+ -| | `-IntegerLiteralExpression -| | `-2 -| `-) -|-* -`-ParenExpression - |-( - |-BinaryOperatorExpression - | |-IntegerLiteralExpression - | | `-4 - | |-/ - | `-IntegerLiteralExpression - | `-2 - `-) +BinaryOperatorExpression Expression +|-ParenExpression 
LeftHandSide +| |-'(' OpenParen +| |-BinaryOperatorExpression SubExpression +| | |-IntegerLiteralExpression LeftHandSide +| | | `-'1' LiteralToken +| | |-'+' OperatorToken +| | `-IntegerLiteralExpression RightHandSide +| | `-'2' LiteralToken +| `-')' CloseParen +|-'*' OperatorToken +`-ParenExpression RightHandSide + |-'(' OpenParen + |-BinaryOperatorExpression SubExpression + | |-IntegerLiteralExpression LeftHandSide + | | `-'4' LiteralToken + | |-'/' OperatorToken + | `-IntegerLiteralExpression RightHandSide + | `-'2' LiteralToken + `-')' CloseParen )txt"})); } @@ -1637,32 +1640,32 @@ void test(int a, int b) { } )cpp", {R"txt( -BinaryOperatorExpression -|-BinaryOperatorExpression -| |-IdExpression -| | `-UnqualifiedId -| | `-a -| |-+ -| `-IdExpression -| `-UnqualifiedId -| `-b -|-+ -`-IntegerLiteralExpression - `-42 +BinaryOperatorExpression Expression +|-BinaryOperatorExpression LeftHandSide +| |-IdExpression LeftHandSide +| | `-UnqualifiedId UnqualifiedId +| | `-'a' +| |-'+' OperatorToken +| `-IdExpression RightHandSide +| `-UnqualifiedId UnqualifiedId +| `-'b' +|-'+' OperatorToken +`-IntegerLiteralExpression RightHandSide + `-'42' LiteralToken )txt", R"txt( -BinaryOperatorExpression -|-IdExpression -| `-UnqualifiedId -| `-a -|-= -`-BinaryOperatorExpression - |-IdExpression - | `-UnqualifiedId - | `-b - |-= - `-IntegerLiteralExpression - `-42 +BinaryOperatorExpression Expression +|-IdExpression LeftHandSide +| `-UnqualifiedId UnqualifiedId +| `-'a' +|-'=' OperatorToken +`-BinaryOperatorExpression RightHandSide + |-IdExpression LeftHandSide + | `-UnqualifiedId UnqualifiedId + | `-'b' + |-'=' OperatorToken + `-IntegerLiteralExpression RightHandSide + `-'42' LiteralToken )txt"})); } @@ -1675,36 +1678,36 @@ void test() { } )cpp", {R"txt( -BinaryOperatorExpression -|-BinaryOperatorExpression -| |-IntegerLiteralExpression -| | `-1 -| |-+ -| `-BinaryOperatorExpression -| |-IntegerLiteralExpression -| | `-2 -| |-* -| `-IntegerLiteralExpression -| `-3 -|-+ 
-`-IntegerLiteralExpression - `-4 +BinaryOperatorExpression Expression +|-BinaryOperatorExpression LeftHandSide +| |-IntegerLiteralExpression LeftHandSide +| | `-'1' LiteralToken +| |-'+' OperatorToken +| `-BinaryOperatorExpression RightHandSide +| |-IntegerLiteralExpression LeftHandSide +| | `-'2' LiteralToken +| |-'*' OperatorToken +| `-IntegerLiteralExpression RightHandSide +| `-'3' LiteralToken +|-'+' OperatorToken +`-IntegerLiteralExpression RightHandSide + `-'4' LiteralToken )txt", R"txt( -BinaryOperatorExpression -|-BinaryOperatorExpression -| |-IntegerLiteralExpression -| | `-1 -| |-% -| `-IntegerLiteralExpression -| `-2 -|-+ -`-BinaryOperatorExpression - |-IntegerLiteralExpression - | `-3 - |-* - `-IntegerLiteralExpression - `-4 +BinaryOperatorExpression Expression +|-BinaryOperatorExpression LeftHandSide +| |-IntegerLiteralExpression LeftHandSide +| | `-'1' LiteralToken +| |-'%' OperatorToken +| `-IntegerLiteralExpression RightHandSide +| `-'2' LiteralToken +|-'+' OperatorToken +`-BinaryOperatorExpression RightHandSide + |-IntegerLiteralExpression LeftHandSide + | `-'3' LiteralToken + |-'*' OperatorToken + `-IntegerLiteralExpression RightHandSide + `-'4' LiteralToken )txt"})); } @@ -1722,14 +1725,14 @@ void test(X x, X y) { } )cpp", {R"txt( -BinaryOperatorExpression -|-IdExpression -| `-UnqualifiedId -| `-x -|-= -`-IdExpression - `-UnqualifiedId - `-y +BinaryOperatorExpression Expression +|-IdExpression LeftHandSide +| `-UnqualifiedId UnqualifiedId +| `-'x' +|-'=' OperatorToken +`-IdExpression RightHandSide + `-UnqualifiedId UnqualifiedId + `-'y' )txt"})); } @@ -1750,15 +1753,15 @@ void test(X x, X y) { } )cpp", {R"txt( -BinaryOperatorExpression -|-UnknownExpression +BinaryOperatorExpression Expression +|-UnknownExpression LeftHandSide | `-IdExpression -| `-UnqualifiedId -| `-x -|-+ -`-IdExpression - `-UnqualifiedId - `-y +| `-UnqualifiedId UnqualifiedId +| `-'x' +|-'+' OperatorToken +`-IdExpression RightHandSide + `-UnqualifiedId UnqualifiedId + `-'y' 
)txt"})); } @@ -1776,14 +1779,14 @@ void test(X x, X y) { } )cpp", {R"txt( -BinaryOperatorExpression -|-IdExpression -| `-UnqualifiedId -| `-x -|-< -`-IdExpression - `-UnqualifiedId - `-y +BinaryOperatorExpression Expression +|-IdExpression LeftHandSide +| `-UnqualifiedId UnqualifiedId +| `-'x' +|-'<' OperatorToken +`-IdExpression RightHandSide + `-UnqualifiedId UnqualifiedId + `-'y' )txt"})); } @@ -1801,14 +1804,14 @@ void test(X x, X y) { } )cpp", {R"txt( -BinaryOperatorExpression -|-IdExpression -| `-UnqualifiedId -| `-x -|-<< -`-IdExpression - `-UnqualifiedId - `-y +BinaryOperatorExpression Expression +|-IdExpression LeftHandSide +| `-UnqualifiedId UnqualifiedId +| `-'x' +|-'<<' OperatorToken +`-IdExpression RightHandSide + `-UnqualifiedId UnqualifiedId + `-'y' )txt"})); } @@ -1826,14 +1829,14 @@ void test(X x, X y) { } )cpp", {R"txt( -BinaryOperatorExpression -|-IdExpression -| `-UnqualifiedId -| `-x -|-, -`-IdExpression - `-UnqualifiedId - `-y +BinaryOperatorExpression Expression +|-IdExpression LeftHandSide +| `-UnqualifiedId UnqualifiedId +| `-'x' +|-',' OperatorToken +`-IdExpression RightHandSide + `-UnqualifiedId UnqualifiedId + `-'y' )txt"})); } @@ -1851,14 +1854,14 @@ void test(X* xp, int X::* pmi) { } )cpp", {R"txt( -BinaryOperatorExpression -|-IdExpression -| `-UnqualifiedId -| `-xp -|-->* -`-IdExpression - `-UnqualifiedId - `-pmi +BinaryOperatorExpression Expression +|-IdExpression LeftHandSide +| `-UnqualifiedId UnqualifiedId +| `-'xp' +|-'->*' OperatorToken +`-IdExpression RightHandSide + `-UnqualifiedId UnqualifiedId + `-'pmi' )txt"})); } @@ -1876,11 +1879,11 @@ void test(X x) { } )cpp", {R"txt( -PrefixUnaryOperatorExpression -|-! -`-IdExpression - `-UnqualifiedId - `-x +PrefixUnaryOperatorExpression Expression +|-'!' 
OperatorToken +`-IdExpression Operand + `-UnqualifiedId UnqualifiedId + `-'x' )txt"})); } @@ -1898,11 +1901,11 @@ void test(X x) { } )cpp", {R"txt( -PrefixUnaryOperatorExpression -|-& -`-IdExpression - `-UnqualifiedId - `-x +PrefixUnaryOperatorExpression Expression +|-'&' OperatorToken +`-IdExpression Operand + `-UnqualifiedId UnqualifiedId + `-'x' )txt"})); } @@ -1920,11 +1923,11 @@ void test(X x) { } )cpp", {R"txt( -PrefixUnaryOperatorExpression -|-++ -`-IdExpression - `-UnqualifiedId - `-x +PrefixUnaryOperatorExpression Expression +|-'++' OperatorToken +`-IdExpression Operand + `-UnqualifiedId UnqualifiedId + `-'x' )txt"})); } @@ -1942,11 +1945,11 @@ void test(X x) { } )cpp", {R"txt( -PostfixUnaryOperatorExpression -|-IdExpression -| `-UnqualifiedId -| `-x -`-++ +PostfixUnaryOperatorExpression Expression +|-IdExpression Operand +| `-UnqualifiedId UnqualifiedId +| `-'x' +`-'++' OperatorToken )txt"})); } @@ -1961,14 +1964,14 @@ void test(struct S s) { } )cpp", {R"txt( -MemberExpression -|-IdExpression -| `-UnqualifiedId -| `-s -|-. -`-IdExpression - `-UnqualifiedId - `-a +MemberExpression Expression +|-IdExpression Object +| `-UnqualifiedId UnqualifiedId +| `-'s' +|-'.' AccessToken +`-IdExpression Member + `-UnqualifiedId UnqualifiedId + `-'a' )txt"})); } @@ -1986,14 +1989,14 @@ void test(S s) { } )cpp", {R"txt( -MemberExpression -|-IdExpression -| `-UnqualifiedId -| `-s -|-. -`-IdExpression - `-UnqualifiedId - `-a +MemberExpression Expression +|-IdExpression Object +| `-UnqualifiedId UnqualifiedId +| `-'s' +|-'.' 
AccessToken +`-IdExpression Member + `-UnqualifiedId UnqualifiedId + `-'a' )txt"})); } @@ -2008,14 +2011,14 @@ void test(struct S* sp) { } )cpp", {R"txt( -MemberExpression -|-IdExpression -| `-UnqualifiedId -| `-sp -|--> -`-IdExpression - `-UnqualifiedId - `-a +MemberExpression Expression +|-IdExpression Object +| `-UnqualifiedId UnqualifiedId +| `-'sp' +|-'->' AccessToken +`-IdExpression Member + `-UnqualifiedId UnqualifiedId + `-'a' )txt"})); } @@ -2030,19 +2033,19 @@ void test(struct S s){ } )cpp", {R"txt( -MemberExpression -|-MemberExpression -| |-IdExpression -| | `-UnqualifiedId -| | `-s -| |-. -| `-IdExpression -| `-UnqualifiedId -| `-next -|--> -`-IdExpression - `-UnqualifiedId - `-next +MemberExpression Expression +|-MemberExpression Object +| |-IdExpression Object +| | `-UnqualifiedId UnqualifiedId +| | `-'s' +| |-'.' AccessToken +| `-IdExpression Member +| `-UnqualifiedId UnqualifiedId +| `-'next' +|-'->' AccessToken +`-IdExpression Member + `-UnqualifiedId UnqualifiedId + `-'next' )txt"})); } @@ -2060,18 +2063,18 @@ void test(S s) { } )cpp", {R"txt( -UnknownExpression -|-MemberExpression -| |-IdExpression -| | `-UnqualifiedId -| | `-s -| |-. -| `-IdExpression -| `-UnqualifiedId -| |-operator -| `-! -|-( -`-) +CallExpression Expression +|-MemberExpression Callee +| |-IdExpression Object +| | `-UnqualifiedId UnqualifiedId +| | `-'s' +| |-'.' AccessToken +| `-IdExpression Member +| `-UnqualifiedId UnqualifiedId +| |-'operator' +| `-'!' +|-'(' OpenParen +`-')' CloseParen )txt"})); } @@ -2093,21 +2096,21 @@ void test(S s) [[{ )cpp", {R"txt( CompoundStatement -|-{ -|-ExpressionStatement -| `-MemberExpression -| |-IdExpression -| | `-UnqualifiedId -| | `-s -| |-. -| `-IdExpression -| `-UnqualifiedId -| `-x -|-< -|-int -|-> -|-; -`-} +|-'{' OpenParen +|-ExpressionStatement Statement +| `-MemberExpression Expression +| |-IdExpression Object +| | `-UnqualifiedId UnqualifiedId +| | `-'s' +| |-'.' 
AccessToken +| `-IdExpression Member +| `-UnqualifiedId UnqualifiedId +| `-'x' +|-'<' +|-'int' +|-'>' +|-';' +`-'}' CloseParen )txt"})); } @@ -2126,20 +2129,20 @@ void test(S* sp){ } )cpp", {R"txt( -UnknownExpression -|-MemberExpression -| |-IdExpression -| | `-UnqualifiedId -| | `-sp -| |--> -| `-IdExpression -| `-UnqualifiedId -| |-f -| |-< -| |-int -| `-> -|-( -`-) +CallExpression Expression +|-MemberExpression Callee +| |-IdExpression Object +| | `-UnqualifiedId UnqualifiedId +| | `-'sp' +| |-'->' AccessToken +| `-IdExpression Member +| `-UnqualifiedId UnqualifiedId +| |-'f' +| |-'<' +| |-'int' +| `-'>' +|-'(' OpenParen +`-')' CloseParen )txt"})); } @@ -2158,21 +2161,21 @@ void test(S s){ } )cpp", {R"txt( -UnknownExpression -|-MemberExpression -| |-IdExpression -| | `-UnqualifiedId -| | `-s -| |-. -| |-template -| `-IdExpression -| `-UnqualifiedId -| |-f -| |-< -| |-int -| `-> -|-( -`-) +CallExpression Expression +|-MemberExpression Callee +| |-IdExpression Object +| | `-UnqualifiedId UnqualifiedId +| | `-'s' +| |-'.' AccessToken +| |-'template' +| `-IdExpression Member +| `-UnqualifiedId UnqualifiedId +| |-'f' +| |-'<' +| |-'int' +| `-'>' +|-'(' OpenParen +`-')' CloseParen )txt"})); } @@ -2192,40 +2195,40 @@ void test(S s){ } )cpp", {R"txt( -UnknownExpression -|-MemberExpression -| |-IdExpression -| | `-UnqualifiedId -| | `-s -| |-. -| `-IdExpression -| |-NestedNameSpecifier -| | |-IdentifierNameSpecifier -| | | `-Base -| | `-:: -| `-UnqualifiedId -| `-f -|-( -`-) +CallExpression Expression +|-MemberExpression Callee +| |-IdExpression Object +| | `-UnqualifiedId UnqualifiedId +| | `-'s' +| |-'.' AccessToken +| `-IdExpression Member +| |-NestedNameSpecifier Qualifier +| | |-IdentifierNameSpecifier ListElement +| | | `-'Base' +| | `-'::' ListDelimiter +| `-UnqualifiedId UnqualifiedId +| `-'f' +|-'(' OpenParen +`-')' CloseParen )txt", R"txt( -UnknownExpression -|-MemberExpression -| |-IdExpression -| | `-UnqualifiedId -| | `-s -| |-. 
-| `-IdExpression -| |-NestedNameSpecifier -| | |-:: -| | |-IdentifierNameSpecifier -| | | `-S -| | `-:: -| `-UnqualifiedId -| |-~ -| `-S -|-( -`-) +CallExpression Expression +|-MemberExpression Callee +| |-IdExpression Object +| | `-UnqualifiedId UnqualifiedId +| | `-'s' +| |-'.' AccessToken +| `-IdExpression Member +| |-NestedNameSpecifier Qualifier +| | |-'::' ListDelimiter +| | |-IdentifierNameSpecifier ListElement +| | | `-'S' +| | `-'::' ListDelimiter +| `-UnqualifiedId UnqualifiedId +| |-'~' +| `-'S' +|-'(' OpenParen +`-')' CloseParen )txt"})); } @@ -2253,510 +2256,1014 @@ void test(S* sp) { } )cpp", {R"txt( -UnknownExpression -|-MemberExpression -| |-UnknownExpression -| | |-MemberExpression -| | | |-IdExpression -| | | | `-UnqualifiedId -| | | | `-sp -| | | |--> -| | | `-IdExpression -| | | `-UnqualifiedId -| | | `-getU -| | |-( -| | `-) -| |-. -| `-IdExpression -| |-NestedNameSpecifier -| | |-SimpleTemplateNameSpecifier -| | | |-template -| | | |-U -| | | |-< -| | | |-int -| | | `-> -| | `-:: -| |-template -| `-UnqualifiedId -| |-f -| |-< -| |-int -| `-> -|-( -`-) +CallExpression Expression +|-MemberExpression Callee +| |-CallExpression Object +| | |-MemberExpression Callee +| | | |-IdExpression Object +| | | | `-UnqualifiedId UnqualifiedId +| | | | `-'sp' +| | | |-'->' AccessToken +| | | `-IdExpression Member +| | | `-UnqualifiedId UnqualifiedId +| | | `-'getU' +| | |-'(' OpenParen +| | `-')' CloseParen +| |-'.' 
AccessToken +| `-IdExpression Member +| |-NestedNameSpecifier Qualifier +| | |-SimpleTemplateNameSpecifier ListElement +| | | |-'template' +| | | |-'U' +| | | |-'<' +| | | |-'int' +| | | `-'>' +| | `-'::' ListDelimiter +| |-'template' TemplateKeyword +| `-UnqualifiedId UnqualifiedId +| |-'f' +| |-'<' +| |-'int' +| `-'>' +|-'(' OpenParen +`-')' CloseParen )txt"})); } -TEST_P(SyntaxTreeTest, MultipleDeclaratorsGrouping) { - EXPECT_TRUE(treeDumpEqual( +TEST_P(SyntaxTreeTest, CallExpression_Callee_Member) { + if (!GetParam().isCXX()) { + return; + } + EXPECT_TRUE(treeDumpEqualOnAnnotations( R"cpp( -int *a, b; -int *c, d; +struct S{ + void f(); +}; +void test(S s) { + [[s.f()]]; +} )cpp", - R"txt( -*: TranslationUnit -|-SimpleDeclaration -| |-int -| |-SimpleDeclarator -| | |-* -| | `-a -| |-, -| |-SimpleDeclarator -| | `-b -| `-; -`-SimpleDeclaration - |-int - |-SimpleDeclarator - | |-* - | `-c - |-, - |-SimpleDeclarator - | `-d - `-; -)txt")); + {R"txt( +CallExpression Expression +|-MemberExpression Callee +| |-IdExpression Object +| | `-UnqualifiedId UnqualifiedId +| | `-'s' +| |-'.' 
AccessToken +| `-IdExpression Member +| `-UnqualifiedId UnqualifiedId +| `-'f' +|-'(' OpenParen +`-')' CloseParen +)txt"})); } -TEST_P(SyntaxTreeTest, MultipleDeclaratorsGroupingTypedef) { - EXPECT_TRUE(treeDumpEqual( +TEST_P(SyntaxTreeTest, CallExpression_Callee_OperatorParens) { + if (!GetParam().isCXX()) { + return; + } + EXPECT_TRUE(treeDumpEqualOnAnnotations( R"cpp( -typedef int *a, b; +struct S { + void operator()(); +}; +void test(S s) { + [[s()]]; +} )cpp", - R"txt( -*: TranslationUnit -`-SimpleDeclaration - |-typedef - |-int - |-SimpleDeclarator - | |-* - | `-a - |-, - |-SimpleDeclarator - | `-b - `-; -)txt")); + {R"txt( +CallExpression Expression +|-IdExpression Callee +| `-UnqualifiedId UnqualifiedId +| `-'s' +|-'(' OpenParen +`-')' CloseParen +)txt"})); } -TEST_P(SyntaxTreeTest, MultipleDeclaratorsInsideStatement) { - EXPECT_TRUE(treeDumpEqual( +TEST_P(SyntaxTreeTest, CallExpression_Callee_OperatorParensChaining) { + if (!GetParam().isCXX()) { + return; + } + EXPECT_TRUE(treeDumpEqualOnAnnotations( R"cpp( -void foo() { - int *a, b; - typedef int *ta, tb; +struct S { + S operator()(); +}; +void test(S s) { + [[s()()]]; } )cpp", - R"txt( -*: TranslationUnit -`-SimpleDeclaration - |-void - |-SimpleDeclarator - | |-foo - | `-ParametersAndQualifiers - | |-( - | `-) - `-CompoundStatement - |-{ - |-DeclarationStatement - | |-SimpleDeclaration - | | |-int - | | |-SimpleDeclarator - | | | |-* - | | | `-a - | | |-, - | | `-SimpleDeclarator - | | `-b - | `-; - |-DeclarationStatement - | |-SimpleDeclaration - | | |-typedef - | | |-int - | | |-SimpleDeclarator - | | | |-* - | | | `-ta - | | |-, - | | `-SimpleDeclarator - | | `-tb - | `-; - `-} -)txt")); + {R"txt( +CallExpression Expression +|-CallExpression Callee +| |-IdExpression Callee +| | `-UnqualifiedId UnqualifiedId +| | `-'s' +| |-'(' OpenParen +| `-')' CloseParen +|-'(' OpenParen +`-')' CloseParen +)txt"})); } -TEST_P(SyntaxTreeTest, SizeTTypedef) { - if (!GetParam().isCXX11OrLater()) { 
+TEST_P(SyntaxTreeTest, CallExpression_Callee_MemberWithThis) { + if (!GetParam().isCXX()) { return; } - EXPECT_TRUE(treeDumpEqual( + EXPECT_TRUE(treeDumpEqualOnAnnotations( R"cpp( -typedef decltype(sizeof(void *)) size_t; - )cpp", - R"txt( -*: TranslationUnit -`-SimpleDeclaration - |-typedef - |-decltype - |-( - |-UnknownExpression - | |-sizeof - | |-( - | |-void - | |-* - | `-) - |-) - |-SimpleDeclarator - | `-size_t - `-; -)txt")); +struct Base { + void f(); +}; +struct S: public Base { + void f(); + void test() { + [[this->f()]]; + [[f()]]; + [[this->Base::f()]]; + } +}; +)cpp", + {R"txt( +CallExpression Expression +|-MemberExpression Callee +| |-ThisExpression Object +| | `-'this' IntroducerKeyword +| |-'->' AccessToken +| `-IdExpression Member +| `-UnqualifiedId UnqualifiedId +| `-'f' +|-'(' OpenParen +`-')' CloseParen + )txt", + R"txt( +CallExpression Expression +|-IdExpression Callee +| `-UnqualifiedId UnqualifiedId +| `-'f' +|-'(' OpenParen +`-')' CloseParen + )txt", + R"txt( +CallExpression Expression +|-MemberExpression Callee +| |-ThisExpression Object +| | `-'this' IntroducerKeyword +| |-'->' AccessToken +| `-IdExpression Member +| |-NestedNameSpecifier Qualifier +| | |-IdentifierNameSpecifier ListElement +| | | `-'Base' +| | `-'::' ListDelimiter +| `-UnqualifiedId UnqualifiedId +| `-'f' +|-'(' OpenParen +`-')' CloseParen +)txt"})); } -TEST_P(SyntaxTreeTest, Namespace_Nested) { +TEST_P(SyntaxTreeTest, CallExpression_Callee_FunctionPointer) { if (!GetParam().isCXX()) { return; } - EXPECT_TRUE(treeDumpEqual( + EXPECT_TRUE(treeDumpEqualOnAnnotations( R"cpp( -namespace a { namespace b {} } -namespace a::b {} +void (*pf)(); +void test() { + [[pf()]]; + [[(*pf)()]]; +} )cpp", - R"txt( -*: TranslationUnit -|-NamespaceDefinition -| |-namespace -| |-a -| |-{ -| |-NamespaceDefinition -| | |-namespace -| | |-b -| | |-{ -| | `-} -| `-} -`-NamespaceDefinition - |-namespace - |-a - |-:: - |-b - |-{ - `-} -)txt")); + {R"txt( +CallExpression Expression +|-IdExpression 
Callee +| `-UnqualifiedId UnqualifiedId +| `-'pf' +|-'(' OpenParen +`-')' CloseParen +)txt", + R"txt( +CallExpression Expression +|-ParenExpression Callee +| |-'(' OpenParen +| |-PrefixUnaryOperatorExpression SubExpression +| | |-'*' OperatorToken +| | `-IdExpression Operand +| | `-UnqualifiedId UnqualifiedId +| | `-'pf' +| `-')' CloseParen +|-'(' OpenParen +`-')' CloseParen +)txt"})); } -TEST_P(SyntaxTreeTest, Namespace_Unnamed) { +TEST_P(SyntaxTreeTest, CallExpression_Callee_MemberFunctionPointer) { if (!GetParam().isCXX()) { return; } - EXPECT_TRUE(treeDumpEqual( + EXPECT_TRUE(treeDumpEqualOnAnnotations( R"cpp( -namespace {} +struct S { + void f(); +}; +void test(S s) { + void (S::*pmf)(); + pmf = &S::f; + [[(s.*pmf)()]]; +} )cpp", - R"txt( -*: TranslationUnit -`-NamespaceDefinition - |-namespace - |-{ - `-} -)txt")); + {R"txt( +CallExpression Expression +|-ParenExpression Callee +| |-'(' OpenParen +| |-BinaryOperatorExpression SubExpression +| | |-IdExpression LeftHandSide +| | | `-UnqualifiedId UnqualifiedId +| | | `-'s' +| | |-'.*' OperatorToken +| | `-IdExpression RightHandSide +| | `-UnqualifiedId UnqualifiedId +| | `-'pmf' +| `-')' CloseParen +|-'(' OpenParen +`-')' CloseParen +)txt"})); } -TEST_P(SyntaxTreeTest, Namespace_Alias) { +TEST_P(SyntaxTreeTest, CallExpression_Arguments_Zero) { if (!GetParam().isCXX()) { return; } EXPECT_TRUE(treeDumpEqualOnAnnotations( R"cpp( -namespace a {} -[[namespace foo = a;]] +void f(); +void test() { + [[f();]] +} )cpp", {R"txt( -NamespaceAliasDefinition -|-namespace -|-foo -|-= -|-a -`-; +ExpressionStatement Statement +|-CallExpression Expression +| |-IdExpression Callee +| | `-UnqualifiedId UnqualifiedId +| | `-'f' +| |-'(' OpenParen +| `-')' CloseParen +`-';' )txt"})); } -TEST_P(SyntaxTreeTest, UsingDirective) { +TEST_P(SyntaxTreeTest, CallExpression_Arguments_One) { if (!GetParam().isCXX()) { return; } EXPECT_TRUE(treeDumpEqualOnAnnotations( R"cpp( -namespace ns {} -[[using namespace ::ns;]] +void f(int); +void test() 
{ + [[f(1);]] +} )cpp", {R"txt( -UsingNamespaceDirective -|-using -|-namespace -|-NestedNameSpecifier -| `-:: -|-ns -`-; +ExpressionStatement Statement +|-CallExpression Expression +| |-IdExpression Callee +| | `-UnqualifiedId UnqualifiedId +| | `-'f' +| |-'(' OpenParen +| |-CallArguments Arguments +| | `-IntegerLiteralExpression ListElement +| | `-'1' LiteralToken +| `-')' CloseParen +`-';' )txt"})); } -TEST_P(SyntaxTreeTest, UsingDeclaration) { +TEST_P(SyntaxTreeTest, CallExpression_Arguments_Multiple) { if (!GetParam().isCXX()) { return; } EXPECT_TRUE(treeDumpEqualOnAnnotations( R"cpp( -namespace ns { int a; } -[[using ns::a;]] +void f(int, char, float); +void test() { + [[f(1, '2', 3.);]] +} )cpp", {R"txt( -UsingDeclaration -|-using -|-NestedNameSpecifier -| |-IdentifierNameSpecifier -| | `-ns -| `-:: -|-a -`-; +ExpressionStatement Statement +|-CallExpression Expression +| |-IdExpression Callee +| | `-UnqualifiedId UnqualifiedId +| | `-'f' +| |-'(' OpenParen +| |-CallArguments Arguments +| | |-IntegerLiteralExpression ListElement +| | | `-'1' LiteralToken +| | |-',' ListDelimiter +| | |-CharacterLiteralExpression ListElement +| | | `-''2'' LiteralToken +| | |-',' ListDelimiter +| | `-FloatingLiteralExpression ListElement +| | `-'3.' LiteralToken +| `-')' CloseParen +`-';' )txt"})); } -TEST_P(SyntaxTreeTest, FreeStandingClasses) { - // Free-standing classes, must live inside a SimpleDeclaration. 
- EXPECT_TRUE(treeDumpEqual( +TEST_P(SyntaxTreeTest, CallExpression_Arguments_Assignment) { + if (!GetParam().isCXX()) { + return; + } + EXPECT_TRUE(treeDumpEqualOnAnnotations( R"cpp( -struct X; -struct X {}; - -struct Y *y1; -struct Y {} *y2; - -struct {} *a1; +void f(int); +void test(int a) { + [[f(a = 1);]] +} )cpp", - R"txt( -*: TranslationUnit -|-SimpleDeclaration -| |-struct -| |-X -| `-; -|-SimpleDeclaration -| |-struct -| |-X -| |-{ -| |-} -| `-; -|-SimpleDeclaration -| |-struct -| |-Y -| |-SimpleDeclarator -| | |-* -| | `-y1 -| `-; -|-SimpleDeclaration -| |-struct -| |-Y -| |-{ -| |-} -| |-SimpleDeclarator -| | |-* -| | `-y2 -| `-; -`-SimpleDeclaration - |-struct - |-{ - |-} - |-SimpleDeclarator - | |-* - | `-a1 - `-; -)txt")); + {R"txt( +ExpressionStatement Statement +|-CallExpression Expression +| |-IdExpression Callee +| | `-UnqualifiedId UnqualifiedId +| | `-'f' +| |-'(' OpenParen +| |-CallArguments Arguments +| | `-BinaryOperatorExpression ListElement +| | |-IdExpression LeftHandSide +| | | `-UnqualifiedId UnqualifiedId +| | | `-'a' +| | |-'=' OperatorToken +| | `-IntegerLiteralExpression RightHandSide +| | `-'1' LiteralToken +| `-')' CloseParen +`-';' +)txt"})); } -TEST_P(SyntaxTreeTest, StaticMemberFunction) { +TEST_P(SyntaxTreeTest, CallExpression_Arguments_BracedInitList_Empty) { if (!GetParam().isCXX11OrLater()) { return; } - EXPECT_TRUE(treeDumpEqual( + EXPECT_TRUE(treeDumpEqualOnAnnotations( R"cpp( -struct S { - static void f(){} -}; +void f(int[]); +void test() { + [[f({});]] +} )cpp", - R"txt( -*: TranslationUnit -`-SimpleDeclaration - |-struct - |-S - |-{ - |-SimpleDeclaration - | |-static - | |-void - | |-SimpleDeclarator - | | |-f - | | `-ParametersAndQualifiers - | | |-( - | | `-) - | `-CompoundStatement - | |-{ - | `-} - |-} - `-; -)txt")); + {R"txt( +ExpressionStatement Statement +|-CallExpression Expression +| |-IdExpression Callee +| | `-UnqualifiedId UnqualifiedId +| | `-'f' +| |-'(' OpenParen +| |-CallArguments Arguments +| | 
`-UnknownExpression ListElement +| | `-UnknownExpression +| | |-'{' +| | `-'}' +| `-')' CloseParen +`-';' +)txt"})); } -TEST_P(SyntaxTreeTest, ConversionMemberFunction) { - if (!GetParam().isCXX()) { +TEST_P(SyntaxTreeTest, CallExpression_Arguments_BracedInitList_Simple) { + if (!GetParam().isCXX11OrLater()) { return; } - EXPECT_TRUE(treeDumpEqual( + EXPECT_TRUE(treeDumpEqualOnAnnotations( R"cpp( -struct X { - operator int(); +struct TT {}; +struct T{ + int a; + TT b; }; +void f(T); +void test() { + [[f({1, {}});]] +} )cpp", - R"txt( -*: TranslationUnit -`-SimpleDeclaration - |-struct - |-X - |-{ - |-SimpleDeclaration - | |-SimpleDeclarator - | | |-operator - | | |-int - | | `-ParametersAndQualifiers - | | |-( - | | `-) - | `-; - |-} - `-; -)txt")); + {R"txt( +ExpressionStatement Statement +|-CallExpression Expression +| |-IdExpression Callee +| | `-UnqualifiedId UnqualifiedId +| | `-'f' +| |-'(' OpenParen +| |-CallArguments Arguments +| | `-UnknownExpression ListElement +| | `-UnknownExpression +| | |-'{' +| | |-IntegerLiteralExpression +| | | `-'1' LiteralToken +| | |-',' +| | |-UnknownExpression +| | | `-UnknownExpression +| | | |-'{' +| | | `-'}' +| | `-'}' +| `-')' CloseParen +`-';' +)txt"})); } -TEST_P(SyntaxTreeTest, LiteralOperatorDeclaration) { +TEST_P(SyntaxTreeTest, CallExpression_Arguments_BracedInitList_Designated) { if (!GetParam().isCXX11OrLater()) { return; } - EXPECT_TRUE(treeDumpEqual( + EXPECT_TRUE(treeDumpEqualOnAnnotations( R"cpp( -unsigned operator "" _c(char); - )cpp", - R"txt( -*: TranslationUnit -`-SimpleDeclaration - |-unsigned - |-SimpleDeclarator - | |-operator - | |-"" - | |-_c - | `-ParametersAndQualifiers - | |-( - | |-SimpleDeclaration - | | `-char - | `-) - `-; -)txt")); +struct TT {}; +struct T{ + int a; + TT b; +}; +void f(T); +void test() { + [[f({.a = 1, .b {}});]] +} +)cpp", + {R"txt( +ExpressionStatement Statement +|-CallExpression Expression +| |-IdExpression Callee +| | `-UnqualifiedId UnqualifiedId +| | `-'f' +| |-'(' 
OpenParen +| |-CallArguments Arguments +| | `-UnknownExpression ListElement +| | `-UnknownExpression +| | |-'{' +| | |-UnknownExpression +| | | |-'.' +| | | |-'a' +| | | |-'=' +| | | `-IntegerLiteralExpression +| | | `-'1' LiteralToken +| | |-',' +| | |-UnknownExpression +| | | |-'.' +| | | |-'b' +| | | `-UnknownExpression +| | | `-UnknownExpression +| | | |-'{' +| | | `-'}' +| | `-'}' +| `-')' CloseParen +`-';' +)txt"})); } -TEST_P(SyntaxTreeTest, NumericLiteralOperatorTemplateDeclaration) { - if (!GetParam().isCXX11OrLater()) { +TEST_P(SyntaxTreeTest, CallExpression_Arguments_ParameterPack) { + if (!GetParam().isCXX11OrLater() || GetParam().hasDelayedTemplateParsing()) { return; } - EXPECT_TRUE(treeDumpEqual( + EXPECT_TRUE(treeDumpEqualOnAnnotations( R"cpp( -template -unsigned operator "" _t(); +template +void test(T t, Args... args) { + [[test(args...)]]; +} +)cpp", + {R"txt( +CallExpression Expression +|-UnknownExpression Callee +| `-'test' +|-'(' OpenParen +|-CallArguments Arguments +| `-UnknownExpression ListElement +| |-IdExpression +| | `-UnqualifiedId UnqualifiedId +| | `-'args' +| `-'...' 
+`-')' CloseParen +)txt"})); +} + +TEST_P(SyntaxTreeTest, MultipleDeclaratorsGrouping) { + EXPECT_TRUE(treeDumpEqual( + R"cpp( +int *a, b; +int *c, d; +)cpp", + R"txt( +TranslationUnit Detached +|-SimpleDeclaration +| |-'int' +| |-SimpleDeclarator Declarator +| | |-'*' +| | `-'a' +| |-',' +| |-SimpleDeclarator Declarator +| | `-'b' +| `-';' +`-SimpleDeclaration + |-'int' + |-SimpleDeclarator Declarator + | |-'*' + | `-'c' + |-',' + |-SimpleDeclarator Declarator + | `-'d' + `-';' +)txt")); +} + +TEST_P(SyntaxTreeTest, MultipleDeclaratorsGroupingTypedef) { + EXPECT_TRUE(treeDumpEqual( + R"cpp( +typedef int *a, b; +)cpp", + R"txt( +TranslationUnit Detached +`-SimpleDeclaration + |-'typedef' + |-'int' + |-SimpleDeclarator Declarator + | |-'*' + | `-'a' + |-',' + |-SimpleDeclarator Declarator + | `-'b' + `-';' +)txt")); +} + +TEST_P(SyntaxTreeTest, MultipleDeclaratorsInsideStatement) { + EXPECT_TRUE(treeDumpEqual( + R"cpp( +void foo() { + int *a, b; + typedef int *ta, tb; +} +)cpp", + R"txt( +TranslationUnit Detached +`-SimpleDeclaration + |-'void' + |-SimpleDeclarator Declarator + | |-'foo' + | `-ParametersAndQualifiers + | |-'(' OpenParen + | `-')' CloseParen + `-CompoundStatement + |-'{' OpenParen + |-DeclarationStatement Statement + | |-SimpleDeclaration + | | |-'int' + | | |-SimpleDeclarator Declarator + | | | |-'*' + | | | `-'a' + | | |-',' + | | `-SimpleDeclarator Declarator + | | `-'b' + | `-';' + |-DeclarationStatement Statement + | |-SimpleDeclaration + | | |-'typedef' + | | |-'int' + | | |-SimpleDeclarator Declarator + | | | |-'*' + | | | `-'ta' + | | |-',' + | | `-SimpleDeclarator Declarator + | | `-'tb' + | `-';' + `-'}' CloseParen +)txt")); +} + +TEST_P(SyntaxTreeTest, SizeTTypedef) { + if (!GetParam().isCXX11OrLater()) { + return; + } + EXPECT_TRUE(treeDumpEqual( + R"cpp( +typedef decltype(sizeof(void *)) size_t; )cpp", R"txt( -*: TranslationUnit -`-TemplateDeclaration - |-template - |-< - |-SimpleDeclaration - | `-char - |-... 
- |-> - `-SimpleDeclaration - |-unsigned - |-SimpleDeclarator - | |-operator - | |-"" - | |-_t - | `-ParametersAndQualifiers - | |-( - | `-) - `-; +TranslationUnit Detached +`-SimpleDeclaration + |-'typedef' + |-'decltype' + |-'(' + |-UnknownExpression + | |-'sizeof' + | |-'(' + | |-'void' + | |-'*' + | `-')' + |-')' + |-SimpleDeclarator Declarator + | `-'size_t' + `-';' )txt")); } -TEST_P(SyntaxTreeTest, OverloadedOperatorDeclaration) { +TEST_P(SyntaxTreeTest, Namespace_Nested) { + if (!GetParam().isCXX()) { + return; + } + EXPECT_TRUE(treeDumpEqual( + R"cpp( +namespace a { namespace b {} } +)cpp", + R"txt( +TranslationUnit Detached +`-NamespaceDefinition + |-'namespace' + |-'a' + |-'{' + |-NamespaceDefinition + | |-'namespace' + | |-'b' + | |-'{' + | `-'}' + `-'}' +)txt")); +} + +TEST_P(SyntaxTreeTest, Namespace_NestedDefinition) { + if (!GetParam().isCXX17OrLater()) { + return; + } + EXPECT_TRUE(treeDumpEqual( + R"cpp( +namespace a::b {} +)cpp", + R"txt( +TranslationUnit Detached +`-NamespaceDefinition + |-'namespace' + |-'a' + |-'::' + |-'b' + |-'{' + `-'}' +)txt")); +} + +TEST_P(SyntaxTreeTest, Namespace_Unnamed) { if (!GetParam().isCXX()) { return; } EXPECT_TRUE(treeDumpEqual( R"cpp( +namespace {} +)cpp", + R"txt( +TranslationUnit Detached +`-NamespaceDefinition + |-'namespace' + |-'{' + `-'}' +)txt")); +} + +TEST_P(SyntaxTreeTest, Namespace_Alias) { + if (!GetParam().isCXX()) { + return; + } + EXPECT_TRUE(treeDumpEqualOnAnnotations( + R"cpp( +namespace a {} +[[namespace foo = a;]] +)cpp", + {R"txt( +NamespaceAliasDefinition +|-'namespace' +|-'foo' +|-'=' +|-'a' +`-';' +)txt"})); +} + +TEST_P(SyntaxTreeTest, UsingDirective) { + if (!GetParam().isCXX()) { + return; + } + EXPECT_TRUE(treeDumpEqualOnAnnotations( + R"cpp( +namespace ns {} +[[using namespace ::ns;]] +)cpp", + {R"txt( +UsingNamespaceDirective +|-'using' +|-'namespace' +|-NestedNameSpecifier +| `-'::' ListDelimiter +|-'ns' +`-';' +)txt"})); +} + +TEST_P(SyntaxTreeTest, UsingDeclaration_Namespace) { 
+ if (!GetParam().isCXX()) { + return; + } + EXPECT_TRUE(treeDumpEqualOnAnnotations( + R"cpp( +namespace ns { int a; } +[[using ns::a;]] +)cpp", + {R"txt( +UsingDeclaration +|-'using' +|-NestedNameSpecifier +| |-IdentifierNameSpecifier ListElement +| | `-'ns' +| `-'::' ListDelimiter +|-'a' +`-';' +)txt"})); +} + +TEST_P(SyntaxTreeTest, UsingDeclaration_ClassMember) { + if (!GetParam().isCXX()) { + return; + } + EXPECT_TRUE(treeDumpEqualOnAnnotations( + R"cpp( +template struct X { + [[using T::foo;]] + [[using typename T::bar;]] +}; +)cpp", + {R"txt( +UsingDeclaration +|-'using' +|-NestedNameSpecifier +| |-IdentifierNameSpecifier ListElement +| | `-'T' +| `-'::' ListDelimiter +|-'foo' +`-';' +)txt", + R"txt( +UsingDeclaration +|-'using' +|-'typename' +|-NestedNameSpecifier +| |-IdentifierNameSpecifier ListElement +| | `-'T' +| `-'::' ListDelimiter +|-'bar' +`-';' +)txt"})); +} + +TEST_P(SyntaxTreeTest, UsingTypeAlias) { + if (!GetParam().isCXX11OrLater()) { + return; + } + EXPECT_TRUE(treeDumpEqual( + R"cpp( +using type = int; +)cpp", + R"txt( +TranslationUnit Detached +`-TypeAliasDeclaration + |-'using' + |-'type' + |-'=' + |-'int' + `-';' +)txt")); +} + +TEST_P(SyntaxTreeTest, FreeStandingClass_ForwardDeclaration) { + EXPECT_TRUE(treeDumpEqualOnAnnotations( + R"cpp( +[[struct X;]] +[[struct Y *y1;]] +)cpp", + {R"txt( +SimpleDeclaration +|-'struct' +|-'X' +`-';' +)txt", + R"txt( +SimpleDeclaration +|-'struct' +|-'Y' +|-SimpleDeclarator Declarator +| |-'*' +| `-'y1' +`-';' +)txt"})); +} + +TEST_P(SyntaxTreeTest, FreeStandingClasses_Definition) { + EXPECT_TRUE(treeDumpEqualOnAnnotations( + R"cpp( +[[struct X {};]] +[[struct Y {} *y2;]] +[[struct {} *a1;]] +)cpp", + {R"txt( +SimpleDeclaration +|-'struct' +|-'X' +|-'{' +|-'}' +`-';' +)txt", + R"txt( +SimpleDeclaration +|-'struct' +|-'Y' +|-'{' +|-'}' +|-SimpleDeclarator Declarator +| |-'*' +| `-'y2' +`-';' +)txt", + R"txt( +SimpleDeclaration +|-'struct' +|-'{' +|-'}' +|-SimpleDeclarator Declarator +| |-'*' +| `-'a1' 
+`-';' +)txt"})); +} + +TEST_P(SyntaxTreeTest, StaticMemberFunction) { + if (!GetParam().isCXX11OrLater()) { + return; + } + EXPECT_TRUE(treeDumpEqualOnAnnotations( + R"cpp( +struct S { + [[static void f(){}]] +}; +)cpp", + {R"txt( +SimpleDeclaration +|-'static' +|-'void' +|-SimpleDeclarator Declarator +| |-'f' +| `-ParametersAndQualifiers +| |-'(' OpenParen +| `-')' CloseParen +`-CompoundStatement + |-'{' OpenParen + `-'}' CloseParen +)txt"})); +} + +TEST_P(SyntaxTreeTest, ConversionMemberFunction) { + if (!GetParam().isCXX()) { + return; + } + EXPECT_TRUE(treeDumpEqualOnAnnotations( + R"cpp( struct X { - X& operator=(const X&); + [[operator int();]] }; )cpp", + {R"txt( +SimpleDeclaration +|-SimpleDeclarator Declarator +| |-'operator' +| |-'int' +| `-ParametersAndQualifiers +| |-'(' OpenParen +| `-')' CloseParen +`-';' +)txt"})); +} + +TEST_P(SyntaxTreeTest, LiteralOperatorDeclaration) { + if (!GetParam().isCXX11OrLater()) { + return; + } + EXPECT_TRUE(treeDumpEqual( + R"cpp( +unsigned operator "" _c(char); + )cpp", R"txt( -*: TranslationUnit +TranslationUnit Detached `-SimpleDeclaration - |-struct - |-X - |-{ + |-'unsigned' + |-SimpleDeclarator Declarator + | |-'operator' + | |-'""' + | |-'_c' + | `-ParametersAndQualifiers + | |-'(' OpenParen + | |-ParameterDeclarationList Parameters + | | `-SimpleDeclaration ListElement + | | `-'char' + | `-')' CloseParen + `-';' +)txt")); +} + +TEST_P(SyntaxTreeTest, NumericLiteralOperatorTemplateDeclaration) { + if (!GetParam().isCXX11OrLater()) { + return; + } + EXPECT_TRUE(treeDumpEqual( + R"cpp( +template +unsigned operator "" _t(); + )cpp", + R"txt( +TranslationUnit Detached +`-TemplateDeclaration Declaration + |-'template' IntroducerKeyword + |-'<' |-SimpleDeclaration - | |-X - | |-SimpleDeclarator - | | |-& - | | |-operator - | | |-= - | | `-ParametersAndQualifiers - | | |-( - | | |-SimpleDeclaration - | | | |-const - | | | |-X - | | | `-SimpleDeclarator - | | | `-& - | | `-) - | `-; - |-} - `-; + | `-'char' + |-'...' 
+ |-'>' + `-SimpleDeclaration + |-'unsigned' + |-SimpleDeclarator Declarator + | |-'operator' + | |-'""' + | |-'_t' + | `-ParametersAndQualifiers + | |-'(' OpenParen + | `-')' CloseParen + `-';' )txt")); } -TEST_P(SyntaxTreeTest, OverloadedOperatorFriendDeclarataion) { +TEST_P(SyntaxTreeTest, OverloadedOperatorDeclaration) { if (!GetParam().isCXX()) { return; } - EXPECT_TRUE(treeDumpEqual( + EXPECT_TRUE(treeDumpEqualOnAnnotations( R"cpp( struct X { - friend X operator+(X, const X&); + [[X& operator=(const X&);]] }; )cpp", - R"txt( -*: TranslationUnit + {R"txt( +SimpleDeclaration +|-'X' +|-SimpleDeclarator Declarator +| |-'&' +| |-'operator' +| |-'=' +| `-ParametersAndQualifiers +| |-'(' OpenParen +| |-ParameterDeclarationList Parameters +| | `-SimpleDeclaration ListElement +| | |-'const' +| | |-'X' +| | `-SimpleDeclarator Declarator +| | `-'&' +| `-')' CloseParen +`-';' +)txt"})); +} + +TEST_P(SyntaxTreeTest, OverloadedOperatorFriendDeclaration) { + if (!GetParam().isCXX()) { + return; + } + EXPECT_TRUE(treeDumpEqualOnAnnotations( + R"cpp( +struct X { + [[friend X operator+(X, const X&);]] +}; +)cpp", + {R"txt( +UnknownDeclaration `-SimpleDeclaration - |-struct - |-X - |-{ - |-UnknownDeclaration - | `-SimpleDeclaration - | |-friend - | |-X - | |-SimpleDeclarator - | | |-operator - | | |-+ - | | `-ParametersAndQualifiers - | | |-( - | | |-SimpleDeclaration - | | | `-X - | | |-, - | | |-SimpleDeclaration - | | | |-const - | | | |-X - | | | `-SimpleDeclarator - | | | `-& - | | `-) - | `-; - |-} - `-; -)txt")); + |-'friend' + |-'X' + |-SimpleDeclarator Declarator + | |-'operator' + | |-'+' + | `-ParametersAndQualifiers + | |-'(' OpenParen + | |-ParameterDeclarationList Parameters + | | |-SimpleDeclaration ListElement + | | | `-'X' + | | |-',' ListDelimiter + | | `-SimpleDeclaration ListElement + | | |-'const' + | | |-'X' + | | `-SimpleDeclarator Declarator + | | `-'&' + | `-')' CloseParen + `-';' +)txt"})); } TEST_P(SyntaxTreeTest, ClassTemplateDeclaration) { @@ 
-2769,20 +3276,20 @@ template struct ST {}; )cpp", R"txt( -*: TranslationUnit -`-TemplateDeclaration - |-template - |-< +TranslationUnit Detached +`-TemplateDeclaration Declaration + |-'template' IntroducerKeyword + |-'<' |-UnknownDeclaration - | |-typename - | `-T - |-> + | |-'typename' + | `-'T' + |-'>' `-SimpleDeclaration - |-struct - |-ST - |-{ - |-} - `-; + |-'struct' + |-'ST' + |-'{' + |-'}' + `-';' )txt")); } @@ -2796,27 +3303,27 @@ template T f(); )cpp", R"txt( -*: TranslationUnit -`-TemplateDeclaration - |-template - |-< +TranslationUnit Detached +`-TemplateDeclaration Declaration + |-'template' IntroducerKeyword + |-'<' |-UnknownDeclaration - | |-typename - | `-T - |-> + | |-'typename' + | `-'T' + |-'>' `-SimpleDeclaration - |-T - |-SimpleDeclarator - | |-f + |-'T' + |-SimpleDeclarator Declarator + | |-'f' | `-ParametersAndQualifiers - | |-( - | `-) - `-; + | |-'(' OpenParen + | `-')' CloseParen + `-';' )txt")); } TEST_P(SyntaxTreeTest, VariableTemplateDeclaration) { - if (!GetParam().isCXX()) { + if (!GetParam().isCXX14OrLater()) { return; } EXPECT_TRUE(treeDumpEqual( @@ -2824,22 +3331,22 @@ TEST_P(SyntaxTreeTest, VariableTemplateDeclaration) { template T var = 10; )cpp", R"txt( -*: TranslationUnit -`-TemplateDeclaration - |-template - |-< +TranslationUnit Detached +`-TemplateDeclaration Declaration + |-'template' IntroducerKeyword + |-'<' |-UnknownDeclaration - | |-class - | `-T - |-> + | |-'class' + | `-'T' + |-'>' `-SimpleDeclaration - |-T - |-SimpleDeclarator - | |-var - | |-= + |-'T' + |-SimpleDeclarator Declarator + | |-'var' + | |-'=' | `-IntegerLiteralExpression - | `-10 - `-; + | `-'10' LiteralToken + `-';' )txt")); } @@ -2847,38 +3354,31 @@ TEST_P(SyntaxTreeTest, StaticMemberFunctionTemplate) { if (!GetParam().isCXX()) { return; } - EXPECT_TRUE(treeDumpEqual( + EXPECT_TRUE(treeDumpEqualOnAnnotations( R"cpp( struct S { - template - static U f(); + [[template + static U f();]] }; )cpp", - R"txt( -*: TranslationUnit + {R"txt( +TemplateDeclaration 
Declaration +|-'template' IntroducerKeyword +|-'<' +|-UnknownDeclaration +| |-'typename' +| `-'U' +|-'>' `-SimpleDeclaration - |-struct - |-S - |-{ - |-TemplateDeclaration - | |-template - | |-< - | |-UnknownDeclaration - | | |-typename - | | `-U - | |-> - | `-SimpleDeclaration - | |-static - | |-U - | |-SimpleDeclarator - | | |-f - | | `-ParametersAndQualifiers - | | |-( - | | `-) - | `-; - |-} - `-; -)txt")); + |-'static' + |-'U' + |-SimpleDeclarator Declarator + | |-'f' + | `-ParametersAndQualifiers + | |-'(' OpenParen + | `-')' CloseParen + `-';' +)txt"})); } TEST_P(SyntaxTreeTest, NestedTemplates) { @@ -2894,35 +3394,35 @@ struct X { }; )cpp", R"txt( -*: TranslationUnit -`-TemplateDeclaration - |-template - |-< +TranslationUnit Detached +`-TemplateDeclaration Declaration + |-'template' IntroducerKeyword + |-'<' |-UnknownDeclaration - | |-class - | `-T - |-> + | |-'class' + | `-'T' + |-'>' `-SimpleDeclaration - |-struct - |-X - |-{ - |-TemplateDeclaration - | |-template - | |-< + |-'struct' + |-'X' + |-'{' + |-TemplateDeclaration Declaration + | |-'template' IntroducerKeyword + | |-'<' | |-UnknownDeclaration - | | |-class - | | `-U - | |-> + | | |-'class' + | | `-'U' + | |-'>' | `-SimpleDeclaration - | |-U - | |-SimpleDeclarator - | | |-foo + | |-'U' + | |-SimpleDeclarator Declarator + | | |-'foo' | | `-ParametersAndQualifiers - | | |-( - | | `-) - | `-; - |-} - `-; + | | |-'(' OpenParen + | | `-')' CloseParen + | `-';' + |-'}' + `-';' )txt")); } @@ -2941,238 +3441,176 @@ namespace n { } )cpp", R"txt( -*: TranslationUnit +TranslationUnit Detached `-NamespaceDefinition - |-namespace - |-n - |-{ - |-TemplateDeclaration - | |-template - | |-< + |-'namespace' + |-'n' + |-'{' + |-TemplateDeclaration Declaration + | |-'template' IntroducerKeyword + | |-'<' | |-UnknownDeclaration - | | |-typename - | | `-T - | |-> + | | |-'typename' + | | `-'T' + | |-'>' | `-SimpleDeclaration - | |-struct - | |-ST - | |-{ - | |-TemplateDeclaration - | | |-template - | | |-< + | 
|-'struct' + | |-'ST' + | |-'{' + | |-TemplateDeclaration Declaration + | | |-'template' IntroducerKeyword + | | |-'<' | | |-UnknownDeclaration - | | | |-typename - | | | `-U - | | |-> + | | | |-'typename' + | | | `-'U' + | | |-'>' | | `-SimpleDeclaration - | | |-static - | | |-U - | | |-SimpleDeclarator - | | | |-f + | | |-'static' + | | |-'U' + | | |-SimpleDeclarator Declarator + | | | |-'f' | | | `-ParametersAndQualifiers - | | | |-( - | | | `-) - | | `-; - | |-} - | `-; - `-} + | | | |-'(' OpenParen + | | | `-')' CloseParen + | | `-';' + | |-'}' + | `-';' + `-'}' )txt")); } -TEST_P(SyntaxTreeTest, Templates2) { +TEST_P(SyntaxTreeTest, ClassTemplate_MemberClassDefinition) { if (!GetParam().isCXX()) { return; } - EXPECT_TRUE(treeDumpEqual( + EXPECT_TRUE(treeDumpEqualOnAnnotations( R"cpp( template struct X { struct Y; }; -template struct X::Y {}; +[[template struct X::Y {};]] )cpp", - R"txt( -*: TranslationUnit -|-TemplateDeclaration -| |-template -| |-< -| |-UnknownDeclaration -| | |-class -| | `-T -| |-> -| `-SimpleDeclaration -| |-struct -| |-X -| |-{ -| |-SimpleDeclaration -| | |-struct -| | |-Y -| | `-; -| |-} -| `-; -`-TemplateDeclaration - |-template - |-< - |-UnknownDeclaration - | |-class - | `-T - |-> - `-SimpleDeclaration - |-struct - |-NestedNameSpecifier - | |-SimpleTemplateNameSpecifier - | | |-X - | | |-< - | | |-T - | | `-> - | `-:: - |-Y - |-{ - |-} - `-; -)txt")); + {R"txt( +TemplateDeclaration Declaration +|-'template' IntroducerKeyword +|-'<' +|-UnknownDeclaration +| |-'class' +| `-'T' +|-'>' +`-SimpleDeclaration + |-'struct' + |-NestedNameSpecifier + | |-SimpleTemplateNameSpecifier ListElement + | | |-'X' + | | |-'<' + | | |-'T' + | | `-'>' + | `-'::' ListDelimiter + |-'Y' + |-'{' + |-'}' + `-';' +)txt"})); } -TEST_P(SyntaxTreeTest, TemplatesUsingUsing) { +TEST_P(SyntaxTreeTest, ExplicitClassTemplateInstantation_Definition) { if (!GetParam().isCXX()) { return; } - EXPECT_TRUE(treeDumpEqual( + EXPECT_TRUE(treeDumpEqualOnAnnotations( R"cpp( 
-template struct X { - using T::foo; - using typename T::bar; -}; +template struct X {}; +[[template struct X;]] )cpp", - R"txt( -*: TranslationUnit -`-TemplateDeclaration - |-template - |-< - |-UnknownDeclaration - | |-class - | `-T - |-> - `-SimpleDeclaration - |-struct - |-X - |-{ - |-UsingDeclaration - | |-using - | |-NestedNameSpecifier - | | |-IdentifierNameSpecifier - | | | `-T - | | `-:: - | |-foo - | `-; - |-UsingDeclaration - | |-using - | |-typename - | |-NestedNameSpecifier - | | |-IdentifierNameSpecifier - | | | `-T - | | `-:: - | |-bar - | `-; - |-} - `-; -)txt")); + {R"txt( +ExplicitTemplateInstantiation +|-'template' IntroducerKeyword +`-SimpleDeclaration Declaration + |-'struct' + |-'X' + |-'<' + |-'double' + |-'>' + `-';' +)txt"})); } -TEST_P(SyntaxTreeTest, ExplicitTemplateInstantations) { +TEST_P(SyntaxTreeTest, ExplicitClassTemplateInstantation_Declaration) { if (!GetParam().isCXX()) { return; } - EXPECT_TRUE(treeDumpEqual( + EXPECT_TRUE(treeDumpEqualOnAnnotations( R"cpp( template struct X {}; -template struct X {}; -template <> struct X {}; +[[extern template struct X;]] +)cpp", + {R"txt( +ExplicitTemplateInstantiation +|-'extern' ExternKeyword +|-'template' IntroducerKeyword +`-SimpleDeclaration Declaration + |-'struct' + |-'X' + |-'<' + |-'float' + |-'>' + `-';' +)txt"})); +} -template struct X; -extern template struct X; +TEST_P(SyntaxTreeTest, ClassTemplateSpecialization_Partial) { + if (!GetParam().isCXX()) { + return; + } + EXPECT_TRUE(treeDumpEqualOnAnnotations( + R"cpp( +template struct X {}; +[[template struct X {};]] )cpp", - R"txt( -*: TranslationUnit -|-TemplateDeclaration -| |-template -| |-< -| |-UnknownDeclaration -| | |-class -| | `-T -| |-> -| `-SimpleDeclaration -| |-struct -| |-X -| |-{ -| |-} -| `-; -|-TemplateDeclaration -| |-template -| |-< -| |-UnknownDeclaration -| | |-class -| | `-T -| |-> -| `-SimpleDeclaration -| |-struct -| |-X -| |-< -| |-T -| |-* -| |-> -| |-{ -| |-} -| `-; -|-TemplateDeclaration -| |-template -| 
|-< -| |-> -| `-SimpleDeclaration -| |-struct -| |-X -| |-< -| |-int -| |-> -| |-{ -| |-} -| `-; -|-ExplicitTemplateInstantiation -| |-template -| `-SimpleDeclaration -| |-struct -| |-X -| |-< -| |-double -| |-> -| `-; -`-ExplicitTemplateInstantiation - |-extern - |-template - `-SimpleDeclaration - |-struct - |-X - |-< - |-float - |-> - `-; -)txt")); + {R"txt( +TemplateDeclaration Declaration +|-'template' IntroducerKeyword +|-'<' +|-UnknownDeclaration +| |-'class' +| `-'T' +|-'>' +`-SimpleDeclaration + |-'struct' + |-'X' + |-'<' + |-'T' + |-'*' + |-'>' + |-'{' + |-'}' + `-';' +)txt"})); } -TEST_P(SyntaxTreeTest, UsingType) { +TEST_P(SyntaxTreeTest, ClassTemplateSpecialization_Full) { if (!GetParam().isCXX()) { return; } - EXPECT_TRUE(treeDumpEqual( + EXPECT_TRUE(treeDumpEqualOnAnnotations( R"cpp( -using type = int; +template struct X {}; +[[template <> struct X {};]] )cpp", - R"txt( -*: TranslationUnit -`-TypeAliasDeclaration - |-using - |-type - |-= - |-int - `-; -)txt")); + {R"txt( +TemplateDeclaration Declaration +|-'template' IntroducerKeyword +|-'<' +|-'>' +`-SimpleDeclaration + |-'struct' + |-'X' + |-'<' + |-'int' + |-'>' + |-'{' + |-'}' + `-';' +)txt"})); } TEST_P(SyntaxTreeTest, EmptyDeclaration) { @@ -3181,9 +3619,9 @@ TEST_P(SyntaxTreeTest, EmptyDeclaration) { ; )cpp", R"txt( -*: TranslationUnit +TranslationUnit Detached `-EmptyDeclaration - `-; + `-';' )txt")); } @@ -3194,27 +3632,39 @@ TEST_P(SyntaxTreeTest, StaticAssert) { EXPECT_TRUE(treeDumpEqual( R"cpp( static_assert(true, "message"); +)cpp", + R"txt( +TranslationUnit Detached +`-StaticAssertDeclaration + |-'static_assert' + |-'(' + |-BoolLiteralExpression Condition + | `-'true' LiteralToken + |-',' + |-StringLiteralExpression Message + | `-'"message"' LiteralToken + |-')' + `-';' +)txt")); +} + +TEST_P(SyntaxTreeTest, StaticAssert_WithoutMessage) { + if (!GetParam().isCXX17OrLater()) { + return; + } + EXPECT_TRUE(treeDumpEqual( + R"cpp( static_assert(true); )cpp", R"txt( -*: TranslationUnit 
-|-StaticAssertDeclaration -| |-static_assert -| |-( -| |-BoolLiteralExpression -| | `-true -| |-, -| |-StringLiteralExpression -| | `-"message" -| |-) -| `-; +TranslationUnit Detached `-StaticAssertDeclaration - |-static_assert - |-( - |-BoolLiteralExpression - | `-true - |-) - `-; + |-'static_assert' + |-'(' + |-BoolLiteralExpression Condition + | `-'true' LiteralToken + |-')' + `-';' )txt")); } @@ -3228,30 +3678,30 @@ extern "C" int a; extern "C" { int b; int c; } )cpp", R"txt( -*: TranslationUnit +TranslationUnit Detached |-LinkageSpecificationDeclaration -| |-extern -| |-"C" +| |-'extern' +| |-'"C"' | `-SimpleDeclaration -| |-int -| |-SimpleDeclarator -| | `-a -| `-; +| |-'int' +| |-SimpleDeclarator Declarator +| | `-'a' +| `-';' `-LinkageSpecificationDeclaration - |-extern - |-"C" - |-{ + |-'extern' + |-'"C"' + |-'{' |-SimpleDeclaration - | |-int - | |-SimpleDeclarator - | | `-b - | `-; + | |-'int' + | |-SimpleDeclarator Declarator + | | `-'b' + | `-';' |-SimpleDeclaration - | |-int - | |-SimpleDeclarator - | | `-c - | `-; - `-} + | |-'int' + | |-SimpleDeclarator Declarator + | | `-'c' + | `-';' + `-'}' )txt")); } @@ -3265,34 +3715,34 @@ void test() { HALF_IF HALF_IF_2 else {} })cpp", R"txt( -*: TranslationUnit +TranslationUnit Detached `-SimpleDeclaration - |-void - |-SimpleDeclarator - | |-test + |-'void' + |-SimpleDeclarator Declarator + | |-'test' | `-ParametersAndQualifiers - | |-( - | `-) + | |-'(' OpenParen + | `-')' CloseParen `-CompoundStatement - |-{ - |-IfStatement - | |-I: if - | |-I: ( - | |-I: BinaryOperatorExpression - | | |-I: IntegerLiteralExpression - | | | `-I: 1 - | | |-I: + - | | `-I: IntegerLiteralExpression - | | `-I: 1 - | |-I: ) - | |-I: CompoundStatement - | | |-I: { - | | `-I: } - | |-else - | `-CompoundStatement - | |-{ - | `-} - `-} + |-'{' OpenParen + |-IfStatement Statement + | |-'if' IntroducerKeyword unmodifiable + | |-'(' unmodifiable + | |-BinaryOperatorExpression unmodifiable + | | |-IntegerLiteralExpression LeftHandSide 
unmodifiable + | | | `-'1' LiteralToken unmodifiable + | | |-'+' OperatorToken unmodifiable + | | `-IntegerLiteralExpression RightHandSide unmodifiable + | | `-'1' LiteralToken unmodifiable + | |-')' unmodifiable + | |-CompoundStatement ThenStatement unmodifiable + | | |-'{' OpenParen unmodifiable + | | `-'}' CloseParen unmodifiable + | |-'else' ElseKeyword + | `-CompoundStatement ElseStatement + | |-'{' OpenParen + | `-'}' CloseParen + `-'}' CloseParen )txt")); } @@ -3314,98 +3764,116 @@ void test() { } )cpp", R"txt( -*: TranslationUnit +TranslationUnit Detached `-SimpleDeclaration - |-void - |-SimpleDeclarator - | |-test + |-'void' + |-SimpleDeclarator Declarator + | |-'test' | `-ParametersAndQualifiers - | |-( - | `-) + | |-'(' OpenParen + | `-')' CloseParen `-CompoundStatement - |-{ - |-CompoundStatement - | |-{ - | |-ExpressionStatement - | | |-IntegerLiteralExpression - | | | `-1 - | | `-; - | `-} - |-CompoundStatement - | |-{ - | |-ExpressionStatement - | | |-IntegerLiteralExpression - | | | `-2 - | | `-; - | `-} - `-} + |-'{' OpenParen + |-CompoundStatement Statement + | |-'{' OpenParen + | |-ExpressionStatement Statement + | | |-IntegerLiteralExpression Expression + | | | `-'1' LiteralToken + | | `-';' + | `-'}' CloseParen + |-CompoundStatement Statement + | |-'{' OpenParen + | |-ExpressionStatement Statement + | | |-IntegerLiteralExpression Expression + | | | `-'2' LiteralToken + | | `-';' + | `-'}' CloseParen + `-'}' CloseParen )txt")); } -TEST_P(SyntaxTreeTest, ArraySubscriptsInDeclarators) { +TEST_P(SyntaxTreeTest, ArrayDeclarator_Simple) { EXPECT_TRUE(treeDumpEqual( R"cpp( int a[10]; +)cpp", + R"txt( +TranslationUnit Detached +`-SimpleDeclaration + |-'int' + |-SimpleDeclarator Declarator + | |-'a' + | `-ArraySubscript + | |-'[' OpenParen + | |-IntegerLiteralExpression Size + | | `-'10' LiteralToken + | `-']' CloseParen + `-';' +)txt")); +} + +TEST_P(SyntaxTreeTest, ArrayDeclarator_Multidimensional) { + EXPECT_TRUE(treeDumpEqual( + R"cpp( int 
b[1][2][3]; +)cpp", + R"txt( +TranslationUnit Detached +`-SimpleDeclaration + |-'int' + |-SimpleDeclarator Declarator + | |-'b' + | |-ArraySubscript + | | |-'[' OpenParen + | | |-IntegerLiteralExpression Size + | | | `-'1' LiteralToken + | | `-']' CloseParen + | |-ArraySubscript + | | |-'[' OpenParen + | | |-IntegerLiteralExpression Size + | | | `-'2' LiteralToken + | | `-']' CloseParen + | `-ArraySubscript + | |-'[' OpenParen + | |-IntegerLiteralExpression Size + | | `-'3' LiteralToken + | `-']' CloseParen + `-';' +)txt")); +} + +TEST_P(SyntaxTreeTest, ArrayDeclarator_UnknownBound) { + EXPECT_TRUE(treeDumpEqual( + R"cpp( int c[] = {1,2,3}; )cpp", R"txt( -*: TranslationUnit -|-SimpleDeclaration -| |-int -| |-SimpleDeclarator -| | |-a -| | `-ArraySubscript -| | |-[ -| | |-IntegerLiteralExpression -| | | `-10 -| | `-] -| `-; -|-SimpleDeclaration -| |-int -| |-SimpleDeclarator -| | |-b -| | |-ArraySubscript -| | | |-[ -| | | |-IntegerLiteralExpression -| | | | `-1 -| | | `-] -| | |-ArraySubscript -| | | |-[ -| | | |-IntegerLiteralExpression -| | | | `-2 -| | | `-] -| | `-ArraySubscript -| | |-[ -| | |-IntegerLiteralExpression -| | | `-3 -| | `-] -| `-; +TranslationUnit Detached `-SimpleDeclaration - |-int - |-SimpleDeclarator - | |-c + |-'int' + |-SimpleDeclarator Declarator + | |-'c' | |-ArraySubscript - | | |-[ - | | `-] - | |-= + | | |-'[' OpenParen + | | `-']' CloseParen + | |-'=' | `-UnknownExpression | `-UnknownExpression - | |-{ + | |-'{' | |-IntegerLiteralExpression - | | `-1 - | |-, + | | `-'1' LiteralToken + | |-',' | |-IntegerLiteralExpression - | | `-2 - | |-, + | | `-'2' LiteralToken + | |-',' | |-IntegerLiteralExpression - | | `-3 - | `-} - `-; + | | `-'3' LiteralToken + | `-'}' + `-';' )txt")); } -TEST_P(SyntaxTreeTest, StaticArraySubscriptsInDeclarators) { +TEST_P(SyntaxTreeTest, ArrayDeclarator_Static) { if (!GetParam().isC99OrLater()) { return; } @@ -3414,212 +3882,341 @@ TEST_P(SyntaxTreeTest, StaticArraySubscriptsInDeclarators) { void f(int 
xs[static 10]); )cpp", R"txt( -*: TranslationUnit +TranslationUnit Detached `-SimpleDeclaration - |-void - |-SimpleDeclarator - | |-f + |-'void' + |-SimpleDeclarator Declarator + | |-'f' | `-ParametersAndQualifiers - | |-( - | |-SimpleDeclaration - | | |-int - | | `-SimpleDeclarator - | | |-xs - | | `-ArraySubscript - | | |-[ - | | |-static - | | |-IntegerLiteralExpression - | | | `-10 - | | `-] - | `-) - `-; + | |-'(' OpenParen + | |-ParameterDeclarationList Parameters + | | `-SimpleDeclaration ListElement + | | |-'int' + | | `-SimpleDeclarator Declarator + | | |-'xs' + | | `-ArraySubscript + | | |-'[' OpenParen + | | |-'static' + | | |-IntegerLiteralExpression Size + | | | `-'10' LiteralToken + | | `-']' CloseParen + | `-')' CloseParen + `-';' )txt")); } -TEST_P(SyntaxTreeTest, ParametersAndQualifiersInFreeFunctions) { - if (!GetParam().isCXX()) { - return; - } +TEST_P(SyntaxTreeTest, ParametersAndQualifiers_InFreeFunctions_Empty) { EXPECT_TRUE(treeDumpEqual( R"cpp( -int func1(); -int func2a(int a); -int func2b(int); -int func3a(int *ap); -int func3b(int *); -int func4a(int a, float b); -int func4b(int, float); +int func(); )cpp", R"txt( -*: TranslationUnit -|-SimpleDeclaration -| |-int -| |-SimpleDeclarator -| | |-func1 -| | `-ParametersAndQualifiers -| | |-( -| | `-) -| `-; -|-SimpleDeclaration -| |-int -| |-SimpleDeclarator -| | |-func2a -| | `-ParametersAndQualifiers -| | |-( -| | |-SimpleDeclaration -| | | |-int -| | | `-SimpleDeclarator -| | | `-a -| | `-) -| `-; +TranslationUnit Detached +`-SimpleDeclaration + |-'int' + |-SimpleDeclarator Declarator + | |-'func' + | `-ParametersAndQualifiers + | |-'(' OpenParen + | `-')' CloseParen + `-';' +)txt")); +} + +TEST_P(SyntaxTreeTest, ParametersAndQualifiers_InFreeFunctions_Named) { + EXPECT_TRUE(treeDumpEqual( + R"cpp( +int func1(int a); +int func2(int *ap); +int func3(int a, float b); +)cpp", + R"txt( +TranslationUnit Detached |-SimpleDeclaration -| |-int -| |-SimpleDeclarator -| | |-func2b +| |-'int' +| 
|-SimpleDeclarator Declarator +| | |-'func1' | | `-ParametersAndQualifiers -| | |-( -| | |-SimpleDeclaration -| | | `-int -| | `-) -| `-; +| | |-'(' OpenParen +| | |-ParameterDeclarationList Parameters +| | | `-SimpleDeclaration ListElement +| | | |-'int' +| | | `-SimpleDeclarator Declarator +| | | `-'a' +| | `-')' CloseParen +| `-';' |-SimpleDeclaration -| |-int -| |-SimpleDeclarator -| | |-func3a +| |-'int' +| |-SimpleDeclarator Declarator +| | |-'func2' | | `-ParametersAndQualifiers -| | |-( -| | |-SimpleDeclaration -| | | |-int -| | | `-SimpleDeclarator -| | | |-* -| | | `-ap -| | `-) -| `-; +| | |-'(' OpenParen +| | |-ParameterDeclarationList Parameters +| | | `-SimpleDeclaration ListElement +| | | |-'int' +| | | `-SimpleDeclarator Declarator +| | | |-'*' +| | | `-'ap' +| | `-')' CloseParen +| `-';' +`-SimpleDeclaration + |-'int' + |-SimpleDeclarator Declarator + | |-'func3' + | `-ParametersAndQualifiers + | |-'(' OpenParen + | |-ParameterDeclarationList Parameters + | | |-SimpleDeclaration ListElement + | | | |-'int' + | | | `-SimpleDeclarator Declarator + | | | `-'a' + | | |-',' ListDelimiter + | | `-SimpleDeclaration ListElement + | | |-'float' + | | `-SimpleDeclarator Declarator + | | `-'b' + | `-')' CloseParen + `-';' +)txt")); +} + +TEST_P(SyntaxTreeTest, ParametersAndQualifiers_InFreeFunctions_Unnamed) { + EXPECT_TRUE(treeDumpEqual( + R"cpp( +int func1(int); +int func2(int *); +int func3(int, float); +)cpp", + R"txt( +TranslationUnit Detached |-SimpleDeclaration -| |-int -| |-SimpleDeclarator -| | |-func3b +| |-'int' +| |-SimpleDeclarator Declarator +| | |-'func1' | | `-ParametersAndQualifiers -| | |-( -| | |-SimpleDeclaration -| | | |-int -| | | `-SimpleDeclarator -| | | `-* -| | `-) -| `-; +| | |-'(' OpenParen +| | |-ParameterDeclarationList Parameters +| | | `-SimpleDeclaration ListElement +| | | `-'int' +| | `-')' CloseParen +| `-';' |-SimpleDeclaration -| |-int -| |-SimpleDeclarator -| | |-func4a +| |-'int' +| |-SimpleDeclarator Declarator +| | 
|-'func2' | | `-ParametersAndQualifiers -| | |-( -| | |-SimpleDeclaration -| | | |-int -| | | `-SimpleDeclarator -| | | `-a -| | |-, -| | |-SimpleDeclaration -| | | |-float -| | | `-SimpleDeclarator -| | | `-b -| | `-) -| `-; +| | |-'(' OpenParen +| | |-ParameterDeclarationList Parameters +| | | `-SimpleDeclaration ListElement +| | | |-'int' +| | | `-SimpleDeclarator Declarator +| | | `-'*' +| | `-')' CloseParen +| `-';' +`-SimpleDeclaration + |-'int' + |-SimpleDeclarator Declarator + | |-'func3' + | `-ParametersAndQualifiers + | |-'(' OpenParen + | |-ParameterDeclarationList Parameters + | | |-SimpleDeclaration ListElement + | | | `-'int' + | | |-',' ListDelimiter + | | `-SimpleDeclaration ListElement + | | `-'float' + | `-')' CloseParen + `-';' +)txt")); +} + +TEST_P(SyntaxTreeTest, + ParametersAndQualifiers_InVariadicFunctionTemplate_ParameterPack) { + if (!GetParam().isCXX11OrLater() || GetParam().hasDelayedTemplateParsing()) { + return; + } + EXPECT_TRUE(treeDumpEqualOnAnnotations( + R"cpp( +template +[[void test(T , Args... );]] +)cpp", + {R"txt( +SimpleDeclaration +|-'void' +|-SimpleDeclarator Declarator +| |-'test' +| `-ParametersAndQualifiers +| |-'(' OpenParen +| |-ParameterDeclarationList Parameters +| | |-SimpleDeclaration ListElement +| | | `-'T' +| | |-',' ListDelimiter +| | `-SimpleDeclaration ListElement +| | |-'Args' +| | `-'...' +| `-')' CloseParen +`-';' +)txt"})); +} + +TEST_P(SyntaxTreeTest, + ParametersAndQualifiers_InVariadicFunctionTemplate_NamedParameterPack) { + if (!GetParam().isCXX11OrLater() || GetParam().hasDelayedTemplateParsing()) { + return; + } + EXPECT_TRUE(treeDumpEqualOnAnnotations( + R"cpp( +template +[[void test(T t, Args... 
args);]] +)cpp", + {R"txt( +SimpleDeclaration +|-'void' +|-SimpleDeclarator Declarator +| |-'test' +| `-ParametersAndQualifiers +| |-'(' OpenParen +| |-ParameterDeclarationList Parameters +| | |-SimpleDeclaration ListElement +| | | |-'T' +| | | `-SimpleDeclarator Declarator +| | | `-'t' +| | |-',' ListDelimiter +| | `-SimpleDeclaration ListElement +| | |-'Args' +| | |-'...' +| | `-SimpleDeclarator Declarator +| | `-'args' +| `-')' CloseParen +`-';' +)txt"})); +} + +TEST_P(SyntaxTreeTest, + ParametersAndQualifiers_InFreeFunctions_VariadicArguments) { + if (!GetParam().isCXX11OrLater()) { + return; + } + EXPECT_TRUE(treeDumpEqual( + R"cpp( +void test(int , char ...); +)cpp", + R"txt( +TranslationUnit Detached `-SimpleDeclaration - |-int - |-SimpleDeclarator - | |-func4b + |-'void' + |-SimpleDeclarator Declarator + | |-'test' | `-ParametersAndQualifiers - | |-( - | |-SimpleDeclaration - | | `-int - | |-, - | |-SimpleDeclaration - | | `-float - | `-) - `-; + | |-'(' OpenParen + | |-ParameterDeclarationList Parameters + | | |-SimpleDeclaration ListElement + | | | `-'int' + | | |-',' ListDelimiter + | | `-SimpleDeclaration ListElement + | | `-'char' + | |-'...' 
+ | `-')' CloseParen + `-';' )txt")); } -TEST_P(SyntaxTreeTest, ParametersAndQualifiersInFreeFunctionsCxx) { +TEST_P(SyntaxTreeTest, + ParametersAndQualifiers_InFreeFunctions_Cxx_CvQualifiers) { if (!GetParam().isCXX()) { return; } EXPECT_TRUE(treeDumpEqual( R"cpp( -int func1(const int a, volatile int b, const volatile int c); -int func2(int& a); +int func(const int a, volatile int b, const volatile int c); )cpp", R"txt( -*: TranslationUnit -|-SimpleDeclaration -| |-int -| |-SimpleDeclarator -| | |-func1 -| | `-ParametersAndQualifiers -| | |-( -| | |-SimpleDeclaration -| | | |-const -| | | |-int -| | | `-SimpleDeclarator -| | | `-a -| | |-, -| | |-SimpleDeclaration -| | | |-volatile -| | | |-int -| | | `-SimpleDeclarator -| | | `-b -| | |-, -| | |-SimpleDeclaration -| | | |-const -| | | |-volatile -| | | |-int -| | | `-SimpleDeclarator -| | | `-c -| | `-) -| `-; +TranslationUnit Detached +`-SimpleDeclaration + |-'int' + |-SimpleDeclarator Declarator + | |-'func' + | `-ParametersAndQualifiers + | |-'(' OpenParen + | |-ParameterDeclarationList Parameters + | | |-SimpleDeclaration ListElement + | | | |-'const' + | | | |-'int' + | | | `-SimpleDeclarator Declarator + | | | `-'a' + | | |-',' ListDelimiter + | | |-SimpleDeclaration ListElement + | | | |-'volatile' + | | | |-'int' + | | | `-SimpleDeclarator Declarator + | | | `-'b' + | | |-',' ListDelimiter + | | `-SimpleDeclaration ListElement + | | |-'const' + | | |-'volatile' + | | |-'int' + | | `-SimpleDeclarator Declarator + | | `-'c' + | `-')' CloseParen + `-';' +)txt")); +} + +TEST_P(SyntaxTreeTest, ParametersAndQualifiers_InFreeFunctions_Cxx_Ref) { + if (!GetParam().isCXX()) { + return; + } + EXPECT_TRUE(treeDumpEqual( + R"cpp( +int func(int& a); +)cpp", + R"txt( +TranslationUnit Detached `-SimpleDeclaration - |-int - |-SimpleDeclarator - | |-func2 + |-'int' + |-SimpleDeclarator Declarator + | |-'func' | `-ParametersAndQualifiers - | |-( - | |-SimpleDeclaration - | | |-int - | | `-SimpleDeclarator - | | |-& - | | 
`-a - | `-) - `-; + | |-'(' OpenParen + | |-ParameterDeclarationList Parameters + | | `-SimpleDeclaration ListElement + | | |-'int' + | | `-SimpleDeclarator Declarator + | | |-'&' + | | `-'a' + | `-')' CloseParen + `-';' )txt")); } -TEST_P(SyntaxTreeTest, ParametersAndQualifiersInFreeFunctionsCxx11) { +TEST_P(SyntaxTreeTest, ParametersAndQualifiers_InFreeFunctions_Cxx11_RefRef) { if (!GetParam().isCXX11OrLater()) { return; } EXPECT_TRUE(treeDumpEqual( R"cpp( -int func1(int&& a); +int func(int&& a); )cpp", R"txt( -*: TranslationUnit +TranslationUnit Detached `-SimpleDeclaration - |-int - |-SimpleDeclarator - | |-func1 + |-'int' + |-SimpleDeclarator Declarator + | |-'func' | `-ParametersAndQualifiers - | |-( - | |-SimpleDeclaration - | | |-int - | | `-SimpleDeclarator - | | |-&& - | | `-a - | `-) - `-; + | |-'(' OpenParen + | |-ParameterDeclarationList Parameters + | | `-SimpleDeclaration ListElement + | | |-'int' + | | `-SimpleDeclarator Declarator + | | |-'&&' + | | `-'a' + | `-')' CloseParen + `-';' )txt")); } -TEST_P(SyntaxTreeTest, ParametersAndQualifiersInMemberFunctions) { +TEST_P(SyntaxTreeTest, ParametersAndQualifiers_InMemberFunctions_Simple) { if (!GetParam().isCXX()) { return; } @@ -3627,78 +4224,121 @@ TEST_P(SyntaxTreeTest, ParametersAndQualifiersInMemberFunctions) { R"cpp( struct Test { int a(); - int b() const; - int c() volatile; - int d() const volatile; - int e() &; - int f() &&; }; )cpp", R"txt( -*: TranslationUnit +TranslationUnit Detached `-SimpleDeclaration - |-struct - |-Test - |-{ - |-SimpleDeclaration - | |-int - | |-SimpleDeclarator - | | |-a - | | `-ParametersAndQualifiers - | | |-( - | | `-) - | `-; - |-SimpleDeclaration - | |-int - | |-SimpleDeclarator - | | |-b - | | `-ParametersAndQualifiers - | | |-( - | | |-) - | | `-const - | `-; - |-SimpleDeclaration - | |-int - | |-SimpleDeclarator - | | |-c - | | `-ParametersAndQualifiers - | | |-( - | | |-) - | | `-volatile - | `-; - |-SimpleDeclaration - | |-int - | |-SimpleDeclarator - | | |-d 
- | | `-ParametersAndQualifiers - | | |-( - | | |-) - | | |-const - | | `-volatile - | `-; - |-SimpleDeclaration - | |-int - | |-SimpleDeclarator - | | |-e - | | `-ParametersAndQualifiers - | | |-( - | | |-) - | | `-& - | `-; + |-'struct' + |-'Test' + |-'{' |-SimpleDeclaration - | |-int - | |-SimpleDeclarator - | | |-f + | |-'int' + | |-SimpleDeclarator Declarator + | | |-'a' | | `-ParametersAndQualifiers - | | |-( - | | |-) - | | `-&& - | `-; - |-} - `-; + | | |-'(' OpenParen + | | `-')' CloseParen + | `-';' + |-'}' + `-';' )txt")); } +TEST_P(SyntaxTreeTest, ParametersAndQualifiers_InMemberFunctions_CvQualifiers) { + if (!GetParam().isCXX()) { + return; + } + EXPECT_TRUE(treeDumpEqualOnAnnotations( + R"cpp( +struct Test { + [[int b() const;]] + [[int c() volatile;]] + [[int d() const volatile;]] +}; +)cpp", + {R"txt( +SimpleDeclaration +|-'int' +|-SimpleDeclarator Declarator +| |-'b' +| `-ParametersAndQualifiers +| |-'(' OpenParen +| |-')' CloseParen +| `-'const' +`-';' +)txt", + R"txt( +SimpleDeclaration +|-'int' +|-SimpleDeclarator Declarator +| |-'c' +| `-ParametersAndQualifiers +| |-'(' OpenParen +| |-')' CloseParen +| `-'volatile' +`-';' +)txt", + R"txt( +SimpleDeclaration +|-'int' +|-SimpleDeclarator Declarator +| |-'d' +| `-ParametersAndQualifiers +| |-'(' OpenParen +| |-')' CloseParen +| |-'const' +| `-'volatile' +`-';' +)txt"})); +} + +TEST_P(SyntaxTreeTest, ParametersAndQualifiers_InMemberFunctions_Ref) { + if (!GetParam().isCXX11OrLater()) { + return; + } + EXPECT_TRUE(treeDumpEqualOnAnnotations( + R"cpp( +struct Test { + [[int e() &;]] +}; +)cpp", + {R"txt( +SimpleDeclaration +|-'int' +|-SimpleDeclarator Declarator +| |-'e' +| `-ParametersAndQualifiers +| |-'(' OpenParen +| |-')' CloseParen +| `-'&' +`-';' +)txt"})); +} + +TEST_P(SyntaxTreeTest, ParametersAndQualifiers_InMemberFunctions_RefRef) { + if (!GetParam().isCXX11OrLater()) { + return; + } + EXPECT_TRUE(treeDumpEqualOnAnnotations( + R"cpp( +struct Test { + [[int f() &&;]] +}; +)cpp", + {R"txt( 
+SimpleDeclaration +|-'int' +|-SimpleDeclarator Declarator +| |-'f' +| `-ParametersAndQualifiers +| |-'(' OpenParen +| |-')' CloseParen +| `-'&&' +`-';' +)txt"})); +} + TEST_P(SyntaxTreeTest, TrailingReturn) { if (!GetParam().isCXX11OrLater()) { return; @@ -3708,18 +4348,18 @@ TEST_P(SyntaxTreeTest, TrailingReturn) { auto foo() -> int; )cpp", R"txt( -*: TranslationUnit +TranslationUnit Detached `-SimpleDeclaration - |-auto - |-SimpleDeclarator - | |-foo + |-'auto' + |-SimpleDeclarator Declarator + | |-'foo' | `-ParametersAndQualifiers - | |-( - | |-) - | `-TrailingReturnType - | |--> - | `-int - `-; + | |-'(' OpenParen + | |-')' CloseParen + | `-TrailingReturnType TrailingReturn + | |-'->' ArrowToken + | `-'int' + `-';' )txt")); } @@ -3727,79 +4367,72 @@ TEST_P(SyntaxTreeTest, DynamicExceptionSpecification) { if (!GetParam().supportsCXXDynamicExceptionSpecification()) { return; } - EXPECT_TRUE(treeDumpEqual( + EXPECT_TRUE(treeDumpEqualOnAnnotations( R"cpp( struct MyException1 {}; struct MyException2 {}; -int a() throw(); -int b() throw(...); -int c() throw(MyException1); -int d() throw(MyException1, MyException2); +[[int a() throw();]] +[[int b() throw(...);]] +[[int c() throw(MyException1);]] +[[int d() throw(MyException1, MyException2);]] )cpp", - R"txt( -*: TranslationUnit -|-SimpleDeclaration -| |-struct -| |-MyException1 -| |-{ -| |-} -| `-; -|-SimpleDeclaration -| |-struct -| |-MyException2 -| |-{ -| |-} -| `-; -|-SimpleDeclaration -| |-int -| |-SimpleDeclarator -| | |-a -| | `-ParametersAndQualifiers -| | |-( -| | |-) -| | |-throw -| | |-( -| | `-) -| `-; -|-SimpleDeclaration -| |-int -| |-SimpleDeclarator -| | |-b -| | `-ParametersAndQualifiers -| | |-( -| | |-) -| | |-throw -| | |-( -| | |-... 
-| | `-) -| `-; -|-SimpleDeclaration -| |-int -| |-SimpleDeclarator -| | |-c -| | `-ParametersAndQualifiers -| | |-( -| | |-) -| | |-throw -| | |-( -| | |-MyException1 -| | `-) -| `-; -`-SimpleDeclaration - |-int - |-SimpleDeclarator - | |-d - | `-ParametersAndQualifiers - | |-( - | |-) - | |-throw - | |-( - | |-MyException1 - | |-, - | |-MyException2 - | `-) - `-; -)txt")); + {R"txt( +SimpleDeclaration +|-'int' +|-SimpleDeclarator Declarator +| |-'a' +| `-ParametersAndQualifiers +| |-'(' OpenParen +| |-')' CloseParen +| |-'throw' +| |-'(' +| `-')' +`-';' +)txt", + R"txt( +SimpleDeclaration +|-'int' +|-SimpleDeclarator Declarator +| |-'b' +| `-ParametersAndQualifiers +| |-'(' OpenParen +| |-')' CloseParen +| |-'throw' +| |-'(' +| |-'...' +| `-')' +`-';' +)txt", + R"txt( +SimpleDeclaration +|-'int' +|-SimpleDeclarator Declarator +| |-'c' +| `-ParametersAndQualifiers +| |-'(' OpenParen +| |-')' CloseParen +| |-'throw' +| |-'(' +| |-'MyException1' +| `-')' +`-';' +)txt", + R"txt( +SimpleDeclaration +|-'int' +|-SimpleDeclarator Declarator +| |-'d' +| `-ParametersAndQualifiers +| |-'(' OpenParen +| |-')' CloseParen +| |-'throw' +| |-'(' +| |-'MyException1' +| |-',' +| |-'MyException2' +| `-')' +`-';' +)txt"})); } TEST_P(SyntaxTreeTest, NoexceptExceptionSpecification) { @@ -3812,29 +4445,29 @@ int a() noexcept; int b() noexcept(true); )cpp", R"txt( -*: TranslationUnit +TranslationUnit Detached |-SimpleDeclaration -| |-int -| |-SimpleDeclarator -| | |-a +| |-'int' +| |-SimpleDeclarator Declarator +| | |-'a' | | `-ParametersAndQualifiers -| | |-( -| | |-) -| | `-noexcept -| `-; +| | |-'(' OpenParen +| | |-')' CloseParen +| | `-'noexcept' +| `-';' `-SimpleDeclaration - |-int - |-SimpleDeclarator - | |-b + |-'int' + |-SimpleDeclarator Declarator + | |-'b' | `-ParametersAndQualifiers - | |-( - | |-) - | |-noexcept - | |-( + | |-'(' OpenParen + | |-')' CloseParen + | |-'noexcept' + | |-'(' | |-BoolLiteralExpression - | | `-true - | `-) - `-; + | | `-'true' LiteralToken + | 
`-')' + `-';' )txt")); } @@ -3847,106 +4480,126 @@ int (*c)(int); int *(d)(int); )cpp", R"txt( -*: TranslationUnit +TranslationUnit Detached |-SimpleDeclaration -| |-int -| |-SimpleDeclarator +| |-'int' +| |-SimpleDeclarator Declarator | | `-ParenDeclarator -| | |-( -| | |-a -| | `-) -| `-; +| | |-'(' OpenParen +| | |-'a' +| | `-')' CloseParen +| `-';' |-SimpleDeclaration -| |-int -| |-SimpleDeclarator -| | |-* +| |-'int' +| |-SimpleDeclarator Declarator +| | |-'*' | | `-ParenDeclarator -| | |-( -| | |-b -| | `-) -| `-; +| | |-'(' OpenParen +| | |-'b' +| | `-')' CloseParen +| `-';' |-SimpleDeclaration -| |-int -| |-SimpleDeclarator +| |-'int' +| |-SimpleDeclarator Declarator | | |-ParenDeclarator -| | | |-( -| | | |-* -| | | |-c -| | | `-) +| | | |-'(' OpenParen +| | | |-'*' +| | | |-'c' +| | | `-')' CloseParen | | `-ParametersAndQualifiers -| | |-( -| | |-SimpleDeclaration -| | | `-int -| | `-) -| `-; +| | |-'(' OpenParen +| | |-ParameterDeclarationList Parameters +| | | `-SimpleDeclaration ListElement +| | | `-'int' +| | `-')' CloseParen +| `-';' `-SimpleDeclaration - |-int - |-SimpleDeclarator - | |-* + |-'int' + |-SimpleDeclarator Declarator + | |-'*' | |-ParenDeclarator - | | |-( - | | |-d - | | `-) + | | |-'(' OpenParen + | | |-'d' + | | `-')' CloseParen | `-ParametersAndQualifiers - | |-( - | |-SimpleDeclaration - | | `-int - | `-) - `-; + | |-'(' OpenParen + | |-ParameterDeclarationList Parameters + | | `-SimpleDeclaration ListElement + | | `-'int' + | `-')' CloseParen + `-';' )txt")); } -TEST_P(SyntaxTreeTest, ConstVolatileQualifiers) { +TEST_P(SyntaxTreeTest, Declaration_ConstVolatileQualifiers_SimpleConst) { EXPECT_TRUE(treeDumpEqual( R"cpp( const int west = -1; int const east = 1; -const int const universal = 0; -const int const *const *volatile b; )cpp", R"txt( -*: TranslationUnit +TranslationUnit Detached |-SimpleDeclaration -| |-const -| |-int -| |-SimpleDeclarator -| | |-west -| | |-= +| |-'const' +| |-'int' +| |-SimpleDeclarator Declarator +| | 
|-'west' +| | |-'=' | | `-PrefixUnaryOperatorExpression -| | |-- -| | `-IntegerLiteralExpression -| | `-1 -| `-; -|-SimpleDeclaration -| |-int -| |-const -| |-SimpleDeclarator -| | |-east -| | |-= -| | `-IntegerLiteralExpression -| | `-1 -| `-; -|-SimpleDeclaration -| |-const -| |-int -| |-const -| |-SimpleDeclarator -| | |-universal -| | |-= -| | `-IntegerLiteralExpression -| | `-0 -| `-; +| | |-'-' OperatorToken +| | `-IntegerLiteralExpression Operand +| | `-'1' LiteralToken +| `-';' +`-SimpleDeclaration + |-'int' + |-'const' + |-SimpleDeclarator Declarator + | |-'east' + | |-'=' + | `-IntegerLiteralExpression + | `-'1' LiteralToken + `-';' +)txt")); +} + +TEST_P(SyntaxTreeTest, Declaration_ConstVolatileQualifiers_MultipleConst) { + EXPECT_TRUE(treeDumpEqual( + R"cpp( +const int const universal = 0; +)cpp", + R"txt( +TranslationUnit Detached +`-SimpleDeclaration + |-'const' + |-'int' + |-'const' + |-SimpleDeclarator Declarator + | |-'universal' + | |-'=' + | `-IntegerLiteralExpression + | `-'0' LiteralToken + `-';' +)txt")); +} + +TEST_P(SyntaxTreeTest, Declaration_ConstVolatileQualifiers_ConstAndVolatile) { + EXPECT_TRUE(treeDumpEqual( + R"cpp( +const int const *const *volatile b; +)cpp", + R"txt( +TranslationUnit Detached `-SimpleDeclaration - |-const - |-int - |-const - |-SimpleDeclarator - | |-* - | |-const - | |-* - | |-volatile - | `-b - `-; + |-'const' + |-'int' + |-'const' + |-SimpleDeclarator Declarator + | |-'*' + | |-'const' + | |-'*' + | |-'volatile' + | `-'b' + `-';' )txt")); } @@ -3959,33 +4612,34 @@ TEST_P(SyntaxTreeTest, RangesOfDeclaratorsWithTrailingReturnTypes) { auto foo() -> auto(*)(int) -> double*; )cpp", R"txt( -*: TranslationUnit +TranslationUnit Detached `-SimpleDeclaration - |-auto - |-SimpleDeclarator - | |-foo + |-'auto' + |-SimpleDeclarator Declarator + | |-'foo' | `-ParametersAndQualifiers - | |-( - | |-) - | `-TrailingReturnType - | |--> - | |-auto - | `-SimpleDeclarator + | |-'(' OpenParen + | |-')' CloseParen + | 
`-TrailingReturnType TrailingReturn + | |-'->' ArrowToken + | |-'auto' + | `-SimpleDeclarator Declarator | |-ParenDeclarator - | | |-( - | | |-* - | | `-) + | | |-'(' OpenParen + | | |-'*' + | | `-')' CloseParen | `-ParametersAndQualifiers - | |-( - | |-SimpleDeclaration - | | `-int - | |-) - | `-TrailingReturnType - | |--> - | |-double - | `-SimpleDeclarator - | `-* - `-; + | |-'(' OpenParen + | |-ParameterDeclarationList Parameters + | | `-SimpleDeclaration ListElement + | | `-'int' + | |-')' CloseParen + | `-TrailingReturnType TrailingReturn + | |-'->' ArrowToken + | |-'double' + | `-SimpleDeclarator Declarator + | `-'*' + `-';' )txt")); } @@ -3993,133 +4647,121 @@ TEST_P(SyntaxTreeTest, MemberPointers) { if (!GetParam().isCXX()) { return; } - EXPECT_TRUE(treeDumpEqual( + EXPECT_TRUE(treeDumpEqualOnAnnotations( R"cpp( struct X {}; -int X::* a; -const int X::* b; +[[int X::* a;]] +[[const int X::* b;]] )cpp", - R"txt( -*: TranslationUnit -|-SimpleDeclaration -| |-struct -| |-X -| |-{ -| |-} -| `-; -|-SimpleDeclaration -| |-int -| |-SimpleDeclarator -| | |-MemberPointer -| | | |-X -| | | |-:: -| | | `-* -| | `-a -| `-; -`-SimpleDeclaration - |-const - |-int - |-SimpleDeclarator - | |-MemberPointer - | | |-X - | | |-:: - | | `-* - | `-b - `-; -)txt")); + {R"txt( +SimpleDeclaration +|-'int' +|-SimpleDeclarator Declarator +| |-MemberPointer +| | |-'X' +| | |-'::' +| | `-'*' +| `-'a' +`-';' +)txt", + R"txt( +SimpleDeclaration +|-'const' +|-'int' +|-SimpleDeclarator Declarator +| |-MemberPointer +| | |-'X' +| | |-'::' +| | `-'*' +| `-'b' +`-';' +)txt"})); } TEST_P(SyntaxTreeTest, MemberFunctionPointer) { if (!GetParam().isCXX()) { return; } - EXPECT_TRUE(treeDumpEqual( + EXPECT_TRUE(treeDumpEqualOnAnnotations( R"cpp( struct X { struct Y {}; }; -void (X::*xp)(); -void (X::**xpp)(const int*); +[[void (X::*xp)();]] +[[void (X::**xpp)(const int*);]] // FIXME: Generate the right syntax tree for this type, // i.e. 
create a syntax node for the outer member pointer -void (X::Y::*xyp)(const int*, char); +[[void (X::Y::*xyp)(const int*, char);]] )cpp", - R"txt( -*: TranslationUnit -|-SimpleDeclaration -| |-struct -| |-X -| |-{ -| |-SimpleDeclaration -| | |-struct -| | |-Y -| | |-{ -| | |-} -| | `-; -| |-} -| `-; -|-SimpleDeclaration -| |-void -| |-SimpleDeclarator -| | |-ParenDeclarator -| | | |-( -| | | |-MemberPointer -| | | | |-X -| | | | |-:: -| | | | `-* -| | | |-xp -| | | `-) -| | `-ParametersAndQualifiers -| | |-( -| | `-) -| `-; -|-SimpleDeclaration -| |-void -| |-SimpleDeclarator -| | |-ParenDeclarator -| | | |-( -| | | |-MemberPointer -| | | | |-X -| | | | |-:: -| | | | `-* -| | | |-* -| | | |-xpp -| | | `-) -| | `-ParametersAndQualifiers -| | |-( -| | |-SimpleDeclaration -| | | |-const -| | | |-int -| | | `-SimpleDeclarator -| | | `-* -| | `-) -| `-; -`-SimpleDeclaration - |-void - |-SimpleDeclarator - | |-ParenDeclarator - | | |-( - | | |-X - | | |-:: - | | |-MemberPointer - | | | |-Y - | | | |-:: - | | | `-* - | | |-xyp - | | `-) - | `-ParametersAndQualifiers - | |-( - | |-SimpleDeclaration - | | |-const - | | |-int - | | `-SimpleDeclarator - | | `-* - | |-, - | |-SimpleDeclaration - | | `-char - | `-) - `-; -)txt")); + {R"txt( +SimpleDeclaration +|-'void' +|-SimpleDeclarator Declarator +| |-ParenDeclarator +| | |-'(' OpenParen +| | |-MemberPointer +| | | |-'X' +| | | |-'::' +| | | `-'*' +| | |-'xp' +| | `-')' CloseParen +| `-ParametersAndQualifiers +| |-'(' OpenParen +| `-')' CloseParen +`-';' +)txt", + R"txt( +SimpleDeclaration +|-'void' +|-SimpleDeclarator Declarator +| |-ParenDeclarator +| | |-'(' OpenParen +| | |-MemberPointer +| | | |-'X' +| | | |-'::' +| | | `-'*' +| | |-'*' +| | |-'xpp' +| | `-')' CloseParen +| `-ParametersAndQualifiers +| |-'(' OpenParen +| |-ParameterDeclarationList Parameters +| | `-SimpleDeclaration ListElement +| | |-'const' +| | |-'int' +| | `-SimpleDeclarator Declarator +| | `-'*' +| `-')' CloseParen +`-';' +)txt", + R"txt( 
+SimpleDeclaration +|-'void' +|-SimpleDeclarator Declarator +| |-ParenDeclarator +| | |-'(' OpenParen +| | |-'X' +| | |-'::' +| | |-MemberPointer +| | | |-'Y' +| | | |-'::' +| | | `-'*' +| | |-'xyp' +| | `-')' CloseParen +| `-ParametersAndQualifiers +| |-'(' OpenParen +| |-ParameterDeclarationList Parameters +| | |-SimpleDeclaration ListElement +| | | |-'const' +| | | |-'int' +| | | `-SimpleDeclarator Declarator +| | | `-'*' +| | |-',' ListDelimiter +| | `-SimpleDeclaration ListElement +| | `-'char' +| `-')' CloseParen +`-';' +)txt"})); } TEST_P(SyntaxTreeTest, ComplexDeclarator) { @@ -4128,33 +4770,35 @@ TEST_P(SyntaxTreeTest, ComplexDeclarator) { void x(char a, short (*b)(int)); )cpp", R"txt( -*: TranslationUnit +TranslationUnit Detached `-SimpleDeclaration - |-void - |-SimpleDeclarator - | |-x + |-'void' + |-SimpleDeclarator Declarator + | |-'x' | `-ParametersAndQualifiers - | |-( - | |-SimpleDeclaration - | | |-char - | | `-SimpleDeclarator - | | `-a - | |-, - | |-SimpleDeclaration - | | |-short - | | `-SimpleDeclarator - | | |-ParenDeclarator - | | | |-( - | | | |-* - | | | |-b - | | | `-) - | | `-ParametersAndQualifiers - | | |-( - | | |-SimpleDeclaration - | | | `-int - | | `-) - | `-) - `-; + | |-'(' OpenParen + | |-ParameterDeclarationList Parameters + | | |-SimpleDeclaration ListElement + | | | |-'char' + | | | `-SimpleDeclarator Declarator + | | | `-'a' + | | |-',' ListDelimiter + | | `-SimpleDeclaration ListElement + | | |-'short' + | | `-SimpleDeclarator Declarator + | | |-ParenDeclarator + | | | |-'(' OpenParen + | | | |-'*' + | | | |-'b' + | | | `-')' CloseParen + | | `-ParametersAndQualifiers + | | |-'(' OpenParen + | | |-ParameterDeclarationList Parameters + | | | `-SimpleDeclaration ListElement + | | | `-'int' + | | `-')' CloseParen + | `-')' CloseParen + `-';' )txt")); } @@ -4164,49 +4808,52 @@ TEST_P(SyntaxTreeTest, ComplexDeclarator2) { void x(char a, short (*b)(int), long (**c)(long long)); )cpp", R"txt( -*: TranslationUnit +TranslationUnit 
Detached `-SimpleDeclaration - |-void - |-SimpleDeclarator - | |-x + |-'void' + |-SimpleDeclarator Declarator + | |-'x' | `-ParametersAndQualifiers - | |-( - | |-SimpleDeclaration - | | |-char - | | `-SimpleDeclarator - | | `-a - | |-, - | |-SimpleDeclaration - | | |-short - | | `-SimpleDeclarator - | | |-ParenDeclarator - | | | |-( - | | | |-* - | | | |-b - | | | `-) - | | `-ParametersAndQualifiers - | | |-( - | | |-SimpleDeclaration - | | | `-int - | | `-) - | |-, - | |-SimpleDeclaration - | | |-long - | | `-SimpleDeclarator - | | |-ParenDeclarator - | | | |-( - | | | |-* - | | | |-* - | | | |-c - | | | `-) - | | `-ParametersAndQualifiers - | | |-( - | | |-SimpleDeclaration - | | | |-long - | | | `-long - | | `-) - | `-) - `-; + | |-'(' OpenParen + | |-ParameterDeclarationList Parameters + | | |-SimpleDeclaration ListElement + | | | |-'char' + | | | `-SimpleDeclarator Declarator + | | | `-'a' + | | |-',' ListDelimiter + | | |-SimpleDeclaration ListElement + | | | |-'short' + | | | `-SimpleDeclarator Declarator + | | | |-ParenDeclarator + | | | | |-'(' OpenParen + | | | | |-'*' + | | | | |-'b' + | | | | `-')' CloseParen + | | | `-ParametersAndQualifiers + | | | |-'(' OpenParen + | | | |-ParameterDeclarationList Parameters + | | | | `-SimpleDeclaration ListElement + | | | | `-'int' + | | | `-')' CloseParen + | | |-',' ListDelimiter + | | `-SimpleDeclaration ListElement + | | |-'long' + | | `-SimpleDeclarator Declarator + | | |-ParenDeclarator + | | | |-'(' OpenParen + | | | |-'*' + | | | |-'*' + | | | |-'c' + | | | `-')' CloseParen + | | `-ParametersAndQualifiers + | | |-'(' OpenParen + | | |-ParameterDeclarationList Parameters + | | | `-SimpleDeclaration ListElement + | | | |-'long' + | | | `-'long' + | | `-')' CloseParen + | `-')' CloseParen + `-';' )txt")); } diff --git a/clang/unittests/Tooling/Syntax/TreeTestBase.cpp b/clang/unittests/Tooling/Syntax/TreeTestBase.cpp index c5dbb770c5387..ebee0115cb727 100644 --- a/clang/unittests/Tooling/Syntax/TreeTestBase.cpp 
+++ b/clang/unittests/Tooling/Syntax/TreeTestBase.cpp @@ -171,7 +171,7 @@ ::testing::AssertionResult SyntaxTreeTest::treeDumpEqual(StringRef Code, << "Source file has syntax errors, they were printed to the test " "log"; } - auto Actual = StringRef(Root->dump(*Arena)).trim().str(); + auto Actual = StringRef(Root->dump(Arena->sourceManager())).trim().str(); // EXPECT_EQ shows the diff between the two strings if they are different. EXPECT_EQ(Tree.trim().str(), Actual); if (Actual != Tree.trim().str()) { @@ -205,7 +205,7 @@ SyntaxTreeTest::treeDumpEqualOnAnnotations(StringRef CodeWithAnnotations, auto *AnnotatedNode = nodeByRange(AnnotatedRanges[i], Root); assert(AnnotatedNode); auto AnnotatedNodeDump = - StringRef(AnnotatedNode->dump(*Arena)).trim().str(); + StringRef(AnnotatedNode->dump(Arena->sourceManager())).trim().str(); // EXPECT_EQ shows the diff between the two strings if they are different. EXPECT_EQ(TreeDumps[i].trim().str(), AnnotatedNodeDump) << "Dumps diverged for the code:\n" diff --git a/clang/utils/analyzer/ProjectMap.py b/clang/utils/analyzer/ProjectMap.py index 3daa701405626..1e89ce634e573 100644 --- a/clang/utils/analyzer/ProjectMap.py +++ b/clang/utils/analyzer/ProjectMap.py @@ -1,7 +1,7 @@ import json import os -from enum import Enum +from enum import auto, Enum from typing import Any, Dict, List, NamedTuple, Optional, Tuple @@ -17,6 +17,64 @@ class DownloadType(str, Enum): SCRIPT = "script" +class Size(int, Enum): + """ + Size of the project. + + Sizes do not directly correspond to the number of lines or files in the + project. The key factor that is important for the developers of the + analyzer is the time it takes to analyze the project. Here is how + the following sizes map to times: + + TINY: <1min + SMALL: 1min-10min + BIG: 10min-1h + HUGE: >1h + + The borders are a bit of a blur, especially because analysis time varies + from one machine to another. 
However, the relative times will stay pretty + similar, and these groupings will still be helpful. + + UNSPECIFIED is a very special case, which is intentionally last in the list + of possible sizes. If the user wants to filter projects by one of the + possible sizes, we want projects with UNSPECIFIED size to be filtered out + for any given size. + """ + TINY = auto() + SMALL = auto() + BIG = auto() + HUGE = auto() + UNSPECIFIED = auto() + + @staticmethod + def from_str(raw_size: Optional[str]) -> "Size": + """ + Construct a Size object from an optional string. + + :param raw_size: optional string representation of the desired Size + object. None will produce UNSPECIFIED size. + + This method is case-insensitive, so raw sizes 'tiny', 'TINY', and + 'TiNy' will produce the same result. + """ + if raw_size is None: + return Size.UNSPECIFIED + + raw_size_upper = raw_size.upper() + # The implementation is decoupled from the actual values of the enum, + # so we can easily add or modify it without bothering about this + # function. + for possible_size in Size: + if possible_size.name == raw_size_upper: + return possible_size + + possible_sizes = [size.name.lower() for size in Size + # no need in showing our users this size + if size != Size.UNSPECIFIED] + raise ValueError(f"Incorrect project size '{raw_size}'. " + f"Available sizes are {possible_sizes}") + + class ProjectInfo(NamedTuple): """ Information about a project to analyze. 
@@ -27,6 +85,7 @@ class ProjectInfo(NamedTuple): origin: str = "" commit: str = "" enabled: bool = True + size: Size = Size.UNSPECIFIED def with_fields(self, **kwargs) -> "ProjectInfo": """ @@ -98,6 +157,7 @@ def _parse_project(raw_project: JSON) -> ProjectInfo: build_mode: int = raw_project["mode"] enabled: bool = raw_project.get("enabled", True) source: DownloadType = raw_project.get("source", "zip") + size = Size.from_str(raw_project.get("size", None)) if source == DownloadType.GIT: origin, commit = ProjectMap._get_git_params(raw_project) @@ -105,7 +165,7 @@ def _parse_project(raw_project: JSON) -> ProjectInfo: origin, commit = "", "" return ProjectInfo(name, build_mode, source, origin, commit, - enabled) + enabled, size) except KeyError as e: raise ValueError( diff --git a/clang/utils/analyzer/SATest.py b/clang/utils/analyzer/SATest.py index 86571902502f9..176fe40a2b171 100755 --- a/clang/utils/analyzer/SATest.py +++ b/clang/utils/analyzer/SATest.py @@ -37,7 +37,7 @@ def build(parser, args): SATestBuild.VERBOSE = args.verbose - projects = get_projects(parser, args.projects) + projects = get_projects(parser, args) tester = SATestBuild.RegressionTester(args.jobs, projects, args.override_compiler, @@ -84,7 +84,7 @@ def update(parser, args): def benchmark(parser, args): from SATestBenchmark import Benchmark - projects = get_projects(parser, args.projects) + projects = get_projects(parser, args) benchmark = Benchmark(projects, args.iterations, args.output) benchmark.run() @@ -94,14 +94,19 @@ def benchmark_compare(parser, args): SATestBenchmark.compare(args.old, args.new, args.output) -def get_projects(parser, projects_str): - from ProjectMap import ProjectMap +def get_projects(parser, args): + from ProjectMap import ProjectMap, Size project_map = ProjectMap() projects = project_map.projects - if projects_str: - projects_arg = projects_str.split(",") + def filter_projects(projects, predicate, force=False): + return [project.with_fields(enabled=(force or 
project.enabled) and + predicate(project)) + for project in projects] + + if args.projects: + projects_arg = args.projects.split(",") available_projects = [project.name for project in projects] @@ -113,8 +118,17 @@ def get_projects(parser, projects_str): "{all}.".format(project=manual_project, all=available_projects)) - projects = [project.with_fields(enabled=project.name in projects_arg) - for project in projects] + projects = filter_projects(projects, lambda project: + project.name in projects_arg, + force=True) + + try: + max_size = Size.from_str(args.max_size) + except ValueError as e: + parser.error("{}".format(e)) + + projects = filter_projects(projects, lambda project: + project.size <= max_size) return projects @@ -238,6 +252,8 @@ def main(): help="Arguments passed to to -analyzer-config") build_parser.add_argument("--projects", action="store", default="", help="Comma-separated list of projects to test") + build_parser.add_argument("--max-size", action="store", default=None, + help="Maximum size for the projects to test") build_parser.add_argument("-v", "--verbose", action="count", default=0) build_parser.set_defaults(func=build) @@ -318,6 +334,8 @@ def main(): help="Output csv file for the benchmark results") bench_parser.add_argument("--projects", action="store", default="", help="Comma-separated list of projects to test") + bench_parser.add_argument("--max-size", action="store", default=None, + help="Maximum size for the projects to test") bench_parser.set_defaults(func=benchmark) bench_subparsers = bench_parser.add_subparsers() diff --git a/clang/utils/analyzer/projects/projects.json b/clang/utils/analyzer/projects/projects.json index 84b741035f46c..80b61ecd38741 100644 --- a/clang/utils/analyzer/projects/projects.json +++ b/clang/utils/analyzer/projects/projects.json @@ -4,139 +4,159 @@ "mode": 1, "source": "git", "origin": "https://github.com/jarro2783/cxxopts.git", - "commit": "794c975" + "commit": "794c975", + "size": "tiny" }, { "name": "box2d", 
"mode": 1, "source": "git", "origin": "https://github.com/erincatto/box2d.git", - "commit": "1025f9a" + "commit": "1025f9a", + "size": "small" }, { "name": "tinyexpr", "mode": 1, "source": "git", "origin": "https://github.com/codeplea/tinyexpr.git", - "commit": "ffb0d41" + "commit": "ffb0d41", + "size": "tiny" }, { "name": "symengine", "mode": 1, "source": "git", "origin": "https://github.com/symengine/symengine.git", - "commit": "4f669d59" + "commit": "4f669d59", + "size": "small" }, { "name": "termbox", "mode": 1, "source": "git", "origin": "https://github.com/nsf/termbox.git", - "commit": "0df1355" + "commit": "0df1355", + "size": "tiny" }, { "name": "tinyvm", "mode": 1, "source": "git", "origin": "https://github.com/jakogut/tinyvm.git", - "commit": "10c25d8" + "commit": "10c25d8", + "size": "tiny" }, { "name": "tinyspline", "mode": 1, "source": "git", "origin": "https://github.com/msteinbeck/tinyspline.git", - "commit": "f8b1ab7" + "commit": "f8b1ab7", + "size": "tiny" }, { "name": "oatpp", "mode": 1, "source": "git", "origin": "https://github.com/oatpp/oatpp.git", - "commit": "d3e60fb" + "commit": "d3e60fb", + "size": "small" }, { "name": "libsoundio", "mode": 1, "source": "git", "origin": "https://github.com/andrewrk/libsoundio.git", - "commit": "b810bf2" + "commit": "b810bf2", + "size": "tiny" }, { "name": "zstd", "mode": 1, "source": "git", "origin": "https://github.com/facebook/zstd.git", - "commit": "2af4e073" + "commit": "2af4e073", + "size": "small" }, { "name": "simbody", "mode": 1, "source": "git", "origin": "https://github.com/simbody/simbody.git", - "commit": "5cf513d" + "commit": "5cf513d", + "size": "big" }, { "name": "duckdb", "mode": 1, "source": "git", "origin": "https://github.com/cwida/duckdb.git", - "commit": "d098c9f" + "commit": "d098c9f", + "size": "big" }, { "name": "drogon", "mode": 1, "source": "git", "origin": "https://github.com/an-tao/drogon.git", - "commit": "fd2a612" + "commit": "fd2a612", + "size": "small" }, { "name": "fmt", 
"mode": 1, "source": "git", "origin": "https://github.com/fmtlib/fmt.git", - "commit": "5e7c70e" + "commit": "5e7c70e", + "size": "small" }, { "name": "re2", "mode": 1, "source": "git", "origin": "https://github.com/google/re2.git", - "commit": "2b25567" + "commit": "2b25567", + "size": "small" }, { "name": "cppcheck", "mode": 1, "source": "git", "origin": "https://github.com/danmar/cppcheck.git", - "commit": "5fa3d53" + "commit": "5fa3d53", + "size": "small" }, { "name": "harfbuzz", "mode": 1, "source": "git", "origin": "https://github.com/harfbuzz/harfbuzz.git", - "commit": "f8d345e" + "commit": "f8d345e", + "size": "small" }, { "name": "capnproto", "mode": 1, "source": "git", "origin": "https://github.com/capnproto/capnproto.git", - "commit": "8be1c9f" + "commit": "8be1c9f", + "size": "small" }, { "name": "tmux", "mode": 1, "source": "git", "origin": "https://github.com/tmux/tmux.git", - "commit": "a5f99e1" + "commit": "a5f99e1", + "size": "big" }, { "name": "faiss", "mode": 1, "source": "git", "origin": "https://github.com/facebookresearch/faiss.git", - "commit": "9e5d5b7" + "commit": "9e5d5b7", + "size": "small" } ] diff --git a/compiler-rt/CMakeLists.txt b/compiler-rt/CMakeLists.txt index cfbd07a40e15f..c7e86946bcf35 100644 --- a/compiler-rt/CMakeLists.txt +++ b/compiler-rt/CMakeLists.txt @@ -112,7 +112,7 @@ if (COMPILER_RT_STANDALONE_BUILD) endif() # Ensure that fat libraries are built correctly on Darwin - if(CMAKE_HOST_APPLE AND APPLE) + if(APPLE) include(UseLibtool) endif() diff --git a/compiler-rt/cmake/Modules/AddCompilerRT.cmake b/compiler-rt/cmake/Modules/AddCompilerRT.cmake index efb6608182705..f2f0b5ecde590 100644 --- a/compiler-rt/cmake/Modules/AddCompilerRT.cmake +++ b/compiler-rt/cmake/Modules/AddCompilerRT.cmake @@ -109,11 +109,11 @@ endfunction() function(add_asm_sources output) set(${output} ${ARGN} PARENT_SCOPE) - # Xcode will try to compile asm files as C ('clang -x c'), and that will fail. 
- if (${CMAKE_GENERATOR} STREQUAL "Xcode") - enable_language(ASM) - else() - # Pass ASM file directly to the C++ compiler. + # CMake doesn't pass the correct architecture for Apple prior to CMake 3.19. https://gitlab.kitware.com/cmake/cmake/-/issues/20771 + # MinGW didn't work correctly with assembly prior to CMake 3.17. https://gitlab.kitware.com/cmake/cmake/-/merge_requests/4287 and https://reviews.llvm.org/rGb780df052dd2b246a760d00e00f7de9ebdab9d09 + # Workaround these two issues by compiling as C. + # Same workaround used in libunwind. Also update there if changed here. + if((APPLE AND CMAKE_VERSION VERSION_LESS 3.19) OR (MINGW AND CMAKE_VERSION VERSION_LESS 3.17)) set_source_files_properties(${ARGN} PROPERTIES LANGUAGE C) endif() endfunction() diff --git a/compiler-rt/cmake/Modules/CompilerRTDarwinUtils.cmake b/compiler-rt/cmake/Modules/CompilerRTDarwinUtils.cmake index a31bcc3963e17..f6689c2e79ad5 100644 --- a/compiler-rt/cmake/Modules/CompilerRTDarwinUtils.cmake +++ b/compiler-rt/cmake/Modules/CompilerRTDarwinUtils.cmake @@ -399,7 +399,8 @@ macro(darwin_add_builtin_libraries) ../profile/InstrProfilingBuffer ../profile/InstrProfilingPlatformDarwin ../profile/InstrProfilingWriter - ../profile/InstrProfilingInternal) + ../profile/InstrProfilingInternal + ../profile/InstrProfilingVersionVar) foreach (os ${ARGN}) list_intersect(DARWIN_BUILTIN_ARCHS DARWIN_${os}_BUILTIN_ARCHS BUILTIN_SUPPORTED_ARCH) foreach (arch ${DARWIN_BUILTIN_ARCHS}) diff --git a/compiler-rt/cmake/Modules/UseLibtool.cmake b/compiler-rt/cmake/Modules/UseLibtool.cmake index 38d197d4846fd..130810c970d89 100644 --- a/compiler-rt/cmake/Modules/UseLibtool.cmake +++ b/compiler-rt/cmake/Modules/UseLibtool.cmake @@ -34,6 +34,14 @@ if(CMAKE_LIBTOOL) set(CMAKE_${lang}_CREATE_STATIC_LIBRARY "\"${CMAKE_LIBTOOL}\" -static ${LIBTOOL_NO_WARNING_FLAG} -o ") endforeach() + + # By default, CMake invokes ranlib on a static library after installing it. 
+ # libtool will have produced the table of contents for us already, and ranlib + # does not understanding universal binaries, so skip this step. It's important + # to set it to empty instead of unsetting it to shadow the cache variable, and + # we don't want to unset the cache variable to not affect anything outside + # this scope. + set(CMAKE_RANLIB "") endif() # If DYLD_LIBRARY_PATH is set we need to set it on archiver commands diff --git a/compiler-rt/cmake/config-ix.cmake b/compiler-rt/cmake/config-ix.cmake index 1428a514b55a4..5f9e868de5fd8 100644 --- a/compiler-rt/cmake/config-ix.cmake +++ b/compiler-rt/cmake/config-ix.cmake @@ -133,18 +133,17 @@ check_library_exists(pthread pthread_create "" COMPILER_RT_HAS_LIBPTHREAD) check_library_exists(execinfo backtrace "" COMPILER_RT_HAS_LIBEXECINFO) # Look for terminfo library, used in unittests that depend on LLVMSupport. -if(LLVM_ENABLE_TERMINFO STREQUAL FORCE_ON) - set(MAYBE_REQUIRED REQUIRED) -else() - set(MAYBE_REQUIRED) -endif() if(LLVM_ENABLE_TERMINFO) - find_library(COMPILER_RT_TERMINFO_LIB NAMES terminfo tinfo curses ncurses ncursesw ${MAYBE_REQUIRED}) -endif() -if(COMPILER_RT_TERMINFO_LIB) - set(LLVM_ENABLE_TERMINFO 1) -else() - set(LLVM_ENABLE_TERMINFO 0) + foreach(library terminfo tinfo curses ncurses ncursesw) + string(TOUPPER ${library} library_suffix) + check_library_exists( + ${library} setupterm "" COMPILER_RT_HAS_TERMINFO_${library_suffix}) + if(COMPILER_RT_HAS_TERMINFO_${library_suffix}) + set(COMPILER_RT_HAS_TERMINFO TRUE) + set(COMPILER_RT_TERMINFO_LIB "${library}") + break() + endif() + endforeach() endif() if (ANDROID AND COMPILER_RT_HAS_LIBDL) diff --git a/compiler-rt/include/sanitizer/msan_interface.h b/compiler-rt/include/sanitizer/msan_interface.h index d40c556a46d93..eeb39fbed8b49 100644 --- a/compiler-rt/include/sanitizer/msan_interface.h +++ b/compiler-rt/include/sanitizer/msan_interface.h @@ -114,6 +114,9 @@ extern "C" { call to __msan_scoped_disable_interceptor_checks. 
*/ void __msan_scoped_enable_interceptor_checks(void); + void __msan_start_switch_fiber(const void *bottom, size_t size); + void __msan_finish_switch_fiber(const void **bottom_old, size_t *size_old); + #ifdef __cplusplus } // extern "C" #endif diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index 9c272afdc0c7e..8dbe15364ab8e 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -20,7 +20,7 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) if(APPLE) include(CompilerRTDarwinUtils) endif() - if(CMAKE_HOST_APPLE AND APPLE) + if(APPLE) include(UseLibtool) endif() include(AddCompilerRT) @@ -553,11 +553,8 @@ set(mips64el_SOURCES ${GENERIC_TF_SOURCES} set(powerpc64_SOURCES ppc/divtc3.c - ppc/fixtfti.c ppc/fixtfdi.c - ppc/fixunstfti.c ppc/fixunstfdi.c - ppc/floattitf.c ppc/floatditf.c ppc/floatunditf.c ppc/gcc_qadd.c @@ -567,6 +564,15 @@ set(powerpc64_SOURCES ppc/multc3.c ${GENERIC_SOURCES} ) +# These routines require __int128, which isn't supported on AIX. 
+if (NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") + set(powerpc64_SOURCES + ppc/floattitf.c + ppc/fixtfti.c + ppc/fixunstfti.c + ${powerpc64_SOURCES} + ) +endif() set(powerpc64le_SOURCES ${powerpc64_SOURCES}) set(riscv_SOURCES ${GENERIC_SOURCES} ${GENERIC_TF_SOURCES}) diff --git a/compiler-rt/lib/builtins/cpu_model.c b/compiler-rt/lib/builtins/cpu_model.c index 468bcc84cbcb3..d6dc73b88bc29 100644 --- a/compiler-rt/lib/builtins/cpu_model.c +++ b/compiler-rt/lib/builtins/cpu_model.c @@ -84,6 +84,7 @@ enum ProcessorSubtypes { INTEL_COREI7_CASCADELAKE, INTEL_COREI7_TIGERLAKE, INTEL_COREI7_COOPERLAKE, + INTEL_COREI7_SAPPHIRERAPIDS, CPU_SUBTYPE_MAX }; @@ -407,6 +408,13 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model, *Subtype = INTEL_COREI7_ICELAKE_SERVER; break; + // Sapphire Rapids: + case 0x8f: + CPU = "sapphirerapids"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_SAPPHIRERAPIDS; + break; + case 0x1c: // Most 45 nm Intel Atom processors case 0x26: // 45 nm Atom Lincroft case 0x27: // 32 nm Atom Medfield diff --git a/compiler-rt/lib/builtins/int_mulo_impl.inc b/compiler-rt/lib/builtins/int_mulo_impl.inc new file mode 100644 index 0000000000000..567d8b9e6e603 --- /dev/null +++ b/compiler-rt/lib/builtins/int_mulo_impl.inc @@ -0,0 +1,49 @@ +//===-- int_mulo_impl.inc - Implement __mulo[sdt]i4 ---------------*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Helper used by __mulosi4, __mulodi4 and __muloti4. 
+// +//===----------------------------------------------------------------------===// + +#include "int_lib.h" + +// Returns: a * b + +// Effects: sets *overflow to 1 if a * b overflows + +static __inline fixint_t __muloXi4(fixint_t a, fixint_t b, int *overflow) { + const int N = (int)(sizeof(fixint_t) * CHAR_BIT); + const fixint_t MIN = (fixint_t)1 << (N - 1); + const fixint_t MAX = ~MIN; + *overflow = 0; + fixint_t result = a * b; + if (a == MIN) { + if (b != 0 && b != 1) + *overflow = 1; + return result; + } + if (b == MIN) { + if (a != 0 && a != 1) + *overflow = 1; + return result; + } + fixint_t sa = a >> (N - 1); + fixint_t abs_a = (a ^ sa) - sa; + fixint_t sb = b >> (N - 1); + fixint_t abs_b = (b ^ sb) - sb; + if (abs_a < 2 || abs_b < 2) + return result; + if (sa == sb) { + if (abs_a > MAX / abs_b) + *overflow = 1; + } else { + if (abs_a > MIN / -abs_b) + *overflow = 1; + } + return result; +} diff --git a/compiler-rt/lib/builtins/int_mulv_impl.inc b/compiler-rt/lib/builtins/int_mulv_impl.inc new file mode 100644 index 0000000000000..1e920716ec499 --- /dev/null +++ b/compiler-rt/lib/builtins/int_mulv_impl.inc @@ -0,0 +1,47 @@ +//===-- int_mulv_impl.inc - Implement __mulv[sdt]i3 ---------------*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Helper used by __mulvsi3, __mulvdi3 and __mulvti3. 
+// +//===----------------------------------------------------------------------===// + +#include "int_lib.h" + +// Returns: a * b + +// Effects: aborts if a * b overflows + +static __inline fixint_t __mulvXi3(fixint_t a, fixint_t b) { + const int N = (int)(sizeof(fixint_t) * CHAR_BIT); + const fixint_t MIN = (fixint_t)1 << (N - 1); + const fixint_t MAX = ~MIN; + if (a == MIN) { + if (b == 0 || b == 1) + return a * b; + compilerrt_abort(); + } + if (b == MIN) { + if (a == 0 || a == 1) + return a * b; + compilerrt_abort(); + } + fixint_t sa = a >> (N - 1); + fixint_t abs_a = (a ^ sa) - sa; + fixint_t sb = b >> (N - 1); + fixint_t abs_b = (b ^ sb) - sb; + if (abs_a < 2 || abs_b < 2) + return a * b; + if (sa == sb) { + if (abs_a > MAX / abs_b) + compilerrt_abort(); + } else { + if (abs_a > MIN / -abs_b) + compilerrt_abort(); + } + return a * b; +} diff --git a/compiler-rt/lib/builtins/mulodi4.c b/compiler-rt/lib/builtins/mulodi4.c index 23f5571ac4689..7209676a327e4 100644 --- a/compiler-rt/lib/builtins/mulodi4.c +++ b/compiler-rt/lib/builtins/mulodi4.c @@ -10,40 +10,13 @@ // //===----------------------------------------------------------------------===// -#include "int_lib.h" +#define fixint_t di_int +#include "int_mulo_impl.inc" // Returns: a * b // Effects: sets *overflow to 1 if a * b overflows COMPILER_RT_ABI di_int __mulodi4(di_int a, di_int b, int *overflow) { - const int N = (int)(sizeof(di_int) * CHAR_BIT); - const di_int MIN = (di_int)1 << (N - 1); - const di_int MAX = ~MIN; - *overflow = 0; - di_int result = a * b; - if (a == MIN) { - if (b != 0 && b != 1) - *overflow = 1; - return result; - } - if (b == MIN) { - if (a != 0 && a != 1) - *overflow = 1; - return result; - } - di_int sa = a >> (N - 1); - di_int abs_a = (a ^ sa) - sa; - di_int sb = b >> (N - 1); - di_int abs_b = (b ^ sb) - sb; - if (abs_a < 2 || abs_b < 2) - return result; - if (sa == sb) { - if (abs_a > MAX / abs_b) - *overflow = 1; - } else { - if (abs_a > MIN / -abs_b) - *overflow = 1; - } - 
return result; + return __muloXi4(a, b, overflow); } diff --git a/compiler-rt/lib/builtins/mulosi4.c b/compiler-rt/lib/builtins/mulosi4.c index fea4311296f8b..4e03c24455d67 100644 --- a/compiler-rt/lib/builtins/mulosi4.c +++ b/compiler-rt/lib/builtins/mulosi4.c @@ -10,40 +10,13 @@ // //===----------------------------------------------------------------------===// -#include "int_lib.h" +#define fixint_t si_int +#include "int_mulo_impl.inc" // Returns: a * b // Effects: sets *overflow to 1 if a * b overflows COMPILER_RT_ABI si_int __mulosi4(si_int a, si_int b, int *overflow) { - const int N = (int)(sizeof(si_int) * CHAR_BIT); - const si_int MIN = (si_int)1 << (N - 1); - const si_int MAX = ~MIN; - *overflow = 0; - si_int result = a * b; - if (a == MIN) { - if (b != 0 && b != 1) - *overflow = 1; - return result; - } - if (b == MIN) { - if (a != 0 && a != 1) - *overflow = 1; - return result; - } - si_int sa = a >> (N - 1); - si_int abs_a = (a ^ sa) - sa; - si_int sb = b >> (N - 1); - si_int abs_b = (b ^ sb) - sb; - if (abs_a < 2 || abs_b < 2) - return result; - if (sa == sb) { - if (abs_a > MAX / abs_b) - *overflow = 1; - } else { - if (abs_a > MIN / -abs_b) - *overflow = 1; - } - return result; + return __muloXi4(a, b, overflow); } diff --git a/compiler-rt/lib/builtins/muloti4.c b/compiler-rt/lib/builtins/muloti4.c index 9bdd5b649908b..9a7aa85b022bf 100644 --- a/compiler-rt/lib/builtins/muloti4.c +++ b/compiler-rt/lib/builtins/muloti4.c @@ -18,36 +18,11 @@ // Effects: sets *overflow to 1 if a * b overflows +#define fixint_t ti_int +#include "int_mulo_impl.inc" + COMPILER_RT_ABI ti_int __muloti4(ti_int a, ti_int b, int *overflow) { - const int N = (int)(sizeof(ti_int) * CHAR_BIT); - const ti_int MIN = (ti_int)1 << (N - 1); - const ti_int MAX = ~MIN; - *overflow = 0; - ti_int result = a * b; - if (a == MIN) { - if (b != 0 && b != 1) - *overflow = 1; - return result; - } - if (b == MIN) { - if (a != 0 && a != 1) - *overflow = 1; - return result; - } - ti_int sa = a >> (N 
- 1); - ti_int abs_a = (a ^ sa) - sa; - ti_int sb = b >> (N - 1); - ti_int abs_b = (b ^ sb) - sb; - if (abs_a < 2 || abs_b < 2) - return result; - if (sa == sb) { - if (abs_a > MAX / abs_b) - *overflow = 1; - } else { - if (abs_a > MIN / -abs_b) - *overflow = 1; - } - return result; + return __muloXi4(a, b, overflow); } #endif // CRT_HAS_128BIT diff --git a/compiler-rt/lib/builtins/mulvdi3.c b/compiler-rt/lib/builtins/mulvdi3.c index cecc97ccf22ec..1d672c6dc155d 100644 --- a/compiler-rt/lib/builtins/mulvdi3.c +++ b/compiler-rt/lib/builtins/mulvdi3.c @@ -10,38 +10,11 @@ // //===----------------------------------------------------------------------===// -#include "int_lib.h" +#define fixint_t di_int +#include "int_mulv_impl.inc" // Returns: a * b // Effects: aborts if a * b overflows -COMPILER_RT_ABI di_int __mulvdi3(di_int a, di_int b) { - const int N = (int)(sizeof(di_int) * CHAR_BIT); - const di_int MIN = (di_int)1 << (N - 1); - const di_int MAX = ~MIN; - if (a == MIN) { - if (b == 0 || b == 1) - return a * b; - compilerrt_abort(); - } - if (b == MIN) { - if (a == 0 || a == 1) - return a * b; - compilerrt_abort(); - } - di_int sa = a >> (N - 1); - di_int abs_a = (a ^ sa) - sa; - di_int sb = b >> (N - 1); - di_int abs_b = (b ^ sb) - sb; - if (abs_a < 2 || abs_b < 2) - return a * b; - if (sa == sb) { - if (abs_a > MAX / abs_b) - compilerrt_abort(); - } else { - if (abs_a > MIN / -abs_b) - compilerrt_abort(); - } - return a * b; -} +COMPILER_RT_ABI di_int __mulvdi3(di_int a, di_int b) { return __mulvXi3(a, b); } diff --git a/compiler-rt/lib/builtins/mulvsi3.c b/compiler-rt/lib/builtins/mulvsi3.c index 0d6b18ad01a40..00b2e50eeca91 100644 --- a/compiler-rt/lib/builtins/mulvsi3.c +++ b/compiler-rt/lib/builtins/mulvsi3.c @@ -10,38 +10,11 @@ // //===----------------------------------------------------------------------===// -#include "int_lib.h" +#define fixint_t si_int +#include "int_mulv_impl.inc" // Returns: a * b // Effects: aborts if a * b overflows -COMPILER_RT_ABI 
si_int __mulvsi3(si_int a, si_int b) { - const int N = (int)(sizeof(si_int) * CHAR_BIT); - const si_int MIN = (si_int)1 << (N - 1); - const si_int MAX = ~MIN; - if (a == MIN) { - if (b == 0 || b == 1) - return a * b; - compilerrt_abort(); - } - if (b == MIN) { - if (a == 0 || a == 1) - return a * b; - compilerrt_abort(); - } - si_int sa = a >> (N - 1); - si_int abs_a = (a ^ sa) - sa; - si_int sb = b >> (N - 1); - si_int abs_b = (b ^ sb) - sb; - if (abs_a < 2 || abs_b < 2) - return a * b; - if (sa == sb) { - if (abs_a > MAX / abs_b) - compilerrt_abort(); - } else { - if (abs_a > MIN / -abs_b) - compilerrt_abort(); - } - return a * b; -} +COMPILER_RT_ABI si_int __mulvsi3(si_int a, si_int b) { return __mulvXi3(a, b); } diff --git a/compiler-rt/lib/builtins/mulvti3.c b/compiler-rt/lib/builtins/mulvti3.c index 03963a0ca694f..ba355149f9a76 100644 --- a/compiler-rt/lib/builtins/mulvti3.c +++ b/compiler-rt/lib/builtins/mulvti3.c @@ -18,34 +18,9 @@ // Effects: aborts if a * b overflows -COMPILER_RT_ABI ti_int __mulvti3(ti_int a, ti_int b) { - const int N = (int)(sizeof(ti_int) * CHAR_BIT); - const ti_int MIN = (ti_int)1 << (N - 1); - const ti_int MAX = ~MIN; - if (a == MIN) { - if (b == 0 || b == 1) - return a * b; - compilerrt_abort(); - } - if (b == MIN) { - if (a == 0 || a == 1) - return a * b; - compilerrt_abort(); - } - ti_int sa = a >> (N - 1); - ti_int abs_a = (a ^ sa) - sa; - ti_int sb = b >> (N - 1); - ti_int abs_b = (b ^ sb) - sb; - if (abs_a < 2 || abs_b < 2) - return a * b; - if (sa == sb) { - if (abs_a > MAX / abs_b) - compilerrt_abort(); - } else { - if (abs_a > MIN / -abs_b) - compilerrt_abort(); - } - return a * b; -} +#define fixint_t ti_int +#include "int_mulv_impl.inc" + +COMPILER_RT_ABI ti_int __mulvti3(ti_int a, ti_int b) { return __mulvXi3(a, b); } #endif // CRT_HAS_128BIT diff --git a/compiler-rt/lib/fuzzer/FuzzerDriver.cpp b/compiler-rt/lib/fuzzer/FuzzerDriver.cpp index bed9e84de67ae..4669b12786fc2 100644 --- a/compiler-rt/lib/fuzzer/FuzzerDriver.cpp 
+++ b/compiler-rt/lib/fuzzer/FuzzerDriver.cpp @@ -250,6 +250,13 @@ static void WorkerThread(const Command &BaseCmd, std::atomic *Counter, } } +static void ValidateDirectoryExists(const std::string &Path) { + if (!Path.empty() && !IsDirectory(Path)) { + Printf("ERROR: The required directory \"%s\" does not exist\n", Path.c_str()); + exit(1); + } +} + std::string CloneArgsWithoutX(const Vector &Args, const char *X1, const char *X2) { std::string Cmd; @@ -678,13 +685,32 @@ int FuzzerDriver(int *argc, char ***argv, UserCallback Callback) { Options.MallocLimitMb = Options.RssLimitMb; if (Flags.runs >= 0) Options.MaxNumberOfRuns = Flags.runs; - if (!Inputs->empty() && !Flags.minimize_crash_internal_step) - Options.OutputCorpus = (*Inputs)[0]; + if (!Inputs->empty() && !Flags.minimize_crash_internal_step) { + // Ensure output corpus assumed to be the first arbitrary argument input + // is not a path to an existing file. + std::string OutputCorpusDir = (*Inputs)[0]; + if (!IsFile(OutputCorpusDir)) { + Options.OutputCorpus = OutputCorpusDir; + ValidateDirectoryExists(Options.OutputCorpus); + } + } Options.ReportSlowUnits = Flags.report_slow_units; - if (Flags.artifact_prefix) + if (Flags.artifact_prefix) { Options.ArtifactPrefix = Flags.artifact_prefix; - if (Flags.exact_artifact_path) + + // Since the prefix could be a full path to a file name prefix, assume + // that if the path ends with the platform's separator that a directory + // is desired + std::string ArtifactPathDir = Options.ArtifactPrefix; + if (!IsSeparator(ArtifactPathDir[ArtifactPathDir.length() - 1])) { + ArtifactPathDir = DirName(ArtifactPathDir); + } + ValidateDirectoryExists(ArtifactPathDir); + } + if (Flags.exact_artifact_path) { Options.ExactArtifactPath = Flags.exact_artifact_path; + ValidateDirectoryExists(DirName(Options.ExactArtifactPath)); + } Vector Dictionary; if (Flags.dict) if (!ParseDictionaryFile(FileToString(Flags.dict), &Dictionary)) @@ -707,8 +733,10 @@ int FuzzerDriver(int *argc, char 
***argv, UserCallback Callback) { Options.FocusFunction = Flags.focus_function; if (Flags.data_flow_trace) Options.DataFlowTrace = Flags.data_flow_trace; - if (Flags.features_dir) + if (Flags.features_dir) { Options.FeaturesDir = Flags.features_dir; + ValidateDirectoryExists(Options.FeaturesDir); + } if (Flags.collect_data_flow) Options.CollectDataFlow = Flags.collect_data_flow; if (Flags.stop_file) diff --git a/compiler-rt/lib/fuzzer/FuzzerIO.h b/compiler-rt/lib/fuzzer/FuzzerIO.h index 6e4368b971fa0..8def2e96304e7 100644 --- a/compiler-rt/lib/fuzzer/FuzzerIO.h +++ b/compiler-rt/lib/fuzzer/FuzzerIO.h @@ -58,6 +58,7 @@ void RawPrint(const char *Str); // Platform specific functions: bool IsFile(const std::string &Path); +bool IsDirectory(const std::string &Path); size_t FileSize(const std::string &Path); void ListFilesInDirRecursive(const std::string &Dir, long *Epoch, @@ -82,6 +83,7 @@ struct SizedFile { void GetSizedFilesFromDir(const std::string &Dir, Vector *V); char GetSeparator(); +bool IsSeparator(char C); // Similar to the basename utility: returns the file name w/o the dir prefix. 
std::string Basename(const std::string &Path); diff --git a/compiler-rt/lib/fuzzer/FuzzerIOPosix.cpp b/compiler-rt/lib/fuzzer/FuzzerIOPosix.cpp index aac85b08727ab..0da063a18ff7d 100644 --- a/compiler-rt/lib/fuzzer/FuzzerIOPosix.cpp +++ b/compiler-rt/lib/fuzzer/FuzzerIOPosix.cpp @@ -31,7 +31,7 @@ bool IsFile(const std::string &Path) { return S_ISREG(St.st_mode); } -static bool IsDirectory(const std::string &Path) { +bool IsDirectory(const std::string &Path) { struct stat St; if (stat(Path.c_str(), &St)) return false; @@ -104,6 +104,10 @@ char GetSeparator() { return '/'; } +bool IsSeparator(char C) { + return C == '/'; +} + FILE* OpenFile(int Fd, const char* Mode) { return fdopen(Fd, Mode); } diff --git a/compiler-rt/lib/fuzzer/FuzzerIOWindows.cpp b/compiler-rt/lib/fuzzer/FuzzerIOWindows.cpp index 651283a551cf0..61ad35e281f57 100644 --- a/compiler-rt/lib/fuzzer/FuzzerIOWindows.cpp +++ b/compiler-rt/lib/fuzzer/FuzzerIOWindows.cpp @@ -76,6 +76,18 @@ static bool IsDir(DWORD FileAttrs) { return FileAttrs & FILE_ATTRIBUTE_DIRECTORY; } +bool IsDirectory(const std::string &Path) { + DWORD Att = GetFileAttributesA(Path.c_str()); + + if (Att == INVALID_FILE_ATTRIBUTES) { + Printf("GetFileAttributesA() failed for \"%s\" (Error code: %lu).\n", + Path.c_str(), GetLastError()); + return false; + } + + return IsDir(Att); +} + std::string Basename(const std::string &Path) { size_t Pos = Path.find_last_of("/\\"); if (Pos == std::string::npos) return Path; @@ -227,7 +239,7 @@ intptr_t GetHandleFromFd(int fd) { return _get_osfhandle(fd); } -static bool IsSeparator(char C) { +bool IsSeparator(char C) { return C == '\\' || C == '/'; } diff --git a/compiler-rt/lib/fuzzer/FuzzerMutate.cpp b/compiler-rt/lib/fuzzer/FuzzerMutate.cpp index 29541eac5dc60..df9ada45bb039 100644 --- a/compiler-rt/lib/fuzzer/FuzzerMutate.cpp +++ b/compiler-rt/lib/fuzzer/FuzzerMutate.cpp @@ -425,26 +425,26 @@ size_t MutationDispatcher::Mutate_CrossOver(uint8_t *Data, size_t Size, if (!CrossOverWith) return 0; 
const Unit &O = *CrossOverWith; if (O.empty()) return 0; - MutateInPlaceHere.resize(MaxSize); - auto &U = MutateInPlaceHere; size_t NewSize = 0; switch(Rand(3)) { case 0: - NewSize = CrossOver(Data, Size, O.data(), O.size(), U.data(), U.size()); + MutateInPlaceHere.resize(MaxSize); + NewSize = CrossOver(Data, Size, O.data(), O.size(), + MutateInPlaceHere.data(), MaxSize); + memcpy(Data, MutateInPlaceHere.data(), NewSize); break; case 1: - NewSize = InsertPartOf(O.data(), O.size(), U.data(), U.size(), MaxSize); + NewSize = InsertPartOf(O.data(), O.size(), Data, Size, MaxSize); if (!NewSize) - NewSize = CopyPartOf(O.data(), O.size(), U.data(), U.size()); + NewSize = CopyPartOf(O.data(), O.size(), Data, Size); break; case 2: - NewSize = CopyPartOf(O.data(), O.size(), U.data(), U.size()); + NewSize = CopyPartOf(O.data(), O.size(), Data, Size); break; default: assert(0); } assert(NewSize > 0 && "CrossOver returned empty unit"); assert(NewSize <= MaxSize && "CrossOver returned overisized unit"); - memcpy(Data, U.data(), NewSize); return NewSize; } diff --git a/compiler-rt/lib/msan/msan.cpp b/compiler-rt/lib/msan/msan.cpp index a1ad5c4f1abc8..3028f79f041c3 100644 --- a/compiler-rt/lib/msan/msan.cpp +++ b/compiler-rt/lib/msan/msan.cpp @@ -695,6 +695,37 @@ void __msan_set_death_callback(void (*callback)(void)) { SetUserDieCallback(callback); } +void __msan_start_switch_fiber(const void *bottom, uptr size) { + MsanThread *t = GetCurrentThread(); + if (!t) { + VReport(1, "__msan_start_switch_fiber called from unknown thread\n"); + return; + } + t->StartSwitchFiber((uptr)bottom, size); +} + +void __msan_finish_switch_fiber(const void **bottom_old, uptr *size_old) { + MsanThread *t = GetCurrentThread(); + if (!t) { + VReport(1, "__msan_finish_switch_fiber called from unknown thread\n"); + return; + } + t->FinishSwitchFiber((uptr *)bottom_old, (uptr *)size_old); + + internal_memset(__msan_param_tls, 0, sizeof(__msan_param_tls)); + internal_memset(__msan_retval_tls, 0, 
sizeof(__msan_retval_tls)); + internal_memset(__msan_va_arg_tls, 0, sizeof(__msan_va_arg_tls)); + + if (__msan_get_track_origins()) { + internal_memset(__msan_param_origin_tls, 0, + sizeof(__msan_param_origin_tls)); + internal_memset(&__msan_retval_origin_tls, 0, + sizeof(__msan_retval_origin_tls)); + internal_memset(__msan_va_arg_origin_tls, 0, + sizeof(__msan_va_arg_origin_tls)); + } +} + #if !SANITIZER_SUPPORTS_WEAK_HOOKS extern "C" { SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE diff --git a/compiler-rt/lib/msan/msan_interface_internal.h b/compiler-rt/lib/msan/msan_interface_internal.h index 9e3db06bd64d7..17922a888b9c9 100644 --- a/compiler-rt/lib/msan/msan_interface_internal.h +++ b/compiler-rt/lib/msan/msan_interface_internal.h @@ -187,6 +187,12 @@ void __msan_scoped_disable_interceptor_checks(); SANITIZER_INTERFACE_ATTRIBUTE void __msan_scoped_enable_interceptor_checks(); + +SANITIZER_INTERFACE_ATTRIBUTE +void __msan_start_switch_fiber(const void *bottom, uptr size); + +SANITIZER_INTERFACE_ATTRIBUTE +void __msan_finish_switch_fiber(const void **bottom_old, uptr *size_old); } // extern "C" #endif // MSAN_INTERFACE_INTERNAL_H diff --git a/compiler-rt/lib/msan/msan_thread.cpp b/compiler-rt/lib/msan/msan_thread.cpp index 0ba4993500642..456901c6789bb 100644 --- a/compiler-rt/lib/msan/msan_thread.cpp +++ b/compiler-rt/lib/msan/msan_thread.cpp @@ -22,9 +22,9 @@ MsanThread *MsanThread::Create(thread_callback_t start_routine, void MsanThread::SetThreadStackAndTls() { uptr tls_size = 0; uptr stack_size = 0; - GetThreadStackAndTls(IsMainThread(), &stack_bottom_, &stack_size, - &tls_begin_, &tls_size); - stack_top_ = stack_bottom_ + stack_size; + GetThreadStackAndTls(IsMainThread(), &stack_.bottom, &stack_size, &tls_begin_, + &tls_size); + stack_.top = stack_.bottom + stack_size; tls_end_ = tls_begin_ + tls_size; int local; @@ -32,7 +32,7 @@ void MsanThread::SetThreadStackAndTls() { } void MsanThread::ClearShadowForThreadStackAndTLS() { - __msan_unpoison((void 
*)stack_bottom_, stack_top_ - stack_bottom_); + __msan_unpoison((void *)stack_.bottom, stack_.top - stack_.bottom); if (tls_begin_ != tls_end_) __msan_unpoison((void *)tls_begin_, tls_end_ - tls_begin_); DTLS *dtls = DTLS_Get(); @@ -43,8 +43,8 @@ void MsanThread::ClearShadowForThreadStackAndTLS() { void MsanThread::Init() { SetThreadStackAndTls(); - CHECK(MEM_IS_APP(stack_bottom_)); - CHECK(MEM_IS_APP(stack_top_ - 1)); + CHECK(MEM_IS_APP(stack_.bottom)); + CHECK(MEM_IS_APP(stack_.top - 1)); ClearShadowForThreadStackAndTLS(); } @@ -79,4 +79,45 @@ thread_return_t MsanThread::ThreadStart() { return res; } +MsanThread::StackBounds MsanThread::GetStackBounds() const { + if (!stack_switching_) + return {stack_.bottom, stack_.top}; + const uptr cur_stack = GET_CURRENT_FRAME(); + // Note: need to check next stack first, because FinishSwitchFiber + // may be in process of overwriting stack_.top/bottom_. But in such case + // we are already on the next stack. + if (cur_stack >= next_stack_.bottom && cur_stack < next_stack_.top) + return {next_stack_.bottom, next_stack_.top}; + return {stack_.bottom, stack_.top}; +} + +uptr MsanThread::stack_top() { return GetStackBounds().top; } + +uptr MsanThread::stack_bottom() { return GetStackBounds().bottom; } + +bool MsanThread::AddrIsInStack(uptr addr) { + const auto bounds = GetStackBounds(); + return addr >= bounds.bottom && addr < bounds.top; +} + +void MsanThread::StartSwitchFiber(uptr bottom, uptr size) { + CHECK(!stack_switching_); + next_stack_.bottom = bottom; + next_stack_.top = bottom + size; + stack_switching_ = true; +} + +void MsanThread::FinishSwitchFiber(uptr *bottom_old, uptr *size_old) { + CHECK(stack_switching_); + if (bottom_old) + *bottom_old = stack_.bottom; + if (size_old) + *size_old = stack_.top - stack_.bottom; + stack_.bottom = next_stack_.bottom; + stack_.top = next_stack_.top; + stack_switching_ = false; + next_stack_.top = 0; + next_stack_.bottom = 0; +} + } // namespace __msan diff --git 
a/compiler-rt/lib/msan/msan_thread.h b/compiler-rt/lib/msan/msan_thread.h index 808780cd57b92..fe795e3a547ad 100644 --- a/compiler-rt/lib/msan/msan_thread.h +++ b/compiler-rt/lib/msan/msan_thread.h @@ -27,20 +27,21 @@ class MsanThread { void Init(); // Should be called from the thread itself. thread_return_t ThreadStart(); - uptr stack_top() { return stack_top_; } - uptr stack_bottom() { return stack_bottom_; } + uptr stack_top(); + uptr stack_bottom(); uptr tls_begin() { return tls_begin_; } uptr tls_end() { return tls_end_; } bool IsMainThread() { return start_routine_ == nullptr; } - bool AddrIsInStack(uptr addr) { - return addr >= stack_bottom_ && addr < stack_top_; - } + bool AddrIsInStack(uptr addr); bool InSignalHandler() { return in_signal_handler_; } void EnterSignalHandler() { in_signal_handler_++; } void LeaveSignalHandler() { in_signal_handler_--; } + void StartSwitchFiber(uptr bottom, uptr size); + void FinishSwitchFiber(uptr *bottom_old, uptr *size_old); + MsanThreadLocalMallocStorage &malloc_storage() { return malloc_storage_; } int destructor_iterations_; @@ -50,10 +51,19 @@ class MsanThread { // via mmap() and *must* be valid in zero-initialized state. 
void SetThreadStackAndTls(); void ClearShadowForThreadStackAndTLS(); + struct StackBounds { + uptr bottom; + uptr top; + }; + StackBounds GetStackBounds() const; thread_callback_t start_routine_; void *arg_; - uptr stack_top_; - uptr stack_bottom_; + + bool stack_switching_; + + StackBounds stack_; + StackBounds next_stack_; + uptr tls_begin_; uptr tls_end_; diff --git a/compiler-rt/lib/profile/InstrProfilingPort.h b/compiler-rt/lib/profile/InstrProfilingPort.h index 4493dd512ff0d..cb66c5964ad1c 100644 --- a/compiler-rt/lib/profile/InstrProfilingPort.h +++ b/compiler-rt/lib/profile/InstrProfilingPort.h @@ -24,11 +24,17 @@ #define COMPILER_RT_ALWAYS_INLINE __forceinline #define COMPILER_RT_CLEANUP(x) #elif __GNUC__ -#define COMPILER_RT_ALIGNAS(x) __attribute__((aligned(x))) +#ifdef _WIN32 +#define COMPILER_RT_FTRUNCATE(f, l) _chsize(fileno(f), l) +#define COMPILER_RT_VISIBILITY +#define COMPILER_RT_WEAK __attribute__((selectany)) +#else +#define COMPILER_RT_FTRUNCATE(f, l) ftruncate(fileno(f), l) #define COMPILER_RT_VISIBILITY __attribute__((visibility("hidden"))) #define COMPILER_RT_WEAK __attribute__((weak)) +#endif +#define COMPILER_RT_ALIGNAS(x) __attribute__((aligned(x))) #define COMPILER_RT_ALLOCA __builtin_alloca -#define COMPILER_RT_FTRUNCATE(f,l) ftruncate(fileno(f),l) #define COMPILER_RT_ALWAYS_INLINE inline __attribute((always_inline)) #define COMPILER_RT_CLEANUP(x) __attribute__((cleanup(x))) #endif diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_posix.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_posix.cpp index 964d7e7ff66a1..b8b75c20d9f9a 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_posix.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_posix.cpp @@ -293,7 +293,7 @@ uptr SignalContext::GetAddress() const { bool SignalContext::IsMemoryAccess() const { auto si = static_cast(siginfo); - return si->si_signo == SIGSEGV; + return si->si_signo == SIGSEGV || si->si_signo == SIGBUS; } int SignalContext::GetType() const { diff --git 
a/compiler-rt/lib/sanitizer_common/sanitizer_ptrauth.h b/compiler-rt/lib/sanitizer_common/sanitizer_ptrauth.h index 4d0d96a64f622..a288068bf9438 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_ptrauth.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_ptrauth.h @@ -18,4 +18,6 @@ #define ptrauth_string_discriminator(__string) ((int)0) #endif +#define STRIP_PC(pc) ((uptr)ptrauth_strip(pc, 0)) + #endif // SANITIZER_PTRAUTH_H diff --git a/compiler-rt/lib/scudo/standalone/primary32.h b/compiler-rt/lib/scudo/standalone/primary32.h index e41b949d8d0e6..0a985fb67beae 100644 --- a/compiler-rt/lib/scudo/standalone/primary32.h +++ b/compiler-rt/lib/scudo/standalone/primary32.h @@ -483,12 +483,15 @@ class SizeClassAllocator32 { } } uptr TotalReleasedBytes = 0; + auto SkipRegion = [this, First, ClassId](uptr RegionIndex) { + return (PossibleRegions[First + RegionIndex] - 1U) != ClassId; + }; if (First && Last) { const uptr Base = First * RegionSize; const uptr NumberOfRegions = Last - First + 1U; ReleaseRecorder Recorder(Base); releaseFreeMemoryToOS(Sci->FreeList, Base, RegionSize, NumberOfRegions, - BlockSize, &Recorder); + BlockSize, &Recorder, SkipRegion); if (Recorder.getReleasedRangesCount() > 0) { Sci->ReleaseInfo.PushedBlocksAtLastRelease = Sci->Stats.PushedBlocks; Sci->ReleaseInfo.RangesReleased += Recorder.getReleasedRangesCount(); diff --git a/compiler-rt/lib/scudo/standalone/primary64.h b/compiler-rt/lib/scudo/standalone/primary64.h index ad92ae250e1f4..933b1ee7c9670 100644 --- a/compiler-rt/lib/scudo/standalone/primary64.h +++ b/compiler-rt/lib/scudo/standalone/primary64.h @@ -479,9 +479,11 @@ class SizeClassAllocator64 { } } + auto SkipRegion = [](UNUSED uptr RegionIndex) { return false; }; ReleaseRecorder Recorder(Region->RegionBeg, &Region->Data); releaseFreeMemoryToOS(Region->FreeList, Region->RegionBeg, - Region->AllocatedUser, 1U, BlockSize, &Recorder); + Region->AllocatedUser, 1U, BlockSize, &Recorder, + SkipRegion); if 
(Recorder.getReleasedRangesCount() > 0) { Region->ReleaseInfo.PushedBlocksAtLastRelease = diff --git a/compiler-rt/lib/scudo/standalone/release.h b/compiler-rt/lib/scudo/standalone/release.h index 748e1c0011530..cd9e66d63b368 100644 --- a/compiler-rt/lib/scudo/standalone/release.h +++ b/compiler-rt/lib/scudo/standalone/release.h @@ -156,6 +156,11 @@ template class FreePagesRangeTracker { CurrentPage++; } + void skipPages(uptr N) { + closeOpenedRange(); + CurrentPage += N; + } + void finish() { closeOpenedRange(); } private: @@ -174,11 +179,11 @@ template class FreePagesRangeTracker { uptr CurrentRangeStatePage = 0; }; -template +template NOINLINE void releaseFreeMemoryToOS(const IntrusiveList &FreeList, uptr Base, uptr RegionSize, uptr NumberOfRegions, uptr BlockSize, - ReleaseRecorderT *Recorder) { + ReleaseRecorderT *Recorder, SkipRegionT SkipRegion) { const uptr PageSize = getPageSizeCached(); // Figure out the number of chunks per page and whether we can take a fast @@ -283,10 +288,15 @@ releaseFreeMemoryToOS(const IntrusiveList &FreeList, uptr Base, FreePagesRangeTracker RangeTracker(Recorder); if (SameBlockCountPerPage) { // Fast path, every page has the same number of chunks affecting it. - for (uptr I = 0; I < NumberOfRegions; I++) + for (uptr I = 0; I < NumberOfRegions; I++) { + if (SkipRegion(I)) { + RangeTracker.skipPages(PagesCount); + continue; + } for (uptr J = 0; J < PagesCount; J++) RangeTracker.processNextPage(Counters.get(I, J) == FullPagesBlockCountMax); + } } else { // Slow path, go through the pages keeping count how many chunks affect // each page. @@ -298,6 +308,10 @@ releaseFreeMemoryToOS(const IntrusiveList &FreeList, uptr Base, // up the number of chunks on the current page and checking on every step // whether the page boundary was crossed. 
for (uptr I = 0; I < NumberOfRegions; I++) { + if (SkipRegion(I)) { + RangeTracker.skipPages(PagesCount); + continue; + } uptr PrevPageBoundary = 0; uptr CurrentBoundary = 0; for (uptr J = 0; J < PagesCount; J++) { diff --git a/compiler-rt/lib/scudo/standalone/tests/release_test.cpp b/compiler-rt/lib/scudo/standalone/tests/release_test.cpp index a693b97f80da6..9e991a7054ed7 100644 --- a/compiler-rt/lib/scudo/standalone/tests/release_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/release_test.cpp @@ -190,9 +190,10 @@ template void testReleaseFreeMemoryToOS() { } // Release the memory. + auto SkipRegion = [](UNUSED scudo::uptr RegionIndex) { return false; }; ReleasedPagesRecorder Recorder; releaseFreeMemoryToOS(FreeList, 0, MaxBlocks * BlockSize, 1U, BlockSize, - &Recorder); + &Recorder, SkipRegion); // Verify that there are no released pages touched by used chunks and all // ranges of free chunks big enough to contain the entire memory pages had diff --git a/compiler-rt/lib/tsan/rtl/tsan_external.cpp b/compiler-rt/lib/tsan/rtl/tsan_external.cpp index 0faa1ee93a139..466b2bf0f66ce 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_external.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_external.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "tsan_rtl.h" #include "tsan_interceptors.h" +#include "sanitizer_common/sanitizer_ptrauth.h" namespace __tsan { @@ -57,13 +58,13 @@ uptr TagFromShadowStackFrame(uptr pc) { #if !SANITIZER_GO typedef void(*AccessFunc)(ThreadState *, uptr, uptr, int); -void ExternalAccess(void *addr, void *caller_pc, void *tag, AccessFunc access) { +void ExternalAccess(void *addr, uptr caller_pc, void *tag, AccessFunc access) { CHECK_LT(tag, atomic_load(&used_tags, memory_order_relaxed)); ThreadState *thr = cur_thread(); - if (caller_pc) FuncEntry(thr, (uptr)caller_pc); + if (caller_pc) FuncEntry(thr, caller_pc); InsertShadowStackFrameForTag(thr, (uptr)tag); bool in_ignored_lib; - if (!caller_pc 
|| !libignore()->IsIgnored((uptr)caller_pc, &in_ignored_lib)) { + if (!caller_pc || !libignore()->IsIgnored(caller_pc, &in_ignored_lib)) { access(thr, CALLERPC, (uptr)addr, kSizeLog1); } FuncExit(thr); @@ -110,12 +111,12 @@ void __tsan_external_assign_tag(void *addr, void *tag) { SANITIZER_INTERFACE_ATTRIBUTE void __tsan_external_read(void *addr, void *caller_pc, void *tag) { - ExternalAccess(addr, caller_pc, tag, MemoryRead); + ExternalAccess(addr, STRIP_PC(caller_pc), tag, MemoryRead); } SANITIZER_INTERFACE_ATTRIBUTE void __tsan_external_write(void *addr, void *caller_pc, void *tag) { - ExternalAccess(addr, caller_pc, tag, MemoryWrite); + ExternalAccess(addr, STRIP_PC(caller_pc), tag, MemoryWrite); } } // extern "C" diff --git a/compiler-rt/lib/tsan/rtl/tsan_interceptors_libdispatch.cpp b/compiler-rt/lib/tsan/rtl/tsan_interceptors_libdispatch.cpp index 99ec275011008..b56cc2dab7044 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_interceptors_libdispatch.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_interceptors_libdispatch.cpp @@ -235,9 +235,15 @@ SANITIZER_WEAK_IMPORT void dispatch_async_and_wait( dispatch_queue_t queue, DISPATCH_NOESCAPE dispatch_block_t block); SANITIZER_WEAK_IMPORT void dispatch_async_and_wait_f( dispatch_queue_t queue, void *context, dispatch_function_t work); +SANITIZER_WEAK_IMPORT void dispatch_barrier_async_and_wait( + dispatch_queue_t queue, DISPATCH_NOESCAPE dispatch_block_t block); +SANITIZER_WEAK_IMPORT void dispatch_barrier_async_and_wait_f( + dispatch_queue_t queue, void *context, dispatch_function_t work); DISPATCH_INTERCEPT_SYNC_B(dispatch_async_and_wait, false) DISPATCH_INTERCEPT_SYNC_F(dispatch_async_and_wait_f, false) +DISPATCH_INTERCEPT_SYNC_B(dispatch_barrier_async_and_wait, true) +DISPATCH_INTERCEPT_SYNC_F(dispatch_barrier_async_and_wait_f, true) #endif @@ -770,6 +776,8 @@ void InitializeLibdispatchInterceptors() { INTERCEPT_FUNCTION(dispatch_barrier_sync_f); INTERCEPT_FUNCTION(dispatch_async_and_wait); 
INTERCEPT_FUNCTION(dispatch_async_and_wait_f); + INTERCEPT_FUNCTION(dispatch_barrier_async_and_wait); + INTERCEPT_FUNCTION(dispatch_barrier_async_and_wait_f); INTERCEPT_FUNCTION(dispatch_after); INTERCEPT_FUNCTION(dispatch_after_f); INTERCEPT_FUNCTION(dispatch_once); diff --git a/compiler-rt/lib/tsan/rtl/tsan_interceptors_mach_vm.cpp b/compiler-rt/lib/tsan/rtl/tsan_interceptors_mach_vm.cpp index cd318f8af93f0..6d62ff6a83825 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_interceptors_mach_vm.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_interceptors_mach_vm.cpp @@ -19,12 +19,11 @@ namespace __tsan { -static bool intersects_with_shadow(mach_vm_address_t *address, +static bool intersects_with_shadow(mach_vm_address_t address, mach_vm_size_t size, int flags) { // VM_FLAGS_FIXED is 0x0, so we have to test for VM_FLAGS_ANYWHERE. if (flags & VM_FLAGS_ANYWHERE) return false; - uptr ptr = *address; - return !IsAppMem(ptr) || !IsAppMem(ptr + size - 1); + return !IsAppMem(address) || !IsAppMem(address + size - 1); } TSAN_INTERCEPTOR(kern_return_t, mach_vm_allocate, vm_map_t target, @@ -32,12 +31,12 @@ TSAN_INTERCEPTOR(kern_return_t, mach_vm_allocate, vm_map_t target, SCOPED_TSAN_INTERCEPTOR(mach_vm_allocate, target, address, size, flags); if (target != mach_task_self()) return REAL(mach_vm_allocate)(target, address, size, flags); - if (intersects_with_shadow(address, size, flags)) + if (address && intersects_with_shadow(*address, size, flags)) return KERN_NO_SPACE; - kern_return_t res = REAL(mach_vm_allocate)(target, address, size, flags); - if (res == KERN_SUCCESS) + kern_return_t kr = REAL(mach_vm_allocate)(target, address, size, flags); + if (kr == KERN_SUCCESS) MemoryRangeImitateWriteOrResetRange(thr, pc, *address, size); - return res; + return kr; } TSAN_INTERCEPTOR(kern_return_t, mach_vm_deallocate, vm_map_t target, @@ -45,8 +44,10 @@ TSAN_INTERCEPTOR(kern_return_t, mach_vm_deallocate, vm_map_t target, SCOPED_TSAN_INTERCEPTOR(mach_vm_deallocate, target, address, size); if (target 
!= mach_task_self()) return REAL(mach_vm_deallocate)(target, address, size); - UnmapShadow(thr, address, size); - return REAL(mach_vm_deallocate)(target, address, size); + kern_return_t kr = REAL(mach_vm_deallocate)(target, address, size); + if (kr == KERN_SUCCESS && address) + UnmapShadow(thr, address, size); + return kr; } } // namespace __tsan diff --git a/compiler-rt/lib/tsan/rtl/tsan_interface.cpp b/compiler-rt/lib/tsan/rtl/tsan_interface.cpp index 2b3a0889b70a4..5c2a617a24c3d 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_interface.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_interface.cpp @@ -14,6 +14,7 @@ #include "tsan_interface_ann.h" #include "tsan_rtl.h" #include "sanitizer_common/sanitizer_internal_defs.h" +#include "sanitizer_common/sanitizer_ptrauth.h" #define CALLERPC ((uptr)__builtin_return_address(0)) @@ -43,13 +44,13 @@ void __tsan_write16(void *addr) { } void __tsan_read16_pc(void *addr, void *pc) { - MemoryRead(cur_thread(), (uptr)pc, (uptr)addr, kSizeLog8); - MemoryRead(cur_thread(), (uptr)pc, (uptr)addr + 8, kSizeLog8); + MemoryRead(cur_thread(), STRIP_PC(pc), (uptr)addr, kSizeLog8); + MemoryRead(cur_thread(), STRIP_PC(pc), (uptr)addr + 8, kSizeLog8); } void __tsan_write16_pc(void *addr, void *pc) { - MemoryWrite(cur_thread(), (uptr)pc, (uptr)addr, kSizeLog8); - MemoryWrite(cur_thread(), (uptr)pc, (uptr)addr + 8, kSizeLog8); + MemoryWrite(cur_thread(), STRIP_PC(pc), (uptr)addr, kSizeLog8); + MemoryWrite(cur_thread(), STRIP_PC(pc), (uptr)addr + 8, kSizeLog8); } // __tsan_unaligned_read/write calls are emitted by compiler. 
diff --git a/compiler-rt/lib/tsan/rtl/tsan_interface_inl.h b/compiler-rt/lib/tsan/rtl/tsan_interface_inl.h index f955ddf99247c..f5d743c10772e 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_interface_inl.h +++ b/compiler-rt/lib/tsan/rtl/tsan_interface_inl.h @@ -12,6 +12,7 @@ #include "tsan_interface.h" #include "tsan_rtl.h" +#include "sanitizer_common/sanitizer_ptrauth.h" #define CALLERPC ((uptr)__builtin_return_address(0)) @@ -50,35 +51,35 @@ void __tsan_write8(void *addr) { } void __tsan_read1_pc(void *addr, void *pc) { - MemoryRead(cur_thread(), (uptr)pc, (uptr)addr, kSizeLog1); + MemoryRead(cur_thread(), STRIP_PC(pc), (uptr)addr, kSizeLog1); } void __tsan_read2_pc(void *addr, void *pc) { - MemoryRead(cur_thread(), (uptr)pc, (uptr)addr, kSizeLog2); + MemoryRead(cur_thread(), STRIP_PC(pc), (uptr)addr, kSizeLog2); } void __tsan_read4_pc(void *addr, void *pc) { - MemoryRead(cur_thread(), (uptr)pc, (uptr)addr, kSizeLog4); + MemoryRead(cur_thread(), STRIP_PC(pc), (uptr)addr, kSizeLog4); } void __tsan_read8_pc(void *addr, void *pc) { - MemoryRead(cur_thread(), (uptr)pc, (uptr)addr, kSizeLog8); + MemoryRead(cur_thread(), STRIP_PC(pc), (uptr)addr, kSizeLog8); } void __tsan_write1_pc(void *addr, void *pc) { - MemoryWrite(cur_thread(), (uptr)pc, (uptr)addr, kSizeLog1); + MemoryWrite(cur_thread(), STRIP_PC(pc), (uptr)addr, kSizeLog1); } void __tsan_write2_pc(void *addr, void *pc) { - MemoryWrite(cur_thread(), (uptr)pc, (uptr)addr, kSizeLog2); + MemoryWrite(cur_thread(), STRIP_PC(pc), (uptr)addr, kSizeLog2); } void __tsan_write4_pc(void *addr, void *pc) { - MemoryWrite(cur_thread(), (uptr)pc, (uptr)addr, kSizeLog4); + MemoryWrite(cur_thread(), STRIP_PC(pc), (uptr)addr, kSizeLog4); } void __tsan_write8_pc(void *addr, void *pc) { - MemoryWrite(cur_thread(), (uptr)pc, (uptr)addr, kSizeLog8); + MemoryWrite(cur_thread(), STRIP_PC(pc), (uptr)addr, kSizeLog8); } void __tsan_vptr_update(void **vptr_p, void *new_val) { @@ -100,7 +101,7 @@ void __tsan_vptr_read(void **vptr_p) { } void 
__tsan_func_entry(void *pc) { - FuncEntry(cur_thread(), (uptr)pc); + FuncEntry(cur_thread(), STRIP_PC(pc)); } void __tsan_func_exit() { @@ -124,9 +125,9 @@ void __tsan_write_range(void *addr, uptr size) { } void __tsan_read_range_pc(void *addr, uptr size, void *pc) { - MemoryAccessRange(cur_thread(), (uptr)pc, (uptr)addr, size, false); + MemoryAccessRange(cur_thread(), STRIP_PC(pc), (uptr)addr, size, false); } void __tsan_write_range_pc(void *addr, uptr size, void *pc) { - MemoryAccessRange(cur_thread(), (uptr)pc, (uptr)addr, size, true); + MemoryAccessRange(cur_thread(), STRIP_PC(pc), (uptr)addr, size, true); } diff --git a/compiler-rt/lib/xray/tests/CMakeLists.txt b/compiler-rt/lib/xray/tests/CMakeLists.txt index 96a9db1ef8777..a1fbccaeb6d26 100644 --- a/compiler-rt/lib/xray/tests/CMakeLists.txt +++ b/compiler-rt/lib/xray/tests/CMakeLists.txt @@ -55,7 +55,7 @@ set(XRAY_UNITTEST_LINK_FLAGS if (NOT APPLE) # Needed by LLVMSupport. append_list_if( - LLVM_ENABLE_TERMINFO + COMPILER_RT_HAS_TERMINFO -l${COMPILER_RT_TERMINFO_LIB} XRAY_UNITTEST_LINK_FLAGS) if (COMPILER_RT_STANDALONE_BUILD) diff --git a/compiler-rt/test/asan/TestCases/Posix/high-address-dereference.c b/compiler-rt/test/asan/TestCases/Posix/high-address-dereference.c index 78503302891b5..845e126d3f89b 100644 --- a/compiler-rt/test/asan/TestCases/Posix/high-address-dereference.c +++ b/compiler-rt/test/asan/TestCases/Posix/high-address-dereference.c @@ -25,8 +25,8 @@ int main(int argc, const char *argv[]) { // ZERO: SEGV on unknown address 0x000000000000 (pc // LOW1: SEGV on unknown address 0x000000000fff (pc // LOW2: SEGV on unknown address 0x000000001000 (pc -// HIGH: SEGV on unknown address (pc -// MAX: SEGV on unknown address (pc +// HIGH: {{BUS|SEGV}} on unknown address (pc +// MAX: {{BUS|SEGV}} on unknown address (pc // HINT-PAGE0-NOT: Hint: this fault was caused by a dereference of a high value address // HINT-PAGE0: Hint: address points to the zero page. 
@@ -40,8 +40,8 @@ int main(int argc, const char *argv[]) { // ZERO: SCARINESS: 10 (null-deref) // LOW1: SCARINESS: 10 (null-deref) // LOW2: SCARINESS: 20 (wild-addr-read) -// HIGH: SCARINESS: 20 (wild-addr-read) -// MAX: SCARINESS: 20 (wild-addr-read) +// HIGH: SCARINESS: {{(20 \(wild-addr-read\))|(60 \(wild-jump\))}} +// MAX: SCARINESS: {{(20 \(wild-addr-read\))|(60 \(wild-jump\))}} // TODO: Currently, register values are only printed on Mac. Once this changes, // remove the 'TODO_' prefix in the following lines. diff --git a/compiler-rt/test/asan/Unit/lit.site.cfg.py.in b/compiler-rt/test/asan/Unit/lit.site.cfg.py.in index d1fd640e7385a..aae5078affadb 100644 --- a/compiler-rt/test/asan/Unit/lit.site.cfg.py.in +++ b/compiler-rt/test/asan/Unit/lit.site.cfg.py.in @@ -16,6 +16,15 @@ def push_ld_library_path(config, new_path): (new_path, config.environment.get('LD_32_LIBRARY_PATH', ''))) config.environment['LD_32_LIBRARY_PATH'] = new_ld_32_library_path + if platform.system() == 'SunOS': + new_ld_library_path_32 = os.path.pathsep.join( + (new_path, config.environment.get('LD_LIBRARY_PATH_32', ''))) + config.environment['LD_LIBRARY_PATH_32'] = new_ld_library_path_32 + + new_ld_library_path_64 = os.path.pathsep.join( + (new_path, config.environment.get('LD_LIBRARY_PATH_64', ''))) + config.environment['LD_LIBRARY_PATH_64'] = new_ld_library_path_64 + # Setup config name. 
config.name = 'AddressSanitizer-Unit' diff --git a/compiler-rt/test/asan/lit.cfg.py b/compiler-rt/test/asan/lit.cfg.py index 63c02f7ddeeb0..16a73d65b2b1a 100644 --- a/compiler-rt/test/asan/lit.cfg.py +++ b/compiler-rt/test/asan/lit.cfg.py @@ -42,6 +42,17 @@ def push_dynamic_library_lookup_path(config, new_path): (new_path, config.environment.get(dynamic_library_lookup_var, ''))) config.environment[dynamic_library_lookup_var] = new_ld_32_library_path + if platform.system() == 'SunOS': + dynamic_library_lookup_var = 'LD_LIBRARY_PATH_32' + new_ld_library_path_32 = os.path.pathsep.join( + (new_path, config.environment.get(dynamic_library_lookup_var, ''))) + config.environment[dynamic_library_lookup_var] = new_ld_library_path_32 + + dynamic_library_lookup_var = 'LD_LIBRARY_PATH_64' + new_ld_library_path_64 = os.path.pathsep.join( + (new_path, config.environment.get(dynamic_library_lookup_var, ''))) + config.environment[dynamic_library_lookup_var] = new_ld_library_path_64 + # Setup config name. 
config.name = 'AddressSanitizer' + config.name_suffix diff --git a/compiler-rt/test/builtins/Unit/divdf3_test.c b/compiler-rt/test/builtins/Unit/divdf3_test.c index 3bfde95c32980..8d6503909060f 100644 --- a/compiler-rt/test/builtins/Unit/divdf3_test.c +++ b/compiler-rt/test/builtins/Unit/divdf3_test.c @@ -24,11 +24,72 @@ int test__divdf3(double a, double b, uint64_t expected) int main() { + // Returned NaNs are assumed to be qNaN by default + + // qNaN / any = qNaN + if (test__divdf3(makeQNaN64(), 3., UINT64_C(0x7ff8000000000000))) + return 1; + // NaN / any = NaN + if (test__divdf3(makeNaN64(UINT64_C(0x123)), 3., UINT64_C(0x7ff8000000000000))) + return 1; + // any / qNaN = qNaN + if (test__divdf3(3., makeQNaN64(), UINT64_C(0x7ff8000000000000))) + return 1; + // any / NaN = NaN + if (test__divdf3(3., makeNaN64(UINT64_C(0x123)), UINT64_C(0x7ff8000000000000))) + return 1; + + // +Inf / positive = +Inf + if (test__divdf3(makeInf64(), 3., UINT64_C(0x7ff0000000000000))) + return 1; + // +Inf / negative = -Inf + if (test__divdf3(makeInf64(), -3., UINT64_C(0xfff0000000000000))) + return 1; + // -Inf / positive = -Inf + if (test__divdf3(makeNegativeInf64(), 3., UINT64_C(0xfff0000000000000))) + return 1; + // -Inf / negative = +Inf + if (test__divdf3(makeNegativeInf64(), -3., UINT64_C(0x7ff0000000000000))) + return 1; + + // Inf / Inf = NaN + if (test__divdf3(makeInf64(), makeInf64(), UINT64_C(0x7ff8000000000000))) + return 1; + // 0.0 / 0.0 = NaN + if (test__divdf3(+0x0.0p+0, +0x0.0p+0, UINT64_C(0x7ff8000000000000))) + return 1; + // +0.0 / +Inf = +0.0 + if (test__divdf3(+0x0.0p+0, makeInf64(), UINT64_C(0x0))) + return 1; + // +Inf / +0.0 = +Inf + if (test__divdf3(makeInf64(), +0x0.0p+0, UINT64_C(0x7ff0000000000000))) + return 1; + + // positive / +0.0 = +Inf + if (test__divdf3(+1.0, +0x0.0p+0, UINT64_C(0x7ff0000000000000))) + return 1; + // positive / -0.0 = -Inf + if (test__divdf3(+1.0, -0x0.0p+0, UINT64_C(0xfff0000000000000))) + return 1; + // negative / +0.0 = -Inf + 
if (test__divdf3(-1.0, +0x0.0p+0, UINT64_C(0xfff0000000000000))) + return 1; + // negative / -0.0 = +Inf + if (test__divdf3(-1.0, -0x0.0p+0, UINT64_C(0x7ff0000000000000))) + return 1; + // 1/3 - if (test__divdf3(1., 3., 0x3fd5555555555555ULL)) + if (test__divdf3(1., 3., UINT64_C(0x3fd5555555555555))) return 1; // smallest normal result - if (test__divdf3(4.450147717014403e-308, 2., 0x10000000000000ULL)) + if (test__divdf3(0x1.0p-1021, 2., UINT64_C(0x10000000000000))) + return 1; + + // divisor is exactly 1.0 + if (test__divdf3(0x1.0p+0, 0x1.0p+0, UINT64_C(0x3ff0000000000000))) + return 1; + // divisor is truncated to exactly 1.0 in UQ1.31 + if (test__divdf3(0x1.0p+0, 0x1.00000001p+0, UINT64_C(0x3fefffffffe00000))) return 1; return 0; diff --git a/compiler-rt/test/builtins/Unit/divsf3_test.c b/compiler-rt/test/builtins/Unit/divsf3_test.c index e352b2284cd78..197aad729f8f9 100644 --- a/compiler-rt/test/builtins/Unit/divsf3_test.c +++ b/compiler-rt/test/builtins/Unit/divsf3_test.c @@ -24,11 +24,72 @@ int test__divsf3(float a, float b, uint32_t expected) int main() { + // Returned NaNs are assumed to be qNaN by default + + // qNaN / any = qNaN + if (test__divsf3(makeQNaN32(), 3.F, UINT32_C(0x7fc00000))) + return 1; + // NaN / any = NaN + if (test__divsf3(makeNaN32(UINT32_C(0x123)), 3.F, UINT32_C(0x7fc00000))) + return 1; + // any / qNaN = qNaN + if (test__divsf3(3.F, makeQNaN32(), UINT32_C(0x7fc00000))) + return 1; + // any / NaN = NaN + if (test__divsf3(3.F, makeNaN32(UINT32_C(0x123)), UINT32_C(0x7fc00000))) + return 1; + + // +Inf / positive = +Inf + if (test__divsf3(makeInf32(), 3.F, UINT32_C(0x7f800000))) + return 1; + // +Inf / negative = -Inf + if (test__divsf3(makeInf32(), -3.F, UINT32_C(0xff800000))) + return 1; + // -Inf / positive = -Inf + if (test__divsf3(makeNegativeInf32(), 3.F, UINT32_C(0xff800000))) + return 1; + // -Inf / negative = +Inf + if (test__divsf3(makeNegativeInf32(), -3.F, UINT32_C(0x7f800000))) + return 1; + + // Inf / Inf = NaN + if 
(test__divsf3(makeInf32(), makeInf32(), UINT32_C(0x7fc00000))) + return 1; + // 0.0 / 0.0 = NaN + if (test__divsf3(+0x0.0p+0F, +0x0.0p+0F, UINT32_C(0x7fc00000))) + return 1; + // +0.0 / +Inf = +0.0 + if (test__divsf3(+0x0.0p+0F, makeInf32(), UINT32_C(0x0))) + return 1; + // +Inf / +0.0 = +Inf + if (test__divsf3(makeInf32(), +0x0.0p+0F, UINT32_C(0x7f800000))) + return 1; + + // positive / +0.0 = +Inf + if (test__divsf3(+1.F, +0x0.0p+0F, UINT32_C(0x7f800000))) + return 1; + // positive / -0.0 = -Inf + if (test__divsf3(+1.F, -0x0.0p+0F, UINT32_C(0xff800000))) + return 1; + // negative / +0.0 = -Inf + if (test__divsf3(-1.F, +0x0.0p+0F, UINT32_C(0xff800000))) + return 1; + // negative / -0.0 = +Inf + if (test__divsf3(-1.F, -0x0.0p+0F, UINT32_C(0x7f800000))) + return 1; + // 1/3 - if (test__divsf3(1.f, 3.f, 0x3EAAAAABU)) + if (test__divsf3(1.F, 3.F, UINT32_C(0x3eaaaaab))) return 1; // smallest normal result - if (test__divsf3(2.3509887e-38, 2., 0x00800000U)) + if (test__divsf3(0x1.0p-125F, 2.F, UINT32_C(0x00800000))) + return 1; + + // divisor is exactly 1.0 + if (test__divsf3(0x1.0p+0F, 0x1.0p+0F, UINT32_C(0x3f800000))) + return 1; + // divisor is truncated to exactly 1.0 in UQ1.15 + if (test__divsf3(0x1.0p+0F, 0x1.0001p+0F, UINT32_C(0x3f7fff00))) return 1; return 0; diff --git a/compiler-rt/test/builtins/Unit/divtf3_test.c b/compiler-rt/test/builtins/Unit/divtf3_test.c index 41d2809a7d8f0..b0bba02ab9587 100644 --- a/compiler-rt/test/builtins/Unit/divtf3_test.c +++ b/compiler-rt/test/builtins/Unit/divtf3_test.c @@ -32,6 +32,8 @@ char assumption_1[sizeof(long double) * CHAR_BIT == 128] = {0}; int main() { #if __LDBL_MANT_DIG__ == 113 + // Returned NaNs are assumed to be qNaN by default + // qNaN / any = qNaN if (test__divtf3(makeQNaN128(), 0x1.23456789abcdefp+5L, @@ -39,17 +41,111 @@ int main() UINT64_C(0x0))) return 1; // NaN / any = NaN - if (test__divtf3(makeNaN128(UINT64_C(0x800030000000)), + if (test__divtf3(makeNaN128(UINT64_C(0x30000000)), 0x1.23456789abcdefp+5L, 
UINT64_C(0x7fff800000000000), UINT64_C(0x0))) return 1; - // inf / any = inf - if (test__divtf3(makeInf128(), - 0x1.23456789abcdefp+5L, + // any / qNaN = qNaN + if (test__divtf3(0x1.23456789abcdefp+5L, + makeQNaN128(), + UINT64_C(0x7fff800000000000), + UINT64_C(0x0))) + return 1; + // any / NaN = NaN + if (test__divtf3(0x1.23456789abcdefp+5L, + makeNaN128(UINT64_C(0x30000000)), + UINT64_C(0x7fff800000000000), + UINT64_C(0x0))) + return 1; + + // +Inf / positive = +Inf + if (test__divtf3(makeInf128(), 3.L, UINT64_C(0x7fff000000000000), UINT64_C(0x0))) return 1; + // +Inf / negative = -Inf + if (test__divtf3(makeInf128(), -3.L, + UINT64_C(0xffff000000000000), + UINT64_C(0x0))) + return 1; + // -Inf / positive = -Inf + if (test__divtf3(makeNegativeInf128(), 3.L, + UINT64_C(0xffff000000000000), + UINT64_C(0x0))) + return 1; + // -Inf / negative = +Inf + if (test__divtf3(makeNegativeInf128(), -3.L, + UINT64_C(0x7fff000000000000), + UINT64_C(0x0))) + return 1; + + // Inf / Inf = NaN + if (test__divtf3(makeInf128(), makeInf128(), + UINT64_C(0x7fff800000000000), + UINT64_C(0x0))) + return 1; + // 0.0 / 0.0 = NaN + if (test__divtf3(+0x0.0p+0L, +0x0.0p+0L, + UINT64_C(0x7fff800000000000), + UINT64_C(0x0))) + return 1; + // +0.0 / +Inf = +0.0 + if (test__divtf3(+0x0.0p+0L, makeInf128(), + UINT64_C(0x0), + UINT64_C(0x0))) + return 1; + // +Inf / +0.0 = +Inf + if (test__divtf3(makeInf128(), +0x0.0p+0L, + UINT64_C(0x7fff000000000000), + UINT64_C(0x0))) + return 1; + + // positive / +0.0 = +Inf + if (test__divtf3(+1.0L, +0x0.0p+0L, + UINT64_C(0x7fff000000000000), + UINT64_C(0x0))) + return 1; + // positive / -0.0 = -Inf + if (test__divtf3(+1.0L, -0x0.0p+0L, + UINT64_C(0xffff000000000000), + UINT64_C(0x0))) + return 1; + // negative / +0.0 = -Inf + if (test__divtf3(-1.0L, +0x0.0p+0L, + UINT64_C(0xffff000000000000), + UINT64_C(0x0))) + return 1; + // negative / -0.0 = +Inf + if (test__divtf3(-1.0L, -0x0.0p+0L, + UINT64_C(0x7fff000000000000), + UINT64_C(0x0))) + return 1; + + // 1/3 
+ if (test__divtf3(1.L, 3.L, + UINT64_C(0x3ffd555555555555), + UINT64_C(0x5555555555555555))) + return 1; + // smallest normal result + if (test__divtf3(0x1.0p-16381L, 2.L, + UINT64_C(0x0001000000000000), + UINT64_C(0x0))) + return 1; + + // divisor is exactly 1.0 + if (test__divtf3(0x1.0p+0L, + 0x1.0p+0L, + UINT64_C(0x3fff000000000000), + UINT64_C(0x0))) + return 1; + // divisor is truncated to exactly 1.0 in UQ1.63 + if (test__divtf3(0x1.0p+0L, + 0x1.0000000000000001p+0L, + UINT64_C(0x3ffeffffffffffff), + UINT64_C(0xfffe000000000000))) + return 1; + // any / any if (test__divtf3(0x1.a23b45362464523375893ab4cdefp+5L, 0x1.eedcbaba3a94546558237654321fp-1L, diff --git a/compiler-rt/test/builtins/Unit/fp_test.h b/compiler-rt/test/builtins/Unit/fp_test.h index cef5e99be4d16..59d4ae5cf9db2 100644 --- a/compiler-rt/test/builtins/Unit/fp_test.h +++ b/compiler-rt/test/builtins/Unit/fp_test.h @@ -253,14 +253,29 @@ static inline float makeInf32(void) return fromRep32(0x7f800000U); } +static inline float makeNegativeInf32(void) +{ + return fromRep32(0xff800000U); +} + static inline double makeInf64(void) { return fromRep64(0x7ff0000000000000UL); } +static inline double makeNegativeInf64(void) +{ + return fromRep64(0xfff0000000000000UL); +} + #if __LDBL_MANT_DIG__ == 113 static inline long double makeInf128(void) { return fromRep128(0x7fff000000000000UL, 0x0UL); } + +static inline long double makeNegativeInf128(void) +{ + return fromRep128(0xffff000000000000UL, 0x0UL); +} #endif diff --git a/compiler-rt/test/fuzzer/CrossOverTest.cpp b/compiler-rt/test/fuzzer/CrossOverTest.cpp index a7643570a92b2..b4506f665dc76 100644 --- a/compiler-rt/test/fuzzer/CrossOverTest.cpp +++ b/compiler-rt/test/fuzzer/CrossOverTest.cpp @@ -4,10 +4,11 @@ // Test for a fuzzer. 
The fuzzer must find the string // ABCDEFGHIJ -// We use it as a test for CrossOver functionality -// by passing two inputs to it: -// ABCDE00000 -// ZZZZZFGHIJ +// We use it as a test for each of CrossOver functionalities +// by passing the following sets of two inputs to it: +// {ABCDE00000, ZZZZZFGHIJ} +// {ABCDEHIJ, ZFG} to specifically test InsertPartOf +// {ABCDE00HIJ, ZFG} to specifically test CopyPartOf // #include #include @@ -42,13 +43,11 @@ static const uint32_t ExpectedHash = 0xe1677acb; extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) { // fprintf(stderr, "ExpectedHash: %x\n", ExpectedHash); - if (Size != 10) return 0; + if (Size == 10 && ExpectedHash == simple_hash(Data, Size)) + *NullPtr = 0; if (*Data == 'A') Sink++; if (*Data == 'Z') Sink--; - if (ExpectedHash == simple_hash(Data, Size)) - *NullPtr = 0; return 0; } - diff --git a/compiler-rt/test/fuzzer/LoadTest.cpp b/compiler-rt/test/fuzzer/LoadTest.cpp index 9cf101542cb49..2b58c4efcf1bb 100644 --- a/compiler-rt/test/fuzzer/LoadTest.cpp +++ b/compiler-rt/test/fuzzer/LoadTest.cpp @@ -9,15 +9,16 @@ #include #include -static volatile int Sink; -const int kArraySize = 1234567; -int array[kArraySize]; +static volatile uint8_t Sink; +const int kArraySize = 32505854; // 0x01effffe +uint8_t array[kArraySize]; extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) { - if (Size < 8) return 0; + if (Size != 8) + return 0; uint64_t a = 0; - memcpy(&a, Data, 8); + memcpy(&a, Data, sizeof(a)); + a &= 0x1fffffff; Sink = array[a % (kArraySize + 1)]; return 0; } - diff --git a/compiler-rt/test/fuzzer/SimpleCmpTest.cpp b/compiler-rt/test/fuzzer/SimpleCmpTest.cpp index 5768493d8ef4d..0876c9229cc28 100644 --- a/compiler-rt/test/fuzzer/SimpleCmpTest.cpp +++ b/compiler-rt/test/fuzzer/SimpleCmpTest.cpp @@ -18,20 +18,21 @@ bool PrintOnce(int Line) { } extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) { - if (Size != 24) return 0; + if (Size != 21) + return 0; 
uint64_t x = 0; int64_t y = 0; int32_t z = 0; - uint32_t a = 0; + uint8_t a = 0; memcpy(&x, Data, 8); // 8 memcpy(&y, Data + 8, 8); // 16 memcpy(&z, Data + 16, sizeof(z)); // 20 - memcpy(&a, Data + 20, sizeof(a)); // 24 + memcpy(&a, Data + 20, sizeof(a)); // 21 const bool k32bit = sizeof(void*) == 4; if ((k32bit || x > 1234567890) && PrintOnce(__LINE__) && (k32bit || x < 1234567895) && PrintOnce(__LINE__) && - a == 0x4242 && PrintOnce(__LINE__) && + a == 0x42 && PrintOnce(__LINE__) && (k32bit || y >= 987654321) && PrintOnce(__LINE__) && (k32bit || y <= 987654325) && PrintOnce(__LINE__) && z < -10000 && PrintOnce(__LINE__) && diff --git a/compiler-rt/test/fuzzer/cross_over.test b/compiler-rt/test/fuzzer/cross_over.test index 058b5eb2c85cd..64e06e8cd3667 100644 --- a/compiler-rt/test/fuzzer/cross_over.test +++ b/compiler-rt/test/fuzzer/cross_over.test @@ -12,7 +12,7 @@ RUN: echo -n ABCDE00000 > %t-corpus/A RUN: echo -n ZZZZZFGHIJ > %t-corpus/B -RUN: not %run %t-CrossOverTest -max_len=10 -seed=1 -runs=10000000 %t-corpus +RUN: not %run %t-CrossOverTest -max_len=10 -reduce_inputs=0 -seed=1 -runs=10000000 %t-corpus # Test the same thing but using -seed_inputs instead of passing the corpus dir. -RUN: not %run %t-CrossOverTest -max_len=10 -seed=1 -runs=10000000 -seed_inputs=%t-corpus/A,%t-corpus/B +RUN: not %run %t-CrossOverTest -max_len=10 -reduce_inputs=0 -seed=1 -runs=10000000 -seed_inputs=%t-corpus/A,%t-corpus/B diff --git a/compiler-rt/test/fuzzer/cross_over_copy.test b/compiler-rt/test/fuzzer/cross_over_copy.test new file mode 100644 index 0000000000000..24b2f9b3b1132 --- /dev/null +++ b/compiler-rt/test/fuzzer/cross_over_copy.test @@ -0,0 +1,20 @@ +# Tests CrossOver CopyPartOf. 
+# We want to make sure that the test can find the input +# ABCDEFGHIJ when given two other inputs in the seed corpus: +# ABCDE00HIJ and +# (Z) FG +# +RUN: %cpp_compiler %S/CrossOverTest.cpp -o %t-CrossOverTest + +RUN: rm -rf %t-corpus +RUN: mkdir %t-corpus +RUN: echo -n ABCDE00HIJ > %t-corpus/A +RUN: echo -n ZFG > %t-corpus/B + + +RUN: not %run %t-CrossOverTest -mutate_depth=1 -max_len=1024 -reduce_inputs=0 -seed=1 -runs=10000000 %t-corpus 2>&1 | FileCheck %s + +# Test the same thing but using -seed_inputs instead of passing the corpus dir. +RUN: not %run %t-CrossOverTest -mutate_depth=1 -max_len=1024 -reduce_inputs=0 -seed=1 -runs=10000000 -seed_inputs=%t-corpus/A,%t-corpus/B 2>&1 | FileCheck %s + +CHECK: MS: 1 CrossOver- diff --git a/compiler-rt/test/fuzzer/cross_over_insert.test b/compiler-rt/test/fuzzer/cross_over_insert.test new file mode 100644 index 0000000000000..cb7d4fab81ef7 --- /dev/null +++ b/compiler-rt/test/fuzzer/cross_over_insert.test @@ -0,0 +1,20 @@ +# Tests CrossOver InsertPartOf. +# We want to make sure that the test can find the input +# ABCDEFGHIJ when given two other inputs in the seed corpus: +# ABCDE HIJ and +# (Z) FG +# +RUN: %cpp_compiler %S/CrossOverTest.cpp -o %t-CrossOverTest + +RUN: rm -rf %t-corpus +RUN: mkdir %t-corpus +RUN: echo -n ABCDEHIJ > %t-corpus/A +RUN: echo -n ZFG > %t-corpus/B + + +RUN: not %run %t-CrossOverTest -mutate_depth=1 -max_len=1024 -reduce_inputs=0 -seed=1 -runs=10000000 %t-corpus 2>&1 | FileCheck %s + +# Test the same thing but using -seed_inputs instead of passing the corpus dir. 
+RUN: not %run %t-CrossOverTest -mutate_depth=1 -max_len=1024 -reduce_inputs=0 -seed=1 -runs=10000000 -seed_inputs=%t-corpus/A,%t-corpus/B 2>&1 | FileCheck %s + +CHECK: MS: 1 CrossOver- diff --git a/compiler-rt/test/fuzzer/fuzzer-custommutator.test b/compiler-rt/test/fuzzer/fuzzer-custommutator.test index 87e69a0d8cf3a..25f5fe697b43f 100644 --- a/compiler-rt/test/fuzzer/fuzzer-custommutator.test +++ b/compiler-rt/test/fuzzer/fuzzer-custommutator.test @@ -6,7 +6,7 @@ LLVMFuzzerCustomMutator: {{.*}} lim: 4096 {{.*}} LLVMFuzzerCustomMutator: BINGO # len_control is disabled for custom mutators by default, test that it can be enabled. -RUN: not %run %t-CustomMutatorTest -len_control=100 2>&1 | FileCheck %s --check-prefix=LLVMFuzzerCustomMutatorWithLenControl +RUN: not %run %t-CustomMutatorTest -len_control=1000 2>&1 | FileCheck %s --check-prefix=LLVMFuzzerCustomMutatorWithLenControl LLVMFuzzerCustomMutatorWithLenControl: INFO: found LLVMFuzzerCustomMutator LLVMFuzzerCustomMutatorWithLenControl: In LLVMFuzzerCustomMutator LLVMFuzzerCustomMutatorWithLenControl: {{.*}} lim: {{[1-9][0-9]?}} {{.*}} diff --git a/compiler-rt/test/fuzzer/fuzzer-dirs.test b/compiler-rt/test/fuzzer/fuzzer-dirs.test index 3c742b52da4c5..2bf2a8b143300 100644 --- a/compiler-rt/test/fuzzer/fuzzer-dirs.test +++ b/compiler-rt/test/fuzzer/fuzzer-dirs.test @@ -16,6 +16,10 @@ RUN: %run %t-SimpleTest %t/SUB1 -runs=0 2>&1 | FileCheck %s --check-prefix=LONG LONG: INFO: -max_len is not provided; libFuzzer will not generate inputs larger than 8192 bytes RUN: rm -rf %t/SUB1 -RUN: not %run %t-SimpleTest NONEXISTENT_DIR 2>&1 | FileCheck %s --check-prefix=NONEXISTENT_DIR -NONEXISTENT_DIR: No such file or directory: NONEXISTENT_DIR; exiting - +RUN: rm -rf %t.dir && mkdir -p %t.dir +RUN: not %run %t-SimpleTest -artifact_prefix=%t.dir/NONEXISTENT_DIR/ 2>&1 | FileCheck %s --check-prefix=NONEXISTENT_DIR_RGX +RUN: not %run %t-SimpleTest -artifact_prefix=%t.dir/NONEXISTENT_DIR/myprefix 2>&1 | FileCheck %s 
--check-prefix=NONEXISTENT_DIR_RGX +RUN: not %run %t-SimpleTest -features_dir=%t.dir/NONEXISTENT_DIR/ 2>&1 | FileCheck %s --check-prefix=NONEXISTENT_DIR_RGX +RUN: not %run %t-SimpleTest %t.dir/NONEXISTENT_DIR 2>&1 | FileCheck %s --check-prefix=NONEXISTENT_DIR_RGX +RUN: not %run %t-SimpleTest -exact_artifact_path=%t.dir/NONEXISTENT_DIR/myprefix 2>&1 | FileCheck %s --check-prefix=NONEXISTENT_DIR_RGX +NONEXISTENT_DIR_RGX: ERROR: The required directory "{{.*/NONEXISTENT_DIR/?}}" does not exist diff --git a/compiler-rt/test/fuzzer/msan.test b/compiler-rt/test/fuzzer/msan.test index ae1c449878657..2e0339bb8ff7b 100644 --- a/compiler-rt/test/fuzzer/msan.test +++ b/compiler-rt/test/fuzzer/msan.test @@ -1,5 +1,3 @@ -FIXME: Fails to find BINGO on s390x. -XFAIL: s390x REQUIRES: msan RUN: %msan_compiler %S/SimpleTestStdio.cpp -o %t RUN: not %run %t -seed=1 -runs=10000000 2>&1 | FileCheck %s --check-prefix=NO-REPORT diff --git a/compiler-rt/test/fuzzer/value-profile-load.test b/compiler-rt/test/fuzzer/value-profile-load.test index 607b81cd527fe..bf51e7f56c9ee 100644 --- a/compiler-rt/test/fuzzer/value-profile-load.test +++ b/compiler-rt/test/fuzzer/value-profile-load.test @@ -1,3 +1,3 @@ CHECK: AddressSanitizer: global-buffer-overflow RUN: %cpp_compiler %S/LoadTest.cpp -fsanitize-coverage=trace-gep -o %t-LoadTest -RUN: not %run %t-LoadTest -seed=2 -use_cmp=0 -use_value_profile=1 -runs=20000000 2>&1 | FileCheck %s +RUN: not %run %t-LoadTest -seed=1 -use_cmp=0 -use_value_profile=1 -runs=20000000 2>&1 | FileCheck %s diff --git a/compiler-rt/test/msan/Linux/swapcontext_annotation.cpp b/compiler-rt/test/msan/Linux/swapcontext_annotation.cpp new file mode 100644 index 0000000000000..16bdd28fdec91 --- /dev/null +++ b/compiler-rt/test/msan/Linux/swapcontext_annotation.cpp @@ -0,0 +1,68 @@ +// RUN: %clangxx_msan -O0 %s -o %t && %run %t + +#include +#include +#include +#include +#include + +#include + +namespace { + +const int kStackSize = 1 << 20; +char fiber_stack[kStackSize] = {}; + 
+ucontext_t main_ctx; +ucontext_t fiber_ctx; + +void fiber() { + printf("%s: entering fiber\n", __FUNCTION__); + + // This fiber was switched into from main. Verify the details of main's stack + // have been populated by MSAN. + const void *previous_stack_bottom = nullptr; + size_t previous_stack_size = 0; + __msan_finish_switch_fiber(&previous_stack_bottom, &previous_stack_size); + assert(previous_stack_bottom != nullptr); + assert(previous_stack_size != 0); + + printf("%s: implicitly swapcontext to main\n", __FUNCTION__); + __msan_start_switch_fiber(previous_stack_bottom, previous_stack_size); +} + +} // namespace + +// Set up a fiber, switch to it, and switch back, invoking __msan_*_switch_fiber +// functions along the way. At each step, validate the correct stack addresses and +// sizes are returned from those functions. +int main(int argc, char **argv) { + if (getcontext(&fiber_ctx) == -1) { + perror("getcontext"); + _exit(1); + } + fiber_ctx.uc_stack.ss_sp = fiber_stack; + fiber_ctx.uc_stack.ss_size = sizeof(fiber_stack); + fiber_ctx.uc_link = &main_ctx; + makecontext(&fiber_ctx, fiber, 0); + + // Tell MSAN a fiber switch is about to occur, then perform the switch + printf("%s: swapcontext to fiber\n", __FUNCTION__); + __msan_start_switch_fiber(fiber_stack, kStackSize); + if (swapcontext(&main_ctx, &fiber_ctx) == -1) { + perror("swapcontext"); + _exit(1); + } + + // The fiber switched to above now switched back here. Tell MSAN that switch + // is complete and verify the fiber details returned by MSAN are correct. 
+ const void *previous_stack_bottom = nullptr; + size_t previous_stack_size = 0; + __msan_finish_switch_fiber(&previous_stack_bottom, &previous_stack_size); + assert(previous_stack_bottom == fiber_stack); + assert(previous_stack_size == kStackSize); + + printf("%s: exiting\n", __FUNCTION__); + + return 0; +} diff --git a/compiler-rt/test/msan/Linux/swapcontext_annotation_reset.cpp b/compiler-rt/test/msan/Linux/swapcontext_annotation_reset.cpp new file mode 100644 index 0000000000000..342ef735daccf --- /dev/null +++ b/compiler-rt/test/msan/Linux/swapcontext_annotation_reset.cpp @@ -0,0 +1,65 @@ +// RUN: %clangxx_msan -fno-sanitize=memory -c %s -o %t-main.o +// RUN: %clangxx_msan %t-main.o %s -o %t +// RUN: %run %t + +#include +#include +#include +#include +#include + +#include + +#if __has_feature(memory_sanitizer) + +__attribute__((noinline)) int bar(int a, int b) { + volatile int zero = 0; + return zero; +} + +void foo(int x, int y, int expected) { + assert(__msan_test_shadow(&x, sizeof(x)) == expected); + assert(__msan_test_shadow(&y, sizeof(y)) == expected); + + // Poisons parameter shadow in TLS so that the next call (to foo) from + // uninstrumented main has params 1 and 2 poisoned no matter what. + int a, b; + (void)bar(a, b); +} + +#else + +// This code is not instrumented by MemorySanitizer to prevent it from modifying +// MSAN TLS data for this test. + +int foo(int, int, int); + +int main(int argc, char **argv) { + int x, y; + // The parameters should _not_ be poisoned; this is the first call to foo. + foo(x, y, -1); + // The parameters should be poisoned; the prior call to foo left them so. + foo(x, y, 0); + + ucontext_t ctx; + if (getcontext(&ctx) == -1) { + perror("getcontext"); + _exit(1); + } + + // Simulate a fiber switch occurring from MSAN's perspective (though no switch + // actually occurs). 
+ const void *previous_stack_bottom = nullptr; + size_t previous_stack_size = 0; + __msan_start_switch_fiber(ctx.uc_stack.ss_sp, ctx.uc_stack.ss_size); + __msan_finish_switch_fiber(&previous_stack_bottom, &previous_stack_size); + + // The simulated fiber switch will reset the TLS parameter shadow. So even + // though the most recent call to foo left the parameter shadow poisoned, the + // parameters are _not_ expected to be poisoned now. + foo(x, y, -1); + + return 0; +} + +#endif diff --git a/compiler-rt/test/sanitizer_common/TestCases/Posix/crypt.cpp b/compiler-rt/test/sanitizer_common/TestCases/Posix/crypt.cpp index 17ab6965b20b8..3a8faaa1ae768 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Posix/crypt.cpp +++ b/compiler-rt/test/sanitizer_common/TestCases/Posix/crypt.cpp @@ -6,7 +6,9 @@ #include #include #include +#if __has_include() #include +#endif int main (int argc, char** argv) diff --git a/compiler-rt/test/tsan/Darwin/mach_vm_allocate.c b/compiler-rt/test/tsan/Darwin/mach_vm_allocate.c index 43703747ddf04..d101e3f3f8301 100644 --- a/compiler-rt/test/tsan/Darwin/mach_vm_allocate.c +++ b/compiler-rt/test/tsan/Darwin/mach_vm_allocate.c @@ -13,28 +13,28 @@ const mach_vm_size_t alloc_size = sizeof(int); static int *global_ptr; -static bool realloc_success = false; static int *alloc() { mach_vm_address_t addr; - kern_return_t res = + kern_return_t kr = mach_vm_allocate(mach_task_self(), &addr, alloc_size, VM_FLAGS_ANYWHERE); - assert(res == KERN_SUCCESS); + assert(kr == KERN_SUCCESS); return (int *)addr; } static void alloc_fixed(int *ptr) { mach_vm_address_t addr = (mach_vm_address_t)ptr; // Re-allocation via VM_FLAGS_FIXED sporadically fails. 
- kern_return_t res = + kern_return_t kr = mach_vm_allocate(mach_task_self(), &addr, alloc_size, VM_FLAGS_FIXED); - realloc_success = res == KERN_SUCCESS; + if (kr != KERN_SUCCESS) + global_ptr = NULL; } static void dealloc(int *ptr) { - kern_return_t res = + kern_return_t kr = mach_vm_deallocate(mach_task_self(), (mach_vm_address_t)ptr, alloc_size); - assert(res == KERN_SUCCESS); + assert(kr == KERN_SUCCESS); } static void *Thread(void *arg) { @@ -53,26 +53,30 @@ static void *Thread(void *arg) { return NULL; } -static void try_realloc_on_same_address() { +static bool try_realloc_on_same_address() { barrier_init(&barrier, 2); global_ptr = alloc(); pthread_t t; pthread_create(&t, NULL, Thread, NULL); barrier_wait(&barrier); - *global_ptr = 8; // Assignment 2 + if (global_ptr) + *global_ptr = 8; // Assignment 2 pthread_join(t, NULL); dealloc(global_ptr); + + return global_ptr != NULL; } int main(int argc, const char *argv[]) { + bool success; for (int i = 0; i < 10; i++) { - try_realloc_on_same_address(); - if (realloc_success) break; + success = try_realloc_on_same_address(); + if (success) break; } - if (!realloc_success) + if (!success) fprintf(stderr, "Unable to set up testing condition; silently pass test\n"); printf("Done.\n"); diff --git a/compiler-rt/test/ubsan/TestCases/Integer/unsigned-shift.cpp b/compiler-rt/test/ubsan/TestCases/Integer/unsigned-shift.cpp new file mode 100644 index 0000000000000..b49504e393acc --- /dev/null +++ b/compiler-rt/test/ubsan/TestCases/Integer/unsigned-shift.cpp @@ -0,0 +1,54 @@ +// RUN: %clangxx -fsanitize=unsigned-shift-base %s -o %t1 && not %run %t1 2>&1 | FileCheck %s +// RUN: %clangxx -fsanitize=unsigned-shift-base,shift-exponent %s -o %t1 && not %run %t1 2>&1 | FileCheck %s + +#define shift(val, amount) ({ \ + volatile unsigned _v = (val); \ + volatile unsigned _a = (amount); \ + unsigned res = _v << _a; \ + res; \ +}) + +int main() { + + shift(0b00000000'00000000'00000000'00000000, 31); + 
shift(0b00000000'00000000'00000000'00000001, 31); + shift(0b00000000'00000000'00000000'00000010, 31); // CHECK: unsigned-shift.cpp:[[@LINE]]:3: runtime error: left shift of 2 by 31 places cannot be represented in type 'unsigned int' + shift(0b00000000'00000000'00000000'00000100, 31); // CHECK: unsigned-shift.cpp:[[@LINE]]:3: runtime error: left shift of 4 by 31 places cannot be represented in type 'unsigned int' + shift(0b00000000'00000000'00000000'00001000, 31); // CHECK: unsigned-shift.cpp:[[@LINE]]:3: runtime error: left shift of 8 by 31 places cannot be represented in type 'unsigned int' + shift(0b00000000'00000000'00000000'00010000, 31); // CHECK: unsigned-shift.cpp:[[@LINE]]:3: runtime error: left shift of 16 by 31 places cannot be represented in type 'unsigned int' + shift(0b00000000'00000000'00000000'00100000, 31); // CHECK: unsigned-shift.cpp:[[@LINE]]:3: runtime error: left shift of 32 by 31 places cannot be represented in type 'unsigned int' + shift(0b00000000'00000000'00000000'01000000, 31); // CHECK: unsigned-shift.cpp:[[@LINE]]:3: runtime error: left shift of 64 by 31 places cannot be represented in type 'unsigned int' + shift(0b00000000'00000000'00000000'10000000, 31); // CHECK: unsigned-shift.cpp:[[@LINE]]:3: runtime error: left shift of 128 by 31 places cannot be represented in type 'unsigned int' + shift(0b00000000'00000000'00000001'00000000, 31); // CHECK: unsigned-shift.cpp:[[@LINE]]:3: runtime error: left shift of 256 by 31 places cannot be represented in type 'unsigned int' + shift(0b00000000'00000000'00000010'00000000, 31); // CHECK: unsigned-shift.cpp:[[@LINE]]:3: runtime error: left shift of 512 by 31 places cannot be represented in type 'unsigned int' + shift(0b00000000'00000000'00000100'00000000, 31); // CHECK: unsigned-shift.cpp:[[@LINE]]:3: runtime error: left shift of 1024 by 31 places cannot be represented in type 'unsigned int' + shift(0b00000000'00000000'00001000'00000000, 31); // CHECK: unsigned-shift.cpp:[[@LINE]]:3: runtime 
error: left shift of 2048 by 31 places cannot be represented in type 'unsigned int' + shift(0b00000000'00000000'00010000'00000000, 31); // CHECK: unsigned-shift.cpp:[[@LINE]]:3: runtime error: left shift of 4096 by 31 places cannot be represented in type 'unsigned int' + shift(0b00000000'00000000'00100000'00000000, 31); // CHECK: unsigned-shift.cpp:[[@LINE]]:3: runtime error: left shift of 8192 by 31 places cannot be represented in type 'unsigned int' + shift(0b00000000'00000000'01000000'00000000, 31); // CHECK: unsigned-shift.cpp:[[@LINE]]:3: runtime error: left shift of 16384 by 31 places cannot be represented in type 'unsigned int' + shift(0b00000000'00000000'10000000'00000000, 31); // CHECK: unsigned-shift.cpp:[[@LINE]]:3: runtime error: left shift of 32768 by 31 places cannot be represented in type 'unsigned int' + shift(0b00000000'00000001'00000000'00000000, 31); // CHECK: unsigned-shift.cpp:[[@LINE]]:3: runtime error: left shift of 65536 by 31 places cannot be represented in type 'unsigned int' + shift(0b00000000'00000010'00000000'00000000, 31); // CHECK: unsigned-shift.cpp:[[@LINE]]:3: runtime error: left shift of 131072 by 31 places cannot be represented in type 'unsigned int' + shift(0b00000000'00000100'00000000'00000000, 31); // CHECK: unsigned-shift.cpp:[[@LINE]]:3: runtime error: left shift of 262144 by 31 places cannot be represented in type 'unsigned int' + shift(0b00000000'00001000'00000000'00000000, 31); // CHECK: unsigned-shift.cpp:[[@LINE]]:3: runtime error: left shift of 524288 by 31 places cannot be represented in type 'unsigned int' + shift(0b00000000'00010000'00000000'00000000, 31); // CHECK: unsigned-shift.cpp:[[@LINE]]:3: runtime error: left shift of 1048576 by 31 places cannot be represented in type 'unsigned int' + shift(0b00000000'00100000'00000000'00000000, 31); // CHECK: unsigned-shift.cpp:[[@LINE]]:3: runtime error: left shift of 2097152 by 31 places cannot be represented in type 'unsigned int' + 
shift(0b00000000'01000000'00000000'00000000, 31); // CHECK: unsigned-shift.cpp:[[@LINE]]:3: runtime error: left shift of 4194304 by 31 places cannot be represented in type 'unsigned int' + shift(0b00000000'10000000'00000000'00000000, 31); // CHECK: unsigned-shift.cpp:[[@LINE]]:3: runtime error: left shift of 8388608 by 31 places cannot be represented in type 'unsigned int' + shift(0b00000001'00000000'00000000'00000000, 31); // CHECK: unsigned-shift.cpp:[[@LINE]]:3: runtime error: left shift of 16777216 by 31 places cannot be represented in type 'unsigned int' + shift(0b00000010'00000000'00000000'00000000, 31); // CHECK: unsigned-shift.cpp:[[@LINE]]:3: runtime error: left shift of 33554432 by 31 places cannot be represented in type 'unsigned int' + shift(0b00000100'00000000'00000000'00000000, 31); // CHECK: unsigned-shift.cpp:[[@LINE]]:3: runtime error: left shift of 67108864 by 31 places cannot be represented in type 'unsigned int' + shift(0b00001000'00000000'00000000'00000000, 31); // CHECK: unsigned-shift.cpp:[[@LINE]]:3: runtime error: left shift of 134217728 by 31 places cannot be represented in type 'unsigned int' + shift(0b00010000'00000000'00000000'00000000, 31); // CHECK: unsigned-shift.cpp:[[@LINE]]:3: runtime error: left shift of 268435456 by 31 places cannot be represented in type 'unsigned int' + shift(0b00100000'00000000'00000000'00000000, 31); // CHECK: unsigned-shift.cpp:[[@LINE]]:3: runtime error: left shift of 536870912 by 31 places cannot be represented in type 'unsigned int' + shift(0b01000000'00000000'00000000'00000000, 31); // CHECK: unsigned-shift.cpp:[[@LINE]]:3: runtime error: left shift of 1073741824 by 31 places cannot be represented in type 'unsigned int' + shift(0b10000000'00000000'00000000'00000000, 31); // CHECK: unsigned-shift.cpp:[[@LINE]]:3: runtime error: left shift of 2147483648 by 31 places cannot be represented in type 'unsigned int' + + shift(0b10000000'00000000'00000000'00000000, 00); + 
shift(0b10000000'00000000'00000000'00000000, 01); // CHECK: unsigned-shift.cpp:[[@LINE]]:3: runtime error: left shift of 2147483648 by 1 places cannot be represented in type 'unsigned int' + + shift(0xffff'ffff, 0); + shift(0xffff'ffff, 1); // CHECK: unsigned-shift.cpp:[[@LINE]]:3: runtime error: left shift of 4294967295 by 1 places cannot be represented in type 'unsigned int' + + return 1; +} diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt index 73c2db55e8f86..03440b72ec8ca 100644 --- a/flang/CMakeLists.txt +++ b/flang/CMakeLists.txt @@ -135,13 +135,7 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) if (FLANG_INCLUDE_TESTS) set(UNITTEST_DIR ${LLVM_BUILD_MAIN_SRC_DIR}/utils/unittest) if(EXISTS ${UNITTEST_DIR}/googletest/include/gtest/gtest.h) - if (TARGET gtest) - # LLVM Doesn't export gtest's include directorys, so do that here - set_target_properties(gtest - PROPERTIES INTERFACE_INCLUDE_DIRECTORIES - "${UNITTEST_DIR}/googletest/include;${UNITTEST_DIR}/googlemock/include" - ) - else() + if (NOT TARGET gtest) add_library(gtest ${UNITTEST_DIR}/googletest/src/gtest-all.cc ${UNITTEST_DIR}/googlemock/src/gmock-all.cc diff --git a/flang/README.md b/flang/README.md index 44573ae4b9b6b..fafc1f91a421f 100644 --- a/flang/README.md +++ b/flang/README.md @@ -33,6 +33,9 @@ read the [style guide](docs/C++style.md) and also review [how flang uses modern C++ features](docs/C++17.md). +If you are interested in writing new documentation, follow +[markdown style guide from LLVM](https://github.com/llvm/llvm-project/blob/master/llvm/docs/MarkdownQuickstartTemplate.md). + ## Supported C++ compilers Flang is written in C++17. 
@@ -216,3 +219,25 @@ It will generate html in /tools/flang/docs/doxygen/html # for flang docs ``` +## Generate Sphinx-based Documentation + +Flang documentation should preferably be written in `markdown(.md)` syntax (they can be in `reStructuredText(.rst)` format as well but markdown is recommended in first place), it +is mostly meant to be processed by the Sphinx documentation generation +system to create HTML pages which would be hosted on the webpage of flang and +updated periodically. + +If you would like to generate and view the HTML locally, install +Sphinx and then: + +- Pass `-DLLVM_ENABLE_SPHINX=ON -DSPHINX_WARNINGS_AS_ERRORS=OFF` to the cmake command. + +``` +cd ~/llvm-project/build +cmake -DLLVM_ENABLE_SPHINX=ON -DSPHINX_WARNINGS_AS_ERRORS=OFF ../llvm +make docs-flang-html + +It will generate html in + + $BROWSER /tools/flang/docs/html/ +``` diff --git a/flang/docs/conf.py b/flang/docs/conf.py index bbe37a68cc281..045d0a2c41678 100644 --- a/flang/docs/conf.py +++ b/flang/docs/conf.py @@ -21,7 +21,6 @@ # If your documentation needs a minimal Sphinx version, state it here. #needs_sphinx = '1.0' - # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = ['sphinx.ext.todo', 'sphinx.ext.mathjax', 'sphinx.ext.intersphinx'] @@ -30,13 +29,29 @@ templates_path = ['_templates'] # The suffix of source filenames. -source_suffix = '.rst' +source_suffix = { + '.rst': 'restructuredtext', +} +try: + import recommonmark +except ImportError: + # manpages do not use any .md sources + if not tags.has('builder-man'): + raise +else: + import sphinx + if sphinx.version_info >= (3, 0): + # This requires 0.5 or later. + extensions.append('recommonmark') + else: + source_parsers = {'.md': 'recommonmark.parser.CommonMarkParser'} + source_suffix['.md'] = 'markdown' # The encoding of source files. #source_encoding = 'utf-8-sig' # The master toctree document. 
-master_doc = 'ReleaseNotes' +master_doc = 'Overview' # General information about the project. project = u'Flang' @@ -196,7 +211,7 @@ # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass [howto/manual]). latex_documents = [ - ('ReleaseNotes', 'Flang.tex', u'Flang Documentation', + ('Overview', 'Flang.tex', u'Flang Documentation', u'The Flang Team', 'manual'), ] @@ -237,8 +252,8 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('ReleaseNotes', 'Flang', u'Flang Documentation', - u'The Flang Team', 'Flang', 'One line description of project.', + ('Overview', 'Flang', u'Flang Documentation', + u'The Flang Team', 'Flang', 'A Fortran front end for LLVM.', 'Miscellaneous'), ] diff --git a/flang/include/flang/Parser/characters.h b/flang/include/flang/Parser/characters.h index eefb524e5d879..120560625da29 100644 --- a/flang/include/flang/Parser/characters.h +++ b/flang/include/flang/Parser/characters.h @@ -151,6 +151,33 @@ inline constexpr std::optional BackslashEscapeChar(char ch) { } } +// Does not include spaces or line ending characters. 
+inline constexpr bool IsValidFortranTokenCharacter(char ch) { + switch (ch) { + case '"': + case '%': + case '\'': + case '(': + case ')': + case '*': + case '+': + case ',': + case '-': + case '.': + case '/': + case ':': + case ';': + case '<': + case '=': + case '>': + case '[': + case ']': + return true; + default: + return IsLegalIdentifierStart(ch) || IsDecimalDigit(ch); + } +} + struct EncodedCharacter { static constexpr int maxEncodingBytes{6}; char buffer[maxEncodingBytes]; diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h index e8c6244d7474f..41ff9631d1011 100644 --- a/flang/include/flang/Parser/dump-parse-tree.h +++ b/flang/include/flang/Parser/dump-parse-tree.h @@ -82,9 +82,12 @@ class ParseTreeDumper { NODE(parser, AccObjectListWithModifier) NODE(parser, AccObjectListWithReduction) NODE(parser, AccReductionOperator) + NODE_ENUM(parser::AccReductionOperator, Operator) NODE(parser, AccSizeExpr) NODE(parser, AccSizeExprList) NODE(parser, AccStandaloneDirective) + NODE(parser, AccTileExpr) + NODE(parser, AccTileExprList) NODE(parser, AccLoopDirective) NODE(parser, AccWaitArgument) static std::string GetNodeName(const llvm::acc::Directive &x) { diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index 49e91789fdce6..7f9984bc50481 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -547,7 +547,8 @@ struct ProgramUnit { std::variant, common::Indirection, common::Indirection, common::Indirection, - common::Indirection, common::Indirection> + common::Indirection, common::Indirection, + common::Indirection> u; }; @@ -3840,8 +3841,10 @@ struct AccObjectListWithModifier { // 2.5.13: + | * | max | min | iand | ior | ieor | .and. | .or. | .eqv. | .neqv. 
struct AccReductionOperator { - UNION_CLASS_BOILERPLATE(AccReductionOperator); - std::variant u; + ENUM_CLASS( + Operator, Plus, Multiply, Max, Min, Iand, Ior, Ieor, And, Or, Eqv, Neqv) + WRAPPER_CLASS_BOILERPLATE(AccReductionOperator, Operator); + CharBlock source; }; struct AccObjectListWithReduction { @@ -3854,6 +3857,16 @@ struct AccWaitArgument { std::tuple, std::list> t; }; +struct AccTileExpr { + TUPLE_CLASS_BOILERPLATE(AccTileExpr); + CharBlock source; + std::tuple> t; // if null then * +}; + +struct AccTileExprList { + WRAPPER_CLASS_BOILERPLATE(AccTileExprList, std::list); +}; + struct AccSizeExpr { TUPLE_CLASS_BOILERPLATE(AccSizeExpr); CharBlock source; diff --git a/flang/include/flang/Parser/tools.h b/flang/include/flang/Parser/tools.h index fa6ecd08a3186..66c8793399c93 100644 --- a/flang/include/flang/Parser/tools.h +++ b/flang/include/flang/Parser/tools.h @@ -74,6 +74,15 @@ struct UnwrapperHelper { } } + template + static const A *Unwrap(const UnlabeledStatement &x) { + return Unwrap(x.statement); + } + template + static const A *Unwrap(const Statement &x) { + return Unwrap(x.statement); + } + template static const A *Unwrap(B &x) { if constexpr (std::is_same_v, std::decay_t>) { return &x; diff --git a/flang/include/flang/Semantics/symbol.h b/flang/include/flang/Semantics/symbol.h index c0a50364b63db..15732e0c68378 100644 --- a/flang/include/flang/Semantics/symbol.h +++ b/flang/include/flang/Semantics/symbol.h @@ -386,6 +386,8 @@ class HostAssocDetails { public: HostAssocDetails(const Symbol &symbol) : symbol_{symbol} {} const Symbol &symbol() const { return symbol_; } + bool implicitOrSpecExprError{false}; + bool implicitOrExplicitTypeError{false}; private: SymbolRef symbol_; @@ -481,6 +483,7 @@ class Symbol { Subroutine, // symbol is a subroutine StmtFunction, // symbol is a statement function (Function is set too) Implicit, // symbol is implicitly typed + ImplicitOrError, // symbol must be implicitly typed or it's an error ModFile, // symbol came 
from .mod file ParentComp, // symbol is the "parent component" of an extended type CrayPointer, CrayPointee, @@ -488,14 +491,12 @@ class Symbol { LocalityLocalInit, // named in LOCAL_INIT locality-spec LocalityShared, // named in SHARED locality-spec InDataStmt, // initialized in a DATA statement - // OpenACC data-sharing attribute AccPrivate, AccFirstPrivate, AccShared, // OpenACC data-mapping attribute AccCopyIn, AccCopyOut, AccCreate, AccDelete, AccPresent, // OpenACC miscellaneous flags AccCommonBlock, AccThreadPrivate, AccReduction, AccNone, AccPreDetermined, - // OpenMP data-sharing attribute OmpShared, OmpPrivate, OmpLinear, OmpFirstPrivate, OmpLastPrivate, // OpenMP data-mapping attribute diff --git a/flang/include/flang/Semantics/tools.h b/flang/include/flang/Semantics/tools.h index f63e4ccbc687c..adc722c3847f7 100644 --- a/flang/include/flang/Semantics/tools.h +++ b/flang/include/flang/Semantics/tools.h @@ -100,6 +100,7 @@ bool HasIntrinsicTypeName(const Symbol &); bool IsSeparateModuleProcedureInterface(const Symbol *); bool IsAutomatic(const Symbol &); bool HasAlternateReturns(const Symbol &); +bool InCommonBlock(const Symbol &); // Return an ultimate component of type that matches predicate, or nullptr. 
const Symbol *FindUltimateComponent(const DerivedTypeSpec &type, diff --git a/flang/lib/Parser/openacc-parsers.cpp b/flang/lib/Parser/openacc-parsers.cpp index 4198ccbcc6acc..01c258325f2ce 100644 --- a/flang/lib/Parser/openacc-parsers.cpp +++ b/flang/lib/Parser/openacc-parsers.cpp @@ -102,7 +102,7 @@ TYPE_PARSER("AUTO" >> construct(construct()) || maybe(parenthesized(scalarLogicalExpr)))) || "SEQ" >> construct(construct()) || "TILE" >> construct(construct( - parenthesized(Parser{}))) || + parenthesized(Parser{}))) || "USE_DEVICE" >> construct(construct( parenthesized(Parser{}))) || "VECTOR_LENGTH" >> construct(construct( @@ -131,22 +131,40 @@ TYPE_PARSER(construct(maybe("DEVNUM:" >> scalarIntExpr / ":"), "QUEUES:" >> nonemptyList(scalarIntExpr) || nonemptyList(scalarIntExpr))) // 2.9 (1609) size-expr is one of: +// * (represented as an empty std::optional) // int-expr TYPE_PARSER(construct(scalarIntExpr) || - construct("*" >> maybe(scalarIntExpr))) + construct("*" >> construct>())) TYPE_PARSER(construct(nonemptyList(Parser{}))) -// 2.9 (1607) gang-arg is one of: -// [num:]int-expr -// static:size-expr -TYPE_PARSER(construct(maybe(scalarIntExpr), - maybe(","_tok / "STATIC:" >> Parser{})) || - construct(maybe("NUM:" >> scalarIntExpr), - maybe(","_tok / "STATIC:" >> Parser{}))) +// tile size is one of: +// * (represented as an empty std::optional) +// constant-int-expr +TYPE_PARSER(construct(scalarIntConstantExpr) || + construct( + "*" >> construct>())) +TYPE_PARSER(construct(nonemptyList(Parser{}))) + +// 2.9 (1607) gang-arg is: +// [[num:]int-expr][[,]static:size-expr] +TYPE_PARSER(construct( + maybe(("NUM:"_tok >> scalarIntExpr || scalarIntExpr)), + maybe(", STATIC:" >> Parser{}))) // 2.5.13 Reduction -TYPE_PARSER(construct(Parser{}) || - construct(Parser{})) +// Operator for reduction +TYPE_PARSER(sourced(construct( + first("+" >> pure(AccReductionOperator::Operator::Plus), + "*" >> pure(AccReductionOperator::Operator::Multiply), + "MAX" >> 
pure(AccReductionOperator::Operator::Max), + "MIN" >> pure(AccReductionOperator::Operator::Min), + "IAND" >> pure(AccReductionOperator::Operator::Iand), + "IOR" >> pure(AccReductionOperator::Operator::Ior), + "IEOR" >> pure(AccReductionOperator::Operator::Ieor), + ".AND." >> pure(AccReductionOperator::Operator::And), + ".OR." >> pure(AccReductionOperator::Operator::Or), + ".EQV." >> pure(AccReductionOperator::Operator::Eqv), + ".NEQV." >> pure(AccReductionOperator::Operator::Neqv))))) // 2.5.14 Default clause TYPE_PARSER(construct( diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp index 9e90f7f6228fe..5e6f13797646b 100644 --- a/flang/lib/Parser/prescan.cpp +++ b/flang/lib/Parser/prescan.cpp @@ -184,7 +184,8 @@ void Prescanner::Statement() { case LineClassification::Kind::PreprocessorDirective: Say(preprocessed->GetProvenanceRange(), "Preprocessed line resembles a preprocessor directive"_en_US); - preprocessed->ToLowerCase().Emit(cooked_); + preprocessed->ToLowerCase().CheckBadFortranCharacters(messages_).Emit( + cooked_); break; case LineClassification::Kind::CompilerDirective: if (preprocessed->HasRedundantBlanks()) { @@ -193,7 +194,9 @@ void Prescanner::Statement() { NormalizeCompilerDirectiveCommentMarker(*preprocessed); preprocessed->ToLowerCase(); SourceFormChange(preprocessed->ToString()); - preprocessed->ClipComment(true /* skip first ! */).Emit(cooked_); + preprocessed->ClipComment(true /* skip first ! 
*/) + .CheckBadFortranCharacters(messages_) + .Emit(cooked_); break; case LineClassification::Kind::Source: if (inFixedForm_) { @@ -205,7 +208,10 @@ void Prescanner::Statement() { preprocessed->RemoveRedundantBlanks(); } } - preprocessed->ToLowerCase().ClipComment().Emit(cooked_); + preprocessed->ToLowerCase() + .ClipComment() + .CheckBadFortranCharacters(messages_) + .Emit(cooked_); break; } } else { @@ -213,7 +219,7 @@ void Prescanner::Statement() { if (line.kind == LineClassification::Kind::CompilerDirective) { SourceFormChange(tokens.ToString()); } - tokens.Emit(cooked_); + tokens.CheckBadFortranCharacters(messages_).Emit(cooked_); } if (omitNewline_) { omitNewline_ = false; @@ -245,8 +251,9 @@ void Prescanner::NextLine() { } } -void Prescanner::LabelField(TokenSequence &token, int outCol) { +void Prescanner::LabelField(TokenSequence &token) { const char *bad{nullptr}; + int outCol{1}; for (; *at_ != '\n' && column_ <= 6; ++at_) { if (*at_ == '\t') { ++at_; @@ -256,20 +263,26 @@ void Prescanner::LabelField(TokenSequence &token, int outCol) { if (*at_ != ' ' && !(*at_ == '0' && column_ == 6)) { // '0' in column 6 becomes space EmitChar(token, *at_); + ++outCol; if (!bad && !IsDecimalDigit(*at_)) { bad = at_; } - ++outCol; } ++column_; } - if (outCol > 1) { + if (outCol == 1) { // empty label field + // Emit a space so that, if the line is rescanned after preprocessing, + // a leading 'C' or 'D' won't be left-justified and then accidentally + // misinterpreted as a comment card. 
+ EmitChar(token, ' '); + ++outCol; + } else { if (bad && !preprocessor_.IsNameDefined(token.CurrentOpenToken())) { Say(GetProvenance(bad), "Character in fixed-form label field must be a digit"_en_US); } - token.CloseToken(); } + token.CloseToken(); SkipToNextSignificantCharacter(); if (IsDecimalDigit(*at_)) { Say(GetProvenance(at_), diff --git a/flang/lib/Parser/prescan.h b/flang/lib/Parser/prescan.h index 595f8f701185b..0b5b64792004a 100644 --- a/flang/lib/Parser/prescan.h +++ b/flang/lib/Parser/prescan.h @@ -147,7 +147,7 @@ class Prescanner { common::LanguageFeature::ClassicCComments))); } - void LabelField(TokenSequence &, int outCol = 1); + void LabelField(TokenSequence &); void SkipToEndOfLine(); bool MustSkipToEndOfLine() const; void NextChar(); diff --git a/flang/lib/Parser/program-parsers.cpp b/flang/lib/Parser/program-parsers.cpp index 278cc6fdb51a5..dee359e240cfa 100644 --- a/flang/lib/Parser/program-parsers.cpp +++ b/flang/lib/Parser/program-parsers.cpp @@ -20,20 +20,6 @@ namespace Fortran::parser { -// R501 program -> program-unit [program-unit]... -// This is the top-level production for the Fortran language. -// F'2018 6.3.1 defines a program unit as a sequence of one or more lines, -// implying that a line can't be part of two distinct program units. -// Consequently, a program unit END statement should be the last statement -// on its line. We parse those END statements via unterminatedStatement() -// and then skip over the end of the line here. 
-TYPE_PARSER(construct( - extension(skipStuffBeforeStatement >> - !nextCh >> pure>()) || - some(StartNewSubprogram{} >> Parser{} / skipMany(";"_tok) / - space / recovery(endOfLine, SkipPast<'\n'>{})) / - skipStuffBeforeStatement)) - // R502 program-unit -> // main-program | external-subprogram | module | submodule | block-data // R503 external-subprogram -> function-subprogram | subroutine-subprogram @@ -49,12 +35,30 @@ TYPE_PARSER(construct( // variant parsers for several productions; giving the "module" production // priority here is a cleaner solution, though regrettably subtle. Enforcing // C1547 is done in semantics. -TYPE_PARSER(construct(indirect(Parser{})) || +static constexpr auto programUnit{ + construct(indirect(Parser{})) || construct(indirect(functionSubprogram)) || construct(indirect(subroutineSubprogram)) || construct(indirect(Parser{})) || construct(indirect(Parser{})) || - construct(indirect(Parser{}))) + construct(indirect(Parser{}))}; +static constexpr auto normalProgramUnit{StartNewSubprogram{} >> programUnit / + skipMany(";"_tok) / space / recovery(endOfLine, SkipPast<'\n'>{})}; +static constexpr auto globalCompilerDirective{ + construct(indirect(compilerDirective))}; + +// R501 program -> program-unit [program-unit]... +// This is the top-level production for the Fortran language. +// F'2018 6.3.1 defines a program unit as a sequence of one or more lines, +// implying that a line can't be part of two distinct program units. +// Consequently, a program unit END statement should be the last statement +// on its line. We parse those END statements via unterminatedStatement() +// and then skip over the end of the line here. +TYPE_PARSER(construct( + extension(skipStuffBeforeStatement >> + !nextCh >> pure>()) || + some(globalCompilerDirective || normalProgramUnit) / + skipStuffBeforeStatement)) // R504 specification-part -> // [use-stmt]... [import-stmt]... 
[implicit-part] diff --git a/flang/lib/Parser/token-sequence.cpp b/flang/lib/Parser/token-sequence.cpp index 07c5b12e5f759..4797cb759a72b 100644 --- a/flang/lib/Parser/token-sequence.cpp +++ b/flang/lib/Parser/token-sequence.cpp @@ -8,6 +8,7 @@ #include "token-sequence.h" #include "flang/Parser/characters.h" +#include "flang/Parser/message.h" #include "llvm/Support/raw_ostream.h" namespace Fortran::parser { @@ -310,4 +311,25 @@ ProvenanceRange TokenSequence::GetIntervalProvenanceRange( ProvenanceRange TokenSequence::GetProvenanceRange() const { return GetIntervalProvenanceRange(0, start_.size()); } + +const TokenSequence &TokenSequence::CheckBadFortranCharacters( + Messages &messages) const { + std::size_t tokens{SizeInTokens()}; + for (std::size_t j{0}; j < tokens; ++j) { + CharBlock token{TokenAt(j)}; + char ch{token.FirstNonBlank()}; + if (ch != ' ' && !IsValidFortranTokenCharacter(ch)) { + if (ch == '!' && j == 0) { + // allow in !dir$ + } else if (ch < ' ' || ch >= '\x7f') { + messages.Say(GetTokenProvenanceRange(j), + "bad character (0x%02x) in Fortran token"_err_en_US, ch & 0xff); + } else { + messages.Say(GetTokenProvenanceRange(j), + "bad character ('%c') in Fortran token"_err_en_US, ch); + } + } + } + return *this; +} } // namespace Fortran::parser diff --git a/flang/lib/Parser/token-sequence.h b/flang/lib/Parser/token-sequence.h index d98c0b955c5e9..6a10ef1977d38 100644 --- a/flang/lib/Parser/token-sequence.h +++ b/flang/lib/Parser/token-sequence.h @@ -27,6 +27,8 @@ class raw_ostream; namespace Fortran::parser { +class Messages; + // Buffers a contiguous sequence of characters that has been partitioned into // a sequence of preprocessing tokens with provenances. 
class TokenSequence { @@ -115,6 +117,7 @@ class TokenSequence { TokenSequence &RemoveBlanks(std::size_t firstChar = 0); TokenSequence &RemoveRedundantBlanks(std::size_t firstChar = 0); TokenSequence &ClipComment(bool skipFirst = false); + const TokenSequence &CheckBadFortranCharacters(Messages &) const; void Emit(CookedSource &) const; void Dump(llvm::raw_ostream &) const; diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp index faf2c9f1eb876..e26795d0825bb 100644 --- a/flang/lib/Parser/unparse.cpp +++ b/flang/lib/Parser/unparse.cpp @@ -1923,6 +1923,9 @@ class UnparseVisitor { x.u); } void Unparse(const AccObjectList &x) { Walk(x.v, ","); } + void Unparse(const AccReductionOperator::Operator &x) { + Word(AccReductionOperator::EnumToString(x)); + } void Unparse(const AccObjectListWithReduction &x) { Walk(std::get(x.t)); Put(":"); diff --git a/flang/lib/Semantics/canonicalize-acc.cpp b/flang/lib/Semantics/canonicalize-acc.cpp index 8cf04910ba6e1..4916f2269cebd 100644 --- a/flang/lib/Semantics/canonicalize-acc.cpp +++ b/flang/lib/Semantics/canonicalize-acc.cpp @@ -48,6 +48,40 @@ class CanonicalizationOfAcc { } private: + // Check constraint in 2.9.7 + // If there are n tile sizes in the list, the loop construct must be + // immediately followed by n tightly-nested loops. + template + void CheckTileClauseRestriction(const C &x) { + const auto &beginLoopDirective = std::get(x.t); + const auto &accClauseList = + std::get(beginLoopDirective.t); + for (const auto &clause : accClauseList.v) { + if (const auto *tileClause = + std::get_if(&clause.u)) { + const parser::AccTileExprList &tileExprList = tileClause->v; + const std::list &listTileExpr = tileExprList.v; + std::size_t tileArgNb = listTileExpr.size(); + + const auto &outer{std::get>(x.t)}; + for (const parser::DoConstruct *loop{&*outer}; loop && tileArgNb > 0; + --tileArgNb) { + const auto &block{std::get(loop->t)}; + const auto it{block.begin()}; + loop = it != block.end() ? 
parser::Unwrap(*it) + : nullptr; + } + + if (tileArgNb > 0) { + messages_.Say(beginLoopDirective.source, + "The loop construct with the TILE clause must be followed by %d " + "tightly-nested loops"_err_en_US, + listTileExpr.size()); + } + } + } + } + void RewriteOpenACCLoopConstruct(parser::OpenACCLoopConstruct &x, parser::Block &block, parser::Block::iterator it) { // Check the sequence of DoConstruct in the same iteration @@ -78,6 +112,8 @@ class CanonicalizationOfAcc { "DO loop after the %s directive must have loop control"_err_en_US, parser::ToUpperCaseLetters(dir.source.ToString())); } + CheckTileClauseRestriction(x); return; // found do-loop } } @@ -127,6 +163,8 @@ class CanonicalizationOfAcc { "DO loop after the %s directive must have loop control"_err_en_US, parser::ToUpperCaseLetters(dir.source.ToString())); } + CheckTileClauseRestriction(x); return; // found do-loop } } diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp index 4e91235938e6e..d5fa7b9ab3705 100644 --- a/flang/lib/Semantics/check-declarations.cpp +++ b/flang/lib/Semantics/check-declarations.cpp @@ -61,6 +61,7 @@ class CheckHelper { void CheckSubprogram(const Symbol &, const SubprogramDetails &); void CheckAssumedTypeEntity(const Symbol &, const ObjectEntityDetails &); void CheckDerivedType(const Symbol &, const DerivedTypeDetails &); + void CheckHostAssoc(const Symbol &, const HostAssocDetails &); void CheckGeneric(const Symbol &, const GenericDetails &); std::optional> Characterize(const SymbolVector &); bool CheckDefinedOperator(const SourceName &, const GenericKind &, @@ -147,7 +148,10 @@ void CheckHelper::Check(const Symbol &symbol) { CheckVolatile(symbol, isAssociated, derived); } if (isAssociated) { - return; // only care about checking VOLATILE on associated symbols + if (const auto *details{symbol.detailsIf()}) { + CheckHostAssoc(symbol, *details); + } + return; // no other checks on associated symbols } if (IsPointer(symbol)) { 
CheckPointer(symbol); @@ -758,6 +762,21 @@ void CheckHelper::CheckDerivedType( } } +void CheckHelper::CheckHostAssoc( + const Symbol &symbol, const HostAssocDetails &details) { + const Symbol &hostSymbol{details.symbol()}; + if (hostSymbol.test(Symbol::Flag::ImplicitOrError)) { + if (details.implicitOrSpecExprError) { + messages_.Say("Implicitly typed local entity '%s' not allowed in" + " specification expression"_err_en_US, + symbol.name()); + } else if (details.implicitOrExplicitTypeError) { + messages_.Say( + "No explicit type declared for '%s'"_err_en_US, symbol.name()); + } + } +} + void CheckHelper::CheckGeneric( const Symbol &symbol, const GenericDetails &details) { const SymbolVector &specifics{details.specificProcs()}; diff --git a/flang/lib/Semantics/check-io.cpp b/flang/lib/Semantics/check-io.cpp index 250ad492ebc92..d00f56c38042d 100644 --- a/flang/lib/Semantics/check-io.cpp +++ b/flang/lib/Semantics/check-io.cpp @@ -155,7 +155,8 @@ void IoChecker::Enter(const parser::ConnectSpec::CharExpr &spec) { } } -void IoChecker::Enter(const parser::ConnectSpec::Newunit &) { +void IoChecker::Enter(const parser::ConnectSpec::Newunit &var) { + CheckForDefinableVariable(var, "NEWUNIT"); SetSpecifier(IoSpecKind::Newunit); } @@ -266,10 +267,11 @@ void IoChecker::Enter(const parser::IdExpr &) { SetSpecifier(IoSpecKind::Id); } void IoChecker::Enter(const parser::IdVariable &spec) { SetSpecifier(IoSpecKind::Id); - auto expr{GetExpr(spec)}; + const auto *expr{GetExpr(spec)}; if (!expr || !expr->GetType()) { return; } + CheckForDefinableVariable(spec, "ID"); int kind{expr->GetType()->kind()}; int defaultKind{context_.GetDefaultKind(TypeCategory::Integer)}; if (kind < defaultKind) { @@ -281,21 +283,18 @@ void IoChecker::Enter(const parser::IdVariable &spec) { void IoChecker::Enter(const parser::InputItem &spec) { flags_.set(Flag::DataList); - if (const parser::Variable * var{std::get_if(&spec.u)}) { - const parser::Name &name{GetLastName(*var)}; - if (name.symbol) { - if 
(auto *details{name.symbol->detailsIf()}) { - // TODO: Determine if this check is needed at all, and if so, replace - // the false subcondition with a check for a whole array. Otherwise, - // the check incorrectly flags array element and section references. - if (details->IsAssumedSize() && false) { - // This check may be superseded by C928 or C1002. - context_.Say(name.source, - "'%s' must not be a whole assumed size array"_err_en_US, - name.source); // C1231 - } - } - } + const parser::Variable *var{std::get_if(&spec.u)}; + if (!var) { + return; + } + CheckForDefinableVariable(*var, "Input"); + const auto &name{GetLastName(*var)}; + const auto *expr{GetExpr(*var)}; + if (name.symbol && IsAssumedSizeArray(*name.symbol) && expr && + !evaluate::IsArrayElement(*GetExpr(*var))) { + context_.Say(name.source, + "Whole assumed size array '%s' may not be an input item"_err_en_US, + name.source); // C1231 } } @@ -386,6 +385,8 @@ void IoChecker::Enter(const parser::InquireSpec::CharVar &spec) { specKind = IoSpecKind::Dispose; break; } + CheckForDefinableVariable(std::get(spec.t), + parser::ToUpperCaseLetters(common::EnumToString(specKind))); SetSpecifier(specKind); } @@ -412,6 +413,8 @@ void IoChecker::Enter(const parser::InquireSpec::IntVar &spec) { specKind = IoSpecKind::Size; break; } + CheckForDefinableVariable(std::get(spec.t), + parser::ToUpperCaseLetters(common::EnumToString(specKind))); SetSpecifier(specKind); } @@ -500,17 +503,23 @@ void IoChecker::Enter(const parser::IoControlSpec::Rec &) { SetSpecifier(IoSpecKind::Rec); } -void IoChecker::Enter(const parser::IoControlSpec::Size &) { +void IoChecker::Enter(const parser::IoControlSpec::Size &var) { + CheckForDefinableVariable(var, "SIZE"); SetSpecifier(IoSpecKind::Size); } void IoChecker::Enter(const parser::IoUnit &spec) { if (const parser::Variable * var{std::get_if(&spec.u)}) { - // TODO: C1201 - internal file variable must not be an array section ... 
- if (auto expr{GetExpr(*var)}) { - if (!ExprTypeKindIsDefault(*expr, context_)) { + if (stmt_ == IoStmtKind::Write) { + CheckForDefinableVariable(*var, "Internal file"); + } + if (const auto *expr{GetExpr(*var)}) { + if (HasVectorSubscript(*expr)) { + context_.Say(parser::FindSourceLocation(*var), // C1201 + "Internal file must not have a vector subscript"_err_en_US); + } else if (!ExprTypeKindIsDefault(*expr, context_)) { // This may be too restrictive; other kinds may be valid. - context_.Say( // C1202 + context_.Say(parser::FindSourceLocation(*var), // C1202 "Invalid character kind for an internal file variable"_err_en_US); } } @@ -522,13 +531,26 @@ void IoChecker::Enter(const parser::IoUnit &spec) { } } -void IoChecker::Enter(const parser::MsgVariable &) { +void IoChecker::Enter(const parser::MsgVariable &var) { + if (stmt_ == IoStmtKind::None) { + // allocate, deallocate, image control + CheckForDefinableVariable(var, "ERRMSG"); + return; + } + CheckForDefinableVariable(var, "IOMSG"); SetSpecifier(IoSpecKind::Iomsg); } -void IoChecker::Enter(const parser::OutputItem &) { +void IoChecker::Enter(const parser::OutputItem &item) { flags_.set(Flag::DataList); - // TODO: C1233 - output item must not be a procedure pointer + if (const auto *x{std::get_if(&item.u)}) { + if (const auto *expr{GetExpr(*x)}) { + if (IsProcedurePointer(*expr)) { + context_.Say(parser::FindSourceLocation(*x), + "Output item must not be a procedure pointer"_err_en_US); // C1233 + } + } + } } void IoChecker::Enter(const parser::StatusExpr &spec) { @@ -555,12 +577,14 @@ void IoChecker::Enter(const parser::StatusExpr &spec) { } } -void IoChecker::Enter(const parser::StatVariable &) { +void IoChecker::Enter(const parser::StatVariable &var) { if (stmt_ == IoStmtKind::None) { - // ALLOCATE & DEALLOCATE - } else { - SetSpecifier(IoSpecKind::Iostat); + // allocate, deallocate, image control + CheckForDefinableVariable(var, "STAT"); + return; } + CheckForDefinableVariable(var, "IOSTAT"); + 
SetSpecifier(IoSpecKind::Iostat); } void IoChecker::Leave(const parser::BackspaceStmt &) { @@ -808,7 +832,7 @@ void IoChecker::CheckStringValue(IoSpecKind specKind, const std::string &value, // CheckForRequiredSpecifier and CheckForProhibitedSpecifier functions // need conditions to check, and string arguments to insert into a message. -// A IoSpecKind provides both an absence/presence condition and a string +// An IoSpecKind provides both an absence/presence condition and a string // argument (its name). A (condition, string) pair provides an arbitrary // condition and an arbitrary string. @@ -893,6 +917,17 @@ void IoChecker::CheckForProhibitedSpecifier( } } +template +void IoChecker::CheckForDefinableVariable( + const A &var, const std::string &s) const { + const Symbol *sym{ + GetFirstName(*parser::Unwrap(var)).symbol}; + if (WhyNotModifiable(*sym, context_.FindScope(*context_.location()))) { + context_.Say(parser::FindSourceLocation(var), + "%s variable '%s' must be definable"_err_en_US, s, sym->name()); + } +} + void IoChecker::CheckForPureSubprogram() const { // C1597 CHECK(context_.location()); if (FindPureProcedureContaining(context_.FindScope(*context_.location()))) { diff --git a/flang/lib/Semantics/check-io.h b/flang/lib/Semantics/check-io.h index b5e8f12b5ee64..01bbcd9ba24ff 100644 --- a/flang/lib/Semantics/check-io.h +++ b/flang/lib/Semantics/check-io.h @@ -122,6 +122,11 @@ class IoChecker : public virtual BaseChecker { void CheckForProhibitedSpecifier(IoSpecKind, bool, const std::string &) const; void CheckForProhibitedSpecifier(bool, const std::string &, IoSpecKind) const; + template + void CheckForDefinableVariable(const A &var, const std::string &s) const; + + void CheckForPureSubprogram() const; + void Init(IoStmtKind s) { stmt_ = s; specifierSet_.reset(); @@ -130,8 +135,6 @@ class IoChecker : public virtual BaseChecker { void Done() { stmt_ = IoStmtKind::None; } - void CheckForPureSubprogram() const; - SemanticsContext &context_; IoStmtKind 
stmt_{IoStmtKind::None}; common::EnumSet specifierSet_; diff --git a/flang/lib/Semantics/compute-offsets.cpp b/flang/lib/Semantics/compute-offsets.cpp index 8d90bf99fe273..f2a3a10bb4fa6 100644 --- a/flang/lib/Semantics/compute-offsets.cpp +++ b/flang/lib/Semantics/compute-offsets.cpp @@ -81,11 +81,6 @@ void ComputeOffsetsHelper::Compute(Scope &scope) { equivalenceBlock_.clear(); } -static bool InCommonBlock(const Symbol &symbol) { - const auto *details{symbol.detailsIf()}; - return details && details->commonBlock(); -} - void ComputeOffsetsHelper::DoScope(Scope &scope) { if (scope.symbol() && scope.IsParameterizedDerivedType()) { return; // only process instantiations of parameterized derived types @@ -300,9 +295,8 @@ std::size_t ComputeOffsetsHelper::ComputeOffset( } void ComputeOffsetsHelper::DoSymbol(Symbol &symbol) { - if (symbol.has() || symbol.has() || - symbol.has() || symbol.has()) { - return; // these have type but no size + if (!symbol.has() && !symbol.has()) { + return; } SizeAndAlignment s{GetSizeAndAlignment(symbol)}; if (s.size == 0) { @@ -329,7 +323,7 @@ auto ComputeOffsetsHelper::GetSizeAndAlignment(const Symbol &symbol) auto ComputeOffsetsHelper::GetElementSize(const Symbol &symbol) -> SizeAndAlignment { const DeclTypeSpec *type{symbol.GetType()}; - if (!type) { + if (!evaluate::DynamicType::From(type).has_value()) { return {}; } // TODO: The size of procedure pointers is not yet known diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp index cfb908179c3a9..747c663255d68 100644 --- a/flang/lib/Semantics/expression.cpp +++ b/flang/lib/Semantics/expression.cpp @@ -680,7 +680,10 @@ MaybeExpr ExpressionAnalyzer::Analyze(const parser::Name &n) { if (std::optional kind{IsImpliedDo(n.source)}) { return AsMaybeExpr(ConvertToKind( *kind, AsExpr(ImpliedDoIndex{n.source}))); - } else if (context_.HasError(n) || !n.symbol) { + } else if (context_.HasError(n)) { + return std::nullopt; + } else if (!n.symbol) { + SayAt(n, 
"Internal error: unresolved name '%s'"_err_en_US, n.source); return std::nullopt; } else { const Symbol &ultimate{n.symbol->GetUltimate()}; diff --git a/flang/lib/Semantics/program-tree.cpp b/flang/lib/Semantics/program-tree.cpp index e6dfd9b2f51cd..9466a748567e1 100644 --- a/flang/lib/Semantics/program-tree.cpp +++ b/flang/lib/Semantics/program-tree.cpp @@ -112,6 +112,10 @@ ProgramTree ProgramTree::Build(const parser::BlockData &x) { return result.set_stmt(stmt).set_endStmt(end); } +ProgramTree ProgramTree::Build(const parser::CompilerDirective &) { + DIE("ProgramTree::Build() called for CompilerDirective"); +} + const parser::ParentIdentifier &ProgramTree::GetParentId() const { const auto *stmt{ std::get *>(stmt_)}; diff --git a/flang/lib/Semantics/program-tree.h b/flang/lib/Semantics/program-tree.h index 69c133bfbde7b..6b07452282017 100644 --- a/flang/lib/Semantics/program-tree.h +++ b/flang/lib/Semantics/program-tree.h @@ -38,6 +38,7 @@ class ProgramTree { static ProgramTree Build(const parser::Module &); static ProgramTree Build(const parser::Submodule &); static ProgramTree Build(const parser::BlockData &); + static ProgramTree Build(const parser::CompilerDirective &); ENUM_CLASS(Kind, // kind of node Program, Function, Subroutine, MpSubprogram, Module, Submodule, BlockData) diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index 7feca5b00a8fe..36c38c1cd99c4 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -68,8 +68,8 @@ class ImplicitRules { void set_isImplicitNoneType(bool x) { isImplicitNoneType_ = x; } void set_isImplicitNoneExternal(bool x) { isImplicitNoneExternal_ = x; } void set_inheritFromParent(bool x) { inheritFromParent_ = x; } - // Get the implicit type for identifiers starting with ch. May be null. - const DeclTypeSpec *GetType(char ch) const; + // Get the implicit type for this name. May be null. 
+ const DeclTypeSpec *GetType(SourceName) const; // Record the implicit type for the range of characters [fromLetter, // toLetter]. void SetTypeMapping(const DeclTypeSpec &type, parser::Location fromLetter, @@ -385,13 +385,20 @@ class ImplicitRulesVisitor : public DeclTypeSpecVisitor { bool Pre(const parser::ImplicitSpec &); void Post(const parser::ImplicitSpec &); - ImplicitRules &implicitRules() { return *implicitRules_; } - const ImplicitRules &implicitRules() const { return *implicitRules_; } + const DeclTypeSpec *GetType(SourceName name) { + return implicitRules_->GetType(name); + } bool isImplicitNoneType() const { - return implicitRules().isImplicitNoneType(); + return implicitRules_->isImplicitNoneType(); + } + bool isImplicitNoneType(const Scope &scope) const { + return implicitRulesMap_->at(&scope).isImplicitNoneType(); } bool isImplicitNoneExternal() const { - return implicitRules().isImplicitNoneExternal(); + return implicitRules_->isImplicitNoneExternal(); + } + void set_inheritFromParent(bool x) { + implicitRules_->set_inheritFromParent(x); } protected: @@ -452,6 +459,8 @@ class ScopeHandler : public ImplicitRulesVisitor { using ImplicitRulesVisitor::Pre; Scope &currScope() { return DEREF(currScope_); } + // The enclosing host procedure if current scope is in an internal procedure + Scope *GetHostProcedure(); // The enclosing scope, skipping blocks and derived types. // TODO: Will return the scope of a FORALL or implied DO loop; is this ok? // If not, should call FindProgramUnitContaining() instead. 
@@ -583,6 +592,8 @@ class ScopeHandler : public ImplicitRulesVisitor { const DeclTypeSpec &MakeLogicalType( const std::optional &); + bool inExecutionPart_{false}; + private: Scope *currScope_{nullptr}; }; @@ -689,7 +700,6 @@ class SubprogramVisitor : public virtual ScopeHandler, public InterfaceVisitor { protected: // Set when we see a stmt function that is really an array element assignment bool badStmtFuncFound_{false}; - bool inExecutionPart_{false}; private: // Info about the current function: parse tree of the type in the PrefixSpec; @@ -799,7 +809,6 @@ class DeclarationVisitor : public ArraySpecVisitor, bool Pre(const parser::NamelistStmt::Group &); bool Pre(const parser::IoControlSpec &); bool Pre(const parser::CommonStmt::Block &); - void Post(const parser::CommonStmt::Block &); bool Pre(const parser::CommonBlockObject &); void Post(const parser::CommonBlockObject &); bool Pre(const parser::EquivalenceStmt &); @@ -820,7 +829,7 @@ class DeclarationVisitor : public ArraySpecVisitor, protected: bool BeginDecl(); void EndDecl(); - Symbol &DeclareObjectEntity(const parser::Name &, Attrs); + Symbol &DeclareObjectEntity(const parser::Name &, Attrs = Attrs{}); // Make sure that there's an entity in an enclosing scope called Name Symbol &FindOrDeclareEnclosingEntity(const parser::Name &); // Declare a LOCAL/LOCAL_INIT entity. If there isn't a type specified @@ -832,6 +841,8 @@ class DeclarationVisitor : public ArraySpecVisitor, // Return pointer to the new symbol, or nullptr on error. 
Symbol *DeclareStatementEntity( const parser::Name &, const std::optional &); + Symbol &MakeCommonBlockSymbol(const parser::Name &); + Symbol &MakeCommonBlockSymbol(const std::optional &); bool CheckUseError(const parser::Name &); void CheckAccessibility(const SourceName &, bool, Symbol &); void CheckCommonBlocks(); @@ -869,11 +880,8 @@ class DeclarationVisitor : public ArraySpecVisitor, } derivedTypeInfo_; // Collect equivalence sets and process at end of specification part std::vector *> equivalenceSets_; - // Info about common blocks in the current scope - struct { - Symbol *curr{nullptr}; // common block currently being processed - std::set names; // names in any common block of scope - } commonBlockInfo_; + // Names of all common block objects in the scope + std::set commonBlockObjects_; // Info about about SAVE statements and attributes in current scope struct { std::optional saveAll; // "SAVE" without entity list @@ -904,7 +912,6 @@ class DeclarationVisitor : public ArraySpecVisitor, bool OkToAddComponent(const parser::Name &, const Symbol * = nullptr); ParamValue GetParamValue( const parser::TypeParamValue &, common::TypeParamAttr attr); - Symbol &MakeCommonBlockSymbol(const parser::Name &); void CheckCommonBlockDerivedType(const SourceName &, const Symbol &); std::optional CheckSaveAttr(const Symbol &); Attrs HandleSaveName(const SourceName &, Attrs); @@ -918,22 +925,25 @@ class DeclarationVisitor : public ArraySpecVisitor, void Initialization(const parser::Name &, const parser::Initialization &, bool inComponentDecl); bool PassesLocalityChecks(const parser::Name &name, Symbol &symbol); + bool CheckForHostAssociatedImplicit(const parser::Name &); // Declare an object or procedure entity. 
// T is one of: EntityDetails, ObjectEntityDetails, ProcEntityDetails template Symbol &DeclareEntity(const parser::Name &name, Attrs attrs) { Symbol &symbol{MakeSymbol(name, attrs)}; - if (symbol.has()) { - // OK + if (context().HasError(symbol) || symbol.has()) { + return symbol; // OK or error already reported } else if (symbol.has()) { symbol.set_details(T{}); + return symbol; } else if (auto *details{symbol.detailsIf()}) { symbol.set_details(T{std::move(*details)}); + return symbol; } else if (std::is_same_v && (symbol.has() || symbol.has())) { - // OK + return symbol; // OK } else if (auto *details{symbol.detailsIf()}) { Say(name.source, "'%s' is use-associated from module '%s' and cannot be re-declared"_err_en_US, @@ -956,11 +966,17 @@ class DeclarationVisitor : public ArraySpecVisitor, name, symbol, "'%s' is already declared as a procedure"_err_en_US); } else if (std::is_same_v && symbol.has()) { - SayWithDecl( - name, symbol, "'%s' is already declared as an object"_err_en_US); + if (InCommonBlock(symbol)) { + SayWithDecl(name, symbol, + "'%s' may not be a procedure as it is in a COMMON block"_err_en_US); + } else { + SayWithDecl( + name, symbol, "'%s' is already declared as an object"_err_en_US); + } } else { SayAlreadyDeclared(name, symbol); } + context().SetError(symbol); return symbol; } }; @@ -1334,6 +1350,7 @@ class ResolveNamesVisitor : public virtual ScopeHandler, std::optional prevImportStmt_; void PreSpecificationConstruct(const parser::SpecificationConstruct &); + void CreateCommonBlockSymbols(const parser::CommonStmt &); void CreateGeneric(const parser::GenericSpec &); void FinishSpecificationPart(const std::list &); void AnalyzeStmtFunctionStmt(const parser::StmtFunctionStmt &); @@ -1372,13 +1389,14 @@ bool ImplicitRules::isImplicitNoneExternal() const { } } -const DeclTypeSpec *ImplicitRules::GetType(char ch) const { +const DeclTypeSpec *ImplicitRules::GetType(SourceName name) const { + char ch{name.begin()[0]}; if (isImplicitNoneType_) { 
return nullptr; } else if (auto it{map_.find(ch)}; it != map_.end()) { return &*it->second; } else if (inheritFromParent_) { - return parent_->GetType(ch); + return parent_->GetType(name); } else if (ch >= 'i' && ch <= 'n') { return &context_.MakeNumericType(TypeCategory::Integer); } else if (ch >= 'a' && ch <= 'z') { @@ -1684,7 +1702,7 @@ bool ImplicitRulesVisitor::Pre(const parser::ImplicitStmt &x) { "IMPLICIT NONE(TYPE) statement"_err_en_US); return false; } - implicitRules().set_isImplicitNoneType(false); + implicitRules_->set_isImplicitNoneType(false); return true; }, }, @@ -1704,7 +1722,7 @@ bool ImplicitRulesVisitor::Pre(const parser::LetterSpec &x) { return false; } } - implicitRules().SetTypeMapping(*GetDeclTypeSpec(), loLoc, hiLoc); + implicitRules_->SetTypeMapping(*GetDeclTypeSpec(), loLoc, hiLoc); return false; } @@ -1749,7 +1767,7 @@ bool ImplicitRulesVisitor::HandleImplicitNone( if (nameSpecs.empty()) { if (!implicitNoneTypeNever) { prevImplicitNoneType_ = currStmtSource(); - implicitRules().set_isImplicitNoneType(true); + implicitRules_->set_isImplicitNoneType(true); if (prevImplicit_) { Say("IMPLICIT NONE statement after IMPLICIT statement"_err_en_US); return false; @@ -1761,13 +1779,13 @@ bool ImplicitRulesVisitor::HandleImplicitNone( for (const auto noneSpec : nameSpecs) { switch (noneSpec) { case ImplicitNoneNameSpec::External: - implicitRules().set_isImplicitNoneExternal(true); + implicitRules_->set_isImplicitNoneExternal(true); ++sawExternal; break; case ImplicitNoneNameSpec::Type: if (!implicitNoneTypeNever) { prevImplicitNoneType_ = currStmtSource(); - implicitRules().set_isImplicitNoneType(true); + implicitRules_->set_isImplicitNoneType(true); if (prevImplicit_) { Say("IMPLICIT NONE(TYPE) after IMPLICIT statement"_err_en_US); return false; @@ -1915,14 +1933,22 @@ void ScopeHandler::Say2(const parser::Name &name, MessageFixedText &&msg1, context().SetError(symbol, msg1.isFatal()); } -Scope &ScopeHandler::InclusiveScope() { - for (auto 
*scope{&currScope()};; scope = &scope->parent()) { - if (scope->kind() != Scope::Kind::Block && !scope->IsDerivedType() && - !scope->IsStmtFunction()) { - return *scope; +// T may be `Scope` or `const Scope` +template static T &GetInclusiveScope(T &scope) { + for (T *s{&scope}; !s->IsGlobal(); s = &s->parent()) { + if (s->kind() != Scope::Kind::Block && !s->IsDerivedType() && + !s->IsStmtFunction()) { + return *s; } } - DIE("inclusive scope not found"); + return scope; +} + +Scope &ScopeHandler::InclusiveScope() { return GetInclusiveScope(currScope()); } + +Scope *ScopeHandler::GetHostProcedure() { + Scope &parent{InclusiveScope().parent()}; + return parent.kind() == Scope::Kind::Subprogram ? &parent : nullptr; } Scope &ScopeHandler::NonDerivedTypeScope() { @@ -2082,7 +2108,8 @@ void ScopeHandler::ApplyImplicitRules(Symbol &symbol) { } const DeclTypeSpec *ScopeHandler::GetImplicitType(Symbol &symbol) { - const DeclTypeSpec *type{implicitRules().GetType(symbol.name().begin()[0])}; + const auto *type{implicitRulesMap_->at(&GetInclusiveScope(symbol.owner())) + .GetType(symbol.name())}; if (type) { if (const DerivedTypeSpec * derived{type->AsDerived()}) { // Resolve any forward-referenced derived type; a quick no-op else. 
@@ -2992,7 +3019,7 @@ Symbol &SubprogramVisitor::PushSubprogramScope( if (isGeneric()) { GetGenericDetails().AddSpecificProc(*symbol, name.source); } - implicitRules().set_inheritFromParent(false); + set_inheritFromParent(false); } FindSymbol(name)->set(subpFlag); // PushScope() created symbol return *symbol; @@ -3098,12 +3125,10 @@ void DeclarationVisitor::Post(const parser::TypeDeclarationStmt &) { } void DeclarationVisitor::Post(const parser::DimensionStmt::Declaration &x) { - const auto &name{std::get(x.t)}; - DeclareObjectEntity(name, Attrs{}); + DeclareObjectEntity(std::get(x.t)); } void DeclarationVisitor::Post(const parser::CodimensionDecl &x) { - const auto &name{std::get(x.t)}; - DeclareObjectEntity(name, Attrs{}); + DeclareObjectEntity(std::get(x.t)); } bool DeclarationVisitor::Pre(const parser::Initialization &) { @@ -4211,44 +4236,23 @@ bool DeclarationVisitor::Pre(const parser::IoControlSpec &x) { bool DeclarationVisitor::Pre(const parser::CommonStmt::Block &x) { CheckNotInBlock("COMMON"); // C1107 - const auto &optName{std::get>(x.t)}; - parser::Name blankCommon; - blankCommon.source = - SourceName{currStmtSource().value().begin(), std::size_t{0}}; - CHECK(!commonBlockInfo_.curr); - commonBlockInfo_.curr = - &MakeCommonBlockSymbol(optName ? 
*optName : blankCommon); return true; } -void DeclarationVisitor::Post(const parser::CommonStmt::Block &) { - commonBlockInfo_.curr = nullptr; -} - bool DeclarationVisitor::Pre(const parser::CommonBlockObject &) { BeginArraySpec(); return true; } void DeclarationVisitor::Post(const parser::CommonBlockObject &x) { - CHECK(commonBlockInfo_.curr); const auto &name{std::get(x.t)}; - auto &symbol{DeclareObjectEntity(name, Attrs{})}; - ClearArraySpec(); - ClearCoarraySpec(); - auto *details{symbol.detailsIf()}; - if (!details) { - return; // error was reported - } - commonBlockInfo_.curr->get().add_object(symbol); - auto pair{commonBlockInfo_.names.insert(name.source)}; + DeclareObjectEntity(name); + auto pair{commonBlockObjects_.insert(name.source)}; if (!pair.second) { const SourceName &prev{*pair.first}; Say2(name.source, "'%s' is already in a COMMON block"_err_en_US, prev, "Previous occurrence of '%s' in a COMMON block"_en_US); - return; } - details->set_commonBlock(*commonBlockInfo_.curr); } bool DeclarationVisitor::Pre(const parser::EquivalenceStmt &x) { @@ -4409,7 +4413,7 @@ void DeclarationVisitor::CheckCommonBlocks() { } } // check objects in common blocks - for (const auto &name : commonBlockInfo_.names) { + for (const auto &name : commonBlockObjects_) { const auto *symbol{currScope().FindSymbol(name)}; if (!symbol) { continue; @@ -4443,12 +4447,20 @@ void DeclarationVisitor::CheckCommonBlocks() { } } } - commonBlockInfo_ = {}; + commonBlockObjects_ = {}; } Symbol &DeclarationVisitor::MakeCommonBlockSymbol(const parser::Name &name) { return Resolve(name, currScope().MakeCommonBlock(name.source)); } +Symbol &DeclarationVisitor::MakeCommonBlockSymbol( + const std::optional &name) { + if (name) { + return MakeCommonBlockSymbol(*name); + } else { + return MakeCommonBlockSymbol(parser::Name{}); + } +} bool DeclarationVisitor::NameIsKnownOrIntrinsic(const parser::Name &name) { return FindSymbol(name) || HandleUnrestrictedSpecificIntrinsicFunction(name); @@ -4824,8 
+4836,7 @@ void ConstructVisitor::ResolveIndexName( } name.symbol = nullptr; } - auto &symbol{DeclareObjectEntity(name, {})}; - + auto &symbol{DeclareObjectEntity(name)}; if (symbol.GetType()) { // type came from explicit type-spec } else if (!prev) { @@ -5419,10 +5430,15 @@ const parser::Name *DeclarationVisitor::ResolveDataRef( // If implicit types are allowed, ensure name is in the symbol table. // Otherwise, report an error if it hasn't been declared. const parser::Name *DeclarationVisitor::ResolveName(const parser::Name &name) { - if (Symbol * symbol{FindSymbol(name)}) { + FindSymbol(name); + if (CheckForHostAssociatedImplicit(name)) { + return &name; + } + if (Symbol * symbol{name.symbol}) { if (CheckUseError(name)) { return nullptr; // reported an error } + symbol->set(Symbol::Flag::ImplicitOrError, false); if (IsUplevelReference(*symbol)) { MakeHostAssocSymbol(name, *symbol); } else if (IsDummy(*symbol) || @@ -5449,6 +5465,44 @@ const parser::Name *DeclarationVisitor::ResolveName(const parser::Name &name) { return &name; } +// A specification expression may refer to a symbol in the host procedure that +// is implicitly typed. Because specification parts are processed before +// execution parts, this may be the first time we see the symbol. It can't be a +// local in the current scope (because it's in a specification expression) so +// either it is implicitly declared in the host procedure or it is an error. +// We create a symbol in the host assuming it is the former; if that proves to +// be wrong we report an error later in CheckDeclarations(). 
+bool DeclarationVisitor::CheckForHostAssociatedImplicit( + const parser::Name &name) { + if (inExecutionPart_) { + return false; + } + if (name.symbol) { + ApplyImplicitRules(*name.symbol); + } + Symbol *hostSymbol; + Scope *host{GetHostProcedure()}; + if (!host || isImplicitNoneType(*host)) { + return false; + } else if (!name.symbol) { + hostSymbol = &MakeSymbol(*host, name.source, Attrs{}); + ConvertToObjectEntity(*hostSymbol); + ApplyImplicitRules(*hostSymbol); + hostSymbol->set(Symbol::Flag::ImplicitOrError); + } else if (name.symbol->test(Symbol::Flag::ImplicitOrError)) { + hostSymbol = name.symbol; + } else { + return false; + } + Symbol &symbol{MakeHostAssocSymbol(name, *hostSymbol)}; + if (isImplicitNoneType()) { + symbol.get().implicitOrExplicitTypeError = true; + } else { + symbol.get().implicitOrSpecExprError = true; + } + return true; +} + bool DeclarationVisitor::IsUplevelReference(const Symbol &symbol) { const Scope *symbolUnit{FindProgramUnitContaining(symbol)}; if (symbolUnit == FindProgramUnitContaining(currScope())) { @@ -5897,13 +5951,14 @@ static bool NeedsExplicitType(const Symbol &symbol) { } bool ResolveNamesVisitor::Pre(const parser::SpecificationPart &x) { - Walk(std::get<0>(x.t)); - Walk(std::get<1>(x.t)); - Walk(std::get<2>(x.t)); - Walk(std::get<3>(x.t)); - Walk(std::get<4>(x.t)); - Walk(std::get<5>(x.t)); - const std::list &decls{std::get<6>(x.t)}; + const auto &[accDecls, ompDecls, compilerDirectives, useStmts, importStmts, + implicitPart, decls] = x.t; + Walk(accDecls); + Walk(ompDecls); + Walk(compilerDirectives); + Walk(useStmts); + Walk(importStmts); + Walk(implicitPart); for (const auto &decl : decls) { if (const auto *spec{ std::get_if(&decl.u)}) { @@ -5920,17 +5975,19 @@ void ResolveNamesVisitor::PreSpecificationConstruct( const parser::SpecificationConstruct &spec) { std::visit( common::visitors{ - [&](const Indirection &) {}, [&](const parser::Statement> &y) { CreateGeneric(std::get(y.statement.value().t)); }, [&](const 
Indirection &y) { const auto &stmt{std::get>( y.value().t)}; - const auto *spec{std::get_if>( - &stmt.statement.u)}; - if (spec && *spec) { - CreateGeneric(**spec); + if (const auto *spec{parser::Unwrap(stmt)}) { + CreateGeneric(*spec); + } + }, + [&](const parser::Statement &y) { + if (const auto *commonStmt{parser::Unwrap(y)}) { + CreateCommonBlockSymbols(*commonStmt); } }, [&](const auto &) {}, @@ -5938,6 +5995,21 @@ void ResolveNamesVisitor::PreSpecificationConstruct( spec.u); } +void ResolveNamesVisitor::CreateCommonBlockSymbols( + const parser::CommonStmt &commonStmt) { + for (const parser::CommonStmt::Block &block : commonStmt.blocks) { + const auto &[name, objects] = block.t; + Symbol &commonBlock{MakeCommonBlockSymbol(name)}; + for (const auto &object : objects) { + Symbol &obj{DeclareObjectEntity(std::get(object.t))}; + if (auto *details{obj.detailsIf()}) { + details->set_commonBlock(commonBlock); + commonBlock.get().add_object(obj); + } + } + } +} + void ResolveNamesVisitor::CreateGeneric(const parser::GenericSpec &x) { auto info{GenericSpecInfo{x}}; const SourceName &symbolName{info.symbolName()}; @@ -6154,6 +6226,11 @@ void ResolveNamesVisitor::Post(const parser::AssignedGotoStmt &x) { } bool ResolveNamesVisitor::Pre(const parser::ProgramUnit &x) { + if (std::holds_alternative>( + x.u)) { + // TODO: global directives + return true; + } auto root{ProgramTree::Build(x)}; SetScope(context().globalScope()); ResolveSpecificationParts(root); diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp index d5ef9c76aa34c..cde345af642a3 100644 --- a/flang/lib/Semantics/tools.cpp +++ b/flang/lib/Semantics/tools.cpp @@ -1305,4 +1305,9 @@ bool HasAlternateReturns(const Symbol &subprogram) { return false; } +bool InCommonBlock(const Symbol &symbol) { + const auto *details{symbol.detailsIf()}; + return details && details->commonBlock(); +} + } // namespace Fortran::semantics diff --git a/flang/test/Parser/compiler-directives.f90 
b/flang/test/Parser/compiler-directives.f90 index 5545d0486e56f..c916b16b53274 100644 --- a/flang/test/Parser/compiler-directives.f90 +++ b/flang/test/Parser/compiler-directives.f90 @@ -2,6 +2,7 @@ ! Test that compiler directives can appear in various places. +!dir$ integer module m !dir$ integer use iso_fortran_env diff --git a/flang/test/Preprocessing/fixed-rescan.F b/flang/test/Preprocessing/fixed-rescan.F new file mode 100644 index 0000000000000..3d6ba9a8c6f45 --- /dev/null +++ b/flang/test/Preprocessing/fixed-rescan.F @@ -0,0 +1,7 @@ +! RUN: %f18 -E %s | FileCheck %s +! CHECK: callbar +! Ensure that rescanned lines after macro replacement are not +! misinterpreted as fixed-form comments when they start with C or D. +#define foo bar + call foo + end diff --git a/flang/test/Preprocessing/pp029.F b/flang/test/Preprocessing/pp029.F index bb8efe6c1a2e0..9be309f75143e 100644 --- a/flang/test/Preprocessing/pp029.F +++ b/flang/test/Preprocessing/pp029.F @@ -1,5 +1,5 @@ ! RUN: %f18 -E %s 2>&1 | FileCheck %s -! CHECK: if(77 7.eq.777)then +! CHECK: if(777.eq.777)then * \ newline allowed in #define integer, parameter :: KWM = 666 #define KWM 77\ diff --git a/flang/test/Preprocessing/pp130.F90 b/flang/test/Preprocessing/pp130.F90 index af4ad126e6fa4..be1148807b8bf 100644 --- a/flang/test/Preprocessing/pp130.F90 +++ b/flang/test/Preprocessing/pp130.F90 @@ -1,5 +1,5 @@ -! RUN: %f18 -E %s 2>&1 | FileCheck %s -! CHECK: j = j + & +! RUN: (%f18 -E %s 2>&1 || true) | FileCheck %s +! CHECK: error: bad character ('&') in Fortran token ! 
#define KWM &, use for continuation w/o pasting (ifort and nag seem to continue #define) #define KWM & diff --git a/flang/test/Semantics/acc-canonicalization-validity.f90 b/flang/test/Semantics/acc-canonicalization-validity.f90 index 06c63ed25ddbb..350f6315867cf 100644 --- a/flang/test/Semantics/acc-canonicalization-validity.f90 +++ b/flang/test/Semantics/acc-canonicalization-validity.f90 @@ -92,4 +92,18 @@ program openacc_clause_validity do end do + !$acc parallel + !ERROR: The loop construct with the TILE clause must be followed by 2 tightly-nested loops + !$acc loop tile(2, 2) + do i = 1, N + a(i) = 3.14 + end do + !$acc end parallel + + !ERROR: The loop construct with the TILE clause must be followed by 2 tightly-nested loops + !$acc parallel loop tile(2, 2) + do i = 1, N + a(i) = 3.14 + end do + end program openacc_clause_validity diff --git a/flang/test/Semantics/acc-clause-validity.f90 b/flang/test/Semantics/acc-clause-validity.f90 index 4f8bd1406de5a..9683a4e02c747 100644 --- a/flang/test/Semantics/acc-clause-validity.f90 +++ b/flang/test/Semantics/acc-clause-validity.f90 @@ -18,11 +18,17 @@ program openacc_clause_validity implicit none - integer :: i, j - integer :: N = 256 + integer :: i, j, b, gang_size, vector_size, worker_size + integer, parameter :: N = 256 + integer, dimension(N) :: c + logical, dimension(N) :: d, e + real :: reduction_r + logical :: reduction_l + real(8), dimension(N, N) :: aa + !ERROR: At least one clause is required on the DECLARE directive !$acc declare - real(8) :: a(256) + real(8), dimension(N) :: a !ERROR: At least one of ATTACH, COPYIN, CREATE clause must appear on the ENTER DATA directive !$acc enter data @@ -78,6 +84,25 @@ program openacc_clause_validity end do !$acc end parallel + !$acc parallel + !$acc loop tile(2) + do i = 1, N + a(i) = 3.14 + end do + !$acc end parallel + + !$acc parallel loop tile(2) + do i = 1, N + a(i) = 3.14 + end do + + !$acc parallel loop tile(2, 2) + do i = 1, N + do j = 1, N + aa(i, j) = 3.14 + 
end do + end do + !$acc parallel device_type(*) num_gangs(2) !$acc loop do i = 1, N @@ -85,6 +110,141 @@ program openacc_clause_validity end do !$acc end parallel + !$acc parallel + !$acc loop seq + do i = 1, N + a(i) = 3.14 + end do + !$acc end parallel + + !$acc parallel + !$acc loop independent + do i = 1, N + a(i) = 3.14 + end do + !$acc end parallel + + !$acc parallel + !$acc loop auto + do i = 1, N + a(i) = 3.14 + end do + !$acc end parallel + + !$acc parallel + !$acc loop vector + do i = 1, N + a(i) = 3.14 + end do + !$acc end parallel + + !$acc parallel + !$acc loop vector(10) + do i = 1, N + a(i) = 3.14 + end do + !$acc end parallel + + !$acc parallel + !$acc loop vector(vector_size) + do i = 1, N + a(i) = 3.14 + end do + !$acc end parallel + + !$acc parallel + !$acc loop vector(length: vector_size) + do i = 1, N + a(i) = 3.14 + end do + !$acc end parallel + + !$acc parallel + !$acc loop worker + do i = 1, N + a(i) = 3.14 + end do + !$acc end parallel + + !$acc parallel + !$acc loop worker(10) + do i = 1, N + a(i) = 3.14 + end do + !$acc end parallel + + !$acc parallel + !$acc loop worker(worker_size) + do i = 1, N + a(i) = 3.14 + end do + !$acc end parallel + + !$acc parallel + !$acc loop worker(num: worker_size) + do i = 1, N + a(i) = 3.14 + end do + !$acc end parallel + + !$acc parallel + !$acc loop gang(gang_size) + do i = 1, N + a(i) = 3.14 + end do + !$acc end parallel + + !$acc parallel + !$acc loop gang(num: gang_size) + do i = 1, N + a(i) = 3.14 + end do + !$acc end parallel + + !$acc parallel + !$acc loop gang(gang_size, static:*) + do i = 1, N + a(i) = 3.14 + end do + !$acc end parallel + + !$acc parallel + !$acc loop gang(num: gang_size, static:*) + do i = 1, N + a(i) = 3.14 + end do + !$acc end parallel + + !$acc parallel + !$acc loop gang(num: gang_size, static: gang_size) + do i = 1, N + a(i) = 3.14 + end do + !$acc end parallel + + !$acc parallel + !$acc loop private(b, a(:)) + do i = 1, N + a(i) = b + end do + !$acc end parallel + + !$acc 
parallel + !$acc loop tile(*) + do i = 1, N + a(i) = b + end do + !$acc end parallel + + !$acc parallel + !$acc loop tile(2, 2) + do i = 1, N + do j = 1, N + a(i) = b + end do + end do + !$acc end parallel + !$acc parallel !ERROR: The parameter of the COLLAPSE clause must be a constant positive integer expression !$acc loop collapse(-1) @@ -187,6 +347,61 @@ program openacc_clause_validity !ERROR: Unmatched END PARALLEL LOOP directive !$acc end parallel loop + !$acc parallel loop reduction(+: reduction_r) + do i = 1, N + reduction_r = a(i) + i + end do + + !$acc parallel loop reduction(*: reduction_r) + do i = 1, N + reduction_r = reduction_r * (a(i) + i) + end do + + !$acc parallel loop reduction(min: reduction_r) + do i = 1, N + reduction_r = min(reduction_r, a(i) * i) + end do + + !$acc parallel loop reduction(max: reduction_r) + do i = 1, N + reduction_r = max(reduction_r, a(i) * i) + end do + + !$acc parallel loop reduction(iand: b) + do i = 1, N + b = iand(b, c(i)) + end do + + !$acc parallel loop reduction(ior: b) + do i = 1, N + b = ior(b, c(i)) + end do + + !$acc parallel loop reduction(ieor: b) + do i = 1, N + b = ieor(b, c(i)) + end do + + !$acc parallel loop reduction(.and.: reduction_l) + do i = 1, N + reduction_l = d(i) .and. e(i) + end do + + !$acc parallel loop reduction(.or.: reduction_l) + do i = 1, N + reduction_l = d(i) .or. e(i) + end do + + !$acc parallel loop reduction(.eqv.: reduction_l) + do i = 1, N + reduction_l = d(i) .eqv. e(i) + end do + + !$acc parallel loop reduction(.neqv.: reduction_l) + do i = 1, N + reduction_l = d(i) .neqv. 
e(i) + end do + !$acc kernels wait(1, 2) async(3) !$acc end kernels diff --git a/flang/test/Semantics/block-data01.f90 b/flang/test/Semantics/block-data01.f90 index 6424824f46069..6549e402cc9db 100644 --- a/flang/test/Semantics/block-data01.f90 +++ b/flang/test/Semantics/block-data01.f90 @@ -7,9 +7,10 @@ block data foo !ERROR: An initialized variable in BLOCK DATA must be in a COMMON block integer :: notInCommon = 1 integer :: uninitialized ! ok - !ERROR: 'p' may not appear in a BLOCK DATA subprogram + !ERROR: 'q' may not appear in a BLOCK DATA subprogram + procedure(sin), pointer :: q => cos + !ERROR: 'p' may not be a procedure as it is in a COMMON block procedure(sin), pointer :: p => cos - !ERROR: 'p' is already declared as a procedure common /block/ pi, p !ERROR: An initialized variable in BLOCK DATA must be in a COMMON block integer :: inDataButNotCommon diff --git a/flang/test/Semantics/deallocate05.f90 b/flang/test/Semantics/deallocate05.f90 index 7524cc88fe0b8..4a54469e5ab67 100644 --- a/flang/test/Semantics/deallocate05.f90 +++ b/flang/test/Semantics/deallocate05.f90 @@ -21,6 +21,7 @@ Program deallocatetest Real :: r Integer :: s +Integer, Parameter :: const_s = 13 Integer :: e Integer :: pi Character(256) :: ee @@ -56,6 +57,8 @@ Program deallocatetest !ERROR: STAT may not be duplicated in a DEALLOCATE statement Deallocate(x, stat=s, stat=s) +!ERROR: STAT variable 'const_s' must be definable +Deallocate(x, stat=const_s) !ERROR: ERRMSG may not be duplicated in a DEALLOCATE statement Deallocate(x, errmsg=ee, errmsg=ee) !ERROR: STAT may not be duplicated in a DEALLOCATE statement diff --git a/flang/test/Semantics/implicit11.f90 b/flang/test/Semantics/implicit11.f90 new file mode 100644 index 0000000000000..2c8e138de1efe --- /dev/null +++ b/flang/test/Semantics/implicit11.f90 @@ -0,0 +1,61 @@ +! RUN: %S/test_errors.sh %s %t %f18 + +! 
Test use of implicitly declared variable in specification expression + +subroutine s1() + m = 1 +contains + subroutine s1a() + implicit none + !ERROR: No explicit type declared for 'n' + real :: a(m, n) + end + subroutine s1b() + !ERROR: Implicitly typed local entity 'n' not allowed in specification expression + real :: a(m, n) + end +end + +subroutine s2() + type :: t(m, n) + integer, len :: m + integer, len :: n + end type + n = 1 +contains + subroutine s2a() + !ERROR: Implicitly typed local entity 'm' not allowed in specification expression + type(t(m, n)) :: a + end + subroutine s2b() + implicit none + !ERROR: No explicit type declared for 'm' + character(m) :: a + end +end + +subroutine s3() + m = 1 +contains + subroutine s3a() + implicit none + real :: a(m, n) + !ERROR: No explicit type declared for 'n' + common n + end + subroutine s3b() + ! n is okay here because it is in a common block + real :: a(m, n) + common n + end +end + +subroutine s4() + implicit none +contains + subroutine s4a() + !ERROR: No explicit type declared for 'n' + real :: a(n) + end +end + diff --git a/flang/test/Semantics/io01.f90 b/flang/test/Semantics/io01.f90 index 4238df89f5d0e..9828d4afe8921 100644 --- a/flang/test/Semantics/io01.f90 +++ b/flang/test/Semantics/io01.f90 @@ -21,6 +21,7 @@ integer :: unit10 = 10 integer :: unit11 = 11 integer :: n = 40 + integer, parameter :: const_new_unit = 66 integer(kind=1) :: stat1 integer(kind=2) :: stat2 @@ -73,6 +74,9 @@ !ERROR: If NEWUNIT appears, FILE or STATUS must also appear open(newunit=n, newunit=nn, iostat=stat4) + !ERROR: NEWUNIT variable 'const_new_unit' must be definable + open(newunit=const_new_unit, status=cc) + !ERROR: Duplicate UNIT specifier open(unit=100, unit=100) diff --git a/flang/test/Semantics/io02.f90 b/flang/test/Semantics/io02.f90 index 5fd5fca4bc0c8..9f5235d353cbd 100644 --- a/flang/test/Semantics/io02.f90 +++ b/flang/test/Semantics/io02.f90 @@ -1,6 +1,7 @@ ! 
RUN: %S/test_errors.sh %s %t %f18 integer :: unit10 = 10 integer :: unit11 = 11 + integer, parameter :: const_stat = 6666 integer(kind=1) :: stat1 integer(kind=8) :: stat8 @@ -28,5 +29,8 @@ !ERROR: Invalid STATUS value 'old' close(status='old', unit=17) + !ERROR: IOSTAT variable 'const_stat' must be definable + close(14, iostat=const_stat) + 9 continue end diff --git a/flang/test/Semantics/io03.f90 b/flang/test/Semantics/io03.f90 index 0041e6cd0f5c4..5eb3420d1aea1 100644 --- a/flang/test/Semantics/io03.f90 +++ b/flang/test/Semantics/io03.f90 @@ -2,13 +2,18 @@ character(kind=1,len=50) internal_file character(kind=2,len=50) internal_file2 character(kind=4,len=50) internal_file4 + character(kind=1,len=50) internal_fileA(20) character(kind=1,len=111) msg character(20) advance + character(20) :: cvar; + character, parameter :: const_internal_file = "(I6)" + character, parameter :: const_cvar = "Ceci n'est pas une pipe." integer*1 stat1 integer*2 stat2, id2 integer*8 stat8 integer :: iunit = 10 - integer, parameter :: junit = 11 + integer, parameter :: junit = 11, const_size = 13, const_int = 15 + integer :: vv(10) = 7 namelist /mmm/ mm1, mm2 namelist /nnn/ nn1, nn2 @@ -29,11 +34,14 @@ read(fmt='(I4)', unit=*) jj read(iunit, *) jj read(junit, *) jj - read(10, *) jj + read(10, *) jj, cvar, cvar(7:17) read(internal_file, *) jj + read(internal_fileA(3), *) jj + read(internal_fileA(4:9), *) jj read(10, nnn) read(internal_file, nnn) read(internal_file, nml=nnn) + read(const_internal_file, *) read(fmt=*, unit=internal_file) read(nml=nnn, unit=internal_file) read(iunit, nnn) @@ -53,6 +61,21 @@ !ERROR: Invalid character kind for an internal file variable read(internal_file4, *) jj + !ERROR: Internal file must not have a vector subscript + read(internal_fileA(vv), *) jj + + !ERROR: Input variable 'const_int' must be definable + read(11, *) const_int + + !ERROR: SIZE variable 'const_size' must be definable + read(11, pos=ipos, size=const_size, end=9) + + !ERROR: Input variable 
'const_cvar' must be definable + read(11, *) const_cvar + + !ERROR: Input variable 'const_cvar' must be definable + read(11, *) const_cvar(3:13) + !ERROR: Duplicate IOSTAT specifier read(11, pos=ipos, iostat=stat1, iostat=stat2) @@ -136,3 +159,25 @@ 9 continue end + +subroutine s(aa, n) + integer :: aa(5,*) + integer, intent(in) :: n + integer :: bb(10), vv(10) + type tt + real :: x, y, z + end type tt + type(tt) :: qq(20) + + vv = 1 + + read(*, *) aa(n,1) + read(*, *) aa(n:n+2,2) + read(*, *) qq(2:5)%y + + !ERROR: Input variable 'n' must be definable + read(*, *) n + + !ERROR: Whole assumed size array 'aa' may not be an input item + read(*, *) aa +end diff --git a/flang/test/Semantics/io04.f90 b/flang/test/Semantics/io04.f90 index 0a37d685d3ee5..6be26047fd5b2 100644 --- a/flang/test/Semantics/io04.f90 +++ b/flang/test/Semantics/io04.f90 @@ -2,6 +2,7 @@ character(kind=1,len=50) internal_file character(kind=1,len=100) msg character(20) sign + character, parameter :: const_internal_file = "(I6)" integer*1 stat1, id1 integer*2 stat2 integer*4 stat4 @@ -9,6 +10,8 @@ integer :: iunit = 10 integer, parameter :: junit = 11 integer, pointer :: a(:) + integer, parameter :: const_id = 66666 + procedure(), pointer :: procptr namelist /nnn/ nn1, nn2 @@ -66,6 +69,9 @@ !ERROR: If NML appears, a data list must not appear write(10, nnn, rec=40, fmt=1) 'Ok' + !ERROR: Internal file variable 'const_internal_file' must be definable + write(const_internal_file, fmt=*) + !ERROR: If UNIT=* appears, POS must not appear write(*, pos=n, nml=nnn) @@ -118,8 +124,14 @@ !ERROR: ID kind (1) is smaller than default INTEGER kind (4) write(id=id1, unit=10, asynchronous='Yes') 'Ok' + !ERROR: ID variable 'const_id' must be definable + write(10, *, asynchronous='yes', id=const_id, iostat=stat2) 'Ok' + write(*, '(X)') + !ERROR: Output item must not be a procedure pointer + print*, n1, procptr, n2 + 1 format (A) 9 continue end diff --git a/flang/test/Semantics/io05.f90 b/flang/test/Semantics/io05.f90 
index 1501fbf587f5e..ed6b77f7d4ad9 100644 --- a/flang/test/Semantics/io05.f90 +++ b/flang/test/Semantics/io05.f90 @@ -1,10 +1,12 @@ ! RUN: %S/test_errors.sh %s %t %f18 character*20 c(25), cv character(kind=1,len=59) msg + character, parameter :: const_round = "c'est quoi?" logical*2 v(5), lv integer*1 stat1 integer*2 stat4 integer*8 stat8, iv + integer, parameter :: const_id = 1 inquire(10) inquire(file='abc') @@ -22,6 +24,7 @@ exist=v(1), named=v(2), opened=v(3), pending=v(4)) inquire(pending=v(5), file='abc') inquire(10, id=id, pending=v(5)) + inquire(10, id=const_id, pending=v(5)) ! using variable 'cv' multiple times seems to be allowed inquire(file='abc', & @@ -56,5 +59,8 @@ !ERROR: If ID appears, PENDING must also appear inquire(file='abc', id=id) + !ERROR: ROUND variable 'const_round' must be definable + inquire(file='abc', round=const_round) + 9 continue end diff --git a/flang/test/Semantics/io06.f90 b/flang/test/Semantics/io06.f90 index 157d831dc3331..fe3b97f0e67e4 100644 --- a/flang/test/Semantics/io06.f90 +++ b/flang/test/Semantics/io06.f90 @@ -1,6 +1,7 @@ ! RUN: %S/test_errors.sh %s %t %f18 character(kind=1,len=100) msg1 character(kind=2,len=200) msg2 + character, parameter :: const_msg = 'doof' integer(1) stat1 integer(2) stat2 integer(8) stat8 @@ -28,6 +29,9 @@ !ERROR: Duplicate IOSTAT specifier endfile(iostat=stat2, err=9, unit=10, iostat=stat8, iomsg=msg1) + !ERROR: IOMSG variable 'const_msg' must be definable + flush(iomsg=const_msg, unit=10, iostat=stat8, err=9) + !ERROR: REWIND statement must have a UNIT number specifier rewind(iostat=stat2) diff --git a/flang/test/Semantics/modfile21.f90 b/flang/test/Semantics/modfile21.f90 index f1e4036c96a8e..d7b45f70c00d8 100644 --- a/flang/test/Semantics/modfile21.f90 +++ b/flang/test/Semantics/modfile21.f90 @@ -26,10 +26,10 @@ module m ! real(4)::v ! complex(4)::w ! real(4)::cb -! common/cb2/a,b,c -! bind(c)::/cb2/ ! common//t,w,u,v ! common/cb/x,y,z ! bind(c, name="CB")::/cb/ +! common/cb2/a,b,c +! 
bind(c)::/cb2/ ! common/b/cb !end diff --git a/flang/test/Semantics/resolve42.f90 b/flang/test/Semantics/resolve42.f90 index b0b092ae34292..0ae7459ab089d 100644 --- a/flang/test/Semantics/resolve42.f90 +++ b/flang/test/Semantics/resolve42.f90 @@ -11,11 +11,11 @@ subroutine s2 end subroutine s3 + !ERROR: 'x' may not be a procedure as it is in a COMMON block procedure(real) :: x - !ERROR: 'x' is already declared as a procedure common x common y - !ERROR: 'y' is already declared as an object + !ERROR: 'y' may not be a procedure as it is in a COMMON block procedure(real) :: y end diff --git a/libc/benchmarks/CMakeLists.txt b/libc/benchmarks/CMakeLists.txt index 6f3cfdb64a5f7..2275dad7f6535 100644 --- a/libc/benchmarks/CMakeLists.txt +++ b/libc/benchmarks/CMakeLists.txt @@ -53,11 +53,6 @@ function(add_libc_benchmark_unittest target_name) EXCLUDE_FROM_ALL ${LIBC_BENCHMARKS_UNITTEST_SRCS} ) - target_include_directories(${target_name} - PRIVATE - ${LLVM_MAIN_SRC_DIR}/utils/unittest/googletest/include - ${LLVM_MAIN_SRC_DIR}/utils/unittest/googlemock/include - ) target_link_libraries(${target_name} PRIVATE gtest_main diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index fe63403ae2210..34d07c24505d9 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -75,6 +75,9 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.roundl libc.src.math.sincosf libc.src.math.sinf + libc.src.math.sqrt + libc.src.math.sqrtf + libc.src.math.sqrtl libc.src.math.trunc libc.src.math.truncf libc.src.math.truncl diff --git a/libc/config/linux/api.td b/libc/config/linux/api.td index 6b50c4284ae2e..063fe401da8b5 100644 --- a/libc/config/linux/api.td +++ b/libc/config/linux/api.td @@ -204,6 +204,9 @@ def MathAPI : PublicAPI<"math.h"> { "roundl", "sincosf", "sinf", + "sqrt", + "sqrtf", + "sqrtl", "trunc", "truncf", "truncl", diff --git a/libc/config/linux/x86_64/entrypoints.txt 
b/libc/config/linux/x86_64/entrypoints.txt index 35ca8bbeaf683..c24173b1d0e77 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -108,6 +108,9 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.roundl libc.src.math.sincosf libc.src.math.sinf + libc.src.math.sqrt + libc.src.math.sqrtf + libc.src.math.sqrtl libc.src.math.trunc libc.src.math.truncf libc.src.math.truncl diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index ac240ff9576e7..15fc12d375e63 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -314,6 +314,10 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"roundf", RetValSpec, [ArgSpec]>, FunctionSpec<"roundl", RetValSpec, [ArgSpec]>, + FunctionSpec<"sqrt", RetValSpec, [ArgSpec]>, + FunctionSpec<"sqrtf", RetValSpec, [ArgSpec]>, + FunctionSpec<"sqrtl", RetValSpec, [ArgSpec]>, + FunctionSpec<"trunc", RetValSpec, [ArgSpec]>, FunctionSpec<"truncf", RetValSpec, [ArgSpec]>, FunctionSpec<"truncl", RetValSpec, [ArgSpec]>, diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index da18aeba9a2a5..0c878de2ac95d 100644 --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -485,3 +485,39 @@ add_entrypoint_object( COMPILE_OPTIONS -O2 ) + +add_entrypoint_object( + sqrt + SRCS + sqrt.cpp + HDRS + sqrt.h + DEPENDS + libc.utils.FPUtil.fputil + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + sqrtf + SRCS + sqrtf.cpp + HDRS + sqrtf.h + DEPENDS + libc.utils.FPUtil.fputil + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + sqrtl + SRCS + sqrtl.cpp + HDRS + sqrtl.h + DEPENDS + libc.utils.FPUtil.fputil + COMPILE_OPTIONS + -O2 +) diff --git a/libc/src/math/sqrt.cpp b/libc/src/math/sqrt.cpp new file mode 100644 index 0000000000000..32d38e61463d0 --- /dev/null +++ b/libc/src/math/sqrt.cpp @@ -0,0 +1,16 @@ +//===-- Implementation of sqrt function -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "utils/FPUtil/Sqrt.h" +#include "src/__support/common.h" + +namespace __llvm_libc { + +double LLVM_LIBC_ENTRYPOINT(sqrt)(double x) { return fputil::sqrt(x); } + +} // namespace __llvm_libc diff --git a/libc/src/math/sqrt.h b/libc/src/math/sqrt.h new file mode 100644 index 0000000000000..2390e07b5dce5 --- /dev/null +++ b/libc/src/math/sqrt.h @@ -0,0 +1,18 @@ +//===-- Implementation header for sqrt --------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_SQRT_H +#define LLVM_LIBC_SRC_MATH_SQRT_H + +namespace __llvm_libc { + +double sqrt(double x); + +} // namespace __llvm_libc + +#endif // LLVM_LIBC_SRC_MATH_SQRT_H diff --git a/libc/src/math/sqrtf.cpp b/libc/src/math/sqrtf.cpp new file mode 100644 index 0000000000000..391fa6a3281a5 --- /dev/null +++ b/libc/src/math/sqrtf.cpp @@ -0,0 +1,16 @@ +//===-- Implementation of sqrtf function ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/common.h" +#include "utils/FPUtil/Sqrt.h" + +namespace __llvm_libc { + +float LLVM_LIBC_ENTRYPOINT(sqrtf)(float x) { return fputil::sqrt(x); } + +} // namespace __llvm_libc diff --git a/libc/src/math/sqrtf.h b/libc/src/math/sqrtf.h new file mode 100644 index 0000000000000..d1d06f3adfa8e --- /dev/null +++ b/libc/src/math/sqrtf.h @@ -0,0 +1,18 @@ +//===-- Implementation header for sqrtf -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_SQRTF_H +#define LLVM_LIBC_SRC_MATH_SQRTF_H + +namespace __llvm_libc { + +float sqrtf(float x); + +} // namespace __llvm_libc + +#endif // LLVM_LIBC_SRC_MATH_SQRTF_H diff --git a/libc/src/math/sqrtl.cpp b/libc/src/math/sqrtl.cpp new file mode 100644 index 0000000000000..16450349d23a0 --- /dev/null +++ b/libc/src/math/sqrtl.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of sqrtl function ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/common.h" +#include "utils/FPUtil/Sqrt.h" + +namespace __llvm_libc { + +long double LLVM_LIBC_ENTRYPOINT(sqrtl)(long double x) { + return fputil::sqrt(x); +} + +} // namespace __llvm_libc diff --git a/libc/src/math/sqrtl.h b/libc/src/math/sqrtl.h new file mode 100644 index 0000000000000..5fbfa14507147 --- /dev/null +++ b/libc/src/math/sqrtl.h @@ -0,0 +1,18 @@ +//===-- Implementation header for sqrtl -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_SQRTL_H +#define LLVM_LIBC_SRC_MATH_SQRTL_H + +namespace __llvm_libc { + +long double sqrtl(long double x); + +} // namespace __llvm_libc + +#endif // LLVM_LIBC_SRC_MATH_SQRTL_H diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt index e73de54035642..07b5052074528 100644 --- a/libc/test/src/math/CMakeLists.txt +++ b/libc/test/src/math/CMakeLists.txt @@ -333,6 +333,7 @@ add_fp_unittest( add_fp_unittest( frexp_test + NEED_MPFR SUITE libc_math_unittests SRCS @@ -345,6 +346,7 @@ add_fp_unittest( add_fp_unittest( frexpf_test + NEED_MPFR SUITE libc_math_unittests SRCS @@ -357,6 +359,7 @@ add_fp_unittest( add_fp_unittest( frexpl_test + NEED_MPFR SUITE libc_math_unittests SRCS @@ -510,3 +513,42 @@ add_fp_unittest( libc.src.math.fmaxl libc.utils.FPUtil.fputil ) + +add_fp_unittest( + sqrtf_test + NEED_MPFR + SUITE + libc_math_unittests + SRCS + sqrtf_test.cpp + DEPENDS + libc.include.math + libc.src.math.sqrtf + libc.utils.FPUtil.fputil +) + +add_fp_unittest( + sqrt_test + NEED_MPFR + SUITE + libc_math_unittests + SRCS + 
sqrt_test.cpp + DEPENDS + libc.include.math + libc.src.math.sqrt + libc.utils.FPUtil.fputil +) + +add_fp_unittest( + sqrtl_test + NEED_MPFR + SUITE + libc_math_unittests + SRCS + sqrtl_test.cpp + DEPENDS + libc.include.math + libc.src.math.sqrtl + libc.utils.FPUtil.fputil +) diff --git a/libc/test/src/math/frexp_test.cpp b/libc/test/src/math/frexp_test.cpp index f828d515a6884..360bbf237560e 100644 --- a/libc/test/src/math/frexp_test.cpp +++ b/libc/test/src/math/frexp_test.cpp @@ -11,13 +11,18 @@ #include "utils/FPUtil/BasicOperations.h" #include "utils/FPUtil/BitPatterns.h" #include "utils/FPUtil/ClassificationFunctions.h" +#include "utils/FPUtil/FPBits.h" #include "utils/FPUtil/FloatOperations.h" #include "utils/FPUtil/FloatProperties.h" +#include "utils/MPFRWrapper/MPFRUtils.h" #include "utils/UnitTest/Test.h" +using FPBits = __llvm_libc::fputil::FPBits; using __llvm_libc::fputil::valueAsBits; using __llvm_libc::fputil::valueFromBits; +namespace mpfr = __llvm_libc::testing::mpfr; + using BitPatterns = __llvm_libc::fputil::BitPatterns; using Properties = __llvm_libc::fputil::FloatProperties; @@ -127,17 +132,19 @@ TEST(FrexpTest, SomeIntegers) { } TEST(FrexpTest, InDoubleRange) { - using BitsType = Properties::BitsType; - constexpr BitsType count = 1000000; - constexpr BitsType step = UINT64_MAX / count; - for (BitsType i = 0, v = 0; i <= count; ++i, v += step) { - double x = valueFromBits(v); + using UIntType = FPBits::UIntType; + constexpr UIntType count = 1000001; + constexpr UIntType step = UIntType(-1) / count; + for (UIntType i = 0, v = 0; i <= count; ++i, v += step) { + double x = FPBits(v); if (isnan(x) || isinf(x) || x == 0.0) continue; - int exponent; - double frac = __llvm_libc::frexp(x, &exponent); - ASSERT_TRUE(__llvm_libc::fputil::abs(frac) < 1.0); - ASSERT_TRUE(__llvm_libc::fputil::abs(frac) >= 0.5); + mpfr::BinaryOutput result; + result.f = __llvm_libc::frexp(x, &result.i); + + ASSERT_TRUE(__llvm_libc::fputil::abs(result.f) < 1.0); + 
ASSERT_TRUE(__llvm_libc::fputil::abs(result.f) >= 0.5); + ASSERT_MPFR_MATCH(mpfr::Operation::Frexp, x, result, 0.0); } } diff --git a/libc/test/src/math/frexpf_test.cpp b/libc/test/src/math/frexpf_test.cpp index 3b82c68078ee8..1bf0c36cf165f 100644 --- a/libc/test/src/math/frexpf_test.cpp +++ b/libc/test/src/math/frexpf_test.cpp @@ -11,14 +11,18 @@ #include "utils/FPUtil/BasicOperations.h" #include "utils/FPUtil/BitPatterns.h" #include "utils/FPUtil/ClassificationFunctions.h" +#include "utils/FPUtil/FPBits.h" #include "utils/FPUtil/FloatOperations.h" #include "utils/FPUtil/FloatProperties.h" #include "utils/MPFRWrapper/MPFRUtils.h" #include "utils/UnitTest/Test.h" +using FPBits = __llvm_libc::fputil::FPBits; using __llvm_libc::fputil::valueAsBits; using __llvm_libc::fputil::valueFromBits; +namespace mpfr = __llvm_libc::testing::mpfr; + using BitPatterns = __llvm_libc::fputil::BitPatterns; using Properties = __llvm_libc::fputil::FloatProperties; @@ -109,7 +113,7 @@ TEST(FrexpfTest, PowersOfTwo) { EXPECT_EQ(exponent, 7); } -TEST(FrexpTest, SomeIntegers) { +TEST(FrexpfTest, SomeIntegers) { int exponent; EXPECT_EQ(valueAsBits(0.75f), @@ -135,17 +139,19 @@ TEST(FrexpTest, SomeIntegers) { } TEST(FrexpfTest, InFloatRange) { - using BitsType = Properties::BitsType; - constexpr BitsType count = 1000000; - constexpr BitsType step = UINT32_MAX / count; - for (BitsType i = 0, v = 0; i <= count; ++i, v += step) { - float x = valueFromBits(v); + using UIntType = FPBits::UIntType; + constexpr UIntType count = 1000001; + constexpr UIntType step = UIntType(-1) / count; + for (UIntType i = 0, v = 0; i <= count; ++i, v += step) { + float x = FPBits(v); if (isnan(x) || isinf(x) || x == 0.0) continue; - int exponent; - float frac = __llvm_libc::frexpf(x, &exponent); - ASSERT_TRUE(__llvm_libc::fputil::abs(frac) < 1.0f); - ASSERT_TRUE(__llvm_libc::fputil::abs(frac) >= 0.5f); + mpfr::BinaryOutput result; + result.f = __llvm_libc::frexpf(x, &result.i); + + 
ASSERT_TRUE(__llvm_libc::fputil::abs(result.f) < 1.0); + ASSERT_TRUE(__llvm_libc::fputil::abs(result.f) >= 0.5); + ASSERT_MPFR_MATCH(mpfr::Operation::Frexp, x, result, 0.0); } } diff --git a/libc/test/src/math/frexpl_test.cpp b/libc/test/src/math/frexpl_test.cpp index ace445f0a2de4..9846bb84ae279 100644 --- a/libc/test/src/math/frexpl_test.cpp +++ b/libc/test/src/math/frexpl_test.cpp @@ -10,10 +10,13 @@ #include "src/math/frexpl.h" #include "utils/FPUtil/BasicOperations.h" #include "utils/FPUtil/FPBits.h" +#include "utils/MPFRWrapper/MPFRUtils.h" #include "utils/UnitTest/Test.h" using FPBits = __llvm_libc::fputil::FPBits; +namespace mpfr = __llvm_libc::testing::mpfr; + TEST(FrexplTest, SpecialNumbers) { int exponent; @@ -94,10 +97,11 @@ TEST(FrexplTest, LongDoubleRange) { if (isnan(x) || isinf(x) || x == 0.0l) continue; - int exponent; - long double frac = __llvm_libc::frexpl(x, &exponent); + mpfr::BinaryOutput result; + result.f = __llvm_libc::frexpl(x, &result.i); - ASSERT_TRUE(__llvm_libc::fputil::abs(frac) < 1.0l); - ASSERT_TRUE(__llvm_libc::fputil::abs(frac) >= 0.5l); + ASSERT_TRUE(__llvm_libc::fputil::abs(result.f) < 1.0); + ASSERT_TRUE(__llvm_libc::fputil::abs(result.f) >= 0.5); + ASSERT_MPFR_MATCH(mpfr::Operation::Frexp, x, result, 0.0); } } diff --git a/libc/test/src/math/sqrt_test.cpp b/libc/test/src/math/sqrt_test.cpp new file mode 100644 index 0000000000000..7ff4978ec9e3c --- /dev/null +++ b/libc/test/src/math/sqrt_test.cpp @@ -0,0 +1,67 @@ +//===-- Unittests for sqrt -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// + +#include "include/math.h" +#include "src/math/sqrt.h" +#include "utils/FPUtil/FPBits.h" +#include "utils/FPUtil/TestHelpers.h" +#include "utils/MPFRWrapper/MPFRUtils.h" + +using FPBits = __llvm_libc::fputil::FPBits; +using UIntType = typename FPBits::UIntType; + +namespace mpfr = __llvm_libc::testing::mpfr; + +constexpr UIntType HiddenBit = + UIntType(1) << __llvm_libc::fputil::MantissaWidth::value; + +double nan = FPBits::buildNaN(1); +double inf = FPBits::inf(); +double negInf = FPBits::negInf(); + +TEST(SqrtTest, SpecialValues) { + ASSERT_FP_EQ(nan, __llvm_libc::sqrt(nan)); + ASSERT_FP_EQ(inf, __llvm_libc::sqrt(inf)); + ASSERT_FP_EQ(nan, __llvm_libc::sqrt(negInf)); + ASSERT_FP_EQ(0.0, __llvm_libc::sqrt(0.0)); + ASSERT_FP_EQ(-0.0, __llvm_libc::sqrt(-0.0)); + ASSERT_FP_EQ(nan, __llvm_libc::sqrt(-1.0)); + ASSERT_FP_EQ(1.0, __llvm_libc::sqrt(1.0)); + ASSERT_FP_EQ(2.0, __llvm_libc::sqrt(4.0)); + ASSERT_FP_EQ(3.0, __llvm_libc::sqrt(9.0)); +} + +TEST(SqrtTest, DenormalValues) { + for (UIntType mant = 1; mant < HiddenBit; mant <<= 1) { + FPBits denormal(0.0); + denormal.mantissa = mant; + + ASSERT_MPFR_MATCH(mpfr::Operation::Sqrt, double(denormal), + __llvm_libc::sqrt(denormal), 0.5); + } + + constexpr UIntType count = 1'000'001; + constexpr UIntType step = HiddenBit / count; + for (UIntType i = 0, v = 0; i <= count; ++i, v += step) { + double x = *reinterpret_cast(&v); + ASSERT_MPFR_MATCH(mpfr::Operation::Sqrt, x, __llvm_libc::sqrt(x), 0.5); + } +} + +TEST(SqrtTest, InDoubleRange) { + constexpr UIntType count = 10'000'001; + constexpr UIntType step = UIntType(-1) / count; + for (UIntType i = 0, v = 0; i <= count; ++i, v += step) { + double x = *reinterpret_cast(&v); + if (isnan(x) || (x < 0)) { + continue; + } + + ASSERT_MPFR_MATCH(mpfr::Operation::Sqrt, x, __llvm_libc::sqrt(x), 0.5); + } +} diff --git 
a/libc/test/src/math/sqrtf_test.cpp b/libc/test/src/math/sqrtf_test.cpp new file mode 100644 index 0000000000000..8c429065bb455 --- /dev/null +++ b/libc/test/src/math/sqrtf_test.cpp @@ -0,0 +1,67 @@ +//===-- Unittests for sqrtf -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// + +#include "include/math.h" +#include "src/math/sqrtf.h" +#include "utils/FPUtil/FPBits.h" +#include "utils/FPUtil/TestHelpers.h" +#include "utils/MPFRWrapper/MPFRUtils.h" + +using FPBits = __llvm_libc::fputil::FPBits; +using UIntType = typename FPBits::UIntType; + +namespace mpfr = __llvm_libc::testing::mpfr; + +constexpr UIntType HiddenBit = + UIntType(1) << __llvm_libc::fputil::MantissaWidth::value; + +float nan = FPBits::buildNaN(1); +float inf = FPBits::inf(); +float negInf = FPBits::negInf(); + +TEST(SqrtfTest, SpecialValues) { + ASSERT_FP_EQ(nan, __llvm_libc::sqrtf(nan)); + ASSERT_FP_EQ(inf, __llvm_libc::sqrtf(inf)); + ASSERT_FP_EQ(nan, __llvm_libc::sqrtf(negInf)); + ASSERT_FP_EQ(0.0f, __llvm_libc::sqrtf(0.0f)); + ASSERT_FP_EQ(-0.0f, __llvm_libc::sqrtf(-0.0f)); + ASSERT_FP_EQ(nan, __llvm_libc::sqrtf(-1.0f)); + ASSERT_FP_EQ(1.0f, __llvm_libc::sqrtf(1.0f)); + ASSERT_FP_EQ(2.0f, __llvm_libc::sqrtf(4.0f)); + ASSERT_FP_EQ(3.0f, __llvm_libc::sqrtf(9.0f)); +} + +TEST(SqrtfTest, DenormalValues) { + for (UIntType mant = 1; mant < HiddenBit; mant <<= 1) { + FPBits denormal(0.0f); + denormal.mantissa = mant; + + ASSERT_MPFR_MATCH(mpfr::Operation::Sqrt, float(denormal), + __llvm_libc::sqrtf(denormal), 0.5); + } + + constexpr UIntType count = 1'000'001; + constexpr UIntType step = HiddenBit / count; + for (UIntType i = 0, v = 0; i <= count; ++i, v += step) { + float x = *reinterpret_cast(&v); + 
ASSERT_MPFR_MATCH(mpfr::Operation::Sqrt, x, __llvm_libc::sqrtf(x), 0.5); + } +} + +TEST(SqrtfTest, InFloatRange) { + constexpr UIntType count = 10'000'001; + constexpr UIntType step = UIntType(-1) / count; + for (UIntType i = 0, v = 0; i <= count; ++i, v += step) { + float x = *reinterpret_cast(&v); + if (isnan(x) || (x < 0)) { + continue; + } + + ASSERT_MPFR_MATCH(mpfr::Operation::Sqrt, x, __llvm_libc::sqrtf(x), 0.5); + } +} diff --git a/libc/test/src/math/sqrtl_test.cpp b/libc/test/src/math/sqrtl_test.cpp new file mode 100644 index 0000000000000..1fab3b2567e5e --- /dev/null +++ b/libc/test/src/math/sqrtl_test.cpp @@ -0,0 +1,67 @@ +//===-- Unittests for sqrtl ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// + +#include "include/math.h" +#include "src/math/sqrtl.h" +#include "utils/FPUtil/FPBits.h" +#include "utils/FPUtil/TestHelpers.h" +#include "utils/MPFRWrapper/MPFRUtils.h" + +using FPBits = __llvm_libc::fputil::FPBits; +using UIntType = typename FPBits::UIntType; + +namespace mpfr = __llvm_libc::testing::mpfr; + +constexpr UIntType HiddenBit = + UIntType(1) << __llvm_libc::fputil::MantissaWidth::value; + +long double nan = FPBits::buildNaN(1); +long double inf = FPBits::inf(); +long double negInf = FPBits::negInf(); + +TEST(SqrtlTest, SpecialValues) { + ASSERT_FP_EQ(nan, __llvm_libc::sqrtl(nan)); + ASSERT_FP_EQ(inf, __llvm_libc::sqrtl(inf)); + ASSERT_FP_EQ(nan, __llvm_libc::sqrtl(negInf)); + ASSERT_FP_EQ(0.0L, __llvm_libc::sqrtl(0.0L)); + ASSERT_FP_EQ(-0.0L, __llvm_libc::sqrtl(-0.0L)); + ASSERT_FP_EQ(nan, __llvm_libc::sqrtl(-1.0L)); + ASSERT_FP_EQ(1.0L, __llvm_libc::sqrtl(1.0L)); + ASSERT_FP_EQ(2.0L, __llvm_libc::sqrtl(4.0L)); + ASSERT_FP_EQ(3.0L, 
__llvm_libc::sqrtl(9.0L)); +} + +TEST(SqrtlTest, DenormalValues) { + for (UIntType mant = 1; mant < HiddenBit; mant <<= 1) { + FPBits denormal(0.0L); + denormal.mantissa = mant; + + ASSERT_MPFR_MATCH(mpfr::Operation::Sqrt, static_cast(denormal), + __llvm_libc::sqrtl(denormal), 0.5); + } + + constexpr UIntType count = 1'000'001; + constexpr UIntType step = HiddenBit / count; + for (UIntType i = 0, v = 0; i <= count; ++i, v += step) { + long double x = *reinterpret_cast(&v); + ASSERT_MPFR_MATCH(mpfr::Operation::Sqrt, x, __llvm_libc::sqrtl(x), 0.5); + } +} + +TEST(SqrtlTest, InLongDoubleRange) { + constexpr UIntType count = 10'000'001; + constexpr UIntType step = UIntType(-1) / count; + for (UIntType i = 0, v = 0; i <= count; ++i, v += step) { + long double x = *reinterpret_cast(&v); + if (isnan(x) || (x < 0)) { + continue; + } + + ASSERT_MPFR_MATCH(mpfr::Operation::Sqrt, x, __llvm_libc::sqrtl(x), 0.5); + } +} diff --git a/libc/utils/FPUtil/Sqrt.h b/libc/utils/FPUtil/Sqrt.h new file mode 100644 index 0000000000000..a12cc42fa3408 --- /dev/null +++ b/libc/utils/FPUtil/Sqrt.h @@ -0,0 +1,186 @@ +//===-- Square root of IEEE 754 floating point numbers ----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_UTILS_FPUTIL_SQRT_H +#define LLVM_LIBC_UTILS_FPUTIL_SQRT_H + +#include "FPBits.h" + +#include "utils/CPP/TypeTraits.h" + +namespace __llvm_libc { +namespace fputil { + +namespace internal { + +template +static inline void normalize(int &exponent, + typename FPBits::UIntType &mantissa); + +template <> inline void normalize(int &exponent, uint32_t &mantissa) { + // Use binary search to shift the leading 1 bit. 
+ // With MantissaWidth = 23, it will take + // ceil(log2(23)) = 5 steps checking the mantissa bits as followed: + // Step 1: 0000 0000 0000 XXXX XXXX XXXX + // Step 2: 0000 00XX XXXX XXXX XXXX XXXX + // Step 3: 000X XXXX XXXX XXXX XXXX XXXX + // Step 4: 00XX XXXX XXXX XXXX XXXX XXXX + // Step 5: 0XXX XXXX XXXX XXXX XXXX XXXX + constexpr int nsteps = 5; // = ceil(log2(MantissaWidth)) + constexpr uint32_t bounds[nsteps] = {1 << 12, 1 << 18, 1 << 21, 1 << 22, + 1 << 23}; + constexpr int shifts[nsteps] = {12, 6, 3, 2, 1}; + + for (int i = 0; i < nsteps; ++i) { + if (mantissa < bounds[i]) { + exponent -= shifts[i]; + mantissa <<= shifts[i]; + } + } +} + +template <> inline void normalize(int &exponent, uint64_t &mantissa) { + // Use binary search to shift the leading 1 bit similar to float. + // With MantissaWidth = 52, it will take + // ceil(log2(52)) = 6 steps checking the mantissa bits. + constexpr int nsteps = 6; // = ceil(log2(MantissaWidth)) + constexpr uint64_t bounds[nsteps] = {1ULL << 26, 1ULL << 39, 1ULL << 46, + 1ULL << 49, 1ULL << 51, 1ULL << 52}; + constexpr int shifts[nsteps] = {27, 14, 7, 4, 2, 1}; + + for (int i = 0; i < nsteps; ++i) { + if (mantissa < bounds[i]) { + exponent -= shifts[i]; + mantissa <<= shifts[i]; + } + } +} + +#if !(defined(__x86_64__) || defined(__i386__)) +template <> +inline void normalize(int &exponent, __uint128_t &mantissa) { + // Use binary search to shift the leading 1 bit similar to float. + // With MantissaWidth = 112, it will take + // ceil(log2(112)) = 7 steps checking the mantissa bits. 
+ constexpr int nsteps = 7; // = ceil(log2(MantissaWidth)) + constexpr __uint128_t bounds[nsteps] = { + __uint128_t(1) << 56, __uint128_t(1) << 84, __uint128_t(1) << 98, + __uint128_t(1) << 105, __uint128_t(1) << 109, __uint128_t(1) << 111, + __uint128_t(1) << 112}; + constexpr int shifts[nsteps] = {57, 29, 15, 8, 4, 2, 1}; + + for (int i = 0; i < nsteps; ++i) { + if (mantissa < bounds[i]) { + exponent -= shifts[i]; + mantissa <<= shifts[i]; + } + } +} +#endif + +} // namespace internal + +// Correctly rounded IEEE 754 SQRT with round to nearest, ties to even. +// Shift-and-add algorithm. +template ::Value, int> = 0> +static inline T sqrt(T x) { + using UIntType = typename FPBits::UIntType; + constexpr UIntType One = UIntType(1) << MantissaWidth::value; + + FPBits bits(x); + + if (bits.isInfOrNaN()) { + if (bits.sign && (bits.mantissa == 0)) { + // sqrt(-Inf) = NaN + return FPBits::buildNaN(One >> 1); + } else { + // sqrt(NaN) = NaN + // sqrt(+Inf) = +Inf + return x; + } + } else if (bits.isZero()) { + // sqrt(+0) = +0 + // sqrt(-0) = -0 + return x; + } else if (bits.sign) { + // sqrt( negative numbers ) = NaN + return FPBits::buildNaN(One >> 1); + } else { + int xExp = bits.getExponent(); + UIntType xMant = bits.mantissa; + + // Step 1a: Normalize denormal input and append hiddent bit to the mantissa + if (bits.exponent == 0) { + ++xExp; // let xExp be the correct exponent of One bit. + internal::normalize(xExp, xMant); + } else { + xMant |= One; + } + + // Step 1b: Make sure the exponent is even. + if (xExp & 1) { + --xExp; + xMant <<= 1; + } + + // After step 1b, x = 2^(xExp) * xMant, where xExp is even, and + // 1 <= xMant < 4. So sqrt(x) = 2^(xExp / 2) * y, with 1 <= y < 2. + // Notice that the output of sqrt is always in the normal range. + // To perform shift-and-add algorithm to find y, let denote: + // y(n) = 1.y_1 y_2 ... y_n, we can define the nth residue to be: + // r(n) = 2^n ( xMant - y(n)^2 ). 
+ // That leads to the following recurrence formula: + // r(n) = 2*r(n-1) - y_n*[ 2*y(n-1) + 2^(-n-1) ] + // with the initial conditions: y(0) = 1, and r(0) = x - 1. + // So the nth digit y_n of the mantissa of sqrt(x) can be found by: + // y_n = 1 if 2*r(n-1) >= 2*y(n - 1) + 2^(-n-1) + // 0 otherwise. + UIntType y = One; + UIntType r = xMant - One; + + for (UIntType current_bit = One >> 1; current_bit; current_bit >>= 1) { + r <<= 1; + UIntType tmp = (y << 1) + current_bit; // 2*y(n - 1) + 2^(-n-1) + if (r >= tmp) { + r -= tmp; + y += current_bit; + } + } + + // We compute one more iteration in order to round correctly. + bool lsb = y & 1; // Least significant bit + bool rb = false; // Round bit + r <<= 2; + UIntType tmp = (y << 2) + 1; + if (r >= tmp) { + r -= tmp; + rb = true; + } + + // Remove hidden bit and append the exponent field. + xExp = ((xExp >> 1) + FPBits::exponentBias); + + y = (y - One) | (static_cast(xExp) << MantissaWidth::value); + // Round to nearest, ties to even + if (rb && (lsb || (r != 0))) { + ++y; + } + + return *reinterpret_cast(&y); + } +} + +} // namespace fputil +} // namespace __llvm_libc + +#if (defined(__x86_64__) || defined(__i386__)) +#include "SqrtLongDoubleX86.h" +#endif // defined(__x86_64__) || defined(__i386__) + +#endif // LLVM_LIBC_UTILS_FPUTIL_SQRT_H diff --git a/libc/utils/FPUtil/SqrtLongDoubleX86.h b/libc/utils/FPUtil/SqrtLongDoubleX86.h new file mode 100644 index 0000000000000..2ac73044cf92f --- /dev/null +++ b/libc/utils/FPUtil/SqrtLongDoubleX86.h @@ -0,0 +1,142 @@ +//===-- Square root of x86 long double numbers ------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_UTILS_FPUTIL_SQRT_LONG_DOUBLE_X86_H +#define LLVM_LIBC_UTILS_FPUTIL_SQRT_LONG_DOUBLE_X86_H + +#include "FPBits.h" +#include "utils/CPP/TypeTraits.h" + +namespace __llvm_libc { +namespace fputil { + +#if (defined(__x86_64__) || defined(__i386__)) +namespace internal { + +template <> +inline void normalize(int &exponent, __uint128_t &mantissa) { + // Use binary search to shift the leading 1 bit similar to float. + // With MantissaWidth = 63, it will take + // ceil(log2(63)) = 6 steps checking the mantissa bits. + constexpr int nsteps = 6; // = ceil(log2(MantissaWidth)) + constexpr __uint128_t bounds[nsteps] = { + __uint128_t(1) << 32, __uint128_t(1) << 48, __uint128_t(1) << 56, + __uint128_t(1) << 60, __uint128_t(1) << 62, __uint128_t(1) << 63}; + constexpr int shifts[nsteps] = {32, 16, 8, 4, 2, 1}; + + for (int i = 0; i < nsteps; ++i) { + if (mantissa < bounds[i]) { + exponent -= shifts[i]; + mantissa <<= shifts[i]; + } + } +} + +} // namespace internal + +// Correctly rounded SQRT with round to nearest, ties to even. +// Shift-and-add algorithm. 
+template <> inline long double sqrt(long double x) { + using UIntType = typename FPBits::UIntType; + constexpr UIntType One = UIntType(1) + << int(MantissaWidth::value); + + FPBits bits(x); + + if (bits.isInfOrNaN()) { + if (bits.sign && (bits.mantissa == 0)) { + // sqrt(-Inf) = NaN + return FPBits::buildNaN(One >> 1); + } else { + // sqrt(NaN) = NaN + // sqrt(+Inf) = +Inf + return x; + } + } else if (bits.isZero()) { + // sqrt(+0) = +0 + // sqrt(-0) = -0 + return x; + } else if (bits.sign) { + // sqrt( negative numbers ) = NaN + return FPBits::buildNaN(One >> 1); + } else { + int xExp = bits.getExponent(); + UIntType xMant = bits.mantissa; + + // Step 1a: Normalize denormal input + if (bits.implicitBit) { + xMant |= One; + } else if (bits.exponent == 0) { + internal::normalize(xExp, xMant); + } + + // Step 1b: Make sure the exponent is even. + if (xExp & 1) { + --xExp; + xMant <<= 1; + } + + // After step 1b, x = 2^(xExp) * xMant, where xExp is even, and + // 1 <= xMant < 4. So sqrt(x) = 2^(xExp / 2) * y, with 1 <= y < 2. + // Notice that the output of sqrt is always in the normal range. + // To perform shift-and-add algorithm to find y, let denote: + // y(n) = 1.y_1 y_2 ... y_n, we can define the nth residue to be: + // r(n) = 2^n ( xMant - y(n)^2 ). + // That leads to the following recurrence formula: + // r(n) = 2*r(n-1) - y_n*[ 2*y(n-1) + 2^(-n-1) ] + // with the initial conditions: y(0) = 1, and r(0) = x - 1. + // So the nth digit y_n of the mantissa of sqrt(x) can be found by: + // y_n = 1 if 2*r(n-1) >= 2*y(n - 1) + 2^(-n-1) + // 0 otherwise. + UIntType y = One; + UIntType r = xMant - One; + + for (UIntType current_bit = One >> 1; current_bit; current_bit >>= 1) { + r <<= 1; + UIntType tmp = (y << 1) + current_bit; // 2*y(n - 1) + 2^(-n-1) + if (r >= tmp) { + r -= tmp; + y += current_bit; + } + } + + // We compute one more iteration in order to round correctly. 
+ bool lsb = y & 1; // Least significant bit + bool rb = false; // Round bit + r <<= 2; + UIntType tmp = (y << 2) + 1; + if (r >= tmp) { + r -= tmp; + rb = true; + } + + // Append the exponent field. + xExp = ((xExp >> 1) + FPBits::exponentBias); + y |= (static_cast(xExp) + << (MantissaWidth::value + 1)); + + // Round to nearest, ties to even + if (rb && (lsb || (r != 0))) { + ++y; + } + + // Extract output + FPBits out(0.0L); + out.exponent = xExp; + out.implicitBit = 1; + out.mantissa = (y & (One - 1)); + + return out; + } +} +#endif // defined(__x86_64__) || defined(__i386__) + +} // namespace fputil +} // namespace __llvm_libc + +#endif // LLVM_LIBC_UTILS_FPUTIL_SQRT_LONG_DOUBLE_X86_H diff --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp index a3abfce08bf34..a121234e62246 100644 --- a/libc/utils/MPFRWrapper/MPFRUtils.cpp +++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp @@ -14,6 +14,7 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" +#include #include #include #include @@ -65,50 +66,94 @@ class MPFRNumber { mpfr_set_sj(value, x, MPFR_RNDN); } - template ::Value, int> = 0> - MPFRNumber(Operation op, XType rawValue) { - mpfr_init2(value, mpfrPrecision); - MPFRNumber mpfrInput(rawValue); - switch (op) { - case Operation::Abs: - mpfr_abs(value, mpfrInput.value, MPFR_RNDN); - break; - case Operation::Ceil: - mpfr_ceil(value, mpfrInput.value); - break; - case Operation::Cos: - mpfr_cos(value, mpfrInput.value, MPFR_RNDN); - break; - case Operation::Exp: - mpfr_exp(value, mpfrInput.value, MPFR_RNDN); - break; - case Operation::Exp2: - mpfr_exp2(value, mpfrInput.value, MPFR_RNDN); - break; - case Operation::Floor: - mpfr_floor(value, mpfrInput.value); - break; - case Operation::Round: - mpfr_round(value, mpfrInput.value); - break; - case Operation::Sin: - mpfr_sin(value, mpfrInput.value, MPFR_RNDN); - break; - case Operation::Sqrt: - mpfr_sqrt(value, mpfrInput.value, MPFR_RNDN); - break; - case Operation::Trunc: - 
mpfr_trunc(value, mpfrInput.value); - break; - } - } - MPFRNumber(const MPFRNumber &other) { mpfr_set(value, other.value, MPFR_RNDN); } - ~MPFRNumber() { mpfr_clear(value); } + ~MPFRNumber() { + mpfr_clear(value); + } + + MPFRNumber &operator=(const MPFRNumber &rhs) { + mpfr_set(value, rhs.value, MPFR_RNDN); + return *this; + } + + MPFRNumber abs() const { + MPFRNumber result; + mpfr_abs(result.value, value, MPFR_RNDN); + return result; + } + + MPFRNumber ceil() const { + MPFRNumber result; + mpfr_ceil(result.value, value); + return result; + } + + MPFRNumber cos() const { + MPFRNumber result; + mpfr_cos(result.value, value, MPFR_RNDN); + return result; + } + + MPFRNumber exp() const { + MPFRNumber result; + mpfr_exp(result.value, value, MPFR_RNDN); + return result; + } + + MPFRNumber exp2() const { + MPFRNumber result; + mpfr_exp2(result.value, value, MPFR_RNDN); + return result; + } + + MPFRNumber floor() const { + MPFRNumber result; + mpfr_floor(result.value, value); + return result; + } + + MPFRNumber frexp(int &exp) { + MPFRNumber result; + mpfr_exp_t resultExp; + mpfr_frexp(&resultExp, result.value, value, MPFR_RNDN); + exp = resultExp; + return result; + } + + MPFRNumber remquo(const MPFRNumber &divisor, int &quotient) { + MPFRNumber remainder; + long q; + mpfr_remquo(remainder.value, &q, value, divisor.value, MPFR_RNDN); + quotient = q; + return remainder; + } + + MPFRNumber round() const { + MPFRNumber result; + mpfr_round(result.value, value); + return result; + } + + MPFRNumber sin() const { + MPFRNumber result; + mpfr_sin(result.value, value, MPFR_RNDN); + return result; + } + + MPFRNumber sqrt() const { + MPFRNumber result; + mpfr_sqrt(result.value, value, MPFR_RNDN); + return result; + } + + MPFRNumber trunc() const { + MPFRNumber result; + mpfr_trunc(result.value, value); + return result; + } std::string str() const { // 200 bytes should be more than sufficient to hold a 100-digit number @@ -179,10 +224,65 @@ class MPFRNumber { namespace internal { 
+template +cpp::EnableIfType::Value, MPFRNumber> +unaryOperation(Operation op, InputType input) { + MPFRNumber mpfrInput(input); + switch (op) { + case Operation::Abs: + return mpfrInput.abs(); + case Operation::Ceil: + return mpfrInput.ceil(); + case Operation::Cos: + return mpfrInput.cos(); + case Operation::Exp: + return mpfrInput.exp(); + case Operation::Exp2: + return mpfrInput.exp2(); + case Operation::Floor: + return mpfrInput.floor(); + case Operation::Round: + return mpfrInput.round(); + case Operation::Sin: + return mpfrInput.sin(); + case Operation::Sqrt: + return mpfrInput.sqrt(); + case Operation::Trunc: + return mpfrInput.trunc(); + default: + __builtin_unreachable(); + } +} + +template +cpp::EnableIfType::Value, MPFRNumber> +unaryOperationTwoOutputs(Operation op, InputType input, int &output) { + MPFRNumber mpfrInput(input); + switch (op) { + case Operation::Frexp: + return mpfrInput.frexp(output); + default: + __builtin_unreachable(); + } +} + +template +cpp::EnableIfType::Value, MPFRNumber> +binaryOperationTwoOutputs(Operation op, InputType x, InputType y, int &output) { + MPFRNumber inputX(x), inputY(y); + switch (op) { + case Operation::RemQuo: + return inputX.remquo(inputY, output); + default: + __builtin_unreachable(); + } +} + template -void MPFRMatcher::explainError(testutils::StreamWrapper &OS) { - MPFRNumber mpfrResult(operation, input); +void explainUnaryOperationSingleOutputError(Operation op, T input, T matchValue, + testutils::StreamWrapper &OS) { MPFRNumber mpfrInput(input); + MPFRNumber mpfrResult = unaryOperation(op, input); MPFRNumber mpfrMatchValue(matchValue); FPBits inputBits(input); FPBits matchBits(matchValue); @@ -201,25 +301,174 @@ void MPFRMatcher::explainError(testutils::StreamWrapper &OS) { << '\n'; } -template void MPFRMatcher::explainError(testutils::StreamWrapper &); -template void MPFRMatcher::explainError(testutils::StreamWrapper &); template void -MPFRMatcher::explainError(testutils::StreamWrapper &); 
+explainUnaryOperationSingleOutputError(Operation op, float, float, + testutils::StreamWrapper &); +template void +explainUnaryOperationSingleOutputError(Operation op, double, double, + testutils::StreamWrapper &); +template void explainUnaryOperationSingleOutputError( + Operation op, long double, long double, testutils::StreamWrapper &); + +template +void explainUnaryOperationTwoOutputsError(Operation op, T input, + const BinaryOutput &libcResult, + testutils::StreamWrapper &OS) { + MPFRNumber mpfrInput(input); + FPBits inputBits(input); + int mpfrIntResult; + MPFRNumber mpfrResult = unaryOperationTwoOutputs(op, input, mpfrIntResult); + + if (mpfrIntResult != libcResult.i) { + OS << "MPFR integral result: " << mpfrIntResult << '\n' + << "Libc integral result: " << libcResult.i << '\n'; + } else { + OS << "Integral result from libc matches integral result from MPFR.\n"; + } + + MPFRNumber mpfrMatchValue(libcResult.f); + OS << "Libc floating point result is not within tolerance value of the MPFR " + << "result.\n\n"; + + OS << " Input decimal: " << mpfrInput.str() << "\n\n"; + + OS << "Libc floating point value: " << mpfrMatchValue.str() << '\n'; + __llvm_libc::fputil::testing::describeValue( + " Libc floating point bits: ", libcResult.f, OS); + OS << "\n\n"; + + OS << " MPFR result: " << mpfrResult.str() << '\n'; + __llvm_libc::fputil::testing::describeValue( + " MPFR rounded: ", mpfrResult.as(), OS); + OS << '\n' + << " ULP error: " + << std::to_string(mpfrResult.ulp(libcResult.f)) << '\n'; +} + +template void explainUnaryOperationTwoOutputsError( + Operation, float, const BinaryOutput &, testutils::StreamWrapper &); +template void +explainUnaryOperationTwoOutputsError(Operation, double, + const BinaryOutput &, + testutils::StreamWrapper &); +template void explainUnaryOperationTwoOutputsError( + Operation, long double, const BinaryOutput &, + testutils::StreamWrapper &); + +template +void explainBinaryOperationTwoOutputsError(Operation op, + const BinaryInput 
&input, + const BinaryOutput &libcResult, + testutils::StreamWrapper &OS) { + MPFRNumber mpfrX(input.x); + MPFRNumber mpfrY(input.y); + FPBits xbits(input.x); + FPBits ybits(input.y); + int mpfrIntResult; + MPFRNumber mpfrResult = + binaryOperationTwoOutputs(op, input.x, input.y, mpfrIntResult); + MPFRNumber mpfrMatchValue(libcResult.f); + + OS << "Input decimal: x: " << mpfrX.str() << " y: " << mpfrY.str() << '\n' + << "MPFR integral result: " << mpfrIntResult << '\n' + << "Libc integral result: " << libcResult.i << '\n' + << "Libc floating point result: " << mpfrMatchValue.str() << '\n' + << " MPFR result: " << mpfrResult.str() << '\n'; + __llvm_libc::fputil::testing::describeValue( + "Libc floating point result bits: ", libcResult.f, OS); + __llvm_libc::fputil::testing::describeValue( + " MPFR rounded bits: ", mpfrResult.as(), OS); + OS << "ULP error: " << std::to_string(mpfrResult.ulp(libcResult.f)) << '\n'; +} + +template void explainBinaryOperationTwoOutputsError( + Operation, const BinaryInput &, const BinaryOutput &, + testutils::StreamWrapper &); +template void explainBinaryOperationTwoOutputsError( + Operation, const BinaryInput &, const BinaryOutput &, + testutils::StreamWrapper &); +template void explainBinaryOperationTwoOutputsError( + Operation, const BinaryInput &, + const BinaryOutput &, testutils::StreamWrapper &); template -bool compare(Operation op, T input, T libcResult, double ulpError) { +bool compareUnaryOperationSingleOutput(Operation op, T input, T libcResult, + double ulpError) { // If the ulp error is exactly 0.5 (i.e a tie), we would check that the result // is rounded to the nearest even. 
- MPFRNumber mpfrResult(op, input); + MPFRNumber mpfrResult = unaryOperation(op, input); double ulp = mpfrResult.ulp(libcResult); bool bitsAreEven = ((FPBits(libcResult).bitsAsUInt() & 1) == 0); return (ulp < ulpError) || ((ulp == ulpError) && ((ulp != 0.5) || bitsAreEven)); } -template bool compare(Operation, float, float, double); -template bool compare(Operation, double, double, double); -template bool compare(Operation, long double, long double, double); +template bool compareUnaryOperationSingleOutput(Operation, float, float, + double); +template bool compareUnaryOperationSingleOutput(Operation, double, + double, double); +template bool compareUnaryOperationSingleOutput(Operation, + long double, + long double, + double); + +template +bool compareUnaryOperationTwoOutputs(Operation op, T input, + const BinaryOutput &libcResult, + double ulpError) { + int mpfrIntResult; + MPFRNumber mpfrResult = unaryOperationTwoOutputs(op, input, mpfrIntResult); + double ulp = mpfrResult.ulp(libcResult.f); + + if (mpfrIntResult != libcResult.i) + return false; + + bool bitsAreEven = ((FPBits(libcResult.f).bitsAsUInt() & 1) == 0); + return (ulp < ulpError) || + ((ulp == ulpError) && ((ulp != 0.5) || bitsAreEven)); +} + +template bool +compareUnaryOperationTwoOutputs(Operation, float, + const BinaryOutput &, double); +template bool +compareUnaryOperationTwoOutputs(Operation, double, + const BinaryOutput &, double); +template bool compareUnaryOperationTwoOutputs( + Operation, long double, const BinaryOutput &, double); + +template +bool compareBinaryOperationTwoOutputs(Operation op, const BinaryInput &input, + const BinaryOutput &libcResult, + double ulpError) { + int mpfrIntResult; + MPFRNumber mpfrResult = + binaryOperationTwoOutputs(op, input.x, input.y, mpfrIntResult); + double ulp = mpfrResult.ulp(libcResult.f); + + if (mpfrIntResult != libcResult.i) { + if (op == Operation::RemQuo) { + if ((0x7 & mpfrIntResult) != (0x7 & libcResult.i)) + return false; + } else { + return 
false; + } + } + + bool bitsAreEven = ((FPBits(libcResult.f).bitsAsUInt() & 1) == 0); + return (ulp < ulpError) || + ((ulp == ulpError) && ((ulp != 0.5) || bitsAreEven)); +} + +template bool +compareBinaryOperationTwoOutputs(Operation, const BinaryInput &, + const BinaryOutput &, double); +template bool +compareBinaryOperationTwoOutputs(Operation, const BinaryInput &, + const BinaryOutput &, double); +template bool compareBinaryOperationTwoOutputs( + Operation, const BinaryInput &, + const BinaryOutput &, double); } // namespace internal diff --git a/libc/utils/MPFRWrapper/MPFRUtils.h b/libc/utils/MPFRWrapper/MPFRUtils.h index 3d94079e65d81..b46f09dd5e558 100644 --- a/libc/utils/MPFRWrapper/MPFRUtils.h +++ b/libc/utils/MPFRWrapper/MPFRUtils.h @@ -19,6 +19,10 @@ namespace testing { namespace mpfr { enum class Operation : int { + // Operations which take a single floating point number as input + // and produce a single floating point number as output. The input + // and output floating point numbers are of the same kind. + BeginUnaryOperationsSingleOutput, Abs, Ceil, Cos, @@ -28,45 +32,193 @@ enum class Operation : int { Round, Sin, Sqrt, - Trunc + Trunc, + EndUnaryOperationsSingleOutput, + + // Operations which take a single floating point number as input + // but produce two outputs. The first output is a floating point + // number of the same type as the input. The second output is of type + // 'int'. + BeginUnaryOperationsTwoOutputs, + Frexp, // Floating point output, the first output, is the fractional part. + EndUnaryOperationsTwoOutputs, + + // Operations which take two floating point numbers of the same type as + // input and produce a single floating point number of the same type as + // output. + BeginBinaryOperationsSingleOutput, + // TODO: Add operations like hypot. + EndBinaryOperationsSingleOutput, + + // Operations which take two floating point numbers of the same type as + // input and produce two outputs. 
The first output is a floating number of + the same type as the inputs. The second output is of type 'int'. + BeginBinaryOperationsTwoOutputs, + RemQuo, // The first output, the floating point output, is the remainder. + EndBinaryOperationsTwoOutputs, + + BeginTernaryOperationsSingleOuput, + // TODO: Add operations like fma. + EndTernaryOperationsSingleOutput, +}; + +template struct BinaryInput { + static_assert( + __llvm_libc::cpp::IsFloatingPointType::Value, + "Template parameter of BinaryInput must be a floating point type."); + + using Type = T; + T x, y; +}; + +template struct TernaryInput { + static_assert( + __llvm_libc::cpp::IsFloatingPointType::Value, + "Template parameter of TernaryInput must be a floating point type."); + + using Type = T; + T x, y, z; +}; + +template struct BinaryOutput { + T f; + int i; }; namespace internal { +template +struct AreMatchingBinaryInputAndBinaryOutput { + static constexpr bool value = false; +}; + template -bool compare(Operation op, T input, T libcOutput, double t); +struct AreMatchingBinaryInputAndBinaryOutput, BinaryOutput> { + static constexpr bool value = cpp::IsFloatingPointType::Value; +}; -template class MPFRMatcher : public testing::Matcher { - static_assert(__llvm_libc::cpp::IsFloatingPointType::Value, - "MPFRMatcher can only be used with floating point values."); +template +bool compareUnaryOperationSingleOutput(Operation op, T input, T libcOutput, + double t); +template +bool compareUnaryOperationTwoOutputs(Operation op, T input, + const BinaryOutput &libcOutput, + double t); +template +bool compareBinaryOperationTwoOutputs(Operation op, const BinaryInput &input, + const BinaryOutput &libcOutput, + double t); - Operation operation; - T input; - T matchValue; +template +void explainUnaryOperationSingleOutputError(Operation op, T input, T matchValue, + testutils::StreamWrapper &OS); +template +void explainUnaryOperationTwoOutputsError(Operation op, T input, + const BinaryOutput &matchValue, + 
testutils::StreamWrapper &OS); +template +void explainBinaryOperationTwoOutputsError(Operation op, + const BinaryInput &input, + const BinaryOutput &matchValue, + testutils::StreamWrapper &OS); + +template +class MPFRMatcher : public testing::Matcher { + InputType input; + OutputType matchValue; double ulpTolerance; public: - MPFRMatcher(Operation op, T testInput, double ulpTolerance) - : operation(op), input(testInput), ulpTolerance(ulpTolerance) {} + MPFRMatcher(InputType testInput, double ulpTolerance) + : input(testInput), ulpTolerance(ulpTolerance) {} - bool match(T libcResult) { + bool match(OutputType libcResult) { matchValue = libcResult; - return internal::compare(operation, input, libcResult, ulpTolerance); + return match(input, matchValue, ulpTolerance); } - void explainError(testutils::StreamWrapper &OS) override; + void explainError(testutils::StreamWrapper &OS) override { + explainError(input, matchValue, OS); + } + +private: + template static bool match(T in, T out, double tolerance) { + return compareUnaryOperationSingleOutput(op, in, out, tolerance); + } + + template + static bool match(T in, const BinaryOutput &out, double tolerance) { + return compareUnaryOperationTwoOutputs(op, in, out, tolerance); + } + + template + static bool match(const BinaryInput &in, T out, double tolerance) { + // TODO: Implement the comparison function and error reporter. + } + + template + static bool match(BinaryInput in, const BinaryOutput &out, + double tolerance) { + return compareBinaryOperationTwoOutputs(op, in, out, tolerance); + } + + template + static bool match(const TernaryInput &in, T out, double tolerance) { + // TODO: Implement the comparison function and error reporter. 
+ } + + template + static void explainError(T in, T out, testutils::StreamWrapper &OS) { + explainUnaryOperationSingleOutputError(op, in, out, OS); + } + + template + static void explainError(T in, const BinaryOutput &out, + testutils::StreamWrapper &OS) { + explainUnaryOperationTwoOutputsError(op, in, out, OS); + } + + template + static void explainError(const BinaryInput &in, const BinaryOutput &out, + testutils::StreamWrapper &OS) { + explainBinaryOperationTwoOutputsError(op, in, out, OS); + } }; } // namespace internal -template +// Return true if the input and output types for the operation op are valid +// types. +template +constexpr bool isValidOperation() { + return (Operation::BeginUnaryOperationsSingleOutput < op && + op < Operation::EndUnaryOperationsSingleOutput && + cpp::IsSame::Value && + cpp::IsFloatingPointType::Value) || + (Operation::BeginUnaryOperationsTwoOutputs < op && + op < Operation::EndUnaryOperationsTwoOutputs && + cpp::IsFloatingPointType::Value && + cpp::IsSame>::Value) || + (Operation::BeginBinaryOperationsSingleOutput < op && + op < Operation::EndBinaryOperationsSingleOutput && + cpp::IsFloatingPointType::Value && + cpp::IsSame>::Value) || + (Operation::BeginBinaryOperationsTwoOutputs < op && + op < Operation::EndBinaryOperationsTwoOutputs && + internal::AreMatchingBinaryInputAndBinaryOutput::value) || + (Operation::BeginTernaryOperationsSingleOuput < op && + op < Operation::EndTernaryOperationsSingleOutput && + cpp::IsFloatingPointType::Value && + cpp::IsSame>::Value); +} + +template +__attribute__((no_sanitize("address"))) -typename cpp::EnableIfType, internal::MPFRMatcher> -getMPFRMatcher(Operation op, T input, U t) { - static_assert( - __llvm_libc::cpp::IsFloatingPointType::Value, - "getMPFRMatcher can only be used to match floating point results."); - return internal::MPFRMatcher(op, input, t); +cpp::EnableIfType(), + internal::MPFRMatcher> +getMPFRMatcher(InputType input, OutputType outputUnused, double t) { + return 
internal::MPFRMatcher(input, t); } } // namespace mpfr @@ -74,11 +226,11 @@ getMPFRMatcher(Operation op, T input, U t) { } // namespace __llvm_libc #define EXPECT_MPFR_MATCH(op, input, matchValue, tolerance) \ - EXPECT_THAT(matchValue, __llvm_libc::testing::mpfr::getMPFRMatcher( \ - op, input, tolerance)) + EXPECT_THAT(matchValue, __llvm_libc::testing::mpfr::getMPFRMatcher( \ + input, matchValue, tolerance)) #define ASSERT_MPFR_MATCH(op, input, matchValue, tolerance) \ - ASSERT_THAT(matchValue, __llvm_libc::testing::mpfr::getMPFRMatcher( \ - op, input, tolerance)) + ASSERT_THAT(matchValue, __llvm_libc::testing::mpfr::getMPFRMatcher( \ + input, matchValue, tolerance)) #endif // LLVM_LIBC_UTILS_TESTUTILS_MPFRUTILS_H diff --git a/libcxx/include/__threading_support b/libcxx/include/__threading_support index 072c4c7bcc899..6501217c2741a 100644 --- a/libcxx/include/__threading_support +++ b/libcxx/include/__threading_support @@ -278,24 +278,21 @@ int __libcpp_tls_set(__libcpp_tls_key __key, void *__p); #endif // !defined(_LIBCPP_HAS_THREAD_API_EXTERNAL) struct __libcpp_timed_backoff_policy { - _LIBCPP_THREAD_ABI_VISIBILITY - bool operator()(chrono::nanoseconds __elapsed) const; + _LIBCPP_INLINE_VISIBILITY + bool operator()(chrono::nanoseconds __elapsed) const + { + if(__elapsed > chrono::milliseconds(128)) + __libcpp_thread_sleep_for(chrono::milliseconds(8)); + else if(__elapsed > chrono::microseconds(64)) + __libcpp_thread_sleep_for(__elapsed / 2); + else if(__elapsed > chrono::microseconds(4)) + __libcpp_thread_yield(); + else + ; // poll + return false; + } }; -inline _LIBCPP_INLINE_VISIBILITY -bool __libcpp_timed_backoff_policy::operator()(chrono::nanoseconds __elapsed) const -{ - if(__elapsed > chrono::milliseconds(128)) - __libcpp_thread_sleep_for(chrono::milliseconds(8)); - else if(__elapsed > chrono::microseconds(64)) - __libcpp_thread_sleep_for(__elapsed / 2); - else if(__elapsed > chrono::microseconds(4)) - __libcpp_thread_yield(); - else - ; // poll - return 
false; -} - static _LIBCPP_CONSTEXPR const int __libcpp_polling_count = 64; template diff --git a/libcxx/test/libcxx/utilities/memory/util.smartptr/util.smartptr.shared/function_type_default_deleter.fail.cpp b/libcxx/test/libcxx/utilities/memory/util.smartptr/util.smartptr.shared/function_type_default_deleter.fail.cpp index b8c9c72c93a15..5dea3cb7cc175 100644 --- a/libcxx/test/libcxx/utilities/memory/util.smartptr/util.smartptr.shared/function_type_default_deleter.fail.cpp +++ b/libcxx/test/libcxx/utilities/memory/util.smartptr/util.smartptr.shared/function_type_default_deleter.fail.cpp @@ -37,6 +37,8 @@ int main(int, char**) { SPtr<3> s3(nullptr, Deleter{}); // OK } // expected-error-re@memory:* 2 {{static_assert failed{{.*}} "default_delete cannot be instantiated for function types"}} + // FIXME: suppress this bogus diagnostic, see https://reviews.llvm.org/D86685. + // expected-error@memory:* 0+ {{no member named 'value' in}} { SPtr<4> s4(getFn<4>()); // expected-note {{requested here}} SPtr<5> s5(getFn<5>(), std::default_delete>{}); // expected-note {{requested here}} diff --git a/libcxx/utils/ci/macos-backdeployment.sh b/libcxx/utils/ci/macos-backdeployment.sh index b511faf30c9cb..24b866cdc1aef 100755 --- a/libcxx/utils/ci/macos-backdeployment.sh +++ b/libcxx/utils/ci/macos-backdeployment.sh @@ -105,7 +105,7 @@ echo "@@@@@@" echo "@@@ Building and installing libc++ and libc++abi @@@" -ninja -C "${LLVM_BUILD_DIR}" install-cxx install-cxxabi +xcrun ninja -C "${LLVM_BUILD_DIR}" install-cxx install-cxxabi echo "@@@@@@" diff --git a/libcxx/utils/ci/macos-trunk.sh b/libcxx/utils/ci/macos-trunk.sh index c9c881cf730ef..f43a93e555cfd 100755 --- a/libcxx/utils/ci/macos-trunk.sh +++ b/libcxx/utils/ci/macos-trunk.sh @@ -109,18 +109,18 @@ echo "@@@@@@" echo "@@@ Building libc++.dylib and libc++abi.dylib from sources (just to make sure it works) @@@" -ninja -C "${LLVM_BUILD_DIR}" install-cxx install-cxxabi -v +xcrun ninja -C "${LLVM_BUILD_DIR}" install-cxx install-cxxabi -v 
echo "@@@@@@" echo "@@@ Running tests for libc++ @@@" # TODO: We should run check-cxx-abilist too -ninja -C "${LLVM_BUILD_DIR}" check-cxx +xcrun ninja -C "${LLVM_BUILD_DIR}" check-cxx echo "@@@@@@" echo "@@@ Running tests for libc++abi @@@" -ninja -C "${LLVM_BUILD_DIR}" check-cxxabi +xcrun ninja -C "${LLVM_BUILD_DIR}" check-cxxabi echo "@@@@@@" diff --git a/libcxx/utils/docker/debian9/buildbot/install-packages.sh b/libcxx/utils/docker/debian9/buildbot/install-packages.sh index 82d1463164420..56e7c00d49301 100755 --- a/libcxx/utils/docker/debian9/buildbot/install-packages.sh +++ b/libcxx/utils/docker/debian9/buildbot/install-packages.sh @@ -36,5 +36,5 @@ apt-get update && \ # Install a recent CMake yes | apt-get purge cmake -wget https://github.com/Kitware/CMake/releases/download/v3.15.2/cmake-3.15.2-Linux-x86_64.sh -O /tmp/install-cmake.sh +wget https://github.com/Kitware/CMake/releases/download/v3.18.2/cmake-3.18.2-Linux-x86_64.sh -O /tmp/install-cmake.sh bash /tmp/install-cmake.sh --prefix=/usr --exclude-subdir --skip-license diff --git a/libcxxabi/test/test_demangle.pass.cpp b/libcxxabi/test/test_demangle.pass.cpp index 236c1d96b7266..fd84d3bca1a17 100644 --- a/libcxxabi/test/test_demangle.pass.cpp +++ b/libcxxabi/test/test_demangle.pass.cpp @@ -29842,7 +29842,7 @@ struct FPLiteralCase { #if LDBL_FP128 // This was found by libFuzzer+HWASan on aarch64 Android. 
{"1\006ILeeeEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE", - {"\x6<-0x1.cecececececececececececececep+11983"}}, + {"\x6<-0x1.cecececececececececececececep+11983L>"}}, #endif }; const unsigned NF = sizeof(fp_literal_cases) / sizeof(fp_literal_cases[0]); diff --git a/libunwind/src/AddressSpace.hpp b/libunwind/src/AddressSpace.hpp index b3949b2d27a42..e6f2609d679b9 100644 --- a/libunwind/src/AddressSpace.hpp +++ b/libunwind/src/AddressSpace.hpp @@ -17,6 +17,12 @@ #include #include +#include "libunwind.h" +#include "config.h" +#include "dwarf2.h" +#include "EHHeaderParser.hpp" +#include "Registers.hpp" + #ifndef _LIBUNWIND_USE_DLADDR #if !defined(_LIBUNWIND_IS_BAREMETAL) && !defined(_WIN32) #define _LIBUNWIND_USE_DLADDR 1 @@ -39,12 +45,6 @@ struct EHABIIndexEntry { }; #endif -#include "libunwind.h" -#include "config.h" -#include "dwarf2.h" -#include "EHHeaderParser.hpp" -#include "Registers.hpp" - #ifdef __APPLE__ struct dyld_unwind_sections @@ -414,8 +414,9 @@ struct _LIBUNWIND_HIDDEN dl_iterate_cb_data { #if defined(_LIBUNWIND_USE_FRAME_HEADER_CACHE) #include "FrameHeaderCache.hpp" -// There should be just one of these per process. -static FrameHeaderCache ProcessFrameHeaderCache; +// Typically there is one cache per process, but when libunwind is built as a +// hermetic static library, then each shared object may have its own cache. +static FrameHeaderCache TheFrameHeaderCache; #endif static bool checkAddrInSegment(const Elf_Phdr *phdr, size_t image_base, @@ -438,8 +439,11 @@ static int findUnwindSectionsByPhdr(struct dl_phdr_info *pinfo, if (pinfo->dlpi_phnum == 0 || cbdata->targetAddr < pinfo->dlpi_addr) return 0; #if defined(_LIBUNWIND_USE_FRAME_HEADER_CACHE) - if (ProcessFrameHeaderCache.find(pinfo, pinfo_size, data)) + if (TheFrameHeaderCache.find(pinfo, pinfo_size, data)) return 1; +#else + // Avoid warning about unused variable. 
+ (void)pinfo_size; #endif Elf_Addr image_base = calculateImageBase(pinfo); @@ -469,7 +473,7 @@ static int findUnwindSectionsByPhdr(struct dl_phdr_info *pinfo, } if (found_obj && found_hdr) { #if defined(_LIBUNWIND_USE_FRAME_HEADER_CACHE) - ProcessFrameHeaderCache.add(cbdata->sects); + TheFrameHeaderCache.add(cbdata->sects); #endif return 1; } diff --git a/libunwind/src/CMakeLists.txt b/libunwind/src/CMakeLists.txt index 8f79b1cf87409..09449178208be 100644 --- a/libunwind/src/CMakeLists.txt +++ b/libunwind/src/CMakeLists.txt @@ -24,14 +24,10 @@ set(LIBUNWIND_ASM_SOURCES UnwindRegistersRestore.S UnwindRegistersSave.S ) -if (MINGW OR APPLE) - # CMake doesn't build assembly sources for windows/gnu targets properly - # (up to current CMake, 3.16), so treat them as C files. - # Additionally, CMake ignores OSX_ARCHITECTURE for ASM files when targeting - # Apple platforms. - set_source_files_properties(${LIBUNWIND_ASM_SOURCES} - PROPERTIES - LANGUAGE C) + +# See add_asm_sources() in compiler-rt for explanation of this workaround. +if((APPLE AND CMAKE_VERSION VERSION_LESS 3.19) OR (MINGW AND CMAKE_VERSION VERSION_LESS 3.17)) + set_source_files_properties(${LIBUNWIND_ASM_SOURCES} PROPERTIES LANGUAGE C) endif() set(LIBUNWIND_HEADERS @@ -42,9 +38,12 @@ set(LIBUNWIND_HEADERS dwarf2.h DwarfInstructions.hpp DwarfParser.hpp + EHHeaderParser.hpp + FrameHeaderCache.hpp libunwind_ext.h Registers.hpp RWMutex.hpp + Unwind-EHABI.h UnwindCursor.hpp ../include/libunwind.h ../include/unwind.h diff --git a/libunwind/src/Registers.hpp b/libunwind/src/Registers.hpp index c76b05bf314ee..4d963b4156d1c 100644 --- a/libunwind/src/Registers.hpp +++ b/libunwind/src/Registers.hpp @@ -39,6 +39,8 @@ enum { }; #if defined(_LIBUNWIND_TARGET_I386) +class _LIBUNWIND_HIDDEN Registers_x86; +extern "C" void __libunwind_Registers_x86_jumpto(Registers_x86 *); /// Registers_x86 holds the register state of a thread in a 32-bit intel /// process. 
class _LIBUNWIND_HIDDEN Registers_x86 { @@ -56,7 +58,7 @@ class _LIBUNWIND_HIDDEN Registers_x86 { v128 getVectorRegister(int num) const; void setVectorRegister(int num, v128 value); static const char *getRegisterName(int num); - void jumpto(); + void jumpto() { __libunwind_Registers_x86_jumpto(this); } static int lastDwarfRegNum() { return _LIBUNWIND_HIGHEST_DWARF_REGISTER_X86; } static int getArch() { return REGISTERS_X86; } @@ -248,6 +250,8 @@ inline void Registers_x86::setVectorRegister(int, v128) { #if defined(_LIBUNWIND_TARGET_X86_64) /// Registers_x86_64 holds the register state of a thread in a 64-bit intel /// process. +class _LIBUNWIND_HIDDEN Registers_x86_64; +extern "C" void __libunwind_Registers_x86_64_jumpto(Registers_x86_64 *); class _LIBUNWIND_HIDDEN Registers_x86_64 { public: Registers_x86_64(); @@ -263,7 +267,7 @@ class _LIBUNWIND_HIDDEN Registers_x86_64 { v128 getVectorRegister(int num) const; void setVectorRegister(int num, v128 value); static const char *getRegisterName(int num); - void jumpto(); + void jumpto() { __libunwind_Registers_x86_64_jumpto(this); } static int lastDwarfRegNum() { return _LIBUNWIND_HIGHEST_DWARF_REGISTER_X86_64; } static int getArch() { return REGISTERS_X86_64; } @@ -1771,6 +1775,8 @@ inline const char *Registers_ppc64::getRegisterName(int regNum) { #if defined(_LIBUNWIND_TARGET_AARCH64) /// Registers_arm64 holds the register state of a thread in a 64-bit arm /// process. 
+class _LIBUNWIND_HIDDEN Registers_arm64; +extern "C" void __libunwind_Registers_arm64_jumpto(Registers_arm64 *); class _LIBUNWIND_HIDDEN Registers_arm64 { public: Registers_arm64(); @@ -1786,7 +1792,7 @@ class _LIBUNWIND_HIDDEN Registers_arm64 { v128 getVectorRegister(int num) const; void setVectorRegister(int num, v128 value); static const char *getRegisterName(int num); - void jumpto(); + void jumpto() { __libunwind_Registers_arm64_jumpto(this); } static int lastDwarfRegNum() { return _LIBUNWIND_HIGHEST_DWARF_REGISTER_ARM64; } static int getArch() { return REGISTERS_ARM64; } diff --git a/libunwind/src/UnwindCursor.hpp b/libunwind/src/UnwindCursor.hpp index 54913360ca297..e6a36764fc793 100644 --- a/libunwind/src/UnwindCursor.hpp +++ b/libunwind/src/UnwindCursor.hpp @@ -925,6 +925,9 @@ class UnwindCursor : public AbstractUnwindCursor{ #endif #if defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) + bool getInfoFromFdeCie(const typename CFI_Parser::FDE_Info &fdeInfo, + const typename CFI_Parser::CIE_Info &cieInfo, + pint_t pc, uintptr_t dso_base); bool getInfoFromDwarfSection(pint_t pc, const UnwindInfoSections §s, uint32_t fdeSectionOffsetHint=0); int stepWithDwarfFDE() { @@ -1476,6 +1479,32 @@ bool UnwindCursor::getInfoFromEHABISection( #endif #if defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) +template +bool UnwindCursor::getInfoFromFdeCie( + const typename CFI_Parser::FDE_Info &fdeInfo, + const typename CFI_Parser::CIE_Info &cieInfo, pint_t pc, + uintptr_t dso_base) { + typename CFI_Parser::PrologInfo prolog; + if (CFI_Parser::parseFDEInstructions(_addressSpace, fdeInfo, cieInfo, pc, + R::getArch(), &prolog)) { + // Save off parsed FDE info + _info.start_ip = fdeInfo.pcStart; + _info.end_ip = fdeInfo.pcEnd; + _info.lsda = fdeInfo.lsda; + _info.handler = cieInfo.personality; + // Some frameless functions need SP altered when resuming in function, so + // propagate spExtraArgSize. 
+ _info.gp = prolog.spExtraArgSize; + _info.flags = 0; + _info.format = dwarfEncoding(); + _info.unwind_info = fdeInfo.fdeStart; + _info.unwind_info_size = static_cast(fdeInfo.fdeLength); + _info.extra = static_cast(dso_base); + return true; + } + return false; +} + template bool UnwindCursor::getInfoFromDwarfSection(pint_t pc, const UnwindInfoSections §s, @@ -1516,21 +1545,7 @@ bool UnwindCursor::getInfoFromDwarfSection(pint_t pc, &fdeInfo, &cieInfo); } if (foundFDE) { - typename CFI_Parser::PrologInfo prolog; - if (CFI_Parser::parseFDEInstructions(_addressSpace, fdeInfo, cieInfo, pc, - R::getArch(), &prolog)) { - // Save off parsed FDE info - _info.start_ip = fdeInfo.pcStart; - _info.end_ip = fdeInfo.pcEnd; - _info.lsda = fdeInfo.lsda; - _info.handler = cieInfo.personality; - _info.gp = prolog.spExtraArgSize; - _info.flags = 0; - _info.format = dwarfEncoding(); - _info.unwind_info = fdeInfo.fdeStart; - _info.unwind_info_size = (uint32_t)fdeInfo.fdeLength; - _info.extra = (unw_word_t) sects.dso_base; - + if (getInfoFromFdeCie(fdeInfo, cieInfo, pc, sects.dso_base)) { // Add to cache (to make next lookup faster) if we had no hint // and there was no index. if (!foundInCache && (fdeSectionOffsetHint == 0)) { @@ -1932,58 +1947,24 @@ void UnwindCursor::setInfoBasedOnIPRegister(bool isReturnAddress) { // dynamically registered for it. 
pint_t cachedFDE = DwarfFDECache::findFDE(0, pc); if (cachedFDE != 0) { - CFI_Parser::FDE_Info fdeInfo; - CFI_Parser::CIE_Info cieInfo; - const char *msg = CFI_Parser::decodeFDE(_addressSpace, - cachedFDE, &fdeInfo, &cieInfo); - if (msg == NULL) { - typename CFI_Parser::PrologInfo prolog; - if (CFI_Parser::parseFDEInstructions(_addressSpace, fdeInfo, cieInfo, - pc, R::getArch(), &prolog)) { - // save off parsed FDE info - _info.start_ip = fdeInfo.pcStart; - _info.end_ip = fdeInfo.pcEnd; - _info.lsda = fdeInfo.lsda; - _info.handler = cieInfo.personality; - _info.gp = prolog.spExtraArgSize; - // Some frameless functions need SP - // altered when resuming in function. - _info.flags = 0; - _info.format = dwarfEncoding(); - _info.unwind_info = fdeInfo.fdeStart; - _info.unwind_info_size = (uint32_t)fdeInfo.fdeLength; - _info.extra = 0; + typename CFI_Parser::FDE_Info fdeInfo; + typename CFI_Parser::CIE_Info cieInfo; + if (!CFI_Parser::decodeFDE(_addressSpace, cachedFDE, &fdeInfo, &cieInfo)) + if (getInfoFromFdeCie(fdeInfo, cieInfo, pc, 0)) return; - } - } } // Lastly, ask AddressSpace object about platform specific ways to locate // other FDEs. pint_t fde; if (_addressSpace.findOtherFDE(pc, fde)) { - CFI_Parser::FDE_Info fdeInfo; - CFI_Parser::CIE_Info cieInfo; + typename CFI_Parser::FDE_Info fdeInfo; + typename CFI_Parser::CIE_Info cieInfo; if (!CFI_Parser::decodeFDE(_addressSpace, fde, &fdeInfo, &cieInfo)) { // Double check this FDE is for a function that includes the pc. 
- if ((fdeInfo.pcStart <= pc) && (pc < fdeInfo.pcEnd)) { - typename CFI_Parser::PrologInfo prolog; - if (CFI_Parser::parseFDEInstructions(_addressSpace, fdeInfo, cieInfo, - pc, R::getArch(), &prolog)) { - // save off parsed FDE info - _info.start_ip = fdeInfo.pcStart; - _info.end_ip = fdeInfo.pcEnd; - _info.lsda = fdeInfo.lsda; - _info.handler = cieInfo.personality; - _info.gp = prolog.spExtraArgSize; - _info.flags = 0; - _info.format = dwarfEncoding(); - _info.unwind_info = fdeInfo.fdeStart; - _info.unwind_info_size = (uint32_t)fdeInfo.fdeLength; - _info.extra = 0; + if ((fdeInfo.pcStart <= pc) && (pc < fdeInfo.pcEnd)) + if (getInfoFromFdeCie(fdeInfo, cieInfo, pc, 0)) return; - } - } } } #endif // #if defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) diff --git a/libunwind/src/UnwindRegistersRestore.S b/libunwind/src/UnwindRegistersRestore.S index 5d54432152860..d817e6dae9482 100644 --- a/libunwind/src/UnwindRegistersRestore.S +++ b/libunwind/src/UnwindRegistersRestore.S @@ -13,14 +13,10 @@ #if !defined(__USING_SJLJ_EXCEPTIONS__) #if defined(__i386__) -DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind13Registers_x866jumptoEv) +DEFINE_LIBUNWIND_FUNCTION(__libunwind_Registers_x86_jumpto) # -# void libunwind::Registers_x86::jumpto() +# extern "C" void __libunwind_Registers_x86_jumpto(Registers_x86 *); # -#if defined(_WIN32) -# On windows, the 'this' pointer is passed in ecx instead of on the stack - movl %ecx, %eax -#else # On entry: # + + # +-----------------------+ @@ -30,7 +26,6 @@ DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind13Registers_x866jumptoEv) # +-----------------------+ <-- SP # + + movl 4(%esp), %eax -#endif # set up eax and ret on new stack location movl 28(%eax), %edx # edx holds new stack pointer subl $8,%edx @@ -60,9 +55,9 @@ DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind13Registers_x866jumptoEv) #elif defined(__x86_64__) -DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind16Registers_x86_646jumptoEv) +DEFINE_LIBUNWIND_FUNCTION(__libunwind_Registers_x86_64_jumpto) # -# void 
libunwind::Registers_x86_64::jumpto() +# extern "C" void __libunwind_Registers_x86_64_jumpto(Registers_x86_64 *); # #if defined(_WIN64) # On entry, thread_state pointer is in rcx; move it into rdi @@ -560,13 +555,13 @@ Lnovec: #elif defined(__aarch64__) // -// void libunwind::Registers_arm64::jumpto() +// extern "C" void __libunwind_Registers_arm64_jumpto(Registers_arm64 *); // // On entry: // thread_state pointer is in x0 // .p2align 2 -DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind15Registers_arm646jumptoEv) +DEFINE_LIBUNWIND_FUNCTION(__libunwind_Registers_arm64_jumpto) // skip restore of x0,x1 for now ldp x2, x3, [x0, #0x010] ldp x4, x5, [x0, #0x020] diff --git a/libunwind/src/config.h b/libunwind/src/config.h index 842fd829af197..2014b8cb77abd 100644 --- a/libunwind/src/config.h +++ b/libunwind/src/config.h @@ -18,23 +18,13 @@ #include #include -// Define static_assert() unless already defined by compiler. -#ifndef __has_feature - #define __has_feature(__x) 0 -#endif -#if !(__has_feature(cxx_static_assert)) && !defined(static_assert) - #define static_assert(__b, __m) \ - extern int compile_time_assert_failed[ ( __b ) ? 1 : -1 ] \ - __attribute__( ( unused ) ); -#endif - // Platform specific configuration defines. 
#ifdef __APPLE__ #if defined(FOR_DYLD) - #define _LIBUNWIND_SUPPORT_COMPACT_UNWIND + #define _LIBUNWIND_SUPPORT_COMPACT_UNWIND 1 #else - #define _LIBUNWIND_SUPPORT_COMPACT_UNWIND - #define _LIBUNWIND_SUPPORT_DWARF_UNWIND 1 + #define _LIBUNWIND_SUPPORT_COMPACT_UNWIND 1 + #define _LIBUNWIND_SUPPORT_DWARF_UNWIND 1 #endif #elif defined(_WIN32) #ifdef __SEH__ diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp index 0bcc6c940bbab..a692dfe95d6d9 100644 --- a/lld/COFF/InputFiles.cpp +++ b/lld/COFF/InputFiles.cpp @@ -472,8 +472,23 @@ Symbol *ObjFile::createUndefined(COFFSymbolRef sym) { return symtab->addUndefined(name, this, sym.isWeakExternal()); } -void ObjFile::handleComdatSelection(COFFSymbolRef sym, COMDATType &selection, - bool &prevailing, DefinedRegular *leader) { +static const coff_aux_section_definition *findSectionDef(COFFObjectFile *obj, + int32_t section) { + uint32_t numSymbols = obj->getNumberOfSymbols(); + for (uint32_t i = 0; i < numSymbols; ++i) { + COFFSymbolRef sym = check(obj->getSymbol(i)); + if (sym.getSectionNumber() != section) + continue; + if (const coff_aux_section_definition *def = sym.getSectionDefinition()) + return def; + } + return nullptr; +} + +void ObjFile::handleComdatSelection( + COFFSymbolRef sym, COMDATType &selection, bool &prevailing, + DefinedRegular *leader, + const llvm::object::coff_aux_section_definition *def) { if (prevailing) return; // There's already an existing comdat for this symbol: `Leader`. 
@@ -540,8 +555,16 @@ void ObjFile::handleComdatSelection(COFFSymbolRef sym, COMDATType &selection, break; case IMAGE_COMDAT_SELECT_SAME_SIZE: - if (leaderChunk->getSize() != getSection(sym)->SizeOfRawData) - symtab->reportDuplicate(leader, this); + if (leaderChunk->getSize() != getSection(sym)->SizeOfRawData) { + if (!config->mingw) { + symtab->reportDuplicate(leader, this); + } else { + const coff_aux_section_definition *leaderDef = findSectionDef( + leaderChunk->file->getCOFFObj(), leaderChunk->getSectionNumber()); + if (!leaderDef || leaderDef->Length != def->Length) + symtab->reportDuplicate(leader, this); + } + } break; case IMAGE_COMDAT_SELECT_EXACT_MATCH: { @@ -657,7 +680,7 @@ Optional ObjFile::createDefined( COMDATType selection = (COMDATType)def->Selection; if (leader->isCOMDAT) - handleComdatSelection(sym, selection, prevailing, leader); + handleComdatSelection(sym, selection, prevailing, leader, def); if (prevailing) { SectionChunk *c = readSection(sectionNumber, def, getName()); diff --git a/lld/COFF/InputFiles.h b/lld/COFF/InputFiles.h index 1e0b97a82be29..0a5114b165f0c 100644 --- a/lld/COFF/InputFiles.h +++ b/lld/COFF/InputFiles.h @@ -255,9 +255,10 @@ class ObjFile : public InputFile { // match the existing symbol and its selection. If either old or new // symbol have selection IMAGE_COMDAT_SELECT_LARGEST, Sym might replace // the existing leader. In that case, Prevailing is set to true. 
- void handleComdatSelection(COFFSymbolRef sym, - llvm::COFF::COMDATType &selection, - bool &prevailing, DefinedRegular *leader); + void + handleComdatSelection(COFFSymbolRef sym, llvm::COFF::COMDATType &selection, + bool &prevailing, DefinedRegular *leader, + const llvm::object::coff_aux_section_definition *def); llvm::Optional createDefined(COFFSymbolRef sym, diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp index fbb9ac758a4f1..bfc8e9c1e53b1 100644 --- a/lld/ELF/InputFiles.cpp +++ b/lld/ELF/InputFiles.cpp @@ -780,20 +780,21 @@ static void updateSupportedARMFeatures(const ARMAttributeParser &attributes) { // of zero or more type-length-value fields. We want to find a field of a // certain type. It seems a bit too much to just store a 32-bit value, perhaps // the ABI is unnecessarily complicated. -template -static uint32_t readAndFeatures(ObjFile *obj, ArrayRef data) { +template static uint32_t readAndFeatures(const InputSection &sec) { using Elf_Nhdr = typename ELFT::Nhdr; using Elf_Note = typename ELFT::Note; uint32_t featuresSet = 0; + ArrayRef data = sec.data(); + auto reportFatal = [&](const uint8_t *place, const char *msg) { + fatal(toString(sec.file) + ":(" + sec.name + "+0x" + + Twine::utohexstr(place - sec.data().data()) + "): " + msg); + }; while (!data.empty()) { // Read one NOTE record. - if (data.size() < sizeof(Elf_Nhdr)) - fatal(toString(obj) + ": .note.gnu.property: section too short"); - auto *nhdr = reinterpret_cast(data.data()); - if (data.size() < nhdr->getSize()) - fatal(toString(obj) + ": .note.gnu.property: section too short"); + if (data.size() < sizeof(Elf_Nhdr) || data.size() < nhdr->getSize()) + reportFatal(data.data(), "data is too short"); Elf_Note note(*nhdr); if (nhdr->n_type != NT_GNU_PROPERTY_TYPE_0 || note.getName() != "GNU") { @@ -808,25 +809,26 @@ static uint32_t readAndFeatures(ObjFile *obj, ArrayRef data) { // Read a body of a NOTE record, which consists of type-length-value fields. 
ArrayRef desc = note.getDesc(); while (!desc.empty()) { + const uint8_t *place = desc.data(); if (desc.size() < 8) - fatal(toString(obj) + ": .note.gnu.property: section too short"); - - uint32_t type = read32le(desc.data()); - uint32_t size = read32le(desc.data() + 4); + reportFatal(place, "program property is too short"); + uint32_t type = read32(desc.data()); + uint32_t size = read32(desc.data() + 4); + desc = desc.slice(8); + if (desc.size() < size) + reportFatal(place, "program property is too short"); if (type == featureAndType) { // We found a FEATURE_1_AND field. There may be more than one of these // in a .note.gnu.property section, for a relocatable object we // accumulate the bits set. - featuresSet |= read32le(desc.data() + 8); + if (size < 4) + reportFatal(place, "FEATURE_1_AND entry is too short"); + featuresSet |= read32(desc.data()); } - // On 64-bit, a payload may be followed by a 4-byte padding to make its - // size a multiple of 8. - if (ELFT::Is64Bits) - size = alignTo(size, 8); - - desc = desc.slice(size + 8); // +8 for Type and Size + // Padding is present in the note descriptor, if necessary. + desc = desc.slice(alignTo<(ELFT::Is64Bits ? 8 : 4)>(size)); } // Go to next NOTE record to look for more FEATURE_1_AND descriptions. @@ -985,8 +987,7 @@ InputSectionBase *ObjFile::createInputSection(const Elf_Shdr &sec) { // .note.gnu.property containing a single AND'ed bitmap, we discard an input // file's .note.gnu.property section. 
if (name == ".note.gnu.property") { - ArrayRef contents = check(this->getObj().getSectionContents(&sec)); - this->andFeatures = readAndFeatures(this, contents); + this->andFeatures = readAndFeatures(InputSection(*this, sec, name)); return &InputSection::discarded; } diff --git a/lld/MachO/Arch/X86_64.cpp b/lld/MachO/Arch/X86_64.cpp index cdfb8b871803b..5d61b5b5cd2af 100644 --- a/lld/MachO/Arch/X86_64.cpp +++ b/lld/MachO/Arch/X86_64.cpp @@ -29,12 +29,12 @@ struct X86_64 : TargetInfo { const relocation_info &) const override; void relocateOne(uint8_t *loc, const Reloc &, uint64_t val) const override; - void writeStub(uint8_t *buf, const DylibSymbol &) const override; + void writeStub(uint8_t *buf, const macho::Symbol &) const override; void writeStubHelperHeader(uint8_t *buf) const override; void writeStubHelperEntry(uint8_t *buf, const DylibSymbol &, uint64_t entryAddr) const override; - void prepareSymbolRelocation(lld::macho::Symbol &, const InputSection *, + void prepareSymbolRelocation(lld::macho::Symbol *, const InputSection *, const Reloc &) override; uint64_t resolveSymbolVA(uint8_t *buf, const lld::macho::Symbol &, uint8_t type) const override; @@ -182,7 +182,7 @@ static constexpr uint8_t stub[] = { 0xff, 0x25, 0, 0, 0, 0, // jmpq *__la_symbol_ptr(%rip) }; -void X86_64::writeStub(uint8_t *buf, const DylibSymbol &sym) const { +void X86_64::writeStub(uint8_t *buf, const macho::Symbol &sym) const { memcpy(buf, stub, 2); // just copy the two nonzero bytes uint64_t stubAddr = in.stubs->addr + sym.stubsIndex * sizeof(stub); writeRipRelative(buf, stubAddr, sizeof(stub), @@ -217,55 +217,73 @@ void X86_64::writeStubHelperEntry(uint8_t *buf, const DylibSymbol &sym, in.stubHelper->addr); } -void X86_64::prepareSymbolRelocation(lld::macho::Symbol &sym, +void X86_64::prepareSymbolRelocation(lld::macho::Symbol *sym, const InputSection *isec, const Reloc &r) { switch (r.type) { - case X86_64_RELOC_GOT_LOAD: - // TODO: implement mov -> lea relaxation for non-dynamic symbols 
- case X86_64_RELOC_GOT: + case X86_64_RELOC_GOT_LOAD: { + if (needsBinding(sym)) + in.got->addEntry(sym); + + if (sym->isTlv()) + error("found GOT relocation referencing thread-local variable in " + + toString(isec)); + break; + } + case X86_64_RELOC_GOT: { in.got->addEntry(sym); - if (sym.isTlv()) + + if (sym->isTlv()) error("found GOT relocation referencing thread-local variable in " + toString(isec)); break; + } case X86_64_RELOC_BRANCH: { - // TODO: weak dysyms should go into the weak binding section instead - if (auto *dysym = dyn_cast(&sym)) - in.stubs->addEntry(*dysym); + if (auto *dysym = dyn_cast(sym)) { + if (in.stubs->addEntry(dysym)) { + if (sym->isWeakDef()) { + in.binding->addEntry(dysym, in.lazyPointers, + sym->stubsIndex * WordSize); + in.weakBinding->addEntry(sym, in.lazyPointers, + sym->stubsIndex * WordSize); + } else { + in.lazyBinding->addEntry(dysym); + } + } + } else if (auto *defined = dyn_cast(sym)) { + if (defined->isWeakDef() && defined->isExternal()) + if (in.stubs->addEntry(sym)) + in.weakBinding->addEntry(sym, in.lazyPointers, + sym->stubsIndex * WordSize); + } break; } case X86_64_RELOC_UNSIGNED: { - if (auto *dysym = dyn_cast(&sym)) { + if (auto *dysym = dyn_cast(sym)) { if (r.length != 3) { error("X86_64_RELOC_UNSIGNED referencing the dynamic symbol " + dysym->getName() + " must have r_length = 3"); return; } - in.binding->addEntry(dysym, isec, r.offset, r.addend); } + addNonLazyBindingEntries(sym, isec, r.offset, r.addend); break; } case X86_64_RELOC_SIGNED: case X86_64_RELOC_SIGNED_1: case X86_64_RELOC_SIGNED_2: case X86_64_RELOC_SIGNED_4: + // TODO: warn if they refer to a weak global break; - case X86_64_RELOC_TLV: - if (isa(&sym)) { + case X86_64_RELOC_TLV: { + if (needsBinding(sym)) in.tlvPointers->addEntry(sym); - } else { - assert(isa(&sym)); - // TLV relocations on x86_64 are always used with a movq opcode, which - // can be converted to leaq opcodes if they reference a defined symbol. 
- // (This is in contrast to GOT relocations, which can be used with - // non-movq opcodes.) As such, there is no need to add an entry to - // tlvPointers here. - } - if (!sym.isTlv()) + + if (!sym->isTlv()) error( "found X86_64_RELOC_TLV referencing a non-thread-local variable in " + toString(isec)); break; + } case X86_64_RELOC_SUBTRACTOR: fatal("TODO: handle relocation type " + std::to_string(r.type)); break; @@ -277,13 +295,22 @@ void X86_64::prepareSymbolRelocation(lld::macho::Symbol &sym, uint64_t X86_64::resolveSymbolVA(uint8_t *buf, const lld::macho::Symbol &sym, uint8_t type) const { switch (type) { - case X86_64_RELOC_GOT_LOAD: + case X86_64_RELOC_GOT_LOAD: { + if (!sym.isInGot()) { + if (buf[-2] != 0x8b) + error("X86_64_RELOC_GOT_LOAD must be used with movq instructions"); + buf[-2] = 0x8d; + return sym.getVA(); + } + LLVM_FALLTHROUGH; + } case X86_64_RELOC_GOT: return in.got->addr + sym.gotIndex * WordSize; - case X86_64_RELOC_BRANCH: - if (auto *dysym = dyn_cast(&sym)) - return in.stubs->addr + dysym->stubsIndex * sizeof(stub); + case X86_64_RELOC_BRANCH: { + if (sym.isInStubs()) + return in.stubs->addr + sym.stubsIndex * sizeof(stub); return sym.getVA(); + } case X86_64_RELOC_UNSIGNED: case X86_64_RELOC_SIGNED: case X86_64_RELOC_SIGNED_1: @@ -291,7 +318,7 @@ uint64_t X86_64::resolveSymbolVA(uint8_t *buf, const lld::macho::Symbol &sym, case X86_64_RELOC_SIGNED_4: return sym.getVA(); case X86_64_RELOC_TLV: { - if (isa(&sym)) + if (sym.isInGot()) return in.tlvPointers->addr + sym.gotIndex * WordSize; // Convert the movq to a leaq. 
diff --git a/lld/MachO/CMakeLists.txt b/lld/MachO/CMakeLists.txt index 6fe356f515894..716449c8574a8 100644 --- a/lld/MachO/CMakeLists.txt +++ b/lld/MachO/CMakeLists.txt @@ -5,10 +5,12 @@ add_public_tablegen_target(MachOOptionsTableGen) add_lld_library(lldMachO2 Arch/X86_64.cpp Driver.cpp + DriverUtils.cpp ExportTrie.cpp InputFiles.cpp InputSection.cpp MergedOutputSection.cpp + ObjC.cpp OutputSection.cpp OutputSegment.cpp SymbolTable.cpp diff --git a/lld/MachO/Config.h b/lld/MachO/Config.h index 362069ea80408..5fbb477c19939 100644 --- a/lld/MachO/Config.h +++ b/lld/MachO/Config.h @@ -33,12 +33,15 @@ struct PlatformInfo { struct Configuration { Symbol *entry; bool hasReexports = false; + bool allLoad = false; + bool forceLoadObjC = false; uint32_t headerPad; llvm::StringRef installName; llvm::StringRef outputFile; llvm::MachO::Architecture arch; PlatformInfo platform; llvm::MachO::HeaderFileType outputType; + std::vector systemLibraryRoots; std::vector librarySearchPaths; std::vector frameworkSearchPaths; std::vector runtimePaths; diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp index 2ffedaaf42c44..cda82ecd56302 100644 --- a/lld/MachO/Driver.cpp +++ b/lld/MachO/Driver.cpp @@ -8,7 +8,9 @@ #include "Driver.h" #include "Config.h" +#include "DriverUtils.h" #include "InputFiles.h" +#include "ObjC.h" #include "OutputSection.h" #include "OutputSegment.h" #include "SymbolTable.h" @@ -129,7 +131,7 @@ static Optional findFramework(StringRef name) { // Suffix lookup failed, fall through to the no-suffix case. 
} - if (Optional path = findWithExtension(symlink, {".tbd", ""})) + if (Optional path = resolveDylibPath(symlink)) return path; } return {}; @@ -148,7 +150,7 @@ static TargetInfo *createTargetInfo(opt::InputArgList &args) { } } -static bool isDirectory(StringRef option, StringRef path) { +static bool warnIfNotDirectory(StringRef option, StringRef path) { if (!fs::exists(path)) { warn("directory not found for option -" + option + path); return false; @@ -163,21 +165,23 @@ static void getSearchPaths(std::vector &paths, unsigned optionCode, opt::InputArgList &args, const std::vector &roots, const SmallVector &systemPaths) { - StringRef optionLetter{(optionCode == OPT_F ? "F" : "L")}; - for (auto const &path : args::getStrings(args, optionCode)) { + StringRef optionLetter{optionCode == OPT_F ? "F" : "L"}; + for (StringRef path : args::getStrings(args, optionCode)) { // NOTE: only absolute paths are re-rooted to syslibroot(s) - if (llvm::sys::path::is_absolute(path, llvm::sys::path::Style::posix)) { + bool found = false; + if (path::is_absolute(path, path::Style::posix)) { for (StringRef root : roots) { SmallString<261> buffer(root); - llvm::sys::path::append(buffer, path); + path::append(buffer, path); // Do not warn about paths that are computed via the syslib roots - if (llvm::sys::fs::is_directory(buffer)) + if (fs::is_directory(buffer)) { paths.push_back(saver.save(buffer.str())); + found = true; + } } - } else { - if (isDirectory(optionLetter, path)) - paths.push_back(path); } + if (!found && warnIfNotDirectory(optionLetter, path)) + paths.push_back(path); } // `-Z` suppresses the standard "system" search paths. 
@@ -187,8 +191,8 @@ static void getSearchPaths(std::vector &paths, unsigned optionCode, for (auto const &path : systemPaths) { for (auto root : roots) { SmallString<261> buffer(root); - llvm::sys::path::append(buffer, path); - if (isDirectory(optionLetter, buffer)) + path::append(buffer, path); + if (warnIfNotDirectory(optionLetter, buffer)) paths.push_back(saver.save(buffer.str())); } } @@ -207,6 +211,29 @@ static void getFrameworkSearchPaths(opt::InputArgList &args, {"/Library/Frameworks", "/System/Library/Frameworks"}); } +// Returns slices of MB by parsing MB as an archive file. +// Each slice consists of a member file in the archive. +static std::vector getArchiveMembers(MemoryBufferRef mb) { + std::unique_ptr file = + CHECK(Archive::create(mb), + mb.getBufferIdentifier() + ": failed to parse archive"); + + std::vector v; + Error err = Error::success(); + for (const Archive::Child &c : file->children(err)) { + MemoryBufferRef mbref = + CHECK(c.getMemoryBufferRef(), + mb.getBufferIdentifier() + + ": could not get the buffer for a child of the archive"); + v.push_back(mbref); + } + if (err) + fatal(mb.getBufferIdentifier() + + ": Archive::children failed: " + toString(std::move(err))); + + return v; +} + static void addFile(StringRef path) { Optional buffer = readFile(path); if (!buffer) @@ -221,6 +248,25 @@ static void addFile(StringRef path) { if (!file->isEmpty() && !file->hasSymbolTable()) error(path + ": archive has no index; run ranlib to add one"); + if (config->allLoad) { + if (Optional buffer = readFile(path)) + for (MemoryBufferRef member : getArchiveMembers(*buffer)) + inputFiles.push_back(make(member)); + } else if (config->forceLoadObjC) { + for (const object::Archive::Symbol &sym : file->symbols()) + if (sym.getName().startswith(objc::klass)) + symtab->addUndefined(sym.getName()); + + // TODO: no need to look for ObjC sections for a given archive member if + // we already found that it contains an ObjC symbol. 
We should also + // consider creating a LazyObjFile class in order to avoid double-loading + // these files here and below (as part of the ArchiveFile). + if (Optional buffer = readFile(path)) + for (MemoryBufferRef member : getArchiveMembers(*buffer)) + if (hasObjCSection(member)) + inputFiles.push_back(make(member)); + } + inputFiles.push_back(make(std::move(file))); break; } @@ -231,13 +277,10 @@ static void addFile(StringRef path) { inputFiles.push_back(make(mbref)); break; case file_magic::tapi_file: { - Expected> result = TextAPIReader::get(mbref); - if (!result) { - error("could not load TAPI file at " + mbref.getBufferIdentifier() + - ": " + toString(result.takeError())); + Optional dylibFile = makeDylibFromTAPI(mbref); + if (!dylibFile) return; - } - inputFiles.push_back(make(**result)); + inputFiles.push_back(*dylibFile); break; } default: @@ -254,29 +297,6 @@ static void addFileList(StringRef path) { addFile(path); } -// Returns slices of MB by parsing MB as an archive file. -// Each slice consists of a member file in the archive. -static std::vector getArchiveMembers(MemoryBufferRef mb) { - std::unique_ptr file = - CHECK(Archive::create(mb), - mb.getBufferIdentifier() + ": failed to parse archive"); - - std::vector v; - Error err = Error::success(); - for (const Archive::Child &c : file->children(err)) { - MemoryBufferRef mbref = - CHECK(c.getMemoryBufferRef(), - mb.getBufferIdentifier() + - ": could not get the buffer for a child of the archive"); - v.push_back(mbref); - } - if (err) - fatal(mb.getBufferIdentifier() + - ": Archive::children failed: " + toString(std::move(err))); - - return v; -} - static void forceLoadArchive(StringRef path) { if (Optional buffer = readFile(path)) for (MemoryBufferRef member : getArchiveMembers(*buffer)) @@ -377,12 +397,15 @@ static void parseOrderFile(StringRef path) { } // We expect sub-library names of the form "libfoo", which will match a dylib -// with a path of .*/libfoo.dylib. 
+// with a path of .*/libfoo.{dylib, tbd}. +// XXX ld64 seems to ignore the extension entirely when matching sub-libraries; +// I'm not sure what the use case for that is. static bool markSubLibrary(StringRef searchName) { for (InputFile *file : inputFiles) { if (auto *dylibFile = dyn_cast(file)) { StringRef filename = path::filename(dylibFile->getName()); - if (filename.consume_front(searchName) && filename == ".dylib") { + if (filename.consume_front(searchName) && + (filename == ".dylib" || filename == ".tbd")) { dylibFile->reexport = true; return true; } @@ -500,8 +523,9 @@ bool macho::link(llvm::ArrayRef argsArr, bool canExitEarly, config->headerPad = args::getHex(args, OPT_headerpad, /*Default=*/32); config->outputType = args.hasArg(OPT_dylib) ? MH_DYLIB : MH_EXECUTE; config->runtimePaths = args::getStrings(args, OPT_rpath); + config->allLoad = args.hasArg(OPT_all_load); - std::vector roots; + std::vector &roots = config->systemLibraryRoots; for (const Arg *arg : args.filtered(OPT_syslibroot)) roots.push_back(arg->getValue()); // NOTE: the final `-syslibroot` being `/` will ignore all roots @@ -514,6 +538,7 @@ bool macho::link(llvm::ArrayRef argsArr, bool canExitEarly, getLibrarySearchPaths(args, roots, config->librarySearchPaths); getFrameworkSearchPaths(args, roots, config->frameworkSearchPaths); + config->forceLoadObjC = args.hasArg(OPT_ObjC); if (args.hasArg(OPT_v)) { message(getLLDVersion()); @@ -563,11 +588,13 @@ bool macho::link(llvm::ArrayRef argsArr, bool canExitEarly, case OPT_platform_version: handlePlatformVersion(arg); break; + case OPT_all_load: case OPT_o: case OPT_dylib: case OPT_e: case OPT_F: case OPT_L: + case OPT_ObjC: case OPT_headerpad: case OPT_install_name: case OPT_rpath: diff --git a/lld/MachO/DriverUtils.cpp b/lld/MachO/DriverUtils.cpp new file mode 100644 index 0000000000000..fa0b62e11c493 --- /dev/null +++ b/lld/MachO/DriverUtils.cpp @@ -0,0 +1,46 @@ +//===- DriverUtils.cpp ----------------------------------------------------===// 
+// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "DriverUtils.h" +#include "InputFiles.h" + +#include "lld/Common/ErrorHandler.h" +#include "lld/Common/Memory.h" +#include "llvm/Support/Path.h" +#include "llvm/TextAPI/MachO/TextAPIReader.h" + +using namespace llvm; +using namespace llvm::MachO; +using namespace llvm::sys; +using namespace lld; +using namespace lld::macho; + +Optional macho::resolveDylibPath(StringRef path) { + // TODO: if a tbd and dylib are both present, we should check to make sure + // they are consistent. + if (fs::exists(path)) + return std::string(path); + + SmallString<261> location = path; + path::replace_extension(location, ".tbd"); + if (fs::exists(location)) + return std::string(location); + + return {}; +} + +Optional macho::makeDylibFromTAPI(MemoryBufferRef mbref, + DylibFile *umbrella) { + Expected> result = TextAPIReader::get(mbref); + if (!result) { + error("could not load TAPI file at " + mbref.getBufferIdentifier() + ": " + + toString(result.takeError())); + return {}; + } + return make(**result, umbrella); +} diff --git a/lld/MachO/DriverUtils.h b/lld/MachO/DriverUtils.h new file mode 100644 index 0000000000000..d3d3670ab2464 --- /dev/null +++ b/lld/MachO/DriverUtils.h @@ -0,0 +1,31 @@ +//===- DriverUtils.h --------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLD_MACHO_DRIVER_UTILS_H +#define LLD_MACHO_DRIVER_UTILS_H + +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/MemoryBuffer.h" + +namespace lld { +namespace macho { + +class DylibFile; + +// Check for both libfoo.dylib and libfoo.tbd (in that order). +llvm::Optional resolveDylibPath(llvm::StringRef path); + +llvm::Optional makeDylibFromTAPI(llvm::MemoryBufferRef mbref, + DylibFile *umbrella = nullptr); + +} // namespace macho +} // namespace lld + +#endif diff --git a/lld/MachO/InputFiles.cpp b/lld/MachO/InputFiles.cpp index 5e0c9e0679daa..d19654306be51 100644 --- a/lld/MachO/InputFiles.cpp +++ b/lld/MachO/InputFiles.cpp @@ -43,16 +43,20 @@ #include "InputFiles.h" #include "Config.h" +#include "DriverUtils.h" #include "ExportTrie.h" #include "InputSection.h" #include "MachOStructs.h" +#include "ObjC.h" #include "OutputSection.h" +#include "OutputSegment.h" #include "SymbolTable.h" #include "Symbols.h" #include "Target.h" #include "lld/Common/ErrorHandler.h" #include "lld/Common/Memory.h" +#include "llvm/ADT/iterator.h" #include "llvm/BinaryFormat/MachO.h" #include "llvm/Support/Endian.h" #include "llvm/Support/MemoryBuffer.h" @@ -114,7 +118,7 @@ Optional macho::readFile(StringRef path) { return None; } -static const load_command *findCommand(const mach_header_64 *hdr, +const load_command *macho::findCommand(const mach_header_64 *hdr, uint32_t type) { const uint8_t *p = reinterpret_cast(hdr) + sizeof(mach_header_64); @@ -135,8 +139,10 @@ void InputFile::parseSections(ArrayRef sections) { for (const section_64 &sec : sections) { InputSection *isec = make(); isec->file = this; - isec->name = StringRef(sec.sectname, strnlen(sec.sectname, 16)); - isec->segname = StringRef(sec.segname, strnlen(sec.segname, 16)); + isec->name = + 
StringRef(sec.sectname, strnlen(sec.sectname, sizeof(sec.sectname))); + isec->segname = + StringRef(sec.segname, strnlen(sec.segname, sizeof(sec.segname))); isec->data = {isZeroFill(sec.flags) ? nullptr : buf + sec.offset, static_cast(sec.size)}; if (sec.align >= 32) @@ -230,7 +236,8 @@ void InputFile::parseSymbols(ArrayRef nList, // Global defined symbol return symtab->addDefined(name, isec, value, sym.n_desc & N_WEAK_DEF); // Local defined symbol - return make(name, isec, value, sym.n_desc & N_WEAK_DEF); + return make(name, isec, value, sym.n_desc & N_WEAK_DEF, + /*isExternal=*/false); }; for (size_t i = 0, n = nList.size(); i < n; ++i) { @@ -340,6 +347,60 @@ ObjFile::ObjFile(MemoryBufferRef mb) : InputFile(ObjKind, mb) { parseRelocations(sectionHeaders[i], subsections[i]); } +// The path can point to either a dylib or a .tbd file. +static Optional loadDylib(StringRef path, DylibFile *umbrella) { + Optional mbref = readFile(path); + if (!mbref) { + error("could not read dylib file at " + path); + return {}; + } + + file_magic magic = identify_magic(mbref->getBuffer()); + if (magic == file_magic::tapi_file) + return makeDylibFromTAPI(*mbref, umbrella); + assert(magic == file_magic::macho_dynamically_linked_shared_lib); + return make(*mbref, umbrella); +} + +// TBD files are parsed into a series of TAPI documents (InterfaceFiles), with +// the first document storing child pointers to the rest of them. When we are +// processing a given TBD file, we store that top-level document here. When +// processing re-exports, we search its children for potentially matching +// documents in the same TBD file. Note that the children themselves don't +// point to further documents, i.e. this is a two-level tree. +// +// ld64 allows a TAPI re-export to reference documents nested within other TBD +// files, but that seems like a strange design, so this is an intentional +// deviation. 
+const InterfaceFile *currentTopLevelTapi = nullptr; + +// Re-exports can either refer to on-disk files, or to documents within .tbd +// files. +static Optional loadReexport(StringRef path, DylibFile *umbrella) { + if (path::is_absolute(path, path::Style::posix)) + for (StringRef root : config->systemLibraryRoots) + if (Optional dylibPath = + resolveDylibPath((root + path).str())) + return loadDylib(*dylibPath, umbrella); + + // TODO: Expand @loader_path, @executable_path etc + + if (currentTopLevelTapi != nullptr) { + for (InterfaceFile &child : + make_pointee_range(currentTopLevelTapi->documents())) { + if (path == child.getInstallName()) + return make(child, umbrella); + assert(child.documents().empty()); + } + } + + if (Optional dylibPath = resolveDylibPath(path)) + return loadDylib(*dylibPath, umbrella); + + error("unable to locate re-export with install name " + path); + return {}; +} + DylibFile::DylibFile(MemoryBufferRef mb, DylibFile *umbrella) : InputFile(DylibKind, mb) { if (umbrella == nullptr) @@ -358,6 +419,9 @@ DylibFile::DylibFile(MemoryBufferRef mb, DylibFile *umbrella) } // Initialize symbols. + // TODO: if a re-exported dylib is public (lives in /usr/lib or + // /System/Library/Frameworks), we should bind to its symbols directly + // instead of the re-exporting umbrella library. 
if (const load_command *cmd = findCommand(hdr, LC_DYLD_INFO_ONLY)) { auto *c = reinterpret_cast(cmd); parseTrie(buf + c->export_off, c->export_size, @@ -386,18 +450,13 @@ DylibFile::DylibFile(MemoryBufferRef mb, DylibFile *umbrella) auto *c = reinterpret_cast(cmd); StringRef reexportPath = reinterpret_cast(c) + read32le(&c->dylib.name); - // TODO: Expand @loader_path, @executable_path etc in reexportPath - Optional buffer = readFile(reexportPath); - if (!buffer) { - error("unable to read re-exported dylib at " + reexportPath); - return; - } - reexported.push_back(make(*buffer, umbrella)); + if (Optional reexport = loadReexport(reexportPath, umbrella)) + reexported.push_back(*reexport); } } DylibFile::DylibFile(const InterfaceFile &interface, DylibFile *umbrella) - : InputFile(DylibKind, MemoryBufferRef()) { + : InputFile(DylibKind, interface) { if (umbrella == nullptr) umbrella = this; @@ -420,22 +479,31 @@ DylibFile::DylibFile(const InterfaceFile &interface, DylibFile *umbrella) case SymbolKind::ObjectiveCClass: // XXX ld64 only creates these symbols when -ObjC is passed in. We may // want to emulate that. - addSymbol("_OBJC_CLASS_$_" + symbol->getName()); - addSymbol("_OBJC_METACLASS_$_" + symbol->getName()); + addSymbol(objc::klass + symbol->getName()); + addSymbol(objc::metaclass + symbol->getName()); break; case SymbolKind::ObjectiveCClassEHType: - addSymbol("_OBJC_EHTYPE_$_" + symbol->getName()); + addSymbol(objc::ehtype + symbol->getName()); break; case SymbolKind::ObjectiveCInstanceVariable: - addSymbol("_OBJC_IVAR_$_" + symbol->getName()); + addSymbol(objc::ivar + symbol->getName()); break; } } - // TODO(compnerd) properly represent the hierarchy of the documents as it is - // in theory possible to have re-exported dylibs from re-exported dylibs which - // should be parent'ed to the child. 
- for (const std::shared_ptr &intf : interface.documents()) - reexported.push_back(make(*intf, umbrella)); + + bool isTopLevelTapi = false; + if (currentTopLevelTapi == nullptr) { + currentTopLevelTapi = &interface; + isTopLevelTapi = true; + } + + for (InterfaceFileRef intfRef : interface.reexportedLibraries()) + if (Optional reexport = + loadReexport(intfRef.getInstallName(), umbrella)) + reexported.push_back(*reexport); + + if (isTopLevelTapi) + currentTopLevelTapi = nullptr; } ArchiveFile::ArchiveFile(std::unique_ptr &&f) diff --git a/lld/MachO/InputFiles.h b/lld/MachO/InputFiles.h index 3a7795254f9a0..194de0e1a4e9b 100644 --- a/lld/MachO/InputFiles.h +++ b/lld/MachO/InputFiles.h @@ -12,6 +12,7 @@ #include "MachOStructs.h" #include "lld/Common/LLVM.h" +#include "lld/Common/Memory.h" #include "llvm/ADT/DenseSet.h" #include "llvm/BinaryFormat/MachO.h" #include "llvm/Object/Archive.h" @@ -45,7 +46,7 @@ class InputFile { virtual ~InputFile() = default; Kind kind() const { return fileKind; } - StringRef getName() const { return mb.getBufferIdentifier(); } + StringRef getName() const { return name; } MemoryBufferRef mb; std::vector symbols; @@ -53,7 +54,11 @@ class InputFile { std::vector subsections; protected: - InputFile(Kind kind, MemoryBufferRef mb) : mb(mb), fileKind(kind) {} + InputFile(Kind kind, MemoryBufferRef mb) + : mb(mb), fileKind(kind), name(mb.getBufferIdentifier()) {} + + InputFile(Kind kind, const llvm::MachO::InterfaceFile &interface) + : fileKind(kind), name(saver.save(interface.getPath())) {} void parseSections(ArrayRef); @@ -64,6 +69,7 @@ class InputFile { private: const Kind fileKind; + const StringRef name; }; // .o file @@ -84,9 +90,6 @@ class OpaqueFile : public InputFile { // .dylib file class DylibFile : public InputFile { public: - explicit DylibFile(const llvm::MachO::InterfaceFile &interface, - DylibFile *umbrella = nullptr); - // Mach-O dylibs can re-export other dylibs as sub-libraries, meaning that the // symbols in those 
sub-libraries will be available under the umbrella // library's namespace. Those sub-libraries can also have their own @@ -96,6 +99,9 @@ class DylibFile : public InputFile { // (through an -lfoo flag), then `umbrella` should be a nullptr. explicit DylibFile(MemoryBufferRef mb, DylibFile *umbrella = nullptr); + explicit DylibFile(const llvm::MachO::InterfaceFile &interface, + DylibFile *umbrella = nullptr); + static bool classof(const InputFile *f) { return f->kind() == DylibKind; } StringRef dylibName; @@ -122,6 +128,9 @@ extern std::vector inputFiles; llvm::Optional readFile(StringRef path); +const llvm::MachO::load_command * +findCommand(const llvm::MachO::mach_header_64 *, uint32_t type); + } // namespace macho std::string toString(const macho::InputFile *file); diff --git a/lld/MachO/ObjC.cpp b/lld/MachO/ObjC.cpp new file mode 100644 index 0000000000000..21691ef5255b3 --- /dev/null +++ b/lld/MachO/ObjC.cpp @@ -0,0 +1,36 @@ +//===- ObjC.cpp -----------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ObjC.h" +#include "InputFiles.h" +#include "OutputSegment.h" + +#include "llvm/BinaryFormat/MachO.h" + +using namespace llvm; +using namespace llvm::MachO; +using namespace lld; + +bool macho::hasObjCSection(MemoryBufferRef mb) { + auto *hdr = reinterpret_cast(mb.getBufferStart()); + if (const load_command *cmd = findCommand(hdr, LC_SEGMENT_64)) { + auto *c = reinterpret_cast(cmd); + auto sectionHeaders = ArrayRef{ + reinterpret_cast(c + 1), c->nsects}; + for (const section_64 &sec : sectionHeaders) { + StringRef sectname(sec.sectname, + strnlen(sec.sectname, sizeof(sec.sectname))); + StringRef segname(sec.segname, strnlen(sec.segname, sizeof(sec.segname))); + if ((segname == segment_names::data && sectname == "__objc_catlist") || + (segname == segment_names::text && sectname == "__swift")) { + return true; + } + } + } + return false; +} diff --git a/lld/MachO/ObjC.h b/lld/MachO/ObjC.h new file mode 100644 index 0000000000000..8db459ad8e2ba --- /dev/null +++ b/lld/MachO/ObjC.h @@ -0,0 +1,31 @@ +//===- ObjC.h ---------------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLD_MACHO_OBJC_H +#define LLD_MACHO_OBJC_H + +#include "llvm/Support/MemoryBuffer.h" + +namespace lld { +namespace macho { + +namespace objc { + +constexpr const char klass[] = "_OBJC_CLASS_$_"; +constexpr const char metaclass[] = "_OBJC_METACLASS_$_"; +constexpr const char ehtype[] = "_OBJC_EHTYPE_$_"; +constexpr const char ivar[] = "_OBJC_IVAR_$_"; + +} // namespace objc + +bool hasObjCSection(llvm::MemoryBufferRef); + +} // namespace macho +} // namespace lld + +#endif diff --git a/lld/MachO/SymbolTable.cpp b/lld/MachO/SymbolTable.cpp index bd10a2c56989f..cfb35718e7ec2 100644 --- a/lld/MachO/SymbolTable.cpp +++ b/lld/MachO/SymbolTable.cpp @@ -40,6 +40,7 @@ Symbol *SymbolTable::addDefined(StringRef name, InputSection *isec, uint32_t value, bool isWeakDef) { Symbol *s; bool wasInserted; + bool overridesWeakDef = false; std::tie(s, wasInserted) = insert(name); if (!wasInserted) { @@ -48,12 +49,16 @@ Symbol *SymbolTable::addDefined(StringRef name, InputSection *isec, return s; if (!defined->isWeakDef()) error("duplicate symbol: " + name); + } else if (auto *dysym = dyn_cast(s)) { + overridesWeakDef = !isWeakDef && dysym->isWeakDef(); } // Defined symbols take priority over other types of symbols, so in case // of a name conflict, we fall through to the replaceSymbol() call below. 
} - replaceSymbol(s, name, isec, value, isWeakDef); + Defined *defined = replaceSymbol(s, name, isec, value, isWeakDef, + /*isExternal=*/true); + defined->overridesWeakDef = overridesWeakDef; return s; } @@ -75,6 +80,11 @@ Symbol *SymbolTable::addDylib(StringRef name, DylibFile *file, bool isWeakDef, bool wasInserted; std::tie(s, wasInserted) = insert(name); + if (!wasInserted && isWeakDef) + if (auto *defined = dyn_cast(s)) + if (!defined->isWeakDef()) + defined->overridesWeakDef = true; + if (wasInserted || isa(s) || (isa(s) && !isWeakDef && s->isWeakDef())) replaceSymbol(s, file, name, isWeakDef, isTlv); diff --git a/lld/MachO/Symbols.h b/lld/MachO/Symbols.h index 4ccb87e18335d..33ba00860ef33 100644 --- a/lld/MachO/Symbols.h +++ b/lld/MachO/Symbols.h @@ -57,11 +57,19 @@ class Symbol { virtual bool isTlv() const { llvm_unreachable("cannot be TLV"); } + // Whether this symbol is in the GOT or TLVPointer sections. + bool isInGot() const { return gotIndex != UINT32_MAX; } + + // Whether this symbol is in the StubsSection. + bool isInStubs() const { return stubsIndex != UINT32_MAX; } + // The index of this symbol in the GOT or the TLVPointer section, depending // on whether it is a thread-local. A given symbol cannot be referenced by // both these sections at once. 
uint32_t gotIndex = UINT32_MAX; + uint32_t stubsIndex = UINT32_MAX; + protected: Symbol(Kind k, StringRefZ name) : symbolKind(k), name(name) {} @@ -71,14 +79,17 @@ class Symbol { class Defined : public Symbol { public: - Defined(StringRefZ name, InputSection *isec, uint32_t value, bool isWeakDef) + Defined(StringRefZ name, InputSection *isec, uint32_t value, bool isWeakDef, + bool isExternal) : Symbol(DefinedKind, name), isec(isec), value(value), - weakDef(isWeakDef) {} + overridesWeakDef(false), weakDef(isWeakDef), external(isExternal) {} bool isWeakDef() const override { return weakDef; } bool isTlv() const override { return isThreadLocalVariables(isec->flags); } + bool isExternal() const { return external; } + static bool classof(const Symbol *s) { return s->kind() == DefinedKind; } uint64_t getVA() const override { return isec->getVA() + value; } @@ -90,8 +101,11 @@ class Defined : public Symbol { InputSection *isec; uint32_t value; + bool overridesWeakDef : 1; + private: - const bool weakDef; + const bool weakDef : 1; + const bool external : 1; }; class Undefined : public Symbol { @@ -107,13 +121,13 @@ class DylibSymbol : public Symbol { : Symbol(DylibKind, name), file(file), weakDef(isWeakDef), tlv(isTlv) {} bool isWeakDef() const override { return weakDef; } - bool isTlv() const override { return tlv; } + bool hasStubsHelper() const { return stubsHelperIndex != UINT32_MAX; } static bool classof(const Symbol *s) { return s->kind() == DylibKind; } DylibFile *file; - uint32_t stubsIndex = UINT32_MAX; + uint32_t stubsHelperIndex = UINT32_MAX; uint32_t lazyBindOffset = UINT32_MAX; private: @@ -157,9 +171,13 @@ class DSOHandle : public Symbol { uint64_t getFileOffset() const override; + bool isWeakDef() const override { return false; } + + bool isTlv() const override { return false; } + static constexpr StringRef name = "___dso_handle"; - static bool classof(const Symbol *s) { return s->kind() == DefinedKind; } + static bool classof(const Symbol *s) { return 
s->kind() == DSOHandleKind; } }; union SymbolUnion { @@ -170,14 +188,14 @@ union SymbolUnion { }; template -void replaceSymbol(Symbol *s, ArgT &&... arg) { +T *replaceSymbol(Symbol *s, ArgT &&... arg) { static_assert(sizeof(T) <= sizeof(SymbolUnion), "SymbolUnion too small"); static_assert(alignof(T) <= alignof(SymbolUnion), "SymbolUnion not aligned enough"); assert(static_cast(static_cast(nullptr)) == nullptr && "Not a Symbol"); - new (s) T(std::forward(arg)...); + return new (s) T(std::forward(arg)...); } } // namespace macho diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp index 9876b9ddad379..66dcbdecea291 100644 --- a/lld/MachO/SyntheticSections.cpp +++ b/lld/MachO/SyntheticSections.cpp @@ -63,6 +63,12 @@ void MachHeaderSection::writeTo(uint8_t *buf) const { if (config->outputType == MachO::MH_DYLIB && !config->hasReexports) hdr->flags |= MachO::MH_NO_REEXPORTED_DYLIBS; + if (in.exports->hasWeakSymbol || in.weakBinding->hasNonWeakDefinition()) + hdr->flags |= MachO::MH_WEAK_DEFINES; + + if (in.exports->hasWeakSymbol || in.weakBinding->hasEntry()) + hdr->flags |= MachO::MH_BINDS_TO_WEAK; + for (OutputSegment *seg : outputSegments) { for (OutputSection *osec : seg->getSections()) { if (isThreadLocalVariables(osec->flags)) { @@ -89,10 +95,12 @@ NonLazyPointerSectionBase::NonLazyPointerSectionBase(const char *segname, flags = MachO::S_NON_LAZY_SYMBOL_POINTERS; } -void NonLazyPointerSectionBase::addEntry(Symbol &sym) { - if (entries.insert(&sym)) { - assert(sym.gotIndex == UINT32_MAX); - sym.gotIndex = entries.size() - 1; +void NonLazyPointerSectionBase::addEntry(Symbol *sym) { + if (entries.insert(sym)) { + assert(!sym->isInGot()); + sym->gotIndex = entries.size() - 1; + + addNonLazyBindingEntries(sym, this, sym->gotIndex * WordSize); } } @@ -105,11 +113,6 @@ void NonLazyPointerSectionBase::writeTo(uint8_t *buf) const { BindingSection::BindingSection() : LinkEditSection(segment_names::linkEdit, section_names::binding) {} -bool 
BindingSection::isNeeded() const { - return bindings.size() != 0 || in.got->isNeeded() || - in.tlvPointers->isNeeded(); -} - namespace { struct Binding { OutputSegment *segment = nullptr; @@ -119,13 +122,13 @@ struct Binding { }; } // namespace -// Encode a sequence of opcodes that tell dyld to write the address of dysym + +// Encode a sequence of opcodes that tell dyld to write the address of symbol + // addend at osec->addr + outSecOff. // // The bind opcode "interpreter" remembers the values of each binding field, so // we only need to encode the differences between bindings. Hence the use of // lastBinding. -static void encodeBinding(const DylibSymbol &dysym, const OutputSection *osec, +static void encodeBinding(const Symbol *sym, const OutputSection *osec, uint64_t outSecOff, int64_t addend, Binding &lastBinding, raw_svector_ostream &os) { using namespace llvm::MachO; @@ -143,17 +146,6 @@ static void encodeBinding(const DylibSymbol &dysym, const OutputSection *osec, lastBinding.offset = offset; } - if (lastBinding.ordinal != dysym.file->ordinal) { - if (dysym.file->ordinal <= BIND_IMMEDIATE_MASK) { - os << static_cast(BIND_OPCODE_SET_DYLIB_ORDINAL_IMM | - dysym.file->ordinal); - } else { - os << static_cast(MachO::BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB); - encodeULEB128(dysym.file->ordinal, os); - } - lastBinding.ordinal = dysym.file->ordinal; - } - if (lastBinding.addend != addend) { os << static_cast(BIND_OPCODE_SET_ADDEND_SLEB); encodeSLEB128(addend, os); @@ -161,27 +153,41 @@ static void encodeBinding(const DylibSymbol &dysym, const OutputSection *osec, } os << static_cast(BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM) - << dysym.getName() << '\0' + << sym->getName() << '\0' << static_cast(BIND_OPCODE_SET_TYPE_IMM | BIND_TYPE_POINTER) << static_cast(BIND_OPCODE_DO_BIND); // DO_BIND causes dyld to both perform the binding and increment the offset lastBinding.offset += WordSize; } -static bool encodeNonLazyPointerSection(NonLazyPointerSectionBase *osec, - Binding 
&lastBinding, - raw_svector_ostream &os) { - bool didEncode = false; - size_t idx = 0; - for (const Symbol *sym : osec->getEntries()) { - if (const auto *dysym = dyn_cast(sym)) { - didEncode = true; - encodeBinding(*dysym, osec, idx * WordSize, /*addend=*/0, lastBinding, - os); +// Non-weak bindings need to have their dylib ordinal encoded as well. +static void encodeDylibOrdinal(const DylibSymbol *dysym, Binding &lastBinding, + raw_svector_ostream &os) { + using namespace llvm::MachO; + if (lastBinding.ordinal != dysym->file->ordinal) { + if (dysym->file->ordinal <= BIND_IMMEDIATE_MASK) { + os << static_cast(BIND_OPCODE_SET_DYLIB_ORDINAL_IMM | + dysym->file->ordinal); + } else { + os << static_cast(BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB); + encodeULEB128(dysym->file->ordinal, os); } - ++idx; + lastBinding.ordinal = dysym->file->ordinal; } - return didEncode; +} + +static void encodeWeakOverride(const Defined *defined, + raw_svector_ostream &os) { + using namespace llvm::MachO; + os << static_cast(BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM | + BIND_SYMBOL_FLAGS_NON_WEAK_DEFINITION) + << defined->getName() << '\0'; +} + +uint64_t BindingTarget::getVA() const { + if (auto *isec = section.dyn_cast()) + return isec->getVA() + offset; + return section.get()->addr + offset; } // Emit bind opcodes, which are a stream of byte-sized opcodes that dyld @@ -200,30 +206,25 @@ static bool encodeNonLazyPointerSection(NonLazyPointerSectionBase *osec, void BindingSection::finalizeContents() { raw_svector_ostream os{contents}; Binding lastBinding; - bool didEncode = encodeNonLazyPointerSection(in.got, lastBinding, os); - didEncode |= encodeNonLazyPointerSection(in.tlvPointers, lastBinding, os); - // Sorting the relocations by segment and address allows us to encode them - // more compactly. + // Since bindings are delta-encoded, sorting them allows for a more compact + // result. 
Note that sorting by address alone ensures that bindings for the + // same segment / section are located together. llvm::sort(bindings, [](const BindingEntry &a, const BindingEntry &b) { - OutputSegment *segA = a.isec->parent->parent; - OutputSegment *segB = b.isec->parent->parent; - if (segA != segB) - return segA->fileOff < segB->fileOff; - OutputSection *osecA = a.isec->parent; - OutputSection *osecB = b.isec->parent; - if (osecA != osecB) - return osecA->addr < osecB->addr; - if (a.isec != b.isec) - return a.isec->outSecOff < b.isec->outSecOff; - return a.offset < b.offset; + return a.target.getVA() < b.target.getVA(); }); for (const BindingEntry &b : bindings) { - didEncode = true; - encodeBinding(*b.dysym, b.isec->parent, b.isec->outSecOff + b.offset, - b.addend, lastBinding, os); + encodeDylibOrdinal(b.dysym, lastBinding, os); + if (auto *isec = b.target.section.dyn_cast()) { + encodeBinding(b.dysym, isec->parent, isec->outSecOff + b.target.offset, + b.target.addend, lastBinding, os); + } else { + auto *osec = b.target.section.get(); + encodeBinding(b.dysym, osec, b.target.offset, b.target.addend, + lastBinding, os); + } } - if (didEncode) + if (!bindings.empty()) os << static_cast(MachO::BIND_OPCODE_DONE); } @@ -231,6 +232,69 @@ void BindingSection::writeTo(uint8_t *buf) const { memcpy(buf, contents.data(), contents.size()); } +WeakBindingSection::WeakBindingSection() + : LinkEditSection(segment_names::linkEdit, section_names::weakBinding) {} + +void WeakBindingSection::finalizeContents() { + raw_svector_ostream os{contents}; + Binding lastBinding; + + for (const Defined *defined : definitions) + encodeWeakOverride(defined, os); + + // Since bindings are delta-encoded, sorting them allows for a more compact + // result. 
+ llvm::sort(bindings, + [](const WeakBindingEntry &a, const WeakBindingEntry &b) { + return a.target.getVA() < b.target.getVA(); + }); + for (const WeakBindingEntry &b : bindings) { + if (auto *isec = b.target.section.dyn_cast()) { + encodeBinding(b.symbol, isec->parent, isec->outSecOff + b.target.offset, + b.target.addend, lastBinding, os); + } else { + auto *osec = b.target.section.get(); + encodeBinding(b.symbol, osec, b.target.offset, b.target.addend, + lastBinding, os); + } + } + if (!bindings.empty() || !definitions.empty()) + os << static_cast(MachO::BIND_OPCODE_DONE); +} + +void WeakBindingSection::writeTo(uint8_t *buf) const { + memcpy(buf, contents.data(), contents.size()); +} + +bool macho::needsBinding(const Symbol *sym) { + if (isa(sym)) { + return true; + } else if (const auto *defined = dyn_cast(sym)) { + if (defined->isWeakDef() && defined->isExternal()) + return true; + } + return false; +} + +void macho::addNonLazyBindingEntries(const Symbol *sym, + SectionPointerUnion section, + uint64_t offset, int64_t addend) { + if (auto *dysym = dyn_cast(sym)) { + in.binding->addEntry(dysym, section, offset, addend); + if (dysym->isWeakDef()) + in.weakBinding->addEntry(sym, section, offset, addend); + } else if (auto *defined = dyn_cast(sym)) { + if (defined->isWeakDef() && defined->isExternal()) + in.weakBinding->addEntry(sym, section, offset, addend); + } else if (isa(sym)) { + error("cannot bind to " + DSOHandle::name); + } else { + // Undefined symbols are filtered out in scanRelocations(); we should never + // get here + llvm_unreachable("cannot bind to an undefined symbol"); + } +} + StubsSection::StubsSection() : SyntheticSection(segment_names::text, "__stubs") {} @@ -240,15 +304,17 @@ uint64_t StubsSection::getSize() const { void StubsSection::writeTo(uint8_t *buf) const { size_t off = 0; - for (const DylibSymbol *sym : in.stubs->getEntries()) { + for (const Symbol *sym : entries) { target->writeStub(buf + off, *sym); off += target->stubSize; } } 
-void StubsSection::addEntry(DylibSymbol &sym) { - if (entries.insert(&sym)) - sym.stubsIndex = entries.size() - 1; +bool StubsSection::addEntry(Symbol *sym) { + bool inserted = entries.insert(sym); + if (inserted) + sym->stubsIndex = entries.size() - 1; + return inserted; } StubHelperSection::StubHelperSection() @@ -256,17 +322,15 @@ StubHelperSection::StubHelperSection() uint64_t StubHelperSection::getSize() const { return target->stubHelperHeaderSize + - in.stubs->getEntries().size() * target->stubHelperEntrySize; + in.lazyBinding->getEntries().size() * target->stubHelperEntrySize; } -bool StubHelperSection::isNeeded() const { - return !in.stubs->getEntries().empty(); -} +bool StubHelperSection::isNeeded() const { return in.lazyBinding->isNeeded(); } void StubHelperSection::writeTo(uint8_t *buf) const { target->writeStubHelperHeader(buf); size_t off = target->stubHelperHeaderSize; - for (const DylibSymbol *sym : in.stubs->getEntries()) { + for (const DylibSymbol *sym : in.lazyBinding->getEntries()) { target->writeStubHelperEntry(buf + off, *sym, addr + off); off += target->stubHelperEntrySize; } @@ -279,7 +343,7 @@ void StubHelperSection::setup() { "Needed to perform lazy binding."); return; } - in.got->addEntry(*stubBinder); + in.got->addEntry(stubBinder); inputSections.push_back(in.imageLoaderCache); symtab->addDefined("__dyld_private", in.imageLoaderCache, 0, @@ -310,10 +374,17 @@ bool LazyPointerSection::isNeeded() const { void LazyPointerSection::writeTo(uint8_t *buf) const { size_t off = 0; - for (const DylibSymbol *sym : in.stubs->getEntries()) { - uint64_t stubHelperOffset = target->stubHelperHeaderSize + - sym->stubsIndex * target->stubHelperEntrySize; - write64le(buf + off, in.stubHelper->addr + stubHelperOffset); + for (const Symbol *sym : in.stubs->getEntries()) { + if (const auto *dysym = dyn_cast(sym)) { + if (dysym->hasStubsHelper()) { + uint64_t stubHelperOffset = + target->stubHelperHeaderSize + + dysym->stubsHelperIndex * 
target->stubHelperEntrySize; + write64le(buf + off, in.stubHelper->addr + stubHelperOffset); + } + } else { + write64le(buf + off, sym->getVA()); + } off += WordSize; } } @@ -321,12 +392,10 @@ void LazyPointerSection::writeTo(uint8_t *buf) const { LazyBindingSection::LazyBindingSection() : LinkEditSection(segment_names::linkEdit, section_names::lazyBinding) {} -bool LazyBindingSection::isNeeded() const { return in.stubs->isNeeded(); } - void LazyBindingSection::finalizeContents() { // TODO: Just precompute output size here instead of writing to a temporary // buffer - for (DylibSymbol *sym : in.stubs->getEntries()) + for (DylibSymbol *sym : entries) sym->lazyBindOffset = encode(*sym); } @@ -334,6 +403,11 @@ void LazyBindingSection::writeTo(uint8_t *buf) const { memcpy(buf, contents.data(), contents.size()); } +void LazyBindingSection::addEntry(DylibSymbol *dysym) { + if (entries.insert(dysym)) + dysym->stubsHelperIndex = entries.size() - 1; +} + // Unlike the non-lazy binding section, the bind opcodes in this section aren't // interpreted all at once. Rather, dyld will start interpreting opcodes at a // given offset, typically only binding a single symbol before it finds a @@ -368,19 +442,22 @@ ExportSection::ExportSection() void ExportSection::finalizeContents() { // TODO: We should check symbol visibility. 
- for (const Symbol *sym : symtab->getSymbols()) - if (auto *defined = dyn_cast(sym)) + for (const Symbol *sym : symtab->getSymbols()) { + if (const auto *defined = dyn_cast(sym)) { trieBuilder.addSymbol(*defined); + hasWeakSymbol = hasWeakSymbol || sym->isWeakDef(); + } + } size = trieBuilder.build(); } void ExportSection::writeTo(uint8_t *buf) const { trieBuilder.writeTo(buf); } SymtabSection::SymtabSection(StringTableSection &stringTableSection) - : SyntheticSection(segment_names::linkEdit, section_names::symbolTable), + : LinkEditSection(segment_names::linkEdit, section_names::symbolTable), stringTableSection(stringTableSection) {} -uint64_t SymtabSection::getSize() const { +uint64_t SymtabSection::getRawSize() const { return symbols.size() * sizeof(structs::nlist_64); } diff --git a/lld/MachO/SyntheticSections.h b/lld/MachO/SyntheticSections.h index a900ef287331d..447327ba4d64d 100644 --- a/lld/MachO/SyntheticSections.h +++ b/lld/MachO/SyntheticSections.h @@ -16,6 +16,7 @@ #include "OutputSegment.h" #include "Target.h" +#include "llvm/ADT/PointerUnion.h" #include "llvm/ADT/SetVector.h" #include "llvm/Support/raw_ostream.h" @@ -27,6 +28,7 @@ namespace section_names { constexpr const char pageZero[] = "__pagezero"; constexpr const char header[] = "__mach_header"; constexpr const char binding[] = "__binding"; +constexpr const char weakBinding[] = "__weak_binding"; constexpr const char lazyBinding[] = "__lazy_binding"; constexpr const char export_[] = "__export"; constexpr const char symbolTable[] = "__symbol_table"; @@ -36,6 +38,7 @@ constexpr const char threadPtrs[] = "__thread_ptrs"; } // namespace section_names +class Defined; class DylibSymbol; class LoadCommand; @@ -59,6 +62,11 @@ class LinkEditSection : public SyntheticSection { align = WordSize; } + // Sections in __LINKEDIT are special: their offsets are recorded in the + // load commands like LC_DYLD_INFO_ONLY and LC_SYMTAB, instead of in section + // headers. 
+ bool isHidden() const override final { return true; } + virtual uint64_t getRawSize() const = 0; // codesign (or more specifically libstuff) checks that each section in @@ -113,7 +121,7 @@ class NonLazyPointerSectionBase : public SyntheticSection { void writeTo(uint8_t *buf) const override; - void addEntry(Symbol &sym); + void addEntry(Symbol *sym); private: llvm::SetVector entries; @@ -136,14 +144,25 @@ class TlvPointerSection : public NonLazyPointerSectionBase { section_names::threadPtrs) {} }; -struct BindingEntry { - const DylibSymbol *dysym; - const InputSection *isec; +using SectionPointerUnion = + llvm::PointerUnion; + +struct BindingTarget { + SectionPointerUnion section; uint64_t offset; int64_t addend; - BindingEntry(const DylibSymbol *dysym, const InputSection *isec, - uint64_t offset, int64_t addend) - : dysym(dysym), isec(isec), offset(offset), addend(addend) {} + + BindingTarget(SectionPointerUnion section, uint64_t offset, int64_t addend) + : section(section), offset(offset), addend(addend) {} + + uint64_t getVA() const; +}; + +struct BindingEntry { + const DylibSymbol *dysym; + BindingTarget target; + BindingEntry(const DylibSymbol *dysym, BindingTarget target) + : dysym(dysym), target(std::move(target)) {} }; // Stores bind opcodes for telling dyld which symbols to load non-lazily. @@ -152,16 +171,12 @@ class BindingSection : public LinkEditSection { BindingSection(); void finalizeContents(); uint64_t getRawSize() const override { return contents.size(); } - // Like other sections in __LINKEDIT, the binding section is special: its - // offsets are recorded in the LC_DYLD_INFO_ONLY load command, instead of in - // section headers. 
- bool isHidden() const override { return true; } - bool isNeeded() const override; + bool isNeeded() const override { return !bindings.empty(); } void writeTo(uint8_t *buf) const override; - void addEntry(const DylibSymbol *dysym, const InputSection *isec, - uint64_t offset, int64_t addend) { - bindings.emplace_back(dysym, isec, offset, addend); + void addEntry(const DylibSymbol *dysym, SectionPointerUnion section, + uint64_t offset, int64_t addend = 0) { + bindings.emplace_back(dysym, BindingTarget(section, offset, addend)); } private: @@ -169,16 +184,72 @@ class BindingSection : public LinkEditSection { SmallVector contents; }; +struct WeakBindingEntry { + const Symbol *symbol; + BindingTarget target; + WeakBindingEntry(const Symbol *symbol, BindingTarget target) + : symbol(symbol), target(std::move(target)) {} +}; + +// Stores bind opcodes for telling dyld which weak symbols need coalescing. +// There are two types of entries in this section: +// +// 1) Non-weak definitions: This is a symbol definition that weak symbols in +// other dylibs should coalesce to. +// +// 2) Weak bindings: These tell dyld that a given symbol reference should +// coalesce to a non-weak definition if one is found. Note that unlike in the +// entries in the BindingSection, the bindings here only refer to these +// symbols by name, but do not specify which dylib to load them from. 
+class WeakBindingSection : public LinkEditSection { +public: + WeakBindingSection(); + void finalizeContents(); + uint64_t getRawSize() const override { return contents.size(); } + bool isNeeded() const override { + return !bindings.empty() || !definitions.empty(); + } + + void writeTo(uint8_t *buf) const override; + + void addEntry(const Symbol *symbol, SectionPointerUnion section, + uint64_t offset, int64_t addend = 0) { + bindings.emplace_back(symbol, BindingTarget(section, offset, addend)); + } + + bool hasEntry() const { return !bindings.empty(); } + + void addNonWeakDefinition(const Defined *defined) { + definitions.emplace_back(defined); + } + + bool hasNonWeakDefinition() const { return !definitions.empty(); } + +private: + std::vector bindings; + std::vector definitions; + SmallVector contents; +}; + +// Whether a given symbol's address can only be resolved at runtime. +bool needsBinding(const Symbol *); + +// Add bindings for symbols that need weak or non-lazy bindings. +void addNonLazyBindingEntries(const Symbol *, SectionPointerUnion, + uint64_t offset, int64_t addend = 0); + // The following sections implement lazy symbol binding -- very similar to the // PLT mechanism in ELF. // -// ELF's .plt section is broken up into two sections in Mach-O: StubsSection and -// StubHelperSection. Calls to functions in dylibs will end up calling into +// ELF's .plt section is broken up into two sections in Mach-O: StubsSection +// and StubHelperSection. Calls to functions in dylibs will end up calling into // StubsSection, which contains indirect jumps to addresses stored in the // LazyPointerSection (the counterpart to ELF's .plt.got). // -// Initially, the LazyPointerSection contains addresses that point into one of -// the entry points in the middle of the StubHelperSection. The code in +// We will first describe how non-weak symbols are handled. 
+// +// At program start, the LazyPointerSection contains addresses that point into +// one of the entry points in the middle of the StubHelperSection. The code in // StubHelperSection will push on the stack an offset into the // LazyBindingSection. The push is followed by a jump to the beginning of the // StubHelperSection (similar to PLT0), which then calls into dyld_stub_binder. @@ -186,10 +257,17 @@ class BindingSection : public LinkEditSection { // the GOT. // // The stub binder will look up the bind opcodes in the LazyBindingSection at -// the given offset. The bind opcodes will tell the binder to update the address -// in the LazyPointerSection to point to the symbol, so that subsequent calls -// don't have to redo the symbol resolution. The binder will then jump to the -// resolved symbol. +// the given offset. The bind opcodes will tell the binder to update the +// address in the LazyPointerSection to point to the symbol, so that subsequent +// calls don't have to redo the symbol resolution. The binder will then jump to +// the resolved symbol. +// +// With weak symbols, the situation is slightly different. Since there is no +// "weak lazy" lookup, function calls to weak symbols are always non-lazily +// bound. We emit both regular non-lazy bindings as well as weak bindings, in +// order that the weak bindings may overwrite the non-lazy bindings if an +// appropriate symbol is found at runtime. However, the bound addresses will +// still be written (non-lazily) into the LazyPointerSection. class StubsSection : public SyntheticSection { public: @@ -197,13 +275,13 @@ class StubsSection : public SyntheticSection { uint64_t getSize() const override; bool isNeeded() const override { return !entries.empty(); } void writeTo(uint8_t *buf) const override; - - const llvm::SetVector &getEntries() const { return entries; } - - void addEntry(DylibSymbol &sym); + const llvm::SetVector &getEntries() const { return entries; } + // Returns whether the symbol was added. 
Note that every stubs entry will + // have a corresponding entry in the LazyPointerSection. + bool addEntry(Symbol *); private: - llvm::SetVector entries; + llvm::SetVector entries; }; class StubHelperSection : public SyntheticSection { @@ -228,6 +306,8 @@ class ImageLoaderCacheSection : public InputSection { uint64_t getSize() const override { return WordSize; } }; +// Note that this section may also be targeted by non-lazy bindings. In +// particular, this happens when branch relocations target weak symbols. class LazyPointerSection : public SyntheticSection { public: LazyPointerSection(); @@ -241,15 +321,17 @@ class LazyBindingSection : public LinkEditSection { LazyBindingSection(); void finalizeContents(); uint64_t getRawSize() const override { return contents.size(); } - uint32_t encode(const DylibSymbol &); - // Like other sections in __LINKEDIT, the lazy binding section is special: its - // offsets are recorded in the LC_DYLD_INFO_ONLY load command, instead of in - // section headers. - bool isHidden() const override { return true; } - bool isNeeded() const override; + bool isNeeded() const override { return !entries.empty(); } void writeTo(uint8_t *buf) const override; + // Note that every entry here will by referenced by a corresponding entry in + // the StubHelperSection. + void addEntry(DylibSymbol *dysym); + const llvm::SetVector &getEntries() const { return entries; } private: + uint32_t encode(const DylibSymbol &); + + llvm::SetVector entries; SmallVector contents; llvm::raw_svector_ostream os{contents}; }; @@ -260,12 +342,10 @@ class ExportSection : public LinkEditSection { ExportSection(); void finalizeContents(); uint64_t getRawSize() const override { return size; } - // Like other sections in __LINKEDIT, the export section is special: its - // offsets are recorded in the LC_DYLD_INFO_ONLY load command, instead of in - // section headers. 
- bool isHidden() const override { return true; } void writeTo(uint8_t *buf) const override; + bool hasWeakSymbol = false; + private: TrieBuilder trieBuilder; size_t size = 0; @@ -278,10 +358,6 @@ class StringTableSection : public LinkEditSection { // Returns the start offset of the added string. uint32_t addString(StringRef); uint64_t getRawSize() const override { return size; } - // Like other sections in __LINKEDIT, the string table section is special: its - // offsets are recorded in the LC_SYMTAB load command, instead of in section - // headers. - bool isHidden() const override { return true; } void writeTo(uint8_t *buf) const override; private: @@ -297,16 +373,12 @@ struct SymtabEntry { size_t strx; }; -class SymtabSection : public SyntheticSection { +class SymtabSection : public LinkEditSection { public: SymtabSection(StringTableSection &); void finalizeContents(); size_t getNumSymbols() const { return symbols.size(); } - uint64_t getSize() const override; - // Like other sections in __LINKEDIT, the symtab section is special: its - // offsets are recorded in the LC_SYMTAB load command, instead of in section - // headers. - bool isHidden() const override { return true; } + uint64_t getRawSize() const override; void writeTo(uint8_t *buf) const override; private: @@ -317,6 +389,9 @@ class SymtabSection : public SyntheticSection { struct InStruct { MachHeaderSection *header = nullptr; BindingSection *binding = nullptr; + WeakBindingSection *weakBinding = nullptr; + LazyBindingSection *lazyBinding = nullptr; + ExportSection *exports = nullptr; GotSection *got = nullptr; TlvPointerSection *tlvPointers = nullptr; LazyPointerSection *lazyPointers = nullptr; diff --git a/lld/MachO/Target.h b/lld/MachO/Target.h index cbee6afa6b619..d80da011e2864 100644 --- a/lld/MachO/Target.h +++ b/lld/MachO/Target.h @@ -44,7 +44,7 @@ class TargetInfo { // Write code for lazy binding. See the comments on StubsSection for more // details. 
- virtual void writeStub(uint8_t *buf, const DylibSymbol &) const = 0; + virtual void writeStub(uint8_t *buf, const Symbol &) const = 0; virtual void writeStubHelperHeader(uint8_t *buf) const = 0; virtual void writeStubHelperEntry(uint8_t *buf, const DylibSymbol &, uint64_t entryAddr) const = 0; @@ -54,7 +54,7 @@ class TargetInfo { // GOT/stubs entries, and resolveSymbolVA() will return the addresses of those // entries. resolveSymbolVA() may also relax the target instructions to save // on a level of address indirection. - virtual void prepareSymbolRelocation(Symbol &, const InputSection *, + virtual void prepareSymbolRelocation(Symbol *, const InputSection *, const Reloc &) = 0; virtual uint64_t resolveSymbolVA(uint8_t *buf, const Symbol &, uint8_t type) const = 0; diff --git a/lld/MachO/Writer.cpp b/lld/MachO/Writer.cpp index 3593ff692c3d2..741ec31b19ee9 100644 --- a/lld/MachO/Writer.cpp +++ b/lld/MachO/Writer.cpp @@ -55,8 +55,6 @@ class Writer { uint64_t addr = 0; uint64_t fileOff = 0; MachHeaderSection *header = nullptr; - LazyBindingSection *lazyBindingSection = nullptr; - ExportSection *exportSection = nullptr; StringTableSection *stringTableSection = nullptr; SymtabSection *symtabSection = nullptr; }; @@ -65,10 +63,11 @@ class Writer { class LCDyldInfo : public LoadCommand { public: LCDyldInfo(BindingSection *bindingSection, + WeakBindingSection *weakBindingSection, LazyBindingSection *lazyBindingSection, ExportSection *exportSection) - : bindingSection(bindingSection), lazyBindingSection(lazyBindingSection), - exportSection(exportSection) {} + : bindingSection(bindingSection), weakBindingSection(weakBindingSection), + lazyBindingSection(lazyBindingSection), exportSection(exportSection) {} uint32_t getSize() const override { return sizeof(dyld_info_command); } @@ -80,6 +79,10 @@ class LCDyldInfo : public LoadCommand { c->bind_off = bindingSection->fileOff; c->bind_size = bindingSection->getFileSize(); } + if (weakBindingSection->isNeeded()) { + 
c->weak_bind_off = weakBindingSection->fileOff; + c->weak_bind_size = weakBindingSection->getFileSize(); + } if (lazyBindingSection->isNeeded()) { c->lazy_bind_off = lazyBindingSection->fileOff; c->lazy_bind_size = lazyBindingSection->getFileSize(); @@ -91,6 +94,7 @@ class LCDyldInfo : public LoadCommand { } BindingSection *bindingSection; + WeakBindingSection *weakBindingSection; LazyBindingSection *lazyBindingSection; ExportSection *exportSection; }; @@ -314,7 +318,7 @@ void Writer::scanRelocations() { error("undefined symbol " + s->getName() + ", referenced from " + sys::path::filename(isec->file->getName())); else - target->prepareSymbolRelocation(*s, isec, r); + target->prepareSymbolRelocation(s, isec, r); } } } @@ -322,7 +326,7 @@ void Writer::scanRelocations() { void Writer::createLoadCommands() { in.header->addLoadCommand( - make(in.binding, lazyBindingSection, exportSection)); + make(in.binding, in.weakBinding, in.lazyBinding, in.exports)); in.header->addLoadCommand(make(symtabSection, stringTableSection)); in.header->addLoadCommand(make()); for (StringRef path : config->runtimePaths) @@ -414,7 +418,8 @@ static int sectionOrder(OutputSection *osec) { return -1; } else if (segname == segment_names::linkEdit) { return StringSwitch(osec->name) - .Case(section_names::binding, -5) + .Case(section_names::binding, -6) + .Case(section_names::weakBinding, -5) .Case(section_names::lazyBinding, -4) .Case(section_names::export_, -3) .Case(section_names::symbolTable, -2) @@ -466,10 +471,8 @@ static void sortSegmentsAndSections() { void Writer::createOutputSections() { // First, create hidden sections - lazyBindingSection = make(); stringTableSection = make(); symtabSection = make(*stringTableSection); - exportSection = make(); switch (config->outputType) { case MH_EXECUTE: @@ -558,6 +561,11 @@ void Writer::run() { if (in.stubHelper->isNeeded()) in.stubHelper->setup(); + for (const macho::Symbol *sym : symtab->getSymbols()) + if (const auto *defined = dyn_cast(sym)) + 
if (defined->overridesWeakDef) + in.weakBinding->addNonWeakDefinition(defined); + // Sort and assign sections to their respective segments. No more sections nor // segments may be created after these methods run. createOutputSections(); @@ -577,8 +585,9 @@ void Writer::run() { // Fill __LINKEDIT contents. in.binding->finalizeContents(); - lazyBindingSection->finalizeContents(); - exportSection->finalizeContents(); + in.weakBinding->finalizeContents(); + in.lazyBinding->finalizeContents(); + in.exports->finalizeContents(); symtabSection->finalizeContents(); // Now that __LINKEDIT is filled out, do a proper calculation of its @@ -600,6 +609,9 @@ void macho::writeResult() { Writer().run(); } void macho::createSyntheticSections() { in.header = make(); in.binding = make(); + in.weakBinding = make(); + in.lazyBinding = make(); + in.exports = make(); in.got = make(); in.tlvPointers = make(); in.lazyPointers = make(); diff --git a/lld/MinGW/Driver.cpp b/lld/MinGW/Driver.cpp index d60765c70c095..af7ecb8801e9d 100644 --- a/lld/MinGW/Driver.cpp +++ b/lld/MinGW/Driver.cpp @@ -292,7 +292,8 @@ bool mingw::link(ArrayRef argsArr, bool canExitEarly, add("-noseh"); if (args.getLastArgValue(OPT_m) != "thumb2pe" && - args.getLastArgValue(OPT_m) != "arm64pe" && !args.hasArg(OPT_dynamicbase)) + args.getLastArgValue(OPT_m) != "arm64pe" && + args.hasArg(OPT_no_dynamicbase)) add("-dynamicbase:no"); if (args.hasFlag(OPT_no_insert_timestamp, OPT_insert_timestamp, false)) @@ -313,6 +314,10 @@ bool mingw::link(ArrayRef argsArr, bool canExitEarly, else add("-runtime-pseudo-reloc:no"); + if (args.hasFlag(OPT_allow_multiple_definition, + OPT_no_allow_multiple_definition, false)) + add("-force:multiple"); + if (auto *a = args.getLastArg(OPT_icf)) { StringRef s = a->getValue(); if (s == "all") diff --git a/lld/MinGW/Options.td b/lld/MinGW/Options.td index fe44166600509..d9f64c40ac53d 100644 --- a/lld/MinGW/Options.td +++ b/lld/MinGW/Options.td @@ -16,15 +16,23 @@ multiclass EqLong { HelpText; } 
+multiclass B { + def NAME: Flag<["--", "-"], name>, HelpText; + def no_ # NAME: Flag<["--", "-"], "no-" # name>, HelpText; +} + def L: JoinedOrSeparate<["-"], "L">, MetaVarName<"">, HelpText<"Add a directory to the library search path">; +defm allow_multiple_definition: B<"allow-multiple-definition", + "Allow multiple definitions", + "Do not allow multiple definitions (default)">; def Bdynamic: F<"Bdynamic">, HelpText<"Link against shared libraries">; def Bstatic: F<"Bstatic">, HelpText<"Do not link against shared libraries">; def disable_auto_import: F<"disable-auto-import">, HelpText<"Don't automatically import data symbols from other DLLs without dllimport">; def disable_runtime_pseudo_reloc: F<"disable-runtime-pseudo-reloc">, HelpText<"Don't do automatic imports that require runtime fixups">; -def dynamicbase: F<"dynamicbase">, HelpText<"Enable ASLR">; +defm dynamicbase: B<"dynamicbase", "Enable ASLR", "Disable ASLR">; def enable_auto_import: F<"enable-auto-import">, HelpText<"Automatically import data symbols from other DLLs where needed">; def enable_runtime_pseudo_reloc: F<"enable-runtime-pseudo-reloc">, @@ -35,12 +43,15 @@ def exclude_all_symbols: F<"exclude-all-symbols">, def export_all_symbols: F<"export-all-symbols">, HelpText<"Export all symbols even if a def file or dllexport attributes are used">; defm file_alignment: Eq<"file-alignment", "Set file alignment">; -def gc_sections: F<"gc-sections">, HelpText<"Remove unused sections">; +defm gc_sections: B<"gc-sections", + "Remove unused sections", + "Don't remove unused sections">; def help: F<"help">, HelpText<"Print option help">; def icf: J<"icf=">, HelpText<"Identical code folding">; def image_base: S<"image-base">, HelpText<"Base address of the program">; -def insert_timestamp: F<"insert-timestamp">, - HelpText<"Include PE header timestamp">; +defm insert_timestamp: B<"insert-timestamp", + "Include PE header timestamp", + "Don't include PE header timestamp">; def kill_at: F<"kill-at">, 
HelpText<"Remove @n from exported symbols">; def l: JoinedOrSeparate<["-"], "l">, MetaVarName<"">, HelpText<"Root name of library to use">; @@ -54,14 +65,9 @@ defm minor_os_version: EqLong<"minor-os-version", "Set the OS and subsystem minor version">; defm minor_subsystem_version: EqLong<"minor-subsystem-version", "Set the OS and subsystem minor version">; -def no_insert_timestamp: F<"no-insert-timestamp">, - HelpText<"Don't include PE header timestamp">; def no_seh: F<"no-seh">, HelpText<"Set the 'no SEH' flag in the executable">; -def no_whole_archive: F<"no-whole-archive">, - HelpText<"No longer include all object files for following archives">; def large_address_aware: Flag<["--"], "large-address-aware">, HelpText<"Enable large addresses">; -def no_gc_sections: F<"no-gc-sections">, HelpText<"Don't remove unused sections">; def o: JoinedOrSeparate<["-"], "o">, MetaVarName<"">, HelpText<"Path to file to write output">; defm out_implib: Eq<"out-implib", "Import library name">; @@ -76,8 +82,9 @@ def strip_debug: F<"strip-debug">, HelpText<"Omit all debug information, but keep symbol information">; defm reproduce: Eq<"reproduce", "Write a tar file containing input files and command line options to reproduce link">; defm undefined: Eq<"undefined", "Include symbol in the link, if available">; -def whole_archive: F<"whole-archive">, - HelpText<"Include all object files for following archives">; +defm whole_archive: B<"whole-archive", + "Include all object files for following archives", + "No longer include all object files for following archives">; def v: Flag<["-"], "v">, HelpText<"Display the version number">; def verbose: F<"verbose">, HelpText<"Verbose mode">; def version: F<"version">, HelpText<"Display the version number and exit">; diff --git a/lld/test/COFF/Inputs/comdat-binutils.yaml b/lld/test/COFF/Inputs/comdat-binutils.yaml new file mode 100644 index 0000000000000..70fff87e1a442 --- /dev/null +++ b/lld/test/COFF/Inputs/comdat-binutils.yaml @@ -0,0 +1,30 @@ 
+--- !COFF +header: + Machine: IMAGE_FILE_MACHINE_AMD64 + Characteristics: [ IMAGE_FILE_RELOCS_STRIPPED, IMAGE_FILE_LINE_NUMS_STRIPPED ] +sections: + - Name: '.rdata$mysymbol' + Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_LNK_COMDAT, IMAGE_SCN_MEM_READ ] + Alignment: 16 + SectionData: 2A000000000000000000000000000000 +symbols: + - Name: '.rdata$mysymbol' + Value: 0 + SectionNumber: 1 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_NULL + StorageClass: IMAGE_SYM_CLASS_STATIC + SectionDefinition: + Length: 1 + NumberOfRelocations: 0 + NumberOfLinenumbers: 0 + CheckSum: 0 + Number: 0 + Selection: IMAGE_COMDAT_SELECT_SAME_SIZE + - Name: mysymbol + Value: 0 + SectionNumber: 1 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_NULL + StorageClass: IMAGE_SYM_CLASS_EXTERNAL +... diff --git a/lld/test/COFF/Inputs/comdat-llvm.yaml b/lld/test/COFF/Inputs/comdat-llvm.yaml new file mode 100644 index 0000000000000..eef117a92334f --- /dev/null +++ b/lld/test/COFF/Inputs/comdat-llvm.yaml @@ -0,0 +1,30 @@ +--- !COFF +header: + Machine: IMAGE_FILE_MACHINE_AMD64 + Characteristics: [ ] +sections: + - Name: '.rdata$mysymbol' + Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_LNK_COMDAT, IMAGE_SCN_MEM_READ ] + Alignment: 1 + SectionData: 2A +symbols: + - Name: '.rdata$mysymbol' + Value: 0 + SectionNumber: 1 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_NULL + StorageClass: IMAGE_SYM_CLASS_STATIC + SectionDefinition: + Length: 1 + NumberOfRelocations: 0 + NumberOfLinenumbers: 0 + CheckSum: 3686517206 + Number: 1 + Selection: IMAGE_COMDAT_SELECT_SAME_SIZE + - Name: mysymbol + Value: 0 + SectionNumber: 1 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_NULL + StorageClass: IMAGE_SYM_CLASS_EXTERNAL +... 
diff --git a/lld/test/COFF/comdat-gcc-compatibility-size.test b/lld/test/COFF/comdat-gcc-compatibility-size.test new file mode 100644 index 0000000000000..493867646362d --- /dev/null +++ b/lld/test/COFF/comdat-gcc-compatibility-size.test @@ -0,0 +1,15 @@ +# RUN: yaml2obj %p/Inputs/comdat-llvm.yaml > %t.llvm.o +# RUN: yaml2obj %p/Inputs/comdat-binutils.yaml > %t.binutils.o +# RUN: lld-link -lldmingw -noentry -dll %t.llvm.o %t.binutils.o -out:%t.dll +# RUN: lld-link -lldmingw -noentry -dll %t.binutils.o %t.llvm.o -out:%t.dll +# RUN: not lld-link -noentry -dll %t.llvm.o %t.binutils.o -out:%t.dll +# RUN: not lld-link -noentry -dll %t.binutils.o %t.llvm.o -out:%t.dll + +# The test object files have been generated by assembling the following +# snippet using binutils and llvm. + +# .section .rdata$mysymbol, "dr" +# .linkonce same_size +# .globl mysymbol +#mysymbol: +# .byte 42 diff --git a/lld/test/ELF/gc-sections.s b/lld/test/ELF/gc-sections.s index b7f9b3db18207..a98696d41a002 100644 --- a/lld/test/ELF/gc-sections.s +++ b/lld/test/ELF/gc-sections.s @@ -12,6 +12,8 @@ # NOGC: Name: .text # NOGC: Name: .init # NOGC: Name: .fini +# NOGC: Name: .tdata +# NOGC: Name: .tbss # NOGC: Name: .ctors # NOGC: Name: .dtors # NOGC: Name: .debug_pubtypes @@ -19,6 +21,10 @@ # NOGC: Name: a # NOGC: Name: b # NOGC: Name: c +# NOGC: Name: e +# NOGC: Name: f +# NOGC: Name: g +# NOGC: Name: h # NOGC: Name: x # NOGC: Name: y # NOGC: Name: d @@ -27,6 +33,8 @@ # GC1: Name: .text # GC1: Name: .init # GC1: Name: .fini +# GC1: Name: .tdata +# GC1: Name: .tbss # GC1: Name: .ctors # GC1: Name: .dtors # GC1: Name: .debug_pubtypes @@ -34,6 +42,10 @@ # GC1: Name: a # GC1: Name: b # GC1: Name: c +# GC1: Name: e +# GC1-NOT: Name: f +# GC1: Name: g +# GC1-NOT: Name: h # GC1-NOT: Name: x # GC1-NOT: Name: y # GC1-NOT: Name: d @@ -42,6 +54,8 @@ # GC2: Name: .text # GC2: Name: .init # GC2: Name: .fini +# GC2: Name: .tdata +# GC2: Name: .tbss # GC2: Name: .ctors # GC2: Name: .dtors # GC2: Name: .debug_pubtypes 
@@ -49,12 +63,16 @@ # GC2: Name: a # GC2: Name: b # GC2: Name: c +# GC2: Name: e +# GC2-NOT: Name: f +# GC2: Name: g +# GC2-NOT: Name: h # GC2-NOT: Name: x # GC2-NOT: Name: y # GC2: Name: d .globl _start, d -.protected a, b, c, x, y +.protected a, b, c, e, f, g, h, x, y _start: call a @@ -65,11 +83,12 @@ a: .section .text.b,"ax",@progbits b: + leaq e@tpoff(%rax),%rdx call c .section .text.c,"ax",@progbits c: - nop + leaq g@tpoff(%rax),%rdx .section .text.d,"ax",@progbits d: @@ -83,6 +102,22 @@ x: y: call x +.section .tbss.e,"awT",@nobits +e: + .quad 0 + +.section .tbss.f,"awT",@nobits +f: + .quad 0 + +.section .tdata.g,"awT",@progbits +g: + .quad 0 + +.section .tdata.h,"awT",@progbits +h: + .quad 0 + .section .ctors,"aw",@progbits .quad 0 diff --git a/lld/test/ELF/gnu-property-err.s b/lld/test/ELF/gnu-property-err.s new file mode 100644 index 0000000000000..c400484e8816d --- /dev/null +++ b/lld/test/ELF/gnu-property-err.s @@ -0,0 +1,55 @@ +# REQUIRES: aarch64 +# RUN: split-file %s %t + +# RUN: llvm-mc -filetype=obj -triple=aarch64 %t/1.s -o %t1.o +# RUN: not ld.lld %t1.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR1 + +# ERR1: error: {{.*}}.o:(.note.gnu.property+0x0): data is too short + +# RUN: llvm-mc -filetype=obj -triple=aarch64 %t/2.s -o %t2.o +# RUN: not ld.lld %t2.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR2 +# RUN: llvm-mc -filetype=obj -triple=aarch64_be %t/2.s -o %t2be.o +# RUN: not ld.lld %t2be.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR2 + +# ERR2: error: {{.*}}.o:(.note.gnu.property+0x10): program property is too short + +# RUN: llvm-mc -filetype=obj -triple=aarch64 %t/3.s -o %t3.o +# RUN: not ld.lld %t3.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR3 +# RUN: llvm-mc -filetype=obj -triple=aarch64_be %t/3.s -o %t3be.o +# RUN: not ld.lld %t3be.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR3 + +# ERR3: error: {{.*}}.o:(.note.gnu.property+0x10): FEATURE_1_AND entry is too short + +#--- 1.s +.section 
".note.gnu.property", "a" +.long 4 +.long 17 // n_descsz too long +.long 5 // NT_GNU_PROPERTY_TYPE_0 +.asciz "GNU" + +.long 0xc0000000 // GNU_PROPERTY_AARCH64_FEATURE_1_AND +.long 4 // pr_datasz +.long 1 // GNU_PROPERTY_AARCH64_FEATURE_1_BTI +.long 0 + +#--- 2.s +.section ".note.gnu.property", "a" +.long 4 +.long 16 // n_descsz +.long 5 // NT_GNU_PROPERTY_TYPE_0 +.asciz "GNU" + +.long 0xc0000000 // GNU_PROPERTY_AARCH64_FEATURE_1_AND +.long 9 // pr_datasz too long +.long 1 // GNU_PROPERTY_AARCH64_FEATURE_1_BTI +.long 0 + +#--- 3.s +.section ".note.gnu.property", "a" +.long 4 +.long 8 // n_descsz +.long 5 // NT_GNU_PROPERTY_TYPE_0 +.asciz "GNU" + +.long 0xc0000000 // GNU_PROPERTY_AARCH64_FEATURE_1_AND +.long 0 // pr_datasz too short diff --git a/lld/test/ELF/linkerscript/non-alloc-segment.s b/lld/test/ELF/linkerscript/non-alloc-segment.s index 143cac1e2b126..f19ded3f17fdd 100644 --- a/lld/test/ELF/linkerscript/non-alloc-segment.s +++ b/lld/test/ELF/linkerscript/non-alloc-segment.s @@ -29,7 +29,7 @@ # CHECK-NEXT: 00 .text # CHECK-NEXT: 01 .foo -# PHDR: Type: (0x12345678) +# PHDR: Type: Unknown (0x12345678) # PHDR-NEXT: Offset: 0x1004 # PHDR-NEXT: VirtualAddress # PHDR-NEXT: PhysicalAddress diff --git a/lld/test/ELF/linkerscript/phdrs.s b/lld/test/ELF/linkerscript/phdrs.s index 2e9fcf23bf710..3e645f7919b7c 100644 --- a/lld/test/ELF/linkerscript/phdrs.s +++ b/lld/test/ELF/linkerscript/phdrs.s @@ -99,7 +99,7 @@ # INT-PHDRS: ProgramHeaders [ # INT-PHDRS: ProgramHeader { -# INT-PHDRS: Type: (0x11223344) +# INT-PHDRS: Type: Unknown (0x11223344) # INT-PHDRS-NEXT: Offset: 0xB0 # INT-PHDRS-NEXT: VirtualAddress: 0xB0 # INT-PHDRS-NEXT: PhysicalAddress: 0xB0 diff --git a/lld/test/ELF/merge-sym-gc.s b/lld/test/ELF/merge-sym-gc.s new file mode 100644 index 0000000000000..4f83179cc75b5 --- /dev/null +++ b/lld/test/ELF/merge-sym-gc.s @@ -0,0 +1,37 @@ +# REQUIRES: x86 +## Show how symbols in GCed mergeable pieces behave. 
+ +# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o +# RUN: ld.lld --gc-sections %t.o -o %t.elf +# RUN: llvm-readelf %t.elf --sections --syms | FileCheck %s + +.section .rodata.merge,"aM",@progbits,4 +a1: ## Unreferenced. In first fragment, kept by a2 reference. + .short 1 +a2: ## Referenced. + .short 1 +b1: ## Unreferenced. Discarded as second fragment is unreferenced. + .short 1 +b2: ## Unreferenced. Discarded as second fragment is unreferenced. + .short 1 +c1: ## Referenced. + .short 1 +c2: ## Unreferenced. In third fragment, kept by c1 reference. + .short 1 + +.data +.global _start +_start: + .quad a2 + .quad c1 + +# CHECK: .rodata PROGBITS [[#%x, ADDR:]] + +# CHECK: Symbol table '.symtab' contains 6 entries: +# CHECK-NEXT: Num: Value {{.*}} Ndx Name +# CHECK-NEXT: 0: {{.*}} UND{{ *$}} +# CHECK-NEXT: 1: {{0*}}[[#ADDR]] {{.*}} a1 +# CHECK-NEXT: 2: {{0*}}[[#ADDR+2]] {{.*}} a2 +# CHECK-NEXT: 3: {{0*}}[[#ADDR]] {{.*}} c1 +# CHECK-NEXT: 4: {{0*}}[[#ADDR+2]] {{.*}} c2 +# CHECK-NEXT: 5: {{.*}} _start diff --git a/lld/test/ELF/merge-sym.s b/lld/test/ELF/merge-sym.s index 27ae3e048e200..6449ef731057a 100644 --- a/lld/test/ELF/merge-sym.s +++ b/lld/test/ELF/merge-sym.s @@ -1,21 +1,21 @@ // REQUIRES: x86 // RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o // RUN: ld.lld %t.o -o %t.so -shared -// RUN: llvm-readobj --symbols -S %t.so | FileCheck %s +// RUN: llvm-readelf --symbols -S %t.so | FileCheck %s .section .rodata.cst4,"aM",@progbits,4 .short 0 foo: .short 42 + .short 0 +bar: + .short 42 -// CHECK: Name: .rodata -// CHECK-NEXT: Type: SHT_PROGBITS -// CHECK-NEXT: Flags [ -// CHECK-NEXT: SHF_ALLOC -// CHECK-NEXT: SHF_MERGE -// CHECK-NEXT: ] -// CHECK-NEXT: Address: 0x20D +// CHECK: Name Type Address {{.*}} ES Flg +// CHECK: .rodata PROGBITS [[#%x, ADDR:]] {{.*}} 04 AM{{ }} -// CHECK: Name: foo -// CHECK-NEXT: Value: 0x20F +// CHECK: Symbol table '.symtab' contains {{.*}} entries: +// CHECK-NEXT: Num: Value {{.*}} Name +// CHECK-DAG: {{.*}}: 
{{0*}}[[#ADDR+2]] {{.*}} foo +// CHECK-DAG: {{.*}}: {{0*}}[[#ADDR+2]] {{.*}} bar diff --git a/lld/test/MachO/Inputs/MacOSX.sdk/usr/lib/libc++.tbd b/lld/test/MachO/Inputs/MacOSX.sdk/usr/lib/libc++.tbd new file mode 100644 index 0000000000000..f7c70b20666ce --- /dev/null +++ b/lld/test/MachO/Inputs/MacOSX.sdk/usr/lib/libc++.tbd @@ -0,0 +1,10 @@ +--- !tapi-tbd-v3 +archs: [ i386, x86_64 ] +uuids: [ 'i386: 00000000-0000-0000-0000-000000000000', 'x86_64: 00000000-0000-0000-0000-000000000001' ] +platform: macosx +install-name: '/usr/lib/libc++.dylib' +current-version: 1281 +exports: + - archs: [ i386, x86_64 ] + re-exports: [ '/usr/lib/libc++abi.dylib' ] +... diff --git a/lld/test/MachO/Inputs/MacOSX.sdk/usr/lib/libc++abi.tbd b/lld/test/MachO/Inputs/MacOSX.sdk/usr/lib/libc++abi.tbd new file mode 100644 index 0000000000000..47b7456484b0e --- /dev/null +++ b/lld/test/MachO/Inputs/MacOSX.sdk/usr/lib/libc++abi.tbd @@ -0,0 +1,10 @@ +--- !tapi-tbd-v3 +archs: [ i386, x86_64 ] +uuids: [ 'i386: 00000000-0000-0000-0000-000000000000', 'x86_64: 00000000-0000-0000-0000-000000000001' ] +platform: macosx +install-name: '/usr/lib/libc++abi.dylib' +current-version: 1281 +exports: + - archs: [ i386, x86_64 ] + symbols: [ ___gxx_personality_v0 ] +... 
diff --git a/lld/test/MachO/Inputs/iPhoneSimulator.sdk/usr/lib/libSystem.tbd b/lld/test/MachO/Inputs/iPhoneSimulator.sdk/usr/lib/libSystem.tbd index 3e62c2ee711bc..86d576b507cd2 100644 --- a/lld/test/MachO/Inputs/iPhoneSimulator.sdk/usr/lib/libSystem.tbd +++ b/lld/test/MachO/Inputs/iPhoneSimulator.sdk/usr/lib/libSystem.tbd @@ -12,7 +12,7 @@ exports: archs: [ i386, x86_64 ] uuids: [ 'i386: 00000000-0000-0000-0000-000000000002', 'x86_64: 00000000-0000-0000-0000-000000000003' ] platform: ios -install-name: '/usr/lib/libcache.dylib' +install-name: '/usr/lib/system/libcache.dylib' current-version: 83 parent-umbrella: System exports: @@ -20,4 +20,15 @@ exports: symbols: [ __cache_handle_memory_pressure_event ] - archs: [ i386, x86_64 ] symbols: [ _cache_create, _cache_destroy, _cache_get ] + +# The following TAPI document is not re-exported by any other document in this +# TBD file, and should therefore be inaccessible. +--- !tapi-tbd-v3 +archs: [ i386, x86_64 ] +uuids: [ 'i386: 00000000-0000-0000-0000-000000000003', 'x86_64: 00000000-0000-0000-0000-000000000004' ] +platform: ios +install-name: '/usr/lib/libnotreexported.dylib' +exports: + - archs: [ i386, x86_64 ] + symbols: [ _from_non_reexported_tapi_dylib ] ... 
diff --git a/lld/test/MachO/archive.s b/lld/test/MachO/archive.s index 370980768faaa..cb81fb0de47f2 100644 --- a/lld/test/MachO/archive.s +++ b/lld/test/MachO/archive.s @@ -22,11 +22,17 @@ # ARCHIVE-FIRST: T _boo # ARCHIVE-FIRST: T _main - # RUN: llvm-nm %t/test.out | FileCheck %s --check-prefix VISIBLE # VISIBLE-NOT: T _undefined # VISIBLE-NOT: T _unused +# RUN: lld -flavor darwinnew %t/test.a %t/main.o -o %t/all-load -all_load +# RUN: llvm-nm %t/all-load | FileCheck %s --check-prefix ALL-LOAD +# ALL-LOAD: T _bar +# ALL-LOAD: T _boo +# ALL-LOAD: T _main +# ALL-LOAD: T _unused + .global _main _main: callq _boo diff --git a/lld/test/MachO/dso-handle.s b/lld/test/MachO/dso-handle.s index f57ec7260fe06..9cef6448b3703 100644 --- a/lld/test/MachO/dso-handle.s +++ b/lld/test/MachO/dso-handle.s @@ -3,14 +3,17 @@ # RUN: lld -flavor darwinnew %t.o -o %t # RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck %s -# CHECK: leaq {{.*}} # 100000000 +# CHECK: leaq {{.*}} # 100000000 +# CHECK-NEXT: leaq {{.*}} # 100000000 # RUN: lld -flavor darwinnew -dylib %t.o -o %t.dylib # RUN: llvm-objdump -d --no-show-raw-insn %t.dylib | FileCheck %s --check-prefix=DYLIB-CHECK -# DYLIB-CHECK: leaq {{.*}} # 0 +# DYLIB-CHECK: leaq {{.*}} # 0 +# DYLIB-CHECK-NEXT: leaq {{.*}} # 0 .globl _main .text _main: leaq ___dso_handle(%rip), %rdx + movq ___dso_handle@GOTPCREL(%rip), %rdx ret diff --git a/lld/test/MachO/invalid/stub-link.s b/lld/test/MachO/invalid/stub-link.s index f1c1590370240..bcbdea1220a44 100644 --- a/lld/test/MachO/invalid/stub-link.s +++ b/lld/test/MachO/invalid/stub-link.s @@ -1,3 +1,6 @@ +## FIXME: This test seems to be failing on some Google Mac buildbots for +## unclear reasons, so it's disabled for now. See D85404 for details. 
+# UNSUPPORTED: darwin # REQUIRES: x86 # RUN: mkdir -p %t @@ -5,11 +8,13 @@ # RUN: llvm-mc -filetype obj -triple x86_64-apple-ios %s -o %t/test.o # RUN: not lld -flavor darwinnew -o %t/test -Z -L%S/../Inputs/iPhoneSimulator.sdk/usr/lib -lSystem %t/test.o 2>&1 | FileCheck %s -# CHECK: error: undefined symbol __cache_handle_memory_pressure_event +# CHECK-DAG: error: undefined symbol __cache_handle_memory_pressure_event +# CHECK-DAG: error: undefined symbol _from_non_reexported_tapi_dylib .section __TEXT,__text .global _main _main: movq __cache_handle_memory_pressure_event@GOTPCREL(%rip), %rax + movq _from_non_reexported_tapi_dylib@GOTPCREL(%rip), %rax ret diff --git a/lld/test/MachO/local-got.s b/lld/test/MachO/local-got.s index b95c44f6013de..6acd1ca4a72da 100644 --- a/lld/test/MachO/local-got.s +++ b/lld/test/MachO/local-got.s @@ -37,13 +37,16 @@ _main: movl $0x2000004, %eax # write() syscall mov $1, %rdi # stdout - movq _hello_world@GOTPCREL(%rip), %rsi +## We use pushq/popq here instead of movq in order to avoid relaxation. 
+ pushq _hello_world@GOTPCREL(%rip) + popq %rsi mov $13, %rdx # length of str syscall movl $0x2000004, %eax # write() syscall mov $1, %rdi # stdout - movq _goodbye_world@GOTPCREL(%rip), %rsi + pushq _goodbye_world@GOTPCREL(%rip) + popq %rsi mov $15, %rdx # length of str syscall diff --git a/lld/test/MachO/no-unneeded-dyld-info.s b/lld/test/MachO/no-unneeded-dyld-info.s new file mode 100644 index 0000000000000..11a31594ec0c1 --- /dev/null +++ b/lld/test/MachO/no-unneeded-dyld-info.s @@ -0,0 +1,19 @@ +# REQUIRES: x86 +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t.o +# RUN: lld -flavor darwinnew -o %t %t.o +# RUN: llvm-objdump --macho --all-headers %t | FileCheck %s + +# CHECK: cmd LC_DYLD_INFO_ONLY +# CHECK-NEXT: cmdsize 48 +# CHECK-NEXT: rebase_off 0 +# CHECK-NEXT: rebase_size 0 +# CHECK-NEXT: bind_off 0 +# CHECK-NEXT: bind_size 0 +# CHECK-NEXT: weak_bind_off 0 +# CHECK-NEXT: weak_bind_size 0 +# CHECK-NEXT: lazy_bind_off 0 +# CHECK-NEXT: lazy_bind_size 0 + +.globl _main +_main: + ret diff --git a/lld/test/MachO/nonweak-definition-override.s b/lld/test/MachO/nonweak-definition-override.s new file mode 100644 index 0000000000000..d5c94d4cdca24 --- /dev/null +++ b/lld/test/MachO/nonweak-definition-override.s @@ -0,0 +1,60 @@ +# REQUIRES: x86 +# RUN: split-file %s %t +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/libfoo.s -o %t/libfoo.o +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/nonweakdef.s -o %t/nonweakdef.o +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/weakdef.s -o %t/weakdef.o +# RUN: lld -flavor darwinnew -syslibroot %S/Inputs/MacOSX.sdk -dylib %t/libfoo.o -o %t/libfoo.dylib + +## Check that non-weak defined symbols override weak dylib symbols. +# RUN: lld -flavor darwinnew -syslibroot %S/Inputs/MacOSX.sdk %t/nonweakdef.o -L%t -lfoo -o %t/nonweakdef -lSystem +# RUN: llvm-objdump --macho --weak-bind %t/nonweakdef | FileCheck %s + +## Test loading the dylib before the obj file. 
+# RUN: lld -flavor darwinnew -syslibroot %S/Inputs/MacOSX.sdk -L%t -lfoo %t/nonweakdef.o -o %t/nonweakdef -lSystem +# RUN: llvm-objdump --macho --weak-bind %t/nonweakdef | FileCheck %s + +# CHECK: Weak bind table: +# CHECK-NEXT: segment section address type addend symbol +# CHECK-NEXT: strong _weak_in_dylib +# CHECK-EMPTY: + +## Check that weak defined symbols do not override weak dylib symbols. +# RUN: lld -flavor darwinnew -syslibroot %S/Inputs/MacOSX.sdk %t/weakdef.o -L%t -lfoo -o %t/weakdef -lSystem +# RUN: llvm-objdump --macho --weak-bind %t/weakdef | FileCheck %s --check-prefix=NO-WEAK-OVERRIDE + +## Test loading the dylib before the obj file. +# RUN: lld -flavor darwinnew -syslibroot %S/Inputs/MacOSX.sdk -L%t -lfoo %t/weakdef.o -o %t/weakdef -lSystem +# RUN: llvm-objdump --macho --weak-bind %t/weakdef | FileCheck %s --check-prefix=NO-WEAK-OVERRIDE + +# NO-WEAK-OVERRIDE: Weak bind table: +# NO-WEAK-OVERRIDE-NEXT: segment section address type addend symbol +# NO-WEAK-OVERRIDE-EMPTY: + +#--- libfoo.s + +.globl _weak_in_dylib, _nonweak_in_dylib +.weak_definition _weak_in_dylib + +_weak_in_dylib: +_nonweak_in_dylib: + +#--- nonweakdef.s + +.globl _main, _weak_in_dylib, _nonweak_in_dylib + +_weak_in_dylib: +_nonweak_in_dylib: + +_main: + ret + +#--- weakdef.s + +.globl _main, _weak_in_dylib, _nonweak_in_dylib +.weak_definition _weak_in_dylib, _nonweak_in_dylib + +_weak_in_dylib: +_nonweak_in_dylib: + +_main: + ret diff --git a/lld/test/MachO/objc.s b/lld/test/MachO/objc.s new file mode 100644 index 0000000000000..f08acee694054 --- /dev/null +++ b/lld/test/MachO/objc.s @@ -0,0 +1,63 @@ +# REQUIRES: x86 +# RUN: split-file %s %t + +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/has-objc-symbol.s -o %t/has-objc-symbol.o +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/has-objc-category.s -o %t/has-objc-category.o +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/has-swift.s -o %t/has-swift.o +# RUN: llvm-mc -filetype=obj 
-triple=x86_64-apple-darwin %t/no-objc.s -o %t/no-objc.o + +# RUN: rm -f %t/libHasSomeObjC.a +# RUN: llvm-ar rcs %t/libHasSomeObjC.a %t/has-objc-symbol.o %t/has-objc-category.o %t/has-swift.o %t/no-objc.o + +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/test.s -o %t/test.o +# RUN: lld -flavor darwinnew -syslibroot %S/Inputs/MacOSX.sdk -lSystem %t/test.o -o %t/test \ +# RUN: -L%t -lHasSomeObjC -ObjC +# RUN: llvm-objdump --section-headers --syms %t/test | FileCheck %s --check-prefix=OBJC + +# OBJC: Sections: +# OBJC-NEXT: Idx Name Size VMA Type +# OBJC-NEXT: 0 __text {{.*}} TEXT +# OBJC-NEXT: 1 __swift {{.*}} DATA +# OBJC-NEXT: 2 __objc_catlist {{.*}} DATA +# OBJC-EMPTY: +# OBJC-NEXT: SYMBOL TABLE: +# OBJC-NEXT: g F __TEXT,__text _main +# OBJC-NEXT: g F __TEXT,__text _OBJC_CLASS_$_MyObject + +# RUN: lld -flavor darwinnew -syslibroot %S/Inputs/MacOSX.sdk -lSystem %t/test.o -o %t/test \ +# RUN: -L%t -lHasSomeObjC +# RUN: llvm-objdump --section-headers --syms %t/test | FileCheck %s --check-prefix=NO-OBJC + +# NO-OBJC: Sections: +# NO-OBJC-NEXT: Idx Name Size VMA Type +# NO-OBJC-NEXT: 0 __text {{.*}} TEXT +# NO-OBJC-EMPTY: +# NO-OBJC-NEXT: SYMBOL TABLE: +# NO-OBJC-NEXT: g F __TEXT,__text _main +# NO-OBJC-EMPTY: + +#--- has-objc-symbol.s +.globl _OBJC_CLASS_$_MyObject +_OBJC_CLASS_$_MyObject: + +#--- has-objc-category.s +.section __DATA,__objc_catlist +.quad 0x1234 + +#--- has-swift.s +.section __TEXT,__swift +.quad 0x1234 + +#--- no-objc.s +## This archive member should not be pulled in since it does not contain any +## ObjC-related data. +.globl _foo +.section __DATA,foo + +foo: + .quad 0x1234 + +#--- test.s +.globl _main +_main: + ret diff --git a/lld/test/MachO/reexport-stub.s b/lld/test/MachO/reexport-stub.s new file mode 100644 index 0000000000000..d2139c805d37f --- /dev/null +++ b/lld/test/MachO/reexport-stub.s @@ -0,0 +1,28 @@ +# REQUIRES: x86 +# RUN: mkdir -p %t + +## This test verifies that a non-TBD dylib can re-export a TBD library. 
+ +# RUN: echo "" | llvm-mc -filetype=obj -triple=x86_64-apple-darwin -o %t/reexporter.o +# RUN: lld -flavor darwinnew -dylib -syslibroot %S/Inputs/MacOSX.sdk -lc++ -sub_library libc++ \ +# RUN: %t/reexporter.o -o %t/libreexporter.dylib +# RUN: llvm-objdump --macho --all-headers %t/libreexporter.dylib | FileCheck %s --check-prefix=DYLIB-HEADERS +# DYLIB-HEADERS: cmd LC_REEXPORT_DYLIB +# DYLIB-HEADERS-NOT: Load command +# DYLIB-HEADERS: name /usr/lib/libc++.dylib + +# RUN: llvm-mc -filetype obj -triple x86_64-apple-darwin %s -o %t/test.o +# RUN: lld -flavor darwinnew -o %t/test -syslibroot %S/Inputs/MacOSX.sdk -lSystem -L%t -lreexporter %t/test.o +# RUN: llvm-objdump --bind --no-show-raw-insn -d %t/test | FileCheck %s + +# CHECK: Bind table: +# CHECK-DAG: __DATA __data {{.*}} pointer 0 libreexporter ___gxx_personality_v0 + +.text +.globl _main + +_main: + ret + +.data + .quad ___gxx_personality_v0 diff --git a/lld/test/MachO/stub-link.s b/lld/test/MachO/stub-link.s index 0d6b7fec3f2aa..04d01047b32c6 100644 --- a/lld/test/MachO/stub-link.s +++ b/lld/test/MachO/stub-link.s @@ -3,7 +3,7 @@ # RUN: mkdir -p %t # # RUN: llvm-mc -filetype obj -triple x86_64-apple-darwin %s -o %t/test.o -# RUN: lld -flavor darwinnew -o %t/test -syslibroot %S/Inputs/MacOSX.sdk -lSystem -framework CoreFoundation %t/test.o +# RUN: lld -flavor darwinnew -o %t/test -syslibroot %S/Inputs/MacOSX.sdk -lSystem -lc++ -framework CoreFoundation %t/test.o # # RUN: llvm-objdump --bind --no-show-raw-insn -d -r %t/test | FileCheck %s @@ -16,11 +16,13 @@ # CHECK-DAG: __DATA __data {{.*}} pointer 0 CoreFoundation _OBJC_METACLASS_$_NSObject # CHECK-DAG: __DATA __data {{.*}} pointer 0 CoreFoundation _OBJC_IVAR_$_NSConstantArray._count # CHECK-DAG: __DATA __data {{.*}} pointer 0 CoreFoundation _OBJC_EHTYPE_$_NSException +# CHECK-DAG: __DATA __data {{.*}} pointer 0 libc++ ___gxx_personality_v0 .section __TEXT,__text .global _main _main: +## This symbol is defined in an inner TAPI document within libSystem.tbd. 
movq ___nan@GOTPCREL(%rip), %rax ret @@ -29,3 +31,9 @@ _main: .quad _OBJC_METACLASS_$_NSObject .quad _OBJC_IVAR_$_NSConstantArray._count .quad _OBJC_EHTYPE_$_NSException + +## This symbol is defined in libc++abi.tbd, but we are linking test.o against +## libc++.tbd (which re-exports libc++abi). Linking against this symbol verifies +## that .tbd file re-exports can refer not just to TAPI documents within the +## same .tbd file, but to other on-disk files as well. + .quad ___gxx_personality_v0 diff --git a/lld/test/MachO/sub-library.s b/lld/test/MachO/sub-library.s index e858eaf0bff5a..bbaafd4a5d810 100644 --- a/lld/test/MachO/sub-library.s +++ b/lld/test/MachO/sub-library.s @@ -52,7 +52,7 @@ # RUN: rm -f %t/libgoodbye.dylib # RUN: not lld -flavor darwinnew -o %t/sub-library -Z -L%t -lsuper %t/sub-library.o 2>&1 \ # RUN: | FileCheck %s --check-prefix=MISSING-REEXPORT -DDIR=%t -# MISSING-REEXPORT: error: unable to read re-exported dylib at [[DIR]]/libgoodbye.dylib +# MISSING-REEXPORT: error: unable to locate re-export with install name [[DIR]]/libgoodbye.dylib .text .globl _main diff --git a/lld/test/MachO/syslibroot.test b/lld/test/MachO/syslibroot.test index e9d87abd0cc11..0d2d822273e46 100644 --- a/lld/test/MachO/syslibroot.test +++ b/lld/test/MachO/syslibroot.test @@ -18,6 +18,10 @@ CHECK-ABSOLUTE-PATH-REROOTED: Library search paths: CHECK-ABSOLUTE-PATH-REROOTED: [[ROOT]]/Library/libxml2-development CHECK-ABSOLUTE-PATH-REROOTED: [[ROOT]]/usr/lib +RUN: lld -flavor darwinnew -v -Z -syslibroot %t -L %t/Library/libxml2-development | FileCheck %s -check-prefix CHECK-PATH-WITHOUT-REROOT -DPATH=%t/Library/libxml2-development +CHECK-PATH-WITHOUT-REROOT: Library search paths: +CHECK-PATH-WITHOUT-REROOT-NEXT: [[PATH]] + # NOTE: the match here is fuzzy because the default search paths exist on Linux # and macOS, but not on Windows (that is we ignore `/var/empty`). This allows # us to run the test uniformly on all the platforms. 
diff --git a/lld/test/MachO/weak-binding.s b/lld/test/MachO/weak-binding.s new file mode 100644 index 0000000000000..3474d35ce921b --- /dev/null +++ b/lld/test/MachO/weak-binding.s @@ -0,0 +1,143 @@ +# REQUIRES: x86 +# RUN: split-file %s %t +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/test.s -o %t/test.o +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/libfoo.s -o %t/libfoo.o +# RUN: lld -flavor darwinnew -syslibroot %S/Inputs/MacOSX.sdk -dylib %t/libfoo.o -o %t/libfoo.dylib +# RUN: lld -flavor darwinnew -syslibroot %S/Inputs/MacOSX.sdk %t/test.o -L%t -lfoo -o %t/test -lSystem +# RUN: llvm-objdump -d --no-show-raw-insn --bind --lazy-bind --weak-bind --full-contents %t/test | \ +# RUN: FileCheck %s + +# CHECK: Contents of section __la_symbol_ptr: +## Check that this section contains a nonzero pointer. It should point to +## _weak_external_fn, but we don't have a good way of testing the exact value as +## the bytes here are in little-endian order. +# CHECK-NEXT: {{[0-9a-f]+}} {{[0-9a-f ]*[1-9a-f]+[0-9a-f ]*}} + +# CHECK: Contents of section __got: +## Check that this section contains a nonzero pointer. It should point to +## _weak_external_for_gotpcrel. 
+# CHECK-NEXT: {{[0-9a-f]+}} {{[0-9a-f ]*[1-9a-f]+[0-9a-f ]*}} + +# CHECK: <_main>: +# CHECK-NEXT: movq [[#]](%rip), %rax # [[#%X,WEAK_DY_GOT_ADDR:]] +# CHECK-NEXT: movq [[#]](%rip), %rax # [[#%X,WEAK_EXT_GOT_ADDR:]] +# CHECK-NEXT: leaq [[#]](%rip), %rax # [[#%X,WEAK_INT_GOT_ADDR:]] +# CHECK-NEXT: movq [[#]](%rip), %rax # [[#%X,WEAK_TLV_ADDR:]] +# CHECK-NEXT: movq [[#]](%rip), %rax # [[#%X,WEAK_DY_TLV_ADDR:]] +# CHECK-NEXT: leaq [[#]](%rip), %rax # [[#%X,WEAK_INT_TLV_ADDR:]] +# CHECK-NEXT: callq 0x{{[0-9a-f]*}} +# CHECK-NEXT: callq 0x{{[0-9a-f]*}} +# CHECK-NEXT: callq 0x{{[0-9a-f]*}} + +# CHECK-LABEL: Bind table: +# CHECK-DAG: __DATA __data 0x[[#%x,WEAK_DY:]] pointer 0 libfoo _weak_dysym +# CHECK-DAG: __DATA __thread_vars 0x{{[0-9a-f]*}} pointer 0 libSystem __tlv_bootstrap +# CHECK-DAG: __DATA __thread_ptrs 0x[[#WEAK_DY_TLV_ADDR]] pointer 0 libfoo _weak_dysym_tlv +# CHECK-DAG: __DATA_CONST __got 0x[[#WEAK_DY_GOT_ADDR]] pointer 0 libfoo _weak_dysym_for_gotpcrel +# CHECK-DAG: __DATA __la_symbol_ptr 0x[[#%x,WEAK_DY_FN:]] pointer 0 libfoo _weak_dysym_fn +## Check that we don't have any other bindings +# CHECK-NOT: pointer + +# CHECK-LABEL: Lazy bind table: +## Verify that we have no lazy bindings +# CHECK-NOT: pointer + +# CHECK-LABEL: Weak bind table: +# CHECK-DAG: __DATA_CONST __got 0x[[#WEAK_DY_GOT_ADDR]] pointer 0 _weak_dysym_for_gotpcrel +# CHECK-DAG: __DATA_CONST __got 0x[[#WEAK_EXT_GOT_ADDR]] pointer 0 _weak_external_for_gotpcrel +# CHECK-DAG: __DATA __data 0x[[#WEAK_DY]] pointer 0 _weak_dysym +# CHECK-DAG: __DATA __thread_ptrs 0x[[#WEAK_TLV_ADDR]] pointer 0 _weak_tlv +# CHECK-DAG: __DATA __thread_ptrs 0x[[#WEAK_DY_TLV_ADDR]] pointer 0 _weak_dysym_tlv +# CHECK-DAG: __DATA __data 0x{{[0-9a-f]*}} pointer 2 _weak_external +# CHECK-DAG: __DATA __la_symbol_ptr 0x[[#WEAK_DY_FN]] pointer 0 _weak_dysym_fn +# CHECK-DAG: __DATA __la_symbol_ptr 0x{{[0-9a-f]*}} pointer 0 _weak_external_fn +## Check that we don't have any other bindings +# CHECK-NOT: pointer + +## Weak 
internal symbols don't get bindings +# RUN: llvm-objdump --macho --bind --lazy-bind --weak-bind %t/test | FileCheck %s --check-prefix=WEAK-INTERNAL +# WEAK-INTERNAL-NOT: _weak_internal +# WEAK-INTERNAL-NOT: _weak_internal_fn +# WEAK-INTERNAL-NOT: _weak_internal_tlv + +#--- libfoo.s + +.globl _weak_dysym +.weak_definition _weak_dysym +_weak_dysym: + .quad 0x1234 + +.globl _weak_dysym_for_gotpcrel +.weak_definition _weak_dysym_for_gotpcrel +_weak_dysym_for_gotpcrel: + .quad 0x1234 + +.globl _weak_dysym_fn +.weak_definition _weak_dysym_fn +_weak_dysym_fn: + ret + +.section __DATA,__thread_vars,thread_local_variables + +.globl _weak_dysym_tlv +.weak_definition _weak_dysym_tlv +_weak_dysym_tlv: + .quad 0x1234 + +#--- test.s + +.globl _main, _weak_external, _weak_external_for_gotpcrel, _weak_external_fn +.weak_definition _weak_external, _weak_external_for_gotpcrel, _weak_external_fn, _weak_internal, _weak_internal_for_gotpcrel, _weak_internal_fn + +_main: + mov _weak_dysym_for_gotpcrel@GOTPCREL(%rip), %rax + mov _weak_external_for_gotpcrel@GOTPCREL(%rip), %rax + mov _weak_internal_for_gotpcrel@GOTPCREL(%rip), %rax + mov _weak_tlv@TLVP(%rip), %rax + mov _weak_dysym_tlv@TLVP(%rip), %rax + mov _weak_internal_tlv@TLVP(%rip), %rax + callq _weak_dysym_fn + callq _weak_external_fn + callq _weak_internal_fn + mov $0, %rax + ret + +_weak_external: + .quad 0x1234 + +_weak_external_for_gotpcrel: + .quad 0x1234 + +_weak_external_fn: + ret + +_weak_internal: + .quad 0x1234 + +_weak_internal_for_gotpcrel: + .quad 0x1234 + +_weak_internal_fn: + ret + +.data + .quad _weak_dysym + .quad _weak_external + 2 + .quad _weak_internal + +.tbss _weak_tlv$tlv$init, 4, 2 +.tbss _weak_internal_tlv$tlv$init, 4, 2 + +.section __DATA,__thread_vars,thread_local_variables +.globl _weak_tlv +.weak_definition _weak_tlv, _weak_internal_tlv + +_weak_tlv: + .quad __tlv_bootstrap + .quad 0 + .quad _weak_tlv$tlv$init + +_weak_internal_tlv: + .quad __tlv_bootstrap + .quad 0 + .quad _weak_internal_tlv$tlv$init 
diff --git a/lld/test/MachO/weak-definition-order.s b/lld/test/MachO/weak-definition-order.s index 6770a5f76b391..b3b23c816bbaa 100644 --- a/lld/test/MachO/weak-definition-order.s +++ b/lld/test/MachO/weak-definition-order.s @@ -23,12 +23,11 @@ # RUN: @executable_path/libweak2.dylib %t/weak2.o -o %t/libweak2.dylib # RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/dylib12 -Z -L%t -lweak1 -lweak2 %t/test.o -# RUN: llvm-objdump --macho --lazy-bind %t/dylib12 | FileCheck %s --check-prefix=DYLIB1 +# RUN: llvm-objdump --macho --bind %t/dylib12 | FileCheck %s --check-prefix=DYLIB1 # RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/dylib21 -Z -L%t -lweak2 -lweak1 %t/test.o -# RUN: llvm-objdump --macho --lazy-bind %t/dylib21 | FileCheck %s --check-prefix=DYLIB2 -## TODO: these should really be in the weak binding section, not the lazy binding section -# DYLIB1: __DATA __la_symbol_ptr 0x{{[0-9a-f]*}} libweak1 _foo -# DYLIB2: __DATA __la_symbol_ptr 0x{{[0-9a-f]*}} libweak2 _foo +# RUN: llvm-objdump --macho --bind %t/dylib21 | FileCheck %s --check-prefix=DYLIB2 +# DYLIB1: __DATA __la_symbol_ptr 0x{{[0-9a-f]*}} pointer 0 libweak1 _foo +# DYLIB2: __DATA __la_symbol_ptr 0x{{[0-9a-f]*}} pointer 0 libweak2 _foo .globl _main _main: diff --git a/lld/test/MachO/weak-header-flags.s b/lld/test/MachO/weak-header-flags.s new file mode 100644 index 0000000000000..eb799cd2be0ca --- /dev/null +++ b/lld/test/MachO/weak-header-flags.s @@ -0,0 +1,51 @@ +# REQUIRES: x86 +# RUN: split-file %s %t + +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/libweak-defines.s -o %t/libweak-defines.o +# RUN: lld -flavor darwinnew -syslibroot %S/Inputs/MacOSX.sdk -dylib %t/libweak-defines.o -o %t/libweak-defines.dylib +# RUN: llvm-readobj --file-headers %t/libweak-defines.dylib | FileCheck %s --check-prefix=WEAK-DEFINES-AND-BINDS + +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/binds-to-weak.s -o %t/binds-to-weak.o +# RUN: lld -flavor 
darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -L%t -lweak-defines -o %t/binds-to-weak %t/binds-to-weak.o +# RUN: llvm-readobj --file-headers %t/binds-to-weak | FileCheck %s --check-prefix=WEAK-BINDS-ONLY + +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/overrides-weak.s -o %t/overrides-weak.o +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -L%t -lweak-defines -o %t/overrides-weak %t/overrides-weak.o +# RUN: llvm-readobj --file-headers %t/overrides-weak | FileCheck %s --check-prefix=WEAK-DEFINES-ONLY + +# WEAK-DEFINES-AND-BINDS: MH_BINDS_TO_WEAK +# WEAK-DEFINES-AND-BINDS: MH_WEAK_DEFINES + +# WEAK-BINDS-ONLY-NOT: MH_WEAK_DEFINES +# WEAK-BINDS-ONLY: MH_BINDS_TO_WEAK +# WEAK-BINDS-ONLY-NOT: MH_WEAK_DEFINES + +# WEAK-DEFINES-ONLY-NOT: MH_BINDS_TO_WEAK +# WEAK-DEFINES-ONLY: MH_WEAK_DEFINES +# WEAK-DEFINES-ONLY-NOT: MH_BINDS_TO_WEAK + +#--- libweak-defines.s + +.globl _foo +.weak_definition _foo +_foo: + ret + +#--- binds-to-weak.s + +.globl _main +_main: + callq _foo + ret + +## Don't generate MH_WEAK_DEFINES for weak locals +.weak_definition _weak_local +_weak_local: + +#--- overrides-weak.s + +.globl _main, _foo +_foo: + +_main: + ret diff --git a/lld/test/MachO/x86-64-reloc-got-load.s b/lld/test/MachO/x86-64-reloc-got-load.s new file mode 100644 index 0000000000000..158936d0b3608 --- /dev/null +++ b/lld/test/MachO/x86-64-reloc-got-load.s @@ -0,0 +1,18 @@ +# REQUIRES: x86 + +## Check that we perform relaxation for GOT_LOAD relocations to defined symbols. +## Note: GOT_LOAD relocations to dylib symbols are already tested in dylink.s. 
+ +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t.o +# RUN: lld -flavor darwinnew -o %t %t.o +# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck %s +# CHECK: leaq [[#]](%rip), %rax # {{.*}} <_foo> + +.globl _main, _foo + +_main: + movq _foo@GOTPCREL(%rip), %rax + ret + +_foo: + .space 0 diff --git a/lld/test/MinGW/driver.test b/lld/test/MinGW/driver.test index faac3a0be57d0..c9fa6ea9f97d8 100644 --- a/lld/test/MinGW/driver.test +++ b/lld/test/MinGW/driver.test @@ -136,14 +136,18 @@ PDB-DEFAULT-NOT: -pdb:{{.*}} RUN: ld.lld -### -m i386pep foo.o --large-address-aware | FileCheck -check-prefix LARGE-ADDRESS-AWARE %s LARGE-ADDRESS-AWARE: -largeaddressaware -RUN: ld.lld -### -m i386pe foo.o | FileCheck -check-prefix DEFAULT-DISABLE-FLAGS %s -RUN: ld.lld -### -m i386pep foo.o | FileCheck -check-prefix DEFAULT-DISABLE-FLAGS %s -DEFAULT-DISABLE-FLAGS: -dynamicbase:no -RUN: ld.lld -### -m i386pe --dynamicbase foo.o | FileCheck -check-prefix NO-DEFAULT-DISABLE-FLAGS %s -RUN: ld.lld -### -m i386pep -dynamicbase foo.o | FileCheck -check-prefix NO-DEFAULT-DISABLE-FLAGS %s -RUN: ld.lld -### -m thumb2pe foo.o | FileCheck -check-prefix NO-DEFAULT-DISABLE-FLAGS %s -RUN: ld.lld -### -m arm64pe foo.o | FileCheck -check-prefix NO-DEFAULT-DISABLE-FLAGS %s -NO-DEFAULT-DISABLE-FLAGS-NOT: -dynamicbase:no +RUN: ld.lld -### -m i386pe foo.o --no-dynamicbase | FileCheck -check-prefix DISABLE-DYNAMICBASE %s +DISABLE-DYNAMICBASE: -dynamicbase:no +RUN: ld.lld -### -m i386pe --dynamicbase foo.o | FileCheck -check-prefix NO-DISABLE-DYNAMICBASE %s +RUN: ld.lld -### -m i386pep -dynamicbase foo.o | FileCheck -check-prefix NO-DISABLE-DYNAMICBASE %s +RUN: ld.lld -### -m i386pe foo.o | FileCheck -check-prefix NO-DISABLE-DYNAMICBASE %s +RUN: ld.lld -### -m i386pep foo.o | FileCheck -check-prefix NO-DISABLE-DYNAMICBASE %s +RUN: ld.lld -### -m thumb2pe foo.o | FileCheck -check-prefix NO-DISABLE-DYNAMICBASE %s +RUN: ld.lld -### -m arm64pe foo.o | FileCheck -check-prefix 
NO-DISABLE-DYNAMICBASE %s +# On arm, dynamicbase can't be disabled, so --no-dynamicbase is ignored +RUN: ld.lld -### -m thumb2pe foo.o --no-dynamicbase | FileCheck -check-prefix NO-DISABLE-DYNAMICBASE %s +RUN: ld.lld -### -m arm64pe foo.o --no-dynamicbase | FileCheck -check-prefix NO-DISABLE-DYNAMICBASE %s +NO-DISABLE-DYNAMICBASE-NOT: -dynamicbase:no RUN: ld.lld -### -m i386pep foo.o --image-base 0x1230000 | FileCheck -check-prefix IMAGE-BASE %s RUN: ld.lld -### -m i386pep foo.o -image-base 0x1230000 | FileCheck -check-prefix IMAGE-BASE %s @@ -260,3 +264,11 @@ ALIGN: -align:0x2000 RUN: ld.lld -### -m i386pe foo.o -no-seh | FileCheck -check-prefix NOSEH %s RUN: ld.lld -### -m i386pe foo.o --no-seh | FileCheck -check-prefix NOSEH %s NOSEH: -noseh + +RUN: ld.lld -### -m i386pep foo.o --no-allow-multiple-definition --allow-multiple-definition | FileCheck -check-prefix ALLOW_MULTIPLE_DEFINITION %s +RUN: ld.lld -### -m i386pep foo.o -no-allow-multiple-definition -allow-multiple-definition | FileCheck -check-prefix ALLOW_MULTIPLE_DEFINITION %s +ALLOW_MULTIPLE_DEFINITION: -force:multiple + +RUN: ld.lld -### -m i386pep foo.o --allow-multiple-definition --no-allow-multiple-definition | FileCheck -check-prefix NO_ALLOW_MULTIPLE_DEFINITION %s +RUN: ld.lld -### -m i386pep foo.o -allow-multiple-definition -no-allow-multiple-definition | FileCheck -check-prefix NO_ALLOW_MULTIPLE_DEFINITION %s +NO_ALLOW_MULTIPLE_DEFINITION-NOT: -force:multiple diff --git a/lldb/docs/lldb-platform-packets.txt b/lldb/docs/lldb-platform-packets.txt index 23d1cacc5f7ee..8d3fed7ab3410 100644 --- a/lldb/docs/lldb-platform-packets.txt +++ b/lldb/docs/lldb-platform-packets.txt @@ -237,6 +237,27 @@ incompatible with the flags that gdb specifies. // Continues to return the results of the qfProcessInfo. Once all matches // have been sent, Exx is returned to indicate end of matches. 
+//---------------------------------------------------------------------- +// qPathComplete +// +// BRIEF +// Get a list of matched disk files/directories by passing a boolean flag +// and a partial path. +// +// EXAMPLE +// +// receive: qPathComplete:0,6d61696e +// send: M6d61696e2e637070 +// receive: qPathComplete:1,746573 +// send: M746573742f,74657374732f +// +// If the first argument is zero, the result should contain all +// files (including directories) starting with the given path. If the +// argument is one, the result should contain only directories. +// +// The result should be a comma-separated list of hex-encoded paths. +// Paths denoting a directory should end with a directory separator ('/' or '\'). + //---------------------------------------------------------------------- // vFile:size: // diff --git a/lldb/include/lldb/Core/Module.h b/lldb/include/lldb/Core/Module.h index 8bd70ab16b5ab..9eb7477730c17 100644 --- a/lldb/include/lldb/Core/Module.h +++ b/lldb/include/lldb/Core/Module.h @@ -506,10 +506,6 @@ class Module : public std::enable_shared_from_this, return m_object_mod_time; } - void SetObjectModificationTime(const llvm::sys::TimePoint<> &mod_time) { - m_mod_time = mod_time; - } - /// This callback will be called by SymbolFile implementations when /// parsing a compile unit that contains SDK information. /// \param sysroot will be added to the path remapping dictionary. 
diff --git a/lldb/include/lldb/Interpreter/CommandCompletions.h b/lldb/include/lldb/Interpreter/CommandCompletions.h index 1d8972e0ca036..c80bde0e719bf 100644 --- a/lldb/include/lldb/Interpreter/CommandCompletions.h +++ b/lldb/include/lldb/Interpreter/CommandCompletions.h @@ -45,10 +45,15 @@ class CommandCompletions { eThreadIndexCompletion = (1u << 17), eWatchPointIDCompletion = (1u << 18), eBreakpointNameCompletion = (1u << 19), + eProcessIDCompletion = (1u << 20), + eProcessNameCompletion = (1u << 21), + eRemoteDiskFileCompletion = (1u << 22), + eRemoteDiskDirectoryCompletion = (1u << 23), + eTypeCategoryNameCompletion = (1u << 24), // This item serves two purposes. It is the last element in the enum, so // you can add custom enums starting from here in your Option class. Also // if you & in this bit the base code will not process the option. - eCustomCompletion = (1u << 20) + eCustomCompletion = (1u << 24) }; static bool InvokeCommonCompletionCallbacks( @@ -70,6 +75,14 @@ class CommandCompletions { StringList &matches, TildeExpressionResolver &Resolver); + static void RemoteDiskFiles(CommandInterpreter &interpreter, + CompletionRequest &request, + SearchFilter *searcher); + + static void RemoteDiskDirectories(CommandInterpreter &interpreter, + CompletionRequest &request, + SearchFilter *searcher); + static void SourceFiles(CommandInterpreter &interpreter, CompletionRequest &request, SearchFilter *searcher); @@ -110,6 +123,12 @@ class CommandCompletions { CompletionRequest &request, SearchFilter *searcher); + static void ProcessIDs(CommandInterpreter &interpreter, + CompletionRequest &request, SearchFilter *searcher); + + static void ProcessNames(CommandInterpreter &interpreter, + CompletionRequest &request, SearchFilter *searcher); + static void DisassemblyFlavors(CommandInterpreter &interpreter, CompletionRequest &request, SearchFilter *searcher); @@ -128,6 +147,10 @@ class CommandCompletions { static void WatchPointIDs(CommandInterpreter &interpreter, 
CompletionRequest &request, SearchFilter *searcher); + + static void TypeCategoryNames(CommandInterpreter &interpreter, + CompletionRequest &request, + SearchFilter *searcher); }; } // namespace lldb_private diff --git a/lldb/include/lldb/Symbol/UnwindPlan.h b/lldb/include/lldb/Symbol/UnwindPlan.h index 8902b5f4eaa77..40814da3de4ae 100644 --- a/lldb/include/lldb/Symbol/UnwindPlan.h +++ b/lldb/include/lldb/Symbol/UnwindPlan.h @@ -393,6 +393,7 @@ class UnwindPlan { m_plan_is_sourced_from_compiler(rhs.m_plan_is_sourced_from_compiler), m_plan_is_valid_at_all_instruction_locations( rhs.m_plan_is_valid_at_all_instruction_locations), + m_plan_is_for_signal_trap(rhs.m_plan_is_for_signal_trap), m_lsda_address(rhs.m_lsda_address), m_personality_func_addr(rhs.m_personality_func_addr) { m_row_list.reserve(rhs.m_row_list.size()); diff --git a/lldb/include/lldb/Target/Platform.h b/lldb/include/lldb/Target/Platform.h index 6234b8244b3f3..9335f73b37df1 100644 --- a/lldb/include/lldb/Target/Platform.h +++ b/lldb/include/lldb/Target/Platform.h @@ -523,6 +523,9 @@ class Platform : public PluginInterface { return UINT64_MAX; } + virtual void AutoCompleteDiskFileOrDirectory(CompletionRequest &request, + bool only_dir) {} + virtual uint64_t ReadFile(lldb::user_id_t fd, uint64_t offset, void *dst, uint64_t dst_len, Status &error) { error.SetErrorStringWithFormat( diff --git a/lldb/include/lldb/Utility/Reproducer.h b/lldb/include/lldb/Utility/Reproducer.h index 4dc6ddd51394f..d6cde44850901 100644 --- a/lldb/include/lldb/Utility/Reproducer.h +++ b/lldb/include/lldb/Utility/Reproducer.h @@ -22,6 +22,7 @@ #include namespace lldb_private { +class UUID; namespace repro { class Reproducer; @@ -196,6 +197,7 @@ class Reproducer { static Reproducer &Instance(); static llvm::Error Initialize(ReproducerMode mode, llvm::Optional root); + static void Initialize(); static bool Initialized(); static void Terminate(); diff --git a/lldb/include/lldb/Utility/ReproducerProvider.h 
b/lldb/include/lldb/Utility/ReproducerProvider.h index b84b8a67c4ca4..abb13f0edd43c 100644 --- a/lldb/include/lldb/Utility/ReproducerProvider.h +++ b/lldb/include/lldb/Utility/ReproducerProvider.h @@ -12,6 +12,7 @@ #include "lldb/Utility/FileSpec.h" #include "lldb/Utility/ProcessInfo.h" #include "lldb/Utility/Reproducer.h" +#include "lldb/Utility/UUID.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Error.h" #include "llvm/Support/FileCollector.h" @@ -205,6 +206,41 @@ class HomeDirectoryProvider : public DirectoryProvider { static char ID; }; +/// Provider for mapping UUIDs to symbol and executable files. +class SymbolFileProvider : public Provider { +public: + SymbolFileProvider(const FileSpec &directory) + : Provider(directory), m_symbol_files() {} + + void AddSymbolFile(const UUID *uuid, const FileSpec &module_path, + const FileSpec &symbol_path); + void Keep() override; + + struct Entry { + Entry() = default; + Entry(std::string uuid) : uuid(std::move(uuid)) {} + Entry(std::string uuid, std::string module_path, std::string symbol_path) + : uuid(std::move(uuid)), module_path(std::move(module_path)), + symbol_path(std::move(symbol_path)) {} + + bool operator==(const Entry &rhs) const { return uuid == rhs.uuid; } + bool operator<(const Entry &rhs) const { return uuid < rhs.uuid; } + + std::string uuid; + std::string module_path; + std::string symbol_path; + }; + + struct Info { + static const char *name; + static const char *file; + }; + static char ID; + +private: + std::vector m_symbol_files; +}; + /// The MultiProvider is a provider that hands out recorder which can be used /// to capture data for different instances of the same object. The recorders /// can be passed around or stored as an instance member. @@ -345,6 +381,16 @@ template class MultiLoader { unsigned m_index = 0; }; +class SymbolFileLoader { +public: + SymbolFileLoader(Loader *loader); + std::pair GetPaths(const UUID *uuid) const; + +private: + // Sorted list of UUID to path mappings. 
+ std::vector m_symbol_files; +}; + /// Helper to read directories written by the DirectoryProvider. template llvm::Expected GetDirectoryFrom(repro::Loader *loader) { @@ -357,4 +403,20 @@ llvm::Expected GetDirectoryFrom(repro::Loader *loader) { } // namespace repro } // namespace lldb_private +LLVM_YAML_IS_SEQUENCE_VECTOR(lldb_private::repro::SymbolFileProvider::Entry) + +namespace llvm { +namespace yaml { +template <> +struct MappingTraits { + static void mapping(IO &io, + lldb_private::repro::SymbolFileProvider::Entry &entry) { + io.mapRequired("uuid", entry.uuid); + io.mapRequired("module-path", entry.module_path); + io.mapRequired("symbol-path", entry.symbol_path); + } +}; +} // namespace yaml +} // namespace llvm + #endif // LLDB_UTILITY_REPRODUCER_PROVIDER_H diff --git a/lldb/include/lldb/Utility/Scalar.h b/lldb/include/lldb/Utility/Scalar.h index 9d3a20c93898c..332bb00489d01 100644 --- a/lldb/include/lldb/Utility/Scalar.h +++ b/lldb/include/lldb/Utility/Scalar.h @@ -14,7 +14,7 @@ #include "lldb/lldb-enumerations.h" #include "lldb/lldb-private-types.h" #include "llvm/ADT/APFloat.h" -#include "llvm/ADT/APInt.h" +#include "llvm/ADT/APSInt.h" #include #include #include @@ -38,35 +38,35 @@ namespace lldb_private { // and values before performing these operations. Type promotion currently // follows the ANSI C type promotion rules. class Scalar { + template + static llvm::APSInt MakeAPSInt(T v) { + static_assert(std::is_integral::value, ""); + static_assert(sizeof(T) <= sizeof(uint64_t), "Conversion loses precision!"); + return llvm::APSInt( + llvm::APInt(sizeof(T) * 8, uint64_t(v), std::is_signed::value), + std::is_unsigned::value); + } + public: // FIXME: These are host types which seems to be an odd choice. 
enum Type { e_void = 0, - e_sint, - e_uint, + e_int, e_float, }; // Constructors and Destructors Scalar() : m_type(e_void), m_float(0.0f) {} - Scalar(int v) - : m_type(e_sint), m_integer(sizeof(v) * 8, uint64_t(v), true), - m_float(0.0f) {} + Scalar(int v) : m_type(e_int), m_integer(MakeAPSInt(v)), m_float(0.0f) {} Scalar(unsigned int v) - : m_type(e_uint), m_integer(sizeof(v) * 8, uint64_t(v), false), - m_float(0.0f) {} - Scalar(long v) - : m_type(e_sint), m_integer(sizeof(v) * 8, uint64_t(v), true), - m_float(0.0f) {} + : m_type(e_int), m_integer(MakeAPSInt(v)), m_float(0.0f) {} + Scalar(long v) : m_type(e_int), m_integer(MakeAPSInt(v)), m_float(0.0f) {} Scalar(unsigned long v) - : m_type(e_uint), m_integer(sizeof(v) * 8, uint64_t(v), false), - m_float(0.0f) {} + : m_type(e_int), m_integer(MakeAPSInt(v)), m_float(0.0f) {} Scalar(long long v) - : m_type(e_sint), m_integer(sizeof(v) * 8, uint64_t(v), true), - m_float(0.0f) {} + : m_type(e_int), m_integer(MakeAPSInt(v)), m_float(0.0f) {} Scalar(unsigned long long v) - : m_type(e_uint), m_integer(sizeof(v) * 8, uint64_t(v), false), - m_float(0.0f) {} + : m_type(e_int), m_integer(MakeAPSInt(v)), m_float(0.0f) {} Scalar(float v) : m_type(e_float), m_float(v) {} Scalar(double v) : m_type(e_float), m_float(v) {} Scalar(long double v) : m_type(e_float), m_float(double(v)) { @@ -75,7 +75,7 @@ class Scalar { llvm::APFloat::rmNearestTiesToEven, &ignore); } Scalar(llvm::APInt v) - : m_type(e_sint), m_integer(std::move(v)), m_float(0.0f) {} + : m_type(e_int), m_integer(std::move(v), false), m_float(0.0f) {} bool SignExtend(uint32_t bit_pos); @@ -108,7 +108,7 @@ class Scalar { void GetValue(Stream *s, bool show_type) const; - bool IsValid() const { return (m_type >= e_sint) && (m_type <= e_float); } + bool IsValid() const { return (m_type >= e_int) && (m_type <= e_float); } /// Convert to an integer with \p bits and the given signedness. 
void TruncOrExtendTo(uint16_t bits, bool sign); @@ -116,6 +116,7 @@ class Scalar { bool IntegralPromote(uint16_t bits, bool sign); bool FloatPromote(const llvm::fltSemantics &semantics); + bool IsSigned() const; bool MakeSigned(); bool MakeUnsigned(); @@ -234,7 +235,7 @@ class Scalar { // Classes that inherit from Scalar can see and modify these Scalar::Type m_type; - llvm::APInt m_integer; + llvm::APSInt m_integer; llvm::APFloat m_float; template T GetAs(T fail_value) const; diff --git a/lldb/include/lldb/Utility/StringExtractorGDBRemote.h b/lldb/include/lldb/Utility/StringExtractorGDBRemote.h index 715f3cb2541d2..efb43767e7394 100644 --- a/lldb/include/lldb/Utility/StringExtractorGDBRemote.h +++ b/lldb/include/lldb/Utility/StringExtractorGDBRemote.h @@ -76,6 +76,7 @@ class StringExtractorGDBRemote : public StringExtractor { eServerPacketType_QSetSTDERR, eServerPacketType_QSetWorkingDir, eServerPacketType_QStartNoAckMode, + eServerPacketType_qPathComplete, eServerPacketType_qPlatform_shell, eServerPacketType_qPlatform_mkdir, eServerPacketType_qPlatform_chmod, diff --git a/lldb/packages/Python/lldbsuite/test/builders/darwin.py b/lldb/packages/Python/lldbsuite/test/builders/darwin.py index f9005397d50e9..4548217c3fab8 100644 --- a/lldb/packages/Python/lldbsuite/test/builders/darwin.py +++ b/lldb/packages/Python/lldbsuite/test/builders/darwin.py @@ -23,7 +23,35 @@ def get_os_env_from_platform(platform): def get_os_from_sdk(sdk): return sdk[:sdk.find('.')], "" -from lldbsuite.test import configuration + +def get_os_and_env(): + if configuration.lldb_platform_name: + return get_os_env_from_platform(configuration.lldb_platform_name) + if configuration.apple_sdk: + return get_os_from_sdk(configuration.apple_sdk) + return None, None + + +def get_triple(): + # Construct the vendor component. + vendor = "apple" + + # Construct the os component. + os, env = get_os_and_env() + if os is None or env is None: + return None, None, None, None + + # Get the SDK from the os and env. 
+ sdk = lldbutil.get_xcode_sdk(os, env) + if not sdk: + return None, None, None, None + + # Get the version from the SDK. + version = lldbutil.get_xcode_sdk_version(sdk) + if not version: + return None, None, None, None + + return vendor, os, version, env class BuilderDarwin(Builder): @@ -37,50 +65,24 @@ def getExtraMakeArgs(self): if configuration.dsymutil: args['DSYMUTIL'] = configuration.dsymutil - operating_system, _ = self.getOsAndEnv() + operating_system, _ = get_os_and_env() if operating_system and operating_system != "macosx": builder_dir = os.path.dirname(os.path.abspath(__file__)) test_dir = os.path.dirname(builder_dir) entitlements = os.path.join(test_dir, 'make', 'entitlements.plist') - args['CODESIGN'] = 'codesign --entitlements {}'.format(entitlements) + args['CODESIGN'] = 'codesign --entitlements {}'.format( + entitlements) # Return extra args as a formatted string. return ' '.join( {'{}="{}"'.format(key, value) for key, value in args.items()}) - def getOsAndEnv(self): - if configuration.lldb_platform_name: - return get_os_env_from_platform(configuration.lldb_platform_name) - elif configuration.apple_sdk: - return get_os_from_sdk(configuration.apple_sdk) - return None, None def getArchCFlags(self, architecture): """Returns the ARCH_CFLAGS for the make system.""" - - # Construct the arch component. - arch = architecture if architecture else configuration.arch - if not arch: - arch = subprocess.check_output(['machine' - ]).rstrip().decode('utf-8') - if not arch: - return "" - - # Construct the vendor component. - vendor = "apple" - - # Construct the os component. - os, env = self.getOsAndEnv() - if os is None or env is None: - return "" - - # Get the SDK from the os and env. - sdk = lldbutil.get_xcode_sdk(os, env) - if not sdk: - return "" - - version = lldbutil.get_xcode_sdk_version(sdk) - if not version: + # Get the triple components. 
+ vendor, os, version, env = get_triple() + if not vendor or not os or not version or not env: return "" # Construct the triple from its components. diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py index de9a9a2c70023..4180ba2716136 100644 --- a/lldb/packages/Python/lldbsuite/test/lldbtest.py +++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py @@ -190,11 +190,11 @@ def COMPLETION_MSG(str_before, str_after, completions): def EXP_MSG(str, actual, exe): - '''A generic "'%s' returns expected result" message generator if exe. - Otherwise, it generates "'%s' matches expected result" message.''' + '''A generic "'%s' returned unexpected result" message generator if exe. + Otherwise, it generates "'%s' does not match expected result" message.''' - return "'%s' %s expected result, got '%s'" % ( - str, 'returns' if exe else 'matches', actual.strip()) + return "'%s' %s result, got '%s'" % ( + str, 'returned unexpected' if exe else 'does not match expected', actual.strip()) def SETTING_MSG(setting): diff --git a/lldb/source/Commands/CommandCompletions.cpp b/lldb/source/Commands/CommandCompletions.cpp index 109613e223c7c..0ea6d42881691 100644 --- a/lldb/source/Commands/CommandCompletions.cpp +++ b/lldb/source/Commands/CommandCompletions.cpp @@ -13,6 +13,7 @@ #include "lldb/Core/FileSpecList.h" #include "lldb/Core/Module.h" #include "lldb/Core/PluginManager.h" +#include "lldb/DataFormatters/DataVisualization.h" #include "lldb/Host/FileSystem.h" #include "lldb/Interpreter/CommandCompletions.h" #include "lldb/Interpreter/CommandInterpreter.h" @@ -71,6 +72,12 @@ bool CommandCompletions::InvokeCommonCompletionCallbacks( {eThreadIndexCompletion, CommandCompletions::ThreadIndexes}, {eWatchPointIDCompletion, CommandCompletions::WatchPointIDs}, {eBreakpointNameCompletion, CommandCompletions::BreakpointNames}, + {eProcessIDCompletion, CommandCompletions::ProcessIDs}, + {eProcessNameCompletion, 
CommandCompletions::ProcessNames}, + {eRemoteDiskFileCompletion, CommandCompletions::RemoteDiskFiles}, + {eRemoteDiskDirectoryCompletion, + CommandCompletions::RemoteDiskDirectories}, + {eTypeCategoryNameCompletion, CommandCompletions::TypeCategoryNames}, {eNoCompletion, nullptr} // This one has to be last in the list. }; @@ -484,6 +491,24 @@ void CommandCompletions::DiskDirectories(const llvm::Twine &partial_file_name, DiskFilesOrDirectories(partial_file_name, true, matches, Resolver); } +void CommandCompletions::RemoteDiskFiles(CommandInterpreter &interpreter, + CompletionRequest &request, + SearchFilter *searcher) { + lldb::PlatformSP platform_sp = + interpreter.GetDebugger().GetPlatformList().GetSelectedPlatform(); + if (platform_sp) + platform_sp->AutoCompleteDiskFileOrDirectory(request, false); +} + +void CommandCompletions::RemoteDiskDirectories(CommandInterpreter &interpreter, + CompletionRequest &request, + SearchFilter *searcher) { + lldb::PlatformSP platform_sp = + interpreter.GetDebugger().GetPlatformList().GetSelectedPlatform(); + if (platform_sp) + platform_sp->AutoCompleteDiskFileOrDirectory(request, true); +} + void CommandCompletions::Modules(CommandInterpreter &interpreter, CompletionRequest &request, SearchFilter *searcher) { @@ -649,6 +674,33 @@ void CommandCompletions::DisassemblyFlavors(CommandInterpreter &interpreter, } } +void CommandCompletions::ProcessIDs(CommandInterpreter &interpreter, + CompletionRequest &request, + SearchFilter *searcher) { + lldb::PlatformSP platform_sp(interpreter.GetPlatform(true)); + if (!platform_sp) + return; + ProcessInstanceInfoList process_infos; + ProcessInstanceInfoMatch match_info; + platform_sp->FindProcesses(match_info, process_infos); + for (const ProcessInstanceInfo &info : process_infos) + request.TryCompleteCurrentArg(std::to_string(info.GetProcessID()), + info.GetNameAsStringRef()); +} + +void CommandCompletions::ProcessNames(CommandInterpreter &interpreter, + CompletionRequest &request, + 
SearchFilter *searcher) { + lldb::PlatformSP platform_sp(interpreter.GetPlatform(true)); + if (!platform_sp) + return; + ProcessInstanceInfoList process_infos; + ProcessInstanceInfoMatch match_info; + platform_sp->FindProcesses(match_info, process_infos); + for (const ProcessInstanceInfo &info : process_infos) + request.TryCompleteCurrentArg(info.GetNameAsStringRef()); +} + void CommandCompletions::TypeLanguages(CommandInterpreter &interpreter, CompletionRequest &request, SearchFilter *searcher) { @@ -731,3 +783,14 @@ void CommandCompletions::WatchPointIDs(CommandInterpreter &interpreter, strm.GetString()); } } + +void CommandCompletions::TypeCategoryNames(CommandInterpreter &interpreter, + CompletionRequest &request, + SearchFilter *searcher) { + DataVisualization::Categories::ForEach( + [&request](const lldb::TypeCategoryImplSP &category_sp) { + request.TryCompleteCurrentArg(category_sp->GetName(), + category_sp->GetDescription()); + return true; + }); +} diff --git a/lldb/source/Commands/CommandObjectPlatform.cpp b/lldb/source/Commands/CommandObjectPlatform.cpp index 4c235cb866a63..b5409e611f058 100644 --- a/lldb/source/Commands/CommandObjectPlatform.cpp +++ b/lldb/source/Commands/CommandObjectPlatform.cpp @@ -392,7 +392,8 @@ class CommandObjectPlatformSettings : public CommandObjectParsed { "or for a platform by name.", "platform settings", 0), m_options(), - m_option_working_dir(LLDB_OPT_SET_1, false, "working-dir", 'w', 0, + m_option_working_dir(LLDB_OPT_SET_1, false, "working-dir", 'w', + CommandCompletions::eRemoteDiskDirectoryCompletion, eArgTypePath, "The working directory for the platform.") { m_options.Append(&m_option_working_dir, LLDB_OPT_SET_ALL, LLDB_OPT_SET_1); @@ -485,6 +486,15 @@ class CommandObjectPlatformFOpen : public CommandObjectParsed { ~CommandObjectPlatformFOpen() override = default; + void + HandleArgumentCompletion(CompletionRequest &request, + OptionElementVector &opt_element_vector) override { + if (request.GetCursorIndex() == 0) + 
CommandCompletions::InvokeCommonCompletionCallbacks( + GetCommandInterpreter(), + CommandCompletions::eRemoteDiskFileCompletion, request, nullptr); + } + bool DoExecute(Args &args, CommandReturnObject &result) override { PlatformSP platform_sp( GetDebugger().GetPlatformList().GetSelectedPlatform()); @@ -817,6 +827,19 @@ class CommandObjectPlatformGetFile : public CommandObjectParsed { ~CommandObjectPlatformGetFile() override = default; + void + HandleArgumentCompletion(CompletionRequest &request, + OptionElementVector &opt_element_vector) override { + if (request.GetCursorIndex() == 0) + CommandCompletions::InvokeCommonCompletionCallbacks( + GetCommandInterpreter(), + CommandCompletions::eRemoteDiskFileCompletion, request, nullptr); + else if (request.GetCursorIndex() == 1) + CommandCompletions::InvokeCommonCompletionCallbacks( + GetCommandInterpreter(), CommandCompletions::eDiskFileCompletion, + request, nullptr); + } + bool DoExecute(Args &args, CommandReturnObject &result) override { // If the number of arguments is incorrect, issue an error message. if (args.GetArgumentCount() != 2) { @@ -882,6 +905,17 @@ class CommandObjectPlatformGetSize : public CommandObjectParsed { ~CommandObjectPlatformGetSize() override = default; + void + HandleArgumentCompletion(CompletionRequest &request, + OptionElementVector &opt_element_vector) override { + if (request.GetCursorIndex() != 0) + return; + + CommandCompletions::InvokeCommonCompletionCallbacks( + GetCommandInterpreter(), CommandCompletions::eRemoteDiskFileCompletion, + request, nullptr); + } + bool DoExecute(Args &args, CommandReturnObject &result) override { // If the number of arguments is incorrect, issue an error message. 
if (args.GetArgumentCount() != 1) { @@ -927,6 +961,19 @@ class CommandObjectPlatformPutFile : public CommandObjectParsed { ~CommandObjectPlatformPutFile() override = default; + void + HandleArgumentCompletion(CompletionRequest &request, + OptionElementVector &opt_element_vector) override { + if (request.GetCursorIndex() == 0) + CommandCompletions::InvokeCommonCompletionCallbacks( + GetCommandInterpreter(), CommandCompletions::eDiskFileCompletion, + request, nullptr); + else if (request.GetCursorIndex() == 1) + CommandCompletions::InvokeCommonCompletionCallbacks( + GetCommandInterpreter(), + CommandCompletions::eRemoteDiskFileCompletion, request, nullptr); + } + bool DoExecute(Args &args, CommandReturnObject &result) override { const char *src = args.GetArgumentAtIndex(0); const char *dst = args.GetArgumentAtIndex(1); @@ -1331,6 +1378,14 @@ class CommandObjectPlatformProcessInfo : public CommandObjectParsed { ~CommandObjectPlatformProcessInfo() override = default; + void + HandleArgumentCompletion(CompletionRequest &request, + OptionElementVector &opt_element_vector) override { + CommandCompletions::InvokeCommonCompletionCallbacks( + GetCommandInterpreter(), CommandCompletions::eProcessIDCompletion, + request, nullptr); + } + protected: bool DoExecute(Args &args, CommandReturnObject &result) override { Target *target = GetDebugger().GetSelectedTarget().get(); @@ -1447,46 +1502,6 @@ class CommandObjectPlatformProcessAttach : public CommandObjectParsed { return llvm::makeArrayRef(g_platform_process_attach_options); } - void HandleOptionArgumentCompletion( - CompletionRequest &request, OptionElementVector &opt_element_vector, - int opt_element_index, CommandInterpreter &interpreter) override { - int opt_arg_pos = opt_element_vector[opt_element_index].opt_arg_pos; - int opt_defs_index = opt_element_vector[opt_element_index].opt_defs_index; - - // We are only completing the name option for now... - - // Are we in the name? 
- if (GetDefinitions()[opt_defs_index].short_option != 'n') - return; - - // Look to see if there is a -P argument provided, and if so use that - // plugin, otherwise use the default plugin. - - const char *partial_name = nullptr; - partial_name = request.GetParsedLine().GetArgumentAtIndex(opt_arg_pos); - - PlatformSP platform_sp(interpreter.GetPlatform(true)); - if (!platform_sp) - return; - - ProcessInstanceInfoList process_infos; - ProcessInstanceInfoMatch match_info; - if (partial_name) { - match_info.GetProcessInfo().GetExecutableFile().SetFile( - partial_name, FileSpec::Style::native); - match_info.SetNameMatchType(NameMatch::StartsWith); - } - platform_sp->FindProcesses(match_info, process_infos); - const uint32_t num_matches = process_infos.size(); - if (num_matches == 0) - return; - - for (uint32_t i = 0; i < num_matches; ++i) { - request.AddCompletion(process_infos[i].GetNameAsStringRef()); - } - return; - } - // Options table: Required for subclasses of Options. static OptionDefinition g_option_table[]; diff --git a/lldb/source/Commands/CommandObjectProcess.cpp b/lldb/source/Commands/CommandObjectProcess.cpp index 25fe2e4b8b1aa..5ef0b87c64355 100644 --- a/lldb/source/Commands/CommandObjectProcess.cpp +++ b/lldb/source/Commands/CommandObjectProcess.cpp @@ -320,49 +320,6 @@ class CommandObjectProcessAttach : public CommandObjectProcessLaunchOrAttach { return llvm::makeArrayRef(g_process_attach_options); } - void HandleOptionArgumentCompletion( - CompletionRequest &request, OptionElementVector &opt_element_vector, - int opt_element_index, CommandInterpreter &interpreter) override { - int opt_arg_pos = opt_element_vector[opt_element_index].opt_arg_pos; - int opt_defs_index = opt_element_vector[opt_element_index].opt_defs_index; - - switch (GetDefinitions()[opt_defs_index].short_option) { - case 'n': { - // Look to see if there is a -P argument provided, and if so use that - // plugin, otherwise use the default plugin. 
- - const char *partial_name = nullptr; - partial_name = request.GetParsedLine().GetArgumentAtIndex(opt_arg_pos); - - PlatformSP platform_sp(interpreter.GetPlatform(true)); - if (!platform_sp) - return; - ProcessInstanceInfoList process_infos; - ProcessInstanceInfoMatch match_info; - if (partial_name) { - match_info.GetProcessInfo().GetExecutableFile().SetFile( - partial_name, FileSpec::Style::native); - match_info.SetNameMatchType(NameMatch::StartsWith); - } - platform_sp->FindProcesses(match_info, process_infos); - const size_t num_matches = process_infos.size(); - if (num_matches == 0) - return; - for (size_t i = 0; i < num_matches; ++i) { - request.AddCompletion(process_infos[i].GetNameAsStringRef()); - } - } break; - - case 'P': - CommandCompletions::InvokeCommonCompletionCallbacks( - interpreter, CommandCompletions::eProcessPluginCompletion, request, - nullptr); - break; - } - } - - // Instance variables to hold the values for command options. - ProcessAttachInfo attach_info; }; diff --git a/lldb/source/Commands/CommandObjectReproducer.cpp b/lldb/source/Commands/CommandObjectReproducer.cpp index 9add2df52985d..da2d9ca5a901a 100644 --- a/lldb/source/Commands/CommandObjectReproducer.cpp +++ b/lldb/source/Commands/CommandObjectReproducer.cpp @@ -27,6 +27,7 @@ using namespace lldb_private::repro; enum ReproducerProvider { eReproducerProviderCommands, eReproducerProviderFiles, + eReproducerProviderSymbolFiles, eReproducerProviderGDB, eReproducerProviderProcessInfo, eReproducerProviderVersion, @@ -46,6 +47,11 @@ static constexpr OptionEnumValueElement g_reproducer_provider_type[] = { "files", "Files", }, + { + eReproducerProviderSymbolFiles, + "symbol-files", + "Symbol Files", + }, { eReproducerProviderGDB, "gdb", @@ -427,6 +433,29 @@ class CommandObjectReproducerDump : public CommandObjectParsed { result.SetStatus(eReturnStatusSuccessFinishResult); return true; } + case eReproducerProviderSymbolFiles: { + Expected symbol_files = + loader->LoadBuffer(); + if 
(!symbol_files) { + SetError(result, symbol_files.takeError()); + return false; + } + + std::vector entries; + llvm::yaml::Input yin(*symbol_files); + yin >> entries; + + for (const auto &entry : entries) { + result.AppendMessageWithFormat("- uuid: %s\n", + entry.uuid.c_str()); + result.AppendMessageWithFormat(" module path: %s\n", + entry.module_path.c_str()); + result.AppendMessageWithFormat(" symbol path: %s\n", + entry.symbol_path.c_str()); + } + result.SetStatus(eReturnStatusSuccessFinishResult); + return true; + } case eReproducerProviderVersion: { Expected version = loader->LoadBuffer(); if (!version) { diff --git a/lldb/source/Commands/CommandObjectTarget.cpp b/lldb/source/Commands/CommandObjectTarget.cpp index 3cd4ad88afc70..30fdaf9ec9a2d 100644 --- a/lldb/source/Commands/CommandObjectTarget.cpp +++ b/lldb/source/Commands/CommandObjectTarget.cpp @@ -3421,10 +3421,35 @@ class CommandObjectTargetModulesShowUnwind : public CommandObjectParsed { continue; result.GetOutputStream().Printf( - "UNWIND PLANS for %s`%s (start addr 0x%" PRIx64 ")\n\n", + "UNWIND PLANS for %s`%s (start addr 0x%" PRIx64 ")\n", sc.module_sp->GetPlatformFileSpec().GetFilename().AsCString(), funcname.AsCString(), start_addr); + Args args; + target->GetUserSpecifiedTrapHandlerNames(args); + size_t count = args.GetArgumentCount(); + for (size_t i = 0; i < count; i++) { + const char *trap_func_name = args.GetArgumentAtIndex(i); + if (strcmp(funcname.GetCString(), trap_func_name) == 0) + result.GetOutputStream().Printf( + "This function is " + "treated as a trap handler function via user setting.\n"); + } + PlatformSP platform_sp(target->GetPlatform()); + if (platform_sp) { + const std::vector trap_handler_names( + platform_sp->GetTrapHandlerSymbolNames()); + for (ConstString trap_name : trap_handler_names) { + if (trap_name == funcname) { + result.GetOutputStream().Printf( + "This function's " + "name is listed by the platform as a trap handler.\n"); + } + } + } + + 
result.GetOutputStream().Printf("\n"); + UnwindPlanSP non_callsite_unwind_plan = func_unwinders_sp->GetUnwindPlanAtNonCallSite(*target, *thread); if (non_callsite_unwind_plan) { @@ -4725,8 +4750,6 @@ class CommandObjectTargetStopHookDelete : public CommandObjectParsed { void HandleArgumentCompletion(CompletionRequest &request, OptionElementVector &opt_element_vector) override { - if (request.GetCursorIndex()) - return; CommandCompletions::InvokeCommonCompletionCallbacks( GetCommandInterpreter(), CommandCompletions::eStopHookIDCompletion, request, nullptr); diff --git a/lldb/source/Commands/CommandObjectType.cpp b/lldb/source/Commands/CommandObjectType.cpp index b23f91de0ce6c..d820e7abd21f2 100644 --- a/lldb/source/Commands/CommandObjectType.cpp +++ b/lldb/source/Commands/CommandObjectType.cpp @@ -1769,6 +1769,14 @@ class CommandObjectTypeCategoryDefine : public CommandObjectParsed { ~CommandObjectTypeCategoryDefine() override = default; + void + HandleArgumentCompletion(CompletionRequest &request, + OptionElementVector &opt_element_vector) override { + CommandCompletions::InvokeCommonCompletionCallbacks( + GetCommandInterpreter(), + CommandCompletions::eTypeCategoryNameCompletion, request, nullptr); + } + protected: bool DoExecute(Args &command, CommandReturnObject &result) override { const size_t argc = command.GetArgumentCount(); @@ -1865,6 +1873,14 @@ class CommandObjectTypeCategoryEnable : public CommandObjectParsed { ~CommandObjectTypeCategoryEnable() override = default; + void + HandleArgumentCompletion(CompletionRequest &request, + OptionElementVector &opt_element_vector) override { + CommandCompletions::InvokeCommonCompletionCallbacks( + GetCommandInterpreter(), + CommandCompletions::eTypeCategoryNameCompletion, request, nullptr); + } + protected: bool DoExecute(Args &command, CommandReturnObject &result) override { const size_t argc = command.GetArgumentCount(); @@ -1927,6 +1943,14 @@ class CommandObjectTypeCategoryDelete : public CommandObjectParsed { 
~CommandObjectTypeCategoryDelete() override = default; + void + HandleArgumentCompletion(CompletionRequest &request, + OptionElementVector &opt_element_vector) override { + CommandCompletions::InvokeCommonCompletionCallbacks( + GetCommandInterpreter(), + CommandCompletions::eTypeCategoryNameCompletion, request, nullptr); + } + protected: bool DoExecute(Args &command, CommandReturnObject &result) override { const size_t argc = command.GetArgumentCount(); @@ -2032,6 +2056,14 @@ class CommandObjectTypeCategoryDisable : public CommandObjectParsed { ~CommandObjectTypeCategoryDisable() override = default; + void + HandleArgumentCompletion(CompletionRequest &request, + OptionElementVector &opt_element_vector) override { + CommandCompletions::InvokeCommonCompletionCallbacks( + GetCommandInterpreter(), + CommandCompletions::eTypeCategoryNameCompletion, request, nullptr); + } + protected: bool DoExecute(Args &command, CommandReturnObject &result) override { const size_t argc = command.GetArgumentCount(); @@ -2089,6 +2121,16 @@ class CommandObjectTypeCategoryList : public CommandObjectParsed { ~CommandObjectTypeCategoryList() override = default; + void + HandleArgumentCompletion(CompletionRequest &request, + OptionElementVector &opt_element_vector) override { + if (request.GetCursorIndex()) + return; + CommandCompletions::InvokeCommonCompletionCallbacks( + GetCommandInterpreter(), + CommandCompletions::eTypeCategoryNameCompletion, request, nullptr); + } + protected: bool DoExecute(Args &command, CommandReturnObject &result) override { const size_t argc = command.GetArgumentCount(); diff --git a/lldb/source/Core/CMakeLists.txt b/lldb/source/Core/CMakeLists.txt index 01a25045081f9..a4057d11077f3 100644 --- a/lldb/source/Core/CMakeLists.txt +++ b/lldb/source/Core/CMakeLists.txt @@ -11,8 +11,8 @@ set(LLDB_LIBEDIT_LIBS) if (LLDB_ENABLE_CURSES) list(APPEND LLDB_CURSES_LIBS ${CURSES_LIBRARIES} ${PANEL_LIBRARIES}) - if(LLVM_ENABLE_TERMINFO) - list(APPEND LLDB_CURSES_LIBS 
${TERMINFO_LIB}) + if(LLVM_ENABLE_TERMINFO AND HAVE_TERMINFO) + list(APPEND LLDB_CURSES_LIBS ${TERMINFO_LIBS}) endif() if (LLVM_BUILD_STATIC) list(APPEND LLDB_CURSES_LIBS gpm) diff --git a/lldb/source/Core/ValueObjectVariable.cpp b/lldb/source/Core/ValueObjectVariable.cpp index ab67e3038cf0a..43e888a68725b 100644 --- a/lldb/source/Core/ValueObjectVariable.cpp +++ b/lldb/source/Core/ValueObjectVariable.cpp @@ -132,8 +132,11 @@ bool ValueObjectVariable::UpdateValue() { if (variable->GetLocationIsConstantValueData()) { // expr doesn't contain DWARF bytes, it contains the constant variable // value bytes themselves... - if (expr.GetExpressionData(m_data)) + if (expr.GetExpressionData(m_data)) { + if (m_data.GetDataStart() && m_data.GetByteSize()) + m_value.SetBytes(m_data.GetDataStart(), m_data.GetByteSize()); m_value.SetContext(Value::eContextTypeVariable, variable); + } else m_error.SetErrorString("empty constant data"); // constant bytes can't be edited - sorry diff --git a/lldb/source/Interpreter/CommandObject.cpp b/lldb/source/Interpreter/CommandObject.cpp index 944cf5e58b405..58e54e84dc8c6 100644 --- a/lldb/source/Interpreter/CommandObject.cpp +++ b/lldb/source/Interpreter/CommandObject.cpp @@ -1068,7 +1068,7 @@ CommandObject::ArgumentTableEntry CommandObject::g_arguments_data[] = { { eArgTypeLogCategory, "log-category", CommandCompletions::eNoCompletion, { nullptr, false }, "The name of a category within a log channel, e.g. all (try \"log list\" to see a list of all channels and their categories." }, { eArgTypeLogChannel, "log-channel", CommandCompletions::eNoCompletion, { nullptr, false }, "The name of a log channel, e.g. process.gdb-remote (try \"log list\" to see a list of all channels and their categories)." }, { eArgTypeMethod, "method", CommandCompletions::eNoCompletion, { nullptr, false }, "A C++ method name." }, - { eArgTypeName, "name", CommandCompletions::eNoCompletion, { nullptr, false }, "Help text goes here." 
}, + { eArgTypeName, "name", CommandCompletions::eTypeCategoryNameCompletion, { nullptr, false }, "Help text goes here." }, { eArgTypeNewPathPrefix, "new-path-prefix", CommandCompletions::eNoCompletion, { nullptr, false }, "Help text goes here." }, { eArgTypeNumLines, "num-lines", CommandCompletions::eNoCompletion, { nullptr, false }, "The number of lines to use." }, { eArgTypeNumberPerLine, "number-per-line", CommandCompletions::eNoCompletion, { nullptr, false }, "The number of items per line to display." }, @@ -1078,9 +1078,9 @@ CommandObject::ArgumentTableEntry CommandObject::g_arguments_data[] = { { eArgTypePath, "path", CommandCompletions::eDiskFileCompletion, { nullptr, false }, "Path." }, { eArgTypePermissionsNumber, "perms-numeric", CommandCompletions::eNoCompletion, { nullptr, false }, "Permissions given as an octal number (e.g. 755)." }, { eArgTypePermissionsString, "perms=string", CommandCompletions::eNoCompletion, { nullptr, false }, "Permissions given as a string value (e.g. rw-r-xr--)." }, - { eArgTypePid, "pid", CommandCompletions::eNoCompletion, { nullptr, false }, "The process ID number." }, + { eArgTypePid, "pid", CommandCompletions::eProcessIDCompletion, { nullptr, false }, "The process ID number." }, { eArgTypePlugin, "plugin", CommandCompletions::eProcessPluginCompletion, { nullptr, false }, "Help text goes here." }, - { eArgTypeProcessName, "process-name", CommandCompletions::eNoCompletion, { nullptr, false }, "The name of the process." }, + { eArgTypeProcessName, "process-name", CommandCompletions::eProcessNameCompletion, { nullptr, false }, "The name of the process." }, { eArgTypePythonClass, "python-class", CommandCompletions::eNoCompletion, { nullptr, false }, "The name of a Python class." }, { eArgTypePythonFunction, "python-function", CommandCompletions::eNoCompletion, { nullptr, false }, "The name of a Python function." 
}, { eArgTypePythonScript, "python-script", CommandCompletions::eNoCompletion, { nullptr, false }, "Source code written in Python." }, diff --git a/lldb/source/Plugins/Platform/gdb-server/PlatformRemoteGDBServer.cpp b/lldb/source/Plugins/Platform/gdb-server/PlatformRemoteGDBServer.cpp index 21bf7f4ac46d3..e1eb15c3e8c92 100644 --- a/lldb/source/Plugins/Platform/gdb-server/PlatformRemoteGDBServer.cpp +++ b/lldb/source/Plugins/Platform/gdb-server/PlatformRemoteGDBServer.cpp @@ -661,6 +661,11 @@ PlatformRemoteGDBServer::GetFileSize(const FileSpec &file_spec) { return m_gdb_client.GetFileSize(file_spec); } +void PlatformRemoteGDBServer::AutoCompleteDiskFileOrDirectory( + CompletionRequest &request, bool only_dir) { + m_gdb_client.AutoCompleteDiskFileOrDirectory(request, only_dir); +} + uint64_t PlatformRemoteGDBServer::ReadFile(lldb::user_id_t fd, uint64_t offset, void *dst, uint64_t dst_len, Status &error) { diff --git a/lldb/source/Plugins/Platform/gdb-server/PlatformRemoteGDBServer.h b/lldb/source/Plugins/Platform/gdb-server/PlatformRemoteGDBServer.h index 0602be1fa377a..3562b2bb09dfc 100644 --- a/lldb/source/Plugins/Platform/gdb-server/PlatformRemoteGDBServer.h +++ b/lldb/source/Plugins/Platform/gdb-server/PlatformRemoteGDBServer.h @@ -127,6 +127,9 @@ class PlatformRemoteGDBServer : public Platform, private UserIDResolver { lldb::user_id_t GetFileSize(const FileSpec &file_spec) override; + void AutoCompleteDiskFileOrDirectory(CompletionRequest &request, + bool only_dir) override; + Status PutFile(const FileSpec &source, const FileSpec &destination, uint32_t uid = UINT32_MAX, uint32_t gid = UINT32_MAX) override; diff --git a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.cpp b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.cpp index f7b398ce620d6..0aef36c7e2314 100644 --- a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.cpp +++ b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.cpp @@ 
-118,7 +118,7 @@ NativeRegisterContextLinux_arm64::ReadRegister(const RegisterInfo *reg_info, : ""); uint8_t *src; - uint32_t offset; + uint32_t offset = LLDB_INVALID_INDEX32; uint64_t sve_vg; std::vector sve_reg_non_live; @@ -172,7 +172,6 @@ NativeRegisterContextLinux_arm64::ReadRegister(const RegisterInfo *reg_info, offset = CalculateSVEOffset(GetRegisterInfoAtIndex(sve_reg_num)); } - offset = CalculateSVEOffset(reg_info); assert(offset < GetSVEBufferSize()); src = (uint8_t *)GetSVEBuffer() + offset; } @@ -234,7 +233,7 @@ Status NativeRegisterContextLinux_arm64::WriteRegister( : ""); uint8_t *dst; - uint32_t offset; + uint32_t offset = LLDB_INVALID_INDEX32; std::vector sve_reg_non_live; if (IsGPR(reg)) { @@ -291,7 +290,6 @@ Status NativeRegisterContextLinux_arm64::WriteRegister( offset = CalculateSVEOffset(GetRegisterInfoAtIndex(sve_reg_num)); } - offset = CalculateSVEOffset(reg_info); assert(offset < GetSVEBufferSize()); dst = (uint8_t *)GetSVEBuffer() + offset; ::memcpy(dst, reg_value.GetBytes(), reg_info->byte_size); diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp index cf0daccc2d0c2..0949b99185234 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp @@ -1701,14 +1701,9 @@ Status GDBRemoteCommunicationClient::GetWatchpointSupportInfo(uint32_t &num) { // Set num to 0 first. 
num = 0; if (m_supports_watchpoint_support_info != eLazyBoolNo) { - char packet[64]; - const int packet_len = - ::snprintf(packet, sizeof(packet), "qWatchpointSupportInfo:"); - assert(packet_len < (int)sizeof(packet)); - UNUSED_IF_ASSERT_DISABLED(packet_len); StringExtractorGDBRemote response; - if (SendPacketAndWaitForResponse(packet, response, false) == - PacketResult::Success) { + if (SendPacketAndWaitForResponse("qWatchpointSupportInfo:", response, + false) == PacketResult::Success) { m_supports_watchpoint_support_info = eLazyBoolYes; llvm::StringRef name; llvm::StringRef value; @@ -2982,6 +2977,31 @@ lldb::user_id_t GDBRemoteCommunicationClient::GetFileSize( return UINT64_MAX; } +void GDBRemoteCommunicationClient::AutoCompleteDiskFileOrDirectory( + CompletionRequest &request, bool only_dir) { + lldb_private::StreamString stream; + stream.PutCString("qPathComplete:"); + stream.PutHex32(only_dir ? 1 : 0); + stream.PutChar(','); + stream.PutStringAsRawHex8(request.GetCursorArgumentPrefix()); + StringExtractorGDBRemote response; + if (SendPacketAndWaitForResponse(stream.GetString(), response, false) == + PacketResult::Success) { + StreamString strm; + char ch = response.GetChar(); + if (ch != 'M') + return; + while (response.Peek()) { + strm.Clear(); + while ((ch = response.GetHexU8(0, false)) != '\0') + strm.PutChar(ch); + request.AddCompletion(strm.GetString()); + if (response.GetChar() != ',') + break; + } + } +} + Status GDBRemoteCommunicationClient::GetFilePermissions(const FileSpec &file_spec, uint32_t &file_permissions) { diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h index 8df08cbde735e..0159125a433b8 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h @@ -375,6 +375,9 @@ class GDBRemoteCommunicationClient : public GDBRemoteClientBase { 
lldb::user_id_t GetFileSize(const FileSpec &file_spec); + void AutoCompleteDiskFileOrDirectory(CompletionRequest &request, + bool only_dir); + Status GetFilePermissions(const FileSpec &file_spec, uint32_t &file_permissions); diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerPlatform.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerPlatform.cpp index d14b79a03d17f..7e94afb9ec686 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerPlatform.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerPlatform.cpp @@ -26,12 +26,14 @@ #include "lldb/Host/FileAction.h" #include "lldb/Host/Host.h" #include "lldb/Host/HostInfo.h" +#include "lldb/Interpreter/CommandCompletions.h" #include "lldb/Target/Platform.h" #include "lldb/Target/UnixSignals.h" #include "lldb/Utility/GDBRemote.h" #include "lldb/Utility/Log.h" #include "lldb/Utility/StreamString.h" #include "lldb/Utility/StructuredData.h" +#include "lldb/Utility/TildeExpressionResolver.h" #include "lldb/Utility/UriParser.h" #include "lldb/Utility/StringExtractorGDBRemote.h" @@ -68,6 +70,9 @@ GDBRemoteCommunicationServerPlatform::GDBRemoteCommunicationServerPlatform( RegisterMemberFunctionHandler( StringExtractorGDBRemote::eServerPacketType_qProcessInfo, &GDBRemoteCommunicationServerPlatform::Handle_qProcessInfo); + RegisterMemberFunctionHandler( + StringExtractorGDBRemote::eServerPacketType_qPathComplete, + &GDBRemoteCommunicationServerPlatform::Handle_qPathComplete); RegisterMemberFunctionHandler( StringExtractorGDBRemote::eServerPacketType_QSetWorkingDir, &GDBRemoteCommunicationServerPlatform::Handle_QSetWorkingDir); @@ -333,6 +338,38 @@ GDBRemoteCommunicationServerPlatform::Handle_qProcessInfo( return SendPacketNoLock(response.GetString()); } +GDBRemoteCommunication::PacketResult +GDBRemoteCommunicationServerPlatform::Handle_qPathComplete( + StringExtractorGDBRemote &packet) { + 
packet.SetFilePos(::strlen("qPathComplete:")); + const bool only_dir = (packet.GetHexMaxU32(false, 0) == 1); + if (packet.GetChar() != ',') + return SendErrorResponse(85); + std::string path; + packet.GetHexByteString(path); + + StringList matches; + StandardTildeExpressionResolver resolver; + if (only_dir) + CommandCompletions::DiskDirectories(path, matches, resolver); + else + CommandCompletions::DiskFiles(path, matches, resolver); + + StreamString response; + response.PutChar('M'); + llvm::StringRef separator; + std::sort(matches.begin(), matches.end()); + for (const auto &match : matches) { + response << separator; + separator = ","; + // encode result strings into hex bytes to avoid unexpected error caused by + // special characters like '$'. + response.PutStringAsRawHex8(match.c_str()); + } + + return SendPacketNoLock(response.GetString()); +} + GDBRemoteCommunication::PacketResult GDBRemoteCommunicationServerPlatform::Handle_qGetWorkingDir( StringExtractorGDBRemote &packet) { diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerPlatform.h b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerPlatform.h index a8cacea788355..8b3122d1423ea 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerPlatform.h +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerPlatform.h @@ -81,6 +81,8 @@ class GDBRemoteCommunicationServerPlatform PacketResult Handle_qKillSpawnedProcess(StringExtractorGDBRemote &packet); + PacketResult Handle_qPathComplete(StringExtractorGDBRemote &packet); + PacketResult Handle_qProcessInfo(StringExtractorGDBRemote &packet); PacketResult Handle_qGetWorkingDir(StringExtractorGDBRemote &packet); diff --git a/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp b/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp index 0c7f4cbbb859d..5f2982b3c09c3 100644 --- a/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp +++ 
b/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp @@ -267,6 +267,88 @@ llvm::ArrayRef MinidumpParser::GetModuleList() { return {}; } +static bool +CreateRegionsCacheFromLinuxMaps(MinidumpParser &parser, + std::vector &regions) { + auto data = parser.GetStream(StreamType::LinuxMaps); + if (data.empty()) + return false; + ParseLinuxMapRegions(llvm::toStringRef(data), + [&](const lldb_private::MemoryRegionInfo &region, + const lldb_private::Status &status) -> bool { + if (status.Success()) + regions.push_back(region); + return true; + }); + return !regions.empty(); +} + +/// Check for the memory regions starting at \a load_addr for a contiguous +/// section that has execute permissions that matches the module path. +/// +/// When we load a breakpad generated minidump file, we might have the +/// /proc/<pid>/maps text for a process that details the memory map of the +/// process that the minidump is describing. This checks the sorted memory +/// regions for a section that has execute permissions. A sample maps file +/// might look like: +/// +/// 00400000-00401000 r--p 00000000 fd:01 2838574 /tmp/a.out +/// 00401000-00402000 r-xp 00001000 fd:01 2838574 /tmp/a.out +/// 00402000-00403000 r--p 00002000 fd:01 2838574 /tmp/a.out +/// 00403000-00404000 r--p 00002000 fd:01 2838574 /tmp/a.out +/// 00404000-00405000 rw-p 00003000 fd:01 2838574 /tmp/a.out +/// ... +/// +/// This function should return true when given 0x00400000 and "/tmp/a.out" +/// is passed in as the path since it has a consecutive memory region for +/// "/tmp/a.out" that has execute permissions at 0x00401000. This will help us +/// differentiate if a file has been memory mapped into a process for reading +/// and breakpad ends up saving a minidump file that has two module entries for +/// a given file: one that is read only for the entire file, and then one that +/// is the real executable that is loaded into memory for execution.
For memory +/// mapped files they will typically show up with r--p permissions and a range +/// matching the entire range of the file on disk: +/// +/// 00800000-00805000 r--p 00000000 fd:01 2838574 /tmp/a.out +/// 00805000-00806000 r-xp 00001000 fd:01 1234567 /usr/lib/libc.so +/// +/// This function should return false when asked about 0x00800000 with +/// "/tmp/a.out" as the path. +/// +/// \param[in] path +/// The path to the module to check for in the memory regions. Only sequential +/// memory regions whose paths match this path will be considered when looking +/// for execute permissions. +/// +/// \param[in] regions +/// A sorted list of memory regions obtained from a call to +/// CreateRegionsCacheFromLinuxMaps. +/// +/// \param[in] base_of_image +/// The load address of this module from BaseOfImage in the modules list. +/// +/// \return +/// True if a contiguous region of memory belonging to the module with a +/// matching path exists that has executable permissions. Returns false if +/// \a regions is empty or if there are no regions with execute permissions +/// that match \a path. + +static bool CheckForLinuxExecutable(ConstString path, + const MemoryRegionInfos &regions, + lldb::addr_t base_of_image) { + if (regions.empty()) + return false; + lldb::addr_t addr = base_of_image; + MemoryRegionInfo region = MinidumpParser::GetMemoryRegionInfo(regions, addr); + while (region.GetName() == path) { + if (region.GetExecutable() == MemoryRegionInfo::eYes) + return true; + addr += region.GetRange().GetByteSize(); + region = MinidumpParser::GetMemoryRegionInfo(regions, addr); + } + return false; +} + std::vector MinidumpParser::GetFilteredModuleList() { Log *log = GetLogIfAnyCategoriesSet(LIBLLDB_LOG_MODULES); auto ExpectedModules = GetMinidumpFile().getModuleList(); @@ -276,6 +358,15 @@ std::vector MinidumpParser::GetFilteredModuleList() { return {}; } + // Create memory regions from the linux maps only.
We do this to avoid issues + // with breakpad generated minidumps where if someone has mmap'ed a shared + // library into memory to access its data in the object file, we can get a + // minidump with two mappings for a binary: one whose base image points to a + // memory region that is read + execute and one that is read only. + MemoryRegionInfos linux_regions; + if (CreateRegionsCacheFromLinuxMaps(*this, linux_regions)) + llvm::sort(linux_regions); + // map module_name -> filtered_modules index typedef llvm::StringMap MapType; MapType module_name_to_filtered_index; @@ -304,10 +395,25 @@ std::vector MinidumpParser::GetFilteredModuleList() { // "filtered_modules.size()" above. filtered_modules.push_back(&module); } else { + // We have a duplicate module entry. Check the linux regions to see if + // the module we already have is not really a mapped executable. If it + // isn't, check to see if the current duplicate module entry is a real + // mapped executable, and if so, replace it. This can happen when a + // process mmap's in the file for an executable in order to read bytes + // from the executable file. A memory region mapping will exist for the + // mmap'ed version and for the loaded executable, but only one will have + // a consecutive region that is executable in the memory regions.
- auto dup_module = filtered_modules[iter->second]; if (module.BaseOfImage < dup_module->BaseOfImage) filtered_modules[iter->second] = &module; } @@ -411,22 +517,6 @@ llvm::ArrayRef MinidumpParser::GetMemory(lldb::addr_t addr, return range->range_ref.slice(offset, overlap); } -static bool -CreateRegionsCacheFromLinuxMaps(MinidumpParser &parser, - std::vector ®ions) { - auto data = parser.GetStream(StreamType::LinuxMaps); - if (data.empty()) - return false; - ParseLinuxMapRegions(llvm::toStringRef(data), - [&](const lldb_private::MemoryRegionInfo ®ion, - const lldb_private::Status &status) -> bool { - if (status.Success()) - regions.push_back(region); - return true; - }); - return !regions.empty(); -} - static bool CreateRegionsCacheFromMemoryInfoList(MinidumpParser &parser, std::vector ®ions) { @@ -500,10 +590,10 @@ CreateRegionsCacheFromMemory64List(MinidumpParser &parser, uint64_t base_rva; std::tie(memory64_list, base_rva) = MinidumpMemoryDescriptor64::ParseMemory64List(data); - + if (memory64_list.empty()) return false; - + regions.reserve(memory64_list.size()); for (const auto &memory_desc : memory64_list) { if (memory_desc.data_size == 0) @@ -597,3 +687,30 @@ MinidumpParser::GetStreamTypeAsString(StreamType stream_type) { } return "unknown stream type"; } + +MemoryRegionInfo +MinidumpParser::GetMemoryRegionInfo(const MemoryRegionInfos ®ions, + lldb::addr_t load_addr) { + MemoryRegionInfo region; + auto pos = llvm::upper_bound(regions, load_addr); + if (pos != regions.begin() && + std::prev(pos)->GetRange().Contains(load_addr)) { + return *std::prev(pos); + } + + if (pos == regions.begin()) + region.GetRange().SetRangeBase(0); + else + region.GetRange().SetRangeBase(std::prev(pos)->GetRange().GetRangeEnd()); + + if (pos == regions.end()) + region.GetRange().SetRangeEnd(UINT64_MAX); + else + region.GetRange().SetRangeEnd(pos->GetRange().GetRangeBase()); + + region.SetReadable(MemoryRegionInfo::eNo); + region.SetWritable(MemoryRegionInfo::eNo); + 
region.SetExecutable(MemoryRegionInfo::eNo); + region.SetMapped(MemoryRegionInfo::eNo); + return region; +} diff --git a/lldb/source/Plugins/Process/minidump/MinidumpParser.h b/lldb/source/Plugins/Process/minidump/MinidumpParser.h index c4d7612b5f8d6..ff7134ff18158 100644 --- a/lldb/source/Plugins/Process/minidump/MinidumpParser.h +++ b/lldb/source/Plugins/Process/minidump/MinidumpParser.h @@ -96,6 +96,9 @@ class MinidumpParser { llvm::object::MinidumpFile &GetMinidumpFile() { return *m_file; } + static MemoryRegionInfo GetMemoryRegionInfo(const MemoryRegionInfos ®ions, + lldb::addr_t load_addr); + private: MinidumpParser(lldb::DataBufferSP data_sp, std::unique_ptr file); diff --git a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp index fc8ee346f4491..17fdfdb4f345d 100644 --- a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp +++ b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp @@ -121,6 +121,72 @@ class PlaceholderObjectFile : public ObjectFile { lldb::addr_t m_base; lldb::addr_t m_size; }; + +/// Duplicate the HashElfTextSection() from the breakpad sources. +/// +/// Breakpad, a Google crash log reporting tool suite, creates minidump files +/// for many different architectures. When using Breakpad to create ELF +/// minidumps, it will check for a GNU build ID when creating a minidump file +/// and if one doesn't exist in the file, it will say the UUID of the file is a +/// checksum of up to the first 4096 bytes of the .text section. Facebook also +/// uses breakpad and modified this hash to avoid collisions so we can +/// calculate and check for this as well. +/// +/// The breakpad code might end up hashing up to 15 bytes that immediately +/// follow the .text section in the file, so this code must do exactly what it +/// does so we can get an exact match for the UUID. +/// +/// \param[in] module_sp The module to grab the .text section from. 
+/// +/// \param[in/out] breakpad_uuid A vector that will receive the calculated +/// breakpad .text hash. +/// +/// \param[in/out] facebook_uuid A vector that will receive the calculated +/// facebook .text hash. +/// +void HashElfTextSection(ModuleSP module_sp, std::vector &breakpad_uuid, + std::vector &facebook_uuid) { + SectionList *sect_list = module_sp->GetSectionList(); + if (sect_list == nullptr) + return; + SectionSP sect_sp = sect_list->FindSectionByName(ConstString(".text")); + if (!sect_sp) + return; + constexpr size_t kMDGUIDSize = 16; + constexpr size_t kBreakpadPageSize = 4096; + // The breakpad code has a bug where it might access beyond the end of a + // .text section by up to 15 bytes, so we must ensure we round up to the + // next kMDGUIDSize byte boundary. + DataExtractor data; + const size_t text_size = sect_sp->GetFileSize(); + const size_t read_size = std::min( + llvm::alignTo(text_size, kMDGUIDSize), kBreakpadPageSize); + sect_sp->GetObjectFile()->GetData(sect_sp->GetFileOffset(), read_size, data); + + breakpad_uuid.assign(kMDGUIDSize, 0); + facebook_uuid.assign(kMDGUIDSize, 0); + + // The only difference between the breakpad hash and the facebook hash is the + // hashing of the text section size into the hash prior to hashing the .text + // contents. + for (size_t i = 0; i < kMDGUIDSize; i++) + facebook_uuid[i] ^= text_size % 255; + + // This code carefully duplicates how the hash was created in Breakpad + // sources, including the error where it might hash an extra 15 bytes past the + // end of the .text section if the .text section is less than a page size in + // length.
+ const uint8_t *ptr = data.GetDataStart(); + const uint8_t *ptr_end = data.GetDataEnd(); + while (ptr < ptr_end) { + for (unsigned i = 0; i < kMDGUIDSize; i++) { + breakpad_uuid[i] ^= ptr[i]; + facebook_uuid[i] ^= ptr[i]; + } + ptr += kMDGUIDSize; + } +} + } // namespace ConstString ProcessMinidump::GetPluginNameStatic() { @@ -338,32 +404,6 @@ ArchSpec ProcessMinidump::GetArchitecture() { return ArchSpec(triple); } -static MemoryRegionInfo GetMemoryRegionInfo(const MemoryRegionInfos ®ions, - lldb::addr_t load_addr) { - MemoryRegionInfo region; - auto pos = llvm::upper_bound(regions, load_addr); - if (pos != regions.begin() && - std::prev(pos)->GetRange().Contains(load_addr)) { - return *std::prev(pos); - } - - if (pos == regions.begin()) - region.GetRange().SetRangeBase(0); - else - region.GetRange().SetRangeBase(std::prev(pos)->GetRange().GetRangeEnd()); - - if (pos == regions.end()) - region.GetRange().SetRangeEnd(UINT64_MAX); - else - region.GetRange().SetRangeEnd(pos->GetRange().GetRangeBase()); - - region.SetReadable(MemoryRegionInfo::eNo); - region.SetWritable(MemoryRegionInfo::eNo); - region.SetExecutable(MemoryRegionInfo::eNo); - region.SetMapped(MemoryRegionInfo::eNo); - return region; -} - void ProcessMinidump::BuildMemoryRegions() { if (m_memory_regions) return; @@ -388,7 +428,7 @@ void ProcessMinidump::BuildMemoryRegions() { MemoryRegionInfo::RangeType section_range(load_addr, section_sp->GetByteSize()); MemoryRegionInfo region = - ::GetMemoryRegionInfo(*m_memory_regions, load_addr); + MinidumpParser::GetMemoryRegionInfo(*m_memory_regions, load_addr); if (region.GetMapped() != MemoryRegionInfo::eYes && region.GetRange().GetRangeBase() <= section_range.GetRangeBase() && section_range.GetRangeEnd() <= region.GetRange().GetRangeEnd()) { @@ -409,7 +449,7 @@ void ProcessMinidump::BuildMemoryRegions() { Status ProcessMinidump::GetMemoryRegionInfo(lldb::addr_t load_addr, MemoryRegionInfo ®ion) { BuildMemoryRegions(); - region = 
::GetMemoryRegionInfo(*m_memory_regions, load_addr); + region = MinidumpParser::GetMemoryRegionInfo(*m_memory_regions, load_addr); + return Status(); } @@ -494,10 +534,33 @@ bool Symbols::DownloadObjectAndSymbolFile(ModuleSpec &module_spec, const bool match = dmp_bytes.empty() || mod_bytes.empty() || mod_bytes.take_front(dmp_bytes.size()) == dmp_bytes; if (!match) { + // Breakpad generates minidump files, and if there is no GNU build + // ID in the binary, it will calculate a UUID by hashing the first 4096 + // bytes of the .text section and using that as the UUID for a module + // in the minidump. Facebook uses a modified breakpad client that + // uses a slightly modified version of this hash to avoid collisions. Check for + // UUIDs from the minidump that match these cases and accept the + // module we find if they do match. + std::vector breakpad_uuid; + std::vector facebook_uuid; + HashElfTextSection(module_sp, breakpad_uuid, facebook_uuid); + if (dmp_bytes == llvm::ArrayRef(breakpad_uuid)) { + LLDB_LOG(log, "Breakpad .text hash match for {0}.", name); + } else if (dmp_bytes == llvm::ArrayRef(facebook_uuid)) { + LLDB_LOG(log, "Facebook .text hash match for {0}.", name); + } else { + // The UUID wasn't a partial match and didn't match the .text hash + // so remove the module from the target, we will need to create a + // placeholder object file.
GetTarget().GetImages().Remove(module_sp); module_sp.reset(); + } + } else { + LLDB_LOG(log, "Partial uuid match for {0}.", name); } } + } else { + LLDB_LOG(log, "Full uuid match for {0}.", name); } if (module_sp) { // Watch out for place holder modules that have different paths, but the diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.h index b401352c693dd..fe6a55520978f 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.h @@ -42,6 +42,7 @@ class DWARFFormValue { DWARFFormValue(const DWARFUnit *unit) : m_unit(unit) {} DWARFFormValue(const DWARFUnit *unit, dw_form_t form) : m_unit(unit), m_form(form) {} + const DWARFUnit *GetUnit() const { return m_unit; } void SetUnit(const DWARFUnit *unit) { m_unit = unit; } dw_form_t Form() const { return m_form; } dw_form_t& FormRef() { return m_form; } diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index 5ce392a57e0c8..271821b245175 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -3111,18 +3111,15 @@ VariableSP SymbolFileDWARF::ParseVariableDIE(const SymbolContext &sc, const char *name = nullptr; const char *mangled = nullptr; Declaration decl; - uint32_t i; DWARFFormValue type_die_form; DWARFExpression location; bool is_external = false; bool is_artificial = false; - bool location_is_const_value_data = false; - bool has_explicit_location = false; - DWARFFormValue const_value; + DWARFFormValue const_value_form, location_form; Variable::RangeList scope_ranges; // AccessType accessibility = eAccessNone; - for (i = 0; i < num_attributes; ++i) { + for (size_t i = 0; i < num_attributes; ++i) { dw_attr_t attr = attributes.AttributeAtIndex(i); DWARFFormValue form_value; @@ -3152,91 +3149,11 @@ VariableSP 
SymbolFileDWARF::ParseVariableDIE(const SymbolContext &sc, is_external = form_value.Boolean(); break; case DW_AT_const_value: - // If we have already found a DW_AT_location attribute, ignore this - // attribute. - if (!has_explicit_location) { - location_is_const_value_data = true; - // The constant value will be either a block, a data value or a - // string. - auto debug_info_data = die.GetData(); - if (DWARFFormValue::IsBlockForm(form_value.Form())) { - // Retrieve the value as a block expression. - uint32_t block_offset = - form_value.BlockData() - debug_info_data.GetDataStart(); - uint32_t block_length = form_value.Unsigned(); - location = DWARFExpression( - module, - DataExtractor(debug_info_data, block_offset, block_length), - die.GetCU()); - } else if (DWARFFormValue::IsDataForm(form_value.Form())) { - // Retrieve the value as a data expression. - uint32_t data_offset = attributes.DIEOffsetAtIndex(i); - if (auto data_length = form_value.GetFixedSize()) - location = DWARFExpression( - module, - DataExtractor(debug_info_data, data_offset, *data_length), - die.GetCU()); - else { - const uint8_t *data_pointer = form_value.BlockData(); - if (data_pointer) { - form_value.Unsigned(); - } else if (DWARFFormValue::IsDataForm(form_value.Form())) { - // we need to get the byte size of the type later after we - // create the variable - const_value = form_value; - } - } - } else { - // Retrieve the value as a string expression. 
- if (form_value.Form() == DW_FORM_strp) { - uint32_t data_offset = attributes.DIEOffsetAtIndex(i); - if (auto data_length = form_value.GetFixedSize()) - location = DWARFExpression(module, - DataExtractor(debug_info_data, - data_offset, - *data_length), - die.GetCU()); - } else { - const char *str = form_value.AsCString(); - uint32_t string_offset = - str - (const char *)debug_info_data.GetDataStart(); - uint32_t string_length = strlen(str) + 1; - location = DWARFExpression(module, - DataExtractor(debug_info_data, - string_offset, - string_length), - die.GetCU()); - } - } - } + const_value_form = form_value; + break; + case DW_AT_location: + location_form = form_value; break; - case DW_AT_location: { - location_is_const_value_data = false; - has_explicit_location = true; - if (DWARFFormValue::IsBlockForm(form_value.Form())) { - auto data = die.GetData(); - - uint32_t block_offset = - form_value.BlockData() - data.GetDataStart(); - uint32_t block_length = form_value.Unsigned(); - location = DWARFExpression( - module, DataExtractor(data, block_offset, block_length), - die.GetCU()); - } else { - DataExtractor data = die.GetCU()->GetLocationData(); - dw_offset_t offset = form_value.Unsigned(); - if (form_value.Form() == DW_FORM_loclistx) - offset = die.GetCU()->GetLoclistOffset(offset).getValueOr(-1); - if (data.ValidOffset(offset)) { - data = DataExtractor(data, offset, data.GetByteSize() - offset); - location = DWARFExpression(module, data, die.GetCU()); - assert(func_low_pc != LLDB_INVALID_ADDRESS); - location.SetLocationListAddresses( - attributes.CompileUnitAtIndex(i)->GetBaseAddress(), - func_low_pc); - } - } - } break; case DW_AT_specification: spec_die = form_value.Reference(); break; @@ -3262,6 +3179,66 @@ VariableSP SymbolFileDWARF::ParseVariableDIE(const SymbolContext &sc, } } + // Prefer DW_AT_location over DW_AT_const_value. Both can be emitted e.g. 
+ // for static constexpr member variables -- DW_AT_const_value will be + // present in the class declaration and DW_AT_location in the DIE defining + // the member. + bool location_is_const_value_data = false; + bool has_explicit_location = false; + bool use_type_size_for_value = false; + if (location_form.IsValid()) { + has_explicit_location = true; + if (DWARFFormValue::IsBlockForm(location_form.Form())) { + const DWARFDataExtractor &data = die.GetData(); + + uint32_t block_offset = + location_form.BlockData() - data.GetDataStart(); + uint32_t block_length = location_form.Unsigned(); + location = DWARFExpression( + module, DataExtractor(data, block_offset, block_length), + die.GetCU()); + } else { + DataExtractor data = die.GetCU()->GetLocationData(); + dw_offset_t offset = location_form.Unsigned(); + if (location_form.Form() == DW_FORM_loclistx) + offset = die.GetCU()->GetLoclistOffset(offset).getValueOr(-1); + if (data.ValidOffset(offset)) { + data = DataExtractor(data, offset, data.GetByteSize() - offset); + location = DWARFExpression(module, data, die.GetCU()); + assert(func_low_pc != LLDB_INVALID_ADDRESS); + location.SetLocationListAddresses( + location_form.GetUnit()->GetBaseAddress(), func_low_pc); + } + } + } else if (const_value_form.IsValid()) { + location_is_const_value_data = true; + // The constant value will be either a block, a data value or a + // string. + const DWARFDataExtractor &debug_info_data = die.GetData(); + if (DWARFFormValue::IsBlockForm(const_value_form.Form())) { + // Retrieve the value as a block expression. + uint32_t block_offset = + const_value_form.BlockData() - debug_info_data.GetDataStart(); + uint32_t block_length = const_value_form.Unsigned(); + location = DWARFExpression( + module, + DataExtractor(debug_info_data, block_offset, block_length), + die.GetCU()); + } else if (DWARFFormValue::IsDataForm(const_value_form.Form())) { + // Constant value size does not have to match the size of the + // variable. 
We will fetch the size of the type after we create + // it. + use_type_size_for_value = true; + } else if (const char *str = const_value_form.AsCString()) { + uint32_t string_length = strlen(str) + 1; + location = DWARFExpression( + module, + DataExtractor(str, string_length, die.GetCU()->GetByteOrder(), + die.GetCU()->GetAddressByteSize()), + die.GetCU()); + } + } + const DWARFDIE parent_context_die = GetDeclContextDIEContainingDIE(die); const dw_tag_t parent_tag = die.GetParent().Tag(); bool is_static_member = @@ -3441,12 +3418,12 @@ VariableSP SymbolFileDWARF::ParseVariableDIE(const SymbolContext &sc, } if (symbol_context_scope) { - SymbolFileTypeSP type_sp( - new SymbolFileType(*this, GetUID(type_die_form.Reference()))); + auto type_sp = std::make_shared( + *this, GetUID(type_die_form.Reference())); - if (const_value.Form() && type_sp && type_sp->GetType()) + if (use_type_size_for_value && type_sp->GetType()) location.UpdateValue( - const_value.Unsigned(), + const_value_form.Unsigned(), type_sp->GetType()->GetByteSize(nullptr).getValueOr(0), die.GetCU()->GetAddressByteSize()); diff --git a/lldb/source/Symbol/LocateSymbolFile.cpp b/lldb/source/Symbol/LocateSymbolFile.cpp index 95ae2ca7917a0..af4bbb6e53608 100644 --- a/lldb/source/Symbol/LocateSymbolFile.cpp +++ b/lldb/source/Symbol/LocateSymbolFile.cpp @@ -16,6 +16,7 @@ #include "lldb/Utility/DataBuffer.h" #include "lldb/Utility/DataExtractor.h" #include "lldb/Utility/Log.h" +#include "lldb/Utility/Reproducer.h" #include "lldb/Utility/StreamString.h" #include "lldb/Utility/Timer.h" #include "lldb/Utility/UUID.h" @@ -225,6 +226,7 @@ static FileSpec LocateExecutableSymbolFileDsym(const ModuleSpec &module_spec) { } else { dsym_module_spec.GetSymbolFileSpec() = symbol_fspec; } + return dsym_module_spec.GetSymbolFileSpec(); } @@ -248,6 +250,7 @@ ModuleSpec Symbols::LocateExecutableObjectFile(const ModuleSpec &module_spec) { } else { LocateMacOSXFilesUsingDebugSymbols(module_spec, result); } + return result; } diff 
--git a/lldb/source/Symbol/LocateSymbolFileMacOSX.cpp b/lldb/source/Symbol/LocateSymbolFileMacOSX.cpp index 251605085c588..fe30ceeabcb2b 100644 --- a/lldb/source/Symbol/LocateSymbolFileMacOSX.cpp +++ b/lldb/source/Symbol/LocateSymbolFileMacOSX.cpp @@ -27,6 +27,7 @@ #include "lldb/Utility/DataExtractor.h" #include "lldb/Utility/Endian.h" #include "lldb/Utility/Log.h" +#include "lldb/Utility/ReproducerProvider.h" #include "lldb/Utility/StreamString.h" #include "lldb/Utility/Timer.h" #include "lldb/Utility/UUID.h" @@ -53,6 +54,17 @@ int LocateMacOSXFilesUsingDebugSymbols(const ModuleSpec &module_spec, return_module_spec.GetFileSpec().Clear(); return_module_spec.GetSymbolFileSpec().Clear(); + const UUID *uuid = module_spec.GetUUIDPtr(); + const ArchSpec *arch = module_spec.GetArchitecturePtr(); + + if (repro::Loader *l = repro::Reproducer::Instance().GetLoader()) { + static repro::SymbolFileLoader symbol_file_loader(l); + std::pair paths = symbol_file_loader.GetPaths(uuid); + return_module_spec.GetFileSpec() = paths.first; + return_module_spec.GetSymbolFileSpec() = paths.second; + return 1; + } + int items_found = 0; if (g_dlsym_DBGCopyFullDSYMURLForUUID == nullptr || @@ -69,9 +81,6 @@ int LocateMacOSXFilesUsingDebugSymbols(const ModuleSpec &module_spec, return items_found; } - const UUID *uuid = module_spec.GetUUIDPtr(); - const ArchSpec *arch = module_spec.GetArchitecturePtr(); - if (uuid && uuid->IsValid()) { // Try and locate the dSYM file using DebugSymbols first llvm::ArrayRef module_uuid = uuid->GetBytes(); @@ -247,6 +256,12 @@ int LocateMacOSXFilesUsingDebugSymbols(const ModuleSpec &module_spec, } } + if (repro::Generator *g = repro::Reproducer::Instance().GetGenerator()) { + g->GetOrCreate().AddSymbolFile( + uuid, return_module_spec.GetFileSpec(), + return_module_spec.GetSymbolFileSpec()); + } + return items_found; } @@ -464,6 +479,25 @@ bool Symbols::DownloadObjectAndSymbolFile(ModuleSpec &module_spec, const UUID *uuid_ptr = module_spec.GetUUIDPtr(); const 
FileSpec *file_spec_ptr = module_spec.GetFileSpecPtr(); + if (repro::Loader *l = repro::Reproducer::Instance().GetLoader()) { + static repro::SymbolFileLoader symbol_file_loader(l); + std::pair paths = symbol_file_loader.GetPaths(uuid_ptr); + if (paths.first) + module_spec.GetFileSpec() = paths.first; + if (paths.second) + module_spec.GetSymbolFileSpec() = paths.second; + return true; + } + + // Lambda to capture the state of module_spec before returning from this + // function. + auto RecordResult = [&]() { + if (repro::Generator *g = repro::Reproducer::Instance().GetGenerator()) { + g->GetOrCreate().AddSymbolFile( + uuid_ptr, module_spec.GetFileSpec(), module_spec.GetSymbolFileSpec()); + } + }; + // It's expensive to check for the DBGShellCommands defaults setting, only do // it once per lldb run and cache the result. static bool g_have_checked_for_dbgshell_command = false; @@ -489,6 +523,7 @@ bool Symbols::DownloadObjectAndSymbolFile(ModuleSpec &module_spec, // When g_dbgshell_command is NULL, the user has not enabled the use of an // external program to find the symbols, don't run it for them. 
if (!force_lookup && g_dbgshell_command == NULL) { + RecordResult(); return false; } @@ -613,8 +648,10 @@ bool Symbols::DownloadObjectAndSymbolFile(ModuleSpec &module_spec, ::CFDictionaryGetKeysAndValues(plist.get(), NULL, (const void **)&values[0]); if (num_values == 1) { - return GetModuleSpecInfoFromUUIDDictionary(values[0], - module_spec); + success = GetModuleSpecInfoFromUUIDDictionary(values[0], + module_spec); + RecordResult(); + return success; } else { for (CFIndex i = 0; i < num_values; ++i) { ModuleSpec curr_module_spec; @@ -623,6 +660,7 @@ bool Symbols::DownloadObjectAndSymbolFile(ModuleSpec &module_spec, if (module_spec.GetArchitecture().IsCompatibleMatch( curr_module_spec.GetArchitecture())) { module_spec = curr_module_spec; + RecordResult(); return true; } } @@ -644,5 +682,6 @@ bool Symbols::DownloadObjectAndSymbolFile(ModuleSpec &module_spec, } } } + RecordResult(); return success; } diff --git a/lldb/source/Symbol/Type.cpp b/lldb/source/Symbol/Type.cpp index ecf0575b9a57e..378523d008968 100644 --- a/lldb/source/Symbol/Type.cpp +++ b/lldb/source/Symbol/Type.cpp @@ -375,6 +375,7 @@ llvm::Optional Type::GetByteSize(ExecutionContextScope *exe_scope) { if (ArchSpec arch = m_symbol_file->GetObjectFile()->GetArchitecture()) { m_byte_size = arch.GetAddressByteSize(); m_byte_size_has_value = true; + return m_byte_size; } } break; } diff --git a/lldb/source/Symbol/UnwindPlan.cpp b/lldb/source/Symbol/UnwindPlan.cpp index e8906f38e2ffe..278a79d8c0ecf 100644 --- a/lldb/source/Symbol/UnwindPlan.cpp +++ b/lldb/source/Symbol/UnwindPlan.cpp @@ -527,6 +527,18 @@ void UnwindPlan::Dump(Stream &s, Thread *thread, lldb::addr_t base_addr) const { s.Printf("not specified.\n"); break; } + s.Printf("This UnwindPlan is for a trap handler function: "); + switch (m_plan_is_for_signal_trap) { + case eLazyBoolYes: + s.Printf("yes.\n"); + break; + case eLazyBoolNo: + s.Printf("no.\n"); + break; + case eLazyBoolCalculate: + s.Printf("not specified.\n"); + break; + } if 
(m_plan_valid_address_range.GetBaseAddress().IsValid() && m_plan_valid_address_range.GetByteSize() > 0) { s.PutCString("Address range of this UnwindPlan: "); diff --git a/lldb/source/Utility/Reproducer.cpp b/lldb/source/Utility/Reproducer.cpp index 9276c7449d7b2..68c64195f55ee 100644 --- a/lldb/source/Utility/Reproducer.cpp +++ b/lldb/source/Utility/Reproducer.cpp @@ -73,6 +73,10 @@ llvm::Error Reproducer::Initialize(ReproducerMode mode, return Error::success(); } +void Reproducer::Initialize() { + llvm::cantFail(Initialize(repro::ReproducerMode::Off, llvm::None)); +} + bool Reproducer::Initialized() { return InstanceImpl().operator bool(); } void Reproducer::Terminate() { diff --git a/lldb/source/Utility/ReproducerProvider.cpp b/lldb/source/Utility/ReproducerProvider.cpp index 54f3a870b7dd8..f5556659390bf 100644 --- a/lldb/source/Utility/ReproducerProvider.cpp +++ b/lldb/source/Utility/ReproducerProvider.cpp @@ -105,6 +105,61 @@ void ProcessInfoRecorder::Record(const ProcessInstanceInfoList &process_infos) { m_os.flush(); } +void SymbolFileProvider::AddSymbolFile(const UUID *uuid, + const FileSpec &module_file, + const FileSpec &symbol_file) { + if (!uuid || (!module_file && !symbol_file)) + return; + m_symbol_files.emplace_back(uuid->GetAsString(), module_file.GetPath(), + symbol_file.GetPath()); +} + +void SymbolFileProvider::Keep() { + FileSpec file = this->GetRoot().CopyByAppendingPathComponent(Info::file); + std::error_code ec; + llvm::raw_fd_ostream os(file.GetPath(), ec, llvm::sys::fs::OF_Text); + if (ec) + return; + + // Remove duplicates. 
+ llvm::sort(m_symbol_files.begin(), m_symbol_files.end()); + m_symbol_files.erase( + std::unique(m_symbol_files.begin(), m_symbol_files.end()), + m_symbol_files.end()); + + llvm::yaml::Output yout(os); + yout << m_symbol_files; +} + +SymbolFileLoader::SymbolFileLoader(Loader *loader) { + if (!loader) + return; + + FileSpec file = loader->GetFile(); + if (!file) + return; + + auto error_or_file = llvm::MemoryBuffer::getFile(file.GetPath()); + if (auto err = error_or_file.getError()) + return; + + llvm::yaml::Input yin((*error_or_file)->getBuffer()); + yin >> m_symbol_files; +} + +std::pair +SymbolFileLoader::GetPaths(const UUID *uuid) const { + if (!uuid) + return {}; + + auto it = std::lower_bound(m_symbol_files.begin(), m_symbol_files.end(), + SymbolFileProvider::Entry(uuid->GetAsString())); + if (it == m_symbol_files.end()) + return {}; + return std::make_pair(FileSpec(it->module_path), + FileSpec(it->symbol_path)); +} + void ProviderBase::anchor() {} char CommandProvider::ID = 0; char FileProvider::ID = 0; @@ -113,6 +168,7 @@ char VersionProvider::ID = 0; char WorkingDirectoryProvider::ID = 0; char HomeDirectoryProvider::ID = 0; char ProcessInfoProvider::ID = 0; +char SymbolFileProvider::ID = 0; const char *CommandProvider::Info::file = "command-interpreter.yaml"; const char *CommandProvider::Info::name = "command-interpreter"; const char *FileProvider::Info::file = "files.yaml"; @@ -125,3 +181,5 @@ const char *HomeDirectoryProvider::Info::file = "home.txt"; const char *HomeDirectoryProvider::Info::name = "home"; const char *ProcessInfoProvider::Info::file = "process-info.yaml"; const char *ProcessInfoProvider::Info::name = "process-info"; +const char *SymbolFileProvider::Info::file = "symbol-files.yaml"; +const char *SymbolFileProvider::Info::name = "symbol-files"; diff --git a/lldb/source/Utility/Scalar.cpp b/lldb/source/Utility/Scalar.cpp index e5a1454561f26..3dd2813b5eb07 100644 --- a/lldb/source/Utility/Scalar.cpp +++ b/lldb/source/Utility/Scalar.cpp @@ 
-25,6 +25,7 @@ using namespace lldb_private; using llvm::APFloat; using llvm::APInt; +using llvm::APSInt; Scalar::Category Scalar::GetCategory(Scalar::Type type) { switch (type) { @@ -32,32 +33,19 @@ Scalar::Category Scalar::GetCategory(Scalar::Type type) { return Category::Void; case Scalar::e_float: return Category::Float; - case Scalar::e_sint: - case Scalar::e_uint: + case Scalar::e_int: return Category::Integral; } llvm_unreachable("Unhandled type!"); } -static bool IsSigned(Scalar::Type type) { - switch (type) { - case Scalar::e_void: - case Scalar::e_uint: - return false; - case Scalar::e_sint: - case Scalar::e_float: - return true; - } - llvm_unreachable("Unhandled type!"); -} - Scalar::PromotionKey Scalar::GetPromoKey() const { Category cat = GetCategory(m_type); switch (cat) { case Category::Void: - return {cat, 0, false}; + return PromotionKey{cat, 0, false}; case Category::Integral: - return {cat, m_integer.getBitWidth(), !IsSigned(m_type)}; + return PromotionKey{cat, m_integer.getBitWidth(), m_integer.isUnsigned()}; case Category::Float: return GetFloatPromoKey(m_float.getSemantics()); } @@ -70,7 +58,7 @@ Scalar::PromotionKey Scalar::GetFloatPromoKey(const llvm::fltSemantics &sem) { &APFloat::x87DoubleExtended()}; for (const auto &entry : llvm::enumerate(order)) { if (entry.value() == &sem) - return {Category::Float, entry.index(), false}; + return PromotionKey{Category::Float, entry.index(), false}; } llvm_unreachable("Unsupported semantics!"); } @@ -83,8 +71,7 @@ Scalar::Type Scalar::PromoteToMaxType(Scalar &lhs, Scalar &rhs) { case Category::Void: break; case Category::Integral: - a.IntegralPromote(b.UInt128(APInt()).getBitWidth(), - IsSigned(b.GetType())); + a.IntegralPromote(b.m_integer.getBitWidth(), b.m_integer.isSigned()); break; case Category::Float: a.FloatPromote(b.m_float.getSemantics()); @@ -158,8 +145,7 @@ size_t Scalar::GetByteSize() const { switch (m_type) { case e_void: break; - case e_sint: - case e_uint: + case e_int: return 
(m_integer.getBitWidth() / 8); case e_float: return m_float.bitcastToAPInt().getBitWidth() / 8; @@ -187,7 +173,7 @@ void Scalar::GetValue(Stream *s, bool show_type) const { case Category::Void: break; case Category::Integral: - s->PutCString(m_integer.toString(10, IsSigned(m_type))); + s->PutCString(m_integer.toString(10)); break; case Category::Float: llvm::SmallString<24> string; @@ -198,8 +184,8 @@ void Scalar::GetValue(Stream *s, bool show_type) const { } void Scalar::TruncOrExtendTo(uint16_t bits, bool sign) { - m_integer = sign ? m_integer.sextOrTrunc(bits) : m_integer.zextOrTrunc(bits); - m_type = sign ? e_sint : e_uint; + m_integer.setIsSigned(sign); + m_integer = m_integer.extOrTrunc(bits); } bool Scalar::IntegralPromote(uint16_t bits, bool sign) { @@ -210,11 +196,8 @@ bool Scalar::IntegralPromote(uint16_t bits, bool sign) { case Category::Integral: if (GetPromoKey() > PromotionKey(Category::Integral, bits, !sign)) break; - if (IsSigned(m_type)) - m_integer = m_integer.sextOrTrunc(bits); - else - m_integer = m_integer.zextOrTrunc(bits); - m_type = sign ? 
e_sint : e_uint; + m_integer = m_integer.extOrTrunc(bits); + m_integer.setIsSigned(sign); return true; } return false; @@ -227,7 +210,7 @@ bool Scalar::FloatPromote(const llvm::fltSemantics &semantics) { break; case Category::Integral: m_float = llvm::APFloat(semantics); - m_float.convertFromAPInt(m_integer, IsSigned(m_type), + m_float.convertFromAPInt(m_integer, m_integer.isSigned(), llvm::APFloat::rmNearestTiesToEven); success = true; break; @@ -248,27 +231,34 @@ const char *Scalar::GetValueTypeAsCString(Scalar::Type type) { switch (type) { case e_void: return "void"; - case e_sint: - return "signed int"; - case e_uint: - return "unsigned int"; + case e_int: + return "int"; case e_float: return "float"; } return "???"; } +bool Scalar::IsSigned() const { + switch (m_type) { + case e_void: + return false; + case e_int: + return m_integer.isSigned(); + case e_float: + return true; + } + llvm_unreachable("Unrecognized type!"); +} + bool Scalar::MakeSigned() { bool success = false; switch (m_type) { case e_void: break; - case e_sint: - success = true; - break; - case e_uint: - m_type = e_sint; + case e_int: + m_integer.setIsSigned(true); success = true; break; case e_float: @@ -285,11 +275,8 @@ bool Scalar::MakeUnsigned() { switch (m_type) { case e_void: break; - case e_sint: - m_type = e_uint; - success = true; - break; - case e_uint: + case e_int: + m_integer.setIsUnsigned(true); success = true; break; case e_float: @@ -312,10 +299,12 @@ template T Scalar::GetAs(T fail_value) const { switch (GetCategory(m_type)) { case Category::Void: break; - case Category::Integral: - if (IsSigned(m_type)) - return m_integer.sextOrTrunc(sizeof(T) * 8).getSExtValue(); - return m_integer.zextOrTrunc(sizeof(T) * 8).getZExtValue(); + case Category::Integral: { + APSInt ext = m_integer.extOrTrunc(sizeof(T) * 8); + if (ext.isSigned()) + return ext.getSExtValue(); + return ext.getZExtValue(); + } case Category::Float: return ToAPInt(m_float, sizeof(T) * 8, std::is_unsigned::value) 
.getSExtValue(); @@ -388,7 +377,7 @@ float Scalar::Float(float fail_value) const { case Category::Void: break; case Category::Integral: - if (IsSigned(m_type)) + if (m_integer.isSigned()) return llvm::APIntOps::RoundSignedAPIntToFloat(m_integer); return llvm::APIntOps::RoundAPIntToFloat(m_integer); @@ -408,7 +397,7 @@ double Scalar::Double(double fail_value) const { case Category::Void: break; case Category::Integral: - if (IsSigned(m_type)) + if (m_integer.isSigned()) return llvm::APIntOps::RoundSignedAPIntToDouble(m_integer); return llvm::APIntOps::RoundAPIntToDouble(m_integer); @@ -449,7 +438,7 @@ Scalar &Scalar::operator+=(Scalar rhs) { Scalar &Scalar::operator<<=(const Scalar &rhs) { if (GetCategory(m_type) == Category::Integral && GetCategory(rhs.m_type) == Category::Integral) - m_integer <<= rhs.m_integer; + static_cast(m_integer) <<= rhs.m_integer; else m_type = e_void; return *this; @@ -472,15 +461,13 @@ Scalar &Scalar::operator>>=(const Scalar &rhs) { m_type = e_void; break; - case e_sint: - case e_uint: + case e_int: switch (rhs.m_type) { case e_void: case e_float: m_type = e_void; break; - case e_sint: - case e_uint: + case e_int: m_integer = m_integer.ashr(rhs.m_integer); break; } @@ -503,13 +490,11 @@ bool Scalar::AbsoluteValue() { case e_void: break; - case e_sint: + case e_int: if (m_integer.isNegative()) m_integer = -m_integer; return true; - case e_uint: - return true; case e_float: m_float.clearSign(); return true; @@ -571,10 +556,7 @@ const Scalar lldb_private::operator/(Scalar lhs, Scalar rhs) { case Scalar::Category::Void: break; case Scalar::Category::Integral: - if (IsSigned(result.m_type)) - result.m_integer = lhs.m_integer.sdiv(rhs.m_integer); - else - result.m_integer = lhs.m_integer.udiv(rhs.m_integer); + result.m_integer = lhs.m_integer / rhs.m_integer; return result; case Scalar::Category::Float: result.m_float = lhs.m_float / rhs.m_float; @@ -631,10 +613,7 @@ const Scalar lldb_private::operator%(Scalar lhs, Scalar rhs) { if 
((result.m_type = Scalar::PromoteToMaxType(lhs, rhs)) != Scalar::e_void) { if (!rhs.IsZero() && Scalar::GetCategory(result.m_type) == Scalar::Category::Integral) { - if (IsSigned(result.m_type)) - result.m_integer = lhs.m_integer.srem(rhs.m_integer); - else - result.m_integer = lhs.m_integer.urem(rhs.m_integer); + result.m_integer = lhs.m_integer % rhs.m_integer; return result; } } @@ -702,13 +681,9 @@ Status Scalar::SetValueFromCString(const char *value_str, Encoding encoding, value_str, byte_size); break; } - if (is_signed) { - m_type = e_sint; - m_integer = integer.sextOrTrunc(8 * byte_size); - } else { - m_type = e_uint; - m_integer = integer.zextOrTrunc(8 * byte_size); - } + m_type = e_int; + m_integer = + APSInt(std::move(integer), !is_signed).extOrTrunc(8 * byte_size); break; } @@ -754,9 +729,10 @@ Status Scalar::SetValueFromData(const DataExtractor &data, case lldb::eEncodingSint: { if (data.GetByteSize() < byte_size) return Status("insufficient data"); - m_type = encoding == lldb::eEncodingSint ? 
e_sint : e_uint; + m_type = e_int; + m_integer = + APSInt(APInt::getNullValue(8 * byte_size), encoding == eEncodingUint); if (data.GetByteOrder() == endian::InlHostByteOrder()) { - m_integer = APInt::getNullValue(8 * byte_size); llvm::LoadIntFromMemory(m_integer, data.GetDataStart(), byte_size); } else { std::vector buffer(byte_size); @@ -792,17 +768,16 @@ bool Scalar::SignExtend(uint32_t sign_bit_pos) { case Scalar::e_float: return false; - case Scalar::e_sint: - case Scalar::e_uint: + case Scalar::e_int: if (max_bit_pos == sign_bit_pos) return true; else if (sign_bit_pos < (max_bit_pos - 1)) { llvm::APInt sign_bit = llvm::APInt::getSignMask(sign_bit_pos + 1); llvm::APInt bitwize_and = m_integer & sign_bit; if (bitwize_and.getBoolValue()) { - const llvm::APInt mask = + llvm::APInt mask = ~(sign_bit) + llvm::APInt(m_integer.getBitWidth(), 1); - m_integer |= mask; + m_integer |= APSInt(std::move(mask), m_integer.isUnsigned()); } return true; } @@ -846,16 +821,9 @@ bool Scalar::ExtractBitfield(uint32_t bit_size, uint32_t bit_offset) { case Scalar::e_float: break; - case Scalar::e_sint: - m_integer = m_integer.ashr(bit_offset) - .sextOrTrunc(bit_size) - .sextOrSelf(8 * GetByteSize()); - return true; - - case Scalar::e_uint: - m_integer = m_integer.lshr(bit_offset) - .zextOrTrunc(bit_size) - .zextOrSelf(8 * GetByteSize()); + case Scalar::e_int: + m_integer >>= bit_offset; + m_integer = m_integer.extOrTrunc(bit_size).extOrTrunc(8 * GetByteSize()); return true; } return false; @@ -870,8 +838,7 @@ bool lldb_private::operator==(Scalar lhs, Scalar rhs) { switch (Scalar::PromoteToMaxType(lhs, rhs)) { case Scalar::e_void: break; - case Scalar::e_sint: - case Scalar::e_uint: + case Scalar::e_int: return lhs.m_integer == rhs.m_integer; case Scalar::e_float: result = lhs.m_float.compare(rhs.m_float); @@ -893,10 +860,8 @@ bool lldb_private::operator<(Scalar lhs, Scalar rhs) { switch (Scalar::PromoteToMaxType(lhs, rhs)) { case Scalar::e_void: break; - case Scalar::e_sint: - return 
lhs.m_integer.slt(rhs.m_integer); - case Scalar::e_uint: - return lhs.m_integer.ult(rhs.m_integer); + case Scalar::e_int: + return lhs.m_integer < rhs.m_integer; case Scalar::e_float: result = lhs.m_float.compare(rhs.m_float); if (result == llvm::APFloat::cmpLessThan) @@ -921,8 +886,7 @@ bool Scalar::ClearBit(uint32_t bit) { switch (m_type) { case e_void: break; - case e_sint: - case e_uint: + case e_int: m_integer.clearBit(bit); return true; case e_float: @@ -935,8 +899,7 @@ bool Scalar::SetBit(uint32_t bit) { switch (m_type) { case e_void: break; - case e_sint: - case e_uint: + case e_int: m_integer.setBit(bit); return true; case e_float: diff --git a/lldb/source/Utility/StringExtractorGDBRemote.cpp b/lldb/source/Utility/StringExtractorGDBRemote.cpp index cfe7577e4863a..2901500b29e30 100644 --- a/lldb/source/Utility/StringExtractorGDBRemote.cpp +++ b/lldb/source/Utility/StringExtractorGDBRemote.cpp @@ -233,6 +233,8 @@ StringExtractorGDBRemote::GetServerPacketType() const { return eServerPacketType_qPlatform_chmod; if (PACKET_MATCHES("qProcessInfo")) return eServerPacketType_qProcessInfo; + if (PACKET_STARTS_WITH("qPathComplete:")) + return eServerPacketType_qPathComplete; break; case 'Q': diff --git a/lldb/test/API/functionalities/asan/TestMemoryHistory.py b/lldb/test/API/functionalities/asan/TestMemoryHistory.py index 37c34984f43b2..0b8dc20f27c53 100644 --- a/lldb/test/API/functionalities/asan/TestMemoryHistory.py +++ b/lldb/test/API/functionalities/asan/TestMemoryHistory.py @@ -15,6 +15,9 @@ class AsanTestCase(TestBase): mydir = TestBase.compute_mydir(__file__) + @expectedFailureAll( + oslist=["linux"], + bugnumber="non-core functionality, need to reenable and fix later (DES 2014.11.07)") @skipIfFreeBSD # llvm.org/pr21136 runtimes not yet available by default @expectedFailureNetBSD @skipUnlessAddressSanitizer diff --git a/lldb/test/API/functionalities/completion/TestCompletion.py b/lldb/test/API/functionalities/completion/TestCompletion.py index 
4c81288b38364..b80594b7568bc 100644 --- a/lldb/test/API/functionalities/completion/TestCompletion.py +++ b/lldb/test/API/functionalities/completion/TestCompletion.py @@ -5,6 +5,7 @@ import os +from multiprocessing import Process import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * @@ -117,6 +118,38 @@ def test_process_plugin_completion(self): self.complete_from_to('process ' + subcommand + ' mac', 'process ' + subcommand + ' mach-o-core') + def completions_contain_str(self, input, needle): + interp = self.dbg.GetCommandInterpreter() + match_strings = lldb.SBStringList() + num_matches = interp.HandleCompletion(input, len(input), 0, -1, match_strings) + found_needle = False + for match in match_strings: + if needle in match: + found_needle = True + break + self.assertTrue(found_needle, "Returned completions: " + "\n".join(match_strings)) + + + @skipIfRemote + def test_common_completion_process_pid_and_name(self): + # The LLDB process itself and the process already attached to are both + # ignored by the process discovery mechanism, thus we need a process known + # to us here. + self.build() + server = self.spawnSubprocess( + self.getBuildArtifact("a.out"), + ["-x"], # Arg "-x" makes the subprocess wait for input thus it won't be terminated too early + install_remote=False) + self.assertIsNotNone(server) + pid = server.pid + + self.complete_from_to('process attach -p ', [str(pid)]) + self.complete_from_to('platform process attach -p ', [str(pid)]) + self.complete_from_to('platform process info ', [str(pid)]) + + self.completions_contain_str('process attach -n ', "a.out") + self.completions_contain_str('platform process attach -n ', "a.out") + def test_process_signal(self): # The tab completion for "process signal" won't work without a running process. 
self.complete_from_to('process signal ', @@ -445,6 +478,12 @@ def test_common_completion_thread_index(self): for subcommand in subcommands: self.complete_from_to('thread ' + subcommand + ' ', ['1']) + def test_common_completion_type_category_name(self): + subcommands = ['delete', 'list', 'enable', 'disable', 'define'] + for subcommand in subcommands: + self.complete_from_to('type category ' + subcommand + ' ', ['default']) + self.complete_from_to('type filter add -w ', ['default']) + def test_command_argument_completion(self): """Test completion of command arguments""" self.complete_from_to("watchpoint set variable -", ["-w", "-s"]) diff --git a/lldb/test/API/functionalities/completion/main.cpp b/lldb/test/API/functionalities/completion/main.cpp index eba81dc4c54cd..06ff5773e8a9d 100644 --- a/lldb/test/API/functionalities/completion/main.cpp +++ b/lldb/test/API/functionalities/completion/main.cpp @@ -1,3 +1,5 @@ +#include + class Foo { public: @@ -11,14 +13,16 @@ namespace { int Quux (void) { return 0; } } struct Container { int MemberVar; }; -int main() -{ - Foo fooo; - Foo *ptr_fooo = &fooo; - fooo.Bar(1, 2); +int main(int argc, char *argv[]) { + if (argc > 1 && std::string(argv[1]) == "-x") + std::cin.get(); + + Foo fooo; + Foo *ptr_fooo = &fooo; + fooo.Bar(1, 2); - Container container; - Container *ptr_container = &container; - int q = Quux(); - return container.MemberVar = 3; // Break here + Container container; + Container *ptr_container = &container; + int q = Quux(); + return container.MemberVar = 3; // Break here } diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestGDBRemoteDiskFileCompletion.py b/lldb/test/API/functionalities/gdb_remote_client/TestGDBRemoteDiskFileCompletion.py new file mode 100644 index 0000000000000..90f830c4a2787 --- /dev/null +++ b/lldb/test/API/functionalities/gdb_remote_client/TestGDBRemoteDiskFileCompletion.py @@ -0,0 +1,29 @@ +from gdbclientutils import * + +class TestGDBRemoteDiskFileCompletion(GDBRemoteTestBase): + 
+ def test_autocomplete_request(self): + """Test remote disk completion on remote-gdb-server plugin""" + + class Responder(MockGDBServerResponder): + def qPathComplete(self): + return "M{},{}".format( + "test".encode().hex(), + "123".encode().hex() + ) + + self.server.responder = Responder() + + try: + self.runCmd("platform select remote-gdb-server") + self.runCmd("platform connect connect://localhost:%d" % + self.server.port) + self.assertTrue(self.dbg.GetSelectedPlatform().IsConnected()) + + self.complete_from_to('platform get-size ', ['test', '123']) + self.complete_from_to('platform get-file ', ['test', '123']) + self.complete_from_to('platform put-file foo ', ['test', '123']) + self.complete_from_to('platform file open ', ['test', '123']) + self.complete_from_to('platform settings -w ', ['test', '123']) + finally: + self.dbg.GetSelectedPlatform().DisconnectRemote() diff --git a/lldb/test/API/functionalities/gdb_remote_client/gdbclientutils.py b/lldb/test/API/functionalities/gdb_remote_client/gdbclientutils.py index feda8873d8724..eb789e861d9c3 100644 --- a/lldb/test/API/functionalities/gdb_remote_client/gdbclientutils.py +++ b/lldb/test/API/functionalities/gdb_remote_client/gdbclientutils.py @@ -178,6 +178,8 @@ def respond(self, packet): return self.qsProcessInfo() if packet.startswith("qfProcessInfo"): return self.qfProcessInfo(packet) + if packet.startswith("qPathComplete:"): + return self.qPathComplete() return self.other(packet) @@ -282,6 +284,9 @@ def QListThreadsInStopReply(self): def qMemoryRegionInfo(self): return "" + def qPathComplete(self): + return "" + """ Raised when we receive a packet for which there is no default action. 
Override the responder class to implement behavior suitable for the test at diff --git a/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpUUID.py b/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpUUID.py index cc6d6fb37caed..c4dcddba631b9 100644 --- a/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpUUID.py +++ b/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpUUID.py @@ -179,6 +179,69 @@ def test_partial_uuid_mismatch(self): "/invalid/path/on/current/system/libuuidmismatch.so", "7295E17C-6668-9E05-CBB5-DEE5003865D5") + def test_breakpad_hash_match(self): + """ + Breakpad creates minidump files using CvRecord in each module whose + signature is set to PDB70 where the UUID is a hash generated by + breakpad of the .text section. This is only done when the + executable has no ELF build ID. + + This test verifies that if we have a minidump with a 16 byte UUID, + that we are able to associate a symbol file with no ELF build ID + and match it up by hashing the .text section. + """ + so_path = self.getBuildArtifact("libbreakpad.so") + self.yaml2obj("libbreakpad.yaml", so_path) + cmd = 'settings set target.exec-search-paths "%s"' % (os.path.dirname(so_path)) + self.dbg.HandleCommand(cmd) + modules = self.get_minidump_modules("linux-arm-breakpad-uuid-match.yaml") + self.assertEqual(1, len(modules)) + # LLDB makes up it own UUID as well when there is no build ID so we + # will check that this matches. + self.verify_module(modules[0], so_path, "D9C480E8") + + def test_breakpad_overflow_hash_match(self): + """ + This is a similar to test_breakpad_hash_match, but it verifies that + if the .text section does not end on a 16 byte boundary, then it + will overflow into the next section's data by up to 15 bytes. This + verifies that we are able to match what breakpad does as it will do + this. 
+ """ + so_path = self.getBuildArtifact("libbreakpad.so") + self.yaml2obj("libbreakpad-overflow.yaml", so_path) + cmd = 'settings set target.exec-search-paths "%s"' % (os.path.dirname(so_path)) + self.dbg.HandleCommand(cmd) + modules = self.get_minidump_modules("linux-arm-breakpad-uuid-match.yaml") + self.assertEqual(1, len(modules)) + # LLDB makes up it own UUID as well when there is no build ID so we + # will check that this matches. + self.verify_module(modules[0], so_path, "48EB9FD7") + + + def test_facebook_hash_match(self): + """ + Breakpad creates minidump files using CvRecord in each module whose + signature is set to PDB70 where the UUID is a hash generated by + breakpad of the .text section and Facebook modified this hash to + avoid collisions. This is only done when the executable has no ELF + build ID. + + This test verifies that if we have a minidump with a 16 byte UUID, + that we are able to associate a symbol file with no ELF build ID + and match it up by hashing the .text section like Facebook does. + """ + so_path = self.getBuildArtifact("libbreakpad.so") + self.yaml2obj("libbreakpad.yaml", so_path) + cmd = 'settings set target.exec-search-paths "%s"' % (os.path.dirname(so_path)) + self.dbg.HandleCommand(cmd) + modules = self.get_minidump_modules("linux-arm-facebook-uuid-match.yaml") + self.assertEqual(1, len(modules)) + # LLDB makes up it own UUID as well when there is no build ID so we + # will check that this matches. 
+ self.verify_module(modules[0], so_path, "D9C480E8") + + def test_relative_module_name(self): old_cwd = os.getcwd() self.addTearDownHook(lambda: os.chdir(old_cwd)) diff --git a/lldb/test/API/functionalities/postmortem/minidump-new/libbreakpad-overflow.yaml b/lldb/test/API/functionalities/postmortem/minidump-new/libbreakpad-overflow.yaml new file mode 100644 index 0000000000000..807a468f3d4bb --- /dev/null +++ b/lldb/test/API/functionalities/postmortem/minidump-new/libbreakpad-overflow.yaml @@ -0,0 +1,21 @@ +--- !ELF +FileHeader: + Class: ELFCLASS32 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_ARM + Flags: [ EF_ARM_SOFT_FLOAT, EF_ARM_EABI_VER5 ] +Sections: +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x0000000000010000 + AddressAlign: 0x0000000000000001 + Content: 04 + - Name: .data + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_WRITE ] + Address: 0x0000000000010001 + AddressAlign: 0x0000000000000001 + Content: 0000001400000003000000474E5500 diff --git a/lldb/test/API/functionalities/postmortem/minidump-new/libbreakpad.yaml b/lldb/test/API/functionalities/postmortem/minidump-new/libbreakpad.yaml new file mode 100644 index 0000000000000..53e96f601aa82 --- /dev/null +++ b/lldb/test/API/functionalities/postmortem/minidump-new/libbreakpad.yaml @@ -0,0 +1,15 @@ +--- !ELF +FileHeader: + Class: ELFCLASS32 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_ARM + Flags: [ EF_ARM_SOFT_FLOAT, EF_ARM_EABI_VER5 ] +Sections: +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x0000000000010000 + AddressAlign: 0x0000000000000004 + Content: 040000001400000003000000474E5500 diff --git a/lldb/test/API/functionalities/postmortem/minidump-new/linux-arm-breakpad-uuid-match.yaml b/lldb/test/API/functionalities/postmortem/minidump-new/linux-arm-breakpad-uuid-match.yaml new file mode 100644 index 0000000000000..37848982c5864 --- /dev/null +++ 
b/lldb/test/API/functionalities/postmortem/minidump-new/linux-arm-breakpad-uuid-match.yaml @@ -0,0 +1,15 @@ +--- !minidump +Streams: + - Type: SystemInfo + Processor Arch: ARM + Platform ID: Linux + CSD Version: '15E216' + CPU: + CPUID: 0x00000000 + - Type: ModuleList + Modules: + - Base of Image: 0x0000000000001000 + Size of Image: 0x00001000 + Module Name: '/invalid/path/on/current/system/libbreakpad.so' + CodeView Record: 52534453040000001400000003000000474e55000000000000 +... diff --git a/lldb/test/API/functionalities/postmortem/minidump-new/linux-arm-facebook-uuid-match.yaml b/lldb/test/API/functionalities/postmortem/minidump-new/linux-arm-facebook-uuid-match.yaml new file mode 100644 index 0000000000000..203fc669a0b87 --- /dev/null +++ b/lldb/test/API/functionalities/postmortem/minidump-new/linux-arm-facebook-uuid-match.yaml @@ -0,0 +1,15 @@ +--- !minidump +Streams: + - Type: SystemInfo + Processor Arch: ARM + Platform ID: Linux + CSD Version: '15E216' + CPU: + CPUID: 0x00000000 + - Type: ModuleList + Modules: + - Base of Image: 0x0000000000001000 + Size of Image: 0x00001000 + Module Name: '/invalid/path/on/current/system/libbreakpad.so' + CodeView Record: 52534453141010100410101013101010575e45100000000000 +... 
diff --git a/lldb/test/API/macosx/queues/TestQueues.py b/lldb/test/API/macosx/queues/TestQueues.py index e177daa54fa34..711c99a7d400d 100644 --- a/lldb/test/API/macosx/queues/TestQueues.py +++ b/lldb/test/API/macosx/queues/TestQueues.py @@ -192,7 +192,6 @@ def queues(self): user_initiated_thread = lldb.SBThread() user_interactive_thread = lldb.SBThread() utility_thread = lldb.SBThread() - unspecified_thread = lldb.SBThread() background_thread = lldb.SBThread() for th in process.threads: if th.GetName() == "user initiated QoS": @@ -201,8 +200,6 @@ def queues(self): user_interactive_thread = th if th.GetName() == "utility QoS": utility_thread = th - if th.GetName() == "unspecified QoS": - unspecified_thread = th if th.GetName() == "background QoS": background_thread = th @@ -213,9 +210,6 @@ def queues(self): user_interactive_thread.IsValid(), "Found user interactive QoS thread") self.assertTrue(utility_thread.IsValid(), "Found utility QoS thread") - self.assertTrue( - unspecified_thread.IsValid(), - "Found unspecified QoS thread") self.assertTrue( background_thread.IsValid(), "Found background QoS thread") @@ -248,16 +242,6 @@ def queues(self): stream.GetData(), "Utility", "utility QoS thread name is valid") stream.Clear() - self.assertTrue( - unspecified_thread.GetInfoItemByPathAsString( - "requested_qos.printable_name", - stream), - "Get QoS printable string for unspecified QoS thread") - qosName = stream.GetData() - self.assertTrue( - qosName == "User Initiated" or qosName == "Default", - "unspecified QoS thread name is valid: " + str(qosName)) - stream.Clear() self.assertTrue( background_thread.GetInfoItemByPathAsString( "requested_qos.printable_name", diff --git a/lldb/test/API/macosx/queues/main.c b/lldb/test/API/macosx/queues/main.c index 3978b92bff1a6..2bf390b1330a6 100644 --- a/lldb/test/API/macosx/queues/main.c +++ b/lldb/test/API/macosx/queues/main.c @@ -136,15 +136,9 @@ int main (int argc, const char **argv) while (1) sleep (10); }); - dispatch_async 
(dispatch_get_global_queue(QOS_CLASS_UNSPECIFIED, 0), ^{ - pthread_setname_np ("unspecified QoS"); - atomic_fetch_add(&thread_count, 1); - while (1) - sleep (10); - }); // Unfortunately there is no pthread_barrier on darwin. - while ((atomic_load(&thread_count) < 13) || (finished_enqueueing_work == 0)) + while ((atomic_load(&thread_count) < 12) || (finished_enqueueing_work == 0)) sleep (1); stopper (); diff --git a/lldb/test/API/tools/lldb-server/TestGdbRemoteCompletion.py b/lldb/test/API/tools/lldb-server/TestGdbRemoteCompletion.py new file mode 100644 index 0000000000000..94e628c811af3 --- /dev/null +++ b/lldb/test/API/tools/lldb-server/TestGdbRemoteCompletion.py @@ -0,0 +1,63 @@ +import tempfile +import gdbremote_testcase +from lldbsuite.test.lldbtest import * +from lldbsuite.test.decorators import * +from lldbgdbserverutils import * + +class GdbRemoteCompletionTestCase(gdbremote_testcase.GdbRemoteTestCaseBase): + mydir = TestBase.compute_mydir(__file__) + + def init_lldb_server(self): + self.debug_monitor_exe = get_lldb_server_exe() + if not self.debug_monitor_exe: + self.skipTest("lldb-server exe not found") + port_file = tempfile.NamedTemporaryFile().name + commandline_args = [ + "platform", + "--listen", + "*:0", + "--socket-file", + port_file + ] + server = self.spawnSubprocess( + get_lldb_server_exe(), + commandline_args, + install_remote=False) + self.assertIsNotNone(server) + self.stub_hostname = "localhost" + self.port = int(lldbutil.wait_for_file_on_target(self, port_file)) + self.sock = self.create_socket() + + self.add_no_ack_remote_stream() + + def generate_hex_path(self, target): + return str(os.path.join(self.getBuildDir(), target)).encode().hex() + + @skipIfDarwinEmbedded # lldb-server tests not updated to work on ios etc yet + @llgs_test + def test_autocomplete_path(self): + self.build() + self.init_lldb_server() + + # Test file-included completion when flag is set to 0. 
+ self.test_sequence.add_log_lines( + ["read packet: $qPathComplete:0,{}#00".format( + self.generate_hex_path("main")), + "send packet: $M{},{}#00".format( + self.generate_hex_path("main.d"), + self.generate_hex_path("main.o")) + ], + True) + + # Test directory-only completion when flag is set to 1. + os.makedirs(os.path.join(self.getBuildDir(), "test")) + self.test_sequence.add_log_lines( + ["read packet: $qPathComplete:1,{}#00".format( + self.generate_hex_path("tes")), + "send packet: $M{}{}#00".format( + self.generate_hex_path("test"), + os.path.sep.encode().hex()) # "test/" or "test\". + ], + True) + + self.expect_gdbremote_sequence() diff --git a/lldb/test/Shell/Minidump/Windows/arm-fp-unwind.test b/lldb/test/Shell/Minidump/Windows/arm-fp-unwind.test index ed871ac544a86..7c056b612a4e1 100644 --- a/lldb/test/Shell/Minidump/Windows/arm-fp-unwind.test +++ b/lldb/test/Shell/Minidump/Windows/arm-fp-unwind.test @@ -12,6 +12,7 @@ CHECK: Assembly language inspection UnwindPlan: CHECK-NEXT: This UnwindPlan originally sourced from EmulateInstructionARM CHECK-NEXT: This UnwindPlan is sourced from the compiler: no. CHECK-NEXT: This UnwindPlan is valid at all instruction locations: yes. +CHECK-NEXT: This UnwindPlan is for a trap handler function: no. 
CHECK-NEXT: row[0]: 0: CFA=sp +0 => CHECK-NEXT: row[1]: 4: CFA=sp +8 => fp=[CFA-8] lr=[CFA-4] CHECK-NEXT: row[2]: 6: CFA=fp +8 => fp=[CFA-8] lr=[CFA-4] diff --git a/lldb/test/Shell/Reproducer/Inputs/core b/lldb/test/Shell/Reproducer/Inputs/core new file mode 100644 index 0000000000000..4b8aabe65a8b6 Binary files /dev/null and b/lldb/test/Shell/Reproducer/Inputs/core differ diff --git a/lldb/test/Shell/Reproducer/Inputs/dsymforuuid.sh b/lldb/test/Shell/Reproducer/Inputs/dsymforuuid.sh new file mode 100755 index 0000000000000..ce5ade741ed65 --- /dev/null +++ b/lldb/test/Shell/Reproducer/Inputs/dsymforuuid.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +echo "" +echo "" +echo "" +echo "" +echo " AD52358C-94F8-3796-ADD6-B20FFAC00E5C" +echo " " +echo " DBGArchitecture" +echo " x86_64" +echo " DBGBuildSourcePath" +echo " /path/to/build/sources" +echo " DBGSourcePath" +echo " /path/to/actual/sources" +echo " DBGDSYMPath" +echo " /path/to/foo.dSYM/Contents/Resources/DWARF/foo" +echo " DBGSymbolRichExecutable" +echo " /path/to/unstripped/executable" +echo " " +echo "" +echo "" diff --git a/lldb/test/Shell/Reproducer/TestDebugSymbols.test b/lldb/test/Shell/Reproducer/TestDebugSymbols.test new file mode 100644 index 0000000000000..6a3cc1249cbd1 --- /dev/null +++ b/lldb/test/Shell/Reproducer/TestDebugSymbols.test @@ -0,0 +1,14 @@ +# REQUIRES: system-darwin + +# RUN: rm -rf %t.repro +# RUN: env LLDB_APPLE_DSYMFORUUID_EXECUTABLE=%S/Inputs/dsymforuuid.sh %lldb --capture --capture-path %t.repro -c %S/Inputs/core -o 'reproducer generate' + +# RUN: cat %t.repro/symbol-files.yaml | FileCheck %s --check-prefix YAML +# YAML: AD52358C-94F8-3796-ADD6-B20FFAC00E5C +# YAML: /path/to/unstripped/executable +# YAML: /path/to/foo.dSYM/Contents/Resources/DWARF/foo + +# RUN: %lldb -b -o 'reproducer dump -p symbol-files -f %t.repro' | FileCheck %s --check-prefix DUMP +# DUMP: uuid: AD52358C-94F8-3796-ADD6-B20FFAC00E5C +# DUMP-NEXT: module path: /path/to/unstripped/executable +# DUMP-NEXT: symbol 
path: /path/to/foo.dSYM/Contents/Resources/DWARF/foo diff --git a/lldb/test/Shell/SymbolFile/Breakpad/stack-cfi-parsing.test b/lldb/test/Shell/SymbolFile/Breakpad/stack-cfi-parsing.test index dd98939aa82e5..539b8096b58bc 100644 --- a/lldb/test/Shell/SymbolFile/Breakpad/stack-cfi-parsing.test +++ b/lldb/test/Shell/SymbolFile/Breakpad/stack-cfi-parsing.test @@ -10,6 +10,7 @@ image show-unwind -n func0 # CHECK-NEXT: This UnwindPlan originally sourced from breakpad STACK CFI # CHECK-NEXT: This UnwindPlan is sourced from the compiler: yes. # CHECK-NEXT: This UnwindPlan is valid at all instruction locations: no. +# CHECK-NEXT: This UnwindPlan is for a trap handler function: no. # CHECK-NEXT: Address range of this UnwindPlan: [stack-cfi-parsing.out..module_image + 0-0x0000000000000002) # CHECK-NEXT: row[0]: 0: CFA=DW_OP_breg7 +0 => rbp=DW_OP_breg7 +0 rip=DW_OP_pick 0x0 # CHECK-NEXT: row[1]: 1: CFA=DW_OP_breg7 +0 => rbx=DW_OP_breg2 +0 rbp=DW_OP_breg0 +0 rip=DW_OP_pick 0x0 diff --git a/lldb/test/Shell/SymbolFile/Breakpad/unwind-via-raSearch.test b/lldb/test/Shell/SymbolFile/Breakpad/unwind-via-raSearch.test index 1e280738900a6..1c1dabec59447 100644 --- a/lldb/test/Shell/SymbolFile/Breakpad/unwind-via-raSearch.test +++ b/lldb/test/Shell/SymbolFile/Breakpad/unwind-via-raSearch.test @@ -12,6 +12,7 @@ image show-unwind -n call_many # CHECK: This UnwindPlan originally sourced from breakpad STACK WIN # CHECK: This UnwindPlan is sourced from the compiler: yes. # CHECK: This UnwindPlan is valid at all instruction locations: no. +# CHECK: This UnwindPlan is for a trap handler function: no. 
# CHECK: Address range of this UnwindPlan: [unwind-via-stack-win.exe..module_image + 4112-0x0000107d) # CHECK: row[0]: 0: CFA=RaSearch@SP+0 => esp=DW_OP_pick 0x0, DW_OP_consts +4, DW_OP_plus eip=DW_OP_pick 0x0, DW_OP_deref diff --git a/lldb/test/Shell/SymbolFile/Breakpad/unwind-via-stack-cfi.test b/lldb/test/Shell/SymbolFile/Breakpad/unwind-via-stack-cfi.test index 62d0ef5ce5981..dade708519f5b 100644 --- a/lldb/test/Shell/SymbolFile/Breakpad/unwind-via-stack-cfi.test +++ b/lldb/test/Shell/SymbolFile/Breakpad/unwind-via-stack-cfi.test @@ -12,6 +12,7 @@ image show-unwind -n bar # CHECK-NEXT: This UnwindPlan originally sourced from breakpad STACK CFI # CHECK-NEXT: This UnwindPlan is sourced from the compiler: yes. # CHECK-NEXT: This UnwindPlan is valid at all instruction locations: no. +# CHECK-NEXT: This UnwindPlan is for a trap handler function: no. # CHECK-NEXT: Address range of this UnwindPlan: [unwind-via-stack-cfi..module_image + 1056-0x0000000000000449) # CHECK-NEXT: row[0]: 0: CFA=DW_OP_breg6 +0, DW_OP_deref => rbp=DW_OP_pick 0x0, DW_OP_deref rsp=DW_OP_pick 0x0, DW_OP_consts +16, DW_OP_plus rip=DW_OP_pick 0x0, DW_OP_consts +8, DW_OP_plus, DW_OP_deref diff --git a/lldb/test/Shell/SymbolFile/Breakpad/unwind-via-stack-win.test b/lldb/test/Shell/SymbolFile/Breakpad/unwind-via-stack-win.test index 2e4ac58c9bea2..5698a4f63878a 100644 --- a/lldb/test/Shell/SymbolFile/Breakpad/unwind-via-stack-win.test +++ b/lldb/test/Shell/SymbolFile/Breakpad/unwind-via-stack-win.test @@ -14,6 +14,7 @@ image show-unwind -n call_many # CHECK: This UnwindPlan originally sourced from breakpad STACK WIN # CHECK: This UnwindPlan is sourced from the compiler: yes. # CHECK: This UnwindPlan is valid at all instruction locations: no. +# CHECK: This UnwindPlan is for a trap handler function: no. 
# CHECK: Address range of this UnwindPlan: [unwind-via-stack-win.exe..module_image + 4112-0x0000107d) # CHECK: row[0]: 0: CFA=DW_OP_breg7 +0, DW_OP_consts +80, DW_OP_plus => esp=DW_OP_pick 0x0, DW_OP_consts +4, DW_OP_plus eip=DW_OP_pick 0x0, DW_OP_deref @@ -42,6 +43,7 @@ image show-unwind -n temporary_var # CHECK: This UnwindPlan originally sourced from breakpad STACK WIN # CHECK: This UnwindPlan is sourced from the compiler: yes. # CHECK: This UnwindPlan is valid at all instruction locations: no. +# CHECK: This UnwindPlan is for a trap handler function: no. # CHECK: Address range of this UnwindPlan: [unwind-via-stack-win.exe..module_image + 4400-0x00001134) # CHECK: row[0]: 0: CFA=DW_OP_breg7 +0 => esp=DW_OP_pick 0x0, DW_OP_consts +4, DW_OP_plus eip=DW_OP_pick 0x0, DW_OP_deref diff --git a/lldb/test/Shell/SymbolFile/DWARF/DW_AT_const_value-bitfields.s b/lldb/test/Shell/SymbolFile/DWARF/DW_AT_const_value-bitfields.s new file mode 100644 index 0000000000000..40f10d39130ec --- /dev/null +++ b/lldb/test/Shell/SymbolFile/DWARF/DW_AT_const_value-bitfields.s @@ -0,0 +1,408 @@ +# RUN: llvm-mc -filetype=obj -o %t -triple x86_64-apple-macosx10.15.0 %s +# RUN: %lldb %t -o "target variable constant" -b | FileCheck %s + +# CHECK: (lldb) target variable constant +# CHECK: (U) constant = { +# CHECK: raw = 1688469761 +# CHECK: = (a = 1, b = 1, c = 36, d = 2, e = 36, f = 1) +# CHECK: } + +# This is testing when how ValueObjectVariable handles the case where the +# DWARFExpression holds the data that represents a constant value. +# Compile at -O1 allows us to capture this case. 
Below is the code used +# to generate the assembly: +# +# typedef union +# { +# unsigned raw; +# struct +# { +# unsigned a : 8; +# unsigned b : 8; +# unsigned c : 6; +# unsigned d : 2; +# unsigned e : 6; +# unsigned f : 2; +# } ; +# } U; +# +# static U __attribute__((used)) _type_anchor; +# static const int constant = 0x64A40101; +# +# int g() { return constant; } +# +# int main() { +# U u; +# u.raw = 0x64A40101; +# } +# +# Compiled as follows: +# +# clang -gdwarf-4 -O1 dw_at_const_value_bug.c -S -o dw_at_const_value_bug.s +# +# I was able to obtain a global of type U with DW_AT_const_value but was able +# to using int. This required modifying the DW_AT_type of constant to be type +# U. After that stripping as much of the assembly as possible to give us a +# smaller reproducer. + + +.zerofill __DATA,__bss,__type_anchor,4,2 ## @_type_anchor + .no_dead_strip __type_anchor + .section __DWARF,__debug_str,regular,debug +Linfo_string: + .zero 90 + .asciz "constant" ## string offset=90 + .asciz "int" ## string offset=99 + .asciz "_type_anchor" ## string offset=103 + .asciz "U" ## string offset=116 + .asciz "raw" ## string offset=118 + .asciz "unsigned int" ## string offset=122 + .asciz "a" ## string offset=135 + .asciz "b" ## string offset=137 + .asciz "c" ## string offset=139 + .asciz "d" ## string offset=141 + .asciz "e" ## string offset=143 + .asciz "f" ## string offset=145 + .asciz "g" ## string offset=147 + .asciz "main" ## string offset=149 + .asciz "u" ## string offset=154 + .section __DWARF,__debug_abbrev,regular,debug +Lsection_abbrev: + .byte 1 ## Abbreviation Code + .byte 17 ## DW_TAG_compile_unit + .byte 1 ## DW_CHILDREN_yes + .byte 37 ## DW_AT_producer + .byte 14 ## DW_FORM_strp + .byte 19 ## DW_AT_language + .byte 5 ## DW_FORM_data2 + .byte 3 ## DW_AT_name + .byte 14 ## DW_FORM_strp + .byte 66 ## DW_AT_stmt_list + .byte 23 ## DW_FORM_sec_offset + .byte 27 ## DW_AT_comp_dir + .byte 14 ## DW_FORM_strp + .ascii "\264B" ## DW_AT_GNU_pubnames + .byte 25 ## 
DW_FORM_flag_present + .ascii "\341\177" ## DW_AT_APPLE_optimized + .byte 25 ## DW_FORM_flag_present + .byte 17 ## DW_AT_low_pc + .byte 1 ## DW_FORM_addr + .byte 18 ## DW_AT_high_pc + .byte 6 ## DW_FORM_data4 + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 2 ## Abbreviation Code + .byte 52 ## DW_TAG_variable + .byte 0 ## DW_CHILDREN_no + .byte 3 ## DW_AT_name + .byte 14 ## DW_FORM_strp + .byte 73 ## DW_AT_type + .byte 19 ## DW_FORM_ref4 + .byte 58 ## DW_AT_decl_file + .byte 11 ## DW_FORM_data1 + .byte 59 ## DW_AT_decl_line + .byte 11 ## DW_FORM_data1 + .byte 28 ## DW_AT_const_value + .byte 15 ## DW_FORM_udata + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 3 ## Abbreviation Code + .byte 38 ## DW_TAG_const_type + .byte 0 ## DW_CHILDREN_no + .byte 73 ## DW_AT_type + .byte 19 ## DW_FORM_ref4 + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 4 ## Abbreviation Code + .byte 36 ## DW_TAG_base_type + .byte 0 ## DW_CHILDREN_no + .byte 3 ## DW_AT_name + .byte 14 ## DW_FORM_strp + .byte 62 ## DW_AT_encoding + .byte 11 ## DW_FORM_data1 + .byte 11 ## DW_AT_byte_size + .byte 11 ## DW_FORM_data1 + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 5 ## Abbreviation Code + .byte 52 ## DW_TAG_variable + .byte 0 ## DW_CHILDREN_no + .byte 3 ## DW_AT_name + .byte 14 ## DW_FORM_strp + .byte 73 ## DW_AT_type + .byte 19 ## DW_FORM_ref4 + .byte 58 ## DW_AT_decl_file + .byte 11 ## DW_FORM_data1 + .byte 59 ## DW_AT_decl_line + .byte 11 ## DW_FORM_data1 + .byte 2 ## DW_AT_location + .byte 24 ## DW_FORM_exprloc + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 6 ## Abbreviation Code + .byte 22 ## DW_TAG_typedef + .byte 0 ## DW_CHILDREN_no + .byte 73 ## DW_AT_type + .byte 19 ## DW_FORM_ref4 + .byte 3 ## DW_AT_name + .byte 14 ## DW_FORM_strp + .byte 58 ## DW_AT_decl_file + .byte 11 ## DW_FORM_data1 + .byte 59 ## DW_AT_decl_line + .byte 11 ## DW_FORM_data1 + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 7 ## Abbreviation Code + .byte 23 ## DW_TAG_union_type + .byte 1 ## DW_CHILDREN_yes + .byte 11 ## 
DW_AT_byte_size + .byte 11 ## DW_FORM_data1 + .byte 58 ## DW_AT_decl_file + .byte 11 ## DW_FORM_data1 + .byte 59 ## DW_AT_decl_line + .byte 11 ## DW_FORM_data1 + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 8 ## Abbreviation Code + .byte 13 ## DW_TAG_member + .byte 0 ## DW_CHILDREN_no + .byte 3 ## DW_AT_name + .byte 14 ## DW_FORM_strp + .byte 73 ## DW_AT_type + .byte 19 ## DW_FORM_ref4 + .byte 58 ## DW_AT_decl_file + .byte 11 ## DW_FORM_data1 + .byte 59 ## DW_AT_decl_line + .byte 11 ## DW_FORM_data1 + .byte 56 ## DW_AT_data_member_location + .byte 11 ## DW_FORM_data1 + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 9 ## Abbreviation Code + .byte 13 ## DW_TAG_member + .byte 0 ## DW_CHILDREN_no + .byte 73 ## DW_AT_type + .byte 19 ## DW_FORM_ref4 + .byte 58 ## DW_AT_decl_file + .byte 11 ## DW_FORM_data1 + .byte 59 ## DW_AT_decl_line + .byte 11 ## DW_FORM_data1 + .byte 56 ## DW_AT_data_member_location + .byte 11 ## DW_FORM_data1 + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 10 ## Abbreviation Code + .byte 19 ## DW_TAG_structure_type + .byte 1 ## DW_CHILDREN_yes + .byte 11 ## DW_AT_byte_size + .byte 11 ## DW_FORM_data1 + .byte 58 ## DW_AT_decl_file + .byte 11 ## DW_FORM_data1 + .byte 59 ## DW_AT_decl_line + .byte 11 ## DW_FORM_data1 + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 11 ## Abbreviation Code + .byte 13 ## DW_TAG_member + .byte 0 ## DW_CHILDREN_no + .byte 3 ## DW_AT_name + .byte 14 ## DW_FORM_strp + .byte 73 ## DW_AT_type + .byte 19 ## DW_FORM_ref4 + .byte 58 ## DW_AT_decl_file + .byte 11 ## DW_FORM_data1 + .byte 59 ## DW_AT_decl_line + .byte 11 ## DW_FORM_data1 + .byte 13 ## DW_AT_bit_size + .byte 11 ## DW_FORM_data1 + .byte 107 ## DW_AT_data_bit_offset + .byte 11 ## DW_FORM_data1 + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 12 ## Abbreviation Code + .byte 46 ## DW_TAG_subprogram + .byte 0 ## DW_CHILDREN_no + .byte 17 ## DW_AT_low_pc + .byte 1 ## DW_FORM_addr + .byte 18 ## DW_AT_high_pc + .byte 6 ## DW_FORM_data4 + .byte 64 ## DW_AT_frame_base + .byte 
24 ## DW_FORM_exprloc + .byte 122 ## DW_AT_call_all_calls + .byte 25 ## DW_FORM_flag_present + .byte 3 ## DW_AT_name + .byte 14 ## DW_FORM_strp + .byte 58 ## DW_AT_decl_file + .byte 11 ## DW_FORM_data1 + .byte 59 ## DW_AT_decl_line + .byte 11 ## DW_FORM_data1 + .byte 73 ## DW_AT_type + .byte 19 ## DW_FORM_ref4 + .byte 63 ## DW_AT_external + .byte 25 ## DW_FORM_flag_present + .ascii "\341\177" ## DW_AT_APPLE_optimized + .byte 25 ## DW_FORM_flag_present + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 13 ## Abbreviation Code + .byte 46 ## DW_TAG_subprogram + .byte 1 ## DW_CHILDREN_yes + .byte 17 ## DW_AT_low_pc + .byte 1 ## DW_FORM_addr + .byte 18 ## DW_AT_high_pc + .byte 6 ## DW_FORM_data4 + .byte 64 ## DW_AT_frame_base + .byte 24 ## DW_FORM_exprloc + .byte 122 ## DW_AT_call_all_calls + .byte 25 ## DW_FORM_flag_present + .byte 3 ## DW_AT_name + .byte 14 ## DW_FORM_strp + .byte 58 ## DW_AT_decl_file + .byte 11 ## DW_FORM_data1 + .byte 59 ## DW_AT_decl_line + .byte 11 ## DW_FORM_data1 + .byte 73 ## DW_AT_type + .byte 19 ## DW_FORM_ref4 + .byte 63 ## DW_AT_external + .byte 25 ## DW_FORM_flag_present + .ascii "\341\177" ## DW_AT_APPLE_optimized + .byte 25 ## DW_FORM_flag_present + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 14 ## Abbreviation Code + .byte 52 ## DW_TAG_variable + .byte 0 ## DW_CHILDREN_no + .byte 28 ## DW_AT_const_value + .byte 15 ## DW_FORM_udata + .byte 3 ## DW_AT_name + .byte 14 ## DW_FORM_strp + .byte 58 ## DW_AT_decl_file + .byte 11 ## DW_FORM_data1 + .byte 59 ## DW_AT_decl_line + .byte 11 ## DW_FORM_data1 + .byte 73 ## DW_AT_type + .byte 19 ## DW_FORM_ref4 + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 0 ## EOM(3) + .section __DWARF,__debug_info,regular,debug +Lsection_info: +Lcu_begin0: +.set Lset0, Ldebug_info_end0-Ldebug_info_start0 ## Length of Unit + .long Lset0 +Ldebug_info_start0: + .short 4 ## DWARF version number +.set Lset1, Lsection_abbrev-Lsection_abbrev ## Offset Into Abbrev. 
Section + .long Lset1 + .byte 8 ## Address Size (in bytes) + .byte 1 ## Abbrev [1] 0xb:0x112 DW_TAG_compile_unit + .long 0 ## DW_AT_producer + .short 12 ## DW_AT_language + .long 47 ## DW_AT_name + .long 0 ## DW_AT_stmt_list + .long 71 ## DW_AT_comp_dir + ## DW_AT_GNU_pubnames + ## DW_AT_APPLE_optimized + .quad 0 ## DW_AT_low_pc + .long 0 + .byte 2 ## Abbrev [2] 0x2a:0x10 DW_TAG_variable + .long 90 ## DW_AT_name + .long 91 ## DW_AT_type + .byte 1 ## DW_AT_decl_file + .byte 16 ## DW_AT_decl_line + .ascii "\201\202\220\245\006" ## DW_AT_const_value + .byte 3 ## Abbrev [3] 0x3a:0x5 DW_TAG_const_type + .long 63 ## DW_AT_type + .byte 4 ## Abbrev [4] 0x3f:0x7 DW_TAG_base_type + .long 99 ## DW_AT_name + .byte 5 ## DW_AT_encoding + .byte 4 ## DW_AT_byte_size + .byte 5 ## Abbrev [5] 0x46:0x15 DW_TAG_variable + .long 103 ## DW_AT_name + .long 91 ## DW_AT_type + .byte 1 ## DW_AT_decl_file + .byte 15 ## DW_AT_decl_line + .byte 9 ## DW_AT_location + .byte 3 + .quad __type_anchor + .byte 6 ## Abbrev [6] 0x5b:0xb DW_TAG_typedef + .long 102 ## DW_AT_type + .long 116 ## DW_AT_name + .byte 1 ## DW_AT_decl_file + .byte 13 ## DW_AT_decl_line + .byte 7 ## Abbrev [7] 0x66:0x6c DW_TAG_union_type + .byte 4 ## DW_AT_byte_size + .byte 1 ## DW_AT_decl_file + .byte 1 ## DW_AT_decl_line + .byte 8 ## Abbrev [8] 0x6a:0xc DW_TAG_member + .long 118 ## DW_AT_name + .long 210 ## DW_AT_type + .byte 1 ## DW_AT_decl_file + .byte 3 ## DW_AT_decl_line + .byte 0 ## DW_AT_data_member_location + .byte 9 ## Abbrev [9] 0x76:0x8 DW_TAG_member + .long 126 ## DW_AT_type + .byte 1 ## DW_AT_decl_file + .byte 4 ## DW_AT_decl_line + .byte 0 ## DW_AT_data_member_location + .byte 10 ## Abbrev [10] 0x7e:0x53 DW_TAG_structure_type + .byte 4 ## DW_AT_byte_size + .byte 1 ## DW_AT_decl_file + .byte 4 ## DW_AT_decl_line + .byte 11 ## Abbrev [11] 0x82:0xd DW_TAG_member + .long 135 ## DW_AT_name + .long 210 ## DW_AT_type + .byte 1 ## DW_AT_decl_file + .byte 6 ## DW_AT_decl_line + .byte 8 ## DW_AT_bit_size + .byte 0 ## 
DW_AT_data_bit_offset + .byte 11 ## Abbrev [11] 0x8f:0xd DW_TAG_member + .long 137 ## DW_AT_name + .long 210 ## DW_AT_type + .byte 1 ## DW_AT_decl_file + .byte 7 ## DW_AT_decl_line + .byte 8 ## DW_AT_bit_size + .byte 8 ## DW_AT_data_bit_offset + .byte 11 ## Abbrev [11] 0x9c:0xd DW_TAG_member + .long 139 ## DW_AT_name + .long 210 ## DW_AT_type + .byte 1 ## DW_AT_decl_file + .byte 8 ## DW_AT_decl_line + .byte 6 ## DW_AT_bit_size + .byte 16 ## DW_AT_data_bit_offset + .byte 11 ## Abbrev [11] 0xa9:0xd DW_TAG_member + .long 141 ## DW_AT_name + .long 210 ## DW_AT_type + .byte 1 ## DW_AT_decl_file + .byte 9 ## DW_AT_decl_line + .byte 2 ## DW_AT_bit_size + .byte 22 ## DW_AT_data_bit_offset + .byte 11 ## Abbrev [11] 0xb6:0xd DW_TAG_member + .long 143 ## DW_AT_name + .long 210 ## DW_AT_type + .byte 1 ## DW_AT_decl_file + .byte 10 ## DW_AT_decl_line + .byte 6 ## DW_AT_bit_size + .byte 24 ## DW_AT_data_bit_offset + .byte 11 ## Abbrev [11] 0xc3:0xd DW_TAG_member + .long 145 ## DW_AT_name + .long 210 ## DW_AT_type + .byte 1 ## DW_AT_decl_file + .byte 11 ## DW_AT_decl_line + .byte 2 ## DW_AT_bit_size + .byte 30 ## DW_AT_data_bit_offset + .byte 0 ## End Of Children Mark + .byte 0 ## End Of Children Mark + .byte 4 ## Abbrev [4] 0xd2:0x7 DW_TAG_base_type + .long 122 ## DW_AT_name + .byte 7 ## DW_AT_encoding + .byte 4 ## DW_AT_byte_size + .byte 14 ## Abbrev [14] 0x10b:0x10 DW_TAG_variable + .ascii "\201\202\220\245\006" ## DW_AT_const_value + .long 154 ## DW_AT_name + .byte 1 ## DW_AT_decl_file + .byte 21 ## DW_AT_decl_line + .long 91 ## DW_AT_type + .byte 0 ## End Of Children Mark + .byte 0 ## End Of Children Mark +Ldebug_info_end0: diff --git a/lldb/test/Shell/SymbolFile/DWARF/DW_AT_const_value.s b/lldb/test/Shell/SymbolFile/DWARF/DW_AT_const_value.s new file mode 100644 index 0000000000000..2275ff25ce97b --- /dev/null +++ b/lldb/test/Shell/SymbolFile/DWARF/DW_AT_const_value.s @@ -0,0 +1,175 @@ +# Test handling of (optimized-out/location-less) variables whose value is +# specified 
by DW_AT_const_value + +# REQUIRES: x86 + +# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux %s -o %t +# RUN: %lldb %t \ +# RUN: -o "target variable udata data1 data2 data4 data8 string strp ref4 udata_ptr" \ +# RUN: -o exit | FileCheck %s + +# CHECK-LABEL: target variable +## Variable specified via DW_FORM_udata. This is typical for clang (10). +# CHECK: (unsigned long) udata = 4742474247424742 +## Variables specified via fixed-size forms. This is typical for gcc (9). +# CHECK: (unsigned long) data1 = 47 +# CHECK: (unsigned long) data2 = 4742 +# CHECK: (unsigned long) data4 = 47424742 +# CHECK: (unsigned long) data8 = 4742474247424742 +## Variables specified using string forms. This behavior purely speculative -- I +## don't know of any compiler that would represent character strings this way. +# CHECK: (char [7]) string = "string" +# CHECK: (char [7]) strp = "strp" +## Bogus attribute form. Let's make sure we don't crash at least. +# CHECK: (char [7]) ref4 = +## A variable of pointer type. 
+# CHECK: (unsigned long *) udata_ptr = 0xdeadbeefbaadf00d + + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .byte 8 # DW_FORM_string + .byte 3 # DW_AT_name + .byte 8 # DW_FORM_string + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 15 # DW_TAG_pointer_type + .byte 0 # DW_CHILDREN_no + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 4 # Abbreviation Code + .byte 1 # DW_TAG_array_type + .byte 1 # DW_CHILDREN_yes + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 5 # Abbreviation Code + .byte 33 # DW_TAG_subrange_type + .byte 0 # DW_CHILDREN_no + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 55 # DW_AT_count + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 6 # Abbreviation Code + .byte 36 # DW_TAG_base_type + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 8 # DW_FORM_string + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 62 # DW_AT_encoding + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) +.macro var code, form + .byte \code # Abbreviation Code + .byte 52 # DW_TAG_variable + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 8 # DW_FORM_string + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 28 # DW_AT_const_value + .byte \form + .byte 0 # EOM(1) + .byte 0 # EOM(2) +.endm + var 10, 0xf # DW_FORM_udata + var 11, 0xb # DW_FORM_data1 + var 12, 0x5 # DW_FORM_data2 + var 13, 0x6 # DW_FORM_data4 + var 14, 0x7 # DW_FORM_data8 + var 15, 0x8 # DW_FORM_string + var 16, 0xe # DW_FORM_strp + var 17, 0x13 # DW_FORM_ref4 + .byte 0 # EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit +.Ldebug_info_start0: + .short 4 # DWARF version number + .long .debug_abbrev # Offset Into Abbrev. 
Section + .byte 8 # Address Size (in bytes) + .byte 1 # Abbrev DW_TAG_compile_unit + .asciz "Hand-written DWARF" # DW_AT_producer + .asciz "const.c" # DW_AT_name +.Lchar_arr: + .byte 4 # Abbrev DW_TAG_array_type + .long .Lchar-.Lcu_begin0 # DW_AT_type + .byte 5 # Abbrev DW_TAG_subrange_type + .long .Lulong-.Lcu_begin0 # DW_AT_type + .byte 7 # DW_AT_count + .byte 0 # End Of Children Mark +.Lchar: + .byte 6 # Abbrev DW_TAG_base_type + .asciz "char" # DW_AT_name + .byte 1 # DW_AT_byte_size + .byte 6 # DW_AT_encoding +.Lulong: + .byte 6 # Abbrev DW_TAG_base_type + .asciz "unsigned long" # DW_AT_name + .byte 8 # DW_AT_byte_size + .byte 7 # DW_AT_encoding +.Lulong_ptr: + .byte 2 # Abbrev DW_TAG_pointer_type + .long .Lulong-.Lcu_begin0 # DW_AT_type + + .byte 10 # Abbrev DW_TAG_variable + .asciz "udata" # DW_AT_name + .long .Lulong-.Lcu_begin0 # DW_AT_type + .uleb128 4742474247424742 # DW_AT_const_value + + .byte 11 # Abbrev DW_TAG_variable + .asciz "data1" # DW_AT_name + .long .Lulong-.Lcu_begin0 # DW_AT_type + .byte 47 # DW_AT_const_value + + .byte 12 # Abbrev DW_TAG_variable + .asciz "data2" # DW_AT_name + .long .Lulong-.Lcu_begin0 # DW_AT_type + .word 4742 # DW_AT_const_value + + .byte 13 # Abbrev DW_TAG_variable + .asciz "data4" # DW_AT_name + .long .Lulong-.Lcu_begin0 # DW_AT_type + .long 47424742 # DW_AT_const_value + + .byte 14 # Abbrev DW_TAG_variable + .asciz "data8" # DW_AT_name + .long .Lulong-.Lcu_begin0 # DW_AT_type + .quad 4742474247424742 # DW_AT_const_value + + .byte 15 # Abbrev DW_TAG_variable + .asciz "string" # DW_AT_name + .long .Lchar_arr-.Lcu_begin0 # DW_AT_type + .asciz "string" # DW_AT_const_value + + .byte 16 # Abbrev DW_TAG_variable + .asciz "strp" # DW_AT_name + .long .Lchar_arr-.Lcu_begin0 # DW_AT_type + .long .Lstrp # DW_AT_const_value + + .byte 17 # Abbrev DW_TAG_variable + .asciz "ref4" # DW_AT_name + .long .Lchar_arr-.Lcu_begin0 # DW_AT_type + .long .Lulong-.Lcu_begin0 # DW_AT_const_value + + .byte 10 # Abbrev DW_TAG_variable + .asciz 
"udata_ptr" # DW_AT_name + .long .Lulong_ptr-.Lcu_begin0 # DW_AT_type + .uleb128 0xdeadbeefbaadf00d # DW_AT_const_value + + .byte 0 # End Of Children Mark +.Ldebug_info_end0: + + .section .debug_str,"MS",@progbits,1 +.Lstrp: + .asciz "strp" diff --git a/lldb/test/Shell/SymbolFile/DWARF/DW_AT_location-DW_AT_const_value.s b/lldb/test/Shell/SymbolFile/DWARF/DW_AT_location-DW_AT_const_value.s new file mode 100644 index 0000000000000..08ee77175f770 --- /dev/null +++ b/lldb/test/Shell/SymbolFile/DWARF/DW_AT_location-DW_AT_const_value.s @@ -0,0 +1,144 @@ +## Test that we don't get confused by variables with both location and +## const_value attributes. Such values are produced in C++ for class-level +## static constexpr variables. + +# REQUIRES: x86 + +# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux %s -o %t +# RUN: %lldb %t -o "target variable A::x A::y" -o exit | FileCheck %s + +# CHECK-LABEL: target variable +# CHECK: (const int) A::x = 142 +# CHECK: (const int) A::y = 242 + + .section .rodata,"a",@progbits + .p2align 2 +_ZN1A1xE: + .long 142 +_ZN1A1yE: + .long 242 + + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .byte 8 # DW_FORM_string + .byte 3 # DW_AT_name + .byte 8 # DW_FORM_string + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 19 # DW_TAG_structure_type + .byte 1 # DW_CHILDREN_yes + .byte 3 # DW_AT_name + .byte 8 # DW_FORM_string + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 4 # Abbreviation Code + .byte 13 # DW_TAG_member + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 8 # DW_FORM_string + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 60 # DW_AT_declaration + .byte 25 # DW_FORM_flag_present + .byte 28 # DW_AT_const_value + .byte 13 # DW_FORM_sdata + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 5 # Abbreviation Code + .byte 38 # DW_TAG_const_type 
+ .byte 0 # DW_CHILDREN_no + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 6 # Abbreviation Code + .byte 36 # DW_TAG_base_type + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 8 # DW_FORM_string + .byte 62 # DW_AT_encoding + .byte 11 # DW_FORM_data1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 7 # Abbreviation Code + .byte 52 # DW_TAG_variable + .byte 0 # DW_CHILDREN_no + .byte 71 # DW_AT_specification + .byte 19 # DW_FORM_ref4 + .byte 2 # DW_AT_location + .byte 24 # DW_FORM_exprloc + .byte 110 # DW_AT_linkage_name + .byte 8 # DW_FORM_string + .byte 0 # EOM(1) + .byte 0 # EOM(2) +## This deliberately inverts the order of the specification and location +## attributes. + .byte 8 # Abbreviation Code + .byte 52 # DW_TAG_variable + .byte 0 # DW_CHILDREN_no + .byte 2 # DW_AT_location + .byte 24 # DW_FORM_exprloc + .byte 71 # DW_AT_specification + .byte 19 # DW_FORM_ref4 + .byte 110 # DW_AT_linkage_name + .byte 8 # DW_FORM_string + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit +.Ldebug_info_start0: + .short 4 # DWARF version number + .long .debug_abbrev # Offset Into Abbrev. 
Section + .byte 8 # Address Size (in bytes) + .byte 1 # Abbrev DW_TAG_compile_unit + .asciz "Hand-written DWARF" # DW_AT_producer + .asciz "a.cc" # DW_AT_name + .byte 7 # Abbrev DW_TAG_variable + .long .LA__x-.Lcu_begin0 # DW_AT_specification + .byte 9 # DW_AT_location + .byte 3 + .quad _ZN1A1xE + .asciz "_ZN1A1xE" # DW_AT_linkage_name + .byte 8 # Abbrev DW_TAG_variable + .byte 9 # DW_AT_location + .byte 3 + .quad _ZN1A1yE + .long .LA__y-.Lcu_begin0 # DW_AT_specification + .asciz "_ZN1A1yE" # DW_AT_linkage_name + .byte 3 # Abbrev DW_TAG_structure_type + .asciz "A" # DW_AT_name + .byte 1 # DW_AT_byte_size +.LA__x: + .byte 4 # Abbrev DW_TAG_member + .asciz "x" # DW_AT_name + .long .Lconst_int-.Lcu_begin0 # DW_AT_type + # DW_AT_declaration + .sleb128 147 # DW_AT_const_value +.LA__y: + .byte 4 # Abbrev DW_TAG_member + .asciz "y" # DW_AT_name + .long .Lconst_int-.Lcu_begin0 # DW_AT_type + # DW_AT_declaration + .sleb128 247 # DW_AT_const_value + .byte 0 # End Of Children Mark +.Lconst_int: + .byte 5 # Abbrev DW_TAG_const_type + .long .Lint-.Lcu_begin0 # DW_AT_type +.Lint: + .byte 6 # Abbrev DW_TAG_base_type + .asciz "int" # DW_AT_name + .byte 5 # DW_AT_encoding + .byte 4 # DW_AT_byte_size + .byte 0 # End Of Children Mark +.Ldebug_info_end0: diff --git a/lldb/tools/argdumper/CMakeLists.txt b/lldb/tools/argdumper/CMakeLists.txt index 924946325194a..29a2186af3cbf 100644 --- a/lldb/tools/argdumper/CMakeLists.txt +++ b/lldb/tools/argdumper/CMakeLists.txt @@ -4,3 +4,5 @@ add_lldb_tool(lldb-argdumper ADD_TO_FRAMEWORK LINK_COMPONENTS Support ) + +add_dependencies(liblldb lldb-argdumper) diff --git a/lldb/unittests/Process/minidump/MinidumpParserTest.cpp b/lldb/unittests/Process/minidump/MinidumpParserTest.cpp index 8f4b45b056b30..25d7e237bd204 100644 --- a/lldb/unittests/Process/minidump/MinidumpParserTest.cpp +++ b/lldb/unittests/Process/minidump/MinidumpParserTest.cpp @@ -689,6 +689,130 @@ TEST_F(MinidumpParserTest, MinidumpDuplicateModuleMinAddress) { 
EXPECT_EQ(0x0000000000001000u, filtered_modules[0]->BaseOfImage); } +TEST_F(MinidumpParserTest, MinidumpDuplicateModuleMappedFirst) { + ASSERT_THAT_ERROR(SetUpFromYaml(R"( +--- !minidump +Streams: + - Type: ModuleList + Modules: + - Base of Image: 0x400d0000 + Size of Image: 0x00002000 + Module Name: '/usr/lib/libc.so' + CodeView Record: '' + - Base of Image: 0x400d3000 + Size of Image: 0x00001000 + Module Name: '/usr/lib/libc.so' + CodeView Record: '' + - Type: LinuxMaps + Text: | + 400d0000-400d2000 r--p 00000000 b3:04 227 /usr/lib/libc.so + 400d2000-400d3000 rw-p 00000000 00:00 0 + 400d3000-400d4000 r-xp 00010000 b3:04 227 /usr/lib/libc.so + 400d4000-400d5000 rwxp 00001000 b3:04 227 /usr/lib/libc.so +... +)"), + llvm::Succeeded()); + // If we have a module mentioned twice in the module list, and we have full + // linux maps for all of the memory regions, make sure we pick the one that + // has a consecutive region with a matching path that has executable + // permissions. If clients open an object file with mmap, breakpad can create + // multiple mappings for a library errnoneously and the lowest address isn't + // always the right address. In this case we check the consective memory + // regions whose path matches starting at the base of image address and make + // sure one of the regions is executable and prefer that one. + // + // This test will make sure that if the executable is second in the module + // list, that it will become the selected module in the filtered list. 
+ std::vector filtered_modules = + parser->GetFilteredModuleList(); + ASSERT_EQ(1u, filtered_modules.size()); + EXPECT_EQ(0x400d3000u, filtered_modules[0]->BaseOfImage); +} + +TEST_F(MinidumpParserTest, MinidumpDuplicateModuleMappedSecond) { + ASSERT_THAT_ERROR(SetUpFromYaml(R"( +--- !minidump +Streams: + - Type: ModuleList + Modules: + - Base of Image: 0x400d0000 + Size of Image: 0x00002000 + Module Name: '/usr/lib/libc.so' + CodeView Record: '' + - Base of Image: 0x400d3000 + Size of Image: 0x00001000 + Module Name: '/usr/lib/libc.so' + CodeView Record: '' + - Type: LinuxMaps + Text: | + 400d0000-400d1000 r-xp 00010000 b3:04 227 /usr/lib/libc.so + 400d1000-400d2000 rwxp 00001000 b3:04 227 /usr/lib/libc.so + 400d2000-400d3000 rw-p 00000000 00:00 0 + 400d3000-400d5000 r--p 00000000 b3:04 227 /usr/lib/libc.so +... +)"), + llvm::Succeeded()); + // If we have a module mentioned twice in the module list, and we have full + // linux maps for all of the memory regions, make sure we pick the one that + // has a consecutive region with a matching path that has executable + // permissions. If clients open an object file with mmap, breakpad can create + // multiple mappings for a library errnoneously and the lowest address isn't + // always the right address. In this case we check the consective memory + // regions whose path matches starting at the base of image address and make + // sure one of the regions is executable and prefer that one. + // + // This test will make sure that if the executable is first in the module + // list, that it will remain the correctly selected module in the filtered + // list. 
+ std::vector filtered_modules = + parser->GetFilteredModuleList(); + ASSERT_EQ(1u, filtered_modules.size()); + EXPECT_EQ(0x400d0000u, filtered_modules[0]->BaseOfImage); +} + +TEST_F(MinidumpParserTest, MinidumpDuplicateModuleSeparateCode) { + ASSERT_THAT_ERROR(SetUpFromYaml(R"( +--- !minidump +Streams: + - Type: ModuleList + Modules: + - Base of Image: 0x400d0000 + Size of Image: 0x00002000 + Module Name: '/usr/lib/libc.so' + CodeView Record: '' + - Base of Image: 0x400d5000 + Size of Image: 0x00001000 + Module Name: '/usr/lib/libc.so' + CodeView Record: '' + - Type: LinuxMaps + Text: | + 400d0000-400d3000 r--p 00000000 b3:04 227 /usr/lib/libc.so + 400d3000-400d5000 rw-p 00000000 00:00 0 + 400d5000-400d6000 r--p 00000000 b3:04 227 /usr/lib/libc.so + 400d6000-400d7000 r-xp 00010000 b3:04 227 /usr/lib/libc.so + 400d7000-400d8000 rwxp 00001000 b3:04 227 /usr/lib/libc.so +... +)"), + llvm::Succeeded()); + // If we have a module mentioned twice in the module list, and we have full + // linux maps for all of the memory regions, make sure we pick the one that + // has a consecutive region with a matching path that has executable + // permissions. If clients open an object file with mmap, breakpad can create + // multiple mappings for a library erroneously and the lowest address isn't + // always the right address. In this case we check the consecutive memory + // regions whose path matches starting at the base of image address and make + // sure one of the regions is executable and prefer that one. + // + // This test will make sure if binaries are compiled with "-z separate-code", + // where the first region for a binary won't be marked as executable, that + // it gets selected by detecting the second consecutive mapping at 0x400d7000 + // when asked about a module named "/usr/lib/libc.so" at 0x400d5000.
+ std::vector filtered_modules = + parser->GetFilteredModuleList(); + ASSERT_EQ(1u, filtered_modules.size()); + EXPECT_EQ(0x400d5000u, filtered_modules[0]->BaseOfImage); +} + TEST_F(MinidumpParserTest, MinidumpModuleOrder) { ASSERT_THAT_ERROR(SetUpFromYaml(R"( --- !minidump @@ -721,4 +845,3 @@ TEST_F(MinidumpParserTest, MinidumpModuleOrder) { parser->GetMinidumpFile().getString(filtered_modules[1]->ModuleNameRVA), llvm::HasValue("/tmp/b")); } - diff --git a/lldb/unittests/Symbol/LocateSymbolFileTest.cpp b/lldb/unittests/Symbol/LocateSymbolFileTest.cpp index 268faeaf1dbbd..c51b1ba490461 100644 --- a/lldb/unittests/Symbol/LocateSymbolFileTest.cpp +++ b/lldb/unittests/Symbol/LocateSymbolFileTest.cpp @@ -14,13 +14,14 @@ #include "lldb/Host/HostInfo.h" #include "lldb/Symbol/LocateSymbolFile.h" #include "lldb/Target/Target.h" +#include "lldb/Utility/Reproducer.h" using namespace lldb_private; namespace { class SymbolsTest : public ::testing::Test { public: - SubsystemRAII subsystems; + SubsystemRAII subsystems; }; } // namespace diff --git a/lldb/unittests/Symbol/PostfixExpressionTest.cpp b/lldb/unittests/Symbol/PostfixExpressionTest.cpp index 7def709a60901..aee153d5ccefc 100644 --- a/lldb/unittests/Symbol/PostfixExpressionTest.cpp +++ b/lldb/unittests/Symbol/PostfixExpressionTest.cpp @@ -111,7 +111,7 @@ ParseFPOAndStringify(llvm::StringRef prog) { ParseFPOProgram(prog, alloc); std::vector> result; for (const auto &p : parsed) - result.emplace_back(p.first, ASTPrinter::Print(p.second)); + result.emplace_back(p.first.str(), ASTPrinter::Print(p.second)); return result; } diff --git a/lldb/unittests/TestingSupport/CMakeLists.txt b/lldb/unittests/TestingSupport/CMakeLists.txt index 67ebe32426291..4599ada1ec506 100644 --- a/lldb/unittests/TestingSupport/CMakeLists.txt +++ b/lldb/unittests/TestingSupport/CMakeLists.txt @@ -6,6 +6,7 @@ add_lldb_library(lldbUtilityHelpers LINK_LIBS lldbUtility lldbSymbolHelpers + gtest LINK_COMPONENTS Support diff --git 
a/lldb/unittests/TestingSupport/Symbol/CMakeLists.txt b/lldb/unittests/TestingSupport/Symbol/CMakeLists.txt index 3faec7c8030b8..cdd65ca17fed2 100644 --- a/lldb/unittests/TestingSupport/Symbol/CMakeLists.txt +++ b/lldb/unittests/TestingSupport/Symbol/CMakeLists.txt @@ -1,8 +1,15 @@ set_property(DIRECTORY PROPERTY EXCLUDE_FROM_ALL ON) add_lldb_library(lldbSymbolHelpers YAMLModuleTester.cpp - ) -target_include_directories(lldbSymbolHelpers PUBLIC - ${LLVM_MAIN_SRC_DIR}/utils/unittest/googletest/include - ${LLVM_MAIN_SRC_DIR}/utils/unittest/googlemock/include) + LINK_LIBS + lldbCore + lldbHost + lldbPluginExpressionParserClang + lldbPluginSymbolFileDWARF + lldbPluginTypeSystemClang + lldbUtilityHelpers + + LINK_COMPONENTS + ObjectYAML + ) diff --git a/lldb/unittests/Utility/ScalarTest.cpp b/lldb/unittests/Utility/ScalarTest.cpp index 48cadfce466b5..1e65cd535733d 100644 --- a/lldb/unittests/Utility/ScalarTest.cpp +++ b/lldb/unittests/Utility/ScalarTest.cpp @@ -292,15 +292,15 @@ TEST(ScalarTest, Division) { TEST(ScalarTest, Promotion) { Scalar a(47); EXPECT_TRUE(a.IntegralPromote(64, true)); - EXPECT_EQ(Scalar::e_sint, a.GetType()); + EXPECT_TRUE(a.IsSigned()); EXPECT_EQ(APInt(64, 47), a.UInt128(APInt())); EXPECT_FALSE(a.IntegralPromote(32, true)); EXPECT_FALSE(a.IntegralPromote(32, false)); - EXPECT_EQ(Scalar::e_sint, a.GetType()); + EXPECT_TRUE(a.IsSigned()); EXPECT_TRUE(a.IntegralPromote(64, false)); - EXPECT_EQ(Scalar::e_uint, a.GetType()); + EXPECT_FALSE(a.IsSigned()); EXPECT_EQ(APInt(64, 47), a.UInt128(APInt())); EXPECT_FALSE(a.IntegralPromote(64, true)); @@ -362,7 +362,8 @@ TEST(ScalarTest, SetValueFromCString) { TEST(ScalarTest, APIntConstructor) { for (auto &width : {8, 16, 32}) { Scalar A(APInt(width, 24)); - EXPECT_EQ(A.GetType(), Scalar::e_sint); + EXPECT_TRUE(A.IsSigned()); + EXPECT_EQ(A.GetType(), Scalar::e_int); EXPECT_EQ(APInt(width, 24), A.UInt128(APInt())); } } @@ -374,15 +375,17 @@ TEST(ScalarTest, Scalar_512) { ASSERT_TRUE(Z.IsZero()); Scalar 
S(APInt(512, 2000)); - ASSERT_STREQ(S.GetTypeAsCString(), "signed int"); + ASSERT_STREQ(S.GetTypeAsCString(), "int"); ASSERT_TRUE(S.MakeUnsigned()); - EXPECT_EQ(S.GetType(), Scalar::e_uint); - ASSERT_STREQ(S.GetTypeAsCString(), "unsigned int"); + EXPECT_EQ(S.GetType(), Scalar::e_int); + EXPECT_FALSE(S.IsSigned()); + ASSERT_STREQ(S.GetTypeAsCString(), "int"); EXPECT_EQ(S.GetByteSize(), 64U); ASSERT_TRUE(S.MakeSigned()); - EXPECT_EQ(S.GetType(), Scalar::e_sint); + EXPECT_EQ(S.GetType(), Scalar::e_int); + EXPECT_TRUE(S.IsSigned()); EXPECT_EQ(S.GetByteSize(), 64U); } diff --git a/llvm-spirv/include/LLVMSPIRVOpts.h b/llvm-spirv/include/LLVMSPIRVOpts.h index 6abbf15eb9bbe..bdc24cd2d22a1 100644 --- a/llvm-spirv/include/LLVMSPIRVOpts.h +++ b/llvm-spirv/include/LLVMSPIRVOpts.h @@ -71,6 +71,8 @@ enum class BIsRepresentation : uint32_t { OpenCL12, OpenCL20, SPIRVFriendlyIR }; enum class FPContractMode : uint32_t { On, Off, Fast }; +enum class DebugInfoEIS : uint32_t { SPIRV_Debug, OpenCL_DebugInfo_100 }; + /// \brief Helper class to manage SPIR-V translation class TranslatorOpts { public: @@ -146,6 +148,10 @@ class TranslatorOpts { SPIRVAllowUnknownIntrinsics = AllowUnknownIntrinsics; } + DebugInfoEIS getDebugInfoEIS() const { return DebugInfoVersion; } + + void setDebugInfoEIS(DebugInfoEIS EIS) { DebugInfoVersion = EIS; } + private: // Common translation options VersionNumber MaxVersion = VersionNumber::MaximumVersion; @@ -172,6 +178,8 @@ class TranslatorOpts { // Unknown LLVM intrinsics will be translated as external function calls in // SPIR-V bool SPIRVAllowUnknownIntrinsics = false; + + DebugInfoEIS DebugInfoVersion = DebugInfoEIS::OpenCL_DebugInfo_100; }; } // namespace SPIRV diff --git a/llvm-spirv/lib/SPIRV/LLVMToSPIRVDbgTran.cpp b/llvm-spirv/lib/SPIRV/LLVMToSPIRVDbgTran.cpp index d95851263bf80..80815a3d62e44 100644 --- a/llvm-spirv/lib/SPIRV/LLVMToSPIRVDbgTran.cpp +++ b/llvm-spirv/lib/SPIRV/LLVMToSPIRVDbgTran.cpp @@ -86,7 +86,7 @@ SPIRVValue 
*LLVMToSPIRVDbgTran::createDebugDeclarePlaceholder( DbgDeclareIntrinsics.push_back(DbgDecl); using namespace SPIRVDebug::Operand::DebugDeclare; SPIRVWordVec Ops(OperandCount, getDebugInfoNoneId()); - SPIRVId ExtSetId = BM->getExtInstSetId(SPIRVEIS_Debug); + SPIRVId ExtSetId = BM->getExtInstSetId(BM->getDebugInfoEIS()); return BM->addExtInst(getVoidTy(), ExtSetId, SPIRVDebug::Declare, Ops, BB); } @@ -94,9 +94,9 @@ void LLVMToSPIRVDbgTran::finalizeDebugDeclare( const DbgVariableIntrinsic *DbgDecl) { SPIRVValue *V = SPIRVWriter->getTranslatedValue(DbgDecl); assert(V && "llvm.dbg.declare intrinsic isn't mapped to a SPIRV instruction"); - assert(V->isExtInst(SPIRV::SPIRVEIS_Debug, SPIRVDebug::Declare) && + assert(V->isExtInst(BM->getDebugInfoEIS(), SPIRVDebug::Declare) && "llvm.dbg.declare intrinsic has been translated wrong!"); - if (!V || !V->isExtInst(SPIRV::SPIRVEIS_Debug, SPIRVDebug::Declare)) + if (!V || !V->isExtInst(BM->getDebugInfoEIS(), SPIRVDebug::Declare)) return; SPIRVExtInst *DD = static_cast(V); SPIRVBasicBlock *BB = DD->getBasicBlock(); @@ -121,7 +121,7 @@ SPIRVValue *LLVMToSPIRVDbgTran::createDebugValuePlaceholder( DbgValueIntrinsics.push_back(DbgValue); using namespace SPIRVDebug::Operand::DebugValue; SPIRVWordVec Ops(MinOperandCount, getDebugInfoNone()->getId()); - SPIRVId ExtSetId = BM->getExtInstSetId(SPIRVEIS_Debug); + SPIRVId ExtSetId = BM->getExtInstSetId(BM->getDebugInfoEIS()); return BM->addExtInst(getVoidTy(), ExtSetId, SPIRVDebug::Value, Ops, BB); } @@ -129,9 +129,9 @@ void LLVMToSPIRVDbgTran::finalizeDebugValue( const DbgVariableIntrinsic *DbgValue) { SPIRVValue *V = SPIRVWriter->getTranslatedValue(DbgValue); assert(V && "llvm.dbg.value intrinsic isn't mapped to a SPIRV instruction"); - assert(V->isExtInst(SPIRV::SPIRVEIS_Debug, SPIRVDebug::Value) && + assert(V->isExtInst(BM->getDebugInfoEIS(), SPIRVDebug::Value) && "llvm.dbg.value intrinsic has been translated wrong!"); - if (!V || !V->isExtInst(SPIRV::SPIRVEIS_Debug, SPIRVDebug::Value)) + 
if (!V || !V->isExtInst(BM->getDebugInfoEIS(), SPIRVDebug::Value)) return; SPIRVExtInst *DV = static_cast(V); SPIRVBasicBlock *BB = DV->getBasicBlock(); @@ -888,7 +888,7 @@ SPIRVEntry *LLVMToSPIRVDbgTran::transDbgScope(const DIScope *S) { SPIRVEntry *LLVMToSPIRVDbgTran::transDebugLoc(const DebugLoc &Loc, SPIRVBasicBlock *BB, SPIRVInstruction *InsertBefore) { - SPIRVId ExtSetId = BM->getExtInstSetId(SPIRVEIS_Debug); + SPIRVId ExtSetId = BM->getExtInstSetId(BM->getDebugInfoEIS()); if (!Loc.get()) return BM->addExtInst(getVoidTy(), ExtSetId, SPIRVDebug::NoScope, std::vector(), BB, InsertBefore); diff --git a/llvm-spirv/lib/SPIRV/PreprocessMetadata.cpp b/llvm-spirv/lib/SPIRV/PreprocessMetadata.cpp index 15647f2906a1b..145b5f60e9a00 100644 --- a/llvm-spirv/lib/SPIRV/PreprocessMetadata.cpp +++ b/llvm-spirv/lib/SPIRV/PreprocessMetadata.cpp @@ -255,6 +255,9 @@ void PreprocessMetadata::preprocessVectorComputeMetadata(Module *M, auto EM = B->addNamedMD(kSPIRVMD::ExecutionMode); for (auto &F : *M) { + if (F.getCallingConv() != CallingConv::SPIR_KERNEL) + continue; + // Add VC float control execution modes // RoundMode and FloatMode are always same for all types in VC // While Denorm could be different for double, float and half diff --git a/llvm-spirv/lib/SPIRV/SPIRVLowerConstExpr.cpp b/llvm-spirv/lib/SPIRV/SPIRVLowerConstExpr.cpp index 205d313515671..ceb7b54ce6367 100644 --- a/llvm-spirv/lib/SPIRV/SPIRVLowerConstExpr.cpp +++ b/llvm-spirv/lib/SPIRV/SPIRVLowerConstExpr.cpp @@ -162,6 +162,8 @@ void SPIRVLowerConstExpr::visit(Module *M) { [LowerOp](Value *V) { return LowerOp(V); }); Value *Repl = nullptr; unsigned Idx = 0; + auto *PhiII = dyn_cast(II); + auto *InsPoint = PhiII ? &PhiII->getIncomingBlock(OI)->back() : II; std::list ReplList; for (auto V : OpList) { if (auto *Inst = dyn_cast(V)) @@ -169,7 +171,7 @@ void SPIRVLowerConstExpr::visit(Module *M) { Repl = InsertElementInst::Create( (Repl ? 
Repl : UndefValue::get(Vec->getType())), V, ConstantInt::get(Type::getInt32Ty(M->getContext()), Idx++), "", - II); + InsPoint); } II->replaceUsesOfWith(Op, Repl); WorkList.splice(WorkList.begin(), ReplList); diff --git a/llvm-spirv/lib/SPIRV/SPIRVReader.cpp b/llvm-spirv/lib/SPIRV/SPIRVReader.cpp index 03499d42c00b7..18f3bba3149f2 100644 --- a/llvm-spirv/lib/SPIRV/SPIRVReader.cpp +++ b/llvm-spirv/lib/SPIRV/SPIRVReader.cpp @@ -864,6 +864,7 @@ void SPIRVToLLVM::setLLVMLoopMetadata(const LoopInstType *LM, Parameters.push_back(SafelenMDOp); Metadata.push_back(llvm::MDNode::get(*Context, Parameters)); } + ++NumParam; } if (LC & LoopControlPipelineEnableINTELMask) { Metadata.push_back(llvm::MDNode::get( @@ -2336,6 +2337,7 @@ Value *SPIRVToLLVM::transValueWithoutDecoration(SPIRVValue *BV, Function *F, case SPIRVEIS_OpenCL: return mapValue(BV, transOCLBuiltinFromExtInst(ExtInst, BB)); case SPIRVEIS_Debug: + case SPIRVEIS_OpenCL_DebugInfo_100: return mapValue(BV, DbgTran->transDebugIntrinsic(ExtInst, BB)); default: llvm_unreachable("Unknown extended instruction set!"); @@ -3963,7 +3965,7 @@ bool SPIRVToLLVM::transVectorComputeMetadata(SPIRVFunction *BF) { static_cast( FloatModes.at(0)); auto FloatingMode = DecFlt->getOperationMode(); -#ifdef NDEBUG +#ifndef NDEBUG for (auto *DecPreCast : FloatModes) { auto *Dec = static_cast( diff --git a/llvm-spirv/lib/SPIRV/SPIRVToLLVMDbgTran.cpp b/llvm-spirv/lib/SPIRV/SPIRVToLLVMDbgTran.cpp index d657a37e137ed..3f599fcc553c0 100644 --- a/llvm-spirv/lib/SPIRV/SPIRVToLLVMDbgTran.cpp +++ b/llvm-spirv/lib/SPIRV/SPIRVToLLVMDbgTran.cpp @@ -78,7 +78,8 @@ SPIRVExtInst *SPIRVToLLVMDbgTran::getDbgInst(const SPIRVId Id) { SPIRVEntry *E = BM->getEntry(Id); if (isa(E)) { SPIRVExtInst *EI = static_cast(E); - if (EI->getExtSetKind() == SPIRV::SPIRVEIS_Debug) + if (EI->getExtSetKind() == SPIRV::SPIRVEIS_Debug || + EI->getExtSetKind() == SPIRV::SPIRVEIS_OpenCL_DebugInfo_100) return EI; } return nullptr; @@ -230,7 +231,9 @@ 
SPIRVToLLVMDbgTran::transTypeComposite(const SPIRVExtInst *DebugInst) { uint64_t Size = 0; SPIRVEntry *SizeEntry = BM->getEntry(Ops[SizeIdx]); - if (!SizeEntry->isExtInst(SPIRVEIS_Debug, SPIRVDebug::DebugInfoNone)) { + if (!(SizeEntry->isExtInst(SPIRVEIS_Debug, SPIRVDebug::DebugInfoNone) || + SizeEntry->isExtInst(SPIRVEIS_OpenCL_DebugInfo_100, + SPIRVDebug::DebugInfoNone))) { Size = BM->get(Ops[SizeIdx])->getZExtIntValue(); } diff --git a/llvm-spirv/lib/SPIRV/SPIRVToLLVMDbgTran.h b/llvm-spirv/lib/SPIRV/SPIRVToLLVMDbgTran.h index 08724975338be..3409858409b9a 100644 --- a/llvm-spirv/lib/SPIRV/SPIRVToLLVMDbgTran.h +++ b/llvm-spirv/lib/SPIRV/SPIRVToLLVMDbgTran.h @@ -70,7 +70,8 @@ class SPIRVToLLVMDbgTran { void transDbgInfo(const SPIRVValue *SV, Value *V); template T *transDebugInst(const SPIRVExtInst *DebugInst) { - assert(DebugInst->getExtSetKind() == SPIRVEIS_Debug && + assert((DebugInst->getExtSetKind() == SPIRVEIS_Debug || + DebugInst->getExtSetKind() == SPIRVEIS_OpenCL_DebugInfo_100) && "Unexpected extended instruction set"); auto It = DebugInstCache.find(DebugInst); if (It != DebugInstCache.end()) diff --git a/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp b/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp index 59b0bce31f757..de70a5a130166 100644 --- a/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp +++ b/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp @@ -175,7 +175,7 @@ bool LLVMToSPIRV::isBuiltinTransToExtInst(Function *F, SPIRVExtInstSetKind Set = SPIRVEIS_Count; if (!SPIRVExtSetShortNameMap::rfind(ExtSetName.str(), &Set)) return false; - assert((Set == SPIRVEIS_OpenCL || Set == SPIRVEIS_Debug) && + assert((Set == SPIRVEIS_OpenCL || Set == BM->getDebugInfoEIS()) && "Unsupported extended instruction set"); auto ExtOpName = S.substr(Loc + 1); @@ -563,7 +563,10 @@ SPIRVFunction *LLVMToSPIRV::transFunctionDecl(Function *F) { // Order of integer numbers in MD node follows the order of function // parameters on which we shall attach the appropriate decoration. 
Add // decoration only if MD value is not negative. - int LocID = getMDOperandAsInt(BufferLocation, ArgNo); + int LocID = -1; + if (!isa(BufferLocation->getOperand(ArgNo)) && + !isa(BufferLocation->getOperand(ArgNo))) + LocID = getMDOperandAsInt(BufferLocation, ArgNo); if (LocID >= 0) { BM->addCapability(CapabilityFPGABufferLocationINTEL); BA->addDecorate(DecorationBufferLocationINTEL, LocID); @@ -961,6 +964,7 @@ LLVMToSPIRV::getLoopControl(const BranchInst *Branch, return spv::LoopControlMaskNone; size_t LoopControl = spv::LoopControlMaskNone; + std::vector> ParametersToSort; // Unlike with most of the cases, some loop metadata specifications // can occur multiple times - for these, all correspondent tokens @@ -985,13 +989,13 @@ LLVMToSPIRV::getLoopControl(const BranchInst *Branch, else if (S == "llvm.loop.unroll.count" && !(LoopControl & LoopControlDontUnrollMask)) { size_t I = getMDOperandAsInt(Node, 1); - Parameters.push_back(I); + ParametersToSort.emplace_back(spv::LoopControlPartialCountMask, I); LoopControl |= spv::LoopControlPartialCountMask; } else if (S == "llvm.loop.ivdep.enable") LoopControl |= spv::LoopControlDependencyInfiniteMask; else if (S == "llvm.loop.ivdep.safelen") { size_t I = getMDOperandAsInt(Node, 1); - Parameters.push_back(I); + ParametersToSort.emplace_back(spv::LoopControlDependencyLengthMask, I); LoopControl |= spv::LoopControlDependencyLengthMask; } else if (BM->isAllowedToUseExtension( ExtensionID::SPV_INTEL_fpga_loop_controls)) { @@ -1000,13 +1004,15 @@ LLVMToSPIRV::getLoopControl(const BranchInst *Branch, BM->addExtension(ExtensionID::SPV_INTEL_fpga_loop_controls); BM->addCapability(CapabilityFPGALoopControlsINTEL); size_t I = getMDOperandAsInt(Node, 1); - Parameters.push_back(I); + ParametersToSort.emplace_back( + spv::LoopControlInitiationIntervalINTELMask, I); LoopControl |= spv::LoopControlInitiationIntervalINTELMask; } else if (S == "llvm.loop.max_concurrency.count") { 
BM->addExtension(ExtensionID::SPV_INTEL_fpga_loop_controls); BM->addCapability(CapabilityFPGALoopControlsINTEL); size_t I = getMDOperandAsInt(Node, 1); - Parameters.push_back(I); + ParametersToSort.emplace_back(spv::LoopControlMaxConcurrencyINTELMask, + I); LoopControl |= spv::LoopControlMaxConcurrencyINTELMask; } else if (S == "llvm.loop.parallel_access_indices") { // Intel FPGA IVDep loop attribute @@ -1024,7 +1030,8 @@ LLVMToSPIRV::getLoopControl(const BranchInst *Branch, BM->addExtension(ExtensionID::SPV_INTEL_fpga_loop_controls); BM->addCapability(CapabilityFPGALoopControlsINTEL); size_t I = getMDOperandAsInt(Node, 1); - Parameters.push_back(I); + ParametersToSort.emplace_back(spv::LoopControlPipelineEnableINTELMask, + I); LoopControl |= spv::LoopControlPipelineEnableINTELMask; } else if (S == "llvm.loop.coalesce.enable") { BM->addExtension(ExtensionID::SPV_INTEL_fpga_loop_controls); @@ -1034,19 +1041,22 @@ LLVMToSPIRV::getLoopControl(const BranchInst *Branch, BM->addExtension(ExtensionID::SPV_INTEL_fpga_loop_controls); BM->addCapability(CapabilityFPGALoopControlsINTEL); size_t I = getMDOperandAsInt(Node, 1); - Parameters.push_back(I); + ParametersToSort.emplace_back(spv::LoopControlLoopCoalesceINTELMask, + I); LoopControl |= spv::LoopControlLoopCoalesceINTELMask; } else if (S == "llvm.loop.max_interleaving.count") { BM->addExtension(ExtensionID::SPV_INTEL_fpga_loop_controls); BM->addCapability(CapabilityFPGALoopControlsINTEL); size_t I = getMDOperandAsInt(Node, 1); - Parameters.push_back(I); + ParametersToSort.emplace_back( + spv::LoopControlMaxInterleavingINTELMask, I); LoopControl |= spv::LoopControlMaxInterleavingINTELMask; } else if (S == "llvm.loop.intel.speculated.iterations.count") { BM->addExtension(ExtensionID::SPV_INTEL_fpga_loop_controls); BM->addCapability(CapabilityFPGALoopControlsINTEL); size_t I = getMDOperandAsInt(Node, 1); - Parameters.push_back(I); + ParametersToSort.emplace_back( + spv::LoopControlSpeculatedIterationsINTELMask, I); 
LoopControl |= spv::LoopControlSpeculatedIterationsINTELMask; } } @@ -1058,16 +1068,27 @@ LLVMToSPIRV::getLoopControl(const BranchInst *Branch, if (!DependencyArrayParameters.empty()) { // The first parameter states the number of pairs to be // listed - Parameters.push_back(DependencyArrayParameters.size()); + ParametersToSort.emplace_back(spv::LoopControlDependencyArrayINTELMask, + DependencyArrayParameters.size()); for (auto &ArraySflnPair : DependencyArrayParameters) { - Parameters.push_back(ArraySflnPair.first); - Parameters.push_back(ArraySflnPair.second); + ParametersToSort.emplace_back(spv::LoopControlDependencyArrayINTELMask, + ArraySflnPair.first); + ParametersToSort.emplace_back(spv::LoopControlDependencyArrayINTELMask, + ArraySflnPair.second); } BM->addExtension(ExtensionID::SPV_INTEL_fpga_loop_controls); BM->addCapability(CapabilityFPGALoopControlsINTEL); LoopControl |= spv::LoopControlDependencyArrayINTELMask; } + std::sort(ParametersToSort.begin(), ParametersToSort.end(), + [](const std::pair &CompareLeft, + const std::pair &CompareRight) { + return CompareLeft.first < CompareRight.first; + }); + for (auto Param : ParametersToSort) + Parameters.push_back(Param.second); + return static_cast(LoopControl); } @@ -1608,7 +1629,8 @@ bool LLVMToSPIRV::transBuiltinSet() { if (!BM->importBuiltinSet("OpenCL.std", &EISId)) return false; if (SPIRVMDWalker(*M).getNamedMD("llvm.dbg.cu")) { - if (!BM->importBuiltinSet("SPIRV.debug", &EISId)) + if (!BM->importBuiltinSet( + SPIRVBuiltinSetNameMap::map(BM->getDebugInfoEIS()), &EISId)) return false; } return true; diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEnum.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEnum.h index acbc3e1a7eb8d..f3e1a932189c6 100644 --- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEnum.h +++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEnum.h @@ -70,6 +70,7 @@ enum SPIRVInstructionSchemaKind { enum SPIRVExtInstSetKind { SPIRVEIS_OpenCL, SPIRVEIS_Debug, + SPIRVEIS_OpenCL_DebugInfo_100, SPIRVEIS_Count, }; @@ -120,6 
+121,7 @@ template <> inline void SPIRVMap::init() { template <> inline void SPIRVMap::init() { add(SPIRVEIS_OpenCL, "OpenCL.std"); add(SPIRVEIS_Debug, "SPIRV.debug"); + add(SPIRVEIS_OpenCL_DebugInfo_100, "OpenCL.DebugInfo.100"); } typedef SPIRVMap SPIRVBuiltinSetNameMap; diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVFunction.cpp b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVFunction.cpp index db9823214af1a..da1ba0dfbc676 100644 --- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVFunction.cpp +++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVFunction.cpp @@ -160,9 +160,12 @@ bool SPIRVFunction::decodeBB(SPIRVDecoder &Decoder) { if (Inst->getOpCode() == OpUndef) { Module->add(Inst); } else { - if (Inst->isExtInst(SPIRVEIS_Debug, SPIRVDebug::Scope)) { + if (Inst->isExtInst(SPIRVEIS_Debug, SPIRVDebug::Scope) || + Inst->isExtInst(SPIRVEIS_OpenCL_DebugInfo_100, SPIRVDebug::Scope)) { DebugScope = Inst; - } else if (Inst->isExtInst(SPIRVEIS_Debug, SPIRVDebug::NoScope)) { + } else if (Inst->isExtInst(SPIRVEIS_Debug, SPIRVDebug::NoScope) || + Inst->isExtInst(SPIRVEIS_OpenCL_DebugInfo_100, + SPIRVDebug::NoScope)) { DebugScope = nullptr; } else { Inst->setDebugScope(DebugScope); diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h index 363efdbb59e5b..2c8280048563d 100644 --- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h +++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h @@ -1821,7 +1821,8 @@ class SPIRVExtInst : public SPIRVFunctionCallGeneric { void setExtSetKindById() { assert(Module && "Invalid module"); ExtSetKind = Module->getBuiltinSet(ExtSetId); - assert((ExtSetKind == SPIRVEIS_OpenCL || ExtSetKind == SPIRVEIS_Debug) && + assert((ExtSetKind == SPIRVEIS_OpenCL || ExtSetKind == SPIRVEIS_Debug || + ExtSetKind == SPIRVEIS_OpenCL_DebugInfo_100) && "not supported"); } void encode(spv_ostream &O) const override { @@ -1831,6 +1832,7 @@ class SPIRVExtInst : public SPIRVFunctionCallGeneric { getEncoder(O) << ExtOpOCL; break; case 
SPIRVEIS_Debug: + case SPIRVEIS_OpenCL_DebugInfo_100: getEncoder(O) << ExtOpDebug; break; default: @@ -1847,6 +1849,7 @@ class SPIRVExtInst : public SPIRVFunctionCallGeneric { getDecoder(I) >> ExtOpOCL; break; case SPIRVEIS_Debug: + case SPIRVEIS_OpenCL_DebugInfo_100: getDecoder(I) >> ExtOpDebug; break; default: diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.cpp b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.cpp index e24d1df86d187..3821e4740680e 100644 --- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.cpp +++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.cpp @@ -656,7 +656,8 @@ void SPIRVModuleImpl::layoutEntry(SPIRVEntry *E) { } break; case OpExtInst: { SPIRVExtInst *EI = static_cast(E); - if (EI->getExtSetKind() == SPIRVEIS_Debug && + if ((EI->getExtSetKind() == SPIRVEIS_Debug || + EI->getExtSetKind() == SPIRVEIS_OpenCL_DebugInfo_100) && EI->getExtOp() != SPIRVDebug::Declare && EI->getExtOp() != SPIRVDebug::Value && EI->getExtOp() != SPIRVDebug::Scope && @@ -1230,9 +1231,9 @@ SPIRVInstruction *SPIRVModuleImpl::addExtInst( SPIRVEntry *SPIRVModuleImpl::addDebugInfo(SPIRVWord InstId, SPIRVType *TheType, const std::vector &Args) { - return addEntry(new SPIRVExtInst(this, getId(), TheType, SPIRVEIS_Debug, - ExtInstSetIds[SPIRVEIS_Debug], InstId, - Args)); + return addEntry( + new SPIRVExtInst(this, getId(), TheType, SPIRVEIS_OpenCL_DebugInfo_100, + ExtInstSetIds[getDebugInfoEIS()], InstId, Args)); } SPIRVInstruction * diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.h index b20fe02e21918..1828531371d42 100644 --- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.h +++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.h @@ -487,6 +487,18 @@ class SPIRVModule { return TranslationOpts.isSPIRVAllowUnknownIntrinsicsEnabled(); } + SPIRVExtInstSetKind getDebugInfoEIS() const { + switch (TranslationOpts.getDebugInfoEIS()) { + case DebugInfoEIS::SPIRV_Debug: + return SPIRVEIS_Debug; + case DebugInfoEIS::OpenCL_DebugInfo_100: + return 
SPIRVEIS_OpenCL_DebugInfo_100; + default: + assert(false && "Unexpected debug info EIS!"); + return SPIRVEIS_Debug; + } + } + // I/O functions friend spv_ostream &operator<<(spv_ostream &O, SPIRVModule &M); friend std::istream &operator>>(std::istream &I, SPIRVModule &M); diff --git a/llvm-spirv/test/DebugInfo/Generic/debug-info-eis-option.ll b/llvm-spirv/test/DebugInfo/Generic/debug-info-eis-option.ll new file mode 100755 index 0000000000000..7f0ef7ac4943d --- /dev/null +++ b/llvm-spirv/test/DebugInfo/Generic/debug-info-eis-option.ll @@ -0,0 +1,83 @@ +; RUN: llvm-as < %s -o %t.bc +; RUN: llvm-spirv %t.bc -o %t.spv --spirv-debug-info-version=legacy +; RUN: llvm-spirv -r %t.spv -o - | llvm-dis -o %t.ll + +; RUN: llc -mtriple=%triple -stop-before=finalize-isel -pre-RA-sched=linearize < %t.ll | FileCheck %s + +; RUN: llvm-spirv %t.spv -to-text -o - | FileCheck --check-prefix CHECK-SPIRV %s + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64-unknown-unknown" +; CHECK-SPIRV: ExtInstImport [[Set:[0-9]+]] "SPIRV.debug" +; CHECK-SPIRV: TypeVoid [[Void:[0-9]+]] +; CHECK-SPIRV: ExtInst [[Void]] {{[0-9]+}} [[Set]] DebugValue + +source_filename = "linear-dbg-value.ll" + +; Function Attrs: nounwind readonly uwtable +define i32 @foo(i32* nocapture readonly %a, i32 %N) local_unnamed_addr #0 !dbg !6 { +entry: + %cmp6 = icmp sgt i32 %N, 0, !dbg !11 + br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup, !dbg !15 + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + br label %for.body, !dbg !17 + +for.cond.cleanup.loopexit: ; preds = %for.body + br label %for.cond.cleanup, !dbg !19 + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %x.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.cond.cleanup.loopexit ] + ret i32 %x.0.lcssa, !dbg !19 + +for.body: ; preds = %for.body, %for.body.preheader +; CHECK: ![[X:[0-9]+]] = !DILocalVariable(name: 
"x", +; CHECK-LABEL: bb.3.for.body: +; CHECK: DBG_VALUE {{.*}} ![[X]], !DIExpression() +; CHECK: DBG_VALUE {{.*}} ![[X]], !DIExpression() + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %x.07 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv, !dbg !17 + %0 = load i32, i32* %arrayidx, align 4, !dbg !17 + %add = add nsw i32 %0, %x.07, !dbg !17 + call void @llvm.dbg.value(metadata i32 %add, metadata !9, metadata !DIExpression()), !dbg !20 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !21 + call void @llvm.dbg.value(metadata i32 %add, metadata !9, metadata !DIExpression()), !dbg !20 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count, !dbg !11 + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body, !dbg !15 +} + +; Function Attrs: nounwind readnone speculatable +declare void @llvm.dbg.value(metadata, metadata, metadata) #1 + +attributes #0 = { nounwind readonly uwtable } +attributes #1 = { nounwind readnone speculatable } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4} +!llvm.ident = !{!5} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 4.0.1 ", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) +!1 = !DIFile(filename: "foo.c", directory: "/tmp") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{!"clang version 4.0.1 "} +!6 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 2, type: !7, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !8) +!7 = !DISubroutineType(types: !2) +!8 = !{!9} +!9 = !DILocalVariable(name: "x", scope: !6, file: !1, line: 3, type: !10) +!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!11 = !DILocation(line: 4, scope: !12) +!12 = !DILexicalBlockFile(scope: !13, file: !1, 
discriminator: 1) +!13 = distinct !DILexicalBlock(scope: !14, file: !1, line: 4, column: 3) +!14 = distinct !DILexicalBlock(scope: !6, file: !1, line: 4, column: 3) +!15 = !DILocation(line: 4, scope: !16) +!16 = !DILexicalBlockFile(scope: !14, file: !1, discriminator: 1) +!17 = !DILocation(line: 5, scope: !18) +!18 = distinct !DILexicalBlock(scope: !13, file: !1, line: 4, column: 31) +!19 = !DILocation(line: 7, scope: !6) +!20 = !DILocation(line: 3, scope: !6) +!21 = !DILocation(line: 4, scope: !22) +!22 = !DILexicalBlockFile(scope: !13, file: !1, discriminator: 3) diff --git a/llvm-spirv/test/DebugInfo/Generic/linear-dbg-value.ll b/llvm-spirv/test/DebugInfo/Generic/linear-dbg-value.ll index 14b66314e66dc..57e2984383076 100644 --- a/llvm-spirv/test/DebugInfo/Generic/linear-dbg-value.ll +++ b/llvm-spirv/test/DebugInfo/Generic/linear-dbg-value.ll @@ -8,7 +8,7 @@ target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" target triple = "spir64-unknown-unknown" -; CHECK-SPIRV: ExtInstImport [[Set:[0-9]+]] "SPIRV.debug" +; CHECK-SPIRV: ExtInstImport [[Set:[0-9]+]] "OpenCL.DebugInfo.100" ; CHECK-SPIRV: TypeVoid [[Void:[0-9]+]] ; CHECK-SPIRV: ExtInst [[Void]] {{[0-9]+}} [[Set]] DebugValue diff --git a/llvm-spirv/test/DebugInfo/X86/inlined-formal-parameter.ll b/llvm-spirv/test/DebugInfo/X86/inlined-formal-parameter.ll index a8d09dd3951e9..74e345881d700 100644 --- a/llvm-spirv/test/DebugInfo/X86/inlined-formal-parameter.ll +++ b/llvm-spirv/test/DebugInfo/X86/inlined-formal-parameter.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llvm-as < %s -o %t.bc ; RUN: llvm-spirv %t.bc -o %t.spv ; RUN: llvm-spirv -r %t.spv -o - | llvm-dis -o %t.ll diff --git a/llvm-spirv/test/constexpr_phi.ll b/llvm-spirv/test/constexpr_phi.ll new file mode 100644 index 0000000000000..02217cee8ef0b --- /dev/null +++ b/llvm-spirv/test/constexpr_phi.ll @@ -0,0 +1,56 @@ +; RUN: llvm-as %s -o %t.bc +; RUN: llvm-spirv %t.bc 
--spirv-ext=+SPV_INTEL_function_pointers -o %t.spv +; RUN: llvm-spirv %t.spv -to-text -o %t.spt +; RUN: FileCheck < %t.spt %s --check-prefix=CHECK-SPIRV +; RUN: llvm-spirv -r %t.spv -o %t.r.bc +; RUN: llvm-dis %t.r.bc -o %t.r.ll +; RUN: FileCheck < %t.r.ll %s --check-prefix=CHECK-LLVM + +; CHECK-SPIRV: Name [[#F:]] "_Z3runiiPi" +; CHECK-SPIRV: Function [[#]] [[#F]] [[#]] [[#]] +; CHECK-SPIRV: Label [[#L1:]] +; CHECK-SPIRV: CompositeInsert [[#]] [[#Ins1:]] [[#]] [[#]] 0 +; CHECK-SPIRV: CompositeInsert [[#]] [[#Ins2:]] [[#]] [[#Ins1]] 1 +; CHECK-SPIRV: BranchConditional [[#]] [[#L2:]] [[#L3:]] +; CHECK-SPIRV: Label [[#L2]] +; CHECK-SPIRV: CompositeInsert [[#]] [[#Ins3:]] [[#]] [[#]] 0 +; CHECK-SPIRV: CompositeInsert [[#]] [[#Ins4:]] [[#]] [[#Ins3]] 1 +; CHECK-SPIRV: Branch [[#L3]] +; CHECK-SPIRV: Label [[#L3]] +; CHECK-NEXT-SPIRV: Phi [[#]] [[#]] + ; CHECK-SAME-SPIRV: [[#Ins2]] [[#L1]] + ; CHECK-SAME-SPIRV: [[#Ins4]] [[#L2]] + +; CHECK-LLVM: br label %[[#L:]] +; CHECK-LLVM: [[#L]]: +; CHECK-NEXT-LLVM: %[[#]] = phi <2 x i64> [ %[[#]], %[[#]] ], [ %[[#]], %[[#]] ] + + +target triple = "spir-unknown-unknown" + +define dso_local i32 @_Z2f1i(i32 %0) { + %2 = add nsw i32 %0, 1 + ret i32 %2 +} + +define dso_local i32 @_Z2f2i(i32 %0) { + %2 = add nsw i32 %0, 2 + ret i32 %2 +} + +define dso_local i64 @_Z3runiiPi(i32 %0, i32 %1, i32* nocapture %2) local_unnamed_addr { + %4 = icmp slt i32 %0, 10 + br i1 %4, label %5, label %7 + +5: + %6 = add nsw i32 %1, 2 + store i32 %6, i32* %2, align 4 + br label %7 + +7: + %8 = phi <2 x i64> [ , %5 ], [ , %3 ] + %9 = extractelement <2 x i64> %8, i64 0 + %10 = extractelement <2 x i64> %8, i64 1 + %11 = add nsw i64 %9, %10 + ret i64 %11 +} diff --git a/llvm-spirv/test/nullptr-metadata-test.ll b/llvm-spirv/test/nullptr-metadata-test.ll new file mode 100755 index 0000000000000..2a84b5fe3fd5b --- /dev/null +++ b/llvm-spirv/test/nullptr-metadata-test.ll @@ -0,0 +1,10 @@ +; This test ensures that the translator does not crash +; RUN: llvm-as %s -o 
%t.bc +; RUN: llvm-spirv %t.bc -o %t.spv + +; ModuleID = 'test.bc' +target triple = "spir64" + +declare dllexport void @test_func(i32) #0 + +attributes #0 = { "VCSLMSize"="0" } diff --git a/llvm-spirv/test/transcoding/intel_multiple_fpga_loop_attrs.ll b/llvm-spirv/test/transcoding/intel_multiple_fpga_loop_attrs.ll new file mode 100644 index 0000000000000..54547dd454dad --- /dev/null +++ b/llvm-spirv/test/transcoding/intel_multiple_fpga_loop_attrs.ll @@ -0,0 +1,139 @@ +; RUN: llvm-as < %s > %t.bc +; RUN: llvm-spirv %t.bc --spirv-ext=+SPV_INTEL_fpga_loop_controls -o - -spirv-text | FileCheck %s --check-prefix=CHECK-SPIRV + +; RUN: llvm-spirv %t.bc --spirv-ext=+SPV_INTEL_fpga_loop_controls -o %t.spv +; RUN: llvm-spirv -r %t.spv -o %t.rev.bc +; RUN: llvm-dis < %t.rev.bc | FileCheck %s --check-prefix=CHECK-LLVM + +; RUN: llvm-spirv %t.bc -o - -spirv-text | FileCheck %s --check-prefix=CHECK-SPIRV-NEGATIVE + +; RUN: llvm-spirv %t.bc -o %t.spv +; RUN: llvm-spirv -r %t.spv -o %t.rev.bc +; RUN: llvm-dis < %t.rev.bc | FileCheck %s --check-prefix=CHECK-LLVM-NEGATIVE + +; CHECK-SPIRV: Capability FPGALoopControlsINTEL +; CHECK-SPIRV: Extension "SPV_INTEL_fpga_loop_controls" +; CHECK-SPIRV-NEGATIVE-NOT: Capability FPGALoopControlsINTEL +; CHECK-SPIRV-NEGATIVE-NOT: Extension "SPV_INTEL_fpga_loop_controls" +; CHECK-SPIRV: 4522248 3 2 1 1 16 3 0 +; CHECK-SPIRV-NEGATIVE: LoopMerge {{[0-9]+}} {{[0-9]+}} 264 3 2 + +; CHECK-LLVM: br label %for.cond{{[0-9]*}}, !llvm.loop ![[MD:[0-9]+]] +; CHECK-LLVM: ![[MD]] = distinct !{![[MD]], ![[MD_ivdep:[0-9]+]], ![[MD_unroll:[0-9]+]], ![[MD_ii:[0-9]+]], ![[MD_access:[0-9]+]], ![[MD_si:[0-9]+]]} +; CHECK-LLVM: ![[MD_ivdep]] = !{!"llvm.loop.ivdep.safelen", i32 3} +; CHECK-LLVM: ![[MD_unroll]] = !{!"llvm.loop.unroll.count", i32 2} +; CHECK-LLVM: ![[MD_ii]] = !{!"llvm.loop.ii.count", i32 1} +; CHECK-LLVM: ![[MD_access]] = !{!"llvm.loop.parallel_access_indices", !{{[0-9]+}}, i32 3} +; CHECK-LLVM: ![[MD_si]] = 
!{!"llvm.loop.intel.speculated.iterations.count", i32 0} + +; CHECK-LLVM-NEGATIVE: br label %for.cond{{[0-9]*}}, !llvm.loop ![[MD:[0-9]+]] +; CHECK-LLVM-NEGATIVE: ![[MD]] = distinct !{![[MD]], ![[MD_ivdep:[0-9]+]], ![[MD_unroll:[0-9]+]]} +; CHECK-LLVM-NEGATIVE: ![[MD_ivdep]] = !{!"llvm.loop.ivdep.safelen", i32 3} +; CHECK-LLVM-NEGATIVE: ![[MD_unroll]] = !{!"llvm.loop.unroll.count", i32 2} + +; ModuleID = 'intel-fpga-loops.cpp' +source_filename = "intel-fpga-loops.cpp" +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64-unknown-linux-sycldevice" + +%class.anon = type { i8 } + +; Function Attrs: nounwind +define spir_func void @_Z4testv() #0 { +entry: + %a = alloca [10 x i32], align 4 + %i = alloca i32, align 4 + %0 = bitcast [10 x i32]* %a to i8* + call void @llvm.lifetime.start.p0i8(i64 40, i8* %0) #4 + %1 = bitcast i32* %i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %1) #4 + store i32 0, i32* %i, align 4, !tbaa !2 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %2 = load i32, i32* %i, align 4, !tbaa !2 + %cmp = icmp ne i32 %2, 10 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + %3 = bitcast i32* %i to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %3) #4 + br label %for.end + +for.body: ; preds = %for.cond + %4 = load i32, i32* %i, align 4, !tbaa !2 + %idxprom = sext i32 %4 to i64 + %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %a, i64 0, i64 %idxprom, !llvm.index.group !6 + store i32 0, i32* %arrayidx, align 4, !tbaa !2 + br label %for.inc + +for.inc: ; preds = %for.body + %5 = load i32, i32* %i, align 4, !tbaa !2 + %inc = add nsw i32 %5, 1 + store i32 %inc, i32* %i, align 4, !tbaa !2 + br label %for.cond, !llvm.loop !7 + +for.end: ; preds = %for.cond.cleanup + %6 = bitcast [10 x i32]* %a to i8* + call void @llvm.lifetime.end.p0i8(i64 40, i8* %6) #4 + ret void +} + +; Function 
Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #1 + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1 + +; Function Attrs: norecurse nounwind +define i32 @main() #2 { +entry: + %retval = alloca i32, align 4 + %agg.tmp = alloca %class.anon, align 1 + store i32 0, i32* %retval, align 4 + call spir_func void @"_Z18kernel_single_taskIZ4mainE15kernel_functionZ4mainE3$_0EvT0_"(%class.anon* byval(%class.anon) align 1 %agg.tmp) + ret i32 0 +} + +; Function Attrs: nounwind +define internal spir_func void @"_Z18kernel_single_taskIZ4mainE15kernel_functionZ4mainE3$_0EvT0_"(%class.anon* byval(%class.anon) align 1 %kernelFunc) #0 { +entry: + %0 = addrspacecast %class.anon* %kernelFunc to %class.anon addrspace(4)* + call spir_func void @"_ZZ4mainENK3$_0clEv"(%class.anon addrspace(4)* %0) + ret void +} + +; Function Attrs: inlinehint nounwind +define internal spir_func void @"_ZZ4mainENK3$_0clEv"(%class.anon addrspace(4)* %this) #3 align 2 { +entry: + %this.addr = alloca %class.anon addrspace(4)*, align 8 + store %class.anon addrspace(4)* %this, %class.anon addrspace(4)** %this.addr, align 8, !tbaa !13 + %this1 = load %class.anon addrspace(4)*, %class.anon addrspace(4)** %this.addr, align 8 + call spir_func void @_Z4testv() + ret void +} + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind willreturn } +attributes #2 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" 
"min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { inlinehint nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 12.0.0"} +!2 = !{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C++ TBAA"} +!6 = distinct !{} +!7 = distinct !{!7, !8, !9, !10, !11, !12} +!8 = !{!"llvm.loop.parallel_access_indices", !6, i32 3} +!9 = !{!"llvm.loop.ivdep.safelen", i32 3} +!10 = !{!"llvm.loop.ii.count", i32 1} +!11 = !{!"llvm.loop.intel.speculated.iterations.count", i32 0} +!12 = !{!"llvm.loop.unroll.count", i32 2} +!13 = !{!14, !14, i64 0} +!14 = !{!"any pointer", !4, i64 0} diff --git a/llvm-spirv/tools/llvm-spirv/llvm-spirv.cpp b/llvm-spirv/tools/llvm-spirv/llvm-spirv.cpp index 42be0f4f581f2..c25d96577eb4e 100644 --- a/llvm-spirv/tools/llvm-spirv/llvm-spirv.cpp +++ b/llvm-spirv/tools/llvm-spirv/llvm-spirv.cpp @@ -183,6 +183,19 @@ cl::opt SPIRVAllowUnknownIntrinsics( cl::desc("Unknown LLVM intrinsics will be translated as external function " "calls in SPIR-V")); +static cl::opt DebugEIS( + "spirv-debug-info-version", cl::desc("Set SPIR-V debug info version:"), + cl::init(SPIRV::DebugInfoEIS::OpenCL_DebugInfo_100), + cl::values( + clEnumValN(SPIRV::DebugInfoEIS::SPIRV_Debug, "legacy", + "Emit debug info compliant with the SPIRV.debug extended " + 
"instruction set. This option is used for compatibility " + "with older versions of the translator"), + clEnumValN(SPIRV::DebugInfoEIS::OpenCL_DebugInfo_100, "ocl-100", + "Emit debug info compliant with the OpenCL.DebugInfo.100 " + "extended instruction set. This version of SPIR-V debug " + "info format is compatible with the SPIRV-Tools"))); + static std::string removeExt(const std::string &FileName) { size_t Pos = FileName.find_last_of("."); if (Pos != std::string::npos) @@ -568,6 +581,15 @@ int main(int Ac, char **Av) { } } + if (DebugEIS.getNumOccurrences() != 0) { + if (IsReverse) { + errs() << "Note: --spirv-debug-info-version option ignored as it only " + "affects translation from LLVM IR to SPIR-V"; + } else { + Opts.setDebugInfoEIS(DebugEIS); + } + } + #ifdef _SPIRV_SUPPORT_TEXT_FMT if (ToText && (ToBinary || IsReverse || IsRegularization)) { errs() << "Cannot use -to-text with -to-binary, -r, -s\n"; diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 4f92cd8432a6f..866435f92c69e 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -354,7 +354,12 @@ option(LLVM_ENABLE_LIBEDIT "Use libedit if available." ON) option(LLVM_ENABLE_LIBPFM "Use libpfm for performance counters if available." ON) -option(LLVM_ENABLE_THREADS "Use threads if available." ON) +# On z/OS, threads cannot be used because TLS is not supported. +if (CMAKE_SYSTEM_NAME MATCHES "OS390") + option(LLVM_ENABLE_THREADS "Use threads if available." OFF) +else() + option(LLVM_ENABLE_THREADS "Use threads if available." ON) +endif() set(LLVM_ENABLE_ZLIB "ON" CACHE STRING "Use zlib for compression/decompression if available. Can be ON, OFF, or FORCE_ON") @@ -562,6 +567,19 @@ option (LLVM_BUILD_EXTERNAL_COMPILER_RT option (LLVM_VERSION_PRINTER_SHOW_HOST_TARGET_INFO "Show target and host info when tools are invoked with --version." ON) +option(LLVM_INTEGRATED_CRT_ALLOC "Replace the Windows CRT allocator with any of {rpmalloc|mimalloc|snmalloc}. Only works with /MT enabled." 
OFF) +if(LLVM_INTEGRATED_CRT_ALLOC) + if(NOT WIN32) + message(FATAL_ERROR "LLVM_INTEGRATED_CRT_ALLOC is only supported on Windows.") + endif() + if(LLVM_USE_SANITIZER) + message(FATAL_ERROR "LLVM_INTEGRATED_CRT_ALLOC cannot be used along with LLVM_USE_SANITIZER!") + endif() + if(CMAKE_BUILD_TYPE AND uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG") + message(FATAL_ERROR "The Debug target isn't supported along with LLVM_INTEGRATED_CRT_ALLOC!") + endif() +endif() + # You can configure which libraries from LLVM you want to include in the # shared library by setting LLVM_DYLIB_COMPONENTS to a semi-colon delimited # list of LLVM components. All component names handled by llvm-config are valid. @@ -932,6 +950,13 @@ if (UNIX AND ${CMAKE_SYSTEM_NAME} MATCHES "AIX") string(APPEND CMAKE_SHARED_LINKER_FLAGS " -shared") endif() +# Build with _XOPEN_SOURCE on z/OS. +if (CMAKE_SYSTEM_NAME MATCHES "OS390") + add_definitions("-D_XOPEN_SOURCE=600") + add_definitions("-D_OPEN_SYS") # Needed for process information. + add_definitions("-D_OPEN_SYS_FILE_EXT") # Needed for EBCDIC I/O. +endif() + # Build with _FILE_OFFSET_BITS=64 on Solaris to match g++ >= 9. if (UNIX AND ${CMAKE_SYSTEM_NAME} MATCHES "SunOS") add_definitions("-D_FILE_OFFSET_BITS=64") @@ -942,7 +967,7 @@ endif() # check its symbols. This is wasteful (the check was done when foo.so # was created) and can fail since it is not the dynamic linker and # doesn't know how to handle search paths correctly. 
-if (UNIX AND NOT APPLE AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "SunOS|AIX") +if (UNIX AND NOT APPLE AND NOT CMAKE_SYSTEM_NAME MATCHES "SunOS|AIX|OS390") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-allow-shlib-undefined") endif() diff --git a/llvm/bindings/go/llvm/executionengine_test.go b/llvm/bindings/go/llvm/executionengine_test.go index 4462f8fb20468..2369826db9147 100644 --- a/llvm/bindings/go/llvm/executionengine_test.go +++ b/llvm/bindings/go/llvm/executionengine_test.go @@ -80,7 +80,6 @@ func TestFactorial(t *testing.T) { pass := NewPassManager() defer pass.Dispose() - pass.AddConstantPropagationPass() pass.AddInstructionCombiningPass() pass.AddPromoteMemoryToRegisterPass() pass.AddGVNPass() diff --git a/llvm/bindings/go/llvm/transforms_scalar.go b/llvm/bindings/go/llvm/transforms_scalar.go index 36fc13e00696b..d1b54bd2a1913 100644 --- a/llvm/bindings/go/llvm/transforms_scalar.go +++ b/llvm/bindings/go/llvm/transforms_scalar.go @@ -40,6 +40,5 @@ func (pm PassManager) AddScalarReplAggregatesPassWithThreshold(threshold int) { } func (pm PassManager) AddSimplifyLibCallsPass() { C.LLVMAddSimplifyLibCallsPass(pm.C) } func (pm PassManager) AddTailCallEliminationPass() { C.LLVMAddTailCallEliminationPass(pm.C) } -func (pm PassManager) AddConstantPropagationPass() { C.LLVMAddConstantPropagationPass(pm.C) } func (pm PassManager) AddDemoteMemoryToRegisterPass() { C.LLVMAddDemoteMemoryToRegisterPass(pm.C) } func (pm PassManager) AddVerifierPass() { C.LLVMAddVerifierPass(pm.C) } diff --git a/llvm/bindings/ocaml/transforms/scalar_opts/llvm_scalar_opts.ml b/llvm/bindings/ocaml/transforms/scalar_opts/llvm_scalar_opts.ml index 4d905533936a6..b1b0db39a91fe 100644 --- a/llvm/bindings/ocaml/transforms/scalar_opts/llvm_scalar_opts.ml +++ b/llvm/bindings/ocaml/transforms/scalar_opts/llvm_scalar_opts.ml @@ -96,9 +96,6 @@ external add_lib_call_simplification external add_tail_call_elimination : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit = 
"llvm_add_tail_call_elimination" -external add_constant_propagation - : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit - = "llvm_add_constant_propagation" external add_memory_to_register_demotion : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit = "llvm_add_demote_memory_to_register" diff --git a/llvm/bindings/ocaml/transforms/scalar_opts/llvm_scalar_opts.mli b/llvm/bindings/ocaml/transforms/scalar_opts/llvm_scalar_opts.mli index 117218f06608e..bd57ba1136bb5 100644 --- a/llvm/bindings/ocaml/transforms/scalar_opts/llvm_scalar_opts.mli +++ b/llvm/bindings/ocaml/transforms/scalar_opts/llvm_scalar_opts.mli @@ -161,11 +161,6 @@ external add_tail_call_elimination : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit = "llvm_add_tail_call_elimination" -(** See the [llvm::createConstantPropagationPass] function. *) -external add_constant_propagation - : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit - = "llvm_add_constant_propagation" - (** See the [llvm::createDemoteMemoryToRegisterPass] function. 
*) external add_memory_to_register_demotion : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit diff --git a/llvm/bindings/ocaml/transforms/scalar_opts/scalar_opts_ocaml.c b/llvm/bindings/ocaml/transforms/scalar_opts/scalar_opts_ocaml.c index 8d10989bd6671..1e794c9241d6c 100644 --- a/llvm/bindings/ocaml/transforms/scalar_opts/scalar_opts_ocaml.c +++ b/llvm/bindings/ocaml/transforms/scalar_opts/scalar_opts_ocaml.c @@ -200,12 +200,6 @@ CAMLprim value llvm_add_tail_call_elimination(LLVMPassManagerRef PM) { return Val_unit; } -/* [ unit */ -CAMLprim value llvm_add_constant_propagation(LLVMPassManagerRef PM) { - LLVMAddConstantPropagationPass(PM); - return Val_unit; -} - /* [ unit */ CAMLprim value llvm_add_demote_memory_to_register(LLVMPassManagerRef PM) { LLVMAddDemoteMemoryToRegisterPass(PM); diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake index 11abb32c9072e..72505190e347f 100644 --- a/llvm/cmake/config-ix.cmake +++ b/llvm/cmake/config-ix.cmake @@ -54,6 +54,7 @@ check_include_file(sys/resource.h HAVE_SYS_RESOURCE_H) check_include_file(sys/stat.h HAVE_SYS_STAT_H) check_include_file(sys/time.h HAVE_SYS_TIME_H) check_include_file(sys/types.h HAVE_SYS_TYPES_H) +check_include_file(sysexits.h HAVE_SYSEXITS_H) check_include_file(termios.h HAVE_TERMIOS_H) check_include_file(unistd.h HAVE_UNISTD_H) check_include_file(valgrind/valgrind.h HAVE_VALGRIND_VALGRIND_H) @@ -147,18 +148,19 @@ if(NOT LLVM_USE_SANITIZER MATCHES "Memory.*") else() set(HAVE_LIBEDIT 0) endif() - if(LLVM_ENABLE_TERMINFO STREQUAL FORCE_ON) - set(MAYBE_REQUIRED REQUIRED) - else() - set(MAYBE_REQUIRED) - endif() if(LLVM_ENABLE_TERMINFO) - find_library(TERMINFO_LIB NAMES terminfo tinfo curses ncurses ncursesw ${MAYBE_REQUIRED}) - endif() - if(TERMINFO_LIB) - set(LLVM_ENABLE_TERMINFO 1) + set(HAVE_TERMINFO 0) + foreach(library terminfo tinfo curses ncurses ncursesw) + string(TOUPPER ${library} library_suffix) + check_library_exists(${library} setupterm "" HAVE_TERMINFO_${library_suffix}) 
+ if(HAVE_TERMINFO_${library_suffix}) + set(HAVE_TERMINFO 1) + set(TERMINFO_LIBS "${library}") + break() + endif() + endforeach() else() - set(LLVM_ENABLE_TERMINFO 0) + set(HAVE_TERMINFO 0) endif() find_library(ICONV_LIBRARY_PATH NAMES iconv libiconv libiconv-2 c) @@ -175,11 +177,7 @@ if(NOT LLVM_USE_SANITIZER MATCHES "Memory.*") endif() endif() endif() - else() - set(LLVM_ENABLE_TERMINFO 0) endif() -else() - set(LLVM_ENABLE_TERMINFO 0) endif() if (LLVM_ENABLE_LIBXML2 STREQUAL "FORCE_ON" AND NOT LLVM_LIBXML2_ENABLED) diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake index 7b8077efab510..a40cf17426fe0 100644 --- a/llvm/cmake/modules/AddLLVM.cmake +++ b/llvm/cmake/modules/AddLLVM.cmake @@ -221,7 +221,7 @@ function(add_link_opts target_name) # Pass -O3 to the linker. This enabled different optimizations on different # linkers. - if(NOT (${CMAKE_SYSTEM_NAME} MATCHES "Darwin|SunOS|AIX" OR WIN32)) + if(NOT (CMAKE_SYSTEM_NAME MATCHES "Darwin|SunOS|AIX|OS390" OR WIN32)) set_property(TARGET ${target_name} APPEND_STRING PROPERTY LINK_FLAGS " -Wl,-O3") endif() @@ -249,11 +249,12 @@ function(add_link_opts target_name) LINK_FLAGS " -Wl,-z,discard-unused=sections") endif() elseif(NOT WIN32 AND NOT LLVM_LINKER_IS_GOLD AND - NOT ${CMAKE_SYSTEM_NAME} MATCHES "OpenBSD|AIX") + NOT CMAKE_SYSTEM_NAME MATCHES "OpenBSD|AIX|OS390") # Object files are compiled with -ffunction-data-sections. # Versions of bfd ld < 2.23.1 have a bug in --gc-sections that breaks # tools that use plugins. Always pass --gc-sections once we require # a newer linker. + # TODO Revisit this later on z/OS. 
set_property(TARGET ${target_name} APPEND_STRING PROPERTY LINK_FLAGS " -Wl,--gc-sections") endif() @@ -1400,9 +1401,6 @@ function(add_unittest test_suite test_name) set(EXCLUDE_FROM_ALL ON) endif() - include_directories(${LLVM_MAIN_SRC_DIR}/utils/unittest/googletest/include) - include_directories(${LLVM_MAIN_SRC_DIR}/utils/unittest/googlemock/include) - if (SUPPORTS_VARIADIC_MACROS_FLAG) list(APPEND LLVM_COMPILE_FLAGS "-Wno-variadic-macros") endif () diff --git a/llvm/cmake/modules/GetHostTriple.cmake b/llvm/cmake/modules/GetHostTriple.cmake index 251ca1a32b141..a5f033c1110de 100644 --- a/llvm/cmake/modules/GetHostTriple.cmake +++ b/llvm/cmake/modules/GetHostTriple.cmake @@ -14,7 +14,9 @@ function( get_host_triple var ) else() set( value "i686-pc-windows-gnu" ) endif() - elseif( CMAKE_HOST_SYSTEM_NAME STREQUAL AIX ) + elseif( CMAKE_SYSTEM_NAME MATCHES "OS390" ) + set( value "s390x-ibm-zos" ) + elseif( CMAKE_SYSTEM_NAME STREQUAL AIX ) # We defer to dynamic detection of the host AIX version. if( CMAKE_SIZEOF_VOID_P EQUAL 8 ) set( value "powerpc64-ibm-aix" ) diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake index 4feb4b7a7f300..3a0761d56f62e 100644 --- a/llvm/cmake/modules/HandleLLVMOptions.cmake +++ b/llvm/cmake/modules/HandleLLVMOptions.cmake @@ -144,6 +144,10 @@ else(WIN32) endif(FUCHSIA OR UNIX) endif(WIN32) +if (CMAKE_SYSTEM_NAME MATCHES "OS390") + set(LLVM_HAVE_LINK_VERSION_SCRIPT 0) +endif() + set(EXEEXT ${CMAKE_EXECUTABLE_SUFFIX}) set(LTDL_SHLIB_EXT ${CMAKE_SHARED_LIBRARY_SUFFIX}) @@ -207,7 +211,7 @@ endif() # Pass -Wl,-z,defs. This makes sure all symbols are defined. Otherwise a DSO # build might work on ELF but fail on MachO/COFF. 
-if(NOT (${CMAKE_SYSTEM_NAME} MATCHES "Darwin|FreeBSD|OpenBSD|DragonFly|AIX|SunOS" OR +if(NOT (CMAKE_SYSTEM_NAME MATCHES "Darwin|FreeBSD|OpenBSD|DragonFly|AIX|SunOS|OS390" OR WIN32 OR CYGWIN) AND NOT LLVM_USE_SANITIZER) set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-z,defs") diff --git a/llvm/cmake/modules/LLVMExternalProjectUtils.cmake b/llvm/cmake/modules/LLVMExternalProjectUtils.cmake index 0aeaf663ad1c5..db3a536b0c249 100644 --- a/llvm/cmake/modules/LLVMExternalProjectUtils.cmake +++ b/llvm/cmake/modules/LLVMExternalProjectUtils.cmake @@ -53,8 +53,10 @@ function(llvm_ExternalProject_Add name source_dir) endforeach() if(NOT ARG_TOOLCHAIN_TOOLS) - set(ARG_TOOLCHAIN_TOOLS clang lld llvm-ar llvm-lipo llvm-ranlib llvm-nm llvm-objdump) - if(NOT _cmake_system_name STREQUAL Darwin) + set(ARG_TOOLCHAIN_TOOLS clang lld llvm-ar llvm-ranlib llvm-nm llvm-objdump) + if(_cmake_system_name STREQUAL Darwin) + list(APPEND ARG_TOOLCHAIN_TOOLS llvm-libtool-darwin llvm-lipo) + else() # TODO: These tools don't fully support Mach-O format yet. 
list(APPEND ARG_TOOLCHAIN_TOOLS llvm-objcopy llvm-strip) endif() @@ -144,6 +146,9 @@ function(llvm_ExternalProject_Add name source_dir) if(llvm-ar IN_LIST TOOLCHAIN_TOOLS) list(APPEND compiler_args -DCMAKE_AR=${LLVM_RUNTIME_OUTPUT_INTDIR}/llvm-ar${CMAKE_EXECUTABLE_SUFFIX}) endif() + if(llvm-libtool-darwin IN_LIST TOOLCHAIN_TOOLS) + list(APPEND compiler_args -DCMAKE_LIBTOOL=${LLVM_RUNTIME_OUTPUT_INTDIR}/llvm-libtool-darwin${CMAKE_EXECUTABLE_SUFFIX}) + endif() if(llvm-lipo IN_LIST TOOLCHAIN_TOOLS) list(APPEND compiler_args -DCMAKE_LIPO=${LLVM_RUNTIME_OUTPUT_INTDIR}/llvm-lipo${CMAKE_EXECUTABLE_SUFFIX}) endif() diff --git a/llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst b/llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst index 9887196d330fc..8cc29803f2182 100644 --- a/llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst +++ b/llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst @@ -106,7 +106,7 @@ programming languages used in ML and HPC. The extensions also include improved support for optimized code on any architecture. Some of the generalizations may also benefit other issues that have been raised. -The extensions have evolved though collaboration with many individuals and +The extensions have evolved through collaboration with many individuals and active prototyping within the GDB debugger and LLVM compiler. Input has also been very much appreciated from the developers working on the Perforce TotalView HPC Debugger and GCC compiler. @@ -147,7 +147,7 @@ be generated to describe the CFI as only a single expression is required for the whole vector register, rather than a separate expression for each lane's dword of the vector register. It also allows the compiler to produce DWARF that indexes the vector register if it spills scalar registers into portions -of a vector registers. +of a vector register. 
Since DWARF stack value entries have a base type and AMDGPU registers are a vector of dwords, the ability to specify that a base type is a vector is diff --git a/llvm/docs/CMake.rst b/llvm/docs/CMake.rst index c45f854c98493..cba6308b03ec0 100644 --- a/llvm/docs/CMake.rst +++ b/llvm/docs/CMake.rst @@ -461,6 +461,23 @@ LLVM-specific variables **LLVM_PARALLEL_LINK_JOBS**:STRING Define the maximum number of concurrent link jobs. +**LLVM_USE_CRT_{target}**:STRING + On Windows, tells which version of the C runtime library (CRT) should be used. + For example, -DLLVM_USE_CRT_RELEASE=MT would statically link the CRT into the + LLVM tools and library. + +**LLVM_INTEGRATED_CRT_ALLOC**:PATH + On Windows, allows embedding a different C runtime allocator into the LLVM + tools and libraries. Using a lock-free allocator such as the ones listed below + greatly decreases ThinLTO link time by about an order of magnitude. It also + mildly improves Clang build times, by about 5-10%. At the moment, rpmalloc, + snmalloc and mimalloc are supported. Use the path to `git clone` to select + the respective allocator, for example: + D:\git> git clone https://github.com/mjansson/rpmalloc + D:\llvm-project> cmake ... -DLLVM_INTEGRATED_CRT_ALLOC=D:\git\rpmalloc + This flag needs to be used along with the static CRT, i.e. if building the + Release target, add -DLLVM_USE_CRT_RELEASE=MT. + **LLVM_BUILD_DOCS**:BOOL Adds all *enabled* documentation targets (i.e. Doxgyen and Sphinx targets) as dependencies of the default build targets. This results in all of the (enabled) diff --git a/llvm/docs/CommandGuide/FileCheck.rst b/llvm/docs/CommandGuide/FileCheck.rst index 80c48534959e8..088c141f89778 100644 --- a/llvm/docs/CommandGuide/FileCheck.rst +++ b/llvm/docs/CommandGuide/FileCheck.rst @@ -430,7 +430,7 @@ from ``baz``. 
To fix this, you could add ``CHECK-NEXT`` matchers for every CHECK: Value: CHECK-SAME: {{ 1$}} -This verifies that the *next* time "``Value:``" appears in the ouput, it has +This verifies that the *next* time "``Value:``" appears in the output, it has the value ``1``. Note: a "``CHECK-SAME:``" cannot be the first directive in a file. diff --git a/llvm/docs/CommandGuide/llvm-libtool-darwin.rst b/llvm/docs/CommandGuide/llvm-libtool-darwin.rst index a3c7aba6cf501..6be849a22fd73 100644 --- a/llvm/docs/CommandGuide/llvm-libtool-darwin.rst +++ b/llvm/docs/CommandGuide/llvm-libtool-darwin.rst @@ -76,9 +76,13 @@ OPTIONS Use actual timestamps and UIDs/GIDs. +.. option:: -V + + Display the version of this program and perform any operation specified. + .. option:: -version - Display the version of this program. + Display the version of this program and exit immediately. EXIT STATUS ----------- diff --git a/llvm/docs/CommandLine.rst b/llvm/docs/CommandLine.rst index 431ebc0e67e67..c67e73373ebd2 100644 --- a/llvm/docs/CommandLine.rst +++ b/llvm/docs/CommandLine.rst @@ -475,7 +475,7 @@ Parsing a list of options Now that we have the standard run-of-the-mill argument types out of the way, lets get a little wild and crazy. Lets say that we want our optimizer to accept a **list** of optimizations to perform, allowing duplicates. For example, we -might want to run: "``compiler -dce -constprop -inline -dce -strip``". In this +might want to run: "``compiler -dce -instsimplify -inline -dce -strip``". In this case, the order of the arguments and the number of appearances is very important. This is what the "``cl::list``" template is for. 
First, start by defining an enum of the optimizations that you would like to perform: @@ -484,7 +484,7 @@ defining an enum of the optimizations that you would like to perform: enum Opts { // 'inline' is a C++ keyword, so name it 'inlining' - dce, constprop, inlining, strip + dce, instsimplify, inlining, strip }; Then define your "``cl::list``" variable: @@ -494,7 +494,7 @@ Then define your "``cl::list``" variable: cl::list OptimizationList(cl::desc("Available Optimizations:"), cl::values( clEnumVal(dce , "Dead Code Elimination"), - clEnumVal(constprop , "Constant Propagation"), + clEnumVal(instsimplify , "Instruction Simplification"), clEnumValN(inlining, "inline", "Procedure Integration"), clEnumVal(strip , "Strip Symbols"))); @@ -553,16 +553,16 @@ Reworking the above list example, we could replace `cl::list`_ with `cl::bits`_: cl::bits OptimizationBits(cl::desc("Available Optimizations:"), cl::values( clEnumVal(dce , "Dead Code Elimination"), - clEnumVal(constprop , "Constant Propagation"), + clEnumVal(instsimplify , "Instruction Simplification"), clEnumValN(inlining, "inline", "Procedure Integration"), clEnumVal(strip , "Strip Symbols"))); -To test to see if ``constprop`` was specified, we can use the ``cl:bits::isSet`` +To test to see if ``instsimplify`` was specified, we can use the ``cl:bits::isSet`` function: .. code-block:: c++ - if (OptimizationBits.isSet(constprop)) { + if (OptimizationBits.isSet(instsimplify)) { ... } diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index f33b562ec19fc..fdeb5d9b51961 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -1157,10 +1157,11 @@ Currently, only the following parameter attributes are defined: ``align `` or ``align()`` This indicates that the pointer value may be assumed by the optimizer to have the specified alignment. If the pointer value does not have the - specified alignment, behavior is undefined. + specified alignment, behavior is undefined. 
``align 1`` has no effect on + non-byval, non-preallocated arguments. Note that this attribute has additional semantics when combined with the - ``byval`` attribute, which are documented there. + ``byval`` or ``preallocated`` attribute, which are documented there. .. _noalias: @@ -12478,8 +12479,8 @@ overlap. It copies "len" bytes of memory over. If the argument is known to be aligned to some boundary, this can be specified as an attribute on the argument. -If "len" is 0, the pointers may be NULL or dangling. However, they must still -be appropriately aligned. +If "len" is 0, the pointers may be NULL, dangling, ``undef``, or ``poison`` +pointers. However, they must still be appropriately aligned. .. _int_memcpy_inline: @@ -12535,8 +12536,8 @@ overlap. It copies "len" bytes of memory over. If the argument is known to be aligned to some boundary, this can be specified as an attribute on the argument. -If "len" is 0, the pointers may be NULL or dangling. However, they must still -be appropriately aligned. +If "len" is 0, the pointers may be NULL, dangling, ``undef``, or ``poison`` +pointers. However, they must still be appropriately aligned. The generated code is guaranteed not to call any external functions. @@ -12595,8 +12596,8 @@ copies "len" bytes of memory over. If the argument is known to be aligned to some boundary, this can be specified as an attribute on the argument. -If "len" is 0, the pointers may be NULL or dangling. However, they must still -be appropriately aligned. +If "len" is 0, the pointers may be NULL, dangling, ``undef``, or ``poison`` +pointers. However, they must still be appropriately aligned. .. _int_memset: @@ -12650,8 +12651,8 @@ at the destination location. If the argument is known to be aligned to some boundary, this can be specified as an attribute on the argument. -If "len" is 0, the pointers may be NULL or dangling. However, they must still -be appropriately aligned. 
+If "len" is 0, the pointer may be NULL, dangling, ``undef``, or ``poison`` +pointer. However, it must still be appropriately aligned. '``llvm.sqrt.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16930,27 +16931,28 @@ to: :: - %m[i] = icmp ule (%base + i), %n + %m[i] = icmp ult (%base + i), %n where ``%m`` is a vector (mask) of active/inactive lanes with its elements indexed by ``i``, and ``%base``, ``%n`` are the two arguments to -``llvm.get.active.lane.mask.*``, ``%imcp`` is an integer compare and ``ule`` -the unsigned less-than-equal comparison operator. Overflow cannot occur in +``llvm.get.active.lane.mask.*``, ``%icmp`` is an integer compare and ``ult`` +the unsigned less-than comparison operator. Overflow cannot occur in ``(%base + i)`` and its comparison against ``%n`` as it is performed in integer -numbers and not in machine numbers. The above is equivalent to: +numbers and not in machine numbers. If ``%n`` is ``0``, then the result is a +poison value. The above is equivalent to: :: %m = @llvm.get.active.lane.mask(%base, %n) -This can, for example, be emitted by the loop vectorizer. Then, ``%base`` is -the first element of the vector induction variable (VIV), and ``%n`` is the -Back-edge Taken Count (BTC). Thus, these intrinsics perform an element-wise -less than or equal comparison of VIV with BTC, producing a mask of true/false -values representing active/inactive vector lanes, except if the VIV overflows -in which case they return false in the lanes where the VIV overflows. The -arguments are scalar types to accommodate scalable vector types, for which it is -unknown what the type of the step vector needs to be that enumerate its +This can, for example, be emitted by the loop vectorizer in which case +``%base`` is the first element of the vector induction variable (VIV) and +``%n`` is the loop tripcount. 
Thus, these intrinsics perform an element-wise +less than comparison of VIV with the loop tripcount, producing a mask of +true/false values representing active/inactive vector lanes, except if the VIV +overflows in which case they return false in the lanes where the VIV overflows. +The arguments are scalar types to accommodate scalable vector types, for which +it is unknown what the type of the step vector needs to be that enumerate its lanes without overflow. This mask ``%m`` can e.g. be used in masked load/store instructions. These diff --git a/llvm/docs/Passes.rst b/llvm/docs/Passes.rst index 2ff28eb09e54b..202e3ab223d6b 100644 --- a/llvm/docs/Passes.rst +++ b/llvm/docs/Passes.rst @@ -460,27 +460,6 @@ shared. This is useful because some passes (i.e., TraceValues) insert a lot of string constants into the program, regardless of whether or not an existing string is available. -``-constprop``: Simple constant propagation -------------------------------------------- - -This pass implements constant propagation and merging. It looks for -instructions involving only constant operands and replaces them with a constant -value instead of an instruction. For example: - -.. code-block:: llvm - - add i32 1, 2 - -becomes - -.. code-block:: llvm - - i32 3 - -NOTE: this pass has a habit of making definitions be dead. It is a good idea -to run a :ref:`Dead Instruction Elimination ` pass sometime after -running this pass. - .. _passes-dce: ``-dce``: Dead Code Elimination diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 2a2775f13dd1f..d30cca5440197 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -97,6 +97,15 @@ During this release ... * The 'mpx' feature was removed from the backend. It had been removed from clang frontend in 10.0. Mention of the 'mpx' feature in an IR file will print a message to stderr, but IR should still compile. +* Support for -march=sapphirerapids was added. 
+* The assembler now has support for {disp32} and {disp8} pseudo prefixes for + controlling displacement size for memory operands and jump displacements. The + assembler also supports the .d32 and .d8 mnemonic suffixes to do the same. +* A new function attribute "tune-cpu" has been added to support -mtune like gcc. + This allows microarchitectural optimizations to be applied independent from + the "target-cpu" attribute or TargetMachine CPU which will be used to select + Instruction Set. If the attribute is not present, the tune CPU will follow + the target CPU. Changes to the AMDGPU Target ----------------------------- @@ -148,6 +157,14 @@ Changes to the LLVM tools Changes to LLDB --------------------------------- +Changes to Sanitizers +--------------------- + +The integer sanitizer `-fsanitize=integer` now has a new sanitizer: +`-fsanitize=unsigned-shift-base`. It's not undefined behavior for an unsigned +left shift to overflow (i.e. to shift bits out), but it has been the source of +bugs and exploits in certain codebases in the past. + External Open Source Projects Using LLVM 12 =========================================== diff --git a/llvm/examples/OrcV2Examples/LLJITWithThinLTOSummaries/CMakeLists.txt b/llvm/examples/OrcV2Examples/LLJITWithThinLTOSummaries/CMakeLists.txt index 22262e4eb5b97..175e655007050 100644 --- a/llvm/examples/OrcV2Examples/LLJITWithThinLTOSummaries/CMakeLists.txt +++ b/llvm/examples/OrcV2Examples/LLJITWithThinLTOSummaries/CMakeLists.txt @@ -1,7 +1,7 @@ set(LLVM_LINK_COMPONENTS + BitReader Core ExecutionEngine - IRReader OrcJIT Support nativecodegen diff --git a/llvm/include/llvm-c/Transforms/Scalar.h b/llvm/include/llvm-c/Transforms/Scalar.h index 93d79a2051950..8b0a4d2642a98 100644 --- a/llvm/include/llvm-c/Transforms/Scalar.h +++ b/llvm/include/llvm-c/Transforms/Scalar.h @@ -125,9 +125,6 @@ void LLVMAddSimplifyLibCallsPass(LLVMPassManagerRef PM); /** See llvm::createTailCallEliminationPass function. 
*/ void LLVMAddTailCallEliminationPass(LLVMPassManagerRef PM); -/** See llvm::createConstantPropagationPass function. */ -void LLVMAddConstantPropagationPass(LLVMPassManagerRef PM); - /** See llvm::demotePromoteMemoryToRegisterPass function. */ void LLVMAddDemoteMemoryToRegisterPass(LLVMPassManagerRef PM); diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 06d354411af69..a3e624842700b 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -128,6 +128,11 @@ class IntrinsicCostAttributes { IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI, unsigned Factor); + IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI, + ElementCount Factor) + : IntrinsicCostAttributes(Id, CI, Factor.Min) { + assert(!Factor.Scalable); + } IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI, unsigned Factor, unsigned ScalarCost); diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index ee1527ff88819..bb70b97870804 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -448,13 +448,15 @@ class TargetTransformInfoImplBase { // Identity and pointer-to-pointer casts are free. return 0; break; - case Instruction::Trunc: + case Instruction::Trunc: { // trunc to a native type is free (assuming the target has compare and // shift-right of the same width). 
- if (DL.isLegalInteger(DL.getTypeSizeInBits(Dst))) + TypeSize DstSize = DL.getTypeSizeInBits(Dst); + if (!DstSize.isScalable() && DL.isLegalInteger(DstSize.getFixedSize())) return 0; break; } + } return 1; } diff --git a/llvm/include/llvm/Analysis/Utils/TFUtils.h b/llvm/include/llvm/Analysis/Utils/TFUtils.h index a6cfb16113c1d..bba275b2524fd 100644 --- a/llvm/include/llvm/Analysis/Utils/TFUtils.h +++ b/llvm/include/llvm/Analysis/Utils/TFUtils.h @@ -90,6 +90,13 @@ class TensorSpec final { size_t ElementCount = 0; }; +/// Construct a TensorSpec from a JSON dictionary of the form: +/// { "name": , +/// "port": , +/// "type": , +/// "shape": } +/// For the "type" field, see the C++ primitive types used in +/// TFUTILS_SUPPORTED_TYPES. Optional getTensorSpecFromJSON(LLVMContext &Ctx, const json::Value &Value); @@ -155,23 +162,22 @@ class TFModelEvaluator final { std::unique_ptr Impl; }; -/// List of supported types, as a triple: -/// C++ type -/// short name (for strings, for instance) -/// capitalized short name (for enums, for instance) +/// List of supported types, as a pair: +/// - C++ type +/// - enum name (implementation-specific) #define TFUTILS_SUPPORTED_TYPES(M) \ - M(float, float, FLOAT) \ - M(double, double, DOUBLE) \ - M(int8_t, int8, INT8) \ - M(uint8_t, uint8, UINT8) \ - M(int16_t, int16, INT16) \ - M(uint16_t, uint16, UINT16) \ - M(int32_t, int32, INT32) \ - M(uint32_t, uint32, UINT32) \ - M(int64_t, int64, INT64) \ - M(uint64_t, uint64, UINT64) - -#define TFUTILS_GETDATATYPE_DEF(T, S, C) \ + M(float, TF_FLOAT) \ + M(double, TF_DOUBLE) \ + M(int8_t, TF_INT8) \ + M(uint8_t, TF_UINT8) \ + M(int16_t, TF_INT16) \ + M(uint16_t, TF_UINT16) \ + M(int32_t, TF_INT32) \ + M(uint32_t, TF_UINT32) \ + M(int64_t, TF_INT64) \ + M(uint64_t, TF_UINT64) + +#define TFUTILS_GETDATATYPE_DEF(T, E) \ template <> int TensorSpec::getDataType(); TFUTILS_SUPPORTED_TYPES(TFUTILS_GETDATATYPE_DEF) diff --git a/llvm/include/llvm/Analysis/ValueTracking.h 
b/llvm/include/llvm/Analysis/ValueTracking.h index f1b9cc906049b..f9a27a8ec4b09 100644 --- a/llvm/include/llvm/Analysis/ValueTracking.h +++ b/llvm/include/llvm/Analysis/ValueTracking.h @@ -413,10 +413,12 @@ constexpr unsigned MaxAnalysisRecursionDepth = 6; bool getUnderlyingObjectsForCodeGen(const Value *V, SmallVectorImpl &Objects); - /// Finds alloca where the value comes from. - AllocaInst *findAllocaForValue(Value *V); - inline const AllocaInst *findAllocaForValue(const Value *V) { - return findAllocaForValue(const_cast(V)); + /// Returns unique alloca where the value comes from, or nullptr. + /// If OffsetZero is true, check that V points to the beginning of the alloca. + AllocaInst *findAllocaForValue(Value *V, bool OffsetZero = false); + inline const AllocaInst *findAllocaForValue(const Value *V, + bool OffsetZero = false) { + return findAllocaForValue(const_cast(V), OffsetZero); } /// Return true if the only users of this pointer are lifetime markers. @@ -584,10 +586,10 @@ constexpr unsigned MaxAnalysisRecursionDepth = 6; /// getGuaranteedNonPoisonOp. bool propagatesPoison(const Instruction *I); - /// Return either nullptr or an operand of I such that I will trigger - /// undefined behavior if I is executed and that operand has a poison - /// value. - const Value *getGuaranteedNonPoisonOp(const Instruction *I); + /// Insert operands of I into Ops such that I will trigger undefined behavior + /// if I is executed and that operand has a poison value. + void getGuaranteedNonPoisonOps(const Instruction *I, + SmallPtrSetImpl &Ops); /// Return true if the given instruction must trigger undefined behavior. 
/// when I is executed with any operands which appear in KnownPoison holding diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h index f77048d45d012..527bba67b2579 100644 --- a/llvm/include/llvm/Analysis/VectorUtils.h +++ b/llvm/include/llvm/Analysis/VectorUtils.h @@ -300,13 +300,17 @@ namespace Intrinsic { typedef unsigned ID; } -/// A helper function for converting Scalar types to vector types. -/// If the incoming type is void, we return void. If the VF is 1, we return -/// the scalar type. -inline Type *ToVectorTy(Type *Scalar, unsigned VF, bool isScalable = false) { - if (Scalar->isVoidTy() || VF == 1) +/// A helper function for converting Scalar types to vector types. If +/// the incoming type is void, we return void. If the EC represents a +/// scalar, we return the scalar type. +inline Type *ToVectorTy(Type *Scalar, ElementCount EC) { + if (Scalar->isVoidTy() || EC.isScalar()) return Scalar; - return VectorType::get(Scalar, ElementCount::get(VF, isScalable)); + return VectorType::get(Scalar, EC); +} + +inline Type *ToVectorTy(Type *Scalar, unsigned VF) { + return ToVectorTy(Scalar, ElementCount::getFixed(VF)); } /// Identify if the intrinsic is trivially vectorizable. diff --git a/llvm/include/llvm/Bitstream/BitstreamWriter.h b/llvm/include/llvm/Bitstream/BitstreamWriter.h index c0ead19dc71db..162a0fea09132 100644 --- a/llvm/include/llvm/Bitstream/BitstreamWriter.h +++ b/llvm/include/llvm/Bitstream/BitstreamWriter.h @@ -103,7 +103,7 @@ class BitstreamWriter { /// with the specified value. 
void BackpatchWord(uint64_t BitNo, unsigned NewWord) { using namespace llvm::support; - unsigned ByteNo = BitNo / 8; + uint64_t ByteNo = BitNo / 8; assert((!endian::readAtBitAlignment( &Out[ByteNo], BitNo & 7)) && "Expected to be patching over 0-value placeholders"); diff --git a/llvm/include/llvm/CodeGen/CommandFlags.h b/llvm/include/llvm/CodeGen/CommandFlags.h index 1b77556dcbb17..39df106b97ef0 100644 --- a/llvm/include/llvm/CodeGen/CommandFlags.h +++ b/llvm/include/llvm/CodeGen/CommandFlags.h @@ -116,6 +116,8 @@ bool getEmitCallSiteInfo(); bool getEnableDebugEntryValues(); +bool getValueTrackingVariableLocations(); + bool getForceDwarfFrameSection(); bool getXRayOmitFunctionIndex(); diff --git a/llvm/include/llvm/CodeGen/DbgEntityHistoryCalculator.h b/llvm/include/llvm/CodeGen/DbgEntityHistoryCalculator.h index 5a1085ea3a37c..bca6065b16439 100644 --- a/llvm/include/llvm/CodeGen/DbgEntityHistoryCalculator.h +++ b/llvm/include/llvm/CodeGen/DbgEntityHistoryCalculator.h @@ -24,6 +24,24 @@ class MachineFunction; class MachineInstr; class TargetRegisterInfo; +/// Record instruction ordering so we can query their relative positions within +/// a function. Meta instructions are given the same ordinal as the preceding +/// non-meta instruction. Class state is invalid if MF is modified after +/// calling initialize. +class InstructionOrdering { +public: + void initialize(const MachineFunction &MF); + void clear() { InstNumberMap.clear(); } + + /// Check if instruction \p A comes before \p B, where \p A and \p B both + /// belong to the MachineFunction passed to initialize(). + bool isBefore(const MachineInstr *A, const MachineInstr *B) const; + +private: + /// Each instruction is assigned an order number. + DenseMap InstNumberMap; +}; + /// For each user variable, keep a list of instruction ranges where this /// variable is accessible. The variables are listed in order of appearance. 
class DbgValueHistoryMap { @@ -93,7 +111,8 @@ class DbgValueHistoryMap { } /// Drop location ranges which exist entirely outside each variable's scope. - void trimLocationRanges(const MachineFunction &MF, LexicalScopes &LScopes); + void trimLocationRanges(const MachineFunction &MF, LexicalScopes &LScopes, + const InstructionOrdering &Ordering); bool empty() const { return VarEntries.empty(); } void clear() { VarEntries.clear(); } EntriesMap::const_iterator begin() const { return VarEntries.begin(); } diff --git a/llvm/include/llvm/CodeGen/DebugHandlerBase.h b/llvm/include/llvm/CodeGen/DebugHandlerBase.h index 4ff0fdea36ae2..b488979f458cd 100644 --- a/llvm/include/llvm/CodeGen/DebugHandlerBase.h +++ b/llvm/include/llvm/CodeGen/DebugHandlerBase.h @@ -110,6 +110,9 @@ class DebugHandlerBase : public AsmPrinterHandler { virtual void endFunctionImpl(const MachineFunction *MF) = 0; virtual void skippedNonDebugFunction() {} +private: + InstructionOrdering InstOrdering; + // AsmPrinterHandler overrides. public: void beginInstruction(const MachineInstr *MI) override; @@ -129,8 +132,10 @@ class DebugHandlerBase : public AsmPrinterHandler { /// If this type is derived from a base type then return base type size. 
static uint64_t getBaseTypeSize(const DIType *Ty); + + const InstructionOrdering &getInstOrdering() const { return InstOrdering; } }; -} +} // namespace llvm #endif diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CSEInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/CSEInfo.h index a705e1c278175..f76dec57c8401 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CSEInfo.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CSEInfo.h @@ -182,6 +182,8 @@ class GISelInstProfileBuilder { const GISelInstProfileBuilder &addNodeIDRegNum(Register Reg) const; + const GISelInstProfileBuilder &addNodeIDReg(Register Reg) const; + const GISelInstProfileBuilder &addNodeIDImmediate(int64_t Imm) const; const GISelInstProfileBuilder & addNodeIDMBB(const MachineBasicBlock *MBB) const; diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 4d833ef217df1..77b55928d5867 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -33,6 +33,7 @@ class GISelKnownBits; class MachineDominatorTree; class LegalizerInfo; struct LegalityQuery; +class TargetLowering; struct PreferredTuple { LLT Ty; // The result type of the extend. @@ -52,6 +53,11 @@ struct PtrAddChain { Register Base; }; +struct RegisterImmPair { + Register Reg; + int64_t Imm; +}; + using OperandBuildSteps = SmallVector, 4>; struct InstructionBuildSteps { @@ -90,6 +96,8 @@ class CombinerHelper { return KB; } + const TargetLowering &getTargetLowering() const; + /// \return true if the combine is running prior to legalization, or if \p /// Query is legal on the target. bool isLegalOrBeforeLegalizer(const LegalityQuery &Query) const; @@ -222,6 +230,12 @@ class CombinerHelper { bool matchCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal); bool applyCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal); + // Transform a G_SHL with an extended source into a narrower shift if + // possible. 
+ bool matchCombineShlOfExtend(MachineInstr &MI, RegisterImmPair &MatchData); + bool applyCombineShlOfExtend(MachineInstr &MI, + const RegisterImmPair &MatchData); + /// Reduce a shift by a constant to an unmerge and a shift on a half sized /// type. This will not produce a shift smaller than \p TargetShiftSize. bool matchCombineShiftToUnmerge(MachineInstr &MI, unsigned TargetShiftSize, @@ -237,6 +251,13 @@ class CombinerHelper { bool matchCombineP2IToI2P(MachineInstr &MI, Register &Reg); bool applyCombineP2IToI2P(MachineInstr &MI, Register &Reg); + /// Transform G_ADD (G_PTRTOINT x), y -> G_PTRTOINT (G_PTR_ADD x, y) + /// Transform G_ADD y, (G_PTRTOINT x) -> G_PTRTOINT (G_PTR_ADD x, y) + bool matchCombineAddP2IToPtrAdd(MachineInstr &MI, + std::pair &PtrRegAndCommute); + bool applyCombineAddP2IToPtrAdd(MachineInstr &MI, + std::pair &PtrRegAndCommute); + /// Return true if any explicit use operand on \p MI is defined by a /// G_IMPLICIT_DEF. bool matchAnyExplicitUseIsUndef(MachineInstr &MI); @@ -251,6 +272,13 @@ class CombinerHelper { /// Return true if a G_STORE instruction \p MI is storing an undef value. bool matchUndefStore(MachineInstr &MI); + /// Return true if a G_SELECT instruction \p MI has an undef comparison. + bool matchUndefSelectCmp(MachineInstr &MI); + + /// Return true if a G_SELECT instruction \p MI has a constant comparison. If + /// true, \p OpIdx will store the operand index of the known selected value. + bool matchConstantSelectCmp(MachineInstr &MI, unsigned &OpIdx); + /// Replace an instruction with a G_FCONSTANT with value \p C. 
bool replaceInstWithFConstant(MachineInstr &MI, double C); diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GISelKnownBits.h b/llvm/include/llvm/CodeGen/GlobalISel/GISelKnownBits.h index f5662a27debe3..f2c19f559a36f 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/GISelKnownBits.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/GISelKnownBits.h @@ -34,6 +34,10 @@ class GISelKnownBits : public GISelChangeObserver { /// Cache maintained during a computeKnownBits request. SmallDenseMap ComputeKnownBitsCache; + void computeKnownBitsMin(Register Src0, Register Src1, KnownBits &Known, + const APInt &DemandedElts, + unsigned Depth = 0); + public: GISelKnownBits(MachineFunction &MF, unsigned MaxDepth = 6); virtual ~GISelKnownBits() = default; diff --git a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h index ce16eee45fd25..033d5b4b58348 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h @@ -217,7 +217,7 @@ class IRTranslator : public MachineFunctionPass { /// Translate an LLVM string intrinsic (memcpy, memset, ...). bool translateMemFunc(const CallInst &CI, MachineIRBuilder &MIRBuilder, - Intrinsic::ID ID); + unsigned Opcode); void getStackGuard(Register DstReg, MachineIRBuilder &MIRBuilder); diff --git a/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h b/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h index 756eab28e3ac8..17c1ec36c24fe 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h @@ -112,6 +112,14 @@ enum { /// - InsnID - Instruction ID /// - Expected opcode GIM_CheckOpcode, + + /// Check the opcode on the specified instruction, checking 2 acceptable + /// alternatives. 
+ /// - InsnID - Instruction ID + /// - Expected opcode + /// - Alternative expected opcode + GIM_CheckOpcodeIsEither, + /// Check the instruction has the right number of operands /// - InsnID - Instruction ID /// - Expected number of operands diff --git a/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h b/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h index 85caed0ecae30..1f1fb5aca8757 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h @@ -154,24 +154,31 @@ bool InstructionSelector::executeMatchTable( break; } - case GIM_CheckOpcode: { + case GIM_CheckOpcode: + case GIM_CheckOpcodeIsEither: { int64_t InsnID = MatchTable[CurrentIdx++]; - int64_t Expected = MatchTable[CurrentIdx++]; + int64_t Expected0 = MatchTable[CurrentIdx++]; + int64_t Expected1 = -1; + if (MatcherOpcode == GIM_CheckOpcodeIsEither) + Expected1 = MatchTable[CurrentIdx++]; assert(State.MIs[InsnID] != nullptr && "Used insn before defined"); unsigned Opcode = State.MIs[InsnID]->getOpcode(); DEBUG_WITH_TYPE(TgtInstructionSelector::getName(), - dbgs() << CurrentIdx << ": GIM_CheckOpcode(MIs[" << InsnID - << "], ExpectedOpcode=" << Expected - << ") // Got=" << Opcode << "\n"); - if (Opcode != Expected) { + dbgs() << CurrentIdx << ": GIM_CheckOpcode(MIs[" << InsnID + << "], ExpectedOpcode=" << Expected0; + if (MatcherOpcode == GIM_CheckOpcodeIsEither) + dbgs() << " || " << Expected1; + dbgs() << ") // Got=" << Opcode << "\n"; + ); + + if (Opcode != Expected0 && Opcode != Expected1) { if (handleReject() == RejectAndGiveUp) return false; } break; } - case GIM_SwitchOpcode: { int64_t InsnID = MatchTable[CurrentIdx++]; int64_t LowerBound = MatchTable[CurrentIdx++]; diff --git a/llvm/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h index 8725d96efd821..da785406bc31d 100644 --- 
a/llvm/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h @@ -104,36 +104,37 @@ class RegisterBankInfo { /// Currently the TableGen-like file would look like: /// \code /// PartialMapping[] = { - /// /*32-bit add*/ {0, 32, GPR}, // Scalar entry repeated for first vec elt. - /// /*2x32-bit add*/ {0, 32, GPR}, {32, 32, GPR}, - /// /*<2x32-bit> vadd {0, 64, VPR} + /// /*32-bit add*/ {0, 32, GPR}, // Scalar entry repeated for first + /// // vec elt. + /// /*2x32-bit add*/ {0, 32, GPR}, {32, 32, GPR}, + /// /*<2x32-bit> vadd*/ {0, 64, VPR} /// }; // PartialMapping duplicated. /// /// ValueMapping[] { - /// /*plain 32-bit add*/ {&PartialMapping[0], 1}, + /// /*plain 32-bit add*/ {&PartialMapping[0], 1}, /// /*expanded vadd on 2xadd*/ {&PartialMapping[1], 2}, - /// /*plain <2x32-bit> vadd*/ {&PartialMapping[3], 1} + /// /*plain <2x32-bit> vadd*/ {&PartialMapping[3], 1} /// }; /// \endcode /// /// With the array of pointer, we would have: /// \code /// PartialMapping[] = { - /// /*32-bit add lower */ {0, 32, GPR}, + /// /*32-bit add lower */ { 0, 32, GPR}, /// /*32-bit add upper */ {32, 32, GPR}, - /// /*<2x32-bit> vadd {0, 64, VPR} + /// /*<2x32-bit> vadd */ { 0, 64, VPR} /// }; // No more duplication. /// /// BreakDowns[] = { - /// /*AddBreakDown*/ &PartialMapping[0], + /// /*AddBreakDown*/ &PartialMapping[0], /// /*2xAddBreakDown*/ &PartialMapping[0], &PartialMapping[1], - /// /*VAddBreakDown*/ &PartialMapping[2] + /// /*VAddBreakDown*/ &PartialMapping[2] /// }; // Addresses of PartialMapping duplicated (smaller). 
/// /// ValueMapping[] { - /// /*plain 32-bit add*/ {&BreakDowns[0], 1}, + /// /*plain 32-bit add*/ {&BreakDowns[0], 1}, /// /*expanded vadd on 2xadd*/ {&BreakDowns[1], 2}, - /// /*plain <2x32-bit> vadd*/ {&BreakDowns[3], 1} + /// /*plain <2x32-bit> vadd*/ {&BreakDowns[3], 1} /// }; /// \endcode /// diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h index 35aab5018fa4a..756c6e5aa2c16 100644 --- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h @@ -448,6 +448,19 @@ class MachineRegisterInfo { return ++DI == def_end(); } + /// Returns the defining operand if there is exactly one operand defining the + /// specified register, otherwise nullptr. + MachineOperand *getOneDef(Register Reg) const { + def_iterator DI = def_begin(Reg); + if (DI == def_end()) // No defs. + return nullptr; + + def_iterator OneDef = DI; + if (++DI == def_end()) + return &*OneDef; + return nullptr; // Multiple defs. + } + /// use_iterator/use_begin/use_end - Walk all uses of the specified register. using use_iterator = defusechain_iterator; diff --git a/llvm/include/llvm/CodeGen/RegisterScavenging.h b/llvm/include/llvm/CodeGen/RegisterScavenging.h index 5b5a80a67e7f7..7bcd451509054 100644 --- a/llvm/include/llvm/CodeGen/RegisterScavenging.h +++ b/llvm/include/llvm/CodeGen/RegisterScavenging.h @@ -89,15 +89,6 @@ class RegScavenger { while (MBBI != I) forward(); } - /// Invert the behavior of forward() on the current instruction (undo the - /// changes to the available registers made by forward()). - void unprocess(); - - /// Unprocess instructions until you reach the provided iterator. - void unprocess(MachineBasicBlock::iterator I) { - while (MBBI != I) unprocess(); - } - /// Update internal register state and move MBB iterator backwards. /// Contrary to unprocess() this method gives precise results even in the /// absence of kill flags. 
diff --git a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h index 1eb9b9f322ba2..d2b95209d7b4b 100644 --- a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h +++ b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h @@ -268,6 +268,11 @@ namespace llvm { return SU->SchedClass; } + /// IsReachable - Checks if SU is reachable from TargetSU. + bool IsReachable(SUnit *SU, SUnit *TargetSU) { + return Topo.IsReachable(SU, TargetSU); + } + /// Returns an iterator to the top of the current scheduling region. MachineBasicBlock::iterator begin() const { return RegionBegin; } diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h index 7c2b49087edd2..cde075f41f739 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -1379,8 +1379,18 @@ class MemSDNode : public SDNode { } const SDValue &getChain() const { return getOperand(0); } + const SDValue &getBasePtr() const { - return getOperand(getOpcode() == ISD::STORE ? 2 : 1); + switch (getOpcode()) { + case ISD::STORE: + case ISD::MSTORE: + return getOperand(2); + case ISD::MGATHER: + case ISD::MSCATTER: + return getOperand(3); + default: + return getOperand(1); + } } // Methods to support isa and dyn_cast @@ -2292,9 +2302,6 @@ class MaskedLoadStoreSDNode : public MemSDNode { // MaskedLoadSDNode (Chain, ptr, offset, mask, passthru) // MaskedStoreSDNode (Chain, data, ptr, offset, mask) // Mask is a vector of i1 elements - const SDValue &getBasePtr() const { - return getOperand(getOpcode() == ISD::MLOAD ? 1 : 2); - } const SDValue &getOffset() const { return getOperand(getOpcode() == ISD::MLOAD ? 
2 : 3); } diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index e402773f5a6c8..ec6e038d59f69 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -375,6 +375,13 @@ class TargetLoweringBase { EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL, bool LegalTypes = true) const; + /// Return the preferred type to use for a shift opcode, given the shifted + /// amount type is \p ShiftValueTy. + LLVM_READONLY + virtual LLT getPreferredShiftAmountTy(LLT ShiftValueTy) const { + return ShiftValueTy; + } + /// Returns the type to be used for the index operand of: /// ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT, /// ISD::INSERT_SUBVECTOR, and ISD::EXTRACT_SUBVECTOR diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake index a65947bf24c43..90789df356ae1 100644 --- a/llvm/include/llvm/Config/config.h.cmake +++ b/llvm/include/llvm/Config/config.h.cmake @@ -208,8 +208,11 @@ /* Define to 1 if you have the header file. */ #cmakedefine HAVE_SYS_TYPES_H ${HAVE_SYS_TYPES_H} +/* Define to 1 if you have the header file. */ +#cmakedefine HAVE_SYSEXITS_H ${HAVE_SYSEXITS_H} + /* Define if the setupterm() function is supported this platform. */ -#cmakedefine LLVM_ENABLE_TERMINFO ${LLVM_ENABLE_TERMINFO} +#cmakedefine HAVE_TERMINFO ${HAVE_TERMINFO} /* Define if the xar_open() function is supported this platform. */ #cmakedefine HAVE_LIBXAR ${HAVE_LIBXAR} diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/EHFrameSupport.h b/llvm/include/llvm/ExecutionEngine/JITLink/EHFrameSupport.h index 72687682f606c..72394d7c99854 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/EHFrameSupport.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/EHFrameSupport.h @@ -42,32 +42,11 @@ class EHFrameRegistrar { /// Registers / Deregisters EH-frames in the current process. 
class InProcessEHFrameRegistrar final : public EHFrameRegistrar { public: - /// Get a reference to the InProcessEHFrameRegistrar singleton. - static InProcessEHFrameRegistrar &getInstance(); - - InProcessEHFrameRegistrar(const InProcessEHFrameRegistrar &) = delete; - InProcessEHFrameRegistrar & - operator=(const InProcessEHFrameRegistrar &) = delete; - - InProcessEHFrameRegistrar(InProcessEHFrameRegistrar &&) = delete; - InProcessEHFrameRegistrar &operator=(InProcessEHFrameRegistrar &&) = delete; - Error registerEHFrames(JITTargetAddress EHFrameSectionAddr, - size_t EHFrameSectionSize) override { - return registerEHFrameSection( - jitTargetAddressToPointer(EHFrameSectionAddr), - EHFrameSectionSize); - } + size_t EHFrameSectionSize) override; Error deregisterEHFrames(JITTargetAddress EHFrameSectionAddr, - size_t EHFrameSectionSize) override { - return deregisterEHFrameSection( - jitTargetAddressToPointer(EHFrameSectionAddr), - EHFrameSectionSize); - } - -private: - InProcessEHFrameRegistrar(); + size_t EHFrameSectionSize) override; }; using StoreFrameRangeFunction = diff --git a/llvm/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h b/llvm/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h index fbf9bde8a9d55..cb8ee130ab614 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h @@ -177,7 +177,8 @@ class ObjectLinkingLayer : public ObjectLayer { class EHFrameRegistrationPlugin : public ObjectLinkingLayer::Plugin { public: - EHFrameRegistrationPlugin(jitlink::EHFrameRegistrar &Registrar); + EHFrameRegistrationPlugin( + std::unique_ptr Registrar); Error notifyEmitted(MaterializationResponsibility &MR) override; void modifyPassConfig(MaterializationResponsibility &MR, const Triple &TT, jitlink::PassConfiguration &PassConfig) override; @@ -192,7 +193,7 @@ class EHFrameRegistrationPlugin : public ObjectLinkingLayer::Plugin { }; std::mutex EHFramePluginMutex; - 
jitlink::EHFrameRegistrar &Registrar; + std::unique_ptr Registrar; DenseMap InProcessLinks; DenseMap TrackedEHFrameRanges; std::vector UntrackedEHFrameRanges; diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TPCIndirectionUtils.h b/llvm/include/llvm/ExecutionEngine/Orc/TPCIndirectionUtils.h index db9cd1b98cf9c..90097f1131f31 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/TPCIndirectionUtils.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/TPCIndirectionUtils.h @@ -206,4 +206,4 @@ TPCIndirectionUtils::CreateWithABI(TargetProcessControl &TPC) { } // end namespace orc } // end namespace llvm -#endif // LLVM_EXECUTIONENGINE_ORC_T_H +#endif // LLVM_EXECUTIONENGINE_ORC_TPCINDIRECTIONUTILS_H diff --git a/llvm/include/llvm/Frontend/OpenACC/ACC.td b/llvm/include/llvm/Frontend/OpenACC/ACC.td index d6ed93374013a..b15f8348c8f42 100644 --- a/llvm/include/llvm/Frontend/OpenACC/ACC.td +++ b/llvm/include/llvm/Frontend/OpenACC/ACC.td @@ -192,7 +192,7 @@ def ACCC_Private : Clause<"private"> { // 2.9.7 def ACCC_Tile : Clause <"tile"> { - let flangClassValue = "AccSizeExprList"; + let flangClassValue = "AccTileExprList"; } // 2.8.1 diff --git a/llvm/include/llvm/IR/Constant.h b/llvm/include/llvm/IR/Constant.h index 9a1d2b80c48e4..41b3414f171ba 100644 --- a/llvm/include/llvm/IR/Constant.h +++ b/llvm/include/llvm/IR/Constant.h @@ -78,7 +78,8 @@ class Constant : public User { bool isMinSignedValue() const; /// Return true if this is a finite and non-zero floating-point scalar - /// constant or a vector constant with all finite and non-zero elements. + /// constant or a fixed width vector constant with all finite and non-zero + /// elements. bool isFiniteNonZeroFP() const; /// Return true if this is a normal (as opposed to denormal) floating-point @@ -100,11 +101,13 @@ class Constant : public User { bool isElementWiseEqual(Value *Y) const; /// Return true if this is a vector constant that includes any undefined - /// elements. + /// elements. 
Since it is impossible to inspect a scalable vector element- + /// wise at compile time, this function returns true only if the entire + /// vector is undef bool containsUndefElement() const; - /// Return true if this is a vector constant that includes any constant - /// expressions. + /// Return true if this is a fixed width vector constant that includes + /// any constant expressions. bool containsConstantExpression() const; /// Return true if evaluation of this constant could trap. This is true for diff --git a/llvm/include/llvm/IR/DiagnosticInfo.h b/llvm/include/llvm/IR/DiagnosticInfo.h index b7e0ecde8629e..33736321b42b0 100644 --- a/llvm/include/llvm/IR/DiagnosticInfo.h +++ b/llvm/include/llvm/IR/DiagnosticInfo.h @@ -21,6 +21,7 @@ #include "llvm/ADT/Twine.h" #include "llvm/IR/DebugLoc.h" #include "llvm/Support/CBindingWrapping.h" +#include "llvm/Support/TypeSize.h" #include "llvm/Support/YAMLTraits.h" #include #include @@ -434,6 +435,7 @@ class DiagnosticInfoOptimizationBase : public DiagnosticInfoWithLocationBase { Argument(StringRef Key, unsigned N); Argument(StringRef Key, unsigned long N); Argument(StringRef Key, unsigned long long N); + Argument(StringRef Key, ElementCount EC); Argument(StringRef Key, bool B) : Key(Key), Val(B ? 
"true" : "false") {} Argument(StringRef Key, DebugLoc dl); }; diff --git a/llvm/include/llvm/IR/GetElementPtrTypeIterator.h b/llvm/include/llvm/IR/GetElementPtrTypeIterator.h index 79ea5791b2fdb..6293305a2639f 100644 --- a/llvm/include/llvm/IR/GetElementPtrTypeIterator.h +++ b/llvm/include/llvm/IR/GetElementPtrTypeIterator.h @@ -83,7 +83,7 @@ namespace llvm { if (isa(VTy)) NumElements = Unbounded; else - NumElements = VTy->getNumElements(); + NumElements = cast(VTy)->getNumElements(); } else CurTy = dyn_cast(Ty); ++OpIt; diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h index 5beda48214d61..ac7ce75a9f310 100644 --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -2057,7 +2057,7 @@ class ShuffleVectorInst : public Instruction { /// Example: shufflevector <2 x n> A, <2 x n> B, <1,2,3> bool increasesLength() const { unsigned NumSourceElts = - cast(Op<0>()->getType())->getNumElements(); + cast(Op<0>()->getType())->getNumElements(); unsigned NumMaskElts = ShuffleMask.size(); return NumSourceElts < NumMaskElts; } @@ -2250,7 +2250,8 @@ class ShuffleVectorInst : public Instruction { /// Return true if this shuffle mask is an extract subvector mask. bool isExtractSubvectorMask(int &Index) const { - int NumSrcElts = cast(Op<0>()->getType())->getNumElements(); + int NumSrcElts = + cast(Op<0>()->getType())->getNumElements(); return isExtractSubvectorMask(ShuffleMask, NumSrcElts, Index); } diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 3162d4bea5298..d42d576dc2030 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -17,7 +17,9 @@ include "llvm/CodeGen/SDNodeProperties.td" // Properties we keep track of for intrinsics. //===----------------------------------------------------------------------===// -class IntrinsicProperty; +class IntrinsicProperty { + bit IsDefault = is_default; +} // Intr*Mem - Memory properties. 
If no property is set, the worst case // is assumed (it may read and write any memory it can get access to and it may @@ -81,6 +83,11 @@ class NoAlias : IntrinsicProperty { int ArgNo = idx.Value; } +// NoUndef - The specified argument is neither undef nor poison. +class NoUndef : IntrinsicProperty { + int ArgNo = idx.Value; +} + class Align : IntrinsicProperty { int ArgNo = idx.Value; int Align = align; @@ -331,7 +338,8 @@ class Intrinsic ret_types, list param_types = [], list intr_properties = [], string name = "", - list sd_properties = []> : SDPatternOperator { + list sd_properties = [], + bit disable_default_attributes = 0> : SDPatternOperator { string LLVMName = name; string TargetPrefix = ""; // Set to a prefix for target-specific intrinsics. list RetTypes = ret_types; @@ -339,6 +347,10 @@ class Intrinsic ret_types, list IntrProperties = intr_properties; let Properties = sd_properties; + // Disable applying IntrinsicProperties that are marked default with + // IntrinsicProperty<1> + bit DisableDefaultAttributes = disable_default_attributes; + bit isTarget = 0; } @@ -508,7 +520,8 @@ def int_readcyclecounter : Intrinsic<[llvm_i64_ty]>; // The assume intrinsic is marked as arbitrarily writing so that proper // control dependencies will be maintained. -def int_assume : Intrinsic<[], [llvm_i1_ty], [IntrWillReturn]>; +def int_assume : Intrinsic<[], [llvm_i1_ty], [IntrWillReturn, + NoUndef>]>; // Stack Protector Intrinsic - The stackprotector intrinsic writes the stack // guard to the correct place on the stack frame. diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 3f71f644f9a1d..c1b780be17c63 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -184,6 +184,10 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". 
[LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>], [IntrNoMem]>; + class AdvSIMD_BF16FML_Intrinsic + : Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty], + [IntrNoMem]>; } // Arithmetic ops @@ -466,9 +470,12 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in { def int_aarch64_neon_usmmla : AdvSIMD_MatMul_Intrinsic; def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic; def int_aarch64_neon_bfdot : AdvSIMD_Dot_Intrinsic; - def int_aarch64_neon_bfmmla : AdvSIMD_MatMul_Intrinsic; - def int_aarch64_neon_bfmlalb : AdvSIMD_FML_Intrinsic; - def int_aarch64_neon_bfmlalt : AdvSIMD_FML_Intrinsic; + def int_aarch64_neon_bfmmla + : Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty], + [IntrNoMem]>; + def int_aarch64_neon_bfmlalb : AdvSIMD_BF16FML_Intrinsic; + def int_aarch64_neon_bfmlalt : AdvSIMD_BF16FML_Intrinsic; // v8.6-A Bfloat Intrinsics diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td index df74e446b965a..052dd7c813bcb 100644 --- a/llvm/include/llvm/IR/IntrinsicsARM.td +++ b/llvm/include/llvm/IR/IntrinsicsARM.td @@ -791,14 +791,17 @@ def int_arm_neon_vcvtbfp2bf : Intrinsic<[llvm_bfloat_ty], [llvm_float_ty], [IntrNoMem]>; def int_arm_neon_bfdot : Neon_Dot_Intrinsic; -def int_arm_neon_bfmmla : Neon_MatMul_Intrinsic; - -class Neon_FML_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>], - [IntrNoMem]>; -def int_arm_neon_bfmlalb : Neon_FML_Intrinsic; -def int_arm_neon_bfmlalt : Neon_FML_Intrinsic; +def int_arm_neon_bfmmla + : Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty], + [IntrNoMem]>; + +class Neon_BF16FML_Intrinsic + : Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty], + [IntrNoMem]>; +def int_arm_neon_bfmlalb : Neon_BF16FML_Intrinsic; +def int_arm_neon_bfmlalt : Neon_BF16FML_Intrinsic; def int_arm_cls: Intrinsic<[llvm_i32_ty], [llvm_i32_ty], 
[IntrNoMem]>; def int_arm_cls64: Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>; diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td index b2e542994de93..2ff045865bb7b 100644 --- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td +++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td @@ -950,6 +950,18 @@ def int_ppc_altivec_vrldmi : [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; +// Vector Divide Extended Intrinsics. +def int_ppc_altivec_vdivesw : PowerPC_Vec_WWW_Intrinsic<"vdivesw">; +def int_ppc_altivec_vdiveuw : PowerPC_Vec_WWW_Intrinsic<"vdiveuw">; +def int_ppc_altivec_vdivesd : PowerPC_Vec_DDD_Intrinsic<"vdivesd">; +def int_ppc_altivec_vdiveud : PowerPC_Vec_DDD_Intrinsic<"vdiveud">; + +// Vector Multiply High Intrinsics. +def int_ppc_altivec_vmulhsw : PowerPC_Vec_WWW_Intrinsic<"vmulhsw">; +def int_ppc_altivec_vmulhuw : PowerPC_Vec_WWW_Intrinsic<"vmulhuw">; +def int_ppc_altivec_vmulhsd : PowerPC_Vec_DDD_Intrinsic<"vmulhsd">; +def int_ppc_altivec_vmulhud : PowerPC_Vec_DDD_Intrinsic<"vmulhud">; + //===----------------------------------------------------------------------===// // PowerPC VSX Intrinsic Definitions. 
diff --git a/llvm/include/llvm/IR/MatrixBuilder.h b/llvm/include/llvm/IR/MatrixBuilder.h index 5d04b3563dd5d..084b1d49569e6 100644 --- a/llvm/include/llvm/IR/MatrixBuilder.h +++ b/llvm/include/llvm/IR/MatrixBuilder.h @@ -38,14 +38,19 @@ template class MatrixBuilder { Value *RHS) { assert((LHS->getType()->isVectorTy() || RHS->getType()->isVectorTy()) && "One of the operands must be a matrix (embedded in a vector)"); - if (LHS->getType()->isVectorTy() && !RHS->getType()->isVectorTy()) + if (LHS->getType()->isVectorTy() && !RHS->getType()->isVectorTy()) { + assert(!isa(LHS->getType()) && + "LHS Assumed to be fixed width"); RHS = B.CreateVectorSplat( - cast(LHS->getType())->getNumElements(), RHS, + cast(LHS->getType())->getElementCount(), RHS, "scalar.splat"); - else if (!LHS->getType()->isVectorTy() && RHS->getType()->isVectorTy()) + } else if (!LHS->getType()->isVectorTy() && RHS->getType()->isVectorTy()) { + assert(!isa(RHS->getType()) && + "RHS Assumed to be fixed width"); LHS = B.CreateVectorSplat( - cast(RHS->getType())->getNumElements(), LHS, + cast(RHS->getType())->getElementCount(), LHS, "scalar.splat"); + } return {LHS, RHS}; } @@ -155,14 +160,19 @@ template class MatrixBuilder { /// matrixes. 
Value *CreateAdd(Value *LHS, Value *RHS) { assert(LHS->getType()->isVectorTy() || RHS->getType()->isVectorTy()); - if (LHS->getType()->isVectorTy() && !RHS->getType()->isVectorTy()) + if (LHS->getType()->isVectorTy() && !RHS->getType()->isVectorTy()) { + assert(!isa(LHS->getType()) && + "LHS Assumed to be fixed width"); RHS = B.CreateVectorSplat( - cast(LHS->getType())->getNumElements(), RHS, + cast(LHS->getType())->getElementCount(), RHS, "scalar.splat"); - else if (!LHS->getType()->isVectorTy() && RHS->getType()->isVectorTy()) + } else if (!LHS->getType()->isVectorTy() && RHS->getType()->isVectorTy()) { + assert(!isa(RHS->getType()) && + "RHS Assumed to be fixed width"); LHS = B.CreateVectorSplat( - cast(RHS->getType())->getNumElements(), LHS, + cast(RHS->getType())->getElementCount(), LHS, "scalar.splat"); + } return cast(LHS->getType()) ->getElementType() @@ -175,14 +185,19 @@ template class MatrixBuilder { /// point matrixes. Value *CreateSub(Value *LHS, Value *RHS) { assert(LHS->getType()->isVectorTy() || RHS->getType()->isVectorTy()); - if (LHS->getType()->isVectorTy() && !RHS->getType()->isVectorTy()) + if (LHS->getType()->isVectorTy() && !RHS->getType()->isVectorTy()) { + assert(!isa(LHS->getType()) && + "LHS Assumed to be fixed width"); RHS = B.CreateVectorSplat( - cast(LHS->getType())->getNumElements(), RHS, + cast(LHS->getType())->getElementCount(), RHS, "scalar.splat"); - else if (!LHS->getType()->isVectorTy() && RHS->getType()->isVectorTy()) + } else if (!LHS->getType()->isVectorTy() && RHS->getType()->isVectorTy()) { + assert(!isa(RHS->getType()) && + "RHS Assumed to be fixed width"); LHS = B.CreateVectorSplat( - cast(RHS->getType())->getNumElements(), LHS, + cast(RHS->getType())->getElementCount(), LHS, "scalar.splat"); + } return cast(LHS->getType()) ->getElementType() diff --git a/llvm/include/llvm/IR/StructuralHash.h b/llvm/include/llvm/IR/StructuralHash.h new file mode 100644 index 0000000000000..eb63a21403100 --- /dev/null +++ 
b/llvm/include/llvm/IR/StructuralHash.h @@ -0,0 +1,34 @@ +//===- llvm/IR/StructuralHash.h - IR Hash for expensive checks --*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides hashing of the LLVM IR structure to be used to check +// Passes modification status. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_IR_STRUCTURALHASH_H +#define LLVM_IR_STRUCTURALHASH_H + +#ifdef EXPENSIVE_CHECKS + +#include + +// This header is only meant to be used when -DEXPENSIVE_CHECKS is set +namespace llvm { + +class Function; +class Module; + +uint64_t StructuralHash(const Function &F); +uint64_t StructuralHash(const Module &M); + +} // end namespace llvm + +#endif + +#endif // LLVM_IR_STRUCTURALHASH_H diff --git a/llvm/include/llvm/IR/Value.h b/llvm/include/llvm/IR/Value.h index eec96373a1a9f..1ed689dc82a5e 100644 --- a/llvm/include/llvm/IR/Value.h +++ b/llvm/include/llvm/IR/Value.h @@ -424,7 +424,7 @@ class Value { return materialized_users(); } - /// Return true if there is exactly one user of this value. + /// Return true if there is exactly one use of this value. /// /// This is specialized because it is a common request and does not require /// traversing the whole use list. @@ -434,15 +434,25 @@ class Value { return ++I == E; } - /// Return true if this Value has exactly N users. + /// Return true if this Value has exactly N uses. bool hasNUses(unsigned N) const; - /// Return true if this value has N users or more. + /// Return true if this value has N uses or more. /// /// This is logically equivalent to getNumUses() >= N. 
bool hasNUsesOrMore(unsigned N) const; - /// Return true if there is exactly one user of this value that cannot be + /// Return true if there is exactly one user of this value. + /// + /// Note that this is not the same as "has one use". If a value has one use, + /// then there certainly is a single user. But if value has several uses, + /// it is possible that all uses are in a single user, or not. + /// + /// This check is potentially costly, since it requires traversing, + /// in the worst case, the whole use list of a value. + bool hasOneUser() const; + + /// Return true if there is exactly one use of this value that cannot be /// dropped. /// /// This is specialized because it is a common request and does not require @@ -455,7 +465,7 @@ class Value { /// traversing the whole use list. bool hasNUndroppableUses(unsigned N) const; - /// Return true if this value has N users or more. + /// Return true if this value has N uses or more. /// /// This is logically equivalent to getNumUses() >= N. 
bool hasNUndroppableUsesOrMore(unsigned N) const; diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index d3beb79ff1cbe..b8a78c9733c2f 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -113,7 +113,6 @@ void initializeCalledValuePropagationLegacyPassPass(PassRegistry &); void initializeCodeGenPreparePass(PassRegistry&); void initializeConstantHoistingLegacyPassPass(PassRegistry&); void initializeConstantMergeLegacyPassPass(PassRegistry&); -void initializeConstantPropagationPass(PassRegistry&); void initializeControlHeightReductionLegacyPassPass(PassRegistry&); void initializeCorrelatedValuePropagationPass(PassRegistry&); void initializeCostModelAnalysisPass(PassRegistry&); @@ -178,6 +177,7 @@ void initializeGlobalSplitPass(PassRegistry&); void initializeGlobalsAAWrapperPassPass(PassRegistry&); void initializeGuardWideningLegacyPassPass(PassRegistry&); void initializeHardwareLoopsPass(PassRegistry&); +void initializeHeapProfilerLegacyPassPass(PassRegistry &); void initializeHotColdSplittingLegacyPassPass(PassRegistry&); void initializeHWAddressSanitizerLegacyPassPass(PassRegistry &); void initializeIPSCCPLegacyPassPass(PassRegistry&); @@ -305,6 +305,7 @@ void initializeMergeICmpsLegacyPassPass(PassRegistry &); void initializeMergedLoadStoreMotionLegacyPassPass(PassRegistry&); void initializeMetaRenamerPass(PassRegistry&); void initializeModuleDebugInfoPrinterPass(PassRegistry&); +void initializeModuleHeapProfilerLegacyPassPass(PassRegistry &); void initializeModuleSummaryIndexWrapperPassPass(PassRegistry&); void initializeModuloScheduleTestPass(PassRegistry&); void initializeMustExecutePrinterPass(PassRegistry&); diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h index f7e7602f5ae1d..71af80da5b804 100644 --- a/llvm/include/llvm/LinkAllPasses.h +++ b/llvm/include/llvm/LinkAllPasses.h @@ -91,7 +91,6 @@ namespace { (void) 
llvm::createLibCallsShrinkWrapPass(); (void) llvm::createCalledValuePropagationPass(); (void) llvm::createConstantMergePass(); - (void) llvm::createConstantPropagationPass(); (void) llvm::createControlHeightReductionLegacyPass(); (void) llvm::createCostModelAnalysisPass(); (void) llvm::createDeadArgEliminationPass(); diff --git a/llvm/include/llvm/MC/MCSectionXCOFF.h b/llvm/include/llvm/MC/MCSectionXCOFF.h index babb2dcb24d16..4ad9aa76a2108 100644 --- a/llvm/include/llvm/MC/MCSectionXCOFF.h +++ b/llvm/include/llvm/MC/MCSectionXCOFF.h @@ -66,11 +66,6 @@ class MCSectionXCOFF final : public MCSection { XCOFF::StorageClass getStorageClass() const { return QualName->getStorageClass(); } - - XCOFF::VisibilityType getVisibilityType() const { - return QualName->getVisibilityType(); - } - XCOFF::SymbolType getCSectType() const { return Type; } MCSymbolXCOFF *getQualNameSymbol() const { return QualName; } diff --git a/llvm/include/llvm/MC/MCWinEH.h b/llvm/include/llvm/MC/MCWinEH.h index b1c28c0ecae79..baeafa3468e24 100644 --- a/llvm/include/llvm/MC/MCWinEH.h +++ b/llvm/include/llvm/MC/MCWinEH.h @@ -40,6 +40,7 @@ struct FrameInfo { bool HandlesUnwind = false; bool HandlesExceptions = false; + bool EmitAttempted = false; int LastFrameInst = -1; const FrameInfo *ChainedParent = nullptr; @@ -53,6 +54,15 @@ struct FrameInfo { const FrameInfo *ChainedParent) : Begin(BeginFuncEHLabel), Function(Function), ChainedParent(ChainedParent) {} + + bool empty() const { + if (!Instructions.empty()) + return false; + for (const auto &E : EpilogMap) + if (!E.second.empty()) + return false; + return true; + } }; class UnwindEmitter { diff --git a/llvm/include/llvm/Object/Binary.h b/llvm/include/llvm/Object/Binary.h index e95516f30a403..de0f48e7f4f53 100644 --- a/llvm/include/llvm/Object/Binary.h +++ b/llvm/include/llvm/Object/Binary.h @@ -228,7 +228,8 @@ template const T* OwningBinary::getBinary() const { return Bin.get(); } -Expected> createBinary(StringRef Path); +Expected> 
createBinary(StringRef Path, + LLVMContext *Context = nullptr); } // end namespace object diff --git a/llvm/include/llvm/Object/MachOUniversal.h b/llvm/include/llvm/Object/MachOUniversal.h index 5e006fd873180..9bcacb5101087 100644 --- a/llvm/include/llvm/Object/MachOUniversal.h +++ b/llvm/include/llvm/Object/MachOUniversal.h @@ -22,8 +22,11 @@ namespace llvm { class StringRef; +class Module; +class LLVMContext; namespace object { +class IRObjectFile; class MachOUniversalBinary : public Binary { virtual void anchor(); @@ -101,6 +104,8 @@ class MachOUniversalBinary : public Binary { } Expected> getAsObjectFile() const; + Expected> + getAsIRObject(LLVMContext &Ctx) const; Expected> getAsArchive() const; }; @@ -154,6 +159,9 @@ class MachOUniversalBinary : public Binary { Expected> getMachOObjectForArch(StringRef ArchName) const; + Expected> + getIRObjectForArch(StringRef ArchName, LLVMContext &Ctx) const; + Expected> getArchiveForArch(StringRef ArchName) const; }; diff --git a/llvm/include/llvm/Object/MachOUniversalWriter.h b/llvm/include/llvm/Object/MachOUniversalWriter.h index c860495ddd6f0..643b70e8eaf1d 100644 --- a/llvm/include/llvm/Object/MachOUniversalWriter.h +++ b/llvm/include/llvm/Object/MachOUniversalWriter.h @@ -19,7 +19,10 @@ #include "llvm/Object/MachO.h" namespace llvm { +class LLVMContext; + namespace object { +class IRObjectFile; class Slice { const Binary *B; @@ -32,12 +35,18 @@ class Slice { // file size can be calculated before creating the output buffer. 
uint32_t P2Alignment; + Slice(const IRObjectFile *IRO, uint32_t CPUType, uint32_t CPUSubType, + std::string ArchName, uint32_t Align); + public: explicit Slice(const MachOObjectFile &O); Slice(const MachOObjectFile &O, uint32_t Align); - static Expected create(const Archive *A); + static Expected create(const Archive *A, + LLVMContext *LLVMCtx = nullptr); + + static Expected create(const IRObjectFile *IRO, uint32_t Align); void setP2Alignment(uint32_t Align) { P2Alignment = Align; } diff --git a/llvm/include/llvm/Object/XCOFFObjectFile.h b/llvm/include/llvm/Object/XCOFFObjectFile.h index 53caeae0bdcc0..fbc542fd3e6de 100644 --- a/llvm/include/llvm/Object/XCOFFObjectFile.h +++ b/llvm/include/llvm/Object/XCOFFObjectFile.h @@ -372,6 +372,8 @@ class XCOFFObjectFile : public ObjectFile { Expected> relocations(const XCOFFSectionHeader32 &) const; + + static bool classof(const Binary *B) { return B->isXCOFF(); } }; // XCOFFObjectFile class XCOFFSymbolRef { diff --git a/llvm/include/llvm/ObjectYAML/DWARFYAML.h b/llvm/include/llvm/ObjectYAML/DWARFYAML.h index 005936d1a8abf..9625d02ea3241 100644 --- a/llvm/include/llvm/ObjectYAML/DWARFYAML.h +++ b/llvm/include/llvm/ObjectYAML/DWARFYAML.h @@ -137,9 +137,9 @@ struct LineTableOpcode { struct LineTable { dwarf::DwarfFormat Format; - uint64_t Length; + Optional Length; uint16_t Version; - uint64_t PrologueLength; + Optional PrologueLength; uint8_t MinInstLength; uint8_t MaxOpsPerInst; uint8_t DefaultIsStmt; diff --git a/llvm/include/llvm/ProfileData/SampleProf.h b/llvm/include/llvm/ProfileData/SampleProf.h index fa5326038ada0..aca941b2da15a 100644 --- a/llvm/include/llvm/ProfileData/SampleProf.h +++ b/llvm/include/llvm/ProfileData/SampleProf.h @@ -342,6 +342,7 @@ class SampleRecord { raw_ostream &operator<<(raw_ostream &OS, const SampleRecord &Sample); class FunctionSamples; +class SampleProfileReaderItaniumRemapper; using BodySampleMap = std::map; // NOTE: Using a StringMap here makes parsed profiles consume around 17% more @@ 
-428,35 +429,15 @@ class FunctionSamples { return &iter->second; } - /// Returns a pointer to FunctionSamples at the given callsite location \p Loc - /// with callee \p CalleeName. If no callsite can be found, relax the - /// restriction to return the FunctionSamples at callsite location \p Loc - /// with the maximum total sample count. - const FunctionSamples *findFunctionSamplesAt(const LineLocation &Loc, - StringRef CalleeName) const { - std::string CalleeGUID; - CalleeName = getRepInFormat(CalleeName, UseMD5, CalleeGUID); - - auto iter = CallsiteSamples.find(Loc); - if (iter == CallsiteSamples.end()) - return nullptr; - auto FS = iter->second.find(CalleeName); - if (FS != iter->second.end()) - return &FS->second; - // If we cannot find exact match of the callee name, return the FS with - // the max total count. Only do this when CalleeName is not provided, - // i.e., only for indirect calls. - if (!CalleeName.empty()) - return nullptr; - uint64_t MaxTotalSamples = 0; - const FunctionSamples *R = nullptr; - for (const auto &NameFS : iter->second) - if (NameFS.second.getTotalSamples() >= MaxTotalSamples) { - MaxTotalSamples = NameFS.second.getTotalSamples(); - R = &NameFS.second; - } - return R; - } + /// Returns a pointer to FunctionSamples at the given callsite location + /// \p Loc with callee \p CalleeName. If no callsite can be found, relax + /// the restriction to return the FunctionSamples at callsite location + /// \p Loc with the maximum total sample count. If \p Remapper is not + /// nullptr, use \p Remapper to find FunctionSamples with equivalent name + /// as \p CalleeName. + const FunctionSamples * + findFunctionSamplesAt(const LineLocation &Loc, StringRef CalleeName, + SampleProfileReaderItaniumRemapper *Remapper) const; bool empty() const { return TotalSamples == 0; } @@ -630,7 +611,11 @@ class FunctionSamples { /// tree nodes in the profile. /// /// \returns the FunctionSamples pointer to the inlined instance. 
- const FunctionSamples *findFunctionSamples(const DILocation *DIL) const; + /// If \p Remapper is not nullptr, it will be used to find matching + /// FunctionSamples with not exactly the same but equivalent name. + const FunctionSamples *findFunctionSamples( + const DILocation *DIL, + SampleProfileReaderItaniumRemapper *Remapper = nullptr) const; static SampleProfileFormat Format; @@ -648,6 +633,10 @@ class FunctionSamples { return UseMD5 ? std::stoull(Name.data()) : Function::getGUID(Name); } + // Find all the names in the current FunctionSamples including names in + // all the inline instances and names of call targets. + void findAllNames(DenseSet &NameSet) const; + private: /// Mangled name of the function. StringRef Name; diff --git a/llvm/include/llvm/ProfileData/SampleProfReader.h b/llvm/include/llvm/ProfileData/SampleProfReader.h index 0e8ee7696c543..385ac820f5b5b 100644 --- a/llvm/include/llvm/ProfileData/SampleProfReader.h +++ b/llvm/include/llvm/ProfileData/SampleProfReader.h @@ -208,6 +208,7 @@ #ifndef LLVM_PROFILEDATA_SAMPLEPROFREADER_H #define LLVM_PROFILEDATA_SAMPLEPROFREADER_H +#include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" @@ -275,15 +276,18 @@ class SampleProfileReaderItaniumRemapper { return Remappings->lookup(FunctionName); } - /// Return the samples collected for function \p F if remapper knows - /// it is present in SampleMap. - FunctionSamples *getSamplesFor(StringRef FunctionName); + /// Return the equivalent name in the profile for \p FunctionName if + /// it exists. + Optional lookUpNameInProfile(StringRef FunctionName); private: // The buffer holding the content read from remapping file. std::unique_ptr Buffer; std::unique_ptr Remappings; - DenseMap SampleMap; + // Map remapping key to the name in the profile. By looking up the + // key in the remapper, a given new name can be mapped to the + // cannonical name using the NameMap. 
+ DenseMap NameMap; // The Reader the remapper is servicing. SampleProfileReader &Reader; // Indicate whether remapping has been applied to the profile read @@ -370,15 +374,19 @@ class SampleProfileReader { /// Return the samples collected for function \p F. virtual FunctionSamples *getSamplesFor(StringRef Fname) { - if (Remapper) { - if (auto FS = Remapper->getSamplesFor(Fname)) - return FS; - } std::string FGUID; Fname = getRepInFormat(Fname, useMD5(), FGUID); auto It = Profiles.find(Fname); if (It != Profiles.end()) return &It->second; + + if (Remapper) { + if (auto NameInProfile = Remapper->lookUpNameInProfile(Fname)) { + auto It = Profiles.find(*NameInProfile); + if (It != Profiles.end()) + return &It->second; + } + } return nullptr; } @@ -423,6 +431,8 @@ class SampleProfileReader { /// Return whether names in the profile are all MD5 numbers. virtual bool useMD5() { return false; } + SampleProfileReaderItaniumRemapper *getRemapper() { return Remapper.get(); } + protected: /// Map every function to its associated profile. /// diff --git a/llvm/include/llvm/Support/ExitCodes.h b/llvm/include/llvm/Support/ExitCodes.h new file mode 100644 index 0000000000000..2715c5ca4128b --- /dev/null +++ b/llvm/include/llvm/Support/ExitCodes.h @@ -0,0 +1,33 @@ +//===-- llvm/Support/ExitCodes.h - Exit codes for exit() -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains definitions of exit codes for exit() function. They are +/// either defined by sysexits.h if it is supported, or defined here if +/// sysexits.h is not supported. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_EXITCODES_H +#define LLVM_SUPPORT_EXITCODES_H + +#include "llvm/Config/config.h" + +#if HAVE_SYSEXITS_H +#include +#elif __MVS__ +// does not exist on z/OS. The only value used in LLVM is +// EX_IOERR, which is used to signal a special error condition (broken pipe). +// Define the macro with its usual value from BSD systems, which is chosen to +// not clash with more standard exit codes like 1. +#define EX_IOERR 74 +#elif LLVM_ON_UNIX +#error Exit code EX_IOERR not available +#endif + +#endif diff --git a/llvm/include/llvm/Support/KnownBits.h b/llvm/include/llvm/Support/KnownBits.h index 69040cd23f039..4f33365c41d35 100644 --- a/llvm/include/llvm/Support/KnownBits.h +++ b/llvm/include/llvm/Support/KnownBits.h @@ -241,6 +241,12 @@ struct KnownBits { static KnownBits computeForAddSub(bool Add, bool NSW, const KnownBits &LHS, KnownBits RHS); + /// Insert the bits from a smaller known bits starting at bitPosition. + void insertBits(const KnownBits &SubBits, unsigned BitPosition) { + Zero.insertBits(SubBits.Zero, BitPosition); + One.insertBits(SubBits.One, BitPosition); + } + /// Update known bits based on ANDing with RHS. KnownBits &operator&=(const KnownBits &RHS); diff --git a/llvm/include/llvm/Support/Signals.h b/llvm/include/llvm/Support/Signals.h index e0a18e72f2a70..c5b94f5ac7768 100644 --- a/llvm/include/llvm/Support/Signals.h +++ b/llvm/include/llvm/Support/Signals.h @@ -50,7 +50,9 @@ namespace sys { void DisableSystemDialogsOnCrash(); /// Print the stack trace using the given \c raw_ostream object. - void PrintStackTrace(raw_ostream &OS); + /// \param Depth refers to the number of stackframes to print. If not + /// specified, the entire frame is printed. + void PrintStackTrace(raw_ostream &OS, int Depth = 0); // Run all registered signal handlers. 
void RunSignalHandlers(); diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def index ce1f92aca9bbf..db36fc42aa2a2 100644 --- a/llvm/include/llvm/Support/TargetOpcodes.def +++ b/llvm/include/llvm/Support/TargetOpcodes.def @@ -697,10 +697,19 @@ HANDLE_TARGET_OPCODE(G_READ_REGISTER) /// write_register intrinsic HANDLE_TARGET_OPCODE(G_WRITE_REGISTER) +/// llvm.memcpy intrinsic +HANDLE_TARGET_OPCODE(G_MEMCPY) + +/// llvm.memmove intrinsic +HANDLE_TARGET_OPCODE(G_MEMMOVE) + +/// llvm.memset intrinsic +HANDLE_TARGET_OPCODE(G_MEMSET) + /// Marker for the end of the generic opcode. /// This is used to check if an opcode is in the range of the /// generic opcodes. -HANDLE_TARGET_OPCODE_MARKER(PRE_ISEL_GENERIC_OPCODE_END, G_WRITE_REGISTER) +HANDLE_TARGET_OPCODE_MARKER(PRE_ISEL_GENERIC_OPCODE_END, G_MEMSET) /// BUILTIN_OP_END - This must be the last enum value in this list. /// The target-specific post-isel opcode values start here. diff --git a/llvm/include/llvm/Support/TypeSize.h b/llvm/include/llvm/Support/TypeSize.h index a7f5b849bcc11..8b346ad673d8a 100644 --- a/llvm/include/llvm/Support/TypeSize.h +++ b/llvm/include/llvm/Support/TypeSize.h @@ -67,8 +67,33 @@ class ElementCount { static ElementCount get(unsigned Min, bool Scalable) { return {Min, Scalable}; } + + /// Printing function. + void print(raw_ostream &OS) const { + if (Scalable) + OS << "vscale x "; + OS << Min; + } + /// Counting predicates. + /// + /// Notice that Min = 1 and Scalable = true is considered more than + /// one element. + /// + ///@{ No elements.. + bool isZero() const { return Min == 0; } + /// Exactly one element. + bool isScalar() const { return !Scalable && Min == 1; } + /// One or more elements. + bool isVector() const { return (Scalable && Min != 0) || Min > 1; } + ///@} }; +/// Stream operator function for `ElementCount`. 
+inline raw_ostream &operator<<(raw_ostream &OS, const ElementCount &EC) { + EC.print(OS); + return OS; +} + // This class is used to represent the size of types. If the type is of fixed // size, it will represent the exact size. If the type is a scalable vector, // it will represent the known minimum size. diff --git a/llvm/include/llvm/Support/X86TargetParser.def b/llvm/include/llvm/Support/X86TargetParser.def index 697f8c70f962d..e3998c99a50a6 100644 --- a/llvm/include/llvm/Support/X86TargetParser.def +++ b/llvm/include/llvm/Support/X86TargetParser.def @@ -84,6 +84,7 @@ X86_CPU_SUBTYPE(AMDFAM17H_ZNVER2, "znver2") X86_CPU_SUBTYPE(INTEL_COREI7_CASCADELAKE, "cascadelake") X86_CPU_SUBTYPE(INTEL_COREI7_TIGERLAKE, "tigerlake") X86_CPU_SUBTYPE(INTEL_COREI7_COOPERLAKE, "cooperlake") +X86_CPU_SUBTYPE(INTEL_COREI7_SAPPHIRERAPIDS, "sapphirerapids") #undef X86_CPU_SUBTYPE diff --git a/llvm/include/llvm/Support/X86TargetParser.h b/llvm/include/llvm/Support/X86TargetParser.h index 36d8f41df6ec9..d97f620419e1e 100644 --- a/llvm/include/llvm/Support/X86TargetParser.h +++ b/llvm/include/llvm/Support/X86TargetParser.h @@ -100,6 +100,7 @@ enum CPUKind { CK_IcelakeClient, CK_IcelakeServer, CK_Tigerlake, + CK_SapphireRapids, CK_KNL, CK_KNM, CK_Lakemont, diff --git a/llvm/include/llvm/Support/raw_ostream.h b/llvm/include/llvm/Support/raw_ostream.h index e9c710d0f38fe..cae57430baffb 100644 --- a/llvm/include/llvm/Support/raw_ostream.h +++ b/llvm/include/llvm/Support/raw_ostream.h @@ -412,6 +412,7 @@ class raw_fd_ostream : public raw_pwrite_stream { int FD; bool ShouldClose; bool SupportsSeeking = false; + mutable Optional HasColors; #ifdef _WIN32 /// True if this fd refers to a Windows console device. 
Mintty and other diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td index 87f0e4b61d31e..a33b7f4a13961 100644 --- a/llvm/include/llvm/Target/GenericOpcodes.td +++ b/llvm/include/llvm/Target/GenericOpcodes.td @@ -1272,3 +1272,30 @@ def G_STRICT_FDIV : ConstrainedIntruction; def G_STRICT_FREM : ConstrainedIntruction; def G_STRICT_FMA : ConstrainedIntruction; def G_STRICT_FSQRT : ConstrainedIntruction; + +//------------------------------------------------------------------------------ +// Memory intrinsics +//------------------------------------------------------------------------------ + +def G_MEMCPY : GenericInstruction { + let OutOperandList = (outs); + let InOperandList = (ins ptype0:$dst_addr, ptype1:$src_addr, type2:$size, untyped_imm_0:$tailcall); + let hasSideEffects = 0; + let mayLoad = 1; + let mayStore = 1; +} + +def G_MEMMOVE : GenericInstruction { + let OutOperandList = (outs); + let InOperandList = (ins ptype0:$dst_addr, ptype1:$src_addr, type2:$size, untyped_imm_0:$tailcall); + let hasSideEffects = 0; + let mayLoad = 1; + let mayStore = 1; +} + +def G_MEMSET : GenericInstruction { + let OutOperandList = (outs); + let InOperandList = (ins ptype0:$dst_addr, type1:$value, type2:$size, untyped_imm_0:$tailcall); + let hasSideEffects = 0; + let mayStore = 1; +} diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index baaf875881d5c..f0645d8380d8c 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -167,6 +167,14 @@ def mul_to_shl : GICombineRule< [{ return Helper.matchCombineMulToShl(*${mi}, ${matchinfo}); }]), (apply [{ Helper.applyCombineMulToShl(*${mi}, ${matchinfo}); }])>; +// shl ([asz]ext x), y => zext (shl x, y), if shift does not overflow int +def reduce_shl_of_extend_matchdata : GIDefMatchData<"RegisterImmPair">; +def reduce_shl_of_extend : GICombineRule< + (defs root:$dst, 
reduce_shl_of_extend_matchdata:$matchinfo), + (match (G_SHL $dst, $src0, $src1):$mi, + [{ return Helper.matchCombineShlOfExtend(*${mi}, ${matchinfo}); }]), + (apply [{ Helper.applyCombineShlOfExtend(*${mi}, ${matchinfo}); }])>; + // [us]itofp(undef) = 0, because the result value is bounded. def undef_to_fp_zero : GICombineRule< (defs root:$root), @@ -217,6 +225,24 @@ def select_same_val: GICombineRule< (apply [{ return Helper.replaceSingleDefInstWithOperand(*${root}, 2); }]) >; +// Fold (undef ? x : y) -> y +def select_undef_cmp: GICombineRule< + (defs root:$root), + (match (wip_match_opcode G_SELECT):$root, + [{ return Helper.matchUndefSelectCmp(*${root}); }]), + (apply [{ return Helper.replaceSingleDefInstWithOperand(*${root}, 2); }]) +>; + +// Fold (true ? x : y) -> x +// Fold (false ? x : y) -> y +def select_constant_cmp_matchdata : GIDefMatchData<"unsigned">; +def select_constant_cmp: GICombineRule< + (defs root:$root, select_constant_cmp_matchdata:$matchinfo), + (match (wip_match_opcode G_SELECT):$root, + [{ return Helper.matchConstantSelectCmp(*${root}, ${matchinfo}); }]), + (apply [{ return Helper.replaceSingleDefInstWithOperand(*${root}, ${matchinfo}); }]) +>; + // Fold x op 0 -> x def right_identity_zero: GICombineRule< (defs root:$root), @@ -283,6 +309,16 @@ def i2p_to_p2i: GICombineRule< (apply [{ return Helper.applyCombineP2IToI2P(*${root}, ${info}); }]) >; +// Fold add ptrtoint(x), y -> ptrtoint (ptr_add x), y +def add_p2i_to_ptradd_matchinfo : GIDefMatchData<"std::pair">; +def add_p2i_to_ptradd : GICombineRule< + (defs root:$root, add_p2i_to_ptradd_matchinfo:$info), + (match (wip_match_opcode G_ADD):$root, + [{ return Helper.matchCombineAddP2IToPtrAdd(*${root}, ${info}); }]), + (apply [{ return Helper.applyCombineAddP2IToPtrAdd(*${root}, ${info}); }]) +>; + + // Simplify: (logic_op (op x...), (op y...)) -> (op (logic_op x, y)) def hoist_logic_op_with_same_opcode_hands: GICombineRule < (defs root:$root, instruction_steps_matchdata:$info), @@ -321,9 
+357,17 @@ def identity_combines : GICombineGroup<[select_same_val, right_identity_zero, binop_right_to_zero, p2i_to_i2p, i2p_to_p2i]>; -def trivial_combines : GICombineGroup<[copy_prop, mul_to_shl]>; +def known_bits_simplifications : GICombineGroup<[and_trivial_mask]>; + +def width_reduction_combines : GICombineGroup<[reduce_shl_of_extend]>; + +def select_combines : GICombineGroup<[select_undef_cmp, select_constant_cmp]>; + +def trivial_combines : GICombineGroup<[copy_prop, mul_to_shl, add_p2i_to_ptradd]>; def all_combines : GICombineGroup<[trivial_combines, ptr_add_immed_chain, combines_for_extload, combine_indexed_load_store, undef_combines, identity_combines, simplify_add_to_sub, hoist_logic_op_with_same_opcode_hands, - shl_ashr_to_sext_inreg, sext_inreg_of_load]>; + shl_ashr_to_sext_inreg, sext_inreg_of_load, + width_reduction_combines, select_combines, + known_bits_simplifications]>; diff --git a/llvm/include/llvm/Target/TargetOptions.h b/llvm/include/llvm/Target/TargetOptions.h index c3f0963f59970..063fa93751144 100644 --- a/llvm/include/llvm/Target/TargetOptions.h +++ b/llvm/include/llvm/Target/TargetOptions.h @@ -126,8 +126,8 @@ namespace llvm { EmitStackSizeSection(false), EnableMachineOutliner(false), SupportsDefaultOutlining(false), EmitAddrsig(false), EmitCallSiteInfo(false), SupportsDebugEntryValues(false), - EnableDebugEntryValues(false), ForceDwarfFrameSection(false), - XRayOmitFunctionIndex(false), + EnableDebugEntryValues(false), ValueTrackingVariableLocations(false), + ForceDwarfFrameSection(false), XRayOmitFunctionIndex(false), FPDenormalMode(DenormalMode::IEEE, DenormalMode::IEEE) {} /// DisableFramePointerElim - This returns true if frame pointer elimination @@ -285,6 +285,11 @@ namespace llvm { /// production. bool ShouldEmitDebugEntryValues() const; + // When set to true, use experimental new debug variable location tracking, + // which seeks to follow the values of variables rather than their location, + // post isel. 
+ unsigned ValueTrackingVariableLocations : 1; + /// Emit DWARF debug frame section. unsigned ForceDwarfFrameSection : 1; diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h index 73e25417452cf..802d2bdbb1106 100644 --- a/llvm/include/llvm/Transforms/IPO/Attributor.h +++ b/llvm/include/llvm/Transforms/IPO/Attributor.h @@ -969,6 +969,7 @@ struct Attributor { /// attribute. Using this after Attributor started running is restricted to /// only the Attributor itself. Initial seeding of AAs can be done via this /// function. + /// NOTE: ForceUpdate is ignored in any stage other than the update stage. template const AAType &getOrCreateAAFor(const IRPosition &IRP, const AbstractAttribute *QueryingAA = nullptr, @@ -976,7 +977,7 @@ struct Attributor { DepClassTy DepClass = DepClassTy::OPTIONAL, bool ForceUpdate = false) { if (AAType *AAPtr = lookupAAFor(IRP, QueryingAA, TrackDependence)) { - if (ForceUpdate) + if (ForceUpdate && Phase == AttributorPhase::UPDATE) updateAA(*AAPtr); return *AAPtr; } @@ -986,7 +987,7 @@ struct Attributor { auto &AA = AAType::createForPosition(IRP, *this); // If we are currenty seeding attributes, enforce seeding rules. - if (SeedingPeriod && !shouldSeedAttribute(AA)) { + if (Phase == AttributorPhase::SEEDING && !shouldSeedAttribute(AA)) { AA.getState().indicatePessimisticFixpoint(); return AA; } @@ -1020,14 +1021,21 @@ struct Attributor { return AA; } + // If this is queried in the manifest stage, we force the AA to indicate + // pessimistic fixpoint immediately. + if (Phase == AttributorPhase::MANIFEST) { + AA.getState().indicatePessimisticFixpoint(); + return AA; + } + // Allow seeded attributes to declare dependencies. // Remember the seeding state. 
- bool OldSeedingPeriod = SeedingPeriod; - SeedingPeriod = false; + AttributorPhase OldPhase = Phase; + Phase = AttributorPhase::UPDATE; updateAA(AA); - SeedingPeriod = OldSeedingPeriod; + Phase = OldPhase; if (TrackDependence && AA.getState().isValidState()) recordDependence(AA, const_cast(*QueryingAA), @@ -1096,8 +1104,10 @@ struct Attributor { assert(!AAPtr && "Attribute already in map!"); AAPtr = &AA; - DG.SyntheticRoot.Deps.push_back( - AADepGraphNode::DepTy(&AA, unsigned(DepClassTy::REQUIRED))); + // Register AA with the synthetic root only before the manifest stage. + if (Phase == AttributorPhase::SEEDING || Phase == AttributorPhase::UPDATE) + DG.SyntheticRoot.Deps.push_back( + AADepGraphNode::DepTy(&AA, unsigned(DepClassTy::REQUIRED))); return AA; } @@ -1522,9 +1532,14 @@ struct Attributor { /// Invoke instructions with at least a single dead successor block. SmallVector InvokeWithDeadSuccessor; - /// Wheather attributes are being `seeded`, always false after ::run function - /// gets called \see getOrCreateAAFor. - bool SeedingPeriod = true; + /// A flag that indicates which stage of the process we are in. Initially, the + /// phase is SEEDING. Phase is changed in `Attributor::run()` + enum class AttributorPhase { + SEEDING, + UPDATE, + MANIFEST, + CLEANUP, + } Phase = AttributorPhase::SEEDING; /// Functions, blocks, and instructions we delete after manifest is done. /// @@ -2646,6 +2661,12 @@ struct AAIsDead : public StateWrapper { return F.hasPersonalityFn() && !canSimplifyInvokeNoUnwind(&F); } + /// Return if the edge from \p From BB to \p To BB is assumed dead. + /// This is specifically useful in AAReachability. 
+ virtual bool isEdgeDead(const BasicBlock *From, const BasicBlock *To) const { + return false; + } + /// See AbstractAttribute::getName() const std::string getName() const override { return "AAIsDead"; } @@ -3362,9 +3383,10 @@ template > struct PotentialValuesState : AbstractState { using SetTy = DenseSet; - PotentialValuesState() : IsValidState(true) {} + PotentialValuesState() : IsValidState(true), UndefIsContained(false) {} - PotentialValuesState(bool IsValid) : IsValidState(IsValid) {} + PotentialValuesState(bool IsValid) + : IsValidState(IsValid), UndefIsContained(false) {} /// See AbstractState::isValidState(...) bool isValidState() const override { return IsValidState.isValidState(); } @@ -3393,11 +3415,19 @@ struct PotentialValuesState : AbstractState { return Set; } + /// Returns whether this state contains an undef value or not. + bool undefIsContained() const { + assert(isValidState() && "This flag shoud not be used when it is invalid!"); + return UndefIsContained; + } + bool operator==(const PotentialValuesState &RHS) const { if (isValidState() != RHS.isValidState()) return false; if (!isValidState() && !RHS.isValidState()) return true; + if (undefIsContained() != RHS.undefIsContained()) + return false; return Set == RHS.getAssumedSet(); } @@ -3425,6 +3455,9 @@ struct PotentialValuesState : AbstractState { /// Union assumed set with assumed set of the passed state \p PVS. void unionAssumed(const PotentialValuesState &PVS) { unionWith(PVS); } + /// Union assumed set with an undef value. + void unionAssumedWithUndef() { unionWithUndef(); } + /// "Clamp" this state with \p PVS. PotentialValuesState operator^=(const PotentialValuesState &PVS) { IsValidState ^= PVS.IsValidState; @@ -3446,6 +3479,10 @@ struct PotentialValuesState : AbstractState { indicatePessimisticFixpoint(); } + /// If this state contains both undef and not undef, we can reduce + /// undef to the not undef value. 
+ void reduceUndefValue() { UndefIsContained = UndefIsContained & Set.empty(); } + /// Insert an element into this set. void insert(const MemberTy &C) { if (!isValidState()) @@ -3466,9 +3503,17 @@ struct PotentialValuesState : AbstractState { } for (const MemberTy &C : R.Set) Set.insert(C); + UndefIsContained |= R.undefIsContained(); + reduceUndefValue(); checkAndInvalidate(); } + /// Take union with an undef value. + void unionWithUndef() { + UndefIsContained = true; + reduceUndefValue(); + } + /// Take intersection with R. void intersectWith(const PotentialValuesState &R) { /// If R is a full set, do nothing. @@ -3485,6 +3530,8 @@ struct PotentialValuesState : AbstractState { IntersectSet.insert(C); } Set = IntersectSet; + UndefIsContained &= R.undefIsContained(); + reduceUndefValue(); } /// A helper state which indicate whether this state is valid or not. @@ -3492,6 +3539,9 @@ struct PotentialValuesState : AbstractState { /// Container for potential values SetTy Set; + + /// Flag for undef value + bool UndefIsContained; }; using PotentialConstantIntValuesState = PotentialValuesState; @@ -3538,8 +3588,12 @@ struct AAPotentialValues if (getAssumedSet().size() == 1) return cast(ConstantInt::get(getAssociatedValue().getType(), *(getAssumedSet().begin()))); - if (getAssumedSet().size() == 0) + if (getAssumedSet().size() == 0) { + if (undefIsContained()) + return cast( + ConstantInt::get(getAssociatedValue().getType(), 0)); return llvm::None; + } return nullptr; } diff --git a/llvm/include/llvm/Transforms/Instrumentation/HeapProfiler.h b/llvm/include/llvm/Transforms/Instrumentation/HeapProfiler.h new file mode 100644 index 0000000000000..af905bbecad8f --- /dev/null +++ b/llvm/include/llvm/Transforms/Instrumentation/HeapProfiler.h @@ -0,0 +1,51 @@ +//===--------- Definition of the HeapProfiler class ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the HeapProfiler class. +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_TRANSFORMS_INSTRUMENTATION_HEAPPROFILER_H +#define LLVM_TRANSFORMS_INSTRUMENTATION_HEAPPROFILER_H + +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" + +namespace llvm { + +/// Public interface to the heap profiler pass for instrumenting code to +/// profile heap memory accesses. +/// +/// The profiler itself is a function pass that works by inserting various +/// calls to the HeapProfiler runtime library functions. The runtime library +/// essentially replaces malloc() and free() with custom implementations that +/// record data about the allocations. +class HeapProfilerPass : public PassInfoMixin { +public: + explicit HeapProfilerPass(); + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +/// Public interface to the heap profiler module pass for instrumenting code +/// to profile heap memory allocations and accesses. 
+class ModuleHeapProfilerPass : public PassInfoMixin { +public: + explicit ModuleHeapProfilerPass(); + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + +private: +}; + +// Insert HeapProfiler instrumentation +FunctionPass *createHeapProfilerFunctionPass(); +ModulePass *createModuleHeapProfilerLegacyPassPass(); + +} // namespace llvm + +#endif diff --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h index 8dd59e018061b..242ffa0ede09d 100644 --- a/llvm/include/llvm/Transforms/Scalar.h +++ b/llvm/include/llvm/Transforms/Scalar.h @@ -24,12 +24,6 @@ class FunctionPass; class ModulePass; class Pass; -//===----------------------------------------------------------------------===// -// -// ConstantPropagation - A worklist driven constant propagation pass -// -FunctionPass *createConstantPropagationPass(); - //===----------------------------------------------------------------------===// // // AlignmentFromAssumptions - Use assume intrinsics to set load/store diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h index 7235aa5861120..46d107128ce15 100644 --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -230,6 +230,7 @@ class LoopVectorizationLegality { /// Return true if we can vectorize this loop while folding its tail by /// masking, and mark all respective loads/stores for masking. + /// This object's state is only modified iff this function returns true. bool prepareToFoldTailByMasking(); /// Returns the primary induction variable. @@ -370,8 +371,14 @@ class LoopVectorizationLegality { /// its original trip-count, under a proper guard, which should be preserved. /// \p SafePtrs is a list of addresses that are known to be legal and we know /// that we can read from them without segfault. 
+ /// \p MaskedOp is a list of instructions that have to be transformed into + /// calls to the appropriate masked intrinsic when the loop is vectorized. + /// \p ConditionalAssumes is a list of assume instructions in predicated + /// blocks that must be dropped if the CFG gets flattened. bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl &SafePtrs, - bool PreserveGuards = false); + SmallPtrSetImpl &MaskedOp, + SmallPtrSetImpl &ConditionalAssumes, + bool PreserveGuards = false) const; /// Updates the vectorization state by adding \p Phi to the inductions list. /// This can set \p Phi as the main induction of the loop if \p Phi is a diff --git a/llvm/lib/Analysis/CallGraphSCCPass.cpp b/llvm/lib/Analysis/CallGraphSCCPass.cpp index 91f8029cc326b..17dd4dd389d57 100644 --- a/llvm/lib/Analysis/CallGraphSCCPass.cpp +++ b/llvm/lib/Analysis/CallGraphSCCPass.cpp @@ -28,6 +28,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/OptBisect.h" #include "llvm/IR/PassTimingInfo.h" +#include "llvm/IR/StructuralHash.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -466,11 +467,24 @@ bool CGPassManager::RunAllPassesOnSCC(CallGraphSCC &CurSCC, CallGraph &CG, initializeAnalysisImpl(P); +#ifdef EXPENSIVE_CHECKS + uint64_t RefHash = StructuralHash(CG.getModule()); +#endif + // Actually run this pass on the current SCC. 
- Changed |= RunPassOnSCC(P, CurSCC, CG, - CallGraphUpToDate, DevirtualizedCall); + bool LocalChanged = + RunPassOnSCC(P, CurSCC, CG, CallGraphUpToDate, DevirtualizedCall); - if (Changed) + Changed |= LocalChanged; + +#ifdef EXPENSIVE_CHECKS + if (!LocalChanged && (RefHash != StructuralHash(CG.getModule()))) { + llvm::errs() << "Pass modifies its input and doesn't report it: " + << P->getPassName() << "\n"; + llvm_unreachable("Pass modifies its input and doesn't report it"); + } +#endif + if (LocalChanged) dumpPassInfo(P, MODIFICATION_MSG, ON_CG_MSG, ""); dumpPreservedSet(P); diff --git a/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp b/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp index 19ec8cbd4042c..6f8205e19230e 100644 --- a/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp +++ b/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp @@ -232,7 +232,9 @@ class DevelopmentModeMLInlineAdvisor : public MLInlineAdvisor { size_t getTotalSizeEstimate(); virtual ~DevelopmentModeMLInlineAdvisor(); - void updateNativeSizeEstimate(int64_t Change) { CurrentNativeSize += Change; } + void updateNativeSizeEstimate(int64_t Change) { + *CurrentNativeSize += Change; + } void resetNativeSize(Function *F) { FAM.invalidate(*F); } @@ -242,7 +244,7 @@ class DevelopmentModeMLInlineAdvisor : public MLInlineAdvisor { std::unique_ptr getAdviceFromModel(CallBase &CB, OptimizationRemarkEmitter &ORE) override; - size_t getNativeSizeEstimate(const Function &F) const; + Optional getNativeSizeEstimate(const Function &F) const; private: bool isLogging() const { return !!Logger; } @@ -251,8 +253,8 @@ class DevelopmentModeMLInlineAdvisor : public MLInlineAdvisor { const bool IsDoingInference; std::unique_ptr Logger; - const int32_t InitialNativeSize; - int32_t CurrentNativeSize = 0; + const Optional InitialNativeSize; + Optional CurrentNativeSize; }; /// A variant of MLInlineAdvice that tracks all non-trivial inlining @@ -261,9 +263,10 @@ class LoggingMLInlineAdvice : public MLInlineAdvice 
{ public: LoggingMLInlineAdvice(DevelopmentModeMLInlineAdvisor *Advisor, CallBase &CB, OptimizationRemarkEmitter &ORE, bool Recommendation, - TrainingLogger &Logger, size_t CallerSizeEstimateBefore, - size_t CalleeSizeEstimateBefore, bool DefaultDecision, - bool Mandatory = false) + TrainingLogger &Logger, + Optional CallerSizeEstimateBefore, + Optional CalleeSizeEstimateBefore, + bool DefaultDecision, bool Mandatory = false) : MLInlineAdvice(Advisor, CB, ORE, Recommendation), Logger(Logger), CallerSizeEstimateBefore(CallerSizeEstimateBefore), CalleeSizeEstimateBefore(CalleeSizeEstimateBefore), @@ -279,11 +282,12 @@ class LoggingMLInlineAdvice : public MLInlineAdvice { MLInlineAdvice::recordInliningImpl(); getAdvisor()->resetNativeSize(Caller); int Reward = std::numeric_limits::max(); - if (!getAdvisor()->isForcedToStop()) { - int NativeSizeAfter = getAdvisor()->getNativeSizeEstimate(*Caller) + - CalleeSizeEstimateBefore; + if (InlineSizeEstimatorAnalysis::isEvaluatorRequested() && + !getAdvisor()->isForcedToStop()) { + int NativeSizeAfter = *getAdvisor()->getNativeSizeEstimate(*Caller) + + *CalleeSizeEstimateBefore; Reward = NativeSizeAfter - - (CallerSizeEstimateBefore + CalleeSizeEstimateBefore); + (*CallerSizeEstimateBefore + *CalleeSizeEstimateBefore); getAdvisor()->updateNativeSizeEstimate(Reward); } log(Reward, /*Success=*/true); @@ -292,10 +296,11 @@ class LoggingMLInlineAdvice : public MLInlineAdvice { void recordInliningWithCalleeDeletedImpl() override { MLInlineAdvice::recordInliningWithCalleeDeletedImpl(); getAdvisor()->resetNativeSize(Caller); - if (!getAdvisor()->isForcedToStop()) { - int NativeSizeAfter = getAdvisor()->getNativeSizeEstimate(*Caller); + if (InlineSizeEstimatorAnalysis::isEvaluatorRequested() && + !getAdvisor()->isForcedToStop()) { + int NativeSizeAfter = *getAdvisor()->getNativeSizeEstimate(*Caller); int Reward = NativeSizeAfter - - (CallerSizeEstimateBefore + CalleeSizeEstimateBefore); + (*CallerSizeEstimateBefore + 
*CalleeSizeEstimateBefore); getAdvisor()->updateNativeSizeEstimate(Reward); log(Reward, /*Success=*/true); } @@ -324,8 +329,8 @@ class LoggingMLInlineAdvice : public MLInlineAdvice { static const int64_t NoReward = 0; TrainingLogger &Logger; - const size_t CallerSizeEstimateBefore; - const size_t CalleeSizeEstimateBefore; + const Optional CallerSizeEstimateBefore; + const Optional CalleeSizeEstimateBefore; const bool DefaultDecision; const bool Mandatory; }; @@ -448,9 +453,11 @@ void TrainingLogger::print() { writeRawTensorsAsFeatureLists( OutFile, TensorSpec::createSpec(DecisionName, {1}), Outputs[0].data(), NumberOfRecords); - writeTensorsAsFeatureLists(OutFile, - TensorSpec::createSpec(RewardName, {1}), - Rewards.data(), NumberOfRecords); + + if (InlineSizeEstimatorAnalysis::isEvaluatorRequested()) + writeTensorsAsFeatureLists(OutFile, + TensorSpec::createSpec(RewardName, {1}), + Rewards.data(), NumberOfRecords); for (size_t I = 1; I < Outputs.size(); ++I) writeRawTensorsAsFeatureLists(OutFile, MUTR->outputSpecs()[I], @@ -479,8 +486,10 @@ DevelopmentModeMLInlineAdvisor::~DevelopmentModeMLInlineAdvisor() { Logger->print(); } -size_t +Optional DevelopmentModeMLInlineAdvisor::getNativeSizeEstimate(const Function &F) const { + if (!InlineSizeEstimatorAnalysis::isEvaluatorRequested()) + return None; auto &R = FAM.getResult(const_cast(F)); if (!R) { @@ -496,6 +505,7 @@ DevelopmentModeMLInlineAdvisor::getMandatoryAdvice( CallBase &CB, OptimizationRemarkEmitter &ORE) { if (!isLogging()) return MLInlineAdvisor::getMandatoryAdvice(CB, ORE); + return std::make_unique( /*Advisor=*/this, /*CB=*/CB, /*ORE=*/ORE, /*Recommendation=*/true, /*Logger=*/*Logger, @@ -524,13 +534,15 @@ DevelopmentModeMLInlineAdvisor::getAdviceFromModel( } size_t DevelopmentModeMLInlineAdvisor::getTotalSizeEstimate() { + if (!InlineSizeEstimatorAnalysis::isEvaluatorRequested()) + return 0; size_t Ret = 0; for (auto &F : M) { if (F.isDeclaration()) continue; if (isFunctionDeleted(&F)) continue; - Ret 
+= getNativeSizeEstimate(F); + Ret += *getNativeSizeEstimate(F); } return Ret; } @@ -642,14 +654,6 @@ std::unique_ptr llvm::getDevelopmentModeAdvisor( Module &M, ModuleAnalysisManager &MAM, std::function GetDefaultAdvice) { auto &Ctx = M.getContext(); - if (TrainingLog.empty() != - !InlineSizeEstimatorAnalysis::isEvaluatorRequested()) { - Ctx.emitError("For development mode, if training logs are requested, then " - "a size estimator must be available; either that, or neither " - "are specified."); - return nullptr; - } - std::unique_ptr Runner; ModelUnderTrainingRunner *MUTRPtr = nullptr; bool IsDoingInference = false; diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 9b5bb37a0ff75..3746bc66e426a 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -4302,7 +4302,7 @@ Value *llvm::SimplifyInsertElementInst(Value *Vec, Value *Val, Value *Idx, auto *ValC = dyn_cast(Val); auto *IdxC = dyn_cast(Idx); if (VecC && ValC && IdxC) - return ConstantFoldInsertElementInstruction(VecC, ValC, IdxC); + return ConstantExpr::getInsertElement(VecC, ValC, IdxC); // For fixed-length vector, fold into undef if index is out of bounds. if (auto *CI = dyn_cast(Idx)) { @@ -4367,7 +4367,7 @@ static Value *SimplifyExtractElementInst(Value *Vec, Value *Idx, auto *VecVTy = cast(Vec->getType()); if (auto *CVec = dyn_cast(Vec)) { if (auto *CIdx = dyn_cast(Idx)) - return ConstantFoldExtractElementInstruction(CVec, CIdx); + return ConstantExpr::getExtractElement(CVec, CIdx); // The index is not relevant if our vector is a splat. if (auto *Splat = CVec->getSplatValue()) @@ -4403,6 +4403,21 @@ Value *llvm::SimplifyExtractElementInst(Value *Vec, Value *Idx, /// See if we can fold the given phi. If not, returns null. static Value *SimplifyPHINode(PHINode *PN, const SimplifyQuery &Q) { + // Is there an identical PHI node before this one in this basic block? 
+ if (BasicBlock *BB = PN->getParent()) { + for (PHINode &Src : BB->phis()) { + // Once we've reached the PHI node we've been asked about, stop looking. + if (&Src == PN) + break; + // If the previous PHI is currently trivially dead, ignore it, + // it might have been already recorded as being dead. + if (Src.use_empty()) + continue; + if (PN->isIdenticalToWhenDefined(&Src)) + return &Src; + } + } + // If all of the PHI's incoming values are the same then replace the PHI node // with the common value. Value *CommonValue = nullptr; @@ -4565,8 +4580,8 @@ static Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, // If all operands are constant, constant fold the shuffle. This // transformation depends on the value of the mask which is not known at // compile time for scalable vectors - if (!Scalable && Op0Const && Op1Const) - return ConstantFoldShuffleVectorInstruction(Op0Const, Op1Const, Mask); + if (Op0Const && Op1Const) + return ConstantExpr::getShuffleVector(Op0Const, Op1Const, Mask); // Canonicalization: if only one input vector is constant, it shall be the // second one. This transformation depends on the value of the mask which @@ -5232,6 +5247,16 @@ static APInt getMaxMinLimit(Intrinsic::ID IID, unsigned BitWidth) { } } +static ICmpInst::Predicate getMaxMinPredicate(Intrinsic::ID IID) { + switch (IID) { + case Intrinsic::smax: return ICmpInst::ICMP_SGE; + case Intrinsic::smin: return ICmpInst::ICMP_SLE; + case Intrinsic::umax: return ICmpInst::ICMP_UGE; + case Intrinsic::umin: return ICmpInst::ICMP_ULE; + default: llvm_unreachable("Unexpected intrinsic"); + } +} + /// Given a min/max intrinsic, see if it can be removed based on having an /// operand that is another min/max intrinsic with shared operand(s). The caller /// is expected to swap the operand arguments to handle commutation. 
@@ -5324,6 +5349,12 @@ static Value *simplifyBinaryIntrinsic(Function *F, Value *Op0, Value *Op1, if (Value *V = foldMinMaxSharedOp(IID, Op1, Op0)) return V; + ICmpInst::Predicate Pred = getMaxMinPredicate(IID); + if (isICmpTrue(Pred, Op0, Op1, Q.getWithoutUndef(), RecursionLimit)) + return Op0; + if (isICmpTrue(Pred, Op1, Op0, Q.getWithoutUndef(), RecursionLimit)) + return Op1; + break; } case Intrinsic::usub_with_overflow: diff --git a/llvm/lib/Analysis/LoopNestAnalysis.cpp b/llvm/lib/Analysis/LoopNestAnalysis.cpp index 61e53de93151a..7fffaff22c25c 100644 --- a/llvm/lib/Analysis/LoopNestAnalysis.cpp +++ b/llvm/lib/Analysis/LoopNestAnalysis.cpp @@ -228,6 +228,28 @@ static bool checkLoopsStructure(const Loop &OuterLoop, const Loop &InnerLoop, InnerLoop.getExitingBlock() != InnerLoopLatch || !InnerLoopExit) return false; + // Returns whether the block `ExitBlock` contains at least one LCSSA Phi node. + auto ContainsLCSSAPhi = [](const BasicBlock &ExitBlock) { + return any_of(ExitBlock.phis(), [](const PHINode &PN) { + return PN.getNumIncomingValues() == 1; + }); + }; + + // Returns whether the block `BB` qualifies for being an extra Phi block. The + // extra Phi block is the additional block inserted after the exit block of an + // "guarded" inner loop which contains "only" Phi nodes corresponding to the + // LCSSA Phi nodes in the exit block. + auto IsExtraPhiBlock = [&](const BasicBlock &BB) { + return BB.getFirstNonPHI() == BB.getTerminator() && + all_of(BB.phis(), [&](const PHINode &PN) { + return all_of(PN.blocks(), [&](const BasicBlock *IncomingBlock) { + return IncomingBlock == InnerLoopExit || + IncomingBlock == OuterLoopHeader; + }); + }); + }; + + const BasicBlock *ExtraPhiBlock = nullptr; // Ensure the only branch that may exist between the loops is the inner loop // guard. 
if (OuterLoopHeader != InnerLoopPreHeader) { @@ -237,6 +259,8 @@ static bool checkLoopsStructure(const Loop &OuterLoop, const Loop &InnerLoop, if (!BI || BI != InnerLoop.getLoopGuardBranch()) return false; + bool InnerLoopExitContainsLCSSA = ContainsLCSSAPhi(*InnerLoopExit); + // The successors of the inner loop guard should be the inner loop // preheader and the outer loop latch. for (const BasicBlock *Succ : BI->successors()) { @@ -245,6 +269,20 @@ static bool checkLoopsStructure(const Loop &OuterLoop, const Loop &InnerLoop, if (Succ == OuterLoopLatch) continue; + // If `InnerLoopExit` contains LCSSA Phi instructions, additional block + // may be inserted before the `OuterLoopLatch` to which `BI` jumps. The + // loops are still considered perfectly nested if the extra block only + // contains Phi instructions from InnerLoopExit and OuterLoopHeader. + if (InnerLoopExitContainsLCSSA && IsExtraPhiBlock(*Succ) && + Succ->getSingleSuccessor() == OuterLoopLatch) { + // Points to the extra block so that we can reference it later in the + // final check. We can also conclude that the inner loop is + // guarded and there exists LCSSA Phi node in the exit block later if we + // see a non-null `ExtraPhiBlock`. + ExtraPhiBlock = Succ; + continue; + } + DEBUG_WITH_TYPE(VerboseDebug, { dbgs() << "Inner loop guard successor " << Succ->getName() << " doesn't lead to inner loop preheader or " @@ -255,7 +293,9 @@ static bool checkLoopsStructure(const Loop &OuterLoop, const Loop &InnerLoop, } // Ensure the inner loop exit block leads to the outer loop latch. 
- if (InnerLoopExit->getSingleSuccessor() != OuterLoopLatch) { + const BasicBlock *SuccInner = InnerLoopExit->getSingleSuccessor(); + if (!SuccInner || + (SuccInner != OuterLoopLatch && SuccInner != ExtraPhiBlock)) { DEBUG_WITH_TYPE( VerboseDebug, dbgs() << "Inner loop exit block " << *InnerLoopExit diff --git a/llvm/lib/Analysis/LoopPass.cpp b/llvm/lib/Analysis/LoopPass.cpp index 520f06003dd22..317b9577d7917 100644 --- a/llvm/lib/Analysis/LoopPass.cpp +++ b/llvm/lib/Analysis/LoopPass.cpp @@ -20,6 +20,7 @@ #include "llvm/IR/OptBisect.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/PassTimingInfo.h" +#include "llvm/IR/StructuralHash.h" #include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Support/TimeProfiler.h" @@ -191,7 +192,19 @@ bool LPPassManager::runOnFunction(Function &F) { { PassManagerPrettyStackEntry X(P, *CurrentLoop->getHeader()); TimeRegion PassTimer(getPassTimer(P)); +#ifdef EXPENSIVE_CHECKS + uint64_t RefHash = StructuralHash(F); +#endif LocalChanged = P->runOnLoop(CurrentLoop, *this); + +#ifdef EXPENSIVE_CHECKS + if (!LocalChanged && (RefHash != StructuralHash(F))) { + llvm::errs() << "Pass modifies its input and doesn't report it: " + << P->getPassName() << "\n"; + llvm_unreachable("Pass modifies its input and doesn't report it"); + } +#endif + Changed |= LocalChanged; if (EmitICRemark) { unsigned NewSize = F.getInstructionCount(); diff --git a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp index 7f3de0fcf140a..2428d57d2809f 100644 --- a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -362,6 +362,8 @@ MemoryDependenceResults::getInvariantGroupPointerDependency(LoadInst *LI, MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( const MemoryLocation &MemLoc, bool isLoad, BasicBlock::iterator ScanIt, BasicBlock *BB, Instruction *QueryInst, unsigned *Limit) { + // We can batch AA queries, because IR 
does not change during a MemDep query. + BatchAAResults BatchAA(AA); bool isInvariantLoad = false; unsigned DefaultLimit = getDefaultBlockScanLimit(); @@ -445,7 +447,7 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( // pointer, not on query pointers that are indexed off of them. It'd // be nice to handle that at some point (the right approach is to use // GetPointerBaseWithConstantOffset). - if (AA.isMustAlias(MemoryLocation(II->getArgOperand(1)), MemLoc)) + if (BatchAA.isMustAlias(MemoryLocation(II->getArgOperand(1)), MemLoc)) return MemDepResult::getDef(II); continue; } @@ -485,7 +487,7 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( MemoryLocation LoadLoc = MemoryLocation::get(LI); // If we found a pointer, check if it could be the same as our pointer. - AliasResult R = AA.alias(LoadLoc, MemLoc); + AliasResult R = BatchAA.alias(LoadLoc, MemLoc); if (isLoad) { if (R == NoAlias) @@ -516,7 +518,7 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( continue; // Stores don't alias loads from read-only memory. - if (AA.pointsToConstantMemory(LoadLoc)) + if (BatchAA.pointsToConstantMemory(LoadLoc)) continue; // Stores depend on may/must aliased loads. @@ -547,7 +549,7 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( // If alias analysis can tell that this store is guaranteed to not modify // the query pointer, ignore it. Use getModRefInfo to handle cases where // the query pointer points to constant memory etc. - if (!isModOrRefSet(AA.getModRefInfo(SI, MemLoc))) + if (!isModOrRefSet(BatchAA.getModRefInfo(SI, MemLoc))) continue; // Ok, this store might clobber the query pointer. Check to see if it is @@ -556,7 +558,7 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( MemoryLocation StoreLoc = MemoryLocation::get(SI); // If we found a pointer, check if it could be the same as our pointer. 
- AliasResult R = AA.alias(StoreLoc, MemLoc); + AliasResult R = BatchAA.alias(StoreLoc, MemLoc); if (R == NoAlias) continue; @@ -575,7 +577,7 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( // handled by BasicAA. if (isa(Inst) || isNoAliasFn(Inst, &TLI)) { const Value *AccessPtr = getUnderlyingObject(MemLoc.Ptr); - if (AccessPtr == Inst || AA.isMustAlias(Inst, AccessPtr)) + if (AccessPtr == Inst || BatchAA.isMustAlias(Inst, AccessPtr)) return MemDepResult::getDef(Inst); } @@ -592,9 +594,10 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( continue; // See if this instruction (e.g. a call or vaarg) mod/ref's the pointer. - ModRefInfo MR = AA.getModRefInfo(Inst, MemLoc); + ModRefInfo MR = BatchAA.getModRefInfo(Inst, MemLoc); // If necessary, perform additional analysis. if (isModAndRefSet(MR)) + // TODO: Support callCapturesBefore() on BatchAAResults. MR = AA.callCapturesBefore(Inst, MemLoc, &DT); switch (clearMust(MR)) { case ModRefInfo::NoModRef: diff --git a/llvm/lib/Analysis/MemorySSA.cpp b/llvm/lib/Analysis/MemorySSA.cpp index 6fc827ae2b17c..f54f04460a4d7 100644 --- a/llvm/lib/Analysis/MemorySSA.cpp +++ b/llvm/lib/Analysis/MemorySSA.cpp @@ -1726,8 +1726,11 @@ MemoryUseOrDef *MemorySSA::createDefinedAccess(Instruction *I, if (CreationMustSucceed) assert(NewAccess != nullptr && "Tried to create a memory access for a " "non-memory touching instruction"); - if (NewAccess) + if (NewAccess) { + assert((!Definition || !isa(Definition)) && + "A use cannot be a defining access"); NewAccess->setDefiningAccess(Definition); + } return NewAccess; } diff --git a/llvm/lib/Analysis/RegionPass.cpp b/llvm/lib/Analysis/RegionPass.cpp index 6c0d17b45c622..1e1971f119a09 100644 --- a/llvm/lib/Analysis/RegionPass.cpp +++ b/llvm/lib/Analysis/RegionPass.cpp @@ -15,6 +15,7 @@ #include "llvm/Analysis/RegionPass.h" #include "llvm/IR/OptBisect.h" #include "llvm/IR/PassTimingInfo.h" +#include "llvm/IR/StructuralHash.h" #include 
"llvm/Support/Debug.h" #include "llvm/Support/Timer.h" #include "llvm/Support/raw_ostream.h" @@ -90,15 +91,29 @@ bool RGPassManager::runOnFunction(Function &F) { initializeAnalysisImpl(P); + bool LocalChanged = false; { PassManagerPrettyStackEntry X(P, *CurrentRegion->getEntry()); TimeRegion PassTimer(getPassTimer(P)); - Changed |= P->runOnRegion(CurrentRegion, *this); +#ifdef EXPENSIVE_CHECKS + uint64_t RefHash = StructuralHash(F); +#endif + LocalChanged = P->runOnRegion(CurrentRegion, *this); + +#ifdef EXPENSIVE_CHECKS + if (!LocalChanged && (RefHash != StructuralHash(F))) { + llvm::errs() << "Pass modifies its input and doesn't report it: " + << P->getPassName() << "\n"; + llvm_unreachable("Pass modifies its input and doesn't report it"); + } +#endif + + Changed |= LocalChanged; } if (isPassDebuggingExecutionsOrMore()) { - if (Changed) + if (LocalChanged) dumpPassInfo(P, MODIFICATION_MSG, ON_REGION_MSG, skipThisRegion ? "" : CurrentRegion->getNameStr()); diff --git a/llvm/lib/Analysis/StackLifetime.cpp b/llvm/lib/Analysis/StackLifetime.cpp index ef8103f6c516c..f95a8918afbba 100644 --- a/llvm/lib/Analysis/StackLifetime.cpp +++ b/llvm/lib/Analysis/StackLifetime.cpp @@ -64,18 +64,44 @@ bool StackLifetime::isAliveAfter(const AllocaInst *AI, return getLiveRange(AI).test(InstNum); } +// Returns unique alloca annotated by lifetime marker only if +// markers has the same size and points to the alloca start. 
+static const AllocaInst *findMatchingAlloca(const IntrinsicInst &II, + const DataLayout &DL) { + const AllocaInst *AI = findAllocaForValue(II.getArgOperand(1), true); + if (!AI) + return nullptr; + + auto AllocaSizeInBits = AI->getAllocationSizeInBits(DL); + if (!AllocaSizeInBits) + return nullptr; + int64_t AllocaSize = AllocaSizeInBits.getValue() / 8; + + auto *Size = dyn_cast(II.getArgOperand(0)); + if (!Size) + return nullptr; + int64_t LifetimeSize = Size->getSExtValue(); + + if (LifetimeSize != -1 && LifetimeSize != AllocaSize) + return nullptr; + + return AI; +} + void StackLifetime::collectMarkers() { InterestingAllocas.resize(NumAllocas); DenseMap> BBMarkerSet; + const DataLayout &DL = F.getParent()->getDataLayout(); + // Compute the set of start/end markers per basic block. for (const BasicBlock *BB : depth_first(&F)) { for (const Instruction &I : *BB) { const IntrinsicInst *II = dyn_cast(&I); if (!II || !II->isLifetimeStartOrEnd()) continue; - const AllocaInst *AI = llvm::findAllocaForValue(II->getArgOperand(1)); + const AllocaInst *AI = findMatchingAlloca(*II, DL); if (!AI) { HasUnknownLifetimeStartOrEnd = true; continue; diff --git a/llvm/lib/Analysis/SyncDependenceAnalysis.cpp b/llvm/lib/Analysis/SyncDependenceAnalysis.cpp index ccf520dcea66e..36bef705d4f30 100644 --- a/llvm/lib/Analysis/SyncDependenceAnalysis.cpp +++ b/llvm/lib/Analysis/SyncDependenceAnalysis.cpp @@ -1,5 +1,4 @@ -//===- SyncDependenceAnalysis.cpp - Divergent Branch Dependence Calculation -//--===// +//==- SyncDependenceAnalysis.cpp - Divergent Branch Dependence Calculation -==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -99,10 +98,10 @@ // loop exit and the loop header (_after_ SSA construction). 
// //===----------------------------------------------------------------------===// +#include "llvm/Analysis/SyncDependenceAnalysis.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/PostDominators.h" -#include "llvm/Analysis/SyncDependenceAnalysis.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Dominators.h" @@ -221,7 +220,9 @@ struct DivergencePropagator { SuccessorIterable NodeSuccessors, const Loop *ParentLoop) { assert(JoinBlocks); - LLVM_DEBUG(dbgs() << "SDA:computeJoinPoints. Parent loop: " << (ParentLoop ? ParentLoop->getName() : "") << "\n" ); + LLVM_DEBUG(dbgs() << "SDA:computeJoinPoints. Parent loop: " + << (ParentLoop ? ParentLoop->getName() : "") + << "\n"); // bootstrap with branch targets for (const auto *SuccBlock : NodeSuccessors) { @@ -236,12 +237,10 @@ struct DivergencePropagator { } } - LLVM_DEBUG( - dbgs() << "SDA: rpo order:\n"; - for (const auto * RpoBlock : FuncRPOT) { - dbgs() << "- " << RpoBlock->getName() << "\n"; - } - ); + LLVM_DEBUG(dbgs() << "SDA: rpo order:\n"; for (const auto *RpoBlock + : FuncRPOT) { + dbgs() << "- " << RpoBlock->getName() << "\n"; + }); auto ItBeginRPO = FuncRPOT.begin(); auto ItEndRPO = FuncRPOT.end(); @@ -253,8 +252,7 @@ struct DivergencePropagator { // propagate definitions at the immediate successors of the node in RPO auto ItBlockRPO = ItBeginRPO; - while ((++ItBlockRPO != ItEndRPO) && - !PendingUpdates.empty()) { + while ((++ItBlockRPO != ItEndRPO) && !PendingUpdates.empty()) { const auto *Block = *ItBlockRPO; LLVM_DEBUG(dbgs() << "SDA::joins. visiting " << Block->getName() << "\n"); @@ -311,7 +309,8 @@ struct DivergencePropagator { assert(ParentLoop); auto ItHeaderDef = DefMap.find(ParentLoopHeader); - const auto *HeaderDefBlock = (ItHeaderDef == DefMap.end()) ? nullptr : ItHeaderDef->second; + const auto *HeaderDefBlock = + (ItHeaderDef == DefMap.end()) ? 
nullptr : ItHeaderDef->second; LLVM_DEBUG(printDefs(dbgs())); assert(HeaderDefBlock && "no definition at header of carrying loop"); diff --git a/llvm/lib/Analysis/TFUtils.cpp b/llvm/lib/Analysis/TFUtils.cpp index 99b63305121de..648a3a4adfd6e 100644 --- a/llvm/lib/Analysis/TFUtils.cpp +++ b/llvm/lib/Analysis/TFUtils.cpp @@ -122,8 +122,8 @@ Optional getTensorSpecFromJSON(LLVMContext &Ctx, if (!Mapper.map>("shape", TensorShape)) return EmitError("'shape' property not present or not an int array"); -#define PARSE_TYPE(T, S, E) \ - if (TensorType == #S) \ +#define PARSE_TYPE(T, E) \ + if (TensorType == #T) \ return TensorSpec::createSpec(TensorName, TensorShape, TensorPort); TFUTILS_SUPPORTED_TYPES(PARSE_TYPE) #undef PARSE_TYPE @@ -307,8 +307,8 @@ TFModelEvaluator::EvaluationResult::getUntypedTensorValue(size_t Index) const { return TF_TensorData(Impl->getOutput()[Index]); } -#define TFUTILS_GETDATATYPE_IMPL(T, S, E) \ - template <> int TensorSpec::getDataType() { return TF_##E; } +#define TFUTILS_GETDATATYPE_IMPL(T, E) \ + template <> int TensorSpec::getDataType() { return E; } TFUTILS_SUPPORTED_TYPES(TFUTILS_GETDATATYPE_IMPL) diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 07c4e57228f11..f8a5cecc16a81 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -4330,43 +4330,43 @@ bool llvm::getUnderlyingObjectsForCodeGen(const Value *V, return true; } -static AllocaInst * -findAllocaForValue(Value *V, DenseMap &AllocaForValue) { - if (AllocaInst *AI = dyn_cast(V)) - return AI; - // See if we've already calculated (or started to calculate) alloca for a - // given value. - auto I = AllocaForValue.find(V); - if (I != AllocaForValue.end()) - return I->second; - // Store 0 while we're calculating alloca for value V to avoid - // infinite recursion if the value references itself. 
- AllocaForValue[V] = nullptr; - AllocaInst *Res = nullptr; - if (CastInst *CI = dyn_cast(V)) - Res = findAllocaForValue(CI->getOperand(0), AllocaForValue); - else if (PHINode *PN = dyn_cast(V)) { - for (Value *IncValue : PN->incoming_values()) { - // Allow self-referencing phi-nodes. - if (IncValue == PN) - continue; - AllocaInst *IncValueAI = findAllocaForValue(IncValue, AllocaForValue); - // AI for incoming values should exist and should all be equal. - if (IncValueAI == nullptr || (Res != nullptr && IncValueAI != Res)) +AllocaInst *llvm::findAllocaForValue(Value *V, bool OffsetZero) { + AllocaInst *Result = nullptr; + SmallPtrSet Visited; + SmallVector Worklist; + + auto AddWork = [&](Value *V) { + if (Visited.insert(V).second) + Worklist.push_back(V); + }; + + AddWork(V); + do { + V = Worklist.pop_back_val(); + assert(Visited.count(V)); + + if (AllocaInst *AI = dyn_cast(V)) { + if (Result && Result != AI) return nullptr; - Res = IncValueAI; + Result = AI; + } else if (CastInst *CI = dyn_cast(V)) { + AddWork(CI->getOperand(0)); + } else if (PHINode *PN = dyn_cast(V)) { + for (Value *IncValue : PN->incoming_values()) + AddWork(IncValue); + } else if (auto *SI = dyn_cast(V)) { + AddWork(SI->getTrueValue()); + AddWork(SI->getFalseValue()); + } else if (GetElementPtrInst *GEP = dyn_cast(V)) { + if (OffsetZero && !GEP->hasAllZeroIndices()) + return nullptr; + AddWork(GEP->getPointerOperand()); + } else { + return nullptr; } - } else if (GetElementPtrInst *EP = dyn_cast(V)) { - Res = findAllocaForValue(EP->getPointerOperand(), AllocaForValue); - } - if (Res) - AllocaForValue[V] = Res; - return Res; -} + } while (!Worklist.empty()); -AllocaInst *llvm::findAllocaForValue(Value *V) { - DenseMap AllocaForValue; - return ::findAllocaForValue(V, AllocaForValue); + return Result; } static bool onlyUsedByLifetimeMarkersOrDroppableInstsHelper( @@ -5065,46 +5065,59 @@ bool llvm::propagatesPoison(const Instruction *I) { } } -const Value *llvm::getGuaranteedNonPoisonOp(const 
Instruction *I) { +void llvm::getGuaranteedNonPoisonOps(const Instruction *I, + SmallPtrSetImpl &Operands) { switch (I->getOpcode()) { case Instruction::Store: - return cast(I)->getPointerOperand(); + Operands.insert(cast(I)->getPointerOperand()); + break; case Instruction::Load: - return cast(I)->getPointerOperand(); + Operands.insert(cast(I)->getPointerOperand()); + break; case Instruction::AtomicCmpXchg: - return cast(I)->getPointerOperand(); + Operands.insert(cast(I)->getPointerOperand()); + break; case Instruction::AtomicRMW: - return cast(I)->getPointerOperand(); + Operands.insert(cast(I)->getPointerOperand()); + break; case Instruction::UDiv: case Instruction::SDiv: case Instruction::URem: case Instruction::SRem: - return I->getOperand(1); + Operands.insert(I->getOperand(1)); + break; case Instruction::Call: - if (auto *II = dyn_cast(I)) { - switch (II->getIntrinsicID()) { - case Intrinsic::assume: - return II->getArgOperand(0); - default: - return nullptr; - } + case Instruction::Invoke: { + const CallBase *CB = cast(I); + if (CB->isIndirectCall()) + Operands.insert(CB->getCalledOperand()); + for (unsigned i = 0; i < CB->arg_size(); ++i) { + if (CB->paramHasAttr(i, Attribute::NoUndef)) + Operands.insert(CB->getArgOperand(i)); } - return nullptr; + break; + } default: - return nullptr; + break; } } bool llvm::mustTriggerUB(const Instruction *I, const SmallSet& KnownPoison) { - auto *NotPoison = getGuaranteedNonPoisonOp(I); - return (NotPoison && KnownPoison.count(NotPoison)); + SmallPtrSet NonPoisonOps; + getGuaranteedNonPoisonOps(I, NonPoisonOps); + + for (const auto *V : NonPoisonOps) + if (KnownPoison.count(V)) + return true; + + return false; } diff --git a/llvm/lib/Analysis/models/inliner/output_spec.json b/llvm/lib/Analysis/models/inliner/output_spec.json index d9e2060cf176c..5f9d13d8f8b85 100644 --- a/llvm/lib/Analysis/models/inliner/output_spec.json +++ b/llvm/lib/Analysis/models/inliner/output_spec.json @@ -4,7 +4,7 @@ "tensor_spec": { "name": 
"StatefulPartitionedCall", "port": 0, - "type": "int64", + "type": "int64_t", "shape": [ 1 ] diff --git a/llvm/lib/Bitstream/Reader/BitstreamReader.cpp b/llvm/lib/Bitstream/Reader/BitstreamReader.cpp index 2739137c1e446..2f153f1e78091 100644 --- a/llvm/lib/Bitstream/Reader/BitstreamReader.cpp +++ b/llvm/lib/Bitstream/Reader/BitstreamReader.cpp @@ -156,8 +156,9 @@ Expected BitstreamCursor::skipRecord(unsigned AbbrevID) { report_fatal_error("Array element type can't be an Array or a Blob"); case BitCodeAbbrevOp::Fixed: assert((unsigned)EltEnc.getEncodingData() <= MaxChunkSize); - if (Error Err = JumpToBit(GetCurrentBitNo() + - NumElts * EltEnc.getEncodingData())) + if (Error Err = + JumpToBit(GetCurrentBitNo() + static_cast(NumElts) * + EltEnc.getEncodingData())) return std::move(Err); break; case BitCodeAbbrevOp::VBR: @@ -186,7 +187,8 @@ Expected BitstreamCursor::skipRecord(unsigned AbbrevID) { SkipToFourByteBoundary(); // 32-bit alignment // Figure out where the end of this blob will be including tail padding. - size_t NewEnd = GetCurrentBitNo()+((NumElts+3)&~3)*8; + const size_t NewEnd = + GetCurrentBitNo() + ((static_cast(NumElts) + 3) & ~3) * 8; // If this would read off the end of the bitcode file, just set the // record to empty and return. @@ -314,7 +316,8 @@ Expected BitstreamCursor::readRecord(unsigned AbbrevID, // Figure out where the end of this blob will be including tail padding. size_t CurBitPos = GetCurrentBitNo(); - size_t NewEnd = CurBitPos+((NumElts+3)&~3)*8; + const size_t NewEnd = + CurBitPos + ((static_cast(NumElts) + 3) & ~3) * 8; // If this would read off the end of the bitcode file, just set the // record to empty and return. 
diff --git a/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp b/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp index ebf871600564c..df1a9617b87d9 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp @@ -53,6 +53,37 @@ static Register isDescribedByReg(const MachineInstr &MI) { : Register(); } +void InstructionOrdering::initialize(const MachineFunction &MF) { + // We give meta instructions the same ordinal as the preceding instruction + // because this class is written for the task of comparing positions of + // variable location ranges against scope ranges. To reflect what we'll see + // in the binary, when we look at location ranges we must consider all + // DBG_VALUEs between two real instructions at the same position. And a + // scope range which ends on a meta instruction should be considered to end + // at the last seen real instruction. E.g. + // + // 1 instruction p Both the variable location for x and for y start + // 1 DBG_VALUE for "x" after instruction p so we give them all the same + // 1 DBG_VALUE for "y" number. If a scope range ends at DBG_VALUE for "y", + // 2 instruction q we should treat it as ending after instruction p + // because it will be the last real instruction in the + // range. DBG_VALUEs at or after this position for + // variables declared in the scope will have no effect. + clear(); + unsigned Position = 0; + for (const MachineBasicBlock &MBB : MF) + for (const MachineInstr &MI : MBB) + InstNumberMap[&MI] = MI.isMetaInstruction() ? 
Position : ++Position; +} + +bool InstructionOrdering::isBefore(const MachineInstr *A, + const MachineInstr *B) const { + assert(A->getParent() && B->getParent() && "Operands must have a parent"); + assert(A->getMF() == B->getMF() && + "Operands must be in the same MachineFunction"); + return InstNumberMap.lookup(A) < InstNumberMap.lookup(B); +} + bool DbgValueHistoryMap::startDbgValue(InlinedEntity Var, const MachineInstr &MI, EntryIndex &NewIndex) { @@ -92,65 +123,29 @@ void DbgValueHistoryMap::Entry::endEntry(EntryIndex Index) { EndIndex = Index; } -using OrderMap = DenseMap; -/// Number instructions so that we can compare instruction positions within MF. -/// Meta instructions are given the same nubmer as the preceding instruction. -/// Because the block ordering will not change it is possible (and safe) to -/// compare instruction positions between blocks. -static void numberInstructions(const MachineFunction &MF, OrderMap &Ordering) { - // We give meta instructions the same number as the peceding instruction - // because this function is written for the task of comparing positions of - // variable location ranges against scope ranges. To reflect what we'll see - // in the binary, when we look at location ranges we must consider all - // DBG_VALUEs between two real instructions at the same position. And a - // scope range which ends on a meta instruction should be considered to end - // at the last seen real instruction. E.g. - // - // 1 instruction p Both the variable location for x and for y start - // 1 DBG_VALUE for "x" after instruction p so we give them all the same - // 1 DBG_VALUE for "y" number. If a scope range ends at DBG_VALUE for "y", - // 2 instruction q we should treat it as ending after instruction p - // because it will be the last real instruction in the - // range. DBG_VALUEs at or after this position for - // variables declared in the scope will have no effect. 
- unsigned position = 0; - for (const MachineBasicBlock &MBB : MF) - for (const MachineInstr &MI : MBB) - Ordering[&MI] = MI.isMetaInstruction() ? position : ++position; -} - -/// Check if instruction A comes before B. Meta instructions have the same -/// position as the preceding non-meta instruction. See numberInstructions for -/// more info. -static bool isBefore(const MachineInstr *A, const MachineInstr *B, - const OrderMap &Ordering) { - return Ordering.lookup(A) < Ordering.lookup(B); -} - /// Check if the instruction range [StartMI, EndMI] intersects any instruction /// range in Ranges. EndMI can be nullptr to indicate that the range is /// unbounded. Assumes Ranges is ordered and disjoint. Returns true and points /// to the first intersecting scope range if one exists. static Optional::iterator> intersects(const MachineInstr *StartMI, const MachineInstr *EndMI, - const ArrayRef &Ranges, const OrderMap &Ordering) { + const ArrayRef &Ranges, + const InstructionOrdering &Ordering) { for (auto RangesI = Ranges.begin(), RangesE = Ranges.end(); RangesI != RangesE; ++RangesI) { - if (EndMI && isBefore(EndMI, RangesI->first, Ordering)) + if (EndMI && Ordering.isBefore(EndMI, RangesI->first)) return None; - if (EndMI && !isBefore(RangesI->second, EndMI, Ordering)) + if (EndMI && !Ordering.isBefore(RangesI->second, EndMI)) return RangesI; - if (isBefore(StartMI, RangesI->second, Ordering)) + if (Ordering.isBefore(StartMI, RangesI->second)) return RangesI; } return None; } -void DbgValueHistoryMap::trimLocationRanges(const MachineFunction &MF, - LexicalScopes &LScopes) { - OrderMap Ordering; - numberInstructions(MF, Ordering); - +void DbgValueHistoryMap::trimLocationRanges( + const MachineFunction &MF, LexicalScopes &LScopes, + const InstructionOrdering &Ordering) { // The indices of the entries we're going to remove for each variable. SmallVector ToRemove; // Entry reference count for each variable. 
Clobbers left with no references diff --git a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp index a46de83e555cc..9693248de70f0 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp @@ -196,8 +196,9 @@ void DebugHandlerBase::beginFunction(const MachineFunction *MF) { assert(DbgLabels.empty() && "DbgLabels map wasn't cleaned!"); calculateDbgEntityHistory(MF, Asm->MF->getSubtarget().getRegisterInfo(), DbgValues, DbgLabels); + InstOrdering.initialize(*MF); if (TrimVarLocs) - DbgValues.trimLocationRanges(*MF, LScopes); + DbgValues.trimLocationRanges(*MF, LScopes, InstOrdering); LLVM_DEBUG(DbgValues.dump()); // Request labels for the full history. @@ -333,6 +334,7 @@ void DebugHandlerBase::endFunction(const MachineFunction *MF) { DbgLabels.clear(); LabelsBeforeInsn.clear(); LabelsAfterInsn.clear(); + InstOrdering.clear(); } void DebugHandlerBase::beginBasicBlock(const MachineBasicBlock &MBB) { diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 0730fff3bcaf2..64d57aa9402c8 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -148,11 +148,6 @@ static cl::opt "Abstract subprograms")), cl::init(DefaultLinkageNames)); -static cl::opt LocationAnalysisSizeLimit( - "singlevarlocation-input-bb-limit", - cl::desc("Maximum block size to analyze for single-location variables"), - cl::init(30000), cl::Hidden); - static const char *const DWARFGroupName = "dwarf"; static const char *const DWARFGroupDescription = "DWARF Emission"; static const char *const DbgTimerName = "writer"; @@ -1518,7 +1513,8 @@ void DwarfDebug::collectVariableInfoFromMFTable( /// either open or otherwise rolls off the end of the scope. 
static bool validThroughout(LexicalScopes &LScopes, const MachineInstr *DbgValue, - const MachineInstr *RangeEnd) { + const MachineInstr *RangeEnd, + const InstructionOrdering &Ordering) { assert(DbgValue->getDebugLoc() && "DBG_VALUE without a debug location"); auto MBB = DbgValue->getParent(); auto DL = DbgValue->getDebugLoc(); @@ -1530,34 +1526,30 @@ static bool validThroughout(LexicalScopes &LScopes, if (LSRange.size() == 0) return false; - - // Determine if the DBG_VALUE is valid at the beginning of its lexical block. const MachineInstr *LScopeBegin = LSRange.front().first; - // Early exit if the lexical scope begins outside of the current block. - if (LScopeBegin->getParent() != MBB) - return false; - - // If there are instructions belonging to our scope in another block, and - // we're not a constant (see DWARF2 comment below), then we can't be - // validThroughout. - const MachineInstr *LScopeEnd = LSRange.back().second; - if (RangeEnd && LScopeEnd->getParent() != MBB) - return false; - - MachineBasicBlock::const_reverse_iterator Pred(DbgValue); - for (++Pred; Pred != MBB->rend(); ++Pred) { - if (Pred->getFlag(MachineInstr::FrameSetup)) - break; - auto PredDL = Pred->getDebugLoc(); - if (!PredDL || Pred->isMetaInstruction()) - continue; - // Check whether the instruction preceding the DBG_VALUE is in the same - // (sub)scope as the DBG_VALUE. - if (DL->getScope() == PredDL->getScope()) - return false; - auto *PredScope = LScopes.findLexicalScope(PredDL); - if (!PredScope || LScope->dominates(PredScope)) + // If the scope starts before the DBG_VALUE then we may have a negative + // result. Otherwise the location is live coming into the scope and we + // can skip the following checks. + if (!Ordering.isBefore(DbgValue, LScopeBegin)) { + // Exit if the lexical scope begins outside of the current block. 
+ if (LScopeBegin->getParent() != MBB) return false; + + MachineBasicBlock::const_reverse_iterator Pred(DbgValue); + for (++Pred; Pred != MBB->rend(); ++Pred) { + if (Pred->getFlag(MachineInstr::FrameSetup)) + break; + auto PredDL = Pred->getDebugLoc(); + if (!PredDL || Pred->isMetaInstruction()) + continue; + // Check whether the instruction preceding the DBG_VALUE is in the same + // (sub)scope as the DBG_VALUE. + if (DL->getScope() == PredDL->getScope()) + return false; + auto *PredScope = LScopes.findLexicalScope(PredDL); + if (!PredScope || LScope->dominates(PredScope)) + return false; + } } // If the range of the DBG_VALUE is open-ended, report success. @@ -1571,24 +1563,10 @@ static bool validThroughout(LexicalScopes &LScopes, if (DbgValue->getDebugOperand(0).isImm() && MBB->pred_empty()) return true; - // Now check for situations where an "open-ended" DBG_VALUE isn't enough to - // determine eligibility for a single location, e.g. nested scopes, inlined - // functions. - // FIXME: For now we just handle a simple (but common) case where the scope - // is contained in MBB. We could be smarter here. - // - // At this point we know that our scope ends in MBB. So, if RangeEnd exists - // outside of the block we can ignore it; the location is just leaking outside - // its scope. - assert(LScopeEnd->getParent() == MBB && "Scope ends outside MBB"); - if (RangeEnd->getParent() != DbgValue->getParent()) - return true; - - // The location range and variable's enclosing scope are both contained within - // MBB, test if location terminates before end of scope. - for (auto I = RangeEnd->getIterator(); I != MBB->end(); ++I) - if (&*I == LScopeEnd) - return false; + // Test if the location terminates before the end of the scope. + const MachineInstr *LScopeEnd = LSRange.back().second; + if (Ordering.isBefore(RangeEnd, LScopeEnd)) + return false; // There's a single location which starts at the scope start, and ends at or // after the scope end. 
@@ -1628,10 +1606,8 @@ static bool validThroughout(LexicalScopes &LScopes, // [1-3) [(reg0, fragment 0, 32), (reg1, fragment 32, 32)] // [3-4) [(reg1, fragment 32, 32), (123, fragment 64, 32)] // [4-) [(@g, fragment 0, 96)] -bool DwarfDebug::buildLocationList( - SmallVectorImpl &DebugLoc, - const DbgValueHistoryMap::Entries &Entries, - DenseSet &VeryLargeBlocks) { +bool DwarfDebug::buildLocationList(SmallVectorImpl &DebugLoc, + const DbgValueHistoryMap::Entries &Entries) { using OpenRange = std::pair; SmallVector OpenRanges; @@ -1727,14 +1703,8 @@ bool DwarfDebug::buildLocationList( DebugLoc.pop_back(); } - // If there's a single entry, safe for a single location, and not part of - // an over-sized basic block, then ask validThroughout whether this - // location can be represented as a single variable location. - if (DebugLoc.size() != 1 || !isSafeForSingleLocation) - return false; - if (VeryLargeBlocks.count(StartDebugMI->getParent())) - return false; - return validThroughout(LScopes, StartDebugMI, EndMI); + return DebugLoc.size() == 1 && isSafeForSingleLocation && + validThroughout(LScopes, StartDebugMI, EndMI, getInstOrdering()); } DbgEntity *DwarfDebug::createConcreteEntity(DwarfCompileUnit &TheCU, @@ -1766,13 +1736,6 @@ void DwarfDebug::collectEntityInfo(DwarfCompileUnit &TheCU, // Grab the variable info that was squirreled away in the MMI side-table. collectVariableInfoFromMFTable(TheCU, Processed); - // Identify blocks that are unreasonably sized, so that we can later - // skip lexical scope analysis over them. - DenseSet VeryLargeBlocks; - for (const auto &MBB : *CurFn) - if (MBB.size() > LocationAnalysisSizeLimit) - VeryLargeBlocks.insert(&MBB); - for (const auto &I : DbgValues) { InlinedEntity IV = I.first; if (Processed.count(IV)) @@ -1809,8 +1772,7 @@ void DwarfDebug::collectEntityInfo(DwarfCompileUnit &TheCU, if (HistSize == 1 || SingleValueWithClobber) { const auto *End = SingleValueWithClobber ? 
HistoryMapEntries[1].getInstr() : nullptr; - if (VeryLargeBlocks.count(MInsn->getParent()) == 0 && - validThroughout(LScopes, MInsn, End)) { + if (validThroughout(LScopes, MInsn, End, getInstOrdering())) { RegVar->initializeDbgValue(MInsn); continue; } @@ -1825,8 +1787,7 @@ void DwarfDebug::collectEntityInfo(DwarfCompileUnit &TheCU, // Build the location list for this variable. SmallVector Entries; - bool isValidSingleLocation = - buildLocationList(Entries, HistoryMapEntries, VeryLargeBlocks); + bool isValidSingleLocation = buildLocationList(Entries, HistoryMapEntries); // Check whether buildLocationList managed to merge all locations to one // that is valid throughout the variable's scope. If so, produce single diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h index a97d3680412b9..ba0bb84367035 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -598,10 +598,8 @@ class DwarfDebug : public DebugHandlerBase { /// function that describe the same variable. If the resulting /// list has only one entry that is valid for entire variable's /// scope return true. - bool buildLocationList( - SmallVectorImpl &DebugLoc, - const DbgValueHistoryMap::Entries &Entries, - DenseSet &VeryLargeBlocks); + bool buildLocationList(SmallVectorImpl &DebugLoc, + const DbgValueHistoryMap::Entries &Entries); /// Collect variable information from the side table maintained by MF. 
void collectVariableInfoFromMFTable(DwarfCompileUnit &TheCU, diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 0f139f791d142..ceed1fe6e3bd5 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -1422,8 +1422,10 @@ static bool hasVectorBeenPadded(const DICompositeType *CTy) { Elements[0]->getTag() == dwarf::DW_TAG_subrange_type && "Invalid vector element array, expected one element of type subrange"); const auto Subrange = cast(Elements[0]); - const auto CI = Subrange->getCount().get(); - const int32_t NumVecElements = CI->getSExtValue(); + const auto NumVecElements = + Subrange->getCount() + ? Subrange->getCount().get()->getSExtValue() + : 0; // Ensure we found the element count and that the actual size is wide // enough to contain the requested size. diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index a85ac80ef3652..86b5d2055f550 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -6482,9 +6482,7 @@ bool CodeGenPrepare::optimizeFunnelShift(IntrinsicInst *Fsh) { /// If we have a SelectInst that will likely profit from branch prediction, /// turn it into a branch. bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) { - // If branch conversion isn't desirable, exit early. - if (DisableSelectToBranch || OptSize || - llvm::shouldOptimizeForSize(SI->getParent(), PSI, BFI.get())) + if (DisableSelectToBranch) return false; // Find all consecutive select instructions that share the same condition. 
@@ -6520,7 +6518,8 @@ bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) { SelectKind = TargetLowering::ScalarValSelect; if (TLI->isSelectSupported(SelectKind) && - !isFormingBranchFromSelectProfitable(TTI, TLI, SI)) + (!isFormingBranchFromSelectProfitable(TTI, TLI, SI) || OptSize || + llvm::shouldOptimizeForSize(SI->getParent(), PSI, BFI.get()))) return false; // The DominatorTree needs to be rebuilt by any consumers after this diff --git a/llvm/lib/CodeGen/CommandFlags.cpp b/llvm/lib/CodeGen/CommandFlags.cpp index 0ada09f469b76..a47608cf6dd93 100644 --- a/llvm/lib/CodeGen/CommandFlags.cpp +++ b/llvm/lib/CodeGen/CommandFlags.cpp @@ -85,6 +85,7 @@ CGOPT(bool, EnableStackSizeSection) CGOPT(bool, EnableAddrsig) CGOPT(bool, EmitCallSiteInfo) CGOPT(bool, EnableDebugEntryValues) +CGOPT(bool, ValueTrackingVariableLocations) CGOPT(bool, ForceDwarfFrameSection) CGOPT(bool, XRayOmitFunctionIndex) @@ -400,6 +401,12 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() { cl::init(false)); CGBINDOPT(EnableDebugEntryValues); + static cl::opt ValueTrackingVariableLocations( + "experimental-debug-variable-locations", + cl::desc("Use experimental new value-tracking variable locations"), + cl::init(false)); + CGBINDOPT(ValueTrackingVariableLocations); + static cl::opt ForceDwarfFrameSection( "force-dwarf-frame-section", cl::desc("Always emit a debug frame section."), cl::init(false)); @@ -475,6 +482,7 @@ TargetOptions codegen::InitTargetOptionsFromCodeGenFlags() { Options.EmitAddrsig = getEnableAddrsig(); Options.EmitCallSiteInfo = getEmitCallSiteInfo(); Options.EnableDebugEntryValues = getEnableDebugEntryValues(); + Options.ValueTrackingVariableLocations = getValueTrackingVariableLocations(); Options.ForceDwarfFrameSection = getForceDwarfFrameSection(); Options.XRayOmitFunctionIndex = getXRayOmitFunctionIndex(); diff --git a/llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp b/llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp index 071cc5b737358..2fa208fbfaaff 100644 --- 
a/llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp @@ -367,23 +367,30 @@ GISelInstProfileBuilder::addNodeIDFlag(unsigned Flag) const { return *this; } +const GISelInstProfileBuilder & +GISelInstProfileBuilder::addNodeIDReg(Register Reg) const { + LLT Ty = MRI.getType(Reg); + if (Ty.isValid()) + addNodeIDRegType(Ty); + + if (const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(Reg)) { + if (const auto *RB = RCOrRB.dyn_cast()) + addNodeIDRegType(RB); + else if (const auto *RC = RCOrRB.dyn_cast()) + addNodeIDRegType(RC); + } + return *this; +} + const GISelInstProfileBuilder &GISelInstProfileBuilder::addNodeIDMachineOperand( const MachineOperand &MO) const { if (MO.isReg()) { Register Reg = MO.getReg(); if (!MO.isDef()) addNodeIDRegNum(Reg); - LLT Ty = MRI.getType(Reg); - if (Ty.isValid()) - addNodeIDRegType(Ty); - - if (const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(Reg)) { - if (const auto *RB = RCOrRB.dyn_cast()) - addNodeIDRegType(RB); - else if (const auto *RC = RCOrRB.dyn_cast()) - addNodeIDRegType(RC); - } + // Profile the register properties. + addNodeIDReg(Reg); assert(!MO.isImplicit() && "Unhandled case"); } else if (MO.isImm()) ID.AddInteger(MO.getImm()); diff --git a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp index 9048583ff7281..5441357e5fbe4 100644 --- a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp @@ -62,6 +62,11 @@ void CSEMIRBuilder::profileDstOp(const DstOp &Op, case DstOp::DstType::Ty_RC: B.addNodeIDRegType(Op.getRegClass()); break; + case DstOp::DstType::Ty_Reg: { + // Regs can have LLT&(RB|RC). If those exist, profile them as well. 
+ B.addNodeIDReg(Op.getReg()); + break; + } default: B.addNodeIDRegType(Op.getLLTTy(*getMRI())); break; diff --git a/llvm/lib/CodeGen/GlobalISel/Combiner.cpp b/llvm/lib/CodeGen/GlobalISel/Combiner.cpp index 48f4c5b0f371c..0e17a616cfde0 100644 --- a/llvm/lib/CodeGen/GlobalISel/Combiner.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Combiner.cpp @@ -153,5 +153,8 @@ bool Combiner::combineMachineInstrs(MachineFunction &MF, MFChanged |= Changed; } while (Changed); + assert(!CSEInfo || !errorToBool(CSEInfo->verify()) && + "CSEInfo is not consistent. Likely missing calls to " + "observer on mutations"); return MFChanged; } diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 03086fa3532d4..0486be1dabb65 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -44,6 +44,10 @@ CombinerHelper::CombinerHelper(GISelChangeObserver &Observer, (void)this->KB; } +const TargetLowering &CombinerHelper::getTargetLowering() const { + return *Builder.getMF().getSubtarget().getTargetLowering(); +} + bool CombinerHelper::isLegalOrBeforeLegalizer( const LegalityQuery &Query) const { return !LI || LI->getAction(Query).Action == LegalizeActions::Legal; @@ -1381,13 +1385,11 @@ bool CombinerHelper::optimizeMemmove(MachineInstr &MI, Register Dst, } bool CombinerHelper::tryCombineMemCpyFamily(MachineInstr &MI, unsigned MaxLen) { + const unsigned Opc = MI.getOpcode(); // This combine is fairly complex so it's not written with a separate // matcher function. 
- assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS); - Intrinsic::ID ID = (Intrinsic::ID)MI.getIntrinsicID(); - assert((ID == Intrinsic::memcpy || ID == Intrinsic::memmove || - ID == Intrinsic::memset) && - "Expected a memcpy like intrinsic"); + assert((Opc == TargetOpcode::G_MEMCPY || Opc == TargetOpcode::G_MEMMOVE || + Opc == TargetOpcode::G_MEMSET) && "Expected memcpy like instruction"); auto MMOIt = MI.memoperands_begin(); const MachineMemOperand *MemOp = *MMOIt; @@ -1398,11 +1400,11 @@ bool CombinerHelper::tryCombineMemCpyFamily(MachineInstr &MI, unsigned MaxLen) { Align DstAlign = MemOp->getBaseAlign(); Align SrcAlign; - Register Dst = MI.getOperand(1).getReg(); - Register Src = MI.getOperand(2).getReg(); - Register Len = MI.getOperand(3).getReg(); + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + Register Len = MI.getOperand(2).getReg(); - if (ID != Intrinsic::memset) { + if (Opc != TargetOpcode::G_MEMSET) { assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI"); MemOp = *(++MMOIt); SrcAlign = MemOp->getBaseAlign(); @@ -1422,11 +1424,11 @@ bool CombinerHelper::tryCombineMemCpyFamily(MachineInstr &MI, unsigned MaxLen) { if (MaxLen && KnownLen > MaxLen) return false; - if (ID == Intrinsic::memcpy) + if (Opc == TargetOpcode::G_MEMCPY) return optimizeMemcpy(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile); - if (ID == Intrinsic::memmove) + if (Opc == TargetOpcode::G_MEMMOVE) return optimizeMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile); - if (ID == Intrinsic::memset) + if (Opc == TargetOpcode::G_MEMSET) return optimizeMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile); return false; } @@ -1501,6 +1503,59 @@ bool CombinerHelper::applyCombineMulToShl(MachineInstr &MI, return true; } +// shl ([sza]ext x), y => zext (shl x, y), if shift does not overflow source +bool CombinerHelper::matchCombineShlOfExtend(MachineInstr &MI, + RegisterImmPair &MatchData) { + 
assert(MI.getOpcode() == TargetOpcode::G_SHL && KB); + + Register LHS = MI.getOperand(1).getReg(); + + Register ExtSrc; + if (!mi_match(LHS, MRI, m_GAnyExt(m_Reg(ExtSrc))) && + !mi_match(LHS, MRI, m_GZExt(m_Reg(ExtSrc))) && + !mi_match(LHS, MRI, m_GSExt(m_Reg(ExtSrc)))) + return false; + + // TODO: Should handle vector splat. + Register RHS = MI.getOperand(2).getReg(); + auto MaybeShiftAmtVal = getConstantVRegValWithLookThrough(RHS, MRI); + if (!MaybeShiftAmtVal) + return false; + + if (LI) { + LLT SrcTy = MRI.getType(ExtSrc); + + // We only really care about the legality with the shifted value. We can + // pick any type the constant shift amount, so ask the target what to + // use. Otherwise we would have to guess and hope it is reported as legal. + LLT ShiftAmtTy = getTargetLowering().getPreferredShiftAmountTy(SrcTy); + if (!isLegalOrBeforeLegalizer({TargetOpcode::G_SHL, {SrcTy, ShiftAmtTy}})) + return false; + } + + int64_t ShiftAmt = MaybeShiftAmtVal->Value; + MatchData.Reg = ExtSrc; + MatchData.Imm = ShiftAmt; + + unsigned MinLeadingZeros = KB->getKnownZeroes(ExtSrc).countLeadingOnes(); + return MinLeadingZeros >= ShiftAmt; +} + +bool CombinerHelper::applyCombineShlOfExtend(MachineInstr &MI, + const RegisterImmPair &MatchData) { + Register ExtSrcReg = MatchData.Reg; + int64_t ShiftAmtVal = MatchData.Imm; + + LLT ExtSrcTy = MRI.getType(ExtSrcReg); + Builder.setInstrAndDebugLoc(MI); + auto ShiftAmt = Builder.buildConstant(ExtSrcTy, ShiftAmtVal); + auto NarrowShift = + Builder.buildShl(ExtSrcTy, ExtSrcReg, ShiftAmt, MI.getFlags()); + Builder.buildZExt(MI.getOperand(0), NarrowShift); + MI.eraseFromParent(); + return true; +} + bool CombinerHelper::matchCombineShiftToUnmerge(MachineInstr &MI, unsigned TargetShiftSize, unsigned &ShiftVal) { @@ -1644,6 +1699,51 @@ bool CombinerHelper::applyCombineP2IToI2P(MachineInstr &MI, Register &Reg) { return true; } +bool CombinerHelper::matchCombineAddP2IToPtrAdd( + MachineInstr &MI, std::pair &PtrReg) { + assert(MI.getOpcode() 
== TargetOpcode::G_ADD); + Register LHS = MI.getOperand(1).getReg(); + Register RHS = MI.getOperand(2).getReg(); + LLT IntTy = MRI.getType(LHS); + + // G_PTR_ADD always has the pointer in the LHS, so we may need to commute the + // instruction. + PtrReg.second = false; + for (Register SrcReg : {LHS, RHS}) { + if (mi_match(SrcReg, MRI, m_GPtrToInt(m_Reg(PtrReg.first)))) { + // Don't handle cases where the integer is implicitly converted to the + // pointer width. + LLT PtrTy = MRI.getType(PtrReg.first); + if (PtrTy.getScalarSizeInBits() == IntTy.getScalarSizeInBits()) + return true; + } + + PtrReg.second = true; + } + + return false; +} + +bool CombinerHelper::applyCombineAddP2IToPtrAdd( + MachineInstr &MI, std::pair &PtrReg) { + Register Dst = MI.getOperand(0).getReg(); + Register LHS = MI.getOperand(1).getReg(); + Register RHS = MI.getOperand(2).getReg(); + + const bool DoCommute = PtrReg.second; + if (DoCommute) + std::swap(LHS, RHS); + LHS = PtrReg.first; + + LLT PtrTy = MRI.getType(LHS); + + Builder.setInstrAndDebugLoc(MI); + auto PtrAdd = Builder.buildPtrAdd(PtrTy, LHS, RHS); + Builder.buildPtrToInt(Dst, PtrAdd); + MI.eraseFromParent(); + return true; +} + bool CombinerHelper::matchAnyExplicitUseIsUndef(MachineInstr &MI) { return any_of(MI.explicit_uses(), [this](const MachineOperand &MO) { return MO.isReg() && @@ -1670,6 +1770,22 @@ bool CombinerHelper::matchUndefStore(MachineInstr &MI) { MRI); } +bool CombinerHelper::matchUndefSelectCmp(MachineInstr &MI) { + assert(MI.getOpcode() == TargetOpcode::G_SELECT); + return getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, MI.getOperand(1).getReg(), + MRI); +} + +bool CombinerHelper::matchConstantSelectCmp(MachineInstr &MI, unsigned &OpIdx) { + assert(MI.getOpcode() == TargetOpcode::G_SELECT); + if (auto MaybeCstCmp = + getConstantVRegValWithLookThrough(MI.getOperand(1).getReg(), MRI)) { + OpIdx = MaybeCstCmp->Value ? 
2 : 3; + return true; + } + return false; +} + bool CombinerHelper::eraseInst(MachineInstr &MI) { MI.eraseFromParent(); return true; diff --git a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp index e0eedc557f8f2..af2f37739d6f6 100644 --- a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp +++ b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp @@ -94,6 +94,26 @@ dumpResult(const MachineInstr &MI, const KnownBits &Known, unsigned Depth) { << "\n"; } +/// Compute known bits for the intersection of \p Src0 and \p Src1 +void GISelKnownBits::computeKnownBitsMin(Register Src0, Register Src1, + KnownBits &Known, + const APInt &DemandedElts, + unsigned Depth) { + // Test src1 first, since we canonicalize simpler expressions to the RHS. + computeKnownBitsImpl(Src1, Known, DemandedElts, Depth); + + // If we don't know any bits, early out. + if (Known.isUnknown()) + return; + + KnownBits Known2; + computeKnownBitsImpl(Src0, Known2, DemandedElts, Depth); + + // Only known if known in both the LHS and RHS. + Known.Zero &= Known2.Zero; + Known.One &= Known2.One; +} + void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth) { @@ -284,16 +304,16 @@ void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known, break; } case TargetOpcode::G_SELECT: { - computeKnownBitsImpl(MI.getOperand(3).getReg(), Known, DemandedElts, - Depth + 1); - // If we don't know any bits, early out. - if (Known.isUnknown()) - break; - computeKnownBitsImpl(MI.getOperand(2).getReg(), Known2, DemandedElts, - Depth + 1); - // Only known if known in both the LHS and RHS. 
- Known.One &= Known2.One; - Known.Zero &= Known2.Zero; + computeKnownBitsMin(MI.getOperand(2).getReg(), MI.getOperand(3).getReg(), + Known, DemandedElts, Depth + 1); + break; + } + case TargetOpcode::G_SMIN: + case TargetOpcode::G_SMAX: + case TargetOpcode::G_UMIN: + case TargetOpcode::G_UMAX: { + computeKnownBitsMin(MI.getOperand(1).getReg(), MI.getOperand(2).getReg(), + Known, DemandedElts, Depth + 1); break; } case TargetOpcode::G_FCMP: @@ -387,6 +407,18 @@ void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known, Known.Zero.setBitsFrom(SrcBitWidth); break; } + case TargetOpcode::G_MERGE_VALUES: { + Register NumOps = MI.getNumOperands(); + unsigned OpSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); + + for (unsigned I = 0; I != NumOps - 1; ++I) { + KnownBits SrcOpKnown; + computeKnownBitsImpl(MI.getOperand(I + 1).getReg(), SrcOpKnown, + DemandedElts, Depth + 1); + Known.insertBits(SrcOpKnown, I * OpSize); + } + break; + } } assert(!Known.hasConflict() && "Bits known to be one AND zero?"); @@ -439,6 +471,13 @@ unsigned GISelKnownBits::computeNumSignBits(Register R, unsigned Tmp = DstTy.getScalarSizeInBits() - SrcTy.getScalarSizeInBits(); return computeNumSignBits(Src, DemandedElts, Depth + 1) + Tmp; } + case TargetOpcode::G_SEXT_INREG: { + // Max of the input and what this extends. 
+ Register Src = MI.getOperand(1).getReg(); + unsigned SrcBits = MI.getOperand(2).getImm(); + unsigned InRegBits = TyBits - SrcBits + 1; + return std::max(computeNumSignBits(Src, DemandedElts, Depth + 1), InRegBits); + } case TargetOpcode::G_TRUNC: { Register Src = MI.getOperand(1).getReg(); LLT SrcTy = MRI.getType(Src); diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 46041f0a8a827..c14f0ef882a81 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1283,16 +1283,33 @@ bool IRTranslator::translateGetElementPtr(const User &U, bool IRTranslator::translateMemFunc(const CallInst &CI, MachineIRBuilder &MIRBuilder, - Intrinsic::ID ID) { + unsigned Opcode) { // If the source is undef, then just emit a nop. if (isa(CI.getArgOperand(1))) return true; - ArrayRef Res; - auto ICall = MIRBuilder.buildIntrinsic(ID, Res, true); - for (auto AI = CI.arg_begin(), AE = CI.arg_end(); std::next(AI) != AE; ++AI) - ICall.addUse(getOrCreateVReg(**AI)); + SmallVector SrcRegs; + + unsigned MinPtrSize = UINT_MAX; + for (auto AI = CI.arg_begin(), AE = CI.arg_end(); std::next(AI) != AE; ++AI) { + Register SrcReg = getOrCreateVReg(**AI); + LLT SrcTy = MRI->getType(SrcReg); + if (SrcTy.isPointer()) + MinPtrSize = std::min(SrcTy.getSizeInBits(), MinPtrSize); + SrcRegs.push_back(SrcReg); + } + + LLT SizeTy = LLT::scalar(MinPtrSize); + + // The size operand should be the minimum of the pointer sizes. 
+ Register &SizeOpReg = SrcRegs[SrcRegs.size() - 1]; + if (MRI->getType(SizeOpReg) != SizeTy) + SizeOpReg = MIRBuilder.buildZExtOrTrunc(SizeTy, SizeOpReg).getReg(0); + + auto ICall = MIRBuilder.buildInstr(Opcode); + for (Register SrcReg : SrcRegs) + ICall.addUse(SrcReg); Align DstAlign; Align SrcAlign; @@ -1321,7 +1338,7 @@ bool IRTranslator::translateMemFunc(const CallInst &CI, ICall.addMemOperand(MF->getMachineMemOperand( MachinePointerInfo(CI.getArgOperand(0)), MachineMemOperand::MOStore | VolFlag, 1, DstAlign)); - if (ID != Intrinsic::memset) + if (Opcode != TargetOpcode::G_MEMSET) ICall.addMemOperand(MF->getMachineMemOperand( MachinePointerInfo(CI.getArgOperand(1)), MachineMemOperand::MOLoad | VolFlag, 1, SrcAlign)); @@ -1713,9 +1730,11 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, MachineInstr::copyFlagsFromInstruction(CI)); return true; case Intrinsic::memcpy: + return translateMemFunc(CI, MIRBuilder, TargetOpcode::G_MEMCPY); case Intrinsic::memmove: + return translateMemFunc(CI, MIRBuilder, TargetOpcode::G_MEMMOVE); case Intrinsic::memset: - return translateMemFunc(CI, MIRBuilder, ID); + return translateMemFunc(CI, MIRBuilder, TargetOpcode::G_MEMSET); case Intrinsic::eh_typeid_for: { GlobalValue *GV = ExtractTypeInfo(CI.getArgOperand(0)); Register Reg = getOrCreateVReg(CI); diff --git a/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp b/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp index 1d7be54de3b04..5ba9367cac8a7 100644 --- a/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp @@ -284,7 +284,7 @@ Legalizer::legalizeMachineFunction(MachineFunction &MF, const LegalizerInfo &LI, WrapperObserver)) { WorkListObserver.printNewInstrs(); for (auto *DeadMI : DeadInstructions) { - LLVM_DEBUG(dbgs() << *DeadMI << "Is dead\n"); + LLVM_DEBUG(dbgs() << "Is dead: " << *DeadMI); RemoveDeadInstFromLists(DeadMI); DeadMI->eraseFromParentAndMarkDBGValuesForRemoval(); } diff --git 
a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index c6720a19c9d4b..347fe7b0ee98d 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -558,12 +558,11 @@ simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size, LegalizerHelper::LegalizeResult llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, MachineInstr &MI) { - assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS); auto &Ctx = MIRBuilder.getMF().getFunction().getContext(); SmallVector Args; // Add all the args, except for the last which is an imm denoting 'tail'. - for (unsigned i = 1; i < MI.getNumOperands() - 1; i++) { + for (unsigned i = 0; i < MI.getNumOperands() - 1; ++i) { Register Reg = MI.getOperand(i).getReg(); // Need derive an IR type for call lowering. @@ -578,30 +577,27 @@ llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering(); auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering(); - Intrinsic::ID ID = MI.getOperand(0).getIntrinsicID(); RTLIB::Libcall RTLibcall; - switch (ID) { - case Intrinsic::memcpy: + switch (MI.getOpcode()) { + case TargetOpcode::G_MEMCPY: RTLibcall = RTLIB::MEMCPY; break; - case Intrinsic::memset: - RTLibcall = RTLIB::MEMSET; - break; - case Intrinsic::memmove: + case TargetOpcode::G_MEMMOVE: RTLibcall = RTLIB::MEMMOVE; break; + case TargetOpcode::G_MEMSET: + RTLibcall = RTLIB::MEMSET; + break; default: return LegalizerHelper::UnableToLegalize; } const char *Name = TLI.getLibcallName(RTLibcall); - MIRBuilder.setInstrAndDebugLoc(MI); - CallLowering::CallLoweringInfo Info; Info.CallConv = TLI.getLibcallCallingConv(RTLibcall); Info.Callee = MachineOperand::CreateES(Name); Info.OrigRet = CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx)); - Info.IsTailCall = MI.getOperand(MI.getNumOperands() - 1).getImm() 
== 1 && + Info.IsTailCall = MI.getOperand(MI.getNumOperands() - 1).getImm() && isLibCallInTailPosition(MIRBuilder.getTII(), MI); std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs)); @@ -748,6 +744,13 @@ LegalizerHelper::libcall(MachineInstr &MI) { return Status; break; } + case TargetOpcode::G_MEMCPY: + case TargetOpcode::G_MEMMOVE: + case TargetOpcode::G_MEMSET: { + LegalizeResult Result = createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI); + MI.eraseFromParent(); + return Result; + } } MI.eraseFromParent(); diff --git a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp index b8584e8386cb2..770c46ec84369 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp @@ -14,6 +14,7 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Target/TargetMachine.h" /// \file LiveDebugValues.cpp /// @@ -40,7 +41,10 @@ class LiveDebugValues : public MachineFunctionPass { static char ID; LiveDebugValues(); - ~LiveDebugValues() { delete TheImpl; } + ~LiveDebugValues() { + if (TheImpl) + delete TheImpl; + } /// Calculate the liveness information for the given machine function. bool runOnMachineFunction(MachineFunction &MF) override; @@ -57,6 +61,7 @@ class LiveDebugValues : public MachineFunctionPass { private: LDVImpl *TheImpl; + TargetPassConfig *TPC; }; char LiveDebugValues::ID = 0; @@ -69,10 +74,24 @@ INITIALIZE_PASS(LiveDebugValues, DEBUG_TYPE, "Live DEBUG_VALUE analysis", false, /// Default construct and initialize the pass. 
LiveDebugValues::LiveDebugValues() : MachineFunctionPass(ID) { initializeLiveDebugValuesPass(*PassRegistry::getPassRegistry()); - TheImpl = llvm::makeVarLocBasedLiveDebugValues(); + TheImpl = nullptr; } bool LiveDebugValues::runOnMachineFunction(MachineFunction &MF) { - auto *TPC = getAnalysisIfAvailable(); + if (!TheImpl) { + TPC = getAnalysisIfAvailable(); + + bool InstrRefBased = false; + if (TPC) { + auto &TM = TPC->getTM(); + InstrRefBased = TM.Options.ValueTrackingVariableLocations; + } + + if (InstrRefBased) + TheImpl = llvm::makeInstrRefBasedLiveDebugValues(); + else + TheImpl = llvm::makeVarLocBasedLiveDebugValues(); + } + return TheImpl->ExtendRanges(MF, TPC); } diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp index 2e0b0e745e9eb..badf5868c7578 100644 --- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp @@ -322,9 +322,14 @@ bool MIRParserImpl::parseMachineFunction(Module &M, MachineModuleInfo &MMI) { static bool isSSA(const MachineFunction &MF) { const MachineRegisterInfo &MRI = MF.getRegInfo(); for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { - unsigned Reg = Register::index2VirtReg(I); + Register Reg = Register::index2VirtReg(I); if (!MRI.hasOneDef(Reg) && !MRI.def_empty(Reg)) return false; + + // Subregister defs are invalid in SSA. 
+ const MachineOperand *RegDef = MRI.getOneDef(Reg); + if (RegDef && RegDef->getSubReg() != 0) + return false; } return true; } diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp index a8ccf2643f200..b6d0d9a74ac14 100644 --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -1530,7 +1530,10 @@ class BaseMemOpClusterMutation : public ScheduleDAGMutation { void apply(ScheduleDAGInstrs *DAGInstrs) override; protected: - void clusterNeighboringMemOps(ArrayRef MemOps, ScheduleDAGInstrs *DAG); + void clusterNeighboringMemOps(ArrayRef MemOps, + ScheduleDAGInstrs *DAG); + void collectMemOpRecords(std::vector &SUnits, + SmallVectorImpl &MemOpRecords); }; class StoreClusterMutation : public BaseMemOpClusterMutation { @@ -1566,63 +1569,53 @@ createStoreClusterDAGMutation(const TargetInstrInfo *TII, } // end namespace llvm +// Sorting all the loads/stores first, then for each load/store, checking the +// following load/store one by one, until reach the first non-dependent one and +// call target hook to see if they can cluster. void BaseMemOpClusterMutation::clusterNeighboringMemOps( - ArrayRef MemOps, ScheduleDAGInstrs *DAG) { - SmallVector MemOpRecords; - for (SUnit *SU : MemOps) { - const MachineInstr &MI = *SU->getInstr(); - SmallVector BaseOps; - int64_t Offset; - bool OffsetIsScalable; - unsigned Width; - if (TII->getMemOperandsWithOffsetWidth(MI, BaseOps, Offset, - OffsetIsScalable, Width, TRI)) { - MemOpRecords.push_back(MemOpInfo(SU, BaseOps, Offset, Width)); - - LLVM_DEBUG(dbgs() << "Num BaseOps: " << BaseOps.size() << ", Offset: " - << Offset << ", OffsetIsScalable: " << OffsetIsScalable - << ", Width: " << Width << "\n"); - } -#ifndef NDEBUG - for (auto *Op : BaseOps) - assert(Op); -#endif - } - if (MemOpRecords.size() < 2) - return; - - llvm::sort(MemOpRecords); + ArrayRef MemOpRecords, ScheduleDAGInstrs *DAG) { + // Keep track of the current cluster length and bytes for each SUnit. 
+ DenseMap> SUnit2ClusterInfo; // At this point, `MemOpRecords` array must hold atleast two mem ops. Try to // cluster mem ops collected within `MemOpRecords` array. - unsigned ClusterLength = 1; - unsigned CurrentClusterBytes = MemOpRecords[0].Width; for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) { // Decision to cluster mem ops is taken based on target dependent logic auto MemOpa = MemOpRecords[Idx]; - auto MemOpb = MemOpRecords[Idx + 1]; - ++ClusterLength; - CurrentClusterBytes += MemOpb.Width; - if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps, ClusterLength, - CurrentClusterBytes)) { - // Current mem ops pair could not be clustered, reset cluster length, and - // go to next pair - ClusterLength = 1; - CurrentClusterBytes = MemOpb.Width; + + // Seek for the next load/store to do the cluster. + unsigned NextIdx = Idx + 1; + for (; NextIdx < End; ++NextIdx) + // Skip if MemOpb has been clustered already or has dependency with + // MemOpa. + if (!SUnit2ClusterInfo.count(MemOpRecords[NextIdx].SU->NodeNum) && + !DAG->IsReachable(MemOpRecords[NextIdx].SU, MemOpa.SU) && + !DAG->IsReachable(MemOpa.SU, MemOpRecords[NextIdx].SU)) + break; + if (NextIdx == End) continue; + + auto MemOpb = MemOpRecords[NextIdx]; + unsigned ClusterLength = 2; + unsigned CurrentClusterBytes = MemOpa.Width + MemOpb.Width; + if (SUnit2ClusterInfo.count(MemOpa.SU->NodeNum)) { + ClusterLength = SUnit2ClusterInfo[MemOpa.SU->NodeNum].first + 1; + CurrentClusterBytes = + SUnit2ClusterInfo[MemOpa.SU->NodeNum].second + MemOpb.Width; } + if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps, ClusterLength, + CurrentClusterBytes)) + continue; + SUnit *SUa = MemOpa.SU; SUnit *SUb = MemOpb.SU; if (SUa->NodeNum > SUb->NodeNum) std::swap(SUa, SUb); // FIXME: Is this check really required? 
- if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) { - ClusterLength = 1; - CurrentClusterBytes = MemOpb.Width; + if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) continue; - } LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU(" << SUb->NodeNum << ")\n"); @@ -1656,42 +1649,57 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps( } } + SUnit2ClusterInfo[MemOpb.SU->NodeNum] = {ClusterLength, + CurrentClusterBytes}; + LLVM_DEBUG(dbgs() << " Curr cluster length: " << ClusterLength << ", Curr cluster bytes: " << CurrentClusterBytes << "\n"); } } -/// Callback from DAG postProcessing to create cluster edges for loads. -void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAG) { - // Map DAG NodeNum to a set of dependent MemOps in store chain. - DenseMap> StoreChains; - for (SUnit &SU : DAG->SUnits) { +void BaseMemOpClusterMutation::collectMemOpRecords( + std::vector &SUnits, SmallVectorImpl &MemOpRecords) { + for (auto &SU : SUnits) { if ((IsLoad && !SU.getInstr()->mayLoad()) || (!IsLoad && !SU.getInstr()->mayStore())) continue; - unsigned ChainPredID = DAG->SUnits.size(); - for (const SDep &Pred : SU.Preds) { - // We only want to cluster the mem ops that have the same ctrl(non-data) - // pred so that they didn't have ctrl dependency for each other. But for - // store instrs, we can still cluster them if the pred is load instr. 
- if ((Pred.isCtrl() && - (IsLoad || - (Pred.getSUnit() && Pred.getSUnit()->getInstr()->mayStore()))) && - !Pred.isArtificial()) { - ChainPredID = Pred.getSUnit()->NodeNum; - break; - } + const MachineInstr &MI = *SU.getInstr(); + SmallVector BaseOps; + int64_t Offset; + bool OffsetIsScalable; + unsigned Width; + if (TII->getMemOperandsWithOffsetWidth(MI, BaseOps, Offset, + OffsetIsScalable, Width, TRI)) { + MemOpRecords.push_back(MemOpInfo(&SU, BaseOps, Offset, Width)); + + LLVM_DEBUG(dbgs() << "Num BaseOps: " << BaseOps.size() << ", Offset: " + << Offset << ", OffsetIsScalable: " << OffsetIsScalable + << ", Width: " << Width << "\n"); } - // Insert the SU to corresponding store chain. - auto &Chain = StoreChains.FindAndConstruct(ChainPredID).second; - Chain.push_back(&SU); +#ifndef NDEBUG + for (auto *Op : BaseOps) + assert(Op); +#endif } +} + +/// Callback from DAG postProcessing to create cluster edges for loads/stores. +void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAG) { + // Collect all the clusterable loads/stores + SmallVector MemOpRecords; + collectMemOpRecords(DAG->SUnits, MemOpRecords); + + if (MemOpRecords.size() < 2) + return; + + // Sorting the loads/stores, so that, we can stop the cluster as early as + // possible. + llvm::sort(MemOpRecords); - // Iterate over the store chains. - for (auto &SCD : StoreChains) - clusterNeighboringMemOps(SCD.second, DAG); + // Trying to cluster all the neighboring loads/stores. + clusterNeighboringMemOps(MemOpRecords, DAG); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index 26c2914b77ec9..2aa14c8131edd 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -790,9 +790,7 @@ void MachineVerifier::visitMachineBundleBefore(const MachineInstr *MI) { } // Ensure non-terminators don't follow terminators. 
- // Ignore predicated terminators formed by if conversion. - // FIXME: If conversion shouldn't need to violate this rule. - if (MI->isTerminator() && !TII->isPredicated(*MI)) { + if (MI->isTerminator()) { if (!FirstTerminator) FirstTerminator = MI; } else if (FirstTerminator) { @@ -1357,20 +1355,7 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) { break; } } - switch (IntrID) { - case Intrinsic::memcpy: - if (MI->getNumOperands() != 5) - report("Expected memcpy intrinsic to have 5 operands", MI); - break; - case Intrinsic::memmove: - if (MI->getNumOperands() != 5) - report("Expected memmove intrinsic to have 5 operands", MI); - break; - case Intrinsic::memset: - if (MI->getNumOperands() != 5) - report("Expected memset intrinsic to have 5 operands", MI); - break; - } + break; } case TargetOpcode::G_SEXT_INREG: { @@ -1448,6 +1433,61 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) { } break; } + case TargetOpcode::G_MEMCPY: + case TargetOpcode::G_MEMMOVE: { + ArrayRef MMOs = MI->memoperands(); + if (MMOs.size() != 2) { + report("memcpy/memmove must have 2 memory operands", MI); + break; + } + + if ((!MMOs[0]->isStore() || MMOs[0]->isLoad()) || + (MMOs[1]->isStore() || !MMOs[1]->isLoad())) { + report("wrong memory operand types", MI); + break; + } + + if (MMOs[0]->getSize() != MMOs[1]->getSize()) + report("inconsistent memory operand sizes", MI); + + LLT DstPtrTy = MRI->getType(MI->getOperand(0).getReg()); + LLT SrcPtrTy = MRI->getType(MI->getOperand(1).getReg()); + + if (!DstPtrTy.isPointer() || !SrcPtrTy.isPointer()) { + report("memory instruction operand must be a pointer", MI); + break; + } + + if (DstPtrTy.getAddressSpace() != MMOs[0]->getAddrSpace()) + report("inconsistent store address space", MI); + if (SrcPtrTy.getAddressSpace() != MMOs[1]->getAddrSpace()) + report("inconsistent load address space", MI); + + break; + } + case TargetOpcode::G_MEMSET: { + ArrayRef MMOs = MI->memoperands(); + if 
(MMOs.size() != 1) { + report("memset must have 1 memory operand", MI); + break; + } + + if ((!MMOs[0]->isStore() || MMOs[0]->isLoad())) { + report("memset memory operand must be a store", MI); + break; + } + + LLT DstPtrTy = MRI->getType(MI->getOperand(0).getReg()); + if (!DstPtrTy.isPointer()) { + report("memset operand must be a pointer", MI); + break; + } + + if (DstPtrTy.getAddressSpace() != MMOs[0]->getAddrSpace()) + report("inconsistent memset address space", MI); + + break; + } default: break; } diff --git a/llvm/lib/CodeGen/RDFLiveness.cpp b/llvm/lib/CodeGen/RDFLiveness.cpp index b2a29bf451a2a..50bd910739b5a 100644 --- a/llvm/lib/CodeGen/RDFLiveness.cpp +++ b/llvm/lib/CodeGen/RDFLiveness.cpp @@ -230,13 +230,12 @@ NodeList Liveness::getAllReachingDefs(RegisterRef RefRR, TmpBB.push_back(Bucket.first); if (Bucket.second.size() > 2) GetOrder(*Bucket.first); - std::sort(Bucket.second.begin(), Bucket.second.end(), Precedes); + llvm::sort(Bucket.second.begin(), Bucket.second.end(), Precedes); } // Sort the blocks with respect to dominance. - std::sort(TmpBB.begin(), TmpBB.end(), [this](auto A, auto B) { - return MDT.dominates(A, B); - }); + llvm::sort(TmpBB.begin(), TmpBB.end(), + [this](auto A, auto B) { return MDT.properlyDominates(A, B); }); std::vector TmpInst; for (auto I = TmpBB.rbegin(), E = TmpBB.rend(); I != E; ++I) { diff --git a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp index dd70f91d481f9..cb53ea47e79fc 100644 --- a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp +++ b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp @@ -272,8 +272,10 @@ int ReachingDefAnalysis::getReachingDef(MachineInstr *MI, int PhysReg) const { } MachineInstr* ReachingDefAnalysis::getReachingLocalMIDef(MachineInstr *MI, - int PhysReg) const { - return getInstFromId(MI->getParent(), getReachingDef(MI, PhysReg)); + int PhysReg) const { + return hasLocalDefBefore(MI, PhysReg) + ? 
getInstFromId(MI->getParent(), getReachingDef(MI, PhysReg)) + : nullptr; } bool ReachingDefAnalysis::hasSameReachingDef(MachineInstr *A, MachineInstr *B, @@ -421,7 +423,9 @@ MachineInstr *ReachingDefAnalysis::getUniqueReachingMIDef(MachineInstr *MI, SmallPtrSet VisitedBBs; SmallPtrSet Incoming; - for (auto *Pred : MI->getParent()->predecessors()) + MachineBasicBlock *Parent = MI->getParent(); + VisitedBBs.insert(Parent); + for (auto *Pred : Parent->predecessors()) getLiveOuts(Pred, PhysReg, Incoming, VisitedBBs); // If we have a local def and an incoming instruction, then there's not a diff --git a/llvm/lib/CodeGen/RegisterScavenging.cpp b/llvm/lib/CodeGen/RegisterScavenging.cpp index 41b6de1441d75..a8ba5ea3fa8bf 100644 --- a/llvm/lib/CodeGen/RegisterScavenging.cpp +++ b/llvm/lib/CodeGen/RegisterScavenging.cpp @@ -154,25 +154,6 @@ void RegScavenger::determineKillsAndDefs() { } } -void RegScavenger::unprocess() { - assert(Tracking && "Cannot unprocess because we're not tracking"); - - MachineInstr &MI = *MBBI; - if (!MI.isDebugInstr()) { - determineKillsAndDefs(); - - // Commit the changes. - setUnused(DefRegUnits); - setUsed(KillRegUnits); - } - - if (MBBI == MBB->begin()) { - MBBI = MachineBasicBlock::iterator(nullptr); - Tracking = false; - } else - --MBBI; -} - void RegScavenger::forward() { // Move ptr forward. 
if (!Tracking) { diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 65640bf9471aa..59edd03b7ec82 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -464,6 +464,7 @@ namespace { SDValue visitFREEZE(SDNode *N); SDValue visitBUILD_PAIR(SDNode *N); SDValue visitFADD(SDNode *N); + SDValue visitSTRICT_FADD(SDNode *N); SDValue visitFSUB(SDNode *N); SDValue visitFMUL(SDNode *N); SDValue visitFMA(SDNode *N); @@ -1650,6 +1651,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::BITCAST: return visitBITCAST(N); case ISD::BUILD_PAIR: return visitBUILD_PAIR(N); case ISD::FADD: return visitFADD(N); + case ISD::STRICT_FADD: return visitSTRICT_FADD(N); case ISD::FSUB: return visitFSUB(N); case ISD::FMUL: return visitFMUL(N); case ISD::FMA: return visitFMA(N); @@ -6869,8 +6871,9 @@ SDValue DAGCombiner::mergeTruncStores(StoreSDNode *N) { SmallVector Stores; for (StoreSDNode *Store = N; Store; Store = dyn_cast(Chain)) { // TODO: Allow unordered atomics when wider type is legal (see D66309) - if (Store->getMemoryVT() != MVT::i8 || !Store->isSimple() || - Store->isIndexed()) + EVT MemVT = Store->getMemoryVT(); + if (!(MemVT == MVT::i8 || MemVT == MVT::i16 || MemVT == MVT::i32) || + !Store->isSimple() || Store->isIndexed()) return SDValue(); Stores.push_back(Store); Chain = Store->getChain(); @@ -6959,12 +6962,6 @@ SDValue DAGCombiner::mergeTruncStores(StoreSDNode *N) { assert(FirstOffset != INT64_MAX && "First byte offset must be set"); assert(FirstStore && "First store must be set"); - // Check if the bytes of the combined value we are looking at match with - // either big or little endian value store. 
- Optional IsBigEndian = isBigEndian(OffsetMap, FirstOffset); - if (!IsBigEndian.hasValue()) - return SDValue(); - // Check that a store of the wide type is both allowed and fast on the target const DataLayout &Layout = DAG.getDataLayout(); bool Fast = false; @@ -6973,6 +6970,31 @@ SDValue DAGCombiner::mergeTruncStores(StoreSDNode *N) { if (!Allowed || !Fast) return SDValue(); + // Check if the pieces of the value are going to the expected places in memory + // to merge the stores. + auto checkOffsets = [&](bool MatchLittleEndian) { + if (MatchLittleEndian) { + for (unsigned i = 0; i != NumStores; ++i) + if (OffsetMap[i] != i * (NarrowNumBits / 8) + FirstOffset) + return false; + } else { // MatchBigEndian by reversing loop counter. + for (unsigned i = 0, j = NumStores - 1; i != NumStores; ++i, --j) + if (OffsetMap[j] != i * (NarrowNumBits / 8) + FirstOffset) + return false; + } + return true; + }; + + // Check if the offsets line up for the native data layout of this target. + bool NeedBswap = false; + if (!checkOffsets(Layout.isLittleEndian())) { + // Special-case: check if byte offsets line up for the opposite endian. + // TODO: We could use rotates for 16/32-bit merge pairs. + if (NarrowNumBits != 8 || !checkOffsets(Layout.isBigEndian())) + return SDValue(); + NeedBswap = true; + } + SDLoc DL(N); if (WideVT != SourceValue.getValueType()) { assert(SourceValue.getValueType().getSizeInBits() > WideNumBits && @@ -6983,7 +7005,6 @@ SDValue DAGCombiner::mergeTruncStores(StoreSDNode *N) { // Before legalize we can introduce illegal bswaps which will be later // converted to an explicit bswap sequence. This way we end up with a single // store and byte shuffling instead of several stores and byte shuffling. 
- bool NeedBswap = Layout.isBigEndian() != *IsBigEndian; if (NeedBswap) SourceValue = DAG.getNode(ISD::BSWAP, DL, WideVT, SourceValue); @@ -12814,6 +12835,33 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitSTRICT_FADD(SDNode *N) { + SDValue Chain = N->getOperand(0); + SDValue N0 = N->getOperand(1); + SDValue N1 = N->getOperand(2); + EVT VT = N->getValueType(0); + EVT ChainVT = N->getValueType(1); + SDLoc DL(N); + const SDNodeFlags Flags = N->getFlags(); + + // fold (strict_fadd A, (fneg B)) -> (strict_fsub A, B) + if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::STRICT_FSUB, VT)) + if (SDValue NegN1 = TLI.getCheaperNegatedExpression( + N1, DAG, LegalOperations, ForCodeSize)) { + return DAG.getNode(ISD::STRICT_FSUB, DL, DAG.getVTList(VT, ChainVT), + {Chain, N0, NegN1}, Flags); + } + + // fold (strict_fadd (fneg A), B) -> (strict_fsub B, A) + if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::STRICT_FSUB, VT)) + if (SDValue NegN0 = TLI.getCheaperNegatedExpression( + N0, DAG, LegalOperations, ForCodeSize)) { + return DAG.getNode(ISD::STRICT_FSUB, DL, DAG.getVTList(VT, ChainVT), + {Chain, N1, NegN0}, Flags); + } + return SDValue(); +} + SDValue DAGCombiner::visitFSUB(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -13356,6 +13404,12 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { return RV; } + // Fold X/Sqrt(X) -> Sqrt(X) + if ((Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros()) && + (Options.UnsafeFPMath || Flags.hasAllowReassociation())) + if (N1.getOpcode() == ISD::FSQRT && N0 == N1.getOperand(0)) + return N1; + // (fdiv (fneg X), (fneg Y)) -> (fdiv X, Y) TargetLowering::NegatibleCost CostN0 = TargetLowering::NegatibleCost::Expensive; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 364d0bb12365a..e8cc916593fbc 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ 
b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -888,7 +888,6 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue WidenVecRes_Convert_StrictFP(SDNode *N); SDValue WidenVecRes_FCOPYSIGN(SDNode *N); SDValue WidenVecRes_POWI(SDNode *N); - SDValue WidenVecRes_Shift(SDNode *N); SDValue WidenVecRes_Unary(SDNode *N); SDValue WidenVecRes_InregOp(SDNode *N); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 965d4a0955fb8..ae5c872a29e7b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -146,6 +146,8 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::SHL: case ISD::SRA: case ISD::SRL: + case ISD::ROTL: + case ISD::ROTR: R = ScalarizeVecRes_BinOp(N); break; case ISD::FMA: @@ -2798,6 +2800,9 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::OR: case ISD::SUB: case ISD::XOR: + case ISD::SHL: + case ISD::SRA: + case ISD::SRL: case ISD::FMINNUM: case ISD::FMAXNUM: case ISD::FMINIMUM: @@ -2812,6 +2817,8 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::SSUBSAT: case ISD::SSHLSAT: case ISD::USHLSAT: + case ISD::ROTL: + case ISD::ROTR: Res = WidenVecRes_Binary(N); break; @@ -2860,12 +2867,6 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { Res = WidenVecRes_POWI(N); break; - case ISD::SHL: - case ISD::SRA: - case ISD::SRL: - Res = WidenVecRes_Shift(N); - break; - case ISD::ANY_EXTEND_VECTOR_INREG: case ISD::SIGN_EXTEND_VECTOR_INREG: case ISD::ZERO_EXTEND_VECTOR_INREG: @@ -3485,25 +3486,6 @@ SDValue DAGTypeLegalizer::WidenVecRes_POWI(SDNode *N) { return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, InOp, ShOp); } -SDValue DAGTypeLegalizer::WidenVecRes_Shift(SDNode *N) { - EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); - SDValue InOp = 
GetWidenedVector(N->getOperand(0)); - SDValue ShOp = N->getOperand(1); - - EVT ShVT = ShOp.getValueType(); - if (getTypeAction(ShVT) == TargetLowering::TypeWidenVector) { - ShOp = GetWidenedVector(ShOp); - ShVT = ShOp.getValueType(); - } - EVT ShWidenVT = EVT::getVectorVT(*DAG.getContext(), - ShVT.getVectorElementType(), - WidenVT.getVectorNumElements()); - if (ShVT != ShWidenVT) - ShOp = ModifyToType(ShOp, ShWidenVT); - - return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, InOp, ShOp); -} - SDValue DAGTypeLegalizer::WidenVecRes_Unary(SDNode *N) { // Unary op widening. EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 1aefe8cc5b3ab..0e2fe9b9dd413 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -9076,6 +9076,10 @@ ConstantFPSDNode *llvm::isConstOrConstSplatFP(SDValue N, bool AllowUndefs) { return CN; } + if (N.getOpcode() == ISD::SPLAT_VECTOR) + if (ConstantFPSDNode *CN = dyn_cast(N.getOperand(0))) + return CN; + return nullptr; } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 9e57fa084ad8c..85dc150e14613 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -389,7 +389,7 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL, // as appropriate. for (unsigned i = 0; i != NumParts; ++i) Ops[i] = getCopyFromParts(DAG, DL, &Parts[i], 1, - PartVT, IntermediateVT, V); + PartVT, IntermediateVT, V, CallConv); } else if (NumParts > 0) { // If the intermediate type was expanded, build the intermediate // operands from the parts. 
@@ -398,7 +398,7 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL, unsigned Factor = NumParts / NumIntermediates; for (unsigned i = 0; i != NumIntermediates; ++i) Ops[i] = getCopyFromParts(DAG, DL, &Parts[i * Factor], Factor, - PartVT, IntermediateVT, V); + PartVT, IntermediateVT, V, CallConv); } // Build a vector with BUILD_VECTOR or CONCAT_VECTORS from the @@ -6890,31 +6890,31 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, case Intrinsic::get_active_lane_mask: { auto DL = getCurSDLoc(); SDValue Index = getValue(I.getOperand(0)); - SDValue BTC = getValue(I.getOperand(1)); + SDValue TripCount = getValue(I.getOperand(1)); Type *ElementTy = I.getOperand(0)->getType(); EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType()); unsigned VecWidth = VT.getVectorNumElements(); - SmallVector OpsBTC; + SmallVector OpsTripCount; SmallVector OpsIndex; SmallVector OpsStepConstants; for (unsigned i = 0; i < VecWidth; i++) { - OpsBTC.push_back(BTC); + OpsTripCount.push_back(TripCount); OpsIndex.push_back(Index); - OpsStepConstants.push_back(DAG.getConstant(i, DL, MVT::getVT(ElementTy))); + OpsStepConstants.push_back( + DAG.getConstant(i, DL, EVT::getEVT(ElementTy))); } - EVT CCVT = MVT::i1; - CCVT = EVT::getVectorVT(I.getContext(), CCVT, VecWidth); + EVT CCVT = EVT::getVectorVT(I.getContext(), MVT::i1, VecWidth); - auto VecTy = MVT::getVT(FixedVectorType::get(ElementTy, VecWidth)); + auto VecTy = EVT::getEVT(FixedVectorType::get(ElementTy, VecWidth)); SDValue VectorIndex = DAG.getBuildVector(VecTy, DL, OpsIndex); SDValue VectorStep = DAG.getBuildVector(VecTy, DL, OpsStepConstants); SDValue VectorInduction = DAG.getNode( ISD::UADDO, DL, DAG.getVTList(VecTy, CCVT), VectorIndex, VectorStep); - SDValue VectorBTC = DAG.getBuildVector(VecTy, DL, OpsBTC); + SDValue VectorTripCount = DAG.getBuildVector(VecTy, DL, OpsTripCount); SDValue SetCC = DAG.getSetCC(DL, CCVT, VectorInduction.getValue(0), - VectorBTC, ISD::CondCode::SETULE); + 
VectorTripCount, ISD::CondCode::SETULT); setValue(&I, DAG.getNode(ISD::AND, DL, CCVT, DAG.getNOT(DL, VectorInduction.getValue(1), CCVT), SetCC)); diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 2609ba72662b1..559bdbe9da207 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -6167,6 +6167,32 @@ bool TargetLowering::expandFunnelShift(SDNode *Node, SDValue &Result, EVT ShVT = Z.getValueType(); + // If a funnel shift in the other direction is more supported, use it. + unsigned RevOpcode = IsFSHL ? ISD::FSHR : ISD::FSHL; + if (!isOperationLegalOrCustom(Node->getOpcode(), VT) && + isOperationLegalOrCustom(RevOpcode, VT) && isPowerOf2_32(BW)) { + if (isNonZeroModBitWidthOrUndef(Z, BW)) { + // fshl X, Y, Z -> fshr X, Y, -Z + // fshr X, Y, Z -> fshl X, Y, -Z + SDValue Zero = DAG.getConstant(0, DL, ShVT); + Z = DAG.getNode(ISD::SUB, DL, VT, Zero, Z); + } else { + // fshl X, Y, Z -> fshr (srl X, 1), (fshr X, Y, 1), ~Z + // fshr X, Y, Z -> fshl (fshl X, Y, 1), (shl Y, 1), ~Z + SDValue One = DAG.getConstant(1, DL, ShVT); + if (IsFSHL) { + Y = DAG.getNode(RevOpcode, DL, VT, X, Y, One); + X = DAG.getNode(ISD::SRL, DL, VT, X, One); + } else { + X = DAG.getNode(RevOpcode, DL, VT, X, Y, One); + Y = DAG.getNode(ISD::SHL, DL, VT, Y, One); + } + Z = DAG.getNOT(DL, Z, ShVT); + } + Result = DAG.getNode(RevOpcode, DL, VT, X, Y, Z); + return true; + } + SDValue ShX, ShY; SDValue ShAmt, InvShAmt; if (isNonZeroModBitWidthOrUndef(Z, BW)) { @@ -6221,12 +6247,9 @@ bool TargetLowering::expandROT(SDNode *Node, SDValue &Result, EVT ShVT = Op1.getValueType(); SDValue Zero = DAG.getConstant(0, DL, ShVT); - assert(isPowerOf2_32(EltSizeInBits) && EltSizeInBits > 1 && - "Expecting the type bitwidth to be a power of 2"); - // If a rotate in the other direction is supported, use it. unsigned RevRot = IsLeft ? 
ISD::ROTR : ISD::ROTL; - if (isOperationLegalOrCustom(RevRot, VT)) { + if (isOperationLegalOrCustom(RevRot, VT) && isPowerOf2_32(EltSizeInBits)) { SDValue Sub = DAG.getNode(ISD::SUB, DL, ShVT, Zero, Op1); Result = DAG.getNode(RevRot, DL, VT, Op0, Sub); return true; @@ -6239,18 +6262,31 @@ bool TargetLowering::expandROT(SDNode *Node, SDValue &Result, !isOperationLegalOrCustomOrPromote(ISD::AND, VT))) return false; - // Otherwise, - // (rotl x, c) -> (or (shl x, (and c, w-1)), (srl x, (and -c, w-1))) - // (rotr x, c) -> (or (srl x, (and c, w-1)), (shl x, (and -c, w-1))) - // unsigned ShOpc = IsLeft ? ISD::SHL : ISD::SRL; unsigned HsOpc = IsLeft ? ISD::SRL : ISD::SHL; SDValue BitWidthMinusOneC = DAG.getConstant(EltSizeInBits - 1, DL, ShVT); - SDValue NegOp1 = DAG.getNode(ISD::SUB, DL, ShVT, Zero, Op1); - SDValue And0 = DAG.getNode(ISD::AND, DL, ShVT, Op1, BitWidthMinusOneC); - SDValue And1 = DAG.getNode(ISD::AND, DL, ShVT, NegOp1, BitWidthMinusOneC); - Result = DAG.getNode(ISD::OR, DL, VT, DAG.getNode(ShOpc, DL, VT, Op0, And0), - DAG.getNode(HsOpc, DL, VT, Op0, And1)); + SDValue ShVal; + SDValue HsVal; + if (isPowerOf2_32(EltSizeInBits)) { + // (rotl x, c) -> x << (c & (w - 1)) | x >> (-c & (w - 1)) + // (rotr x, c) -> x >> (c & (w - 1)) | x << (-c & (w - 1)) + SDValue NegOp1 = DAG.getNode(ISD::SUB, DL, ShVT, Zero, Op1); + SDValue ShAmt = DAG.getNode(ISD::AND, DL, ShVT, Op1, BitWidthMinusOneC); + ShVal = DAG.getNode(ShOpc, DL, VT, Op0, ShAmt); + SDValue HsAmt = DAG.getNode(ISD::AND, DL, ShVT, NegOp1, BitWidthMinusOneC); + HsVal = DAG.getNode(HsOpc, DL, VT, Op0, HsAmt); + } else { + // (rotl x, c) -> x << (c % w) | x >> 1 >> (w - 1 - (c % w)) + // (rotr x, c) -> x >> (c % w) | x << 1 << (w - 1 - (c % w)) + SDValue BitWidthC = DAG.getConstant(EltSizeInBits, DL, ShVT); + SDValue ShAmt = DAG.getNode(ISD::UREM, DL, ShVT, Op1, BitWidthC); + ShVal = DAG.getNode(ShOpc, DL, VT, Op0, ShAmt); + SDValue HsAmt = DAG.getNode(ISD::SUB, DL, ShVT, BitWidthMinusOneC, ShAmt); + SDValue 
One = DAG.getConstant(1, DL, ShVT); + HsVal = + DAG.getNode(HsOpc, DL, VT, DAG.getNode(HsOpc, DL, VT, Op0, One), HsAmt); + } + Result = DAG.getNode(ISD::OR, DL, VT, ShVal, HsVal); return true; } diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 868302abc999d..40bb45a584dbd 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -1844,7 +1844,10 @@ Value *TargetLoweringBase::getIRStackGuard(IRBuilder<> &IRB) const { if (getTargetMachine().getTargetTriple().isOSOpenBSD()) { Module &M = *IRB.GetInsertBlock()->getParent()->getParent(); PointerType *PtrTy = Type::getInt8PtrTy(M.getContext()); - return M.getOrInsertGlobal("__guard_local", PtrTy); + Constant *C = M.getOrInsertGlobal("__guard_local", PtrTy); + if (GlobalVariable *G = dyn_cast_or_null(C)) + G->setVisibility(GlobalValue::HiddenVisibility); + return C; } return nullptr; } diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index 4c3e6cce78364..c2ea5d9b860aa 100644 --- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -436,7 +436,8 @@ static SectionKind getELFKindForNamedSection(StringRef Name, SectionKind K) { if (Name == getInstrProfSectionName(IPSK_covmap, Triple::ELF, /*AddSegmentInfo=*/false) || Name == getInstrProfSectionName(IPSK_covfun, Triple::ELF, - /*AddSegmentInfo=*/false)) + /*AddSegmentInfo=*/false) || + Name == ".llvmbc" || Name == ".llvmcmd") return SectionKind::getMetadata(); if (Name.empty() || Name[0] != '.') return K; diff --git a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp index 54ba9ac39ea6a..9eda6fb9ea77b 100644 --- a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp @@ -763,12 +763,19 @@ Error deregisterEHFrameSection(const void *EHFrameSectionAddr, 
EHFrameRegistrar::~EHFrameRegistrar() {} -InProcessEHFrameRegistrar &InProcessEHFrameRegistrar::getInstance() { - static InProcessEHFrameRegistrar Instance; - return Instance; +Error InProcessEHFrameRegistrar::registerEHFrames( + JITTargetAddress EHFrameSectionAddr, size_t EHFrameSectionSize) { + return registerEHFrameSection( + jitTargetAddressToPointer(EHFrameSectionAddr), + EHFrameSectionSize); } -InProcessEHFrameRegistrar::InProcessEHFrameRegistrar() {} +Error InProcessEHFrameRegistrar::deregisterEHFrames( + JITTargetAddress EHFrameSectionAddr, size_t EHFrameSectionSize) { + return deregisterEHFrameSection( + jitTargetAddressToPointer(EHFrameSectionAddr), + EHFrameSectionSize); +} LinkGraphPassFunction createEHFrameRecorderPass(const Triple &TT, diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp index 531a71d50b9ec..56dec8688441e 100644 --- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp @@ -984,7 +984,7 @@ Error LLJITBuilderState::prepareForConstruction() { ObjLinkingLayer = std::make_unique( ES, std::make_unique()); ObjLinkingLayer->addPlugin(std::make_unique( - jitlink::InProcessEHFrameRegistrar::getInstance())); + std::make_unique())); return std::move(ObjLinkingLayer); }; } diff --git a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp index 5b828ed84462a..50fa23d2f80fd 100644 --- a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp @@ -536,8 +536,8 @@ Error ObjectLinkingLayer::removeAllModules() { } EHFrameRegistrationPlugin::EHFrameRegistrationPlugin( - EHFrameRegistrar &Registrar) - : Registrar(Registrar) {} + std::unique_ptr Registrar) + : Registrar(std::move(Registrar)) {} void EHFrameRegistrationPlugin::modifyPassConfig( MaterializationResponsibility &MR, const Triple &TT, @@ -572,7 +572,7 @@ Error EHFrameRegistrationPlugin::notifyEmitted( else 
UntrackedEHFrameRanges.push_back(EHFrameRange); - return Registrar.registerEHFrames(EHFrameRange.Addr, EHFrameRange.Size); + return Registrar->registerEHFrames(EHFrameRange.Addr, EHFrameRange.Size); } Error EHFrameRegistrationPlugin::notifyRemovingModule(VModuleKey K) { @@ -587,7 +587,7 @@ Error EHFrameRegistrationPlugin::notifyRemovingModule(VModuleKey K) { TrackedEHFrameRanges.erase(EHFrameRangeItr); - return Registrar.deregisterEHFrames(EHFrameRange.Addr, EHFrameRange.Size); + return Registrar->deregisterEHFrames(EHFrameRange.Addr, EHFrameRange.Size); } Error EHFrameRegistrationPlugin::notifyRemovingAllModules() { @@ -608,9 +608,8 @@ Error EHFrameRegistrationPlugin::notifyRemovingAllModules() { auto EHFrameRange = EHFrameRanges.back(); assert(EHFrameRange.Addr && "Untracked eh-frame range must not be null"); EHFrameRanges.pop_back(); - Err = joinErrors(std::move(Err), - Registrar.deregisterEHFrames(EHFrameRange.Addr, - EHFrameRange.Size)); + Err = joinErrors(std::move(Err), Registrar->deregisterEHFrames( + EHFrameRange.Addr, EHFrameRange.Size)); } return Err; diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index f43ddce0118b9..0b2ac8582a62b 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -1511,7 +1511,7 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, } if (isa(CV) || isa(CV)) { - auto *CVVTy = cast(CV->getType()); + auto *CVVTy = cast(CV->getType()); Type *ETy = CVVTy->getElementType(); Out << '<'; TypePrinter.print(ETy, Out); diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 581778c9c678f..369dc50895727 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -632,6 +632,63 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { return true; } } + + // Changed in 12.0: bfdot accept v4bf16 and v8bf16 instead of v8i8 and v16i8 + // respectively + if ((Name.startswith("arm.neon.bfdot.") || + 
Name.startswith("aarch64.neon.bfdot.")) && + Name.endswith("i8")) { + Intrinsic::ID IID = + StringSwitch(Name) + .Cases("arm.neon.bfdot.v2f32.v8i8", + "arm.neon.bfdot.v4f32.v16i8", + Intrinsic::arm_neon_bfdot) + .Cases("aarch64.neon.bfdot.v2f32.v8i8", + "aarch64.neon.bfdot.v4f32.v16i8", + Intrinsic::aarch64_neon_bfdot) + .Default(Intrinsic::not_intrinsic); + if (IID == Intrinsic::not_intrinsic) + break; + + size_t OperandWidth = F->getReturnType()->getPrimitiveSizeInBits(); + assert((OperandWidth == 64 || OperandWidth == 128) && + "Unexpected operand width"); + LLVMContext &Ctx = F->getParent()->getContext(); + std::array Tys {{ + F->getReturnType(), + FixedVectorType::get(Type::getBFloatTy(Ctx), OperandWidth / 16) + }}; + NewFn = Intrinsic::getDeclaration(F->getParent(), IID, Tys); + return true; + } + + // Changed in 12.0: bfmmla, bfmlalb and bfmlalt are not polymorphic anymore + // and accept v8bf16 instead of v16i8 + if ((Name.startswith("arm.neon.bfm") || + Name.startswith("aarch64.neon.bfm")) && + Name.endswith(".v4f32.v16i8")) { + Intrinsic::ID IID = + StringSwitch(Name) + .Case("arm.neon.bfmmla.v4f32.v16i8", + Intrinsic::arm_neon_bfmmla) + .Case("arm.neon.bfmlalb.v4f32.v16i8", + Intrinsic::arm_neon_bfmlalb) + .Case("arm.neon.bfmlalt.v4f32.v16i8", + Intrinsic::arm_neon_bfmlalt) + .Case("aarch64.neon.bfmmla.v4f32.v16i8", + Intrinsic::aarch64_neon_bfmmla) + .Case("aarch64.neon.bfmlalb.v4f32.v16i8", + Intrinsic::aarch64_neon_bfmlalb) + .Case("aarch64.neon.bfmlalt.v4f32.v16i8", + Intrinsic::aarch64_neon_bfmlalt) + .Default(Intrinsic::not_intrinsic); + if (IID == Intrinsic::not_intrinsic) + break; + + std::array Tys; + NewFn = Intrinsic::getDeclaration(F->getParent(), IID, Tys); + return true; + } break; } @@ -931,7 +988,7 @@ GlobalVariable *llvm::UpgradeGlobalVariable(GlobalVariable *GV) { // to byte shuffles. 
static Value *UpgradeX86PSLLDQIntrinsics(IRBuilder<> &Builder, Value *Op, unsigned Shift) { - auto *ResultTy = cast(Op->getType()); + auto *ResultTy = cast(Op->getType()); unsigned NumElts = ResultTy->getNumElements() * 8; // Bitcast from a 64-bit element type to a byte element type. @@ -965,7 +1022,7 @@ static Value *UpgradeX86PSLLDQIntrinsics(IRBuilder<> &Builder, // to byte shuffles. static Value *UpgradeX86PSRLDQIntrinsics(IRBuilder<> &Builder, Value *Op, unsigned Shift) { - auto *ResultTy = cast(Op->getType()); + auto *ResultTy = cast(Op->getType()); unsigned NumElts = ResultTy->getNumElements() * 8; // Bitcast from a 64-bit element type to a byte element type. @@ -1023,7 +1080,7 @@ static Value *EmitX86Select(IRBuilder<> &Builder, Value *Mask, return Op0; Mask = getX86MaskVec(Builder, Mask, - cast(Op0->getType())->getNumElements()); + cast(Op0->getType())->getNumElements()); return Builder.CreateSelect(Mask, Op0, Op1); } @@ -1050,7 +1107,7 @@ static Value *UpgradeX86ALIGNIntrinsics(IRBuilder<> &Builder, Value *Op0, bool IsVALIGN) { unsigned ShiftVal = cast(Shift)->getZExtValue(); - unsigned NumElts = cast(Op0->getType())->getNumElements(); + unsigned NumElts = cast(Op0->getType())->getNumElements(); assert((IsVALIGN || NumElts % 16 == 0) && "Illegal NumElts for PALIGNR!"); assert((!IsVALIGN || NumElts <= 16) && "NumElts too large for VALIGN!"); assert(isPowerOf2_32(NumElts) && "NumElts not a power of 2!"); @@ -1181,7 +1238,7 @@ static Value *upgradeX86Rotate(IRBuilder<> &Builder, CallInst &CI, // Funnel shifts amounts are treated as modulo and types are all power-of-2 so // we only care about the lowest log2 bits anyway. 
if (Amt->getType() != Ty) { - unsigned NumElts = cast(Ty)->getNumElements(); + unsigned NumElts = cast(Ty)->getNumElements(); Amt = Builder.CreateIntCast(Amt, Ty->getScalarType(), false); Amt = Builder.CreateVectorSplat(NumElts, Amt); } @@ -1251,7 +1308,7 @@ static Value *upgradeX86ConcatShift(IRBuilder<> &Builder, CallInst &CI, // Funnel shifts amounts are treated as modulo and types are all power-of-2 so // we only care about the lowest log2 bits anyway. if (Amt->getType() != Ty) { - unsigned NumElts = cast(Ty)->getNumElements(); + unsigned NumElts = cast(Ty)->getNumElements(); Amt = Builder.CreateIntCast(Amt, Ty->getScalarType(), false); Amt = Builder.CreateVectorSplat(NumElts, Amt); } @@ -1288,7 +1345,7 @@ static Value *UpgradeMaskedStore(IRBuilder<> &Builder, return Builder.CreateAlignedStore(Data, Ptr, Alignment); // Convert the mask from an integer type to a vector of i1. - unsigned NumElts = cast(Data->getType())->getNumElements(); + unsigned NumElts = cast(Data->getType())->getNumElements(); Mask = getX86MaskVec(Builder, Mask, NumElts); return Builder.CreateMaskedStore(Data, Ptr, Alignment, Mask); } @@ -1311,7 +1368,8 @@ static Value *UpgradeMaskedLoad(IRBuilder<> &Builder, return Builder.CreateAlignedLoad(ValTy, Ptr, Alignment); // Convert the mask from an integer type to a vector of i1. - unsigned NumElts = cast(Passthru->getType())->getNumElements(); + unsigned NumElts = + cast(Passthru->getType())->getNumElements(); Mask = getX86MaskVec(Builder, Mask, NumElts); return Builder.CreateMaskedLoad(Ptr, Alignment, Mask, Passthru); } @@ -1375,7 +1433,7 @@ static Value *upgradePMULDQ(IRBuilder<> &Builder, CallInst &CI, bool IsSigned) { // Applying mask on vector of i1's and make sure result is at least 8 bits wide. 
static Value *ApplyX86MaskOn1BitsVec(IRBuilder<> &Builder, Value *Vec, Value *Mask) { - unsigned NumElts = cast(Vec->getType())->getNumElements(); + unsigned NumElts = cast(Vec->getType())->getNumElements(); if (Mask) { const auto *C = dyn_cast(Mask); if (!C || !C->isAllOnesValue()) @@ -1398,7 +1456,7 @@ static Value *ApplyX86MaskOn1BitsVec(IRBuilder<> &Builder, Value *Vec, static Value *upgradeMaskedCompare(IRBuilder<> &Builder, CallInst &CI, unsigned CC, bool Signed) { Value *Op0 = CI.getArgOperand(0); - unsigned NumElts = cast(Op0->getType())->getNumElements(); + unsigned NumElts = cast(Op0->getType())->getNumElements(); Value *Cmp; if (CC == 3) { @@ -1453,7 +1511,7 @@ static Value* upgradeMaskedMove(IRBuilder<> &Builder, CallInst &CI) { static Value* UpgradeMaskToInt(IRBuilder<> &Builder, CallInst &CI) { Value* Op = CI.getArgOperand(0); Type* ReturnOp = CI.getType(); - unsigned NumElts = cast(CI.getType())->getNumElements(); + unsigned NumElts = cast(CI.getType())->getNumElements(); Value *Mask = getX86MaskVec(Builder, Op, NumElts); return Builder.CreateSExt(Mask, ReturnOp, "vpmovm2"); } @@ -1902,8 +1960,8 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Rep = Builder.CreateICmp(Pred, Rep, Zero); Rep = ApplyX86MaskOn1BitsVec(Builder, Rep, Mask); } else if (IsX86 && (Name.startswith("avx512.mask.pbroadcast"))){ - unsigned NumElts = - cast(CI->getArgOperand(1)->getType())->getNumElements(); + unsigned NumElts = cast(CI->getArgOperand(1)->getType()) + ->getNumElements(); Rep = Builder.CreateVectorSplat(NumElts, CI->getArgOperand(0)); Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); @@ -2151,9 +2209,9 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Name == "avx.cvt.ps2.pd.256" || Name == "avx512.mask.cvtps2pd.128" || Name == "avx512.mask.cvtps2pd.256")) { - auto *DstTy = cast(CI->getType()); + auto *DstTy = cast(CI->getType()); Rep = CI->getArgOperand(0); - auto *SrcTy = cast(Rep->getType()); + 
auto *SrcTy = cast(Rep->getType()); unsigned NumDstElts = DstTy->getNumElements(); if (NumDstElts < SrcTy->getNumElements()) { @@ -2183,9 +2241,9 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { CI->getArgOperand(1)); } else if (IsX86 && (Name.startswith("avx512.mask.vcvtph2ps.") || Name.startswith("vcvtph2ps."))) { - auto *DstTy = cast(CI->getType()); + auto *DstTy = cast(CI->getType()); Rep = CI->getArgOperand(0); - auto *SrcTy = cast(Rep->getType()); + auto *SrcTy = cast(Rep->getType()); unsigned NumDstElts = DstTy->getNumElements(); if (NumDstElts != SrcTy->getNumElements()) { assert(NumDstElts == 4 && "Unexpected vector size"); @@ -2206,7 +2264,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { CI->getArgOperand(1),CI->getArgOperand(2), /*Aligned*/true); } else if (IsX86 && Name.startswith("avx512.mask.expand.load.")) { - auto *ResultTy = cast(CI->getType()); + auto *ResultTy = cast(CI->getType()); Type *PtrTy = ResultTy->getElementType(); // Cast the pointer to element type. 
@@ -2228,8 +2286,9 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Value *Ptr = Builder.CreateBitCast(CI->getOperand(0), llvm::PointerType::getUnqual(PtrTy)); - Value *MaskVec = getX86MaskVec(Builder, CI->getArgOperand(2), - ResultTy->getNumElements()); + Value *MaskVec = + getX86MaskVec(Builder, CI->getArgOperand(2), + cast(ResultTy)->getNumElements()); Function *CSt = Intrinsic::getDeclaration(F->getParent(), Intrinsic::masked_compressstore, @@ -2237,7 +2296,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Rep = Builder.CreateCall(CSt, { CI->getArgOperand(1), Ptr, MaskVec }); } else if (IsX86 && (Name.startswith("avx512.mask.compress.") || Name.startswith("avx512.mask.expand."))) { - auto *ResultTy = cast(CI->getType()); + auto *ResultTy = cast(CI->getType()); Value *MaskVec = getX86MaskVec(Builder, CI->getArgOperand(2), ResultTy->getNumElements()); @@ -2317,7 +2376,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { } else if (IsX86 && (Name.startswith("avx.vbroadcast.s") || Name.startswith("avx512.vbroadcast.s"))) { // Replace broadcasts with a series of insertelements. - auto *VecTy = cast(CI->getType()); + auto *VecTy = cast(CI->getType()); Type *EltTy = VecTy->getElementType(); unsigned EltNum = VecTy->getNumElements(); Value *Cast = Builder.CreateBitCast(CI->getArgOperand(0), @@ -2334,8 +2393,8 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Name.startswith("avx2.pmovzx") || Name.startswith("avx512.mask.pmovsx") || Name.startswith("avx512.mask.pmovzx"))) { - VectorType *SrcTy = cast(CI->getArgOperand(0)->getType()); - VectorType *DstTy = cast(CI->getType()); + auto *SrcTy = cast(CI->getArgOperand(0)->getType()); + auto *DstTy = cast(CI->getType()); unsigned NumDstElts = DstTy->getNumElements(); // Extract a subvector of the first NumDstElts lanes and sign/zero extend. 
@@ -2402,8 +2461,10 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { }else if (IsX86 && (Name.startswith("avx512.mask.broadcastf") || Name.startswith("avx512.mask.broadcasti"))) { unsigned NumSrcElts = - cast(CI->getArgOperand(0)->getType())->getNumElements(); - unsigned NumDstElts = cast(CI->getType())->getNumElements(); + cast(CI->getArgOperand(0)->getType()) + ->getNumElements(); + unsigned NumDstElts = + cast(CI->getType())->getNumElements(); SmallVector ShuffleMask(NumDstElts); for (unsigned i = 0; i != NumDstElts; ++i) @@ -2492,7 +2553,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Value *Op0 = CI->getArgOperand(0); Value *Op1 = CI->getArgOperand(1); unsigned Imm = cast (CI->getArgOperand(2))->getZExtValue(); - VectorType *VecTy = cast(CI->getType()); + auto *VecTy = cast(CI->getType()); unsigned NumElts = VecTy->getNumElements(); SmallVector Idxs(NumElts); @@ -2506,8 +2567,10 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Value *Op0 = CI->getArgOperand(0); Value *Op1 = CI->getArgOperand(1); unsigned Imm = cast(CI->getArgOperand(2))->getZExtValue(); - unsigned DstNumElts = cast(CI->getType())->getNumElements(); - unsigned SrcNumElts = cast(Op1->getType())->getNumElements(); + unsigned DstNumElts = + cast(CI->getType())->getNumElements(); + unsigned SrcNumElts = + cast(Op1->getType())->getNumElements(); unsigned Scale = DstNumElts / SrcNumElts; // Mask off the high bits of the immediate value; hardware ignores those. 
@@ -2550,8 +2613,10 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Name.startswith("avx512.mask.vextract"))) { Value *Op0 = CI->getArgOperand(0); unsigned Imm = cast(CI->getArgOperand(1))->getZExtValue(); - unsigned DstNumElts = cast(CI->getType())->getNumElements(); - unsigned SrcNumElts = cast(Op0->getType())->getNumElements(); + unsigned DstNumElts = + cast(CI->getType())->getNumElements(); + unsigned SrcNumElts = + cast(Op0->getType())->getNumElements(); unsigned Scale = SrcNumElts / DstNumElts; // Mask off the high bits of the immediate value; hardware ignores those. @@ -2574,7 +2639,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Name.startswith("avx512.mask.perm.di."))) { Value *Op0 = CI->getArgOperand(0); unsigned Imm = cast(CI->getArgOperand(1))->getZExtValue(); - VectorType *VecTy = cast(CI->getType()); + auto *VecTy = cast(CI->getType()); unsigned NumElts = VecTy->getNumElements(); SmallVector Idxs(NumElts); @@ -2598,7 +2663,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { uint8_t Imm = cast(CI->getArgOperand(2))->getZExtValue(); - unsigned NumElts = cast(CI->getType())->getNumElements(); + unsigned NumElts = cast(CI->getType())->getNumElements(); unsigned HalfSize = NumElts / 2; SmallVector ShuffleMask(NumElts); @@ -2628,7 +2693,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Name.startswith("avx512.mask.pshuf.d."))) { Value *Op0 = CI->getArgOperand(0); unsigned Imm = cast(CI->getArgOperand(1))->getZExtValue(); - VectorType *VecTy = cast(CI->getType()); + auto *VecTy = cast(CI->getType()); unsigned NumElts = VecTy->getNumElements(); // Calculate the size of each index in the immediate. 
unsigned IdxSize = 64 / VecTy->getScalarSizeInBits(); @@ -2650,7 +2715,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Name.startswith("avx512.mask.pshufl.w."))) { Value *Op0 = CI->getArgOperand(0); unsigned Imm = cast(CI->getArgOperand(1))->getZExtValue(); - unsigned NumElts = cast(CI->getType())->getNumElements(); + unsigned NumElts = cast(CI->getType())->getNumElements(); SmallVector Idxs(NumElts); for (unsigned l = 0; l != NumElts; l += 8) { @@ -2669,7 +2734,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Name.startswith("avx512.mask.pshufh.w."))) { Value *Op0 = CI->getArgOperand(0); unsigned Imm = cast(CI->getArgOperand(1))->getZExtValue(); - unsigned NumElts = cast(CI->getType())->getNumElements(); + unsigned NumElts = cast(CI->getType())->getNumElements(); SmallVector Idxs(NumElts); for (unsigned l = 0; l != NumElts; l += 8) { @@ -2688,7 +2753,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Value *Op0 = CI->getArgOperand(0); Value *Op1 = CI->getArgOperand(1); unsigned Imm = cast(CI->getArgOperand(2))->getZExtValue(); - unsigned NumElts = cast(CI->getType())->getNumElements(); + unsigned NumElts = cast(CI->getType())->getNumElements(); unsigned NumLaneElts = 128/CI->getType()->getScalarSizeInBits(); unsigned HalfLaneElts = NumLaneElts / 2; @@ -2713,7 +2778,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Name.startswith("avx512.mask.movshdup") || Name.startswith("avx512.mask.movsldup"))) { Value *Op0 = CI->getArgOperand(0); - unsigned NumElts = cast(CI->getType())->getNumElements(); + unsigned NumElts = cast(CI->getType())->getNumElements(); unsigned NumLaneElts = 128/CI->getType()->getScalarSizeInBits(); unsigned Offset = 0; @@ -2735,7 +2800,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Name.startswith("avx512.mask.unpckl."))) { Value *Op0 = CI->getArgOperand(0); Value *Op1 = CI->getArgOperand(1); - int NumElts = 
cast(CI->getType())->getNumElements(); + int NumElts = cast(CI->getType())->getNumElements(); int NumLaneElts = 128/CI->getType()->getScalarSizeInBits(); SmallVector Idxs(NumElts); @@ -2751,7 +2816,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Name.startswith("avx512.mask.unpckh."))) { Value *Op0 = CI->getArgOperand(0); Value *Op1 = CI->getArgOperand(1); - int NumElts = cast(CI->getType())->getNumElements(); + int NumElts = cast(CI->getType())->getNumElements(); int NumLaneElts = 128/CI->getType()->getScalarSizeInBits(); SmallVector Idxs(NumElts); @@ -3319,7 +3384,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), Ops); } else { - int NumElts = cast(CI->getType())->getNumElements(); + int NumElts = cast(CI->getType())->getNumElements(); Value *Ops[] = { CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2) }; @@ -3618,6 +3683,30 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { break; } + case Intrinsic::arm_neon_bfdot: + case Intrinsic::arm_neon_bfmmla: + case Intrinsic::arm_neon_bfmlalb: + case Intrinsic::arm_neon_bfmlalt: + case Intrinsic::aarch64_neon_bfdot: + case Intrinsic::aarch64_neon_bfmmla: + case Intrinsic::aarch64_neon_bfmlalb: + case Intrinsic::aarch64_neon_bfmlalt: { + SmallVector Args; + assert(CI->getNumArgOperands() == 3 && + "Mismatch between function args and call args"); + size_t OperandWidth = + CI->getArgOperand(1)->getType()->getPrimitiveSizeInBits(); + assert((OperandWidth == 64 || OperandWidth == 128) && + "Unexpected operand width"); + Type *NewTy = FixedVectorType::get(Type::getBFloatTy(C), OperandWidth / 16); + auto Iter = CI->arg_operands().begin(); + Args.push_back(*Iter++); + Args.push_back(Builder.CreateBitCast(*Iter++, NewTy)); + Args.push_back(Builder.CreateBitCast(*Iter++, NewTy)); + NewCall = Builder.CreateCall(NewFn, Args); + break; + } + case Intrinsic::bitreverse: NewCall = 
Builder.CreateCall(NewFn, {CI->getArgOperand(0)}); break; @@ -3751,7 +3840,8 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { case Intrinsic::x86_avx512_mask_cmp_ps_512: { SmallVector Args(CI->arg_operands().begin(), CI->arg_operands().end()); - unsigned NumElts = cast(Args[0]->getType())->getNumElements(); + unsigned NumElts = + cast(Args[0]->getType())->getNumElements(); Args[3] = getX86MaskVec(Builder, Args[3], NumElts); NewCall = Builder.CreateCall(NewFn, Args); diff --git a/llvm/lib/IR/CMakeLists.txt b/llvm/lib/IR/CMakeLists.txt index 8fcc10fa38af3..49805d5b8c274 100644 --- a/llvm/lib/IR/CMakeLists.txt +++ b/llvm/lib/IR/CMakeLists.txt @@ -47,6 +47,7 @@ add_llvm_component_library(LLVMCore SafepointIRVerifier.cpp ProfileSummary.cpp Statepoint.cpp + StructuralHash.cpp Type.cpp TypeFinder.cpp Use.cpp diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index 1cd2ced469304..8d960ea9a5faa 100644 --- a/llvm/lib/IR/Constants.cpp +++ b/llvm/lib/IR/Constants.cpp @@ -161,7 +161,7 @@ bool Constant::isNotOneValue() const { // Check that vectors don't contain 1 if (auto *VTy = dyn_cast(this->getType())) { - unsigned NumElts = VTy->getNumElements(); + unsigned NumElts = cast(VTy)->getNumElements(); for (unsigned i = 0; i != NumElts; ++i) { Constant *Elt = this->getAggregateElement(i); if (!Elt || !Elt->isNotOneValue()) @@ -211,7 +211,7 @@ bool Constant::isNotMinSignedValue() const { // Check that vectors don't contain INT_MIN if (auto *VTy = dyn_cast(this->getType())) { - unsigned NumElts = VTy->getNumElements(); + unsigned NumElts = cast(VTy)->getNumElements(); for (unsigned i = 0; i != NumElts; ++i) { Constant *Elt = this->getAggregateElement(i); if (!Elt || !Elt->isNotMinSignedValue()) @@ -227,7 +227,7 @@ bool Constant::isNotMinSignedValue() const { bool Constant::isFiniteNonZeroFP() const { if (auto *CFP = dyn_cast(this)) return CFP->getValueAPF().isFiniteNonZero(); - auto *VTy = dyn_cast(getType()); + auto *VTy = dyn_cast(getType()); if 
(!VTy) return false; for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) { @@ -306,7 +306,15 @@ bool Constant::isElementWiseEqual(Value *Y) const { bool Constant::containsUndefElement() const { if (auto *VTy = dyn_cast(getType())) { - for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) + if (isa(this)) + return true; + if (isa(this)) + return false; + if (isa(getType())) + return false; + + for (unsigned i = 0, e = cast(VTy)->getNumElements(); + i != e; ++i) if (isa(getAggregateElement(i))) return true; } @@ -315,7 +323,7 @@ bool Constant::containsUndefElement() const { } bool Constant::containsConstantExpression() const { - if (auto *VTy = dyn_cast(getType())) { + if (auto *VTy = dyn_cast(getType())) { for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) if (isa(getAggregateElement(i))) return true; @@ -1029,7 +1037,7 @@ unsigned ConstantAggregateZero::getNumElements() const { if (auto *AT = dyn_cast(Ty)) return AT->getNumElements(); if (auto *VT = dyn_cast(Ty)) - return VT->getNumElements(); + return cast(VT)->getNumElements(); return Ty->getStructNumElements(); } @@ -1064,7 +1072,7 @@ unsigned UndefValue::getNumElements() const { if (auto *AT = dyn_cast(Ty)) return AT->getNumElements(); if (auto *VT = dyn_cast(Ty)) - return VT->getNumElements(); + return cast(VT)->getNumElements(); return Ty->getStructNumElements(); } @@ -1246,7 +1254,7 @@ Constant *ConstantStruct::get(StructType *ST, ArrayRef V) { ConstantVector::ConstantVector(VectorType *T, ArrayRef V) : ConstantAggregate(T, ConstantVectorVal, V) { - assert(V.size() == T->getNumElements() && + assert(V.size() == cast(T)->getNumElements() && "Invalid initializer for constant vector"); } @@ -2004,8 +2012,8 @@ Constant *ConstantExpr::getPtrToInt(Constant *C, Type *DstTy, "PtrToInt destination must be integer or integer vector"); assert(isa(C->getType()) == isa(DstTy)); if (isa(C->getType())) - assert(cast(C->getType())->getNumElements() == - cast(DstTy)->getNumElements() && + 
assert(cast(C->getType())->getNumElements() == + cast(DstTy)->getNumElements() && "Invalid cast between a different number of vector elements"); return getFoldedCast(Instruction::PtrToInt, C, DstTy, OnlyIfReduced); } @@ -2018,8 +2026,8 @@ Constant *ConstantExpr::getIntToPtr(Constant *C, Type *DstTy, "IntToPtr destination must be a pointer or pointer vector"); assert(isa(C->getType()) == isa(DstTy)); if (isa(C->getType())) - assert(cast(C->getType())->getNumElements() == - cast(DstTy)->getNumElements() && + assert(cast(C->getType())->getElementCount() == + cast(DstTy)->getElementCount() && "Invalid cast between a different number of vector elements"); return getFoldedCast(Instruction::IntToPtr, C, DstTy, OnlyIfReduced); } @@ -2050,7 +2058,8 @@ Constant *ConstantExpr::getAddrSpaceCast(Constant *C, Type *DstTy, Type *MidTy = PointerType::get(DstElemTy, SrcScalarTy->getAddressSpace()); if (VectorType *VT = dyn_cast(DstTy)) { // Handle vectors of pointers. - MidTy = FixedVectorType::get(MidTy, VT->getNumElements()); + MidTy = FixedVectorType::get(MidTy, + cast(VT)->getNumElements()); } C = getBitCast(C, MidTy); } @@ -2692,7 +2701,7 @@ bool ConstantDataSequential::isElementTypeCompatible(Type *Ty) { unsigned ConstantDataSequential::getNumElements() const { if (ArrayType *AT = dyn_cast(getType())) return AT->getNumElements(); - return cast(getType())->getNumElements(); + return cast(getType())->getNumElements(); } diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp index 6f3bbc80d4fd5..71faa5002b9ff 100644 --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -781,7 +781,7 @@ unsigned LLVMGetPointerAddressSpace(LLVMTypeRef PointerTy) { } unsigned LLVMGetVectorSize(LLVMTypeRef VectorTy) { - return unwrap(VectorTy)->getNumElements(); + return unwrap(VectorTy)->getElementCount().Min; } /*--.. 
Operations on other types ...........................................--*/ diff --git a/llvm/lib/IR/DiagnosticInfo.cpp b/llvm/lib/IR/DiagnosticInfo.cpp index 6528c723fbfae..28882cfa8f65d 100644 --- a/llvm/lib/IR/DiagnosticInfo.cpp +++ b/llvm/lib/IR/DiagnosticInfo.cpp @@ -213,6 +213,13 @@ DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, unsigned long long N) : Key(std::string(Key)), Val(utostr(N)) {} +DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, + ElementCount EC) + : Key(std::string(Key)) { + raw_string_ostream OS(Val); + EC.print(OS); +} + DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, DebugLoc Loc) : Key(std::string(Key)), Loc(Loc) { if (Loc) { diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp index c925adc0ea8e4..b29a00c5fe460 100644 --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -1396,10 +1396,11 @@ static bool matchIntrinsicType( // Verify the overloaded type "matches" the Ref type. // i.e. Ty is a vector with the same width as Ref. // Composed of pointers to the same element type as Ref. 
- VectorType *ReferenceType = dyn_cast(ArgTys[RefArgNumber]); - VectorType *ThisArgVecTy = dyn_cast(Ty); + auto *ReferenceType = dyn_cast(ArgTys[RefArgNumber]); + auto *ThisArgVecTy = dyn_cast(Ty); if (!ThisArgVecTy || !ReferenceType || - (ReferenceType->getNumElements() != ThisArgVecTy->getNumElements())) + (cast(ReferenceType)->getNumElements() != + cast(ThisArgVecTy)->getNumElements())) return true; PointerType *ThisArgEltTy = dyn_cast(ThisArgVecTy->getElementType()); diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp index 1de25c25cd056..33a0f5b09d0b9 100644 --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -523,8 +523,8 @@ CallInst *IRBuilderBase::CreateMaskedIntrinsic(Intrinsic::ID Id, CallInst *IRBuilderBase::CreateMaskedGather(Value *Ptrs, Align Alignment, Value *Mask, Value *PassThru, const Twine &Name) { - auto PtrsTy = cast(Ptrs->getType()); - auto PtrTy = cast(PtrsTy->getElementType()); + auto *PtrsTy = cast(Ptrs->getType()); + auto *PtrTy = cast(PtrsTy->getElementType()); unsigned NumElts = PtrsTy->getNumElements(); auto *DataTy = FixedVectorType::get(PtrTy->getElementType(), NumElts); @@ -553,8 +553,8 @@ CallInst *IRBuilderBase::CreateMaskedGather(Value *Ptrs, Align Alignment, /// be accessed in memory CallInst *IRBuilderBase::CreateMaskedScatter(Value *Data, Value *Ptrs, Align Alignment, Value *Mask) { - auto PtrsTy = cast(Ptrs->getType()); - auto DataTy = cast(Data->getType()); + auto *PtrsTy = cast(Ptrs->getType()); + auto *DataTy = cast(Data->getType()); unsigned NumElts = PtrsTy->getNumElements(); #ifndef NDEBUG diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp index bfbd801cb7a72..f09142530949c 100644 --- a/llvm/lib/IR/Instruction.cpp +++ b/llvm/lib/IR/Instruction.cpp @@ -483,17 +483,33 @@ bool Instruction::isIdenticalToWhenDefined(const Instruction *I) const { if (getNumOperands() == 0 && I->getNumOperands() == 0) return haveSameSpecialState(this, I); - // We have two instructions of 
identical opcode and #operands. Check to see - // if all operands are the same. - if (!std::equal(op_begin(), op_end(), I->op_begin())) - return false; - + // PHI nodes are special. if (const PHINode *thisPHI = dyn_cast(this)) { const PHINode *otherPHI = cast(I); - return std::equal(thisPHI->block_begin(), thisPHI->block_end(), + // PHI nodes don't nessesairly have their operands in the same order, + // so we shouldn't just compare ranges of incoming blocks/values. + + // If both PHI's are in the same basic block, which is the most interesting + // case, we know they must have identical predecessor list, + // so we only need to check the incoming values. + if (thisPHI->getParent() == otherPHI->getParent()) { + return all_of(thisPHI->blocks(), [thisPHI, otherPHI](BasicBlock *PredBB) { + return thisPHI->getIncomingValueForBlock(PredBB) == + otherPHI->getIncomingValueForBlock(PredBB); + }); + } + + // Otherwise, let's just naively compare operands/blocks. + return std::equal(op_begin(), op_end(), I->op_begin()) && + std::equal(thisPHI->block_begin(), thisPHI->block_end(), otherPHI->block_begin()); } + // We have two instructions of identical opcode and #operands. Check to see + // if all operands are the same. 
+ if (!std::equal(op_begin(), op_end(), I->op_begin())) + return false; + return haveSameSpecialState(this, I); } diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 763b1f564ce19..48f416173dde1 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -1943,7 +1943,7 @@ ShuffleVectorInst::ShuffleVectorInst(Value *V1, Value *V2, ArrayRef Mask, } void ShuffleVectorInst::commute() { - int NumOpElts = cast(Op<0>()->getType())->getNumElements(); + int NumOpElts = cast(Op<0>()->getType())->getNumElements(); int NumMaskElts = ShuffleMask.size(); SmallVector NewMask(NumMaskElts); for (int i = 0; i != NumMaskElts; ++i) { @@ -1997,7 +1997,7 @@ bool ShuffleVectorInst::isValidOperands(const Value *V1, const Value *V2, return true; if (const auto *MV = dyn_cast(Mask)) { - unsigned V1Size = cast(V1->getType())->getNumElements(); + unsigned V1Size = cast(V1->getType())->getNumElements(); for (Value *Op : MV->operands()) { if (auto *CI = dyn_cast(Op)) { if (CI->uge(V1Size*2)) @@ -2010,8 +2010,9 @@ bool ShuffleVectorInst::isValidOperands(const Value *V1, const Value *V2, } if (const auto *CDS = dyn_cast(Mask)) { - unsigned V1Size = cast(V1->getType())->getNumElements(); - for (unsigned i = 0, e = MaskTy->getNumElements(); i != e; ++i) + unsigned V1Size = cast(V1->getType())->getNumElements(); + for (unsigned i = 0, e = cast(MaskTy)->getNumElements(); + i != e; ++i) if (CDS->getElementAsInteger(i) >= V1Size*2) return false; return true; @@ -2022,12 +2023,26 @@ bool ShuffleVectorInst::isValidOperands(const Value *V1, const Value *V2, void ShuffleVectorInst::getShuffleMask(const Constant *Mask, SmallVectorImpl &Result) { - unsigned NumElts = cast(Mask->getType())->getElementCount().Min; + ElementCount EC = cast(Mask->getType())->getElementCount(); + if (isa(Mask)) { - Result.resize(NumElts, 0); + Result.resize(EC.Min, 0); return; } - Result.reserve(NumElts); + + Result.reserve(EC.Min); + + if (EC.Scalable) { + assert((isa(Mask) || 
isa(Mask)) && + "Scalable vector shuffle mask must be undef or zeroinitializer"); + int MaskVal = isa(Mask) ? -1 : 0; + for (unsigned I = 0; I < EC.Min; ++I) + Result.emplace_back(MaskVal); + return; + } + + unsigned NumElts = EC.Min; + if (auto *CDS = dyn_cast(Mask)) { for (unsigned i = 0; i != NumElts; ++i) Result.push_back(CDS->getElementAsInteger(i)); @@ -2209,8 +2224,8 @@ bool ShuffleVectorInst::isExtractSubvectorMask(ArrayRef Mask, bool ShuffleVectorInst::isIdentityWithPadding() const { if (isa(Op<2>())) return false; - int NumOpElts = cast(Op<0>()->getType())->getNumElements(); - int NumMaskElts = cast(getType())->getNumElements(); + int NumOpElts = cast(Op<0>()->getType())->getNumElements(); + int NumMaskElts = cast(getType())->getNumElements(); if (NumMaskElts <= NumOpElts) return false; @@ -2250,8 +2265,8 @@ bool ShuffleVectorInst::isConcat() const { isa(Op<2>())) return false; - int NumOpElts = cast(Op<0>()->getType())->getNumElements(); - int NumMaskElts = getType()->getNumElements(); + int NumOpElts = cast(Op<0>()->getType())->getNumElements(); + int NumMaskElts = cast(getType())->getNumElements(); if (NumMaskElts != NumOpElts * 2) return false; @@ -2992,8 +3007,8 @@ CastInst *CastInst::CreatePointerCast(Value *S, Type *Ty, "Invalid cast"); assert(Ty->isVectorTy() == S->getType()->isVectorTy() && "Invalid cast"); assert((!Ty->isVectorTy() || - cast(Ty)->getNumElements() == - cast(S->getType())->getNumElements()) && + cast(Ty)->getNumElements() == + cast(S->getType())->getNumElements()) && "Invalid cast"); if (Ty->isIntOrIntVectorTy()) @@ -3011,8 +3026,8 @@ CastInst *CastInst::CreatePointerCast(Value *S, Type *Ty, "Invalid cast"); assert(Ty->isVectorTy() == S->getType()->isVectorTy() && "Invalid cast"); assert((!Ty->isVectorTy() || - cast(Ty)->getNumElements() == - cast(S->getType())->getNumElements()) && + cast(Ty)->getNumElements() == + cast(S->getType())->getNumElements()) && "Invalid cast"); if (Ty->isIntOrIntVectorTy()) @@ -3123,7 +3138,8 @@ bool 
CastInst::isCastable(Type *SrcTy, Type *DestTy) { if (VectorType *SrcVecTy = dyn_cast(SrcTy)) if (VectorType *DestVecTy = dyn_cast(DestTy)) - if (SrcVecTy->getNumElements() == DestVecTy->getNumElements()) { + if (cast(SrcVecTy)->getNumElements() == + cast(DestVecTy)->getNumElements()) { // An element by element cast. Valid if casting the elements is valid. SrcTy = SrcVecTy->getElementType(); DestTy = DestVecTy->getElementType(); @@ -3245,7 +3261,7 @@ CastInst::getCastOpcode( // FIXME: Check address space sizes here if (VectorType *SrcVecTy = dyn_cast(SrcTy)) if (VectorType *DestVecTy = dyn_cast(DestTy)) - if (SrcVecTy->getNumElements() == DestVecTy->getNumElements()) { + if (SrcVecTy->getElementCount() == DestVecTy->getElementCount()) { // An element by element cast. Find the appropriate opcode based on the // element types. SrcTy = SrcVecTy->getElementType(); diff --git a/llvm/lib/IR/LegacyPassManager.cpp b/llvm/lib/IR/LegacyPassManager.cpp index 96434ae3306b3..8d9ed917bb617 100644 --- a/llvm/lib/IR/LegacyPassManager.cpp +++ b/llvm/lib/IR/LegacyPassManager.cpp @@ -20,6 +20,7 @@ #include "llvm/IR/LegacyPassNameParser.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassTimingInfo.h" +#include "llvm/IR/StructuralHash.h" #include "llvm/Support/Chrono.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -1475,75 +1476,6 @@ void FPPassManager::dumpPassStructure(unsigned Offset) { } } -#ifdef EXPENSIVE_CHECKS -namespace { -namespace details { - -// Basic hashing mechanism to detect structural change to the IR, used to verify -// pass return status consistency with actual change. 
Loosely copied from -// llvm/lib/Transforms/Utils/FunctionComparator.cpp - -class StructuralHash { - uint64_t Hash = 0x6acaa36bef8325c5ULL; - - void update(uint64_t V) { Hash = hashing::detail::hash_16_bytes(Hash, V); } - -public: - StructuralHash() = default; - - void update(Function &F) { - if (F.empty()) - return; - - update(F.isVarArg()); - update(F.arg_size()); - - SmallVector BBs; - SmallPtrSet VisitedBBs; - - BBs.push_back(&F.getEntryBlock()); - VisitedBBs.insert(BBs[0]); - while (!BBs.empty()) { - const BasicBlock *BB = BBs.pop_back_val(); - update(45798); // Block header - for (auto &Inst : *BB) - update(Inst.getOpcode()); - - const Instruction *Term = BB->getTerminator(); - for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { - if (!VisitedBBs.insert(Term->getSuccessor(i)).second) - continue; - BBs.push_back(Term->getSuccessor(i)); - } - } - } - - void update(Module &M) { - for (Function &F : M) - update(F); - } - - uint64_t getHash() const { return Hash; } -}; - -} // namespace details - -uint64_t StructuralHash(Function &F) { - details::StructuralHash H; - H.update(F); - return H.getHash(); -} - -uint64_t StructuralHash(Module &M) { - details::StructuralHash H; - H.update(M); - return H.getHash(); -} - -} // end anonymous namespace - -#endif - /// Execute all of the passes scheduled for execution by invoking /// runOnFunction method. Keep track of whether any of the passes modifies /// the function, and if so, return true. 
@@ -1590,7 +1522,7 @@ bool FPPassManager::runOnFunction(Function &F) { if (!LocalChanged && (RefHash != StructuralHash(F))) { llvm::errs() << "Pass modifies its input and doesn't report it: " << FP->getPassName() << "\n"; - assert(false && "Pass modifies its input and doesn't report it."); + llvm_unreachable("Pass modifies its input and doesn't report it"); } #endif diff --git a/llvm/lib/IR/StructuralHash.cpp b/llvm/lib/IR/StructuralHash.cpp new file mode 100644 index 0000000000000..5a6e074513268 --- /dev/null +++ b/llvm/lib/IR/StructuralHash.cpp @@ -0,0 +1,84 @@ +//===-- StructuralHash.cpp - IR Hash for expensive checks -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// + +#ifdef EXPENSIVE_CHECKS + +#include "llvm/IR/StructuralHash.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" + +using namespace llvm; + +namespace { +namespace details { + +// Basic hashing mechanism to detect structural change to the IR, used to verify +// pass return status consistency with actual change. 
Loosely copied from +// llvm/lib/Transforms/Utils/FunctionComparator.cpp + +class StructuralHash { + uint64_t Hash = 0x6acaa36bef8325c5ULL; + + void update(uint64_t V) { Hash = hashing::detail::hash_16_bytes(Hash, V); } + +public: + StructuralHash() = default; + + void update(const Function &F) { + if (F.empty()) + return; + + update(F.isVarArg()); + update(F.arg_size()); + + SmallVector BBs; + SmallPtrSet VisitedBBs; + + BBs.push_back(&F.getEntryBlock()); + VisitedBBs.insert(BBs[0]); + while (!BBs.empty()) { + const BasicBlock *BB = BBs.pop_back_val(); + update(45798); // Block header + for (auto &Inst : *BB) + update(Inst.getOpcode()); + + const Instruction *Term = BB->getTerminator(); + for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { + if (!VisitedBBs.insert(Term->getSuccessor(i)).second) + continue; + BBs.push_back(Term->getSuccessor(i)); + } + } + } + + void update(const Module &M) { + for (const Function &F : M) + update(F); + } + + uint64_t getHash() const { return Hash; } +}; + +} // namespace details + +} // namespace + +uint64_t llvm::StructuralHash(const Function &F) { + details::StructuralHash H; + H.update(F); + return H.getHash(); +} + +uint64_t llvm::StructuralHash(const Module &M) { + details::StructuralHash H; + H.update(M); + return H.getHash(); +} + +#endif diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp index 26a983596ab24..3e5dd8fdaf989 100644 --- a/llvm/lib/IR/Value.cpp +++ b/llvm/lib/IR/Value.cpp @@ -147,6 +147,14 @@ bool Value::hasNUsesOrMore(unsigned N) const { return hasNItemsOrMore(use_begin(), use_end(), N); } +bool Value::hasOneUser() const { + if (use_empty()) + return false; + if (hasOneUse()) + return true; + return std::equal(++user_begin(), user_end(), user_begin()); +} + static bool isUnDroppableUser(const User *U) { return !U->isDroppable(); } Use *Value::getSingleUndroppableUse() { diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index d9e3a61e6fa28..89fc0d073749c 100644 --- 
a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -2918,8 +2918,8 @@ void Verifier::visitAddrSpaceCastInst(AddrSpaceCastInst &I) { Assert(SrcTy->getPointerAddressSpace() != DestTy->getPointerAddressSpace(), "AddrSpaceCast must be between different address spaces", &I); if (auto *SrcVTy = dyn_cast(SrcTy)) - Assert(SrcVTy->getNumElements() == - cast(DestTy)->getNumElements(), + Assert(cast(SrcVTy)->getNumElements() == + cast(DestTy)->getNumElements(), "AddrSpaceCast vector pointer number of elements mismatch", &I); visitInstruction(I); } @@ -5061,7 +5061,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { "Vector element type mismatch of the result and second operand " "vector!", IF); - Assert(ResultTy->getNumElements() == + Assert(cast(ResultTy)->getNumElements() == NumRows->getZExtValue() * NumColumns->getZExtValue(), "Result of a matrix operation does not fit in the returned vector!"); @@ -5147,7 +5147,7 @@ void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) { Assert(Operand->getType()->isFPOrFPVectorTy(), "Intrinsic first argument must be floating point", &FPI); if (auto *OperandT = dyn_cast(Operand->getType())) { - NumSrcElem = OperandT->getNumElements(); + NumSrcElem = cast(OperandT)->getNumElements(); } Operand = &FPI; @@ -5156,7 +5156,7 @@ void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) { Assert(Operand->getType()->isIntOrIntVectorTy(), "Intrinsic result must be an integer", &FPI); if (auto *OperandT = dyn_cast(Operand->getType())) { - Assert(NumSrcElem == OperandT->getNumElements(), + Assert(NumSrcElem == cast(OperandT)->getNumElements(), "Intrinsic first argument and result vector lengths must be equal", &FPI); } @@ -5170,7 +5170,7 @@ void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) { Assert(Operand->getType()->isIntOrIntVectorTy(), "Intrinsic first argument must be integer", &FPI); if (auto *OperandT = dyn_cast(Operand->getType())) { - NumSrcElem = 
OperandT->getNumElements(); + NumSrcElem = cast(OperandT)->getNumElements(); } Operand = &FPI; @@ -5179,7 +5179,7 @@ void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) { Assert(Operand->getType()->isFPOrFPVectorTy(), "Intrinsic result must be a floating point", &FPI); if (auto *OperandT = dyn_cast(Operand->getType())) { - Assert(NumSrcElem == OperandT->getNumElements(), + Assert(NumSrcElem == cast(OperandT)->getNumElements(), "Intrinsic first argument and result vector lengths must be equal", &FPI); } @@ -5198,9 +5198,8 @@ void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) { Assert(OperandTy->isVectorTy() == ResultTy->isVectorTy(), "Intrinsic first argument and result disagree on vector use", &FPI); if (OperandTy->isVectorTy()) { - auto *OperandVecTy = cast(OperandTy); - auto *ResultVecTy = cast(ResultTy); - Assert(OperandVecTy->getNumElements() == ResultVecTy->getNumElements(), + Assert(cast(OperandTy)->getNumElements() == + cast(ResultTy)->getNumElements(), "Intrinsic first argument and result vector lengths must be equal", &FPI); } diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 6e1e3998e490e..6230216aa4466 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -798,7 +798,7 @@ Error LTO::linkRegularLTO(RegularLTOState::AddedModule Mod, for (GlobalValue *GV : Mod.Keep) { if (LivenessFromIndex && !ThinLTO.CombinedIndex.isGUIDLive(GV->getGUID())) { if (Function *F = dyn_cast(GV)) { - OptimizationRemarkEmitter ORE(F); + OptimizationRemarkEmitter ORE(F, nullptr); ORE.emit(OptimizationRemark(DEBUG_TYPE, "deadfunction", F) << ore::NV("Function", F) << " not added to the combined module "); diff --git a/llvm/lib/LTO/LTOCodeGenerator.cpp b/llvm/lib/LTO/LTOCodeGenerator.cpp index 25ab1404b4e12..aff1977850b88 100644 --- a/llvm/lib/LTO/LTOCodeGenerator.cpp +++ b/llvm/lib/LTO/LTOCodeGenerator.cpp @@ -466,8 +466,6 @@ void LTOCodeGenerator::applyScopeRestrictions() { internalizeModule(*MergedModule, 
mustPreserveGV); - MergedModule->addModuleFlag(Module::Error, "LTOPostLink", 1); - ScopeRestrictionsDone = true; } @@ -559,6 +557,9 @@ bool LTOCodeGenerator::optimize(bool DisableVerify, bool DisableInline, // Mark which symbols can not be internalized this->applyScopeRestrictions(); + // Write LTOPostLink flag for passes that require all the modules. + MergedModule->addModuleFlag(Module::Error, "LTOPostLink", 1); + // Instantiate the pass manager to organize the passes. legacy::PassManager passes; diff --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp index d0a1e1889c610..4adc9a22a7b2d 100644 --- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp +++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp @@ -269,16 +269,26 @@ addUsedSymbolToPreservedGUID(const lto::InputFile &File, } // Convert the PreservedSymbols map from "Name" based to "GUID" based. +static void computeGUIDPreservedSymbols(const lto::InputFile &File, + const StringSet<> &PreservedSymbols, + const Triple &TheTriple, + DenseSet &GUIDs) { + // Iterate the symbols in the input file and if the input has preserved symbol + // compute the GUID for the symbol. 
+ for (const auto &Sym : File.symbols()) { + if (PreservedSymbols.count(Sym.getName())) + GUIDs.insert(GlobalValue::getGUID(GlobalValue::getGlobalIdentifier( + Sym.getIRName(), GlobalValue::ExternalLinkage, ""))); + } +} + static DenseSet -computeGUIDPreservedSymbols(const StringSet<> &PreservedSymbols, +computeGUIDPreservedSymbols(const lto::InputFile &File, + const StringSet<> &PreservedSymbols, const Triple &TheTriple) { DenseSet GUIDPreservedSymbols(PreservedSymbols.size()); - for (auto &Entry : PreservedSymbols) { - StringRef Name = Entry.first(); - if (TheTriple.isOSBinFormatMachO() && Name.size() > 0 && Name[0] == '_') - Name = Name.drop_front(); - GUIDPreservedSymbols.insert(GlobalValue::getGUID(Name)); - } + computeGUIDPreservedSymbols(File, PreservedSymbols, TheTriple, + GUIDPreservedSymbols); return GUIDPreservedSymbols; } @@ -652,7 +662,7 @@ void ThinLTOCodeGenerator::promote(Module &TheModule, ModuleSummaryIndex &Index, // Convert the preserved symbols set from string to GUID auto GUIDPreservedSymbols = computeGUIDPreservedSymbols( - PreservedSymbols, Triple(TheModule.getTargetTriple())); + File, PreservedSymbols, Triple(TheModule.getTargetTriple())); // Add used symbol to the preserved symbols. 
addUsedSymbolToPreservedGUID(File, GUIDPreservedSymbols); @@ -702,7 +712,7 @@ void ThinLTOCodeGenerator::crossModuleImport(Module &TheModule, // Convert the preserved symbols set from string to GUID auto GUIDPreservedSymbols = computeGUIDPreservedSymbols( - PreservedSymbols, Triple(TheModule.getTargetTriple())); + File, PreservedSymbols, Triple(TheModule.getTargetTriple())); addUsedSymbolToPreservedGUID(File, GUIDPreservedSymbols); @@ -737,7 +747,7 @@ void ThinLTOCodeGenerator::gatherImportedSummariesForModule( // Convert the preserved symbols set from string to GUID auto GUIDPreservedSymbols = computeGUIDPreservedSymbols( - PreservedSymbols, Triple(TheModule.getTargetTriple())); + File, PreservedSymbols, Triple(TheModule.getTargetTriple())); addUsedSymbolToPreservedGUID(File, GUIDPreservedSymbols); @@ -770,7 +780,7 @@ void ThinLTOCodeGenerator::emitImports(Module &TheModule, StringRef OutputName, // Convert the preserved symbols set from string to GUID auto GUIDPreservedSymbols = computeGUIDPreservedSymbols( - PreservedSymbols, Triple(TheModule.getTargetTriple())); + File, PreservedSymbols, Triple(TheModule.getTargetTriple())); addUsedSymbolToPreservedGUID(File, GUIDPreservedSymbols); @@ -808,7 +818,7 @@ void ThinLTOCodeGenerator::internalize(Module &TheModule, // Convert the preserved symbols set from string to GUID auto GUIDPreservedSymbols = - computeGUIDPreservedSymbols(PreservedSymbols, TMBuilder.TheTriple); + computeGUIDPreservedSymbols(File, PreservedSymbols, TMBuilder.TheTriple); addUsedSymbolToPreservedGUID(File, GUIDPreservedSymbols); @@ -972,8 +982,10 @@ void ThinLTOCodeGenerator::run() { // Convert the preserved symbols set from string to GUID, this is needed for // computing the caching hash and the internalization. 
- auto GUIDPreservedSymbols = - computeGUIDPreservedSymbols(PreservedSymbols, TMBuilder.TheTriple); + DenseSet GUIDPreservedSymbols; + for (const auto &M : Modules) + computeGUIDPreservedSymbols(*M, PreservedSymbols, TMBuilder.TheTriple, + GUIDPreservedSymbols); // Add used symbol from inputs to the preserved symbols. for (const auto &M : Modules) diff --git a/llvm/lib/MC/MCWin64EH.cpp b/llvm/lib/MC/MCWin64EH.cpp index ac288ca08c932..76c82cd176a1a 100644 --- a/llvm/lib/MC/MCWin64EH.cpp +++ b/llvm/lib/MC/MCWin64EH.cpp @@ -494,6 +494,27 @@ static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) { // If this UNWIND_INFO already has a symbol, it's already been emitted. if (info->Symbol) return; + // If there's no unwind info here (not even a terminating UOP_End), the + // unwind info is considered bogus and skipped. If this was done in + // response to an explicit .seh_handlerdata, the associated trailing + // handler data is left orphaned in the xdata section. + if (info->empty()) { + info->EmitAttempted = true; + return; + } + if (info->EmitAttempted) { + // If we tried to emit unwind info before (due to an explicit + // .seh_handlerdata directive), but skipped it (because there was no + // valid information to emit at the time), and it later got valid unwind + // opcodes, we can't emit it here, because the trailing handler data + // was already emitted elsewhere in the xdata section. + streamer.getContext().reportError( + SMLoc(), "Earlier .seh_handlerdata for " + info->Function->getName() + + " skipped due to no unwind info at the time " + "(.seh_handlerdata too early?), but the function later " + "did get unwind info that can't be emitted"); + return; + } MCContext &context = streamer.getContext(); MCSymbol *Label = context.createTempSymbol(); @@ -657,16 +678,25 @@ static void ARM64EmitRuntimeFunction(MCStreamer &streamer, void llvm::Win64EH::ARM64UnwindEmitter::Emit(MCStreamer &Streamer) const { // Emit the unwind info structs first. 
for (const auto &CFI : Streamer.getWinFrameInfos()) { + WinEH::FrameInfo *Info = CFI.get(); + if (Info->empty()) + continue; MCSection *XData = Streamer.getAssociatedXDataSection(CFI->TextSection); Streamer.SwitchSection(XData); - ARM64EmitUnwindInfo(Streamer, CFI.get()); + ARM64EmitUnwindInfo(Streamer, Info); } // Now emit RUNTIME_FUNCTION entries. for (const auto &CFI : Streamer.getWinFrameInfos()) { + WinEH::FrameInfo *Info = CFI.get(); + // ARM64EmitUnwindInfo above clears the info struct, so we can't check + // empty here. But if a Symbol is set, we should create the corresponding + // pdata entry. + if (!Info->Symbol) + continue; MCSection *PData = Streamer.getAssociatedPDataSection(CFI->TextSection); Streamer.SwitchSection(PData); - ARM64EmitRuntimeFunction(Streamer, CFI.get()); + ARM64EmitRuntimeFunction(Streamer, Info); } } diff --git a/llvm/lib/MC/XCOFFObjectWriter.cpp b/llvm/lib/MC/XCOFFObjectWriter.cpp index d6565f2a43784..5047b5041aa75 100644 --- a/llvm/lib/MC/XCOFFObjectWriter.cpp +++ b/llvm/lib/MC/XCOFFObjectWriter.cpp @@ -49,6 +49,7 @@ namespace { constexpr unsigned DefaultSectionAlign = 4; constexpr int16_t MaxSectionIndex = INT16_MAX; +constexpr uint16_t MaxTOCSizeInARegion = UINT16_MAX; // Packs the csect's alignment and type into a byte. 
uint8_t getEncodedType(const MCSectionXCOFF *); @@ -68,11 +69,6 @@ struct Symbol { XCOFF::StorageClass getStorageClass() const { return MCSym->getStorageClass(); } - - XCOFF::VisibilityType getVisibilityType() const { - return MCSym->getVisibilityType(); - } - StringRef getSymbolTableName() const { return MCSym->getSymbolTableName(); } Symbol(const MCSymbolXCOFF *MCSym) : MCSym(MCSym), SymbolTableIndex(-1) {} }; @@ -309,6 +305,7 @@ CsectGroup &XCOFFObjectWriter::getCsectGroup(const MCSectionXCOFF *MCSec) { "in this CsectGroup."); return TOCCsects; case XCOFF::XMC_TC: + case XCOFF::XMC_TE: assert(XCOFF::XTY_SD == MCSec->getCSectType() && "Only an initialized csect can contain TC entry."); assert(!TOCCsects.empty() && @@ -432,9 +429,15 @@ void XCOFFObjectWriter::recordRelocation(MCAssembler &Asm, // The FixedValue should be symbol's virtual address in this object file // plus any constant value that we might get. FixedValue = getVirtualAddress(SymA, SymASec) + Target.getConstant(); - else if (Type == XCOFF::RelocationType::R_TOC) + else if (Type == XCOFF::RelocationType::R_TOC || + Type == XCOFF::RelocationType::R_TOCL) { // The FixedValue should be the TC entry offset from TOC-base. FixedValue = SectionMap[SymASec]->Address - TOCCsects.front().Address; + if (FixedValue >= MaxTOCSizeInARegion) + report_fatal_error( + "handling of TOC entries could not fit in the initial TOC " + "entry region is not yet supported"); + } assert( (TargetObjectWriter->is64Bit() || @@ -566,12 +569,13 @@ void XCOFFObjectWriter::writeSymbolTableEntryForCsectMemberLabel( W.write(CSectionRef.Address + SymbolOffset); W.write(SectionIndex); // Basic/Derived type. See the description of the n_type field for symbol - // table entries for a detailed description. Since we support visibility, and - // all other bits are either optionally set or reserved, we only set bits 0-3 - // for symbol's visibility and leave other bits to zero. + // table entries for a detailed description. 
Since we don't yet support + // visibility, and all other bits are either optionally set or reserved, this + // is always zero. + // TODO FIXME How to assert a symbol's visibilty is default? // TODO Set the function indicator (bit 10, 0x0020) for functions // when debugging is enabled. - W.write(SymbolRef.getVisibilityType()); + W.write(0); W.write(SymbolRef.getStorageClass()); // Always 1 aux entry for now. W.write(1); @@ -602,12 +606,13 @@ void XCOFFObjectWriter::writeSymbolTableEntryForControlSection( // n_scnum W.write(SectionIndex); // Basic/Derived type. See the description of the n_type field for symbol - // table entries for a detailed description. Since we support visibility, and - // all other bits are either optionally set or reserved, we only set bits 0-3 - // for symbol's visibility and leave other bits to zero. + // table entries for a detailed description. Since we don't yet support + // visibility, and all other bits are either optionally set or reserved, this + // is always zero. + // TODO FIXME How to assert a symbol's visibilty is default? // TODO Set the function indicator (bit 10, 0x0020) for functions // when debugging is enabled. - W.write(CSectionRef.MCCsect->getVisibilityType()); + W.write(0); // n_sclass W.write(StorageClass); // Always 1 aux entry for now. 
diff --git a/llvm/lib/Object/Binary.cpp b/llvm/lib/Object/Binary.cpp index 944d2bc1bca77..384df4b843586 100644 --- a/llvm/lib/Object/Binary.cpp +++ b/llvm/lib/Object/Binary.cpp @@ -93,7 +93,8 @@ Expected> object::createBinary(MemoryBufferRef Buffer, llvm_unreachable("Unexpected Binary File Type"); } -Expected> object::createBinary(StringRef Path) { +Expected> object::createBinary(StringRef Path, + LLVMContext *Context) { ErrorOr> FileOrErr = MemoryBuffer::getFileOrSTDIN(Path, /*FileSize=*/-1, /*RequiresNullTerminator=*/false); @@ -102,7 +103,7 @@ Expected> object::createBinary(StringRef Path) { std::unique_ptr &Buffer = FileOrErr.get(); Expected> BinOrErr = - createBinary(Buffer->getMemBufferRef()); + createBinary(Buffer->getMemBufferRef(), Context); if (!BinOrErr) return BinOrErr.takeError(); std::unique_ptr &Bin = BinOrErr.get(); diff --git a/llvm/lib/Object/MachOUniversal.cpp b/llvm/lib/Object/MachOUniversal.cpp index a178ecde949e5..f3ce005e6ef9b 100644 --- a/llvm/lib/Object/MachOUniversal.cpp +++ b/llvm/lib/Object/MachOUniversal.cpp @@ -12,6 +12,7 @@ #include "llvm/Object/MachOUniversal.h" #include "llvm/Object/Archive.h" +#include "llvm/Object/IRObjectFile.h" #include "llvm/Object/MachO.h" #include "llvm/Object/ObjectFile.h" #include "llvm/Support/Casting.h" @@ -80,6 +81,25 @@ MachOUniversalBinary::ObjectForArch::getAsObjectFile() const { return ObjectFile::createMachOObjectFile(ObjBuffer, cputype, Index); } +Expected> +MachOUniversalBinary::ObjectForArch::getAsIRObject(LLVMContext &Ctx) const { + if (!Parent) + report_fatal_error("MachOUniversalBinary::ObjectForArch::getAsIRObject() " + "called when Parent is a nullptr"); + + StringRef ParentData = Parent->getData(); + StringRef ObjectData; + if (Parent->getMagic() == MachO::FAT_MAGIC) { + ObjectData = ParentData.substr(Header.offset, Header.size); + } else { // Parent->getMagic() == MachO::FAT_MAGIC_64 + ObjectData = ParentData.substr(Header64.offset, Header64.size); + } + StringRef ObjectName = 
Parent->getFileName(); + MemoryBufferRef ObjBuffer(ObjectData, ObjectName); + + return IRObjectFile::create(ObjBuffer, Ctx); +} + Expected> MachOUniversalBinary::ObjectForArch::getAsArchive() const { if (!Parent) @@ -234,6 +254,15 @@ MachOUniversalBinary::getMachOObjectForArch(StringRef ArchName) const { return O->getAsObjectFile(); } +Expected> +MachOUniversalBinary::getIRObjectForArch(StringRef ArchName, + LLVMContext &Ctx) const { + Expected O = getObjectForArch(ArchName); + if (!O) + return O.takeError(); + return O->getAsIRObject(Ctx); +} + Expected> MachOUniversalBinary::getArchiveForArch(StringRef ArchName) const { Expected O = getObjectForArch(ArchName); diff --git a/llvm/lib/Object/MachOUniversalWriter.cpp b/llvm/lib/Object/MachOUniversalWriter.cpp index 169d64430284e..35db26aae632c 100644 --- a/llvm/lib/Object/MachOUniversalWriter.cpp +++ b/llvm/lib/Object/MachOUniversalWriter.cpp @@ -12,9 +12,11 @@ //===----------------------------------------------------------------------===// #include "llvm/Object/MachOUniversalWriter.h" +#include "llvm/ADT/Triple.h" #include "llvm/Object/Archive.h" #include "llvm/Object/Binary.h" #include "llvm/Object/Error.h" +#include "llvm/Object/IRObjectFile.h" #include "llvm/Object/MachO.h" #include "llvm/Object/MachOUniversal.h" #include "llvm/Support/FileOutputBuffer.h" @@ -79,13 +81,36 @@ Slice::Slice(const MachOObjectFile &O, uint32_t Align) ArchName(std::string(O.getArchTriple().getArchName())), P2Alignment(Align) {} +Slice::Slice(const IRObjectFile *IRO, uint32_t CPUType, uint32_t CPUSubType, + std::string ArchName, uint32_t Align) + : B(IRO), CPUType(CPUType), CPUSubType(CPUSubType), + ArchName(std::move(ArchName)), P2Alignment(Align) {} + Slice::Slice(const MachOObjectFile &O) : Slice(O, calculateAlignment(O)) {} -Expected Slice::create(const Archive *A) { +using MachoCPUTy = std::pair; + +static Expected getMachoCPUFromTriple(Triple TT) { + auto CPU = std::make_pair(MachO::getCPUType(TT), MachO::getCPUSubType(TT)); + if 
(!CPU.first) { + return CPU.first.takeError(); + } + if (!CPU.second) { + return CPU.second.takeError(); + } + return std::make_pair(*CPU.first, *CPU.second); +} + +static Expected getMachoCPUFromTriple(StringRef TT) { + return getMachoCPUFromTriple(Triple{TT}); +} + +Expected Slice::create(const Archive *A, LLVMContext *LLVMCtx) { Error Err = Error::success(); - std::unique_ptr FO = nullptr; + std::unique_ptr MFO = nullptr; + std::unique_ptr IRFO = nullptr; for (const Archive::Child &Child : A->children(Err)) { - Expected> ChildOrErr = Child.getAsBinary(); + Expected> ChildOrErr = Child.getAsBinary(LLVMCtx); if (!ChildOrErr) return createFileError(A->getFileName(), ChildOrErr.takeError()); Binary *Bin = ChildOrErr.get().get(); @@ -95,36 +120,79 @@ Expected Slice::create(const Archive *A) { " is a fat file (not allowed in an archive)") .str() .c_str()); - if (!Bin->isMachO()) - return createStringError( - std::errc::invalid_argument, - ("archive member " + Bin->getFileName() + - " is not a MachO file (not allowed in an archive)") - .str() - .c_str()); - MachOObjectFile *O = cast(Bin); - if (FO && std::tie(FO->getHeader().cputype, FO->getHeader().cpusubtype) != - std::tie(O->getHeader().cputype, O->getHeader().cpusubtype)) { - return createStringError( - std::errc::invalid_argument, - ("archive member " + O->getFileName() + " cputype (" + - Twine(O->getHeader().cputype) + ") and cpusubtype(" + - Twine(O->getHeader().cpusubtype) + - ") does not match previous archive members cputype (" + - Twine(FO->getHeader().cputype) + ") and cpusubtype(" + - Twine(FO->getHeader().cpusubtype) + ") (all members must match) " + - FO->getFileName()) - .str() - .c_str()); - } - if (!FO) { - ChildOrErr.get().release(); - FO.reset(O); - } + if (Bin->isMachO()) { + MachOObjectFile *O = cast(Bin); + if (IRFO) { + return createStringError( + std::errc::invalid_argument, + "archive member %s is a MachO, while previous archive member " + "%s was an IR LLVM object", + 
O->getFileName().str().c_str(), IRFO->getFileName().str().c_str()); + } + if (MFO && + std::tie(MFO->getHeader().cputype, MFO->getHeader().cpusubtype) != + std::tie(O->getHeader().cputype, O->getHeader().cpusubtype)) { + return createStringError( + std::errc::invalid_argument, + ("archive member " + O->getFileName() + " cputype (" + + Twine(O->getHeader().cputype) + ") and cpusubtype(" + + Twine(O->getHeader().cpusubtype) + + ") does not match previous archive members cputype (" + + Twine(MFO->getHeader().cputype) + ") and cpusubtype(" + + Twine(MFO->getHeader().cpusubtype) + + ") (all members must match) " + MFO->getFileName()) + .str() + .c_str()); + } + if (!MFO) { + ChildOrErr.get().release(); + MFO.reset(O); + } + } else if (Bin->isIR()) { + IRObjectFile *O = cast(Bin); + if (MFO) { + return createStringError(std::errc::invalid_argument, + "archive member '%s' is an LLVM IR object, " + "while previous archive member " + "'%s' was a MachO", + O->getFileName().str().c_str(), + MFO->getFileName().str().c_str()); + } + if (IRFO) { + Expected CPUO = getMachoCPUFromTriple(O->getTargetTriple()); + Expected CPUFO = + getMachoCPUFromTriple(IRFO->getTargetTriple()); + if (!CPUO) + return CPUO.takeError(); + if (!CPUFO) + return CPUFO.takeError(); + if (*CPUO != *CPUFO) { + return createStringError( + std::errc::invalid_argument, + ("archive member " + O->getFileName() + " cputype (" + + Twine(CPUO->first) + ") and cpusubtype(" + Twine(CPUO->second) + + ") does not match previous archive members cputype (" + + Twine(CPUFO->first) + ") and cpusubtype(" + + Twine(CPUFO->second) + ") (all members must match) " + + IRFO->getFileName()) + .str() + .c_str()); + } + } else { + ChildOrErr.get().release(); + IRFO.reset(O); + } + } else + return createStringError(std::errc::invalid_argument, + ("archive member " + Bin->getFileName() + + " is neither a MachO file or an LLVM IR file " + "(not allowed in an archive)") + .str() + .c_str()); } if (Err) return 
createFileError(A->getFileName(), std::move(Err)); - if (!FO) + if (!MFO && !IRFO) return createStringError( std::errc::invalid_argument, ("empty archive with no architecture specification: " + @@ -132,9 +200,32 @@ Expected Slice::create(const Archive *A) { .str() .c_str()); - Slice ArchiveSlice = Slice(*(FO.get()), FO->is64Bit() ? 3 : 2); + if (MFO) { + Slice ArchiveSlice(*(MFO.get()), MFO->is64Bit() ? 3 : 2); + ArchiveSlice.B = A; + return ArchiveSlice; + } + + // For IR objects + Expected ArchiveSliceOrErr = Slice::create(IRFO.get(), 0); + if (!ArchiveSliceOrErr) + return createFileError(A->getFileName(), ArchiveSliceOrErr.takeError()); + auto &ArchiveSlice = ArchiveSliceOrErr.get(); ArchiveSlice.B = A; - return ArchiveSlice; + return Slice{std::move(ArchiveSlice)}; +} + +Expected Slice::create(const IRObjectFile *IRO, uint32_t Align) { + Expected CPUOrErr = getMachoCPUFromTriple(IRO->getTargetTriple()); + if (!CPUOrErr) + return CPUOrErr.takeError(); + unsigned CPUType, CPUSubType; + std::tie(CPUType, CPUSubType) = CPUOrErr.get(); + // We don't directly use the architecture name of the target triple T, as, + // for instance, thumb is treated as ARM by the MachOUniversal object. 
+ std::string ArchName( + MachOObjectFile::getArchTriple(CPUType, CPUSubType).getArchName()); + return Slice{IRO, CPUType, CPUSubType, std::move(ArchName), Align}; } static Expected> diff --git a/llvm/lib/ObjectYAML/DWARFEmitter.cpp b/llvm/lib/ObjectYAML/DWARFEmitter.cpp index ad4b31614b9d8..bbcf56cc0cec0 100644 --- a/llvm/lib/ObjectYAML/DWARFEmitter.cpp +++ b/llvm/lib/ObjectYAML/DWARFEmitter.cpp @@ -448,91 +448,113 @@ static void emitFileEntry(raw_ostream &OS, const DWARFYAML::File &File) { encodeULEB128(File.Length, OS); } +static void writeLineTableOpcode(const DWARFYAML::LineTableOpcode &Op, + uint8_t OpcodeBase, uint8_t AddrSize, + raw_ostream &OS, bool IsLittleEndian) { + writeInteger((uint8_t)Op.Opcode, OS, IsLittleEndian); + if (Op.Opcode == 0) { + encodeULEB128(Op.ExtLen, OS); + writeInteger((uint8_t)Op.SubOpcode, OS, IsLittleEndian); + switch (Op.SubOpcode) { + case dwarf::DW_LNE_set_address: + cantFail( + writeVariableSizedInteger(Op.Data, AddrSize, OS, IsLittleEndian)); + break; + case dwarf::DW_LNE_define_file: + emitFileEntry(OS, Op.FileEntry); + break; + case dwarf::DW_LNE_set_discriminator: + encodeULEB128(Op.Data, OS); + break; + case dwarf::DW_LNE_end_sequence: + break; + default: + for (auto OpByte : Op.UnknownOpcodeData) + writeInteger((uint8_t)OpByte, OS, IsLittleEndian); + } + } else if (Op.Opcode < OpcodeBase) { + switch (Op.Opcode) { + case dwarf::DW_LNS_copy: + case dwarf::DW_LNS_negate_stmt: + case dwarf::DW_LNS_set_basic_block: + case dwarf::DW_LNS_const_add_pc: + case dwarf::DW_LNS_set_prologue_end: + case dwarf::DW_LNS_set_epilogue_begin: + break; + + case dwarf::DW_LNS_advance_pc: + case dwarf::DW_LNS_set_file: + case dwarf::DW_LNS_set_column: + case dwarf::DW_LNS_set_isa: + encodeULEB128(Op.Data, OS); + break; + + case dwarf::DW_LNS_advance_line: + encodeSLEB128(Op.SData, OS); + break; + + case dwarf::DW_LNS_fixed_advance_pc: + writeInteger((uint16_t)Op.Data, OS, IsLittleEndian); + break; + + default: + for (auto OpData : 
Op.StandardOpcodeData) { + encodeULEB128(OpData, OS); + } + } + } +} + Error DWARFYAML::emitDebugLine(raw_ostream &OS, const DWARFYAML::Data &DI) { - for (const auto &LineTable : DI.DebugLines) { - writeInitialLength(LineTable.Format, LineTable.Length, OS, - DI.IsLittleEndian); - uint64_t SizeOfPrologueLength = LineTable.Format == dwarf::DWARF64 ? 8 : 4; - writeInteger((uint16_t)LineTable.Version, OS, DI.IsLittleEndian); - cantFail(writeVariableSizedInteger( - LineTable.PrologueLength, SizeOfPrologueLength, OS, DI.IsLittleEndian)); - writeInteger((uint8_t)LineTable.MinInstLength, OS, DI.IsLittleEndian); + for (const DWARFYAML::LineTable &LineTable : DI.DebugLines) { + // Buffer holds the bytes following the header_length (or prologue_length in + // DWARFv2) field to the end of the line number program itself. + std::string Buffer; + raw_string_ostream BufferOS(Buffer); + + writeInteger(LineTable.MinInstLength, BufferOS, DI.IsLittleEndian); + // TODO: Add support for emitting DWARFv5 line table. 
if (LineTable.Version >= 4) - writeInteger((uint8_t)LineTable.MaxOpsPerInst, OS, DI.IsLittleEndian); - writeInteger((uint8_t)LineTable.DefaultIsStmt, OS, DI.IsLittleEndian); - writeInteger((uint8_t)LineTable.LineBase, OS, DI.IsLittleEndian); - writeInteger((uint8_t)LineTable.LineRange, OS, DI.IsLittleEndian); - writeInteger((uint8_t)LineTable.OpcodeBase, OS, DI.IsLittleEndian); - - for (auto OpcodeLength : LineTable.StandardOpcodeLengths) - writeInteger((uint8_t)OpcodeLength, OS, DI.IsLittleEndian); - - for (auto IncludeDir : LineTable.IncludeDirs) { - OS.write(IncludeDir.data(), IncludeDir.size()); - OS.write('\0'); + writeInteger(LineTable.MaxOpsPerInst, BufferOS, DI.IsLittleEndian); + writeInteger(LineTable.DefaultIsStmt, BufferOS, DI.IsLittleEndian); + writeInteger(LineTable.LineBase, BufferOS, DI.IsLittleEndian); + writeInteger(LineTable.LineRange, BufferOS, DI.IsLittleEndian); + writeInteger(LineTable.OpcodeBase, BufferOS, DI.IsLittleEndian); + + for (uint8_t OpcodeLength : LineTable.StandardOpcodeLengths) + writeInteger(OpcodeLength, BufferOS, DI.IsLittleEndian); + + for (StringRef IncludeDir : LineTable.IncludeDirs) { + BufferOS.write(IncludeDir.data(), IncludeDir.size()); + BufferOS.write('\0'); } - OS.write('\0'); + BufferOS.write('\0'); - for (auto File : LineTable.Files) - emitFileEntry(OS, File); - OS.write('\0'); + for (const DWARFYAML::File &File : LineTable.Files) + emitFileEntry(BufferOS, File); + BufferOS.write('\0'); - uint8_t AddrSize = DI.Is64BitAddrSize ? 
8 : 4; - - for (auto Op : LineTable.Opcodes) { - writeInteger((uint8_t)Op.Opcode, OS, DI.IsLittleEndian); - if (Op.Opcode == 0) { - encodeULEB128(Op.ExtLen, OS); - writeInteger((uint8_t)Op.SubOpcode, OS, DI.IsLittleEndian); - switch (Op.SubOpcode) { - case dwarf::DW_LNE_set_address: - cantFail(writeVariableSizedInteger(Op.Data, AddrSize, OS, - DI.IsLittleEndian)); - break; - case dwarf::DW_LNE_define_file: - emitFileEntry(OS, Op.FileEntry); - break; - case dwarf::DW_LNE_set_discriminator: - encodeULEB128(Op.Data, OS); - break; - case dwarf::DW_LNE_end_sequence: - break; - default: - for (auto OpByte : Op.UnknownOpcodeData) - writeInteger((uint8_t)OpByte, OS, DI.IsLittleEndian); - } - } else if (Op.Opcode < LineTable.OpcodeBase) { - switch (Op.Opcode) { - case dwarf::DW_LNS_copy: - case dwarf::DW_LNS_negate_stmt: - case dwarf::DW_LNS_set_basic_block: - case dwarf::DW_LNS_const_add_pc: - case dwarf::DW_LNS_set_prologue_end: - case dwarf::DW_LNS_set_epilogue_begin: - break; - - case dwarf::DW_LNS_advance_pc: - case dwarf::DW_LNS_set_file: - case dwarf::DW_LNS_set_column: - case dwarf::DW_LNS_set_isa: - encodeULEB128(Op.Data, OS); - break; - - case dwarf::DW_LNS_advance_line: - encodeSLEB128(Op.SData, OS); - break; - - case dwarf::DW_LNS_fixed_advance_pc: - writeInteger((uint16_t)Op.Data, OS, DI.IsLittleEndian); - break; - - default: - for (auto OpData : Op.StandardOpcodeData) { - encodeULEB128(OpData, OS); - } - } - } + uint64_t HeaderLength = + LineTable.PrologueLength ? *LineTable.PrologueLength : Buffer.size(); + + for (const DWARFYAML::LineTableOpcode &Op : LineTable.Opcodes) + writeLineTableOpcode(Op, LineTable.OpcodeBase, DI.Is64BitAddrSize ? 8 : 4, + BufferOS, DI.IsLittleEndian); + + uint64_t Length; + if (LineTable.Length) { + Length = *LineTable.Length; + } else { + Length = 2; // sizeof(version) + Length += + (LineTable.Format == dwarf::DWARF64 ? 
8 : 4); // sizeof(header_length) + Length += Buffer.size(); } + + writeInitialLength(LineTable.Format, Length, OS, DI.IsLittleEndian); + writeInteger(LineTable.Version, OS, DI.IsLittleEndian); + writeDWARFOffset(HeaderLength, LineTable.Format, OS, DI.IsLittleEndian); + OS.write(Buffer.data(), Buffer.size()); } return Error::success(); diff --git a/llvm/lib/ObjectYAML/DWARFYAML.cpp b/llvm/lib/ObjectYAML/DWARFYAML.cpp index c55ae9e1e477c..39de3ff18f370 100644 --- a/llvm/lib/ObjectYAML/DWARFYAML.cpp +++ b/llvm/lib/ObjectYAML/DWARFYAML.cpp @@ -116,7 +116,7 @@ void MappingTraits::mapping(IO &IO, IO.mapOptional("Code", Abbrev.Code); IO.mapRequired("Tag", Abbrev.Tag); IO.mapRequired("Children", Abbrev.Children); - IO.mapRequired("Attributes", Abbrev.Attributes); + IO.mapOptional("Attributes", Abbrev.Attributes); } void MappingTraits::mapping( @@ -230,9 +230,9 @@ void MappingTraits::mapping( void MappingTraits::mapping( IO &IO, DWARFYAML::LineTable &LineTable) { IO.mapOptional("Format", LineTable.Format, dwarf::DWARF32); - IO.mapRequired("Length", LineTable.Length); + IO.mapOptional("Length", LineTable.Length); IO.mapRequired("Version", LineTable.Version); - IO.mapRequired("PrologueLength", LineTable.PrologueLength); + IO.mapOptional("PrologueLength", LineTable.PrologueLength); IO.mapRequired("MinInstLength", LineTable.MinInstLength); if(LineTable.Version >= 4) IO.mapRequired("MaxOpsPerInst", LineTable.MaxOpsPerInst); diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 23288bb1ac07f..515aaea44dea7 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -110,6 +110,7 @@ #include "llvm/Transforms/Instrumentation/DataFlowSanitizer.h" #include "llvm/Transforms/Instrumentation/GCOVProfiler.h" #include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h" +#include "llvm/Transforms/Instrumentation/HeapProfiler.h" #include "llvm/Transforms/Instrumentation/InstrOrderFile.h" #include 
"llvm/Transforms/Instrumentation/InstrProfiling.h" #include "llvm/Transforms/Instrumentation/MemorySanitizer.h" @@ -258,6 +259,10 @@ static cl::opt cl::Hidden, cl::desc("Enable inline deferral during PGO")); +static cl::opt EnableHeapProfiler("enable-heap-prof", cl::init(false), + cl::Hidden, cl::ZeroOrMore, + cl::desc("Enable heap profiler")); + PipelineTuningOptions::PipelineTuningOptions() { LoopInterleaving = true; LoopVectorization = true; @@ -1034,6 +1039,12 @@ ModulePassManager PassBuilder::buildModuleSimplificationPipeline( MPM.addPass(SyntheticCountsPropagation()); MPM.addPass(buildInlinerPipeline(Level, Phase, DebugLogging)); + + if (EnableHeapProfiler && Phase != ThinLTOPhase::PreLink) { + MPM.addPass(createModuleToFunctionPassAdaptor(HeapProfilerPass())); + MPM.addPass(ModuleHeapProfilerPass()); + } + return MPM; } diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 4bbecfeb82a95..406a41967e4ac 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -97,6 +97,7 @@ MODULE_PASS("msan-module", MemorySanitizerPass({})) MODULE_PASS("tsan-module", ThreadSanitizerPass()) MODULE_PASS("kasan-module", ModuleAddressSanitizerPass(/*CompileKernel=*/true, false, true, false)) MODULE_PASS("sancov-module", ModuleSanitizerCoveragePass()) +MODULE_PASS("heapprof-module", ModuleHeapProfilerPass()) MODULE_PASS("poison-checking", PoisonCheckingPass()) #undef MODULE_PASS @@ -276,6 +277,7 @@ FUNCTION_PASS("kasan", AddressSanitizerPass(true, false, false)) FUNCTION_PASS("msan", MemorySanitizerPass({})) FUNCTION_PASS("kmsan", MemorySanitizerPass({0, false, /*Kernel=*/true})) FUNCTION_PASS("tsan", ThreadSanitizerPass()) +FUNCTION_PASS("heapprof", HeapProfilerPass()) #undef FUNCTION_PASS #ifndef FUNCTION_PASS_WITH_PARAMS diff --git a/llvm/lib/ProfileData/SampleProf.cpp b/llvm/lib/ProfileData/SampleProf.cpp index e5d0fdba5fc4a..6e0542f6d433f 100644 --- a/llvm/lib/ProfileData/SampleProf.cpp +++ 
b/llvm/lib/ProfileData/SampleProf.cpp @@ -14,6 +14,7 @@ #include "llvm/ProfileData/SampleProf.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/ProfileData/SampleProfReader.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Error.h" @@ -174,8 +175,8 @@ unsigned FunctionSamples::getOffset(const DILocation *DIL) { 0xffff; } -const FunctionSamples * -FunctionSamples::findFunctionSamples(const DILocation *DIL) const { +const FunctionSamples *FunctionSamples::findFunctionSamples( + const DILocation *DIL, SampleProfileReaderItaniumRemapper *Remapper) const { assert(DIL); SmallVector, 10> S; @@ -190,11 +191,59 @@ FunctionSamples::findFunctionSamples(const DILocation *DIL) const { return this; const FunctionSamples *FS = this; for (int i = S.size() - 1; i >= 0 && FS != nullptr; i--) { - FS = FS->findFunctionSamplesAt(S[i].first, S[i].second); + FS = FS->findFunctionSamplesAt(S[i].first, S[i].second, Remapper); } return FS; } +void FunctionSamples::findAllNames(DenseSet &NameSet) const { + NameSet.insert(Name); + for (const auto &BS : BodySamples) + for (const auto &TS : BS.second.getCallTargets()) + NameSet.insert(TS.getKey()); + + for (const auto &CS : CallsiteSamples) { + for (const auto &NameFS : CS.second) { + NameSet.insert(NameFS.first); + NameFS.second.findAllNames(NameSet); + } + } +} + +const FunctionSamples *FunctionSamples::findFunctionSamplesAt( + const LineLocation &Loc, StringRef CalleeName, + SampleProfileReaderItaniumRemapper *Remapper) const { + std::string CalleeGUID; + CalleeName = getRepInFormat(CalleeName, UseMD5, CalleeGUID); + + auto iter = CallsiteSamples.find(Loc); + if (iter == CallsiteSamples.end()) + return nullptr; + auto FS = iter->second.find(CalleeName); + if (FS != iter->second.end()) + return &FS->second; + if (Remapper) { + if (auto NameInProfile = Remapper->lookUpNameInProfile(CalleeName)) { + auto FS = iter->second.find(*NameInProfile); + if (FS != 
iter->second.end()) + return &FS->second; + } + } + // If we cannot find exact match of the callee name, return the FS with + // the max total count. Only do this when CalleeName is not provided, + // i.e., only for indirect calls. + if (!CalleeName.empty()) + return nullptr; + uint64_t MaxTotalSamples = 0; + const FunctionSamples *R = nullptr; + for (const auto &NameFS : iter->second) + if (NameFS.second.getTotalSamples() >= MaxTotalSamples) { + MaxTotalSamples = NameFS.second.getTotalSamples(); + R = &NameFS.second; + } + return R; +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void FunctionSamples::dump() const { print(dbgs(), 0); } #endif diff --git a/llvm/lib/ProfileData/SampleProfReader.cpp b/llvm/lib/ProfileData/SampleProfReader.cpp index 03f1ac190b91c..59fae9e236f37 100644 --- a/llvm/lib/ProfileData/SampleProfReader.cpp +++ b/llvm/lib/ProfileData/SampleProfReader.cpp @@ -1291,18 +1291,22 @@ void SampleProfileReaderItaniumRemapper::applyRemapping(LLVMContext &Ctx) { } assert(Remappings && "should be initialized while creating remapper"); - for (auto &Sample : Reader.getProfiles()) - if (auto Key = Remappings->insert(Sample.first())) - SampleMap.insert({Key, &Sample.second}); + for (auto &Sample : Reader.getProfiles()) { + DenseSet NamesInSample; + Sample.second.findAllNames(NamesInSample); + for (auto &Name : NamesInSample) + if (auto Key = Remappings->insert(Name)) + NameMap.insert({Key, Name}); + } RemappingApplied = true; } -FunctionSamples * -SampleProfileReaderItaniumRemapper::getSamplesFor(StringRef Fname) { +Optional +SampleProfileReaderItaniumRemapper::lookUpNameInProfile(StringRef Fname) { if (auto Key = Remappings->lookup(Fname)) - return SampleMap.lookup(Key); - return nullptr; + return NameMap.lookup(Key); + return None; } /// Prepare a memory buffer for the contents of \p Filename. 
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt index 6d9d48a6597bc..f106ee6feabb2 100644 --- a/llvm/lib/Support/CMakeLists.txt +++ b/llvm/lib/Support/CMakeLists.txt @@ -2,19 +2,6 @@ if(LLVM_ENABLE_ZLIB) set(imported_libs ZLIB::ZLIB) endif() -function(get_system_libname libpath libname) - get_filename_component(libpath ${libpath} NAME) - if( CMAKE_FIND_LIBRARY_PREFIXES ) - string(REPLACE ";" "|" PREFIXES "${CMAKE_FIND_LIBRARY_PREFIXES}") - string(REGEX REPLACE "^(${PREFIXES})" "" libpath ${libpath}) - endif() - if( CMAKE_FIND_LIBRARY_SUFFIXES ) - string(REPLACE ";" "|" SUFFIXES "${CMAKE_FIND_LIBRARY_SUFFIXES}") - string(REGEX REPLACE "(${SUFFIXES})$" "" libpath ${libpath}) - endif() - set(${libname} "${libpath}" PARENT_SCOPE) -endfunction() - if( MSVC OR MINGW ) # libuuid required for FOLDERID_Profile usage in lib/Support/Windows/Path.inc. # advapi32 required for CryptAcquireContextW in lib/Support/Windows/Path.inc. @@ -34,8 +21,10 @@ elseif( CMAKE_HOST_UNIX ) STRING(REGEX REPLACE "^lib" "" Backtrace_LIBFILE ${Backtrace_LIBFILE}) set(system_libs ${system_libs} ${Backtrace_LIBFILE}) endif() - if( LLVM_ENABLE_TERMINFO ) - set(imported_libs ${imported_libs} "${TERMINFO_LIB}") + if(LLVM_ENABLE_TERMINFO) + if(HAVE_TERMINFO) + set(system_libs ${system_libs} ${TERMINFO_LIBS}) + endif() endif() if( LLVM_ENABLE_THREADS AND (HAVE_LIBATOMIC OR HAVE_CXX_LIBATOMICS64) ) set(system_libs ${system_libs} atomic) @@ -57,9 +46,35 @@ endif() # Link Z3 if the user wants to build it. if(LLVM_WITH_Z3) - set(Z3_LINK_FILES ${Z3_LIBRARIES}) -else() - set(Z3_LINK_FILES "") + set(system_libs ${system_libs} ${Z3_LIBRARIES}) +endif() + +# Override the C runtime allocator on Windows and embed it into LLVM tools & libraries +if(LLVM_INTEGRATED_CRT_ALLOC) + if (CMAKE_BUILD_TYPE AND NOT ${LLVM_USE_CRT_${uppercase_CMAKE_BUILD_TYPE}} MATCHES "^(MT|MTd)$") + message(FATAL_ERROR "LLVM_INTEGRATED_CRT_ALLOC only works with /MT or /MTd. 
Use LLVM_USE_CRT_${uppercase_CMAKE_BUILD_TYPE} to set the appropriate option.") + endif() + + string(REGEX REPLACE "(/|\\\\)$" "" LLVM_INTEGRATED_CRT_ALLOC "${LLVM_INTEGRATED_CRT_ALLOC}") + + if(NOT EXISTS "${LLVM_INTEGRATED_CRT_ALLOC}") + message(FATAL_ERROR "Cannot find the path to `git clone` for the CRT allocator! (${LLVM_INTEGRATED_CRT_ALLOC}). Currently, rpmalloc, snmalloc and mimalloc are supported.") + endif() + + if(LLVM_INTEGRATED_CRT_ALLOC MATCHES "rpmalloc$") + add_definitions(-DENABLE_OVERRIDE -DENABLE_PRELOAD) + set(ALLOCATOR_FILES "${LLVM_INTEGRATED_CRT_ALLOC}/rpmalloc/rpmalloc.c") + elseif(LLVM_INTEGRATED_CRT_ALLOC MATCHES "snmalloc$") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /std:c++17" PARENT_SCOPE) + set(ALLOCATOR_FILES "${LLVM_INTEGRATED_CRT_ALLOC}/src/override/malloc.cc" "${LLVM_INTEGRATED_CRT_ALLOC}/src/override/new.cc") + set(system_libs ${system_libs} "mincore.lib" "-INCLUDE:malloc") + elseif(LLVM_INTEGRATED_CRT_ALLOC MATCHES "mimalloc$") + set(MIMALLOC_LIB "${LLVM_INTEGRATED_CRT_ALLOC}/out/msvc-x64/Release/mimalloc-static.lib") + if(NOT EXISTS "${MIMALLOC_LIB}") + message(FATAL_ERROR "Cannot find the mimalloc static library. 
To build it, first apply the patch from https://github.com/microsoft/mimalloc/issues/268 then build the Release x64 target through ${LLVM_INTEGRATED_CRT_ALLOC}\\ide\\vs2019\\mimalloc.sln") + endif() + set(system_libs ${system_libs} "${MIMALLOC_LIB}" "-INCLUDE:malloc") + endif() endif() add_llvm_component_library(LLVMSupport @@ -186,6 +201,8 @@ add_llvm_component_library(LLVMSupport xxhash.cpp Z3Solver.cpp + ${ALLOCATOR_FILES} + # System Atomic.cpp DynamicLibrary.cpp @@ -209,7 +226,7 @@ add_llvm_component_library(LLVMSupport ${LLVM_MAIN_INCLUDE_DIR}/llvm/ADT ${LLVM_MAIN_INCLUDE_DIR}/llvm/Support ${Backtrace_INCLUDE_DIRS} - LINK_LIBS ${system_libs} ${imported_libs} ${delayload_flags} ${Z3_LINK_FILES} + LINK_LIBS ${system_libs} ${imported_libs} ${delayload_flags} ) set(llvm_system_libs ${system_libs}) @@ -223,15 +240,20 @@ if(LLVM_ENABLE_ZLIB) if(NOT zlib_library) get_property(zlib_library TARGET ZLIB::ZLIB PROPERTY LOCATION) endif() - get_system_libname(${zlib_library} zlib_library) + get_filename_component(zlib_library ${zlib_library} NAME) + if(CMAKE_STATIC_LIBRARY_PREFIX AND CMAKE_STATIC_LIBRARY_SUFFIX AND + zlib_library MATCHES "^${CMAKE_STATIC_LIBRARY_PREFIX}.*${CMAKE_STATIC_LIBRARY_SUFFIX}$") + STRING(REGEX REPLACE "^${CMAKE_STATIC_LIBRARY_PREFIX}" "" zlib_library ${zlib_library}) + STRING(REGEX REPLACE "${CMAKE_STATIC_LIBRARY_SUFFIX}$" "" zlib_library ${zlib_library}) + endif() + if(CMAKE_SHARED_LIBRARY_PREFIX AND CMAKE_SHARED_LIBRARY_SUFFIX AND + zlib_library MATCHES "^${CMAKE_SHARED_LIBRARY_PREFIX}.*${CMAKE_SHARED_LIBRARY_SUFFIX}$") + STRING(REGEX REPLACE "^${CMAKE_SHARED_LIBRARY_PREFIX}" "" zlib_library ${zlib_library}) + STRING(REGEX REPLACE "${CMAKE_SHARED_LIBRARY_SUFFIX}$" "" zlib_library ${zlib_library}) + endif() set(llvm_system_libs ${llvm_system_libs} "${zlib_library}") endif() -if(LLVM_ENABLE_TERMINFO) - get_system_libname(${TERMINFO_LIB} terminfo_library) - set(llvm_system_libs ${llvm_system_libs} "${terminfo_library}") -endif() - 
set_property(TARGET LLVMSupport PROPERTY LLVM_SYSTEM_LIBS "${llvm_system_libs}") if(LLVM_WITH_Z3) diff --git a/llvm/lib/Support/CrashRecoveryContext.cpp b/llvm/lib/Support/CrashRecoveryContext.cpp index ec7d7d641dce5..d4fd8216123f9 100644 --- a/llvm/lib/Support/CrashRecoveryContext.cpp +++ b/llvm/lib/Support/CrashRecoveryContext.cpp @@ -9,14 +9,12 @@ #include "llvm/Support/CrashRecoveryContext.h" #include "llvm/Config/llvm-config.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/ExitCodes.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/Signals.h" #include "llvm/Support/ThreadLocal.h" #include #include -#if LLVM_ON_UNIX -#include // EX_IOERR -#endif using namespace llvm; diff --git a/llvm/lib/Support/FileCheck.cpp b/llvm/lib/Support/FileCheck.cpp index 137eea0a65a64..59d8b6bbc2173 100644 --- a/llvm/lib/Support/FileCheck.cpp +++ b/llvm/lib/Support/FileCheck.cpp @@ -2197,6 +2197,7 @@ bool FileCheckString::CheckNot(const SourceMgr &SM, StringRef Buffer, const std::vector &NotStrings, const FileCheckRequest &Req, std::vector *Diags) const { + bool DirectiveFail = false; for (const Pattern *Pat : NotStrings) { assert((Pat->getCheckTy() == Check::CheckNot) && "Expect CHECK-NOT!"); @@ -2212,11 +2213,11 @@ bool FileCheckString::CheckNot(const SourceMgr &SM, StringRef Buffer, PrintMatch(false, SM, Prefix, Pat->getLoc(), *Pat, 1, Buffer, Pos, MatchLen, Req, Diags); - - return true; + DirectiveFail = true; + continue; } - return false; + return DirectiveFail; } size_t FileCheckString::CheckDag(const SourceMgr &SM, StringRef Buffer, diff --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp index bc06a7858f5b0..26534580d02d3 100644 --- a/llvm/lib/Support/Host.cpp +++ b/llvm/lib/Support/Host.cpp @@ -730,6 +730,13 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model, *Subtype = X86::INTEL_COREI7_ICELAKE_SERVER; break; + // Sapphire Rapids: + case 0x8f: + CPU = "sapphirerapids"; + *Type = X86::INTEL_COREI7; + *Subtype = 
X86::INTEL_COREI7_SAPPHIRERAPIDS; + break; + case 0x1c: // Most 45 nm Intel Atom processors case 0x26: // 45 nm Atom Lincroft case 0x27: // 32 nm Atom Medfield diff --git a/llvm/lib/Support/Unix/Process.inc b/llvm/lib/Support/Unix/Process.inc index 7425d084da27a..24f16b51af7be 100644 --- a/llvm/lib/Support/Unix/Process.inc +++ b/llvm/lib/Support/Unix/Process.inc @@ -313,7 +313,7 @@ unsigned Process::StandardErrColumns() { return getColumns(); } -#ifdef LLVM_ENABLE_TERMINFO +#ifdef HAVE_TERMINFO // We manually declare these extern functions because finding the correct // headers from various terminfo, curses, or other sources is harder than // writing their specs down. @@ -323,12 +323,12 @@ extern "C" int del_curterm(struct term *termp); extern "C" int tigetnum(char *capname); #endif -#ifdef LLVM_ENABLE_TERMINFO +#ifdef HAVE_TERMINFO static ManagedStatic TermColorMutex; #endif static bool terminalHasColors(int fd) { -#ifdef LLVM_ENABLE_TERMINFO +#ifdef HAVE_TERMINFO // First, acquire a global lock because these C routines are thread hostile. std::lock_guard G(*TermColorMutex); diff --git a/llvm/lib/Support/Unix/Signals.inc b/llvm/lib/Support/Unix/Signals.inc index ce1fccf0b4271..50b2ad4b57728 100644 --- a/llvm/lib/Support/Unix/Signals.inc +++ b/llvm/lib/Support/Unix/Signals.inc @@ -36,6 +36,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/Config/config.h" #include "llvm/Demangle/Demangle.h" +#include "llvm/Support/ExitCodes.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/FileUtilities.h" #include "llvm/Support/Format.h" @@ -46,7 +47,6 @@ #include "llvm/Support/raw_ostream.h" #include #include -#include #ifdef HAVE_BACKTRACE # include BACKTRACE_HEADER // For backtrace(). #endif @@ -553,7 +553,7 @@ static int unwindBacktrace(void **StackTrace, int MaxEntries) { // // On glibc systems we have the 'backtrace' function, which works nicely, but // doesn't demangle symbols. 
-void llvm::sys::PrintStackTrace(raw_ostream &OS) { +void llvm::sys::PrintStackTrace(raw_ostream &OS, int Depth) { #if ENABLE_BACKTRACES static void *StackTrace[256]; int depth = 0; @@ -570,8 +570,11 @@ void llvm::sys::PrintStackTrace(raw_ostream &OS) { #endif if (!depth) return; - - if (printSymbolizedStackTrace(Argv0, StackTrace, depth, OS)) + // If "Depth" is not provided by the caller, use the return value of + // backtrace() for printing a symbolized stack trace. + if (!Depth) + Depth = depth; + if (printSymbolizedStackTrace(Argv0, StackTrace, Depth, OS)) return; #if HAVE_DLFCN_H && HAVE_DLADDR int width = 0; @@ -614,7 +617,7 @@ void llvm::sys::PrintStackTrace(raw_ostream &OS) { OS << '\n'; } #elif defined(HAVE_BACKTRACE) - backtrace_symbols_fd(StackTrace, depth, STDERR_FILENO); + backtrace_symbols_fd(StackTrace, Depth, STDERR_FILENO); #endif #endif } diff --git a/llvm/lib/Support/Windows/Path.inc b/llvm/lib/Support/Windows/Path.inc index 0115161636c4b..399b054d36bd2 100644 --- a/llvm/lib/Support/Windows/Path.inc +++ b/llvm/lib/Support/Windows/Path.inc @@ -19,7 +19,6 @@ #include "llvm/Support/ConvertUTF.h" #include "llvm/Support/WindowsError.h" #include -#include #include #include @@ -352,13 +351,13 @@ std::error_code is_local(const Twine &path, bool &result) { static std::error_code realPathFromHandle(HANDLE H, SmallVectorImpl &Buffer) { DWORD CountChars = ::GetFinalPathNameByHandleW( - H, Buffer.begin(), Buffer.capacity() - 1, FILE_NAME_NORMALIZED); - if (CountChars > Buffer.capacity()) { + H, Buffer.begin(), Buffer.capacity(), FILE_NAME_NORMALIZED); + if (CountChars && CountChars >= Buffer.capacity()) { // The buffer wasn't big enough, try again. In this case the return value // *does* indicate the size of the null terminator. 
Buffer.reserve(CountChars); CountChars = ::GetFinalPathNameByHandleW( - H, Buffer.data(), Buffer.capacity() - 1, FILE_NAME_NORMALIZED); + H, Buffer.begin(), Buffer.capacity(), FILE_NAME_NORMALIZED); } if (CountChars == 0) return mapWindowsError(GetLastError()); diff --git a/llvm/lib/Support/Windows/Signals.inc b/llvm/lib/Support/Windows/Signals.inc index 0c3681fa96548..71dc6324e99f2 100644 --- a/llvm/lib/Support/Windows/Signals.inc +++ b/llvm/lib/Support/Windows/Signals.inc @@ -552,7 +552,8 @@ static void LocalPrintStackTrace(raw_ostream &OS, PCONTEXT C) { StackFrame, C); } -void llvm::sys::PrintStackTrace(raw_ostream &OS) { +void llvm::sys::PrintStackTrace(raw_ostream &OS, int Depth) { + // FIXME: Handle "Depth" parameter to print stack trace upto specified Depth LocalPrintStackTrace(OS, nullptr); } diff --git a/llvm/lib/Support/X86TargetParser.cpp b/llvm/lib/Support/X86TargetParser.cpp index 680ec91dc8efc..a5af98582452b 100644 --- a/llvm/lib/Support/X86TargetParser.cpp +++ b/llvm/lib/Support/X86TargetParser.cpp @@ -195,6 +195,11 @@ static constexpr FeatureBitset FeaturesICLServer = static constexpr FeatureBitset FeaturesTigerlake = FeaturesICLClient | FeatureAVX512VP2INTERSECT | FeatureMOVDIR64B | FeatureMOVDIRI | FeatureSHSTK; +static constexpr FeatureBitset FeaturesSapphireRapids = + FeaturesICLServer | FeatureAMX_TILE | FeatureAMX_INT8 | FeatureAMX_BF16 | + FeatureAVX512BF16 | FeatureAVX512VP2INTERSECT | FeatureCLDEMOTE | FeatureENQCMD | + FeatureMOVDIR64B | FeatureMOVDIRI | FeaturePTWRITE | FeatureSERIALIZE | + FeatureSHSTK | FeatureTSXLDTRK | FeatureWAITPKG; // Intel Atom processors. // Bonnell has feature parity with Core2 and adds MOVBE. @@ -342,6 +347,8 @@ static constexpr ProcInfo Processors[] = { { {"icelake-server"}, CK_IcelakeServer, FEATURE_AVX512VBMI2, FeaturesICLServer }, // Tigerlake microarchitecture based processors. 
{ {"tigerlake"}, CK_Tigerlake, FEATURE_AVX512VP2INTERSECT, FeaturesTigerlake }, + // Sapphire Rapids microarchitecture based processors. + { {"sapphirerapids"}, CK_SapphireRapids, FEATURE_AVX512VP2INTERSECT, FeaturesSapphireRapids }, // Knights Landing processor. { {"knl"}, CK_KNL, FEATURE_AVX512F, FeaturesKNL }, // Knights Mill processor. diff --git a/llvm/lib/Support/raw_ostream.cpp b/llvm/lib/Support/raw_ostream.cpp index 86c48993957a0..83050c8574d9d 100644 --- a/llvm/lib/Support/raw_ostream.cpp +++ b/llvm/lib/Support/raw_ostream.cpp @@ -858,7 +858,9 @@ bool raw_fd_ostream::is_displayed() const { } bool raw_fd_ostream::has_colors() const { - return sys::Process::FileDescriptorHasColors(FD); + if (!HasColors) + HasColors = sys::Process::FileDescriptorHasColors(FD); + return *HasColors; } Expected raw_fd_ostream::lock() { diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 83653dcbb8cf7..751791bdb354a 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -1024,10 +1024,6 @@ static bool needsWinCFI(const MachineFunction &MF) { F.needsUnwindTableEntry(); } -static bool isTargetDarwin(const MachineFunction &MF) { - return MF.getSubtarget().isTargetDarwin(); -} - static bool isTargetWindows(const MachineFunction &MF) { return MF.getSubtarget().isTargetWindows(); } @@ -1185,7 +1181,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // For funclets the FP belongs to the containing function. if (!IsFunclet && HasFP) { // Only set up FP if we actually need to. - int64_t FPOffset = isTargetDarwin(MF) ? 
(AFI->getCalleeSavedStackSize() - 16) : 0; + int64_t FPOffset = AFI->getCalleeSaveBaseToFrameRecordOffset(); if (CombineSPBump) FPOffset += AFI->getLocalStackSize(); @@ -1409,11 +1405,6 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, } if (needsFrameMoves) { - const DataLayout &TD = MF.getDataLayout(); - const int StackGrowth = isTargetDarwin(MF) - ? (2 * -TD.getPointerSize(0)) - : -AFI->getCalleeSavedStackSize(); - Register FramePtr = RegInfo->getFrameRegister(MF); // An example of the prologue: // // .globl __foo @@ -1481,10 +1472,15 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // .cfi_offset w28, -32 if (HasFP) { + const int OffsetToFirstCalleeSaveFromFP = + AFI->getCalleeSaveBaseToFrameRecordOffset() - + AFI->getCalleeSavedStackSize(); + Register FramePtr = RegInfo->getFrameRegister(MF); + // Define the current CFA rule to use the provided FP. unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true); unsigned CFIIndex = MF.addFrameInst( - MCCFIInstruction::cfiDefCfa(nullptr, Reg, FixedObject - StackGrowth)); + MCCFIInstruction::cfiDefCfa(nullptr, Reg, FixedObject - OffsetToFirstCalleeSaveFromFP)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); @@ -1694,11 +1690,10 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize; MachineBasicBlock::iterator RestoreBegin = LastPopI, RestoreEnd = LastPopI; if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) { - RestoreBegin = std::prev(RestoreEnd);; - while (IsSVECalleeSave(RestoreBegin) && - RestoreBegin != MBB.begin()) + RestoreBegin = std::prev(RestoreEnd); + while (RestoreBegin != MBB.begin() && + IsSVECalleeSave(std::prev(RestoreBegin))) --RestoreBegin; - ++RestoreBegin; assert(IsSVECalleeSave(RestoreBegin) && IsSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction"); @@ -1776,10 +1771,8 @@ void 
AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // non-post-indexed loads for the restores if we aren't actually going to // be able to save any instructions. if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) { - int64_t OffsetToFrameRecord = - isTargetDarwin(MF) ? (-(int64_t)AFI->getCalleeSavedStackSize() + 16) : 0; emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP, - {OffsetToFrameRecord, MVT::i8}, + {-AFI->getCalleeSaveBaseToFrameRecordOffset(), MVT::i8}, TII, MachineInstr::FrameDestroy, false, NeedsWinCFI); } else if (NumBytes) emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, @@ -1840,11 +1833,11 @@ static StackOffset getFPOffset(const MachineFunction &MF, int64_t ObjectOffset) const auto &Subtarget = MF.getSubtarget(); bool IsWin64 = Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()); - unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, /*IsFunclet=*/false); - unsigned FPAdjust = isTargetDarwin(MF) - ? 16 : AFI->getCalleeSavedStackSize(MF.getFrameInfo()); + int64_t CalleeSaveSize = AFI->getCalleeSavedStackSize(MF.getFrameInfo()); + int64_t FPAdjust = + CalleeSaveSize - AFI->getCalleeSaveBaseToFrameRecordOffset(); return {ObjectOffset + FixedObject + FPAdjust, MVT::i8}; } @@ -2232,6 +2225,14 @@ static void computeCalleeSaveRegisterPairs( (RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) && "Offset out of bounds for LDP/STP immediate"); + // Save the offset to frame record so that the FP register can point to the + // innermost frame record (spilled FP and LR registers). 
+ if (NeedsFrameRecord && ((!IsWindows && RPI.Reg1 == AArch64::LR && + RPI.Reg2 == AArch64::FP) || + (IsWindows && RPI.Reg1 == AArch64::FP && + RPI.Reg2 == AArch64::LR))) + AFI->setCalleeSaveBaseToFrameRecordOffset(Offset); + RegPairs.push_back(RPI); if (RPI.isPaired()) ++i; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index da4ca30d9cbde..b04d245ac7a0c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -135,8 +135,10 @@ static bool isMergePassthruOpcode(unsigned Opc) { default: return false; case AArch64ISD::DUP_MERGE_PASSTHRU: + case AArch64ISD::FNEG_MERGE_PASSTHRU: case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU: case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU: + case AArch64ISD::FCEIL_MERGE_PASSTHRU: return true; } } @@ -969,7 +971,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FDIV, VT, Custom); setOperationAction(ISD::FMA, VT, Custom); setOperationAction(ISD::FMUL, VT, Custom); + setOperationAction(ISD::FNEG, VT, Custom); setOperationAction(ISD::FSUB, VT, Custom); + setOperationAction(ISD::FCEIL, VT, Custom); } } @@ -1471,8 +1475,10 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::UDIV_PRED) MAKE_CASE(AArch64ISD::UMAX_PRED) MAKE_CASE(AArch64ISD::UMIN_PRED) + MAKE_CASE(AArch64ISD::FNEG_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::FCEIL_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::SETCC_MERGE_ZERO) MAKE_CASE(AArch64ISD::ADC) MAKE_CASE(AArch64ISD::SBC) @@ -3331,6 +3337,12 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::aarch64_sve_convert_from_svbool: return DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, Op.getValueType(), Op.getOperand(1)); + case Intrinsic::aarch64_sve_fneg: + return 
DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); + case Intrinsic::aarch64_sve_frintp: + return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); case Intrinsic::aarch64_sve_convert_to_svbool: { EVT OutVT = Op.getValueType(); EVT InVT = Op.getOperand(1).getValueType(); @@ -3625,6 +3637,10 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, if (Op.getValueType() == MVT::f128) return LowerF128Call(Op, DAG, RTLIB::DIV_F128); return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED); + case ISD::FNEG: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU); + case ISD::FCEIL: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU); case ISD::FP_ROUND: case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG); @@ -3726,9 +3742,17 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: return LowerFixedLengthVectorIntExtendToSVE(Op, DAG); - case ISD::SIGN_EXTEND_INREG: + case ISD::SIGN_EXTEND_INREG: { + // Only custom lower when ExtraVT has a legal byte based element type. + EVT ExtraVT = cast(Op.getOperand(1))->getVT(); + EVT ExtraEltVT = ExtraVT.getVectorElementType(); + if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) && + (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64)) + return SDValue(); + return LowerToPredicatedOp(Op, DAG, AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU); + } case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); case ISD::LOAD: @@ -4201,9 +4225,7 @@ SDValue AArch64TargetLowering::LowerCallResult( const SmallVectorImpl &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl &InVals, bool isThisReturn, SDValue ThisVal) const { - CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS - ? 
RetCC_AArch64_WebKit_JS - : RetCC_AArch64_AAPCS; + CCAssignFn *RetCC = CCAssignFnForReturn(CallConv); // Assign locations to each value returned by this call. SmallVector RVLocs; DenseMap CopiedRegs; @@ -4913,9 +4935,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, bool AArch64TargetLowering::CanLowerReturn( CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl &Outs, LLVMContext &Context) const { - CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS - ? RetCC_AArch64_WebKit_JS - : RetCC_AArch64_AAPCS; + CCAssignFn *RetCC = CCAssignFnForReturn(CallConv); SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC); @@ -4930,9 +4950,7 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, auto &MF = DAG.getMachineFunction(); auto *FuncInfo = MF.getInfo(); - CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS - ? RetCC_AArch64_WebKit_JS - : RetCC_AArch64_AAPCS; + CCAssignFn *RetCC = CCAssignFnForReturn(CallConv); SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); @@ -7363,6 +7381,81 @@ static bool isSingletonEXTMask(ArrayRef M, EVT VT, unsigned &Imm) { return true; } +/// Check if a vector shuffle corresponds to a DUP instructions with a larger +/// element width than the vector lane type. 
If that is the case the function +/// returns true and writes the value of the DUP instruction lane operand into +/// DupLaneOp +static bool isWideDUPMask(ArrayRef M, EVT VT, unsigned BlockSize, + unsigned &DupLaneOp) { + assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) && + "Only possible block sizes for wide DUP are: 16, 32, 64"); + + if (BlockSize <= VT.getScalarSizeInBits()) + return false; + if (BlockSize % VT.getScalarSizeInBits() != 0) + return false; + if (VT.getSizeInBits() % BlockSize != 0) + return false; + + size_t SingleVecNumElements = VT.getVectorNumElements(); + size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits(); + size_t NumBlocks = VT.getSizeInBits() / BlockSize; + + // We are looking for masks like + // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element + // might be replaced by 'undefined'. BlockIndices will eventually contain + // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7] + // for the above examples) + SmallVector BlockElts(NumEltsPerBlock, -1); + for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++) + for (size_t I = 0; I < NumEltsPerBlock; I++) { + int Elt = M[BlockIndex * NumEltsPerBlock + I]; + if (Elt < 0) + continue; + // For now we don't support shuffles that use the second operand + if ((unsigned)Elt >= SingleVecNumElements) + return false; + if (BlockElts[I] < 0) + BlockElts[I] = Elt; + else if (BlockElts[I] != Elt) + return false; + } + + // We found a candidate block (possibly with some undefs). It must be a + // sequence of consecutive integers starting with a value divisible by + // NumEltsPerBlock with some values possibly replaced by undef-s. + + // Find first non-undef element + auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; }); + assert(FirstRealEltIter != BlockElts.end() && + "Shuffle with all-undefs must have been caught by previous cases, " + "e.g. 
isSplat()"); + if (FirstRealEltIter == BlockElts.end()) { + DupLaneOp = 0; + return true; + } + + // Index of FirstRealElt in BlockElts + size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin(); + + if ((unsigned)*FirstRealEltIter < FirstRealIndex) + return false; + // BlockElts[0] must have the following value if it isn't undef: + size_t Elt0 = *FirstRealEltIter - FirstRealIndex; + + // Check the first element + if (Elt0 % NumEltsPerBlock != 0) + return false; + // Check that the sequence indeed consists of consecutive integers (modulo + // undefs) + for (size_t I = 0; I < NumEltsPerBlock; I++) + if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I) + return false; + + DupLaneOp = Elt0 / NumEltsPerBlock; + return true; +} + // check if an EXT instruction can handle the shuffle mask when the // vector sources of the shuffle are different. static bool isEXTMask(ArrayRef M, EVT VT, bool &ReverseEXT, @@ -7796,6 +7889,60 @@ static unsigned getDUPLANEOp(EVT EltType) { llvm_unreachable("Invalid vector element type?"); } +static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT, + unsigned Opcode, SelectionDAG &DAG) { + // Try to eliminate a bitcasted extract subvector before a DUPLANE. + auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) { + // Match: dup (bitcast (extract_subv X, C)), LaneC + if (BitCast.getOpcode() != ISD::BITCAST || + BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR) + return false; + + // The extract index must align in the destination type. That may not + // happen if the bitcast is from narrow to wide type. 
+ SDValue Extract = BitCast.getOperand(0); + unsigned ExtIdx = Extract.getConstantOperandVal(1); + unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits(); + unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth; + unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits(); + if (ExtIdxInBits % CastedEltBitWidth != 0) + return false; + + // Update the lane value by offsetting with the scaled extract index. + LaneC += ExtIdxInBits / CastedEltBitWidth; + + // Determine the casted vector type of the wide vector input. + // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC' + // Examples: + // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3 + // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5 + unsigned SrcVecNumElts = + Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth; + CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(), + SrcVecNumElts); + return true; + }; + MVT CastVT; + if (getScaledOffsetDup(V, Lane, CastVT)) { + V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0)); + } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR) { + // The lane is incremented by the index of the extract. + // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3 + Lane += V.getConstantOperandVal(1); + V = V.getOperand(0); + } else if (V.getOpcode() == ISD::CONCAT_VECTORS) { + // The lane is decremented if we are splatting from the 2nd operand. + // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1 + unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2; + Lane -= Idx * VT.getVectorNumElements() / 2; + V = WidenVector(V.getOperand(Idx), DAG); + } else if (VT.getSizeInBits() == 64) { + // Widen the operand to 128-bit register with undef. 
+ V = WidenVector(V, DAG); + } + return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64)); +} + SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); @@ -7829,57 +7976,26 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, // Otherwise, duplicate from the lane of the input vector. unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType()); - - // Try to eliminate a bitcasted extract subvector before a DUPLANE. - auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) { - // Match: dup (bitcast (extract_subv X, C)), LaneC - if (BitCast.getOpcode() != ISD::BITCAST || - BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR) - return false; - - // The extract index must align in the destination type. That may not - // happen if the bitcast is from narrow to wide type. - SDValue Extract = BitCast.getOperand(0); - unsigned ExtIdx = Extract.getConstantOperandVal(1); - unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits(); - unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth; - unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits(); - if (ExtIdxInBits % CastedEltBitWidth != 0) - return false; - - // Update the lane value by offsetting with the scaled extract index. - LaneC += ExtIdxInBits / CastedEltBitWidth; - - // Determine the casted vector type of the wide vector input. 
- // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC' - // Examples: - // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3 - // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5 - unsigned SrcVecNumElts = - Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth; - CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(), - SrcVecNumElts); - return true; - }; - MVT CastVT; - if (getScaledOffsetDup(V1, Lane, CastVT)) { - V1 = DAG.getBitcast(CastVT, V1.getOperand(0).getOperand(0)); - } else if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) { - // The lane is incremented by the index of the extract. - // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3 - Lane += V1.getConstantOperandVal(1); - V1 = V1.getOperand(0); - } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) { - // The lane is decremented if we are splatting from the 2nd operand. - // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1 - unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2; - Lane -= Idx * VT.getVectorNumElements() / 2; - V1 = WidenVector(V1.getOperand(Idx), DAG); - } else if (VT.getSizeInBits() == 64) { - // Widen the operand to 128-bit register with undef. - V1 = WidenVector(V1, DAG); - } - return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, dl, MVT::i64)); + return constructDup(V1, Lane, dl, VT, Opcode, DAG); + } + + // Check if the mask matches a DUP for a wider element + for (unsigned LaneSize : {64U, 32U, 16U}) { + unsigned Lane = 0; + if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) { + unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64 + : LaneSize == 32 ? 
AArch64ISD::DUPLANE32 + : AArch64ISD::DUPLANE16; + // Cast V1 to an integer vector with required lane size + MVT NewEltTy = MVT::getIntegerVT(LaneSize); + unsigned NewEltCount = VT.getSizeInBits() / LaneSize; + MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount); + V1 = DAG.getBitcast(NewVecTy, V1); + // Constuct the DUP instruction + V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG); + // Cast back to the original type + return DAG.getBitcast(VT, V1); + } } if (isREVMask(ShuffleMask, VT, 64)) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index e550713ed5a00..38caa6a481418 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -95,6 +95,8 @@ enum NodeType : unsigned { // Predicated instructions with the result of inactive lanes provided by the // last operand. + FCEIL_MERGE_PASSTHRU, + FNEG_MERGE_PASSTHRU, SIGN_EXTEND_INREG_MERGE_PASSTHRU, ZERO_EXTEND_INREG_MERGE_PASSTHRU, diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 088c129bc5f3c..25d478ebfc055 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -7841,9 +7841,9 @@ class BaseSIMDThreeSameVectorBFDot { def v4bf16 : BaseSIMDThreeSameVectorBFDot<0, U, asm, ".2s", ".4h", V64, - v2f32, v8i8>; + v2f32, v4bf16>; def v8bf16 : BaseSIMDThreeSameVectorBFDot<1, U, asm, ".4s", ".8h", V128, - v4f32, v16i8>; + v4f32, v8bf16>; } class BaseSIMDThreeSameVectorBF16DotI { + VectorIndexS:$idx)))))))]> { bits<2> idx; let Inst{21} = idx{0}; // L @@ -7871,16 +7871,16 @@ class BaseSIMDThreeSameVectorBF16DotI { def v4bf16 : BaseSIMDThreeSameVectorBF16DotI<0, U, asm, ".2s", ".4h", - ".2h", V64, v2f32, v8i8>; + ".2h", V64, v2f32, v4bf16>; def v8bf16 : BaseSIMDThreeSameVectorBF16DotI<1, U, asm, ".4s", ".8h", - ".2h", V128, v4f32, v16i8>; + ".2h", V128, v4f32, v8bf16>; } class 
SIMDBF16MLAL : BaseSIMDThreeSameVectorTied { + (v8bf16 V128:$Rn), + (v8bf16 V128:$Rm)))]> { let AsmString = !strconcat(asm, "{\t$Rd.4s, $Rn.8h, $Rm.8h}"); } @@ -7890,10 +7890,10 @@ class SIMDBF16MLALIndex "{\t$Rd.4s, $Rn.8h, $Rm.h$idx}", "$Rd = $dst", [(set (v4f32 V128:$dst), (v4f32 (OpNode (v4f32 V128:$Rd), - (v16i8 V128:$Rn), - (v16i8 (bitconvert (v8bf16 + (v8bf16 V128:$Rn), + (v8bf16 (AArch64duplane16 (v8bf16 V128_lo:$Rm), - VectorIndexH:$idx)))))))]>, + VectorIndexH:$idx)))))]>, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; @@ -7917,8 +7917,8 @@ class SIMDThreeSameVectorBF16MatrixMul V128, asm, ".4s", [(set (v4f32 V128:$dst), (int_aarch64_neon_bfmmla (v4f32 V128:$Rd), - (v16i8 V128:$Rn), - (v16i8 V128:$Rm)))]> { + (v8bf16 V128:$Rn), + (v8bf16 V128:$Rm)))]> { let AsmString = !strconcat(asm, "{\t$Rd", ".4s", ", $Rn", ".8h", ", $Rm", ".8h", "}"); } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 86aacedebdfe5..9e37d0292e7a7 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -3442,8 +3442,8 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB, // First emit non-scalable frame offsets, or a simple 'mov'. if (Bytes || (!Offset && SrcReg != DestReg)) { - assert((DestReg != AArch64::SP || Bytes % 16 == 0) && - "SP increment/decrement not 16-byte aligned"); + assert((DestReg != AArch64::SP || Bytes % 8 == 0) && + "SP increment/decrement not 8-byte aligned"); unsigned Opc = SetNZCV ? 
AArch64::ADDSXri : AArch64::ADDXri; if (Bytes < 0) { Bytes = -Bytes; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 39e1ee3ad8c18..85cb230517433 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -798,6 +798,23 @@ def BFMLALTIdx : SIMDBF16MLALIndex<1, "bfmlalt", int_aarch64_neon_bfmlalt>; def BFCVTN : SIMD_BFCVTN; def BFCVTN2 : SIMD_BFCVTN2; def BFCVT : BF16ToSinglePrecision<"bfcvt">; + +// Vector-scalar BFDOT: +// The second source operand of the 64-bit variant of BF16DOTlane is a 128-bit +// register (the instruction uses a single 32-bit lane from it), so the pattern +// is a bit tricky. +def : Pat<(v2f32 (int_aarch64_neon_bfdot + (v2f32 V64:$Rd), (v4bf16 V64:$Rn), + (v4bf16 (bitconvert + (v2i32 (AArch64duplane32 + (v4i32 (bitconvert + (v8bf16 (insert_subvector undef, + (v4bf16 V64:$Rm), + (i64 0))))), + VectorIndexS:$idx)))))), + (BF16DOTlanev4bf16 (v2f32 V64:$Rd), (v4bf16 V64:$Rn), + (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), + VectorIndexS:$idx)>; } // ARMv8.6A AArch64 matrix multiplication diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 84aa53f2bece1..9562269336d8d 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -135,6 +135,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// e.g. Tail Call, Thunk, or Function if none apply. Optional OutliningStyle; + // Offset from SP-after-callee-saved-spills (i.e. SP-at-entry minus + // CalleeSavedStackSize) to the address of the frame record. 
+ int CalleeSaveBaseToFrameRecordOffset = 0; + public: AArch64FunctionInfo() = default; @@ -338,6 +342,13 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { TaggedBasePointerOffset = Offset; } + int getCalleeSaveBaseToFrameRecordOffset() const { + return CalleeSaveBaseToFrameRecordOffset; + } + void setCalleeSaveBaseToFrameRecordOffset(int Offset) { + CalleeSaveBaseToFrameRecordOffset = Offset; + } + private: // Hold the lists of LOHs. MILOHContainer LOHContainerSet; diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 027db639cb6a4..eadf23dc46225 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -199,8 +199,10 @@ def SDT_AArch64IntExtend : SDTypeProfile<1, 4, [ ]>; // Predicated operations with the result of inactive lanes provided by the last operand. -def AArch64sxt_mt : SDNode<"AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU", SDT_AArch64IntExtend>; -def AArch64uxt_mt : SDNode<"AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU", SDT_AArch64IntExtend>; +def AArch64fneg_mt : SDNode<"AArch64ISD::FNEG_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64sxt_mt : SDNode<"AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU", SDT_AArch64IntExtend>; +def AArch64uxt_mt : SDNode<"AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU", SDT_AArch64IntExtend>; +def AArch64frintp_mt : SDNode<"AArch64ISD::FCEIL_MERGE_PASSTHRU", SDT_AArch64Arith>; def SDT_AArch64ReduceWithInit : SDTypeProfile<1, 3, [SDTCisVec<1>, SDTCisVec<3>]>; def AArch64clasta_n : SDNode<"AArch64ISD::CLASTA_N", SDT_AArch64ReduceWithInit>; @@ -349,8 +351,8 @@ let Predicates = [HasSVE] in { defm CNOT_ZPmZ : sve_int_un_pred_arit_1< 0b011, "cnot", int_aarch64_sve_cnot>; defm NOT_ZPmZ : sve_int_un_pred_arit_1< 0b110, "not", int_aarch64_sve_not>; - defm FABS_ZPmZ : sve_int_un_pred_arit_1_fp<0b100, "fabs", int_aarch64_sve_fabs>; - defm FNEG_ZPmZ : sve_int_un_pred_arit_1_fp<0b101, "fneg", 
int_aarch64_sve_fneg>; + defm FABS_ZPmZ : sve_int_un_pred_arit_1_fp<0b100, "fabs", int_aarch64_sve_fabs, null_frag>; + defm FNEG_ZPmZ : sve_int_un_pred_arit_1_fp<0b101, "fneg", null_frag, AArch64fneg_mt>; defm SMAX_ZPmZ : sve_int_bin_pred_arit_1<0b000, "smax", "SMAX_ZPZZ", int_aarch64_sve_smax, DestructiveBinaryComm>; defm UMAX_ZPmZ : sve_int_bin_pred_arit_1<0b001, "umax", "UMAX_ZPZZ", int_aarch64_sve_umax, DestructiveBinaryComm>; @@ -1415,7 +1417,7 @@ multiclass sve_prefetch; defm FRINTN_ZPmZ : sve_fp_2op_p_zd_HSD<0b00000, "frintn", int_aarch64_sve_frintn>; - defm FRINTP_ZPmZ : sve_fp_2op_p_zd_HSD<0b00001, "frintp", int_aarch64_sve_frintp>; + defm FRINTP_ZPmZ : sve_fp_2op_p_zd_HSD<0b00001, "frintp", int_aarch64_sve_frintp, AArch64frintp_mt>; defm FRINTM_ZPmZ : sve_fp_2op_p_zd_HSD<0b00010, "frintm", int_aarch64_sve_frintm>; defm FRINTZ_ZPmZ : sve_fp_2op_p_zd_HSD<0b00011, "frintz", int_aarch64_sve_frintz>; defm FRINTA_ZPmZ : sve_fp_2op_p_zd_HSD<0b00100, "frinta", int_aarch64_sve_frinta>; diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index b8d375737e7ea..2fe43f9e80e0d 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -2071,14 +2071,14 @@ void AArch64Operand::print(raw_ostream &OS) const { case k_PSBHint: OS << getPSBHintName(); break; + case k_BTIHint: + OS << getBTIHintName(); + break; case k_Register: OS << ""; if (!getShiftExtendAmount() && !hasShiftExtendAmount()) break; LLVM_FALLTHROUGH; - case k_BTIHint: - OS << getBTIHintName(); - break; case k_ShiftExtend: OS << "<" << AArch64_AM::getShiftExtendName(getShiftExtendType()) << " #" << getShiftExtendAmount(); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 063c451440dc1..77e5f374c1af0 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ 
b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -622,6 +622,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) getActionDefinitionsBuilder(G_DYN_STACKALLOC).lower(); + getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE, G_MEMSET}).libcall(); + computeTables(); verify(*ST.getInstrInfo()); } @@ -706,19 +708,6 @@ bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(MachineInstr &MI, bool AArch64LegalizerInfo::legalizeIntrinsic( LegalizerHelper &Helper, MachineInstr &MI) const { - MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; - switch (MI.getIntrinsicID()) { - case Intrinsic::memcpy: - case Intrinsic::memset: - case Intrinsic::memmove: - if (createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI) == - LegalizerHelper::UnableToLegalize) - return false; - MI.eraseFromParent(); - return true; - default: - break; - } return true; } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp index 9a1f200d52222..595e12237747d 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp @@ -96,24 +96,6 @@ bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, CombinerHelper Helper(Observer, B, KB, MDT); AArch64GenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper); - switch (MI.getOpcode()) { - case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: - switch (MI.getIntrinsicID()) { - case Intrinsic::memcpy: - case Intrinsic::memmove: - case Intrinsic::memset: { - // If we're at -O0 set a maxlen of 32 to inline, otherwise let the other - // heuristics decide. - unsigned MaxLen = EnableOpt ? 0 : 32; - // Try to inline memcpy type calls if optimizations are enabled. - return (!EnableMinSize) ? 
Helper.tryCombineMemCpyFamily(MI, MaxLen) - : false; - } - default: - break; - } - } - if (Generated.tryCombineAll(Observer, MI, B)) return true; @@ -122,6 +104,15 @@ bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, return Helper.tryCombineConcatVectors(MI); case TargetOpcode::G_SHUFFLE_VECTOR: return Helper.tryCombineShuffleVector(MI); + case TargetOpcode::G_MEMCPY: + case TargetOpcode::G_MEMMOVE: + case TargetOpcode::G_MEMSET: { + // If we're at -O0 set a maxlen of 32 to inline, otherwise let the other + // heuristics decide. + unsigned MaxLen = EnableOpt ? 0 : 32; + // Try to inline memcpy type calls if optimizations are enabled. + return !EnableMinSize ? Helper.tryCombineMemCpyFamily(MI, MaxLen) : false; + } } return false; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index cd3ec64219c89..0f135c3e80593 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -312,6 +312,11 @@ class SVE_1_Op_Pat; +class SVE_1_Op_Passthru_Pat +: Pat<(vtd (op pg:$Op1, vts:$Op2, vtd:$Op3)), + (inst $Op3, $Op1, $Op2)>; + class SVE_1_Op_Imm_OptLsl_Reverse_Pat : Pat<(vt (op (vt (AArch64dup (it (cpx i32:$imm, i32:$shift)))), (vt zprty:$Op1))), @@ -2281,14 +2286,22 @@ multiclass sve_fp_2op_p_zd opc, string asm, def : SVE_3_Op_Pat(NAME)>; } -multiclass sve_fp_2op_p_zd_HSD opc, string asm, SDPatternOperator op> { +multiclass sve_fp_2op_p_zd_HSD opc, string asm, SDPatternOperator op_merge, + SDPatternOperator op_pt = null_frag> { def _H : sve_fp_2op_p_zd<{ 0b01, opc }, asm, ZPR16, ZPR16, ElementSizeH>; def _S : sve_fp_2op_p_zd<{ 0b10, opc }, asm, ZPR32, ZPR32, ElementSizeS>; def _D : sve_fp_2op_p_zd<{ 0b11, opc }, asm, ZPR64, ZPR64, ElementSizeD>; - def : SVE_3_Op_Pat(NAME # _H)>; - def : SVE_3_Op_Pat(NAME # _S)>; - def : SVE_3_Op_Pat(NAME # _D)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; + + def : 
SVE_1_Op_Passthru_Pat(NAME # _H)>; + def : SVE_1_Op_Passthru_Pat(NAME # _H)>; + def : SVE_1_Op_Passthru_Pat(NAME # _H)>; + def : SVE_1_Op_Passthru_Pat(NAME # _S)>; + def : SVE_1_Op_Passthru_Pat(NAME # _S)>; + def : SVE_1_Op_Passthru_Pat(NAME # _D)>; } multiclass sve2_fp_flogb { @@ -3755,15 +3768,24 @@ multiclass sve_int_un_pred_arit_1 opc, string asm, def : SVE_3_Op_Pat(NAME # _D)>; } +// TODO: Remove int_op once its last use is converted to ir_op. multiclass sve_int_un_pred_arit_1_fp opc, string asm, - SDPatternOperator op> { + SDPatternOperator int_op, + SDPatternOperator ir_op> { def _H : sve_int_un_pred_arit<0b01, { opc, 0b1 }, asm, ZPR16>; def _S : sve_int_un_pred_arit<0b10, { opc, 0b1 }, asm, ZPR32>; def _D : sve_int_un_pred_arit<0b11, { opc, 0b1 }, asm, ZPR64>; - def : SVE_3_Op_Pat(NAME # _H)>; - def : SVE_3_Op_Pat(NAME # _S)>; - def : SVE_3_Op_Pat(NAME # _D)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; + + def : SVE_1_Op_Passthru_Pat(NAME # _H)>; + def : SVE_1_Op_Passthru_Pat(NAME # _H)>; + def : SVE_1_Op_Passthru_Pat(NAME # _H)>; + def : SVE_1_Op_Passthru_Pat(NAME # _S)>; + def : SVE_1_Op_Passthru_Pat(NAME # _S)>; + def : SVE_1_Op_Passthru_Pat(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -5473,7 +5495,7 @@ multiclass sve_mem_64b_sst_vi_ptrs opc, string asm, def : InstAlias(NAME # _IMM) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5), 0>; def : InstAlias(NAME # _IMM) Z_s:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>; + (!cast(NAME # _IMM) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>; def : Pat<(op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), (nxv2i64 ZPR:$ptrs), imm_ty:$index, vt), (!cast(NAME # _IMM) ZPR:$data, PPR:$gp, ZPR:$ptrs, imm_ty:$index)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index a795493017402..ee6d35ddddf8a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ 
b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -60,6 +60,12 @@ static cl::opt WidenLoads( cl::ReallyHidden, cl::init(false)); +static cl::opt Widen16BitOps( + "amdgpu-codegenprepare-widen-16-bit-ops", + cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"), + cl::ReallyHidden, + cl::init(true)); + static cl::opt UseMul24Intrin( "amdgpu-codegenprepare-mul24", cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"), @@ -269,6 +275,9 @@ bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const { } bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const { + if (!Widen16BitOps) + return false; + const IntegerType *IntTy = dyn_cast(T); if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16) return true; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 2a5dac1f1e106..151b1bdd55381 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1304,9 +1304,9 @@ bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base, } else if (const ConstantSDNode *CAddr = dyn_cast(Addr)) { unsigned OffsetValue0 = CAddr->getZExtValue() / Align; unsigned OffsetValue1 = OffsetValue0 + 1; - assert(Align * OffsetValue0 == CAddr->getZExtValue()); + bool OffsetIsAligned = Align * OffsetValue0 == CAddr->getZExtValue(); - if (isUInt<8>(OffsetValue0) && isUInt<8>(OffsetValue1)) { + if (isUInt<8>(OffsetValue0) && isUInt<8>(OffsetValue1) && OffsetIsAligned) { SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 880c7e4d44c79..3f39f6f21c1cc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1422,6 +1422,20 
@@ bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI, return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); } +bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const { + if (TM.getOptLevel() > CodeGenOpt::None) { + unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second; + if (WGSize <= STI.getWavefrontSize()) { + MachineBasicBlock *MBB = MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER)); + MI.eraseFromParent(); + return true; + } + } + return selectImpl(MI, *CoverageInfo); +} + static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail) { if (TexFailCtrl) @@ -1726,6 +1740,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( return selectDSAppendConsume(I, true); case Intrinsic::amdgcn_ds_consume: return selectDSAppendConsume(I, false); + case Intrinsic::amdgcn_s_barrier: + return selectSBarrier(I); default: { return selectImpl(I, *CoverageInfo); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 4478938911477..2176e2b549511 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -118,6 +118,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector { bool selectDSOrderedIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const; bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const; bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const; + bool selectSBarrier(MachineInstr &MI) const; bool selectImageIntrinsic(MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 17ac0c026912b..a28ea4acbde0e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp 
@@ -335,7 +335,12 @@ static bool loadStoreBitcastWorkaround(const LLT Ty) { return false; if (!Ty.isVector()) return true; - unsigned EltSize = Ty.getElementType().getSizeInBits(); + + LLT EltTy = Ty.getElementType(); + if (EltTy.isPointer()) + return true; + + unsigned EltSize = EltTy.getSizeInBits(); return EltSize != 32 && EltSize != 64; } @@ -1492,6 +1497,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, }; auto &Builder = getActionDefinitionsBuilder(Op) + .legalIf(all(isRegisterType(0), isRegisterType(1))) .lowerFor({{S16, V2S16}}) .lowerIf([=](const LegalityQuery &Query) { const LLT BigTy = Query.Types[BigTyIdx]; @@ -1547,19 +1553,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, } return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); }) - .legalIf([=](const LegalityQuery &Query) { - const LLT &BigTy = Query.Types[BigTyIdx]; - const LLT &LitTy = Query.Types[LitTyIdx]; - - if (BigTy.isVector() && BigTy.getSizeInBits() < 32) - return false; - if (LitTy.isVector() && LitTy.getSizeInBits() < 32) - return false; - - return BigTy.getSizeInBits() % 16 == 0 && - LitTy.getSizeInBits() % 16 == 0 && - BigTy.getSizeInBits() <= MaxRegisterSize; - }) // Any vectors left are the wrong size. Scalarize them. .scalarize(0) .scalarize(1); @@ -2632,23 +2625,42 @@ bool AMDGPULegalizerInfo::legalizeBuildVector( return true; } +// Check that this is a G_XOR x, -1 +static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { + if (MI.getOpcode() != TargetOpcode::G_XOR) + return false; + auto ConstVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI); + return ConstVal && *ConstVal == -1; +} + // Return the use branch instruction, otherwise null if the usage is invalid. 
-static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, - MachineRegisterInfo &MRI, - MachineInstr *&Br, - MachineBasicBlock *&UncondBrTarget) { +static MachineInstr * +verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, + MachineBasicBlock *&UncondBrTarget, bool &Negated) { Register CondDef = MI.getOperand(0).getReg(); if (!MRI.hasOneNonDBGUse(CondDef)) return nullptr; MachineBasicBlock *Parent = MI.getParent(); - MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); - if (UseMI.getParent() != Parent || - UseMI.getOpcode() != AMDGPU::G_BRCOND) + MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); + + if (isNot(MRI, *UseMI)) { + Register NegatedCond = UseMI->getOperand(0).getReg(); + if (!MRI.hasOneNonDBGUse(NegatedCond)) + return nullptr; + + // We're deleting the def of this value, so we need to remove it. + UseMI->eraseFromParent(); + + UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); + Negated = true; + } + + if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) return nullptr; // Make sure the cond br is followed by a G_BR, or is the last instruction. - MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); + MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); if (Next == Parent->end()) { MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 
@@ -2661,7 +2673,7 @@ static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, UncondBrTarget = Br->getOperand(0).getMBB(); } - return &UseMI; + return UseMI; } bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, @@ -3442,7 +3454,9 @@ bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, MachineIRBuilder &B, unsigned AddrSpace) const { Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); - auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); + auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); + Register Hi32 = Unmerge.getReg(1); + B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); MI.eraseFromParent(); return true; @@ -4472,7 +4486,9 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_else: { MachineInstr *Br = nullptr; MachineBasicBlock *UncondBrTarget = nullptr; - if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { + bool Negated = false; + if (MachineInstr *BrCond = + verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { const SIRegisterInfo *TRI = static_cast(MRI.getTargetRegisterInfo()); @@ -4480,6 +4496,10 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, Register Use = MI.getOperand(3).getReg(); MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); + + if (Negated) + std::swap(CondBrTarget, UncondBrTarget); + B.setInsertPt(B.getMBB(), BrCond->getIterator()); if (IntrID == Intrinsic::amdgcn_if) { B.buildInstr(AMDGPU::SI_IF) @@ -4515,13 +4535,18 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_loop: { MachineInstr *Br = nullptr; MachineBasicBlock *UncondBrTarget = nullptr; - if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { + bool Negated = false; + if (MachineInstr *BrCond = + verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { const SIRegisterInfo *TRI = 
static_cast(MRI.getTargetRegisterInfo()); MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); Register Reg = MI.getOperand(2).getReg(); + if (Negated) + std::swap(CondBrTarget, UncondBrTarget); + B.setInsertPt(B.getMBB(), BrCond->getIterator()); B.buildInstr(AMDGPU::SI_LOOP) .addUse(Reg) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp index 54c15e4e4d397..f4640252c9d54 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp @@ -131,7 +131,9 @@ bool AMDGPULowerIntrinsics::makeLIDRangeMetadata(Function &F) const { if (!CI) continue; - Changed |= AMDGPUSubtarget::get(TM, F).makeLIDRangeMetadata(CI); + Function *Caller = CI->getParent()->getParent(); + const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, *Caller); + Changed |= ST.makeLIDRangeMetadata(CI); } return Changed; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 8f0f53df7a54b..c0bef6a5ada16 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -847,7 +847,18 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( continue; } - LLT OpTy = MRI.getType(Op.getReg()); + Register OpReg = Op.getReg(); + LLT OpTy = MRI.getType(OpReg); + + const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI); + if (OpBank != &AMDGPU::VGPRRegBank) { + // Insert copy from AGPR to VGPR before the loop. + B.setMBB(MBB); + OpReg = B.buildCopy(OpTy, OpReg).getReg(0); + MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank); + B.setInstr(*I); + } + unsigned OpSize = OpTy.getSizeInBits(); // Can only do a readlane of 32-bit pieces. 
@@ -857,11 +868,11 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); MRI.setType(CurrentLaneOpReg, OpTy); - constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI); + constrainGenericRegister(OpReg, AMDGPU::VGPR_32RegClass, MRI); // Read the next variant <- also loop target. BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentLaneOpReg) - .addReg(Op.getReg()); + .addReg(OpReg); Register NewCondReg = MRI.createVirtualRegister(WaveRC); bool First = CondReg == AMDGPU::NoRegister; @@ -872,7 +883,7 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64) .addDef(NewCondReg) .addReg(CurrentLaneOpReg) - .addReg(Op.getReg()); + .addReg(OpReg); Op.setReg(CurrentLaneOpReg); if (!First) { @@ -904,7 +915,7 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( // Insert the unmerge before the loop. B.setMBB(MBB); - auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg()); + auto Unmerge = B.buildUnmerge(UnmergeTy, OpReg); B.setInstr(*I); unsigned NumPieces = Unmerge->getNumOperands() - 1; @@ -1048,7 +1059,7 @@ bool AMDGPURegisterBankInfo::collectWaterfallOperands( assert(MI.getOperand(Op).isUse()); Register Reg = MI.getOperand(Op).getReg(); const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI); - if (OpBank->getID() == AMDGPU::VGPRRegBankID) + if (OpBank->getID() != AMDGPU::SGPRRegBankID) SGPROperandRegs.insert(Reg); } @@ -1083,16 +1094,24 @@ void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane( MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const { Register Reg = MI.getOperand(OpIdx).getReg(); const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); - if (Bank != &AMDGPU::VGPRRegBank) + if (Bank == &AMDGPU::SGPRRegBank) return; + LLT Ty = MRI.getType(Reg); MachineIRBuilder B(MI); + + if (Bank != &AMDGPU::VGPRRegBank) { + // We need to copy from AGPR to VGPR + Reg = B.buildCopy(Ty, Reg).getReg(0); + MRI.setRegBank(Reg, 
AMDGPU::VGPRRegBank); + } + Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); B.buildInstr(AMDGPU::V_READFIRSTLANE_B32) .addDef(SGPR) .addReg(Reg); - MRI.setType(SGPR, MRI.getType(Reg)); + MRI.setType(SGPR, Ty); const TargetRegisterClass *Constrained = constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI); @@ -1922,7 +1941,7 @@ bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect( const RegisterBank &IdxBank = *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; - bool IsDivergentIdx = IdxBank == AMDGPU::VGPRRegBank; + bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank; LLT VecTy = MRI.getType(VecReg); unsigned EltSize = VecTy.getScalarSizeInBits(); @@ -2004,7 +2023,7 @@ bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect( const RegisterBank &IdxBank = *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank; - bool IsDivergentIdx = IdxBank == AMDGPU::VGPRRegBank; + bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank; LLT VecTy = MRI.getType(VecReg); unsigned EltSize = VecTy.getScalarSizeInBits(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 3698881ffb7b4..c03d24016cac2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -438,6 +438,21 @@ std::pair AMDGPUSubtarget::getWavesPerEU( return Requested; } +static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) { + auto Node = Kernel.getMetadata("reqd_work_group_size"); + if (Node && Node->getNumOperands() == 3) + return mdconst::extract(Node->getOperand(Dim))->getZExtValue(); + return std::numeric_limits::max(); +} + +unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel, + unsigned Dimension) const { + unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension); + if (ReqdSize != std::numeric_limits::max()) + return ReqdSize - 1; + return getFlatWorkGroupSizes(Kernel).second - 1; +} + bool 
AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const { Function *Kernel = I->getParent()->getParent(); unsigned MinSize = 0; @@ -474,11 +489,11 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const { default: break; } + if (Dim <= 3) { - if (auto Node = Kernel->getMetadata("reqd_work_group_size")) - if (Node->getNumOperands() == 3) - MinSize = MaxSize = mdconst::extract( - Node->getOperand(Dim))->getZExtValue(); + unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim); + if (ReqdSize != std::numeric_limits::max()) + MinSize = MaxSize = ReqdSize; } } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index d5fca0313d75e..ce669bb250cae 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -78,7 +78,7 @@ class AMDGPUSubtarget { bool EnablePromoteAlloca; bool HasTrigReducedRange; unsigned MaxWavesPerEU; - int LocalMemorySize; + unsigned LocalMemorySize; char WavefrontSizeLog2; public: @@ -202,7 +202,7 @@ class AMDGPUSubtarget { return WavefrontSizeLog2; } - int getLocalMemorySize() const { + unsigned getLocalMemorySize() const { return LocalMemorySize; } @@ -239,7 +239,11 @@ class AMDGPUSubtarget { /// subtarget without any kind of limitation. unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; } - /// Creates value range metadata on an workitemid.* inrinsic call or load. + /// Return the maximum workitem ID value in the function, for the given (0, 1, + /// 2) dimension. + unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const; + + /// Creates value range metadata on an workitemid.* intrinsic call or load. 
bool makeLIDRangeMetadata(Instruction *I) const; /// \returns Number of bytes of arguments that are passed to a shader or diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 7b5dd5ea43de6..775cec6904a42 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -596,8 +596,6 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { TII = ST.getInstrInfo(); MDT = &getAnalysis(); - SmallVector Worklist; - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { MachineBasicBlock &MBB = *BI; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 3b342776870f9..b6f78abb71825 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4333,6 +4333,12 @@ MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const { return (VT == MVT::i16) ? MVT::i16 : MVT::i32; } +LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const { + return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts()) + ? Ty.changeElementSize(16) + : Ty.changeElementSize(32); +} + // Answering this is somewhat tricky and depends on the specific device which // have different rates for fma or all f64 operations. 
// @@ -11389,6 +11395,50 @@ void SITargetLowering::computeKnownBitsForFrameIndex( Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex()); } +static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, + KnownBits &Known, unsigned Dim) { + unsigned MaxValue = + ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim); + Known.Zero.setHighBits(countLeadingZeros(MaxValue)); +} + +void SITargetLowering::computeKnownBitsForTargetInstr( + GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts, + const MachineRegisterInfo &MRI, unsigned Depth) const { + const MachineInstr *MI = MRI.getVRegDef(R); + switch (MI->getOpcode()) { + case AMDGPU::G_INTRINSIC: { + switch (MI->getIntrinsicID()) { + case Intrinsic::amdgcn_workitem_id_x: + knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0); + break; + case Intrinsic::amdgcn_workitem_id_y: + knownBitsForWorkitemID(*getSubtarget(), KB, Known, 1); + break; + case Intrinsic::amdgcn_workitem_id_z: + knownBitsForWorkitemID(*getSubtarget(), KB, Known, 2); + break; + case Intrinsic::amdgcn_mbcnt_lo: + case Intrinsic::amdgcn_mbcnt_hi: { + // These return at most the wavefront size - 1. + unsigned Size = MRI.getType(R).getSizeInBits(); + Known.Zero.setHighBits(Size - getSubtarget()->getWavefrontSizeLog2()); + break; + } + case Intrinsic::amdgcn_groupstaticsize: { + // We can report everything over the maximum size as 0. We can't report + // based on the actual size because we don't know if it's accurate or not + // at any given point. 
+ Known.Zero.setHighBits(countLeadingZeros(getSubtarget()->getLocalMemorySize())); + break; + } + default: + break; + } + } + } +} + Align SITargetLowering::computeKnownAlignForTargetInstr( GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI, unsigned Depth) const { diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index ebff1b5930c3b..3e8220ad9db22 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -372,6 +372,8 @@ class SITargetLowering final : public AMDGPUTargetLowering { EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override; MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override; + LLT getPreferredShiftAmountTy(LLT Ty) const override; + bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override; bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override; @@ -418,6 +420,11 @@ class SITargetLowering final : public AMDGPUTargetLowering { void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override; + void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, + KnownBits &Known, + const APInt &DemandedElts, + const MachineRegisterInfo &MRI, + unsigned Depth = 0) const override; Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 5ab6edf3f606b..87ef8bcaa92e4 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -876,7 +876,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || MI.getOpcode() == AMDGPU::S_SETPC_B64_return || (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) { - Wait = Wait.combined(AMDGPU::Waitcnt::allZero(IV)); + 
Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt())); } // Resolve vm waits before gs-done. else if ((MI.getOpcode() == AMDGPU::S_SENDMSG || @@ -1057,7 +1057,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( // requiring a WAITCNT beforehand. if (MI.getOpcode() == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier()) { - Wait = Wait.combined(AMDGPU::Waitcnt::allZero(IV)); + Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt())); } // TODO: Remove this work-around, enable the assert for Bug 457939 @@ -1090,8 +1090,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( } else { assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT); assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL); - ScoreBrackets.applyWaitcnt( - AMDGPU::Waitcnt(~0u, ~0u, ~0u, II->getOperand(1).getImm())); + auto W = TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->getImm(); + ScoreBrackets.applyWaitcnt(AMDGPU::Waitcnt(~0u, ~0u, ~0u, W)); } } } @@ -1099,7 +1099,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( } if (ForceEmitZeroWaitcnts) - Wait = AMDGPU::Waitcnt::allZero(IV); + Wait = AMDGPU::Waitcnt::allZero(ST->hasVscnt()); if (ForceEmitWaitcnt[VM_CNT]) Wait.VmCnt = 0; @@ -1139,12 +1139,13 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT); assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL); - unsigned ICnt = II->getOperand(1).getImm(); + unsigned ICnt = TII->getNamedOperand(*II, AMDGPU::OpName::simm16) + ->getImm(); OldWait.VsCnt = std::min(OldWait.VsCnt, ICnt); if (!TrackedWaitcntSet.count(&*II)) Wait.VsCnt = std::min(Wait.VsCnt, ICnt); if (Wait.VsCnt != ICnt) { - II->getOperand(1).setImm(Wait.VsCnt); + TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->setImm(Wait.VsCnt); Modified = true; } Wait.VsCnt = ~0u; @@ -1268,7 +1269,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, } else if (Inst.isCall()) { if (callWaitsOnFunctionReturn(Inst)) { // Act as a wait on everything - 
ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero(IV)); + ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero(ST->hasVscnt())); } else { // May need to way wait for anything. ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt()); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index e43c4d55698eb..74f8864640691 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -991,8 +991,6 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, Register TrueReg, Register FalseReg) const { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - MachineFunction *MF = MBB.getParent(); - const GCNSubtarget &ST = MF->getSubtarget(); const TargetRegisterClass *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass && @@ -1496,7 +1494,6 @@ unsigned SIInstrInfo::calculateLDSSpillAddress( unsigned FrameOffset, unsigned Size) const { MachineFunction *MF = MBB.getParent(); SIMachineFunctionInfo *MFI = MF->getInfo(); - const GCNSubtarget &ST = MF->getSubtarget(); const DebugLoc &DL = MBB.findDebugLoc(MI); unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize(); unsigned WavefrontSize = ST.getWavefrontSize(); @@ -3312,9 +3309,6 @@ bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, if (OpInfo.RegClass < 0) return false; - const MachineFunction *MF = MI.getParent()->getParent(); - const GCNSubtarget &ST = MF->getSubtarget(); - if (MO.isImm() && isInlineConstant(MO, OpInfo)) { if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() && OpNo ==(unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(), @@ -3467,8 +3461,11 @@ MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI, Inst32.add(*Src2); } else { // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is - // replaced with an implicit read of vcc. This was already added - // during the initial BuildMI, so find it to preserve the flags. 
+ // replaced with an implicit read of vcc or vcc_lo. The implicit read + // of vcc was already added during the initial BuildMI, but we + // 1) may need to change vcc to vcc_lo to preserve the original register + // 2) have to preserve the original flags. + fixImplicitOperands(*Inst32); copyFlagsToImplicitVCC(*Inst32, *Src2); } } @@ -3849,7 +3846,6 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } } - const GCNSubtarget &ST = MF->getSubtarget(); // v_writelane_b32 is an exception from constant bus restriction: // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const if (ConstantBusCount > ST.getConstantBusLimit(Opcode) && @@ -4017,7 +4013,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } - if (isFLAT(MI) && !MF->getSubtarget().hasFlatInstOffsets()) { + if (isFLAT(MI) && !ST.hasFlatInstOffsets()) { const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset); if (Offset->getImm() != 0) { ErrInfo = "subtarget does not support offsets in flat instructions"; @@ -4236,11 +4232,9 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { MachineBasicBlock *MBB = MI.getParent(); MachineOperand &MO = MI.getOperand(OpIdx); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - const SIRegisterInfo *TRI = - static_cast(MRI.getTargetRegisterInfo()); unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass; const TargetRegisterClass *RC = RI.getRegClass(RCID); - unsigned Size = TRI->getRegSizeInBits(*RC); + unsigned Size = RI.getRegSizeInBits(*RC); unsigned Opcode = (Size == 64) ? 
AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32; if (MO.isReg()) Opcode = AMDGPU::COPY; @@ -4361,7 +4355,6 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineRegisterInfo &MRI = MF.getRegInfo(); const MCInstrDesc &InstDesc = MI.getDesc(); const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx]; - const GCNSubtarget &ST = MF.getSubtarget(); const TargetRegisterClass *DefinedRC = OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr; if (!MO) @@ -5181,8 +5174,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI, } else if (!VAddr && ST.hasAddr64()) { // This instructions is the _OFFSET variant, so we need to convert it to // ADDR64. - assert(MBB.getParent()->getSubtarget().getGeneration() - < AMDGPUSubtarget::VOLCANIC_ISLANDS && + assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS && "FIXME: Need to emit flat atomics here"); unsigned RsrcPtr, NewSRsrc; @@ -6663,8 +6655,7 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { case TargetOpcode::INLINEASM_BR: { const MachineFunction *MF = MI.getParent()->getParent(); const char *AsmStr = MI.getOperand(0).getSymbolName(); - return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), - &MF->getSubtarget()); + return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST); } default: return DescSize; @@ -6871,10 +6862,6 @@ const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) con } void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const { - MachineBasicBlock *MBB = MI.getParent(); - MachineFunction *MF = MBB->getParent(); - const GCNSubtarget &ST = MF->getSubtarget(); - if (!ST.isWave32()) return; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index d250cc2ec03db..9c66b27733dbe 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -368,8 +368,8 @@ struct Waitcnt { 
Waitcnt(unsigned VmCnt, unsigned ExpCnt, unsigned LgkmCnt, unsigned VsCnt) : VmCnt(VmCnt), ExpCnt(ExpCnt), LgkmCnt(LgkmCnt), VsCnt(VsCnt) {} - static Waitcnt allZero(const IsaVersion &Version) { - return Waitcnt(0, 0, 0, Version.Major >= 10 ? 0 : ~0u); + static Waitcnt allZero(bool HasVscnt) { + return Waitcnt(0, 0, 0, HasVscnt ? 0 : ~0u); } static Waitcnt allZeroExceptVsCnt() { return Waitcnt(0, 0, 0, ~0u); } diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index ac00b931a8eab..dd7b520effa86 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -317,8 +317,8 @@ bool ARMBaseInstrInfo::analyzeBranch(MachineBasicBlock &MBB, TBB = nullptr; FBB = nullptr; - MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin()) + MachineBasicBlock::instr_iterator I = MBB.instr_end(); + if (I == MBB.instr_begin()) return false; // Empty blocks are easy. --I; @@ -332,7 +332,7 @@ bool ARMBaseInstrInfo::analyzeBranch(MachineBasicBlock &MBB, // Skip over DEBUG values and predicated nonterminators. while (I->isDebugInstr() || !I->isTerminator()) { - if (I == MBB.begin()) + if (I == MBB.instr_begin()) return false; --I; } @@ -356,7 +356,7 @@ bool ARMBaseInstrInfo::analyzeBranch(MachineBasicBlock &MBB, Cond.push_back(I->getOperand(2)); } else if (I->isReturn()) { // Returns can't be analyzed, but we should run cleanup. - CantAnalyze = !isPredicated(*I); + CantAnalyze = true; } else { // We encountered other unrecognized terminator. Bail out immediately. return true; @@ -377,7 +377,7 @@ bool ARMBaseInstrInfo::analyzeBranch(MachineBasicBlock &MBB, // unconditional branch. 
if (AllowModify) { MachineBasicBlock::iterator DI = std::next(I); - while (DI != MBB.end()) { + while (DI != MBB.instr_end()) { MachineInstr &InstToDelete = *DI; ++DI; InstToDelete.eraseFromParent(); @@ -385,10 +385,19 @@ bool ARMBaseInstrInfo::analyzeBranch(MachineBasicBlock &MBB, } } - if (CantAnalyze) + if (CantAnalyze) { + // We may not be able to analyze the block, but we could still have + // an unconditional branch as the last instruction in the block, which + // just branches to layout successor. If this is the case, then just + // remove it if we're allowed to make modifications. + if (AllowModify && !isPredicated(MBB.back()) && + isUncondBranchOpcode(MBB.back().getOpcode()) && + TBB && MBB.isLayoutSuccessor(TBB)) + removeBranch(MBB); return true; + } - if (I == MBB.begin()) + if (I == MBB.instr_begin()) return false; --I; @@ -6120,3 +6129,8 @@ MachineBasicBlock::iterator ARMBaseInstrInfo::insertOutlinedCall( It--; return CallPt; } + +bool ARMBaseInstrInfo::shouldOutlineFromFunctionByDefault( + MachineFunction &MF) const { + return Subtarget.isMClass() && MF.getFunction().hasMinSize(); +} diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h index 9ddd928246834..53c627c209343 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -372,6 +372,9 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo { MachineBasicBlock::iterator &It, MachineFunction &MF, const outliner::Candidate &C) const override; + /// Enable outlining by default at -Oz. + bool shouldOutlineFromFunctionByDefault(MachineFunction &MF) const override; + private: /// Returns an unused general-purpose register which can be used for /// constructing an outlined call if one exists. Returns 0 otherwise. 
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index f154c9340f1d5..eb433035e34da 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -143,7 +143,7 @@ static cl::opt ConstpoolPromotionMaxTotal( cl::desc("Maximum size of ALL constants to promote into a constant pool"), cl::init(128)); -static cl::opt +cl::opt MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden, cl::desc("Maximum interleave factor for MVE VLDn to generate."), cl::init(2)); diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td index b05a38803553d..bb30dbd3a5c9d 100644 --- a/llvm/lib/Target/ARM/ARMInstrNEON.td +++ b/llvm/lib/Target/ARM/ARMInstrNEON.td @@ -9079,11 +9079,11 @@ multiclass BF16VDOTI(NAME) RegTy:$Vd, RegTy:$Vn, RHS, VectorIndex32:$lane)>; } -def BF16VDOTS_VDOTD : BF16VDOTS<0, DPR, "vdot", v2f32, v8i8>; -def BF16VDOTS_VDOTQ : BF16VDOTS<1, QPR, "vdot", v4f32, v16i8>; +def BF16VDOTS_VDOTD : BF16VDOTS<0, DPR, "vdot", v2f32, v4bf16>; +def BF16VDOTS_VDOTQ : BF16VDOTS<1, QPR, "vdot", v4f32, v8bf16>; -defm BF16VDOTI_VDOTD : BF16VDOTI<0, DPR, "vdot", v2f32, v8i8, (v2f32 DPR_VFP2:$Vm)>; -defm BF16VDOTI_VDOTQ : BF16VDOTI<1, QPR, "vdot", v4f32, v16i8, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>; +defm BF16VDOTI_VDOTD : BF16VDOTI<0, DPR, "vdot", v2f32, v4bf16, (v2f32 DPR_VFP2:$Vm)>; +defm BF16VDOTI_VDOTQ : BF16VDOTI<1, QPR, "vdot", v4f32, v8bf16, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>; class BF16MM @@ -9091,8 +9091,8 @@ class BF16MM { + (v8bf16 QPR:$Vn), + (v8bf16 QPR:$Vm)))]> { let Constraints = "$dst = $Vd"; let AsmString = !strconcat(opc, ".bf16", "\t$Vd, $Vn, $Vm"); let DecoderNamespace = "VFPV8"; @@ -9106,8 +9106,8 @@ class VBF16MALQ NoItinerary, "vfma" # suffix, "bf16", "$Vd, $Vn, $Vm", "", [(set (v4f32 QPR:$dst), (OpNode (v4f32 QPR:$Vd), - (v16i8 QPR:$Vn), - (v16i8 QPR:$Vm)))]> { + (v8bf16 QPR:$Vn), + (v8bf16 QPR:$Vm)))]> { let Constraints = "$dst = $Vd"; let 
DecoderNamespace = "VFPV8"; } @@ -9128,9 +9128,9 @@ multiclass VBF16MALQI { def : Pat< (v4f32 (OpNode (v4f32 QPR:$Vd), - (v16i8 QPR:$Vn), - (v16i8 (bitconvert (v8bf16 (ARMvduplane (v8bf16 QPR:$Vm), - VectorIndex16:$lane)))))), + (v8bf16 QPR:$Vn), + (v8bf16 (ARMvduplane (v8bf16 QPR:$Vm), + VectorIndex16:$lane)))), (!cast(NAME) QPR:$Vd, QPR:$Vn, (EXTRACT_SUBREG QPR:$Vm, diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp index b316b1041f2c5..242fd706ca6c0 100644 --- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -251,7 +251,7 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT, // ARM supports the MachineOutliner. setMachineOutliner(true); - setSupportsDefaultOutlining(false); + setSupportsDefaultOutlining(true); } ARMBaseTargetMachine::~ARMBaseTargetMachine() = default; diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 79d6bf7425687..f3206306a3b60 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -52,6 +52,8 @@ extern cl::opt EnableTailPredication; extern cl::opt EnableMaskedGatherScatters; +extern cl::opt MVEMaxSupportedInterleaveFactor; + /// Convert a vector load intrinsic into a simple llvm load instruction. /// This is beneficial when the underlying object being addressed comes /// from a constant, since we get constant-folding for free. @@ -1270,7 +1272,7 @@ unsigned ARMTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy, // multiplied by the number of elements being loaded. This is possibly very // conservative, but even so we still end up vectorising loops because the // cost per iteration for many loops is lower than for scalar loops. - unsigned VectorCost = NumElems * LT.first; + unsigned VectorCost = NumElems * LT.first * ST->getMVEVectorCostFactor(); // The scalarization cost should be a lot higher. 
We use the number of vector // elements plus the scalarization overhead. unsigned ScalarCost = @@ -1643,7 +1645,6 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, PredicatedScalarEvolution PSE = LAI->getPSE(); SmallVector LoadStores; int ICmpCount = 0; - int Stride = 0; for (BasicBlock *BB : L->blocks()) { for (Instruction &I : BB->instructionsWithoutDebug()) { @@ -1662,22 +1663,38 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump()); return false; } - if (isa(I) || isa(I)) { Value *Ptr = isa(I) ? I.getOperand(0) : I.getOperand(1); int64_t NextStride = getPtrStride(PSE, Ptr, L); - // TODO: for now only allow consecutive strides of 1. We could support - // other strides as long as it is uniform, but let's keep it simple for - // now. - if (Stride == 0 && NextStride == 1) { - Stride = NextStride; + if (NextStride == 1) { + // TODO: for now only allow consecutive strides of 1. We could support + // other strides as long as it is uniform, but let's keep it simple + // for now. continue; - } - if (Stride != NextStride) { - LLVM_DEBUG(dbgs() << "Different strides found, can't " - "tail-predicate\n."); + } else if (NextStride == -1 || + (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) || + (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) { + LLVM_DEBUG(dbgs() + << "Consecutive strides of 2 found, vld2/vstr2 can't " + "be tail-predicated\n."); return false; + // TODO: don't tail predicate if there is a reversed load? + } else if (EnableMaskedGatherScatters) { + // Gather/scatters do allow loading from arbitrary strides, at + // least if they are loop invariant. + // TODO: Loop variant strides should in theory work, too, but + // this requires further testing. 
+ const SCEV *PtrScev = + replaceSymbolicStrideSCEV(PSE, llvm::ValueToValueMap(), Ptr); + if (auto AR = dyn_cast(PtrScev)) { + const SCEV *Step = AR->getStepRecurrence(*PSE.getSE()); + if (PSE.getSE()->isLoopInvariant(Step, L)) + continue; + } } + LLVM_DEBUG(dbgs() << "Bad stride found, can't " + "tail-predicate\n."); + return false; } } } diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp index 40f5d5ee8e1b1..5e7a626e5276e 100644 --- a/llvm/lib/Target/ARM/MVETailPredication.cpp +++ b/llvm/lib/Target/ARM/MVETailPredication.cpp @@ -33,8 +33,8 @@ /// This pass: /// 1) Checks if the predicates of the masked load/store instructions are /// generated by intrinsic @llvm.get.active.lanes(). This intrinsic consumes -/// the Backedge Taken Count (BTC) of the scalar loop as its second argument, -/// which we extract to set up the number of elements processed by the loop. +/// the the scalar loop tripcount as its second argument, which we extract +/// to set up the number of elements processed by the loop. /// 2) Intrinsic @llvm.get.active.lanes() is then replaced by the MVE target /// specific VCTP intrinsic to represent the effect of tail predication. /// This will be picked up by the ARM Low-overhead loop pass, which performs @@ -352,14 +352,14 @@ static void Cleanup(SetVector &MaybeDead, Loop *L) { // The active lane intrinsic has this form: // -// @llvm.get.active.lane.mask(IV, BTC) +// @llvm.get.active.lane.mask(IV, TC) // // Here we perform checks that this intrinsic behaves as expected, // which means: // -// 1) The element count, which is calculated with BTC + 1, cannot overflow. -// 2) The element count needs to be sufficiently large that the decrement of -// element counter doesn't overflow, which means that we need to prove: +// 1) Check that the TripCount (TC) belongs to this loop (originally). 
+// 2) The element count (TC) needs to be sufficiently large that the decrement +// of element counter doesn't overflow, which means that we need to prove: // ceil(ElementCount / VectorWidth) >= TripCount // by rounding up ElementCount up: // ((ElementCount + (VectorWidth - 1)) / VectorWidth @@ -373,29 +373,10 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, EnableTailPredication == TailPredication::ForceEnabledNoReductions || EnableTailPredication == TailPredication::ForceEnabled; - // 1) Test whether entry to the loop is protected by a conditional - // BTC + 1 < 0. In other words, if the scalar trip count overflows, - // becomes negative, we shouldn't enter the loop and creating - // tripcount expression BTC + 1 is not safe. So, check that BTC - // isn't max. This is evaluated in unsigned, because the semantics - // of @get.active.lane.mask is a ULE comparison. - auto *BackedgeTakenCount = ActiveLaneMask->getOperand(1); - auto *BTC = SE->getSCEV(BackedgeTakenCount); - auto *MaxBTC = SE->getConstantMaxBackedgeTakenCount(L); - - if (isa(MaxBTC)) { - LLVM_DEBUG(dbgs() << "ARM TP: Can't compute SCEV BTC expression: "; - BTC->dump()); - return false; - } - - APInt MaxInt = APInt(BTC->getType()->getScalarSizeInBits(), ~0); - if (cast(MaxBTC)->getAPInt().eq(MaxInt) && - !ForceTailPredication) { - LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible, BTC can be int max: "; - BTC->dump()); - return false; - } + // 1) TODO: Check that the TripCount (TC) belongs to this loop (originally). + // The scalar tripcount corresponds the number of elements processed by the + // loop, so we will refer to that from this point on. + auto *ElemCountVal = ActiveLaneMask->getOperand(1); // 2) Prove that the sub expression is non-negative, i.e. 
it doesn't overflow: // @@ -415,8 +396,10 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, auto *TC = SE->getSCEV(TripCount); unsigned SizeInBits = TripCount->getType()->getScalarSizeInBits(); int VectorWidth = VecTy->getNumElements(); - auto Diff = APInt(SizeInBits, ~0) - APInt(SizeInBits, VectorWidth); + auto Diff = APInt(SizeInBits, ~0) - APInt(SizeInBits, VectorWidth); uint64_t MaxMinusVW = Diff.getZExtValue(); + // FIXME: since ranges can be negative we work with signed ranges here, but + // we shouldn't extract the zext'ed values for them. uint64_t UpperboundTC = SE->getSignedRange(TC).getUpper().getZExtValue(); if (UpperboundTC > MaxMinusVW && !ForceTailPredication) { @@ -434,7 +417,7 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, // // where Ceil = ElementCount + (VW-1) / VW. If Ceil and TC are runtime // values (and not constants), we have to compensate for the lowerbound value - // range to be off by 1. The reason is that BTC lives in the preheader in + // range to be off by 1. The reason is that the TC lives in the preheader in // this form: // // %trip.count.minus = add nsw nuw i32 %N, -1 @@ -449,9 +432,7 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, // 1. Thus, if the ranges of Ceil and TC are not a single constant but a set, // we first add 0 to TC such that we can do the <= comparison on both sets. // - auto *One = SE->getOne(TripCount->getType()); - // ElementCount = BTC + 1 - auto *ElementCount = SE->getAddExpr(BTC, One); + auto *ElementCount = SE->getSCEV(ElemCountVal); // Tmp = ElementCount + (VW-1) auto *ECPlusVWMinus1 = SE->getAddExpr(ElementCount, SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth - 1))); @@ -504,38 +485,6 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, return false; } -// Materialize NumElements in the preheader block. 
-static Value *getNumElements(BasicBlock *Preheader, Value *BTC) { - // First, check the preheader if it not already exist: - // - // preheader: - // %BTC = add i32 %N, -1 - // .. - // vector.body: - // - // if %BTC already exists. We don't need to emit %NumElems = %BTC + 1, - // but instead can just return %N. - for (auto &I : *Preheader) { - if (I.getOpcode() != Instruction::Add || &I != BTC) - continue; - ConstantInt *MinusOne = nullptr; - if (!(MinusOne = dyn_cast(I.getOperand(1)))) - continue; - if (MinusOne->getSExtValue() == -1) { - LLVM_DEBUG(dbgs() << "ARM TP: Found num elems: " << I << "\n"); - return I.getOperand(0); - } - } - - // But we do need to materialise BTC if it is not already there, - // e.g. if it is a constant. - IRBuilder<> Builder(Preheader->getTerminator()); - Value *NumElements = Builder.CreateAdd(BTC, - ConstantInt::get(BTC->getType(), 1), "num.elements"); - LLVM_DEBUG(dbgs() << "ARM TP: Created num elems: " << *NumElements << "\n"); - return NumElements; -} - void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount, FixedVectorType *VecTy) { IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); @@ -543,23 +492,15 @@ void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Type *Ty = IntegerType::get(M->getContext(), 32); unsigned VectorWidth = VecTy->getNumElements(); - // The backedge-taken count in @llvm.get.active.lane.mask, its 2nd operand, - // is one less than the trip count. So we need to find or create - // %num.elements = %BTC + 1 in the preheader. - Value *BTC = ActiveLaneMask->getOperand(1); - Builder.SetInsertPoint(L->getLoopPreheader()->getTerminator()); - Value *NumElements = getNumElements(L->getLoopPreheader(), BTC); - // Insert a phi to count the number of elements processed by the loop. 
Builder.SetInsertPoint(L->getHeader()->getFirstNonPHI() ); PHINode *Processed = Builder.CreatePHI(Ty, 2); - Processed->addIncoming(NumElements, L->getLoopPreheader()); + Processed->addIncoming(ActiveLaneMask->getOperand(1), L->getLoopPreheader()); - // Replace @llvm.get.active.mask() with the ARM specific VCTP intrinic, and thus - // represent the effect of tail predication. + // Replace @llvm.get.active.mask() with the ARM specific VCTP intrinic, and + // thus represent the effect of tail predication. Builder.SetInsertPoint(ActiveLaneMask); - ConstantInt *Factor = - ConstantInt::get(cast(Ty), VectorWidth); + ConstantInt *Factor = ConstantInt::get(cast(Ty), VectorWidth); Intrinsic::ID VCTPID; switch (VectorWidth) { diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index b938aa1168cab..7b1c8b6079a0e 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -2064,20 +2064,9 @@ HexagonTargetLowering::getPreferredVectorAction(MVT VT) const { return TargetLoweringBase::TypeScalarizeVector; if (Subtarget.useHVXOps()) { - unsigned HwLen = Subtarget.getVectorLength(); - // If the size of VT is at least half of the vector length, - // widen the vector. Note: the threshold was not selected in - // any scientific way. - ArrayRef Tys = Subtarget.getHVXElementTypes(); - if (llvm::find(Tys, ElemTy) != Tys.end()) { - unsigned HwWidth = 8*HwLen; - unsigned VecWidth = VT.getSizeInBits(); - if (VecWidth >= HwWidth/2 && VecWidth < HwWidth) - return TargetLoweringBase::TypeWidenVector; - } - // Split vectors of i1 that correspond to (byte) vector pairs. - if (ElemTy == MVT::i1 && VecLen == 2*HwLen) - return TargetLoweringBase::TypeSplitVector; + unsigned Action = getPreferredHvxVectorAction(VT); + if (Action != ~0u) + return static_cast(Action); } // Always widen (remaining) vectors of i1. 
@@ -3025,7 +3014,7 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { if (Opc == ISD::INLINEASM || Opc == ISD::INLINEASM_BR) return LowerINLINEASM(Op, DAG); - if (isHvxOperation(Op)) { + if (isHvxOperation(Op.getNode())) { // If HVX lowering returns nothing, try the default lowering. if (SDValue V = LowerHvxOperation(Op, DAG)) return V; @@ -3132,13 +3121,15 @@ HexagonTargetLowering::ReplaceNodeResults(SDNode *N, SDValue HexagonTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { - SDValue Op(N, 0); - if (isHvxOperation(Op)) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + if (isHvxOperation(N)) { if (SDValue V = PerformHvxDAGCombine(N, DCI)) return V; return SDValue(); } + SDValue Op(N, 0); const SDLoc &dl(Op); unsigned Opc = Op.getOpcode(); diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h index 7d6e6b6185c87..d7a960fde0a20 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h @@ -338,6 +338,8 @@ namespace HexagonISD { private: void initializeHVXLowering(); + unsigned getPreferredHvxVectorAction(MVT VecTy) const; + void validateConstPtrAlignment(SDValue Ptr, const SDLoc &dl, unsigned NeedAlign) const; @@ -469,17 +471,16 @@ namespace HexagonISD { SDValue LowerHvxExtend(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxShift(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxIntrinsic(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerHvxStore(SDValue Op, SelectionDAG &DAG) const; - SDValue HvxVecPredBitcastComputation(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerHvxMaskedOp(SDValue Op, SelectionDAG &DAG) const; SDValue SplitHvxPairOp(SDValue Op, SelectionDAG &DAG) const; SDValue SplitHvxMemOp(SDValue Op, SelectionDAG &DAG) const; + SDValue WidenHvxStore(SDValue Op, SelectionDAG &DAG) const; std::pair findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const override; 
- bool isHvxOperation(SDValue Op) const; bool isHvxOperation(SDNode *N) const; SDValue LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const; void LowerHvxOperationWrapper(SDNode *N, SmallVectorImpl &Results, diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp index 5da82244e69c7..ed701728892ad 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -14,6 +14,10 @@ using namespace llvm; +static cl::opt HvxWidenThreshold("hexagon-hvx-widen", + cl::Hidden, cl::init(16), + cl::desc("Lower threshold (in bytes) for widening to HVX vectors")); + static const MVT LegalV64[] = { MVT::v64i8, MVT::v32i16, MVT::v16i32 }; static const MVT LegalW64[] = { MVT::v128i8, MVT::v64i16, MVT::v32i32 }; static const MVT LegalV128[] = { MVT::v128i8, MVT::v64i16, MVT::v32i32 }; @@ -97,6 +101,8 @@ HexagonTargetLowering::initializeHVXLowering() { setOperationAction(ISD::CTTZ, T, Custom); setOperationAction(ISD::LOAD, T, Custom); + setOperationAction(ISD::MLOAD, T, Custom); + setOperationAction(ISD::MSTORE, T, Custom); setOperationAction(ISD::MUL, T, Custom); setOperationAction(ISD::MULHS, T, Custom); setOperationAction(ISD::MULHU, T, Custom); @@ -150,6 +156,8 @@ HexagonTargetLowering::initializeHVXLowering() { setOperationAction(ISD::LOAD, T, Custom); setOperationAction(ISD::STORE, T, Custom); + setOperationAction(ISD::MLOAD, T, Custom); + setOperationAction(ISD::MSTORE, T, Custom); setOperationAction(ISD::CTLZ, T, Custom); setOperationAction(ISD::CTTZ, T, Custom); setOperationAction(ISD::CTPOP, T, Custom); @@ -188,6 +196,9 @@ HexagonTargetLowering::initializeHVXLowering() { setOperationAction(ISD::AND, BoolW, Custom); setOperationAction(ISD::OR, BoolW, Custom); setOperationAction(ISD::XOR, BoolW, Custom); + // Masked load/store takes a mask that may need splitting. 
+ setOperationAction(ISD::MLOAD, BoolW, Custom); + setOperationAction(ISD::MSTORE, BoolW, Custom); } for (MVT T : LegalV) { @@ -211,9 +222,56 @@ HexagonTargetLowering::initializeHVXLowering() { setOperationAction(ISD::SIGN_EXTEND_INREG, T, Legal); } + // Handle store widening for short vectors. + std::vector ShortTys; + unsigned HwLen = Subtarget.getVectorLength(); + for (MVT ElemTy : Subtarget.getHVXElementTypes()) { + if (ElemTy == MVT::i1) + continue; + int ElemWidth = ElemTy.getSizeInBits().getFixedSize(); + int MaxElems = (8*HwLen) / ElemWidth; + for (int N = 2; N < MaxElems; N *= 2) { + MVT VecTy = MVT::getVectorVT(ElemTy, N); + auto Action = getPreferredVectorAction(VecTy); + if (Action == TargetLoweringBase::TypeWidenVector) + setOperationAction(ISD::STORE, VecTy, Custom); + } + } + setTargetDAGCombine(ISD::VSELECT); } +unsigned +HexagonTargetLowering::getPreferredHvxVectorAction(MVT VecTy) const { + MVT ElemTy = VecTy.getVectorElementType(); + unsigned VecLen = VecTy.getVectorNumElements(); + unsigned HwLen = Subtarget.getVectorLength(); + + // Split vectors of i1 that correspond to (byte) vector pairs. + if (ElemTy == MVT::i1 && VecLen == 2*HwLen) + return TargetLoweringBase::TypeSplitVector; + // Treat i1 as i8 from now on. + if (ElemTy == MVT::i1) + ElemTy = MVT::i8; + + // If the size of VecTy is at least half of the vector length, + // widen the vector. Note: the threshold was not selected in + // any scientific way. + ArrayRef Tys = Subtarget.getHVXElementTypes(); + if (llvm::find(Tys, ElemTy) != Tys.end()) { + unsigned VecWidth = VecTy.getSizeInBits(); + bool HaveThreshold = HvxWidenThreshold.getNumOccurrences() > 0; + if (HaveThreshold && 8*HvxWidenThreshold <= VecWidth) + return TargetLoweringBase::TypeWidenVector; + unsigned HwWidth = 8*HwLen; + if (VecWidth >= HwWidth/2 && VecWidth < HwWidth) + return TargetLoweringBase::TypeWidenVector; + } + + // Defer to default. 
+ return ~0u; +} + SDValue HexagonTargetLowering::getInt(unsigned IntId, MVT ResTy, ArrayRef Ops, const SDLoc &dl, SelectionDAG &DAG) const { @@ -1406,16 +1464,29 @@ HexagonTargetLowering::LowerHvxMul(SDValue Op, SelectionDAG &DAG) const { // V6_vmpybv.) return getInstr(Hexagon::V6_vmpyih, dl, ResTy, {Vs, Vt}, DAG); case MVT::i32: { - // Use the following sequence for signed word multiply: - // T0 = V6_vmpyiowh Vs, Vt - // T1 = V6_vaslw T0, 16 - // T2 = V6_vmpyiewuh_acc T1, Vs, Vt - SDValue S16 = DAG.getConstant(16, dl, MVT::i32); - SDValue T0 = getInstr(Hexagon::V6_vmpyiowh, dl, ResTy, {Vs, Vt}, DAG); - SDValue T1 = getInstr(Hexagon::V6_vaslw, dl, ResTy, {T0, S16}, DAG); - SDValue T2 = getInstr(Hexagon::V6_vmpyiewuh_acc, dl, ResTy, - {T1, Vs, Vt}, DAG); - return T2; + auto MulL_V60 = [&](SDValue Vs, SDValue Vt) { + // Use the following sequence for signed word multiply: + // T0 = V6_vmpyiowh Vs, Vt + // T1 = V6_vaslw T0, 16 + // T2 = V6_vmpyiewuh_acc T1, Vs, Vt + SDValue S16 = DAG.getConstant(16, dl, MVT::i32); + SDValue T0 = getInstr(Hexagon::V6_vmpyiowh, dl, ResTy, {Vs, Vt}, DAG); + SDValue T1 = getInstr(Hexagon::V6_vaslw, dl, ResTy, {T0, S16}, DAG); + SDValue T2 = getInstr(Hexagon::V6_vmpyiewuh_acc, dl, ResTy, + {T1, Vs, Vt}, DAG); + return T2; + }; + auto MulL_V62 = [&](SDValue Vs, SDValue Vt) { + MVT PairTy = typeJoin({ResTy, ResTy}); + SDValue T0 = getInstr(Hexagon::V6_vmpyewuh_64, dl, PairTy, + {Vs, Vt}, DAG); + SDValue T1 = getInstr(Hexagon::V6_vmpyowh_64_acc, dl, PairTy, + {T0, Vs, Vt}, DAG); + return opSplit(T1, dl, DAG).first; + }; + if (Subtarget.useHVXV62Ops()) + return MulL_V62(Vs, Vt); + return MulL_V60(Vs, Vt); } default: break; @@ -1462,7 +1533,7 @@ HexagonTargetLowering::LowerHvxMulh(SDValue Op, SelectionDAG &DAG) const { assert(ElemTy == MVT::i32); SDValue S16 = DAG.getConstant(16, dl, MVT::i32); - if (IsSigned) { + auto MulHS_V60 = [&](SDValue Vs, SDValue Vt) { // mulhs(Vs,Vt) = // = [(Hi(Vs)*2^16 + Lo(Vs)) *s (Hi(Vt)*2^16 + Lo(Vt))] >> 32 // = 
[Hi(Vs)*2^16 *s Hi(Vt)*2^16 + Hi(Vs) *su Lo(Vt)*2^16 @@ -1489,6 +1560,20 @@ HexagonTargetLowering::LowerHvxMulh(SDValue Op, SelectionDAG &DAG) const { // Add: SDValue T3 = DAG.getNode(ISD::ADD, dl, ResTy, {S2, T2}); return T3; + }; + + auto MulHS_V62 = [&](SDValue Vs, SDValue Vt) { + MVT PairTy = typeJoin({ResTy, ResTy}); + SDValue T0 = getInstr(Hexagon::V6_vmpyewuh_64, dl, PairTy, {Vs, Vt}, DAG); + SDValue T1 = getInstr(Hexagon::V6_vmpyowh_64_acc, dl, PairTy, + {T0, Vs, Vt}, DAG); + return opSplit(T1, dl, DAG).second; + }; + + if (IsSigned) { + if (Subtarget.useHVXV62Ops()) + return MulHS_V62(Vs, Vt); + return MulHS_V60(Vs, Vt); } // Unsigned mulhw. (Would expansion using signed mulhw be better?) @@ -1593,7 +1678,7 @@ HexagonTargetLowering::LowerHvxShift(SDValue Op, SelectionDAG &DAG) const { SDValue HexagonTargetLowering::LowerHvxIntrinsic(SDValue Op, SelectionDAG &DAG) const { - const SDLoc &dl(Op); + const SDLoc &dl(Op); MVT ResTy = ty(Op); unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); @@ -1613,6 +1698,75 @@ HexagonTargetLowering::LowerHvxIntrinsic(SDValue Op, SelectionDAG &DAG) const { return Op; } +SDValue +HexagonTargetLowering::LowerHvxMaskedOp(SDValue Op, SelectionDAG &DAG) const { + const SDLoc &dl(Op); + unsigned HwLen = Subtarget.getVectorLength(); + auto *MaskN = cast(Op.getNode()); + SDValue Mask = MaskN->getMask(); + SDValue Chain = MaskN->getChain(); + SDValue Base = MaskN->getBasePtr(); + auto *MemOp = MaskN->getMemOperand(); + + unsigned Opc = Op->getOpcode(); + assert(Opc == ISD::MLOAD || Opc == ISD::MSTORE); + + if (Opc == ISD::MLOAD) { + MVT ValTy = ty(Op); + SDValue Load = DAG.getLoad(ValTy, dl, Chain, Base, MaskN->getMemOperand()); + SDValue Thru = cast(MaskN)->getPassThru(); + if (isUndef(Thru)) + return Load; + SDValue VSel = DAG.getNode(ISD::VSELECT, dl, ValTy, Mask, Load, Thru); + return DAG.getMergeValues({VSel, Load.getValue(1)}, dl); + } + + // MSTORE + // HVX only has aligned masked stores. 
+ + // TODO: Fold negations of the mask into the store. + unsigned StoreOpc = Hexagon::V6_vS32b_qpred_ai; + SDValue Value = cast(MaskN)->getValue(); + SDValue Offset0 = DAG.getTargetConstant(0, dl, ty(Base)); + + if (MaskN->getAlign().value() % HwLen == 0) { + SDValue Store = getInstr(StoreOpc, dl, MVT::Other, + {Mask, Base, Offset0, Value, Chain}, DAG); + DAG.setNodeMemRefs(cast(Store.getNode()), {MemOp}); + return Store; + } + + // Unaligned case. + auto StoreAlign = [&](SDValue V, SDValue A) { + SDValue Z = getZero(dl, ty(V), DAG); + // TODO: use funnel shifts? + // vlalign(Vu,Vv,Rt) rotates the pair Vu:Vv left by Rt and takes the + // upper half. + SDValue LoV = getInstr(Hexagon::V6_vlalignb, dl, ty(V), {V, Z, A}, DAG); + SDValue HiV = getInstr(Hexagon::V6_vlalignb, dl, ty(V), {Z, V, A}, DAG); + return std::make_pair(LoV, HiV); + }; + + MVT ByteTy = MVT::getVectorVT(MVT::i8, HwLen); + MVT BoolTy = MVT::getVectorVT(MVT::i1, HwLen); + SDValue MaskV = DAG.getNode(HexagonISD::Q2V, dl, ByteTy, Mask); + VectorPair Tmp = StoreAlign(MaskV, Base); + VectorPair MaskU = {DAG.getNode(HexagonISD::V2Q, dl, BoolTy, Tmp.first), + DAG.getNode(HexagonISD::V2Q, dl, BoolTy, Tmp.second)}; + VectorPair ValueU = StoreAlign(Value, Base); + + SDValue Offset1 = DAG.getTargetConstant(HwLen, dl, MVT::i32); + SDValue StoreLo = + getInstr(StoreOpc, dl, MVT::Other, + {MaskU.first, Base, Offset0, ValueU.first, Chain}, DAG); + SDValue StoreHi = + getInstr(StoreOpc, dl, MVT::Other, + {MaskU.second, Base, Offset1, ValueU.second, Chain}, DAG); + DAG.setNodeMemRefs(cast(StoreLo.getNode()), {MemOp}); + DAG.setNodeMemRefs(cast(StoreHi.getNode()), {MemOp}); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, {StoreLo, StoreHi}); +} + SDValue HexagonTargetLowering::SplitHvxPairOp(SDValue Op, SelectionDAG &DAG) const { assert(!Op.isMachineOpcode()); @@ -1648,45 +1802,114 @@ HexagonTargetLowering::SplitHvxPairOp(SDValue Op, SelectionDAG &DAG) const { SDValue 
HexagonTargetLowering::SplitHvxMemOp(SDValue Op, SelectionDAG &DAG) const { - LSBaseSDNode *BN = cast(Op.getNode()); - assert(BN->isUnindexed()); - MVT MemTy = BN->getMemoryVT().getSimpleVT(); + auto *MemN = cast(Op.getNode()); + + MVT MemTy = MemN->getMemoryVT().getSimpleVT(); if (!isHvxPairTy(MemTy)) return Op; const SDLoc &dl(Op); unsigned HwLen = Subtarget.getVectorLength(); MVT SingleTy = typeSplit(MemTy).first; - SDValue Chain = BN->getChain(); - SDValue Base0 = BN->getBasePtr(); + SDValue Chain = MemN->getChain(); + SDValue Base0 = MemN->getBasePtr(); SDValue Base1 = DAG.getMemBasePlusOffset(Base0, TypeSize::Fixed(HwLen), dl); MachineMemOperand *MOp0 = nullptr, *MOp1 = nullptr; - if (MachineMemOperand *MMO = BN->getMemOperand()) { + if (MachineMemOperand *MMO = MemN->getMemOperand()) { MachineFunction &MF = DAG.getMachineFunction(); MOp0 = MF.getMachineMemOperand(MMO, 0, HwLen); MOp1 = MF.getMachineMemOperand(MMO, HwLen, HwLen); } - unsigned MemOpc = BN->getOpcode(); - SDValue NewOp; + unsigned MemOpc = MemN->getOpcode(); if (MemOpc == ISD::LOAD) { + assert(cast(Op)->isUnindexed()); SDValue Load0 = DAG.getLoad(SingleTy, dl, Chain, Base0, MOp0); SDValue Load1 = DAG.getLoad(SingleTy, dl, Chain, Base1, MOp1); - NewOp = DAG.getMergeValues( - { DAG.getNode(ISD::CONCAT_VECTORS, dl, MemTy, Load0, Load1), - DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - Load0.getValue(1), Load1.getValue(1)) }, dl); - } else { - assert(MemOpc == ISD::STORE); + return DAG.getMergeValues( + { DAG.getNode(ISD::CONCAT_VECTORS, dl, MemTy, Load0, Load1), + DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + Load0.getValue(1), Load1.getValue(1)) }, dl); + } + if (MemOpc == ISD::STORE) { + assert(cast(Op)->isUnindexed()); VectorPair Vals = opSplit(cast(Op)->getValue(), dl, DAG); SDValue Store0 = DAG.getStore(Chain, dl, Vals.first, Base0, MOp0); SDValue Store1 = DAG.getStore(Chain, dl, Vals.second, Base1, MOp1); - NewOp = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store0, Store1); + return 
DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store0, Store1); + } + + assert(MemOpc == ISD::MLOAD || MemOpc == ISD::MSTORE); + + auto MaskN = cast(Op); + assert(MaskN->isUnindexed()); + VectorPair Masks = opSplit(MaskN->getMask(), dl, DAG); + SDValue Offset = DAG.getUNDEF(MVT::i32); + + if (MemOpc == ISD::MLOAD) { + VectorPair Thru = + opSplit(cast(Op)->getPassThru(), dl, DAG); + SDValue MLoad0 = + DAG.getMaskedLoad(SingleTy, dl, Chain, Base0, Offset, Masks.first, + Thru.first, SingleTy, MOp0, ISD::UNINDEXED, + ISD::NON_EXTLOAD, false); + SDValue MLoad1 = + DAG.getMaskedLoad(SingleTy, dl, Chain, Base1, Offset, Masks.second, + Thru.second, SingleTy, MOp1, ISD::UNINDEXED, + ISD::NON_EXTLOAD, false); + return DAG.getMergeValues( + { DAG.getNode(ISD::CONCAT_VECTORS, dl, MemTy, MLoad0, MLoad1), + DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + MLoad0.getValue(1), MLoad1.getValue(1)) }, dl); + } + if (MemOpc == ISD::MSTORE) { + VectorPair Vals = opSplit(cast(Op)->getValue(), dl, DAG); + SDValue MStore0 = DAG.getMaskedStore(Chain, dl, Vals.first, Base0, Offset, + Masks.first, SingleTy, MOp0, + ISD::UNINDEXED, false, false); + SDValue MStore1 = DAG.getMaskedStore(Chain, dl, Vals.second, Base1, Offset, + Masks.second, SingleTy, MOp1, + ISD::UNINDEXED, false, false); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MStore0, MStore1); + } + + std::string Name = "Unexpected operation: " + Op->getOperationName(&DAG); + llvm_unreachable(Name.c_str()); +} + +SDValue +HexagonTargetLowering::WidenHvxStore(SDValue Op, SelectionDAG &DAG) const { + const SDLoc &dl(Op); + auto *StoreN = cast(Op.getNode()); + assert(StoreN->isUnindexed() && "Not widening indexed stores yet"); + assert(StoreN->getMemoryVT().getVectorElementType() != MVT::i1 && + "Not widening stores of i1 yet"); + + SDValue Chain = StoreN->getChain(); + SDValue Base = StoreN->getBasePtr(); + SDValue Offset = DAG.getUNDEF(MVT::i32); + + SDValue Value = opCastElem(StoreN->getValue(), MVT::i8, DAG); + MVT ValueTy = 
ty(Value); + unsigned ValueLen = ValueTy.getVectorNumElements(); + unsigned HwLen = Subtarget.getVectorLength(); + assert(isPowerOf2_32(ValueLen)); + + for (unsigned Len = ValueLen; Len < HwLen; ) { + Value = opJoin({DAG.getUNDEF(ty(Value)), Value}, dl, DAG); + Len = ty(Value).getVectorNumElements(); // This is Len *= 2 } + assert(ty(Value).getVectorNumElements() == HwLen); // Paranoia - return NewOp; + MVT BoolTy = MVT::getVectorVT(MVT::i1, HwLen); + SDValue StoreQ = getInstr(Hexagon::V6_pred_scalar2, dl, BoolTy, + {DAG.getConstant(ValueLen, dl, MVT::i32)}, DAG); + MachineFunction &MF = DAG.getMachineFunction(); + auto *MOp = MF.getMachineMemOperand(StoreN->getMemOperand(), 0, HwLen); + return DAG.getMaskedStore(Chain, dl, Value, Base, Offset, StoreQ, ty(Value), + MOp, ISD::UNINDEXED, false, false); } SDValue @@ -1749,6 +1972,8 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SETCC: case ISD::INTRINSIC_VOID: return Op; case ISD::INTRINSIC_WO_CHAIN: return LowerHvxIntrinsic(Op, DAG); + case ISD::MLOAD: + case ISD::MSTORE: return LowerHvxMaskedOp(Op, DAG); // Unaligned loads will be handled by the default lowering. 
case ISD::LOAD: return SDValue(); } @@ -1761,6 +1986,34 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const { void HexagonTargetLowering::LowerHvxOperationWrapper(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { + unsigned Opc = N->getOpcode(); + SDValue Op(N, 0); + + switch (Opc) { + case ISD::STORE: { + assert( + getPreferredHvxVectorAction(ty(cast(N)->getValue())) == + TargetLoweringBase::TypeWidenVector && + "Not widening?"); + SDValue Store = WidenHvxStore(SDValue(N, 0), DAG); + Results.push_back(Store); + break; + } + case ISD::MLOAD: + if (isHvxPairTy(ty(Op))) { + SDValue S = SplitHvxMemOp(Op, DAG); + assert(S->getOpcode() == ISD::MERGE_VALUES); + Results.push_back(S.getOperand(0)); + Results.push_back(S.getOperand(1)); + } + break; + case ISD::MSTORE: + if (isHvxPairTy(ty(Op->getOperand(1)))) { // Stored value + SDValue S = SplitHvxMemOp(Op, DAG); + Results.push_back(S); + } + break; + } } void @@ -1783,6 +2036,8 @@ HexagonTargetLowering::ReplaceHvxNodeResults(SDNode *N, SDValue HexagonTargetLowering::PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); const SDLoc &dl(N); SDValue Op(N, 0); @@ -1802,26 +2057,26 @@ HexagonTargetLowering::PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI) return SDValue(); } -bool -HexagonTargetLowering::isHvxOperation(SDValue Op) const { - // If the type of the result, or any operand type are HVX vector types, - // this is an HVX operation. - return Subtarget.isHVXVectorType(ty(Op), true) || - llvm::any_of(Op.getNode()->ops(), - [this] (SDValue V) { - return Subtarget.isHVXVectorType(ty(V), true); - }); -} - bool HexagonTargetLowering::isHvxOperation(SDNode *N) const { + if (N->getOpcode() == ISD::STORE) { + // If it's a store-to-be-widened, treat it as an HVX operation. 
+ SDValue Val = cast(N)->getValue(); + MVT ValTy = ty(Val); + if (ValTy.isVector()) { + auto Action = getPreferredVectorAction(ValTy); + if (Action == TargetLoweringBase::TypeWidenVector) + return true; + } + } // If the type of any result, or any operand type are HVX vector types, // this is an HVX operation. auto IsHvxTy = [this] (EVT Ty) { return Ty.isSimple() && Subtarget.isHVXVectorType(Ty.getSimpleVT(), true); }; - auto IsHvxOp = [this] (SDValue Op) { - return Subtarget.isHVXVectorType(ty(Op), true); + auto IsHvxOp = [this](SDValue Op) { + return Op.getValueType().isSimple() && + Subtarget.isHVXVectorType(ty(Op), true); }; return llvm::any_of(N->values(), IsHvxTy) || llvm::any_of(N->ops(), IsHvxOp); } diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp index d1cd23c3be3e5..93215a4b61870 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -2721,6 +2721,8 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset, case Hexagon::PS_vloadrw_nt_ai: case Hexagon::V6_vL32b_ai: case Hexagon::V6_vS32b_ai: + case Hexagon::V6_vS32b_qpred_ai: + case Hexagon::V6_vS32b_nqpred_ai: case Hexagon::V6_vL32b_nt_ai: case Hexagon::V6_vS32b_nt_ai: case Hexagon::V6_vL32Ub_ai: diff --git a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td index 078a7135c55be..0e5772bd690f2 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td +++ b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td @@ -364,6 +364,14 @@ let Predicates = [UseHVX] in { (V6_vasrw (V6_vaslw HVI32:$Vs, (A2_tfrsi 16)), (A2_tfrsi 16))>; } + // Take a pair of vectors Vt:Vs and shift them towards LSB by (Rt & HwLen). 
+ def: Pat<(VecI8 (valign HVI8:$Vt, HVI8:$Vs, I32:$Rt)), + (LoVec (V6_valignb HvxVR:$Vt, HvxVR:$Vs, I32:$Rt))>; + def: Pat<(VecI16 (valign HVI16:$Vt, HVI16:$Vs, I32:$Rt)), + (LoVec (V6_valignb HvxVR:$Vt, HvxVR:$Vs, I32:$Rt))>; + def: Pat<(VecI32 (valign HVI32:$Vt, HVI32:$Vs, I32:$Rt)), + (LoVec (V6_valignb HvxVR:$Vt, HvxVR:$Vs, I32:$Rt))>; + def: Pat<(HexagonVASL HVI8:$Vs, I32:$Rt), (V6_vpackeb (V6_vaslh (HiVec (VZxtb HvxVR:$Vs)), I32:$Rt), (V6_vaslh (LoVec (VZxtb HvxVR:$Vs)), I32:$Rt))>; diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp index ce674d638ccb4..cbd60f36d8c6e 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -35,6 +35,9 @@ static cl::opt EmitLookupTables("hexagon-emit-lookup-tables", cl::init(true), cl::Hidden, cl::desc("Control lookup table emission on Hexagon target")); +static cl::opt HexagonMaskedVMem("hexagon-masked-vmem", cl::init(true), + cl::Hidden, cl::desc("Enable masked loads/stores for HVX")); + // Constant "cost factor" to make floating point operations more expensive // in terms of vectorization cost. This isn't the best way, but it should // do. Ultimately, the cost should use cycles. @@ -45,8 +48,7 @@ bool HexagonTTIImpl::useHVX() const { } bool HexagonTTIImpl::isTypeForHVX(Type *VecTy) const { - assert(VecTy->isVectorTy()); - if (isa(VecTy)) + if (!VecTy->isVectorTy() || isa(VecTy)) return false; // Avoid types like <2 x i32*>.
if (!cast(VecTy)->getElementType()->isIntegerTy()) @@ -308,6 +310,14 @@ unsigned HexagonTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, return 1; } +bool HexagonTTIImpl::isLegalMaskedStore(Type *DataType, Align /*Alignment*/) { + return HexagonMaskedVMem && isTypeForHVX(DataType); +} + +bool HexagonTTIImpl::isLegalMaskedLoad(Type *DataType, Align /*Alignment*/) { + return HexagonMaskedVMem && isTypeForHVX(DataType); +} + /// --- Vector TTI end --- unsigned HexagonTTIImpl::getPrefetchDistance() const { diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h index 07e59fb5585e8..b99f512df7665 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -155,6 +155,9 @@ class HexagonTTIImpl : public BasicTTIImplBase { return 1; } + bool isLegalMaskedStore(Type *DataType, Align Alignment); + bool isLegalMaskedLoad(Type *DataType, Align Alignment); + /// @} int getUserCost(const User *U, ArrayRef Operands, diff --git a/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp b/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp index b489c81377694..2692c08b93def 100644 --- a/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp +++ b/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp @@ -322,6 +322,8 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) { getActionDefinitionsBuilder(G_SEXT_INREG).lower(); + getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE, G_MEMSET}).libcall(); + computeTables(); verify(*ST.getInstrInfo()); } @@ -500,7 +502,6 @@ static bool MSA2OpIntrinsicToGeneric(MachineInstr &MI, unsigned Opcode, bool MipsLegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const { MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; - MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); const MipsSubtarget &ST = static_cast(MI.getMF()->getSubtarget()); const MipsInstrInfo &TII = *ST.getInstrInfo(); @@ -508,14 +509,6 @@ bool 
MipsLegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, const RegisterBankInfo &RBI = *ST.getRegBankInfo(); switch (MI.getIntrinsicID()) { - case Intrinsic::memcpy: - case Intrinsic::memset: - case Intrinsic::memmove: - if (createMemLibcall(MIRBuilder, MRI, MI) == - LegalizerHelper::UnableToLegalize) - return false; - MI.eraseFromParent(); - return true; case Intrinsic::trap: { MachineInstr *Trap = MIRBuilder.buildInstr(Mips::TRAP); MI.eraseFromParent(); diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp index d672d54772e0a..77b0331bb14c9 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp @@ -58,14 +58,19 @@ std::pair PPCXCOFFObjectWriter::getRelocTypeAndSignSize( switch ((unsigned)Fixup.getKind()) { default: report_fatal_error("Unimplemented fixup kind."); - case PPC::fixup_ppc_half16: + case PPC::fixup_ppc_half16: { + const uint8_t SignAndSizeForHalf16 = EncodedSignednessIndicator | 15; switch (Modifier) { default: report_fatal_error("Unsupported modifier for half16 fixup."); case MCSymbolRefExpr::VK_None: - return {XCOFF::RelocationType::R_TOC, EncodedSignednessIndicator | 15}; + return {XCOFF::RelocationType::R_TOC, SignAndSizeForHalf16}; + case MCSymbolRefExpr::VK_PPC_U: + return {XCOFF::RelocationType::R_TOCU, SignAndSizeForHalf16}; + case MCSymbolRefExpr::VK_PPC_L: + return {XCOFF::RelocationType::R_TOCL, SignAndSizeForHalf16}; } - break; + } break; case PPC::fixup_ppc_br24: // Branches are 4 byte aligned, so the 24 bits we encode in // the instruction actually represents a 26 bit offset. 
diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td index cda809e2472d4..a617715d4bd86 100644 --- a/llvm/lib/Target/PowerPC/PPC.td +++ b/llvm/lib/Target/PowerPC/PPC.td @@ -238,6 +238,10 @@ def FeaturePairedVectorMemops: SubtargetFeature<"paired-vector-memops", "PairedVectorMemops", "true", "32Byte load and store instructions", [FeatureISA3_0]>; +def FeatureMMA : SubtargetFeature<"mma", "HasMMA", "true", + "Enable MMA instructions", + [FeatureP8Vector, FeatureP9Altivec, + FeaturePairedVectorMemops]>; def FeaturePredictableSelectIsExpensive : SubtargetFeature<"predictable-select-expensive", @@ -343,7 +347,8 @@ def ProcessorFeatures { // still exist with the exception of those we know are Power9 specific. list P10AdditionalFeatures = [DirectivePwr10, FeatureISA3_1, FeaturePrefixInstrs, - FeaturePCRelativeMemops, FeatureP10Vector, FeaturePairedVectorMemops]; + FeaturePCRelativeMemops, FeatureP10Vector, FeatureMMA, + FeaturePairedVectorMemops]; list P10SpecificFeatures = []; list P10InheritableFeatures = !listconcat(P9InheritableFeatures, P10AdditionalFeatures); diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index cfcd3b031d17a..8f1477012bfdd 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -522,7 +522,8 @@ void PPCAsmPrinter::EmitTlsCall(const MachineInstr *MI, MCSymbol *MOSymbol = getSymbol(GValue); const MCExpr *SymVar = MCSymbolRefExpr::create(MOSymbol, VK, OutContext); EmitToStreamer(*OutStreamer, - MCInstBuilder(Subtarget->isPPC64() ? Opcode : PPC::BL_TLS) + MCInstBuilder(Subtarget->isPPC64() ? 
Opcode + : (unsigned)PPC::BL_TLS) .addExpr(TlsRef) .addExpr(SymVar)); } diff --git a/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp b/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp index acc8b317a220e..172f1346c5072 100644 --- a/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp +++ b/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp @@ -78,9 +78,9 @@ class PPCBoolRetToInt : public FunctionPass { Value *Curr = WorkList.back(); WorkList.pop_back(); auto *CurrUser = dyn_cast(Curr); - // Operands of CallInst are skipped because they may not be Bool type, - // and their positions are defined by ABI. - if (CurrUser && !isa(Curr)) + // Operands of CallInst/Constant are skipped because they may not be Bool + // type. For CallInst, their positions are defined by ABI. + if (CurrUser && !isa(Curr) && !isa(Curr)) for (auto &Op : CurrUser->operands()) if (Defs.insert(Op).second) WorkList.push_back(Op); @@ -90,6 +90,9 @@ class PPCBoolRetToInt : public FunctionPass { // Translate a i1 value to an equivalent i32/i64 value: Value *translate(Value *V) { + assert(V->getType() == Type::getInt1Ty(V->getContext()) && + "Expect an i1 value"); + Type *IntTy = ST->isPPC64() ? Type::getInt64Ty(V->getContext()) : Type::getInt32Ty(V->getContext()); @@ -252,9 +255,9 @@ class PPCBoolRetToInt : public FunctionPass { auto *First = dyn_cast(Pair.first); auto *Second = dyn_cast(Pair.second); assert((!First || Second) && "translated from user to non-user!?"); - // Operands of CallInst are skipped because they may not be Bool type, - // and their positions are defined by ABI. - if (First && !isa(First)) + // Operands of CallInst/Constant are skipped because they may not be Bool + // type. For CallInst, their positions are defined by ABI. 
+ if (First && !isa(First) && !isa(First)) for (unsigned i = 0; i < First->getNumOperands(); ++i) Second->setOperand(i, BoolToIntMap[First->getOperand(i)]); } diff --git a/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/llvm/lib/Target/PowerPC/PPCFastISel.cpp index 39790ac9a8aab..9edbf5f68f324 100644 --- a/llvm/lib/Target/PowerPC/PPCFastISel.cpp +++ b/llvm/lib/Target/PowerPC/PPCFastISel.cpp @@ -1567,6 +1567,10 @@ bool PPCFastISel::fastLowerCall(CallLoweringInfo &CLI) { if (IsVarArg) return false; + // If this is a PC-Rel function, let SDISel handle the call. + if (Subtarget->isUsingPCRelativeCalls()) + return false; + // Handle simple calls for now, with legal return types and // those that can be extended. Type *RetTy = CLI.RetTy; @@ -1991,6 +1995,10 @@ bool PPCFastISel::fastSelectInstruction(const Instruction *I) { // Materialize a floating-point constant into a register, and return // the register number (or zero if we failed to handle it). unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) { + // If this is a PC-Rel function, let SDISel handle constant pool. + if (Subtarget->isUsingPCRelativeCalls()) + return false; + // No plans to handle long double here. if (VT != MVT::f32 && VT != MVT::f64) return 0; @@ -2055,6 +2063,10 @@ unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) { // Materialize the address of a global value into a register, and return // the register number (or zero if we failed to handle it). unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) { + // If this is a PC-Rel function, let SDISel handle GV materialization. 
+ if (Subtarget->isUsingPCRelativeCalls()) + return false; + assert(VT == MVT::i64 && "Non-address!"); const TargetRegisterClass *RC = &PPC::G8RC_and_G8RC_NOX0RegClass; unsigned DestReg = createResultReg(RC); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 18e04ff6f5160..2c0de9a90b8f6 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -7862,20 +7862,45 @@ SDValue PPCTargetLowering::LowerTRUNCATEVector(SDValue Op, // to // - assert(Op.getValueType().isVector() && "Vector type expected."); - - SDLoc DL(Op); - SDValue N1 = Op.getOperand(0); - unsigned SrcSize = N1.getValueType().getSizeInBits(); - assert(SrcSize <= 128 && "Source must fit in an Altivec/VSX vector"); - SDValue WideSrc = SrcSize == 128 ? N1 : widenVec(DAG, N1, DL); - EVT TrgVT = Op.getValueType(); + assert(TrgVT.isVector() && "Vector type expected."); unsigned TrgNumElts = TrgVT.getVectorNumElements(); EVT EltVT = TrgVT.getVectorElementType(); + if (!isOperationCustom(Op.getOpcode(), TrgVT) || + TrgVT.getSizeInBits() > 128 || !isPowerOf2_32(TrgNumElts) || + !isPowerOf2_32(EltVT.getSizeInBits())) + return SDValue(); + + SDValue N1 = Op.getOperand(0); + EVT SrcVT = N1.getValueType(); + unsigned SrcSize = SrcVT.getSizeInBits(); + if (SrcSize > 256 || + !isPowerOf2_32(SrcVT.getVectorNumElements()) || + !isPowerOf2_32(SrcVT.getVectorElementType().getSizeInBits())) + return SDValue(); + if (SrcSize == 256 && SrcVT.getVectorNumElements() < 2) + return SDValue(); + unsigned WideNumElts = 128 / EltVT.getSizeInBits(); EVT WideVT = EVT::getVectorVT(*DAG.getContext(), EltVT, WideNumElts); + SDLoc DL(Op); + SDValue Op1, Op2; + if (SrcSize == 256) { + EVT VecIdxTy = getVectorIdxTy(DAG.getDataLayout()); + EVT SplitVT = + N1.getValueType().getHalfNumVectorElementsVT(*DAG.getContext()); + unsigned SplitNumElts = SplitVT.getVectorNumElements(); + Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, 
N1, + DAG.getConstant(0, DL, VecIdxTy)); + Op2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, N1, + DAG.getConstant(SplitNumElts, DL, VecIdxTy)); + } + else { + Op1 = SrcSize == 128 ? N1 : widenVec(DAG, N1, DL); + Op2 = DAG.getUNDEF(WideVT); + } + // First list the elements we want to keep. unsigned SizeMult = SrcSize / TrgVT.getSizeInBits(); SmallVector ShuffV; @@ -7891,8 +7916,9 @@ SDValue PPCTargetLowering::LowerTRUNCATEVector(SDValue Op, // ShuffV.push_back(i + WideNumElts); ShuffV.push_back(WideNumElts + 1); - SDValue Conv = DAG.getNode(ISD::BITCAST, DL, WideVT, WideSrc); - return DAG.getVectorShuffle(WideVT, DL, Conv, DAG.getUNDEF(WideVT), ShuffV); + Op1 = DAG.getNode(ISD::BITCAST, DL, WideVT, Op1); + Op2 = DAG.getNode(ISD::BITCAST, DL, WideVT, Op2); + return DAG.getVectorShuffle(WideVT, DL, Op1, Op2, ShuffV); } /// LowerSELECT_CC - Lower floating point select_cc's into fsel instruction when @@ -10750,13 +10776,11 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl)); return; case ISD::TRUNCATE: { - EVT TrgVT = N->getValueType(0); - EVT OpVT = N->getOperand(0).getValueType(); - if (TrgVT.isVector() && - isOperationCustom(N->getOpcode(), TrgVT) && - OpVT.getSizeInBits() <= 128 && - isPowerOf2_32(OpVT.getVectorElementType().getSizeInBits())) - Results.push_back(LowerTRUNCATEVector(SDValue(N, 0), DAG)); + if (!N->getValueType(0).isVector()) + return; + SDValue Lowered = LowerTRUNCATEVector(SDValue(N, 0), DAG); + if (Lowered) + Results.push_back(Lowered); return; } case ISD::BITCAST: diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 5c59e0f7b2af3..0732e0f0ace36 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -3760,6 +3760,20 @@ bool PPCInstrInfo::simplifyToLI(MachineInstr &MI, MachineInstr &DefMI, } return false; } + case PPC::SUBFIC: + case PPC::SUBFIC8: { + // Only transform this if the 
CARRY implicit operand is dead. + if (MI.getNumOperands() > 3 && !MI.getOperand(3).isDead()) + return false; + int64_t Minuend = MI.getOperand(2).getImm(); + if (isInt<16>(Minuend - SExtImm)) { + ReplaceWithLI = true; + Is64BitLI = Opc == PPC::SUBFIC8; + NewImm = Minuend - SExtImm; + break; + } + return false; + } case PPC::RLDICL: case PPC::RLDICL_rec: case PPC::RLDICL_32: diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td index 43b306e341715..491d969861e13 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td +++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td @@ -504,6 +504,7 @@ multiclass 8LS_DForm_R_SI34_XT6_RA5_p opcode, dag OOL, dag IOL, def PrefixInstrs : Predicate<"Subtarget->hasPrefixInstrs()">; def IsISA3_1 : Predicate<"Subtarget->isISA3_1()">; def PairedVectorMemops : Predicate<"PPCSubTarget->pairedVectorMemops()">; +def MMA : Predicate<"PPCSubTarget->hasMMA()">; let Predicates = [PrefixInstrs] in { let Interpretation64Bit = 1, isCodeGenOnly = 1 in { @@ -1206,13 +1207,21 @@ let Predicates = [IsISA3_1] in { "vdivud $vD, $vA, $vB", IIC_VecGeneral, [(set v2i64:$vD, (udiv v2i64:$vA, v2i64:$vB))]>; def VDIVESW : VXForm_1<907, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vdivesw $vD, $vA, $vB", IIC_VecGeneral, []>; + "vdivesw $vD, $vA, $vB", IIC_VecGeneral, + [(set v4i32:$vD, (int_ppc_altivec_vdivesw v4i32:$vA, + v4i32:$vB))]>; def VDIVEUW : VXForm_1<651, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vdiveuw $vD, $vA, $vB", IIC_VecGeneral, []>; + "vdiveuw $vD, $vA, $vB", IIC_VecGeneral, + [(set v4i32:$vD, (int_ppc_altivec_vdiveuw v4i32:$vA, + v4i32:$vB))]>; def VDIVESD : VXForm_1<971, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vdivesd $vD, $vA, $vB", IIC_VecGeneral, []>; + "vdivesd $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (int_ppc_altivec_vdivesd v2i64:$vA, + v2i64:$vB))]>; def VDIVEUD : VXForm_1<715, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vdiveud $vD, $vA, $vB", IIC_VecGeneral, []>; + "vdiveud 
$vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (int_ppc_altivec_vdiveud v2i64:$vA, + v2i64:$vB))]>; def XVTLSBB : XX2_BF3_XO5_XB6_XO9<60, 2, 475, (outs crrc:$BF), (ins vsrc:$XB), "xvtlsbb $BF, $XB", IIC_VecGeneral, []>; @@ -1284,6 +1293,15 @@ let Predicates = [IsISA3_1] in { //---------------------------- Anonymous Patterns ----------------------------// let Predicates = [IsISA3_1] in { + // Exploit the vector multiply high instructions using intrinsics. + def : Pat<(v4i32 (int_ppc_altivec_vmulhsw v4i32:$vA, v4i32:$vB)), + (v4i32 (VMULHSW $vA, $vB))>; + def : Pat<(v4i32 (int_ppc_altivec_vmulhuw v4i32:$vA, v4i32:$vB)), + (v4i32 (VMULHUW $vA, $vB))>; + def : Pat<(v2i64 (int_ppc_altivec_vmulhsd v2i64:$vA, v2i64:$vB)), + (v2i64 (VMULHSD $vA, $vB))>; + def : Pat<(v2i64 (int_ppc_altivec_vmulhud v2i64:$vA, v2i64:$vB)), + (v2i64 (VMULHUD $vA, $vB))>; def : Pat<(v16i8 (int_ppc_vsx_xxgenpcvbm v16i8:$VRB, imm:$IMM)), (v16i8 (COPY_TO_REGCLASS (XXGENPCVBM $VRB, imm:$IMM), VRRC))>; def : Pat<(v8i16 (int_ppc_vsx_xxgenpcvhm v8i16:$VRB, imm:$IMM)), diff --git a/llvm/lib/Target/PowerPC/PPCScheduleP9.td b/llvm/lib/Target/PowerPC/PPCScheduleP9.td index c79d55f56b2a6..571cc219ff2b4 100644 --- a/llvm/lib/Target/PowerPC/PPCScheduleP9.td +++ b/llvm/lib/Target/PowerPC/PPCScheduleP9.td @@ -41,9 +41,9 @@ def P9Model : SchedMachineModel { let CompleteModel = 1; // Do not support SPE (Signal Processing Engine), prefixed instructions on - // Power 9, paired vector mem ops, PC relative mem ops, or instructions + // Power 9, paired vector mem ops, MMA, PC relative mem ops, or instructions // introduced in ISA 3.1. 
- let UnsupportedFeatures = [HasSPE, PrefixInstrs, PairedVectorMemops, + let UnsupportedFeatures = [HasSPE, PrefixInstrs, PairedVectorMemops, MMA, PCRelativeMemops, IsISA3_1]; } diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp index 6dcb73f2be649..8021cfa4a18c6 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp @@ -73,6 +73,7 @@ void PPCSubtarget::initializeEnvironment() { HasP8Crypto = false; HasP9Vector = false; HasP9Altivec = false; + HasMMA = false; HasP10Vector = false; HasPrefixInstrs = false; HasPCRelativeMemops = false; diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h index 3936bd5f5aaee..76b43dfc7a723 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.h +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h @@ -107,6 +107,7 @@ class PPCSubtarget : public PPCGenSubtargetInfo { bool HasP10Vector; bool HasPrefixInstrs; bool HasPCRelativeMemops; + bool HasMMA; bool HasFCPSGN; bool HasFSQRT; bool HasFRE, HasFRES, HasFRSQRTE, HasFRSQRTES; @@ -260,6 +261,7 @@ class PPCSubtarget : public PPCGenSubtargetInfo { bool hasP10Vector() const { return HasP10Vector; } bool hasPrefixInstrs() const { return HasPrefixInstrs; } bool hasPCRelativeMemops() const { return HasPCRelativeMemops; } + bool hasMMA() const { return HasMMA; } bool pairedVectorMemops() const { return PairedVectorMemops; } bool hasMFOCRF() const { return HasMFOCRF; } bool hasISEL() const { return HasISEL; } diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td index 493f5fa0b674c..73fcbb43adb92 100644 --- a/llvm/lib/Target/RISCV/RISCV.td +++ b/llvm/lib/Target/RISCV/RISCV.td @@ -162,6 +162,13 @@ def FeatureStdExtZvlsseg def HasStdExtZvlsseg : Predicate<"Subtarget->hasStdExtZvlsseg()">, AssemblerPredicate<(all_of FeatureStdExtZvlsseg), "'Zvlsseg' (Vector segment load/store instructions)">; +def FeatureExtZvamo + : SubtargetFeature<"experimental-zvamo", 
"HasStdExtZvamo", "true", + "'Zvamo'(Vector AMO Operations)", + [FeatureStdExtV]>; +def HasStdExtZvamo : Predicate<"Subtarget->hasStdExtZvamo()">, + AssemblerPredicate<(all_of FeatureExtZvamo), + "'Zvamo'(Vector AMO Operations)">; def Feature64Bit : SubtargetFeature<"64bit", "HasRV64", "true", "Implements RV64">; diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormatsV.td b/llvm/lib/Target/RISCV/RISCVInstrFormatsV.td index 8ca010d033c39..030571a370fd8 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrFormatsV.td +++ b/llvm/lib/Target/RISCV/RISCVInstrFormatsV.td @@ -42,6 +42,19 @@ def LUMOPUnitStrideFF: RISCVLSUMOP<0b10000>; def SUMOPUnitStride : RISCVLSUMOP<0b00000>; def SUMOPUnitStrideWholeReg : RISCVLSUMOP<0b01000>; +class RISCVAMOOP val> { + bits<5> Value = val; +} +def AMOOPVamoSwap : RISCVAMOOP<0b00001>; +def AMOOPVamoAdd : RISCVAMOOP<0b00000>; +def AMOOPVamoXor : RISCVAMOOP<0b00100>; +def AMOOPVamoAnd : RISCVAMOOP<0b01100>; +def AMOOPVamoOr : RISCVAMOOP<0b01000>; +def AMOOPVamoMin : RISCVAMOOP<0b10000>; +def AMOOPVamoMax : RISCVAMOOP<0b10100>; +def AMOOPVamoMinu : RISCVAMOOP<0b11000>; +def AMOOPVamoMaxu : RISCVAMOOP<0b11100>; + class RISCVWidth val> { bits<4> Value = val; } @@ -313,3 +326,22 @@ class RVInstVSX nf, bit mew, RISCVMOP mop, bits<3> width, let Uses = [VTYPE, VL]; } + +class RVInstVAMO width, dag outs, + dag ins, string opcodestr, string argstr> + : RVInst { + bits<5> vs2; + bits<5> rs1; + bit wd; + bit vm; + + let Inst{31-27} = amoop.Value; + let Inst{26} = wd; + let Inst{25} = vm; + let Inst{24-20} = vs2; + let Inst{19-15} = rs1; + let Inst{14-12} = width; + let Opcode = OPC_AMO.Value; + + let Uses = [VTYPE, VL]; +} diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 7b6ea002c7b71..249264d1945f4 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -512,13 +512,20 @@ unsigned RISCVInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { bool 
RISCVInstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { const unsigned Opcode = MI.getOpcode(); - switch(Opcode) { - default: - break; - case RISCV::ADDI: - case RISCV::ORI: - case RISCV::XORI: - return (MI.getOperand(1).isReg() && MI.getOperand(1).getReg() == RISCV::X0); + switch (Opcode) { + default: + break; + case RISCV::FSGNJ_D: + case RISCV::FSGNJ_S: + // The canonical floatig-point move is fsgnj rd, rs, rs. + return MI.getOperand(1).isReg() && MI.getOperand(2).isReg() && + MI.getOperand(1).getReg() == MI.getOperand(2).getReg(); + case RISCV::ADDI: + case RISCV::ORI: + case RISCV::XORI: + return (MI.getOperand(1).isReg() && + MI.getOperand(1).getReg() == RISCV::X0) || + (MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0); } return MI.isAsCheapAsAMove(); } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td index f68767847ade8..a9960ea546ada 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td @@ -519,7 +519,8 @@ def C_JR : RVInst16CR<0b1000, 0b10, (outs), (ins GPRNoX0:$rs1), let rs2 = 0; } -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isMoveReg = 1, + isAsCheapAsAMove = 1 in def C_MV : RVInst16CR<0b1000, 0b10, (outs GPRNoX0:$rs1), (ins GPRNoX0:$rs2), "c.mv", "$rs1, $rs2">, Sched<[WriteIALU, ReadIALU]>; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td index 70afcc6776d56..3ac474cb65499 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td @@ -293,6 +293,29 @@ class VALUVs2 funct6, bits<5> vs1, RISCVVFormat opv, string opcodestr> opcodestr, "$vd, $vs2$vm">; } // hasSideEffects = 0, mayLoad = 0, mayStore = 0 +let hasSideEffects = 0, mayLoad = 1, mayStore = 1 in { +// vamo vd, (rs1), vs2, vd, vm +class VAMOWd + : RVInstVAMO { + let Constraints = "$vd_wd = $vd"; + let wd = 1; + bits<5> vd; + let 
Inst{11-7} = vd; +} + +// vamo x0, (rs1), vs2, vs3, vm +class VAMONoWd + : RVInstVAMO { + bits<5> vs3; + let Inst{11-7} = vs3; +} + +} // hasSideEffects = 0, mayLoad = 1, mayStore = 1 + //===----------------------------------------------------------------------===// // Combination of instruction classes. // Use these multiclasses to define instructions more easily. @@ -400,6 +423,11 @@ multiclass VALU_FV_VS2 funct6, bits<5> vs1> { def "" : VALUVs2; } +multiclass VAMO { + def _WD : VAMOWd; + def _UNWD : VAMONoWd; +} + //===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// @@ -1021,3 +1049,53 @@ let Predicates = [HasStdExtZvlsseg] in { def VSXSEG#nf#EI1024_V : VIndexedSegmentStore; } } // Predicates = [HasStdExtZvlsseg] + +let Predicates = [HasStdExtZvamo, HasStdExtA] in { + defm VAMOSWAPEI8 : VAMO; + defm VAMOSWAPEI16 : VAMO; + defm VAMOSWAPEI32 : VAMO; + + defm VAMOADDEI8 : VAMO; + defm VAMOADDEI16 : VAMO; + defm VAMOADDEI32 : VAMO; + + defm VAMOXOREI8 : VAMO; + defm VAMOXOREI16 : VAMO; + defm VAMOXOREI32 : VAMO; + + defm VAMOANDEI8 : VAMO; + defm VAMOANDEI16 : VAMO; + defm VAMOANDEI32 : VAMO; + + defm VAMOOREI8 : VAMO; + defm VAMOOREI16 : VAMO; + defm VAMOOREI32 : VAMO; + + defm VAMOMINEI8 : VAMO; + defm VAMOMINEI16 : VAMO; + defm VAMOMINEI32 : VAMO; + + defm VAMOMAXEI8 : VAMO; + defm VAMOMAXEI16 : VAMO; + defm VAMOMAXEI32 : VAMO; + + defm VAMOMINUEI8 : VAMO; + defm VAMOMINUEI16 : VAMO; + defm VAMOMINUEI32 : VAMO; + + defm VAMOMAXUEI8 : VAMO; + defm VAMOMAXUEI16 : VAMO; + defm VAMOMAXUEI32 : VAMO; +} // Predicates = [HasStdExtZvamo, HasStdExtA] + +let Predicates = [HasStdExtZvamo, HasStdExtA, IsRV64] in { + defm VAMOSWAPEI64 : VAMO; + defm VAMOADDEI64 : VAMO; + defm VAMOXOREI64 : VAMO; + defm VAMOANDEI64 : VAMO; + defm VAMOOREI64 : VAMO; + defm VAMOMINEI64 : VAMO; + defm VAMOMAXEI64 : VAMO; + defm VAMOMINUEI64 : VAMO; + defm VAMOMAXUEI64 : 
VAMO; +} // Predicates = [HasStdExtZvamo, HasStdExtA, IsRV64] diff --git a/llvm/lib/Target/RISCV/RISCVSchedRocket32.td b/llvm/lib/Target/RISCV/RISCVSchedRocket32.td index f9f80848e12eb..1cb474b54d8b7 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedRocket32.td +++ b/llvm/lib/Target/RISCV/RISCVSchedRocket32.td @@ -17,7 +17,7 @@ def Rocket32Model : SchedMachineModel { let LoadLatency = 3; let MispredictPenalty = 3; let CompleteModel = 1; - let UnsupportedFeatures = [HasStdExtV, HasStdExtZvlsseg]; + let UnsupportedFeatures = [HasStdExtV, HasStdExtZvlsseg, HasStdExtZvamo]; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVSchedRocket64.td b/llvm/lib/Target/RISCV/RISCVSchedRocket64.td index 9b5aa776a0d36..8a29762e5adbf 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedRocket64.td +++ b/llvm/lib/Target/RISCV/RISCVSchedRocket64.td @@ -16,7 +16,7 @@ def Rocket64Model : SchedMachineModel { let IssueWidth = 1; // 1 micro-ops are dispatched per cycle. 
let LoadLatency = 3; let MispredictPenalty = 3; - let UnsupportedFeatures = [HasStdExtV, HasStdExtZvlsseg]; + let UnsupportedFeatures = [HasStdExtV, HasStdExtZvlsseg, HasStdExtZvamo]; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index 72d35515d51ec..245c8eb01e384 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -52,6 +52,7 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo { bool HasStdExtZbproposedc = false; bool HasStdExtV = false; bool HasStdExtZvlsseg = false; + bool HasStdExtZvamo = false; bool HasRV64 = false; bool IsRV32E = false; bool EnableLinkerRelax = false; @@ -114,6 +115,7 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo { bool hasStdExtZbproposedc() const { return HasStdExtZbproposedc; } bool hasStdExtV() const { return HasStdExtV; } bool hasStdExtZvlsseg() const { return HasStdExtZvlsseg; } + bool hasStdExtZvamo() const { return HasStdExtZvamo; } bool is64Bit() const { return HasRV64; } bool isRV32E() const { return IsRV32E; } bool enableLinkerRelax() const { return EnableLinkerRelax; } diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index eb1e51341ec4d..9f1805879e7ca 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -7246,6 +7246,15 @@ MachineBasicBlock *SystemZTargetLowering::emitCondStore(MachineInstr &MI, StoreOpcode = TII->getOpcodeForOffset(StoreOpcode, Disp); + // ISel pattern matching also adds a load memory operand of the same + // address, so take special care to find the storing memory operand. + MachineMemOperand *MMO = nullptr; + for (auto *I : MI.memoperands()) + if (I->isStore()) { + MMO = I; + break; + } + // Use STOCOpcode if possible. 
We could use different store patterns in // order to avoid matching the index register, but the performance trade-offs // might be more complicated in that case. @@ -7253,15 +7262,6 @@ MachineBasicBlock *SystemZTargetLowering::emitCondStore(MachineInstr &MI, if (Invert) CCMask ^= CCValid; - // ISel pattern matching also adds a load memory operand of the same - // address, so take special care to find the storing memory operand. - MachineMemOperand *MMO = nullptr; - for (auto *I : MI.memoperands()) - if (I->isStore()) { - MMO = I; - break; - } - BuildMI(*MBB, MI, DL, TII->get(STOCOpcode)) .addReg(SrcReg) .add(Base) @@ -7306,7 +7306,8 @@ MachineBasicBlock *SystemZTargetLowering::emitCondStore(MachineInstr &MI, .addReg(SrcReg) .add(Base) .addImm(Disp) - .addReg(IndexReg); + .addReg(IndexReg) + .addMemOperand(MMO); MBB->addSuccessor(JoinMBB); MI.eraseFromParent(); diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index ddb13e46e9305..5694105dcbd11 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -2670,6 +2670,13 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, break; } + // Support the suffix syntax for overriding displacement size as well. + if (Name.consume_back(".d32")) { + ForcedDispEncoding = DispEncoding_Disp32; + } else if (Name.consume_back(".d8")) { + ForcedDispEncoding = DispEncoding_Disp8; + } + StringRef PatchedName = Name; // Hack to skip "short" following Jcc. 
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index bb52bd6128ad7..f2651d658d71c 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -741,6 +741,25 @@ def ProcessorFeatures { list TGLFeatures = !listconcat(ICLFeatures, TGLAdditionalFeatures ); + //Sapphirerapids + list SPRAdditionalFeatures = [FeatureAMXTILE, + FeatureAMXINT8, + FeatureAMXBF16, + FeatureBF16, + FeatureSERIALIZE, + FeatureCLDEMOTE, + FeatureWAITPKG, + FeaturePTWRITE, + FeatureTSXLDTRK, + FeatureENQCMD, + FeatureSHSTK, + FeatureVP2INTERSECT, + FeatureMOVDIRI, + FeatureMOVDIR64B]; + list SPRTuning = ICXTuning; + list SPRFeatures = + !listconcat(ICXFeatures, SPRAdditionalFeatures); + // Atom list AtomFeatures = [FeatureX87, FeatureCMPXCHG8B, @@ -1037,8 +1056,14 @@ class ProcModel; +def : ProcModel<"generic", SandyBridgeModel, + [FeatureX87, FeatureCMPXCHG8B, Feature64Bit], + [FeatureSlow3OpsLEA, + FeatureSlowDivide64, + FeatureSlowIncDec, + FeatureMacroFusion, + FeatureInsertVZEROUPPER]>; + def : Proc<"i386", [FeatureX87], [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; def : Proc<"i486", [FeatureX87], @@ -1082,20 +1107,10 @@ def : ProcModel<"pentium-m", GenericPostRAModel, [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; foreach P = ["pentium4", "pentium4m"] in { -// def : ProcModel; -// [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; - - // Since 'pentium4' is the default 32-bit CPU on Linux and Windows, - // give it more modern tunings. - // FIXME: This wouldn't be needed if we supported mtune. - def : ProcModel; + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; } // Intel Quark. @@ -1237,6 +1252,8 @@ def : ProcModel<"icelake-server", SkylakeServerModel, ProcessorFeatures.ICXFeatures, ProcessorFeatures.ICXTuning>; def : ProcModel<"tigerlake", SkylakeServerModel, ProcessorFeatures.TGLFeatures, ProcessorFeatures.TGLTuning>; +def : ProcModel<"sapphirerapids", SkylakeServerModel, + ProcessorFeatures.SPRFeatures, ProcessorFeatures.SPRTuning>; // AMD CPUs. 
diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index c47ef4708e919..a07e165633bb6 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -442,6 +442,29 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MBB.erase(MBBI); return true; } + case X86::MWAITX_SAVE_EBX: + case X86::MWAITX_SAVE_RBX: { + // Perform the following transformation. + // SaveRbx = pseudomwaitx InArg, SaveRbx + // => + // [E|R]BX = InArg + // actualmwaitx + // [E|R]BX = SaveRbx + const MachineOperand &InArg = MBBI->getOperand(1); + // Copy the input argument of the pseudo into the argument of the + // actual instruction. + TII->copyPhysReg(MBB, MBBI, DL, X86::EBX, InArg.getReg(), InArg.isKill()); + // Create the actual instruction. + BuildMI(MBB, MBBI, DL, TII->get(X86::MWAITXrrr)); + // Finally, restore the value of RBX. + Register SaveRbx = MBBI->getOperand(2).getReg(); + unsigned BasePointer = Opcode == X86::MWAITX_SAVE_EBX ? X86::EBX : X86::RBX; + TII->copyPhysReg(MBB, MBBI, DL, BasePointer, SaveRbx, + /*SrcIsKill*/ true); + // Delete the pseudo. + MBBI->eraseFromParent(); + return true; + } case TargetOpcode::ICALL_BRANCH_FUNNEL: ExpandICallBranchFunnel(&MBB, MBBI); return true; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 32d8f3d96dae3..7440f3238448f 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -25817,6 +25817,20 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC, Operation.getValue(1)); } + case Intrinsic::x86_mwaitx: { + // If the current function needs the base pointer, RBX, + // we shouldn't use mwaitx directly. 
+ // Indeed the lowering of that instruction will clobber + // that register and since RBX will be a reserved register + // the register allocator will not make sure its value will + // be properly saved and restored around this live-range. + SDLoc dl(Op); + unsigned Opcode = X86ISD::MWAITX_DAG; + SDValue Chain = DAG.getNode(Opcode, dl, MVT::Other, + {Op->getOperand(0), Op->getOperand(2), + Op->getOperand(3), Op->getOperand(4)}); + return Chain; + } } return SDValue(); } @@ -30538,6 +30552,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(LCMPXCHG16_DAG) NODE_NAME_CASE(LCMPXCHG8_SAVE_EBX_DAG) NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG) + NODE_NAME_CASE(MWAITX_DAG) NODE_NAME_CASE(LADD) NODE_NAME_CASE(LSUB) NODE_NAME_CASE(LOR) @@ -33497,6 +33512,48 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, BB->addLiveIn(BasePtr); return BB; } + case X86::MWAITX: { + const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); + Register BasePtr = TRI->getBaseRegister(); + bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX); + // If no need to save the base pointer, we generate MWAITXrrr, + // else we generate pseudo MWAITX_SAVE_RBX/EBX. + if (!IsRBX || !TRI->hasBasePointer(*MF)) { + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX) + .addReg(MI.getOperand(0).getReg()); + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX) + .addReg(MI.getOperand(1).getReg()); + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EBX) + .addReg(MI.getOperand(2).getReg()); + BuildMI(*BB, MI, DL, TII->get(X86::MWAITXrrr)); + MI.eraseFromParent(); + } else { + if (!BB->isLiveIn(BasePtr)) { + BB->addLiveIn(BasePtr); + } + // Parameters can be copied into ECX and EAX but not EBX yet. 
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX) + .addReg(MI.getOperand(0).getReg()); + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX) + .addReg(MI.getOperand(1).getReg()); + const TargetRegisterClass *RegClass = + BasePtr == X86::EBX ? &X86::GR32RegClass : &X86::GR64RegClass; + // Save RBX (or EBX) into a virtual register. + Register SaveRBX = MF->getRegInfo().createVirtualRegister(RegClass); + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX) + .addReg(BasePtr); + // Generate mwaitx pseudo. + unsigned Opcode = + BasePtr == X86::RBX ? X86::MWAITX_SAVE_RBX : X86::MWAITX_SAVE_EBX; + Register Dst = MF->getRegInfo().createVirtualRegister(RegClass); + BuildMI(*BB, MI, DL, TII->get(Opcode)) + .addDef(Dst) // Destination tied in with SaveRBX. + .addReg(MI.getOperand(2).getReg()) // input value of EBX. + .addUse(SaveRBX); // Save of base pointer. + MI.eraseFromParent(); + } + return BB; + } case TargetOpcode::PREALLOCATED_SETUP: { assert(Subtarget.is32Bit() && "preallocated only used in 32-bit"); auto MFI = MF->getInfo(); @@ -44622,7 +44679,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, return SDValue(); }; if (SDValue Extract = IsExtractedElement(StoredVal)) { - SDValue Trunc = peekThroughOneUseBitcasts(Extract.getOperand(0)); + SDValue Trunc = peekThroughOneUseBitcasts(Extract); if (Trunc.getOpcode() == X86ISD::VTRUNC) { SDValue Src = Trunc.getOperand(0); MVT DstVT = Trunc.getSimpleValueType(); diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 2a870b53dc827..7c977ce9e3fa1 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -749,6 +749,9 @@ namespace llvm { STRICT_CVTPS2PH, STRICT_CVTPH2PS, + // Mwaitx builtin is lowered to this if the base pointer needs saving. + MWAITX_DAG, + // Compare and swap. 
LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE, LCMPXCHG8_DAG, diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index f9582238d30ff..e1eb66484124f 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -2884,6 +2884,8 @@ def : Pat<(v16i1 (bitconvert (i16 GR16:$src))), (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit)), VK16)>; def : Pat<(i16 (bitconvert (v16i1 VK16:$src))), (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK16:$src, GR32)), sub_16bit)>; +def : Pat<(i8 (trunc (i16 (bitconvert (v16i1 VK16:$src))))), + (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK16:$src, GR32)), sub_8bit)>; def : Pat<(v8i1 (bitconvert (i8 GR8:$src))), (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$src, sub_8bit)), VK8)>; @@ -2967,10 +2969,9 @@ let Predicates = [HasAVX512] in { def : Pat<(insert_subvector (v16i1 immAllZerosV), (v1i1 (scalar_to_vector GR8:$src)), (iPTR 0)), - (COPY_TO_REGCLASS - (KMOVWkr (AND32ri8 - (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src, sub_8bit), - (i32 1))), VK16)>; + (KMOVWkr (AND32ri8 + (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src, sub_8bit), + (i32 1)))>; } // Mask unary operation diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index d78d9f7c80c76..4f81c271386c5 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -896,6 +896,44 @@ def LCMPXCHG16B_SAVE_RBX : GR64:$rbx_save))]>; } +// This pseudo must be used when the frame uses RBX as +// the base pointer. +// cf comment for LCMPXCHG8B_SAVE_EBX. 
+let Defs = [ECX, EAX, EBX, EFLAGS], Uses = [ECX, EAX, EBX], + Predicates = [HasMWAITX], SchedRW = [WriteSystem], + isCodeGenOnly = 1, isPseudo = 1, Constraints = "$ebx_save = $dst", + usesCustomInserter = 1 in { +def MWAITX_SAVE_EBX : + I<0, Pseudo, (outs GR32:$dst), + (ins GR32:$ebx_input, GR32:$ebx_save), + "mwaitx", + []>; +} +// Same as MWAITX_SAVE_EBX but for the case where RBX is the base pointer. +let Defs = [ECX, EAX, EBX, EFLAGS], Uses = [ECX, EAX, EBX], + Predicates = [HasMWAITX], SchedRW = [WriteSystem], + isCodeGenOnly = 1, isPseudo = 1, Constraints = "$rbx_save = $dst", + usesCustomInserter = 1 in { +def MWAITX_SAVE_RBX : + I<0, Pseudo, (outs GR64:$dst), + (ins GR32:$ebx_input, GR64:$rbx_save), + "mwaitx", + []>; +} + +// Pseudo mwaitx instruction to use for custom insertion. +let Defs = [ECX, EAX, EBX, EFLAGS], Uses = [ECX, EAX, EBX], + Predicates = [HasMWAITX], SchedRW = [WriteSystem], + isCodeGenOnly = 1, isPseudo = 1, + usesCustomInserter = 1 in { +def MWAITX : + I<0, Pseudo, (outs), + (ins GR32:$ecx, GR32:$eax, GR32:$ebx), + "mwaitx", + [(X86mwaitx GR32:$ecx, GR32:$eax, GR32:$ebx)]>; +} + + defm LCMPXCHG : LCMPXCHG_BinOp<0xB0, 0xB1, MRMDestMem, "cmpxchg", X86cas>; // Atomic exchange and add diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td index 8d2178066d4f2..14ca9f889e17c 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.td +++ b/llvm/lib/Target/X86/X86InstrInfo.td @@ -77,6 +77,9 @@ def SDTX86caspairSaveRbx16 : SDTypeProfile<1, 3, [SDTCisVT<0, i64>, SDTCisPtrTy<1>, SDTCisVT<2, i64>, SDTCisVT<3, i64>]>; +def SDTX86mwaitx : SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, + SDTCisVT<2, i32>]>; + def SDTLockBinaryArithWithFlags : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>, SDTCisInt<2>]>; @@ -184,6 +187,10 @@ def X86cas16save_rbx : SDNode<"X86ISD::LCMPXCHG16_SAVE_RBX_DAG", [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; +def X86mwaitx : 
SDNode<"X86ISD::MWAITX_DAG", SDTX86mwaitx, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, + SDNPMayLoad]>; + def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; def X86iret : SDNode<"X86ISD::IRET", SDTX86Ret, diff --git a/llvm/lib/Target/X86/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/X86LegalizerInfo.cpp index 9ee0bdc3430d3..1b371ac2a1086 100644 --- a/llvm/lib/Target/X86/X86LegalizerInfo.cpp +++ b/llvm/lib/Target/X86/X86LegalizerInfo.cpp @@ -86,25 +86,14 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, setLegalizeScalarToDifferentSizeStrategy( G_CONSTANT, 0, widenToLargerTypesAndNarrowToLargest); + getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE, G_MEMSET}).libcall(); + computeTables(); verify(*STI.getInstrInfo()); } bool X86LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const { - MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; - switch (MI.getIntrinsicID()) { - case Intrinsic::memcpy: - case Intrinsic::memset: - case Intrinsic::memmove: - if (createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI) == - LegalizerHelper::UnableToLegalize) - return false; - MI.eraseFromParent(); - return true; - default: - break; - } return true; } diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp index 254d81eeb301e..4cf17e46a598a 100644 --- a/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/llvm/lib/Target/X86/X86Subtarget.cpp @@ -233,7 +233,7 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU, CPU = "generic"; if (TuneCPU.empty()) - TuneCPU = "generic"; + TuneCPU = "i586"; // FIXME: "generic" is more modern than llc tests expect. 
std::string FullFS = X86_MC::ParseX86Triple(TargetTriple); assert(!FullFS.empty() && "Failed to parse X86 triple"); diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 2173b2633a7ca..4c253aab6f221 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -254,8 +254,9 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const { // Extract prefer-vector-width attribute. unsigned PreferVectorWidthOverride = 0; - if (F.hasFnAttribute("prefer-vector-width")) { - StringRef Val = F.getFnAttribute("prefer-vector-width").getValueAsString(); + Attribute PreferVecWidthAttr = F.getFnAttribute("prefer-vector-width"); + if (!PreferVecWidthAttr.hasAttribute(Attribute::None)) { + StringRef Val = PreferVecWidthAttr.getValueAsString(); unsigned Width; if (!Val.getAsInteger(0, Width)) { Key += "prefer-vector-width="; @@ -266,9 +267,9 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const { // Extract min-legal-vector-width attribute. 
unsigned RequiredVectorWidth = UINT32_MAX; - if (F.hasFnAttribute("min-legal-vector-width")) { - StringRef Val = - F.getFnAttribute("min-legal-vector-width").getValueAsString(); + Attribute MinLegalVecWidthAttr = F.getFnAttribute("min-legal-vector-width"); + if (!MinLegalVecWidthAttr.hasAttribute(Attribute::None)) { + StringRef Val = MinLegalVecWidthAttr.getValueAsString(); unsigned Width; if (!Val.getAsInteger(0, Width)) { Key += "min-legal-vector-width="; diff --git a/llvm/lib/Testing/Support/CMakeLists.txt b/llvm/lib/Testing/Support/CMakeLists.txt index 4f5345c1dc570..ed2fd8ae43b26 100644 --- a/llvm/lib/Testing/Support/CMakeLists.txt +++ b/llvm/lib/Testing/Support/CMakeLists.txt @@ -12,6 +12,4 @@ add_llvm_library(LLVMTestingSupport Support ) -include_directories(${LLVM_MAIN_SRC_DIR}/utils/unittest/googletest/include) -include_directories(${LLVM_MAIN_SRC_DIR}/utils/unittest/googlemock/include) target_link_libraries(LLVMTestingSupport PRIVATE gtest) diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp index d34f4438d4c66..48565a4678cc7 100644 --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -1301,6 +1301,9 @@ ChangeStatus Attributor::cleanupIR() { for (Function *Fn : ToBeDeletedFunctions) CGUpdater.removeFunction(*Fn); + if (!ToBeDeletedFunctions.empty()) + ManifestChange = ChangeStatus::CHANGED; + NumFnDeleted += ToBeDeletedFunctions.size(); LLVM_DEBUG(dbgs() << "[Attributor] Deleted " << NumFnDeleted @@ -1320,7 +1323,7 @@ ChangeStatus Attributor::cleanupIR() { ChangeStatus Attributor::run() { TimeTraceScope TimeScope("Attributor::run"); - SeedingPeriod = false; + Phase = AttributorPhase::UPDATE; runTillFixpoint(); // dump graphs on demand @@ -1333,13 +1336,19 @@ ChangeStatus Attributor::run() { if (PrintDependencies) DG.print(); + Phase = AttributorPhase::MANIFEST; ChangeStatus ManifestChange = manifestAttributes(); + + Phase = AttributorPhase::CLEANUP; ChangeStatus CleanupChange 
= cleanupIR(); + return ManifestChange | CleanupChange; } ChangeStatus Attributor::updateAA(AbstractAttribute &AA) { TimeTraceScope TimeScope(AA.getName() + "::updateAA"); + assert(Phase == AttributorPhase::UPDATE && + "We can update AA only in the update stage!"); // Use a new dependence vector for this update. DependenceVector DV; @@ -2165,9 +2174,12 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, OS << "set-state(< {"; if (!S.isValidState()) OS << "full-set"; - else + else { for (auto &it : S.getAssumedSet()) OS << it << ", "; + if (S.undefIsContained()) + OS << "undef "; + } OS << "} >)"; return OS; diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 49b92e23955ab..8098379b659a7 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -3093,6 +3093,10 @@ struct AAIsDeadFunction : public AAIsDead { /// See AbstractAttribute::updateImpl(...). ChangeStatus updateImpl(Attributor &A) override; + bool isEdgeDead(const BasicBlock *From, const BasicBlock *To) const override { + return !AssumedLiveEdges.count(std::make_pair(From, To)); + } + /// See AbstractAttribute::trackStatistics() void trackStatistics() const override {} @@ -3170,6 +3174,9 @@ struct AAIsDeadFunction : public AAIsDead { /// Collection of instructions that are known to not transfer control. SmallSetVector KnownDeadEnds; + /// Collection of all assumed live edges + DenseSet> AssumedLiveEdges; + /// Collection of all assumed live BasicBlocks. DenseSet AssumedLiveBlocks; }; @@ -3287,7 +3294,7 @@ ChangeStatus AAIsDeadFunction::updateImpl(Attributor &A) { // Fast forward for uninteresting instructions. We could look for UB here // though. 
- while(!I->isTerminator() && !isa(I)) { + while (!I->isTerminator() && !isa(I)) { Change = ChangeStatus::CHANGED; I = I->getNextNode(); } @@ -3340,6 +3347,9 @@ ChangeStatus AAIsDeadFunction::updateImpl(Attributor &A) { "Non-terminator expected to have a single successor!"); Worklist.push_back(AliveSuccessor); } else { + // record the assumed live edge + AssumedLiveEdges.insert( + std::make_pair(I->getParent(), AliveSuccessor->getParent())); if (assumeLive(A, *AliveSuccessor->getParent())) Worklist.push_back(AliveSuccessor); } @@ -7343,10 +7353,7 @@ struct AAPotentialValuesFloating : AAPotentialValuesImpl { } if (isa(&V)) { - // Collapse the undef state to 0. - unionAssumed( - APInt(/* numBits */ getAssociatedType()->getIntegerBitWidth(), - /* val */ 0)); + unionAssumedWithUndef(); indicateOptimisticFixpoint(); return; } @@ -7467,6 +7474,20 @@ struct AAPotentialValuesFloating : AAPotentialValuesImpl { } } + bool calculateBinaryOperatorAndTakeUnion(const BinaryOperator *BinOp, + const APInt &LHS, const APInt &RHS) { + bool SkipOperation = false; + bool Unsupported = false; + APInt Result = + calculateBinaryOperator(BinOp, LHS, RHS, SkipOperation, Unsupported); + if (Unsupported) + return false; + // If SkipOperation is true, we can ignore this operand pair (L, R). + if (!SkipOperation) + unionAssumed(Result); + return isValidState(); + } + ChangeStatus updateWithICmpInst(Attributor &A, ICmpInst *ICI) { auto AssumedBefore = getAssumed(); Value *LHS = ICI->getOperand(0); @@ -7485,16 +7506,40 @@ struct AAPotentialValuesFloating : AAPotentialValuesImpl { const DenseSet &LHSAAPVS = LHSAA.getAssumedSet(); const DenseSet &RHSAAPVS = RHSAA.getAssumedSet(); - // TODO: Handle undef correctly. + // TODO: make use of undef flag to limit potential values aggressively. 
bool MaybeTrue = false, MaybeFalse = false; - for (const APInt &L : LHSAAPVS) { + const APInt Zero(RHS->getType()->getIntegerBitWidth(), 0); + if (LHSAA.undefIsContained() && RHSAA.undefIsContained()) { + // The result of any comparison between undefs can be soundly replaced + // with undef. + unionAssumedWithUndef(); + } else if (LHSAA.undefIsContained()) { + bool MaybeTrue = false, MaybeFalse = false; for (const APInt &R : RHSAAPVS) { - bool CmpResult = calculateICmpInst(ICI, L, R); + bool CmpResult = calculateICmpInst(ICI, Zero, R); MaybeTrue |= CmpResult; MaybeFalse |= !CmpResult; if (MaybeTrue & MaybeFalse) return indicatePessimisticFixpoint(); } + } else if (RHSAA.undefIsContained()) { + for (const APInt &L : LHSAAPVS) { + bool CmpResult = calculateICmpInst(ICI, L, Zero); + MaybeTrue |= CmpResult; + MaybeFalse |= !CmpResult; + if (MaybeTrue & MaybeFalse) + return indicatePessimisticFixpoint(); + } + } else { + for (const APInt &L : LHSAAPVS) { + for (const APInt &R : RHSAAPVS) { + bool CmpResult = calculateICmpInst(ICI, L, R); + MaybeTrue |= CmpResult; + MaybeFalse |= !CmpResult; + if (MaybeTrue & MaybeFalse) + return indicatePessimisticFixpoint(); + } + } } if (MaybeTrue) unionAssumed(APInt(/* numBits */ 1, /* val */ 1)); @@ -7520,8 +7565,13 @@ struct AAPotentialValuesFloating : AAPotentialValuesImpl { if (!RHSAA.isValidState()) return indicatePessimisticFixpoint(); - unionAssumed(LHSAA); - unionAssumed(RHSAA); + if (LHSAA.undefIsContained() && RHSAA.undefIsContained()) + // select i1 *, undef , undef => undef + unionAssumedWithUndef(); + else { + unionAssumed(LHSAA); + unionAssumed(RHSAA); + } return AssumedBefore == getAssumed() ? 
ChangeStatus::UNCHANGED : ChangeStatus::CHANGED; } @@ -7537,11 +7587,14 @@ struct AAPotentialValuesFloating : AAPotentialValuesImpl { if (!SrcAA.isValidState()) return indicatePessimisticFixpoint(); const DenseSet &SrcAAPVS = SrcAA.getAssumedSet(); - for (const APInt &S : SrcAAPVS) { - APInt T = calculateCastInst(CI, S, ResultBitWidth); - unionAssumed(T); + if (SrcAA.undefIsContained()) + unionAssumedWithUndef(); + else { + for (const APInt &S : SrcAAPVS) { + APInt T = calculateCastInst(CI, S, ResultBitWidth); + unionAssumed(T); + } } - // TODO: Handle undef correctly. return AssumedBefore == getAssumed() ? ChangeStatus::UNCHANGED : ChangeStatus::CHANGED; } @@ -7563,19 +7616,28 @@ struct AAPotentialValuesFloating : AAPotentialValuesImpl { const DenseSet &LHSAAPVS = LHSAA.getAssumedSet(); const DenseSet &RHSAAPVS = RHSAA.getAssumedSet(); + const APInt Zero = APInt(LHS->getType()->getIntegerBitWidth(), 0); - // TODO: Handle undef correctly - for (const APInt &L : LHSAAPVS) { + // TODO: make use of undef flag to limit potential values aggressively. + if (LHSAA.undefIsContained() && RHSAA.undefIsContained()) { + if (!calculateBinaryOperatorAndTakeUnion(BinOp, Zero, Zero)) + return indicatePessimisticFixpoint(); + } else if (LHSAA.undefIsContained()) { for (const APInt &R : RHSAAPVS) { - bool SkipOperation = false; - bool Unsupported = false; - APInt Result = - calculateBinaryOperator(BinOp, L, R, SkipOperation, Unsupported); - if (Unsupported) + if (!calculateBinaryOperatorAndTakeUnion(BinOp, Zero, R)) return indicatePessimisticFixpoint(); - // If SkipOperation is true, we can ignore this operand pair (L, R). 
- if (!SkipOperation) - unionAssumed(Result); + } + } else if (RHSAA.undefIsContained()) { + for (const APInt &L : LHSAAPVS) { + if (!calculateBinaryOperatorAndTakeUnion(BinOp, L, Zero)) + return indicatePessimisticFixpoint(); + } + } else { + for (const APInt &L : LHSAAPVS) { + for (const APInt &R : RHSAAPVS) { + if (!calculateBinaryOperatorAndTakeUnion(BinOp, L, R)) + return indicatePessimisticFixpoint(); + } } } return AssumedBefore == getAssumed() ? ChangeStatus::UNCHANGED @@ -7590,7 +7652,10 @@ struct AAPotentialValuesFloating : AAPotentialValuesImpl { *this, IRPosition::value(*IncomingValue)); if (!PotentialValuesAA.isValidState()) return indicatePessimisticFixpoint(); - unionAssumed(PotentialValuesAA.getAssumed()); + if (PotentialValuesAA.undefIsContained()) + unionAssumedWithUndef(); + else + unionAssumed(PotentialValuesAA.getAssumed()); } return AssumedBefore == getAssumed() ? ChangeStatus::UNCHANGED : ChangeStatus::CHANGED; @@ -7678,10 +7743,7 @@ struct AAPotentialValuesCallSiteArgument : AAPotentialValuesFloating { } if (isa(&V)) { - // Collapse the undef state to 0. - unionAssumed( - APInt(/* numBits */ getAssociatedType()->getIntegerBitWidth(), - /* val */ 0)); + unionAssumedWithUndef(); indicateOptimisticFixpoint(); return; } @@ -7743,6 +7805,15 @@ struct AANoUndefImpl : AANoUndef { const std::string getAsStr() const override { return getAssumed() ? "noundef" : "may-undef-or-poison"; } + + ChangeStatus manifest(Attributor &A) override { + // We don't manifest noundef attribute for dead positions because the + // associated values with dead positions would be replaced with undef + // values. 
+ if (A.isAssumedDead(getIRPosition(), nullptr, nullptr)) + return ChangeStatus::UNCHANGED; + return AANoUndef::manifest(A); + } }; struct AANoUndefFloating : public AANoUndefImpl { diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index 39db2a36f56a6..a63d9fdfdac1e 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -657,8 +657,8 @@ struct OpenMPOpt { for (Function *F : SCC) { for (auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs) - deduplicateRuntimeCalls(*F, - OMPInfoCache.RFIs[DeduplicableRuntimeCallID]); + Changed |= deduplicateRuntimeCalls( + *F, OMPInfoCache.RFIs[DeduplicableRuntimeCallID]); // __kmpc_global_thread_num is special as we can replace it with an // argument in enough cases to make it worth trying. diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp index 2c520a1b5b6b7..997701e5721f5 100644 --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -840,7 +840,7 @@ SampleProfileLoader::findCalleeFunctionSamples(const CallBase &Inst) const { return FS->findFunctionSamplesAt(LineLocation(FunctionSamples::getOffset(DIL), DIL->getBaseDiscriminator()), - CalleeName); + CalleeName, Reader->getRemapper()); } /// Returns a vector of FunctionSamples that are the indirect call targets @@ -903,7 +903,7 @@ SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const { auto it = DILocation2SampleMap.try_emplace(DIL,nullptr); if (it.second) - it.first->second = Samples->findFunctionSamples(DIL); + it.first->second = Samples->findFunctionSamples(DIL, Reader->getRemapper()); return it.first->second; } @@ -1050,24 +1050,23 @@ bool SampleProfileLoader::inlineHotFunctions( PSI->getOrCompHotCountThreshold()); continue; } - auto CalleeFunctionName = FS->getFuncName(); + if (!callsiteIsHot(FS, PSI)) + continue; + + const char *Reason = "Callee function not available"; + // 
R->getValue() != &F is to prevent promoting a recursive call. // If it is a recursive call, we do not inline it as it could bloat // the code exponentially. There is way to better handle this, e.g. // clone the caller first, and inline the cloned caller if it is // recursive. As llvm does not inline recursive calls, we will // simply ignore it instead of handling it explicitly. - if (CalleeFunctionName == F.getName()) - continue; - - if (!callsiteIsHot(FS, PSI)) - continue; - - const char *Reason = "Callee function not available"; + auto CalleeFunctionName = FS->getFuncName(); auto R = SymbolMap.find(CalleeFunctionName); if (R != SymbolMap.end() && R->getValue() && !R->getValue()->isDeclaration() && R->getValue()->getSubprogram() && R->getValue()->hasFnAttribute("use-sample-profile") && + R->getValue() != &F && isLegalToPromote(*I, R->getValue(), &Reason)) { uint64_t C = FS->getEntrySamples(); auto &DI = @@ -1854,7 +1853,6 @@ bool SampleProfileLoader::doInitialization(Module &M, FunctionAnalysisManager *FAM) { auto &Ctx = M.getContext(); - std::unique_ptr RemapReader; auto ReaderOrErr = SampleProfileReader::create(Filename, Ctx, RemappingFilename); if (std::error_code EC = ReaderOrErr.getError()) { @@ -1910,6 +1908,7 @@ bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM, for (const auto &I : Reader->getProfiles()) TotalCollectedSamples += I.second.getTotalSamples(); + auto Remapper = Reader->getRemapper(); // Populate the symbol map. for (const auto &N_F : M.getValueSymbolTable()) { StringRef OrigName = N_F.getKey(); @@ -1927,6 +1926,15 @@ bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM, // to nullptr to avoid confusion. if (!r.second) r.first->second = nullptr; + OrigName = NewName; + } + // Insert the remapped names into SymbolMap. 
+ if (Remapper) { + if (auto MapName = Remapper->lookUpNameInProfile(OrigName)) { + if (*MapName == OrigName) + continue; + SymbolMap.insert(std::make_pair(*MapName, F)); + } } } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index c6c4105b9b308..ef2563ce4cec4 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -776,6 +776,10 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { Value *X; if (match(IIOperand, m_Neg(m_Value(X)))) return replaceOperand(*II, 0, X); + if (match(IIOperand, m_Select(m_Value(), m_Value(X), m_Neg(m_Deferred(X))))) + return replaceOperand(*II, 0, X); + if (match(IIOperand, m_Select(m_Value(), m_Neg(m_Value(X)), m_Deferred(X)))) + return replaceOperand(*II, 0, X); break; } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 92169ffde22b9..86b0bfe24d287 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -615,16 +615,18 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final /// Try to rotate an operation below a PHI node, using PHI nodes for /// its operands. 
- Instruction *FoldPHIArgOpIntoPHI(PHINode &PN); - Instruction *FoldPHIArgBinOpIntoPHI(PHINode &PN); - Instruction *FoldPHIArgGEPIntoPHI(PHINode &PN); - Instruction *FoldPHIArgLoadIntoPHI(PHINode &PN); - Instruction *FoldPHIArgZextsIntoPHI(PHINode &PN); + Instruction *foldPHIArgOpIntoPHI(PHINode &PN); + Instruction *foldPHIArgBinOpIntoPHI(PHINode &PN); + Instruction *foldPHIArgInsertValueInstructionIntoPHI(PHINode &PN); + Instruction *foldPHIArgExtractValueInstructionIntoPHI(PHINode &PN); + Instruction *foldPHIArgGEPIntoPHI(PHINode &PN); + Instruction *foldPHIArgLoadIntoPHI(PHINode &PN); + Instruction *foldPHIArgZextsIntoPHI(PHINode &PN); /// If an integer typed PHI has only one use which is an IntToPtr operation, /// replace the PHI with an existing pointer typed PHI if it exists. Otherwise /// insert a new pointer typed PHI and replace the original one. - Instruction *FoldIntegerTypedPHI(PHINode &PN); + Instruction *foldIntegerTypedPHI(PHINode &PN); /// Helper function for FoldPHIArgXIntoPHI() to set debug location for the /// folded operation. 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp index ea6adc421954d..d8510d5388d14 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -13,12 +13,14 @@ #include "InstCombineInternal.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/InstCombine/InstCombiner.h" #include "llvm/Transforms/Utils/Local.h" + using namespace llvm; using namespace llvm::PatternMatch; @@ -28,6 +30,11 @@ static cl::opt MaxNumPhis("instcombine-max-num-phis", cl::init(512), cl::desc("Maximum number phis to handle in intptr/ptrint folding")); +STATISTIC(NumPHIsOfInsertValues, + "Number of phi-of-insertvalue turned into insertvalue-of-phis"); +STATISTIC(NumPHIsOfExtractValues, + "Number of phi-of-extractvalue turned into extractvalue-of-phi"); + /// The PHI arguments will be folded into a single operation with a PHI node /// as input. The debug location of the single operation will be the merged /// locations of the original PHI node arguments. @@ -94,7 +101,7 @@ void InstCombinerImpl::PHIArgMergedDebugLoc(Instruction *Inst, PHINode &PN) { // ptr_val_inc = ... // ... // -Instruction *InstCombinerImpl::FoldIntegerTypedPHI(PHINode &PN) { +Instruction *InstCombinerImpl::foldIntegerTypedPHI(PHINode &PN) { if (!PN.getType()->isIntegerTy()) return nullptr; if (!PN.hasOneUse()) @@ -291,9 +298,86 @@ Instruction *InstCombinerImpl::FoldIntegerTypedPHI(PHINode &PN) { IntToPtr->getOperand(0)->getType()); } +/// If we have something like phi [insertvalue(a,b,0), insertvalue(c,d,0)], +/// turn this into a phi[a,c] and phi[b,d] and a single insertvalue. 
+Instruction * +InstCombinerImpl::foldPHIArgInsertValueInstructionIntoPHI(PHINode &PN) { + auto *FirstIVI = cast(PN.getIncomingValue(0)); + + // Scan to see if all operands are `insertvalue`'s with the same indicies, + // and all have a single use. + for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) { + auto *I = dyn_cast(PN.getIncomingValue(i)); + if (!I || !I->hasOneUser() || I->getIndices() != FirstIVI->getIndices()) + return nullptr; + } + + // For each operand of an `insertvalue` + std::array NewOperands; + for (int OpIdx : {0, 1}) { + auto *&NewOperand = NewOperands[OpIdx]; + // Create a new PHI node to receive the values the operand has in each + // incoming basic block. + NewOperand = PHINode::Create( + FirstIVI->getOperand(OpIdx)->getType(), PN.getNumIncomingValues(), + FirstIVI->getOperand(OpIdx)->getName() + ".pn"); + // And populate each operand's PHI with said values. + for (auto Incoming : zip(PN.blocks(), PN.incoming_values())) + NewOperand->addIncoming( + cast(std::get<1>(Incoming))->getOperand(OpIdx), + std::get<0>(Incoming)); + InsertNewInstBefore(NewOperand, PN); + } + + // And finally, create `insertvalue` over the newly-formed PHI nodes. + auto *NewIVI = InsertValueInst::Create(NewOperands[0], NewOperands[1], + FirstIVI->getIndices(), PN.getName()); + + PHIArgMergedDebugLoc(NewIVI, PN); + ++NumPHIsOfInsertValues; + return NewIVI; +} + +/// If we have something like phi [extractvalue(a,0), extractvalue(b,0)], +/// turn this into a phi[a,b] and a single extractvalue. +Instruction * +InstCombinerImpl::foldPHIArgExtractValueInstructionIntoPHI(PHINode &PN) { + auto *FirstEVI = cast(PN.getIncomingValue(0)); + + // Scan to see if all operands are `extractvalue`'s with the same indicies, + // and all have a single use. 
+ for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) { + auto *I = dyn_cast(PN.getIncomingValue(i)); + if (!I || !I->hasOneUser() || I->getIndices() != FirstEVI->getIndices() || + I->getAggregateOperand()->getType() != + FirstEVI->getAggregateOperand()->getType()) + return nullptr; + } + + // Create a new PHI node to receive the values the aggregate operand has + // in each incoming basic block. + auto *NewAggregateOperand = PHINode::Create( + FirstEVI->getAggregateOperand()->getType(), PN.getNumIncomingValues(), + FirstEVI->getAggregateOperand()->getName() + ".pn"); + // And populate the PHI with said values. + for (auto Incoming : zip(PN.blocks(), PN.incoming_values())) + NewAggregateOperand->addIncoming( + cast(std::get<1>(Incoming))->getAggregateOperand(), + std::get<0>(Incoming)); + InsertNewInstBefore(NewAggregateOperand, PN); + + // And finally, create `extractvalue` over the newly-formed PHI nodes. + auto *NewEVI = ExtractValueInst::Create(NewAggregateOperand, + FirstEVI->getIndices(), PN.getName()); + + PHIArgMergedDebugLoc(NewEVI, PN); + ++NumPHIsOfExtractValues; + return NewEVI; +} + /// If we have something like phi [add (a,b), add(a,c)] and if a/b/c and the -/// adds all have a single use, turn this into a phi and a single binop. -Instruction *InstCombinerImpl::FoldPHIArgBinOpIntoPHI(PHINode &PN) { +/// adds all have a single user, turn this into a phi and a single binop. +Instruction *InstCombinerImpl::foldPHIArgBinOpIntoPHI(PHINode &PN) { Instruction *FirstInst = cast(PN.getIncomingValue(0)); assert(isa(FirstInst) || isa(FirstInst)); unsigned Opc = FirstInst->getOpcode(); @@ -303,10 +387,10 @@ Instruction *InstCombinerImpl::FoldPHIArgBinOpIntoPHI(PHINode &PN) { Type *LHSType = LHSVal->getType(); Type *RHSType = RHSVal->getType(); - // Scan to see if all operands are the same opcode, and all have one use. + // Scan to see if all operands are the same opcode, and all have one user. 
for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) { Instruction *I = dyn_cast(PN.getIncomingValue(i)); - if (!I || I->getOpcode() != Opc || !I->hasOneUse() || + if (!I || I->getOpcode() != Opc || !I->hasOneUser() || // Verify type of the LHS matches so we don't fold cmp's of different // types. I->getOperand(0)->getType() != LHSType || @@ -386,7 +470,7 @@ Instruction *InstCombinerImpl::FoldPHIArgBinOpIntoPHI(PHINode &PN) { return NewBinOp; } -Instruction *InstCombinerImpl::FoldPHIArgGEPIntoPHI(PHINode &PN) { +Instruction *InstCombinerImpl::foldPHIArgGEPIntoPHI(PHINode &PN) { GetElementPtrInst *FirstInst =cast(PN.getIncomingValue(0)); SmallVector FixedOperands(FirstInst->op_begin(), @@ -402,11 +486,12 @@ Instruction *InstCombinerImpl::FoldPHIArgGEPIntoPHI(PHINode &PN) { bool AllInBounds = true; - // Scan to see if all operands are the same opcode, and all have one use. + // Scan to see if all operands are the same opcode, and all have one user. for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) { - GetElementPtrInst *GEP= dyn_cast(PN.getIncomingValue(i)); - if (!GEP || !GEP->hasOneUse() || GEP->getType() != FirstInst->getType() || - GEP->getNumOperands() != FirstInst->getNumOperands()) + GetElementPtrInst *GEP = + dyn_cast(PN.getIncomingValue(i)); + if (!GEP || !GEP->hasOneUser() || GEP->getType() != FirstInst->getType() || + GEP->getNumOperands() != FirstInst->getNumOperands()) return nullptr; AllInBounds &= GEP->isInBounds(); @@ -540,7 +625,7 @@ static bool isSafeAndProfitableToSinkLoad(LoadInst *L) { return true; } -Instruction *InstCombinerImpl::FoldPHIArgLoadIntoPHI(PHINode &PN) { +Instruction *InstCombinerImpl::foldPHIArgLoadIntoPHI(PHINode &PN) { LoadInst *FirstLI = cast(PN.getIncomingValue(0)); // FIXME: This is overconservative; this transform is allowed in some cases @@ -573,7 +658,7 @@ Instruction *InstCombinerImpl::FoldPHIArgLoadIntoPHI(PHINode &PN) { // Check to see if all arguments are the same operation. 
for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) { LoadInst *LI = dyn_cast(PN.getIncomingValue(i)); - if (!LI || !LI->hasOneUse()) + if (!LI || !LI->hasOneUser()) return nullptr; // We can't sink the load if the loaded value could be modified between @@ -654,7 +739,7 @@ Instruction *InstCombinerImpl::FoldPHIArgLoadIntoPHI(PHINode &PN) { /// TODO: This function could handle other cast types, but then it might /// require special-casing a cast from the 'i1' type. See the comment in /// FoldPHIArgOpIntoPHI() about pessimizing illegal integer types. -Instruction *InstCombinerImpl::FoldPHIArgZextsIntoPHI(PHINode &Phi) { +Instruction *InstCombinerImpl::foldPHIArgZextsIntoPHI(PHINode &Phi) { // We cannot create a new instruction after the PHI if the terminator is an // EHPad because there is no valid insertion point. if (Instruction *TI = Phi.getParent()->getTerminator()) @@ -686,8 +771,8 @@ Instruction *InstCombinerImpl::FoldPHIArgZextsIntoPHI(PHINode &Phi) { unsigned NumConsts = 0; for (Value *V : Phi.incoming_values()) { if (auto *Zext = dyn_cast(V)) { - // All zexts must be identical and have one use. - if (Zext->getSrcTy() != NarrowType || !Zext->hasOneUse()) + // All zexts must be identical and have one user. + if (Zext->getSrcTy() != NarrowType || !Zext->hasOneUser()) return nullptr; NewIncoming.push_back(Zext->getOperand(0)); NumZexts++; @@ -728,7 +813,7 @@ Instruction *InstCombinerImpl::FoldPHIArgZextsIntoPHI(PHINode &Phi) { /// If all operands to a PHI node are the same "unary" operator and they all are /// only used by the PHI, PHI together their inputs, and do the operation once, /// to the result of the PHI. -Instruction *InstCombinerImpl::FoldPHIArgOpIntoPHI(PHINode &PN) { +Instruction *InstCombinerImpl::foldPHIArgOpIntoPHI(PHINode &PN) { // We cannot create a new instruction after the PHI if the terminator is an // EHPad because there is no valid insertion point. 
if (Instruction *TI = PN.getParent()->getTerminator()) @@ -738,9 +823,13 @@ Instruction *InstCombinerImpl::FoldPHIArgOpIntoPHI(PHINode &PN) { Instruction *FirstInst = cast(PN.getIncomingValue(0)); if (isa(FirstInst)) - return FoldPHIArgGEPIntoPHI(PN); + return foldPHIArgGEPIntoPHI(PN); if (isa(FirstInst)) - return FoldPHIArgLoadIntoPHI(PN); + return foldPHIArgLoadIntoPHI(PN); + if (isa(FirstInst)) + return foldPHIArgInsertValueInstructionIntoPHI(PN); + if (isa(FirstInst)) + return foldPHIArgExtractValueInstructionIntoPHI(PN); // Scan the instruction, looking for input operations that can be folded away. // If all input operands to the phi are the same instruction (e.g. a cast from @@ -763,7 +852,7 @@ Instruction *InstCombinerImpl::FoldPHIArgOpIntoPHI(PHINode &PN) { // otherwise call FoldPHIArgBinOpIntoPHI. ConstantOp = dyn_cast(FirstInst->getOperand(1)); if (!ConstantOp) - return FoldPHIArgBinOpIntoPHI(PN); + return foldPHIArgBinOpIntoPHI(PN); } else { return nullptr; // Cannot fold this operation. } @@ -771,7 +860,7 @@ Instruction *InstCombinerImpl::FoldPHIArgOpIntoPHI(PHINode &PN) { // Check to see if all arguments are the same operation. 
for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) { Instruction *I = dyn_cast(PN.getIncomingValue(i)); - if (!I || !I->hasOneUse() || !I->isSameOperationAs(FirstInst)) + if (!I || !I->hasOneUser() || !I->isSameOperationAs(FirstInst)) return nullptr; if (CastSrcTy) { if (I->getOperand(0)->getType() != CastSrcTy) @@ -1207,7 +1296,7 @@ Instruction *InstCombinerImpl::visitPHINode(PHINode &PN) { if (Value *V = SimplifyInstruction(&PN, SQ.getWithInstruction(&PN))) return replaceInstUsesWith(PN, V); - if (Instruction *Result = FoldPHIArgZextsIntoPHI(PN)) + if (Instruction *Result = foldPHIArgZextsIntoPHI(PN)) return Result; // If all PHI operands are the same operation, pull them through the PHI, @@ -1215,18 +1304,16 @@ Instruction *InstCombinerImpl::visitPHINode(PHINode &PN) { if (isa(PN.getIncomingValue(0)) && isa(PN.getIncomingValue(1)) && cast(PN.getIncomingValue(0))->getOpcode() == - cast(PN.getIncomingValue(1))->getOpcode() && - // FIXME: The hasOneUse check will fail for PHIs that use the value more - // than themselves more than once. - PN.getIncomingValue(0)->hasOneUse()) - if (Instruction *Result = FoldPHIArgOpIntoPHI(PN)) + cast(PN.getIncomingValue(1))->getOpcode() && + PN.getIncomingValue(0)->hasOneUser()) + if (Instruction *Result = foldPHIArgOpIntoPHI(PN)) return Result; // If this is a trivial cycle in the PHI node graph, remove it. Basically, if // this PHI only has a single use (a PHI), and if that PHI only has one use (a // PHI)... break the cycle. 
if (PN.hasOneUse()) { - if (Instruction *Result = FoldIntegerTypedPHI(PN)) + if (Instruction *Result = foldIntegerTypedPHI(PN)) return Result; Instruction *PHIUser = cast(PN.user_back()); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 382db79cba607..7d8d5c933bc29 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -130,6 +130,9 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (Depth == MaxAnalysisRecursionDepth) return nullptr; + if (isa(VTy)) + return nullptr; + Instruction *I = dyn_cast(V); if (!I) { computeKnownBits(V, Known, Depth, CxtI); @@ -1152,6 +1155,19 @@ Value *InstCombinerImpl::SimplifyDemandedVectorElts(Value *V, if (IdxNo < VWidth) PreInsertDemandedElts.clearBit(IdxNo); + // If we only demand the element that is being inserted and that element + // was extracted from the same index in another vector with the same type, + // replace this insert with that other vector. + // Note: This is attempted before the call to simplifyAndSetOp because that + // may change UndefElts to a value that does not match with Vec. 
+ Value *Vec; + if (PreInsertDemandedElts == 0 && + match(I->getOperand(1), + m_ExtractElt(m_Value(Vec), m_SpecificInt(IdxNo))) && + Vec->getType() == I->getType()) { + return Vec; + } + simplifyAndSetOp(I, 0, PreInsertDemandedElts, UndefElts); // If this is inserting an element that isn't demanded, remove this diff --git a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt index 9316de4eb32c0..1fc0b140be035 100644 --- a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt +++ b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt @@ -5,6 +5,7 @@ add_llvm_component_library(LLVMInstrumentation ControlHeightReduction.cpp DataFlowSanitizer.cpp GCOVProfiling.cpp + HeapProfiler.cpp MemorySanitizer.cpp IndirectCallPromotion.cpp Instrumentation.cpp diff --git a/llvm/lib/Transforms/Instrumentation/HeapProfiler.cpp b/llvm/lib/Transforms/Instrumentation/HeapProfiler.cpp new file mode 100644 index 0000000000000..6372dfded82a7 --- /dev/null +++ b/llvm/lib/Transforms/Instrumentation/HeapProfiler.cpp @@ -0,0 +1,614 @@ +//===- HeapProfiler.cpp - heap allocation and access profiler +//--------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of HeapProfiler. Memory accesses are instrumented +// to increment the access count held in a shadow memory location, or +// alternatively to call into the runtime. Memory intrinsic calls (memmove, +// memcpy, memset) are changed to call the heap profiling runtime version +// instead. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Instrumentation/HeapProfiler.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Instrumentation.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "heapprof" + +constexpr int LLVM_HEAP_PROFILER_VERSION = 1; + +// Size of memory mapped to a single shadow location. +constexpr uint64_t DefaultShadowGranularity = 64; + +// Scale from granularity down to shadow size. +constexpr uint64_t DefaultShadowScale = 3; + +constexpr char HeapProfModuleCtorName[] = "heapprof.module_ctor"; +constexpr uint64_t HeapProfCtorAndDtorPriority = 1; +// On Emscripten, the system needs more than one priorities for constructors. +constexpr uint64_t HeapProfEmscriptenCtorAndDtorPriority = 50; +constexpr char HeapProfInitName[] = "__heapprof_init"; +constexpr char HeapProfVersionCheckNamePrefix[] = + "__heapprof_version_mismatch_check_v"; + +constexpr char HeapProfShadowMemoryDynamicAddress[] = + "__heapprof_shadow_memory_dynamic_address"; + +// Command-line flags. + +static cl::opt ClInsertVersionCheck( + "heapprof-guard-against-version-mismatch", + cl::desc("Guard against compiler/runtime version mismatch."), cl::Hidden, + cl::init(true)); + +// This flag may need to be replaced with -f[no-]memprof-reads. 
+static cl::opt ClInstrumentReads("heapprof-instrument-reads", + cl::desc("instrument read instructions"), + cl::Hidden, cl::init(true)); + +static cl::opt + ClInstrumentWrites("heapprof-instrument-writes", + cl::desc("instrument write instructions"), cl::Hidden, + cl::init(true)); + +static cl::opt ClInstrumentAtomics( + "heapprof-instrument-atomics", + cl::desc("instrument atomic instructions (rmw, cmpxchg)"), cl::Hidden, + cl::init(true)); + +static cl::opt ClUseCalls( + "heapprof-use-callbacks", + cl::desc("Use callbacks instead of inline instrumentation sequences."), + cl::Hidden, cl::init(false)); + +static cl::opt + ClMemoryAccessCallbackPrefix("heapprof-memory-access-callback-prefix", + cl::desc("Prefix for memory access callbacks"), + cl::Hidden, cl::init("__heapprof_")); + +// These flags allow to change the shadow mapping. +// The shadow mapping looks like +// Shadow = ((Mem & mask) >> scale) + offset + +static cl::opt ClMappingScale("heapprof-mapping-scale", + cl::desc("scale of heapprof shadow mapping"), + cl::Hidden, cl::init(DefaultShadowScale)); + +static cl::opt + ClMappingGranularity("heapprof-mapping-granularity", + cl::desc("granularity of heapprof shadow mapping"), + cl::Hidden, cl::init(DefaultShadowGranularity)); + +// Debug flags. + +static cl::opt ClDebug("heapprof-debug", cl::desc("debug"), cl::Hidden, + cl::init(0)); + +static cl::opt ClDebugFunc("heapprof-debug-func", cl::Hidden, + cl::desc("Debug func")); + +static cl::opt ClDebugMin("heapprof-debug-min", cl::desc("Debug min inst"), + cl::Hidden, cl::init(-1)); + +static cl::opt ClDebugMax("heapprof-debug-max", cl::desc("Debug max inst"), + cl::Hidden, cl::init(-1)); + +STATISTIC(NumInstrumentedReads, "Number of instrumented reads"); +STATISTIC(NumInstrumentedWrites, "Number of instrumented writes"); + +namespace { + +/// This struct defines the shadow mapping using the rule: +/// shadow = ((mem & mask) >> Scale) ADD DynamicShadowOffset. 
+struct ShadowMapping { + ShadowMapping() { + Scale = ClMappingScale; + Granularity = ClMappingGranularity; + Mask = ~(Granularity - 1); + } + + int Scale; + int Granularity; + uint64_t Mask; // Computed as ~(Granularity-1) +}; + +static uint64_t getCtorAndDtorPriority(Triple &TargetTriple) { + return TargetTriple.isOSEmscripten() ? HeapProfEmscriptenCtorAndDtorPriority + : HeapProfCtorAndDtorPriority; +} + +struct InterestingMemoryAccess { + Value *Addr = nullptr; + bool IsWrite; + unsigned Alignment; + uint64_t TypeSize; + Value *MaybeMask = nullptr; +}; + +/// Instrument the code in module to profile heap accesses. +class HeapProfiler { +public: + HeapProfiler(Module &M) { + C = &(M.getContext()); + LongSize = M.getDataLayout().getPointerSizeInBits(); + IntptrTy = Type::getIntNTy(*C, LongSize); + } + + /// If it is an interesting memory access, populate information + /// about the access and return a InterestingMemoryAccess struct. + /// Otherwise return None. + Optional isInterestingMemoryAccess(Instruction *I); + + void instrumentMop(Instruction *I, const DataLayout &DL, + InterestingMemoryAccess &Access); + void instrumentAddress(Instruction *OrigIns, Instruction *InsertBefore, + Value *Addr, uint32_t TypeSize, bool IsWrite); + void instrumentMaskedLoadOrStore(const DataLayout &DL, Value *Mask, + Instruction *I, Value *Addr, + unsigned Alignment, uint32_t TypeSize, + bool IsWrite); + void instrumentMemIntrinsic(MemIntrinsic *MI); + Value *memToShadow(Value *Shadow, IRBuilder<> &IRB); + bool instrumentFunction(Function &F); + bool maybeInsertHeapProfInitAtFunctionEntry(Function &F); + bool insertDynamicShadowAtFunctionEntry(Function &F); + +private: + void initializeCallbacks(Module &M); + + LLVMContext *C; + int LongSize; + Type *IntptrTy; + ShadowMapping Mapping; + + // These arrays is indexed by AccessIsWrite + FunctionCallee HeapProfMemoryAccessCallback[2]; + FunctionCallee HeapProfMemoryAccessCallbackSized[2]; + + FunctionCallee HeapProfMemmove, 
HeapProfMemcpy, HeapProfMemset; + Value *DynamicShadowOffset = nullptr; +}; + +class HeapProfilerLegacyPass : public FunctionPass { +public: + static char ID; + + explicit HeapProfilerLegacyPass() : FunctionPass(ID) { + initializeHeapProfilerLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { return "HeapProfilerFunctionPass"; } + + bool runOnFunction(Function &F) override { + HeapProfiler Profiler(*F.getParent()); + return Profiler.instrumentFunction(F); + } +}; + +class ModuleHeapProfiler { +public: + ModuleHeapProfiler(Module &M) { TargetTriple = Triple(M.getTargetTriple()); } + + bool instrumentModule(Module &); + +private: + Triple TargetTriple; + ShadowMapping Mapping; + Function *HeapProfCtorFunction = nullptr; +}; + +class ModuleHeapProfilerLegacyPass : public ModulePass { +public: + static char ID; + + explicit ModuleHeapProfilerLegacyPass() : ModulePass(ID) { + initializeModuleHeapProfilerLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { return "ModuleHeapProfiler"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override {} + + bool runOnModule(Module &M) override { + ModuleHeapProfiler HeapProfiler(M); + return HeapProfiler.instrumentModule(M); + } +}; + +} // end anonymous namespace + +HeapProfilerPass::HeapProfilerPass() {} + +PreservedAnalyses HeapProfilerPass::run(Function &F, + AnalysisManager &AM) { + Module &M = *F.getParent(); + HeapProfiler Profiler(M); + if (Profiler.instrumentFunction(F)) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); + + return PreservedAnalyses::all(); +} + +ModuleHeapProfilerPass::ModuleHeapProfilerPass() {} + +PreservedAnalyses ModuleHeapProfilerPass::run(Module &M, + AnalysisManager &AM) { + ModuleHeapProfiler Profiler(M); + if (Profiler.instrumentModule(M)) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); +} + +char HeapProfilerLegacyPass::ID = 0; + 
+INITIALIZE_PASS_BEGIN(HeapProfilerLegacyPass, "heapprof", + "HeapProfiler: profile heap allocations and accesses.", + false, false) +INITIALIZE_PASS_END(HeapProfilerLegacyPass, "heapprof", + "HeapProfiler: profile heap allocations and accesses.", + false, false) + +FunctionPass *llvm::createHeapProfilerFunctionPass() { + return new HeapProfilerLegacyPass(); +} + +char ModuleHeapProfilerLegacyPass::ID = 0; + +INITIALIZE_PASS(ModuleHeapProfilerLegacyPass, "heapprof-module", + "HeapProfiler: profile heap allocations and accesses." + "ModulePass", + false, false) + +ModulePass *llvm::createModuleHeapProfilerLegacyPassPass() { + return new ModuleHeapProfilerLegacyPass(); +} + +Value *HeapProfiler::memToShadow(Value *Shadow, IRBuilder<> &IRB) { + // (Shadow & mask) >> scale + Shadow = IRB.CreateAnd(Shadow, Mapping.Mask); + Shadow = IRB.CreateLShr(Shadow, Mapping.Scale); + // (Shadow >> scale) | offset + assert(DynamicShadowOffset); + return IRB.CreateAdd(Shadow, DynamicShadowOffset); +} + +// Instrument memset/memmove/memcpy +void HeapProfiler::instrumentMemIntrinsic(MemIntrinsic *MI) { + IRBuilder<> IRB(MI); + if (isa(MI)) { + IRB.CreateCall( + isa(MI) ? HeapProfMemmove : HeapProfMemcpy, + {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()), + IRB.CreatePointerCast(MI->getOperand(1), IRB.getInt8PtrTy()), + IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)}); + } else if (isa(MI)) { + IRB.CreateCall( + HeapProfMemset, + {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()), + IRB.CreateIntCast(MI->getOperand(1), IRB.getInt32Ty(), false), + IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)}); + } + MI->eraseFromParent(); +} + +Optional +HeapProfiler::isInterestingMemoryAccess(Instruction *I) { + // Do not instrument the load fetching the dynamic shadow address. 
+ if (DynamicShadowOffset == I) + return None; + + InterestingMemoryAccess Access; + + const DataLayout &DL = I->getModule()->getDataLayout(); + if (LoadInst *LI = dyn_cast(I)) { + if (!ClInstrumentReads) + return None; + Access.IsWrite = false; + Access.TypeSize = DL.getTypeStoreSizeInBits(LI->getType()); + Access.Alignment = LI->getAlignment(); + Access.Addr = LI->getPointerOperand(); + } else if (StoreInst *SI = dyn_cast(I)) { + if (!ClInstrumentWrites) + return None; + Access.IsWrite = true; + Access.TypeSize = + DL.getTypeStoreSizeInBits(SI->getValueOperand()->getType()); + Access.Alignment = SI->getAlignment(); + Access.Addr = SI->getPointerOperand(); + } else if (AtomicRMWInst *RMW = dyn_cast(I)) { + if (!ClInstrumentAtomics) + return None; + Access.IsWrite = true; + Access.TypeSize = + DL.getTypeStoreSizeInBits(RMW->getValOperand()->getType()); + Access.Alignment = 0; + Access.Addr = RMW->getPointerOperand(); + } else if (AtomicCmpXchgInst *XCHG = dyn_cast(I)) { + if (!ClInstrumentAtomics) + return None; + Access.IsWrite = true; + Access.TypeSize = + DL.getTypeStoreSizeInBits(XCHG->getCompareOperand()->getType()); + Access.Alignment = 0; + Access.Addr = XCHG->getPointerOperand(); + } else if (auto *CI = dyn_cast(I)) { + auto *F = CI->getCalledFunction(); + if (F && (F->getIntrinsicID() == Intrinsic::masked_load || + F->getIntrinsicID() == Intrinsic::masked_store)) { + unsigned OpOffset = 0; + if (F->getIntrinsicID() == Intrinsic::masked_store) { + if (!ClInstrumentWrites) + return None; + // Masked store has an initial operand for the value. 
+ OpOffset = 1; + Access.IsWrite = true; + } else { + if (!ClInstrumentReads) + return None; + Access.IsWrite = false; + } + + auto *BasePtr = CI->getOperand(0 + OpOffset); + auto *Ty = cast(BasePtr->getType())->getElementType(); + Access.TypeSize = DL.getTypeStoreSizeInBits(Ty); + if (auto *AlignmentConstant = + dyn_cast(CI->getOperand(1 + OpOffset))) + Access.Alignment = (unsigned)AlignmentConstant->getZExtValue(); + else + Access.Alignment = 1; // No alignment guarantees. We probably got Undef + Access.MaybeMask = CI->getOperand(2 + OpOffset); + Access.Addr = BasePtr; + } + } + + if (!Access.Addr) + return None; + + // Do not instrument acesses from different address spaces; we cannot deal + // with them. + Type *PtrTy = cast(Access.Addr->getType()->getScalarType()); + if (PtrTy->getPointerAddressSpace() != 0) + return None; + + // Ignore swifterror addresses. + // swifterror memory addresses are mem2reg promoted by instruction + // selection. As such they cannot have regular uses like an instrumentation + // function and it makes no sense to track them as memory. + if (Access.Addr->isSwiftError()) + return None; + + return Access; +} + +void HeapProfiler::instrumentMaskedLoadOrStore(const DataLayout &DL, + Value *Mask, Instruction *I, + Value *Addr, unsigned Alignment, + uint32_t TypeSize, + bool IsWrite) { + auto *VTy = cast( + cast(Addr->getType())->getElementType()); + uint64_t ElemTypeSize = DL.getTypeStoreSizeInBits(VTy->getScalarType()); + unsigned Num = VTy->getNumElements(); + auto *Zero = ConstantInt::get(IntptrTy, 0); + for (unsigned Idx = 0; Idx < Num; ++Idx) { + Value *InstrumentedAddress = nullptr; + Instruction *InsertBefore = I; + if (auto *Vector = dyn_cast(Mask)) { + // dyn_cast as we might get UndefValue + if (auto *Masked = dyn_cast(Vector->getOperand(Idx))) { + if (Masked->isZero()) + // Mask is constant false, so no instrumentation needed. + continue; + // If we have a true or undef value, fall through to instrumentAddress. 
+ // with InsertBefore == I + } + } else { + IRBuilder<> IRB(I); + Value *MaskElem = IRB.CreateExtractElement(Mask, Idx); + Instruction *ThenTerm = SplitBlockAndInsertIfThen(MaskElem, I, false); + InsertBefore = ThenTerm; + } + + IRBuilder<> IRB(InsertBefore); + InstrumentedAddress = + IRB.CreateGEP(VTy, Addr, {Zero, ConstantInt::get(IntptrTy, Idx)}); + instrumentAddress(I, InsertBefore, InstrumentedAddress, ElemTypeSize, + IsWrite); + } +} + +void HeapProfiler::instrumentMop(Instruction *I, const DataLayout &DL, + InterestingMemoryAccess &Access) { + if (Access.IsWrite) + NumInstrumentedWrites++; + else + NumInstrumentedReads++; + + if (Access.MaybeMask) { + instrumentMaskedLoadOrStore(DL, Access.MaybeMask, I, Access.Addr, + Access.Alignment, Access.TypeSize, + Access.IsWrite); + } else { + // Since the access counts will be accumulated across the entire allocation, + // we only update the shadow access count for the first location and thus + // don't need to worry about alignment and type size. + instrumentAddress(I, I, Access.Addr, Access.TypeSize, Access.IsWrite); + } +} + +void HeapProfiler::instrumentAddress(Instruction *OrigIns, + Instruction *InsertBefore, Value *Addr, + uint32_t TypeSize, bool IsWrite) { + IRBuilder<> IRB(InsertBefore); + Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy); + + if (ClUseCalls) { + IRB.CreateCall(HeapProfMemoryAccessCallback[IsWrite], AddrLong); + return; + } + + // Create an inline sequence to compute shadow location, and increment the + // value by one. 
+ Type *ShadowTy = Type::getInt64Ty(*C); + Type *ShadowPtrTy = PointerType::get(ShadowTy, 0); + Value *ShadowPtr = memToShadow(AddrLong, IRB); + Value *ShadowAddr = IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy); + Value *ShadowValue = IRB.CreateLoad(ShadowTy, ShadowAddr); + Value *Inc = ConstantInt::get(Type::getInt64Ty(*C), 1); + ShadowValue = IRB.CreateAdd(ShadowValue, Inc); + IRB.CreateStore(ShadowValue, ShadowAddr); +} + +bool ModuleHeapProfiler::instrumentModule(Module &M) { + // Create a module constructor. + std::string HeapProfVersion = std::to_string(LLVM_HEAP_PROFILER_VERSION); + std::string VersionCheckName = + ClInsertVersionCheck ? (HeapProfVersionCheckNamePrefix + HeapProfVersion) + : ""; + std::tie(HeapProfCtorFunction, std::ignore) = + createSanitizerCtorAndInitFunctions(M, HeapProfModuleCtorName, + HeapProfInitName, /*InitArgTypes=*/{}, + /*InitArgs=*/{}, VersionCheckName); + + const uint64_t Priority = getCtorAndDtorPriority(TargetTriple); + appendToGlobalCtors(M, HeapProfCtorFunction, Priority); + + return true; +} + +void HeapProfiler::initializeCallbacks(Module &M) { + IRBuilder<> IRB(*C); + + for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) { + const std::string TypeStr = AccessIsWrite ? 
"store" : "load"; + + SmallVector Args2 = {IntptrTy, IntptrTy}; + SmallVector Args1{1, IntptrTy}; + HeapProfMemoryAccessCallbackSized[AccessIsWrite] = + M.getOrInsertFunction(ClMemoryAccessCallbackPrefix + TypeStr + "N", + FunctionType::get(IRB.getVoidTy(), Args2, false)); + + HeapProfMemoryAccessCallback[AccessIsWrite] = + M.getOrInsertFunction(ClMemoryAccessCallbackPrefix + TypeStr, + FunctionType::get(IRB.getVoidTy(), Args1, false)); + } + HeapProfMemmove = M.getOrInsertFunction( + ClMemoryAccessCallbackPrefix + "memmove", IRB.getInt8PtrTy(), + IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy); + HeapProfMemcpy = M.getOrInsertFunction( + ClMemoryAccessCallbackPrefix + "memcpy", IRB.getInt8PtrTy(), + IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy); + HeapProfMemset = M.getOrInsertFunction( + ClMemoryAccessCallbackPrefix + "memset", IRB.getInt8PtrTy(), + IRB.getInt8PtrTy(), IRB.getInt32Ty(), IntptrTy); +} + +bool HeapProfiler::maybeInsertHeapProfInitAtFunctionEntry(Function &F) { + // For each NSObject descendant having a +load method, this method is invoked + // by the ObjC runtime before any of the static constructors is called. + // Therefore we need to instrument such methods with a call to __heapprof_init + // at the beginning in order to initialize our runtime before any access to + // the shadow memory. + // We cannot just ignore these methods, because they may call other + // instrumented functions. 
+ if (F.getName().find(" load]") != std::string::npos) { + FunctionCallee HeapProfInitFunction = + declareSanitizerInitFunction(*F.getParent(), HeapProfInitName, {}); + IRBuilder<> IRB(&F.front(), F.front().begin()); + IRB.CreateCall(HeapProfInitFunction, {}); + return true; + } + return false; +} + +bool HeapProfiler::insertDynamicShadowAtFunctionEntry(Function &F) { + IRBuilder<> IRB(&F.front().front()); + Value *GlobalDynamicAddress = F.getParent()->getOrInsertGlobal( + HeapProfShadowMemoryDynamicAddress, IntptrTy); + DynamicShadowOffset = IRB.CreateLoad(IntptrTy, GlobalDynamicAddress); + return true; +} + +bool HeapProfiler::instrumentFunction(Function &F) { + if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) + return false; + if (ClDebugFunc == F.getName()) + return false; + if (F.getName().startswith("__heapprof_")) + return false; + + bool FunctionModified = false; + + // If needed, insert __heapprof_init. + // This function needs to be called even if the function body is not + // instrumented. + if (maybeInsertHeapProfInitAtFunctionEntry(F)) + FunctionModified = true; + + LLVM_DEBUG(dbgs() << "HEAPPROF instrumenting:\n" << F << "\n"); + + initializeCallbacks(*F.getParent()); + + FunctionModified |= insertDynamicShadowAtFunctionEntry(F); + + SmallVector ToInstrument; + + // Fill the set of memory operations to instrument. 
+ for (auto &BB : F) { + for (auto &Inst : BB) { + if (isInterestingMemoryAccess(&Inst) || isa(Inst)) + ToInstrument.push_back(&Inst); + } + } + + int NumInstrumented = 0; + for (auto *Inst : ToInstrument) { + if (ClDebugMin < 0 || ClDebugMax < 0 || + (NumInstrumented >= ClDebugMin && NumInstrumented <= ClDebugMax)) { + Optional Access = + isInterestingMemoryAccess(Inst); + if (Access) + instrumentMop(Inst, F.getParent()->getDataLayout(), *Access); + else + instrumentMemIntrinsic(cast(Inst)); + } + NumInstrumented++; + } + + if (NumInstrumented > 0) + FunctionModified = true; + + LLVM_DEBUG(dbgs() << "HEAPPROF done instrumenting: " << FunctionModified + << " " << F << "\n"); + + return FunctionModified; +} diff --git a/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp b/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp index eda38e7da4f4d..5cf3c2e3e11b3 100644 --- a/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp @@ -105,6 +105,8 @@ Comdat *llvm::GetOrCreateFunctionComdat(Function &F, Triple &T, void llvm::initializeInstrumentation(PassRegistry &Registry) { initializeAddressSanitizerLegacyPassPass(Registry); initializeModuleAddressSanitizerLegacyPassPass(Registry); + initializeHeapProfilerLegacyPassPass(Registry); + initializeModuleHeapProfilerLegacyPassPass(Registry); initializeBoundsCheckingLegacyPassPass(Registry); initializeControlHeightReductionLegacyPassPass(Registry); initializeGCOVProfilerLegacyPassPass(Registry); diff --git a/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp b/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp index fa97a194ea2b5..6f785687b5045 100644 --- a/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp +++ b/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp @@ -282,8 +282,10 @@ static bool rewrite(Function &F) { // Note: There are many more sources of documented UB, but this pass only // attempts to find UB triggered by propagation of 
poison. - if (Value *Op = const_cast(getGuaranteedNonPoisonOp(&I))) - CreateAssertNot(B, getPoisonFor(ValToPoison, Op)); + SmallPtrSet NonPoisonOps; + getGuaranteedNonPoisonOps(&I, NonPoisonOps); + for (const Value *Op : NonPoisonOps) + CreateAssertNot(B, getPoisonFor(ValToPoison, const_cast(Op))); if (LocalCheck) if (auto *RI = dyn_cast(&I)) diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt index 9d9712bb0da3e..89173414c16b1 100644 --- a/llvm/lib/Transforms/Scalar/CMakeLists.txt +++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt @@ -4,7 +4,6 @@ add_llvm_component_library(LLVMScalarOpts BDCE.cpp CallSiteSplitting.cpp ConstantHoisting.cpp - ConstantProp.cpp CorrelatedValuePropagation.cpp DCE.cpp DeadStoreElimination.cpp diff --git a/llvm/lib/Transforms/Scalar/ConstantProp.cpp b/llvm/lib/Transforms/Scalar/ConstantProp.cpp deleted file mode 100644 index 73bf1d521b1d0..0000000000000 --- a/llvm/lib/Transforms/Scalar/ConstantProp.cpp +++ /dev/null @@ -1,121 +0,0 @@ -//===- ConstantProp.cpp - Code to perform Simple Constant Propagation -----===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements constant propagation and merging: -// -// Specifically, this: -// * Converts instructions like "add int 1, 2" into 3 -// -// Notice that: -// * This pass has a habit of making definitions be dead. It is a good idea -// to run a DIE pass sometime after running this pass. 
-// -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/ConstantFolding.h" -#include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/IR/Constant.h" -#include "llvm/IR/InstIterator.h" -#include "llvm/IR/Instruction.h" -#include "llvm/InitializePasses.h" -#include "llvm/Pass.h" -#include "llvm/Support/DebugCounter.h" -#include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/Local.h" -using namespace llvm; - -#define DEBUG_TYPE "constprop" - -STATISTIC(NumInstKilled, "Number of instructions killed"); -DEBUG_COUNTER(CPCounter, "constprop-transform", - "Controls which instructions are killed"); - -namespace { - struct ConstantPropagation : public FunctionPass { - static char ID; // Pass identification, replacement for typeid - ConstantPropagation() : FunctionPass(ID) { - initializeConstantPropagationPass(*PassRegistry::getPassRegistry()); - } - - bool runOnFunction(Function &F) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired(); - } - }; -} - -char ConstantPropagation::ID = 0; -INITIALIZE_PASS_BEGIN(ConstantPropagation, "constprop", - "Simple constant propagation", false, false) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_PASS_END(ConstantPropagation, "constprop", - "Simple constant propagation", false, false) - -FunctionPass *llvm::createConstantPropagationPass() { - return new ConstantPropagation(); -} - -bool ConstantPropagation::runOnFunction(Function &F) { - if (skipFunction(F)) - return false; - - // Initialize the worklist to all of the instructions ready to process... - SmallPtrSet WorkList; - // The SmallVector of WorkList ensures that we do iteration at stable order. - // We use two containers rather than one SetVector, since remove is - // linear-time, and we don't care enough to remove from Vec. 
- SmallVector WorkListVec; - for (Instruction &I : instructions(&F)) { - WorkList.insert(&I); - WorkListVec.push_back(&I); - } - - bool Changed = false; - const DataLayout &DL = F.getParent()->getDataLayout(); - TargetLibraryInfo *TLI = - &getAnalysis().getTLI(F); - - while (!WorkList.empty()) { - SmallVector NewWorkListVec; - for (auto *I : WorkListVec) { - WorkList.erase(I); // Remove element from the worklist... - - if (!I->use_empty()) // Don't muck with dead instructions... - if (Constant *C = ConstantFoldInstruction(I, DL, TLI)) { - if (!DebugCounter::shouldExecute(CPCounter)) - continue; - - // Add all of the users of this instruction to the worklist, they might - // be constant propagatable now... - for (User *U : I->users()) { - // If user not in the set, then add it to the vector. - if (WorkList.insert(cast(U)).second) - NewWorkListVec.push_back(cast(U)); - } - - // Replace all of the uses of a variable with uses of the constant. - I->replaceAllUsesWith(C); - - if (isInstructionTriviallyDead(I, TLI)) { - I->eraseFromParent(); - ++NumInstKilled; - } - - // We made a change to the function... 
- Changed = true; - } - } - WorkListVec = std::move(NewWorkListVec); - } - return Changed; -} diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index e2c428c75aa54..0c7992031eb51 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -87,6 +87,8 @@ STATISTIC(NumModifiedStores, "Number of stores modified"); STATISTIC(NumCFGChecks, "Number of stores modified"); STATISTIC(NumCFGTries, "Number of stores modified"); STATISTIC(NumCFGSuccess, "Number of stores modified"); +STATISTIC(NumDomMemDefChecks, + "Number iterations check for reads in getDomMemoryDef"); DEBUG_COUNTER(MemorySSACounter, "dse-memoryssa", "Controls which MemoryDefs are eliminated."); @@ -109,12 +111,29 @@ static cl::opt MemorySSAScanLimit("dse-memoryssa-scanlimit", cl::init(150), cl::Hidden, cl::desc("The number of memory instructions to scan for " "dead store elimination (default = 100)")); +static cl::opt MemorySSAUpwardsStepLimit( + "dse-memoryssa-walklimit", cl::init(70), cl::Hidden, + cl::desc("The maximum number of steps while walking upwards to find " + "MemoryDefs that may be killed (default = 70)")); static cl::opt MemorySSADefsPerBlockLimit( "dse-memoryssa-defs-per-block-limit", cl::init(5000), cl::Hidden, cl::desc("The number of MemoryDefs we consider as candidates to eliminated " "other stores per basic block (default = 5000)")); +static cl::opt MemorySSASameBBStepCost( + "dse-memoryssa-samebb-cost", cl::init(1), cl::Hidden, + cl::desc( + "The cost of a step in the same basic block as the killing MemoryDef" + "(default = 1)")); + +static cl::opt + MemorySSAOtherBBStepCost("dse-memoryssa-otherbb-cost", cl::init(5), + cl::Hidden, + cl::desc("The cost of a step in a different basic " + "block than the killing MemoryDef" + "(default = 5)")); + static cl::opt MemorySSAPathCheckLimit( "dse-memoryssa-path-check-limit", cl::init(50), cl::Hidden, cl::desc("The 
maximum number of blocks to check when trying to prove that " @@ -1440,16 +1459,17 @@ namespace { // in between both MemoryDefs. A bit more concretely: // // For all MemoryDefs StartDef: -// 1. Get the next dominating clobbering MemoryDef (DomAccess) by walking +// 1. Get the next dominating clobbering MemoryDef (EarlierAccess) by walking // upwards. -// 2. Check that there are no reads between DomAccess and the StartDef by -// checking all uses starting at DomAccess and walking until we see StartDef. -// 3. For each found DomDef, check that: -// 1. There are no barrier instructions between DomDef and StartDef (like +// 2. Check that there are no reads between EarlierAccess and the StartDef by +// checking all uses starting at EarlierAccess and walking until we see +// StartDef. +// 3. For each found EarlierDef, check that: +// 1. There are no barrier instructions between EarlierDef and StartDef (like // throws or stores with ordering constraints). -// 2. StartDef is executed whenever DomDef is executed. -// 3. StartDef completely overwrites DomDef. -// 4. Erase DomDef from the function and MemorySSA. +// 2. StartDef is executed whenever EarlierDef is executed. +// 3. StartDef completely overwrites EarlierDef. +// 4. Erase EarlierDef from the function and MemorySSA. // Returns true if \p M is an intrisnic that does not read or write memory. bool isNoopIntrinsic(MemoryUseOrDef *M) { @@ -1527,10 +1547,11 @@ struct DSEState { SmallPtrSet SkipStores; // Keep track of all of the objects that are invisible to the caller before // the function returns. - SmallPtrSet InvisibleToCallerBeforeRet; + // SmallPtrSet InvisibleToCallerBeforeRet; + DenseMap InvisibleToCallerBeforeRet; // Keep track of all of the objects that are invisible to the caller after // the function returns. - SmallPtrSet InvisibleToCallerAfterRet; + DenseMap InvisibleToCallerAfterRet; // Keep track of blocks with throwing instructions not modeled in MemorySSA. 
SmallPtrSet ThrowingBlocks; // Post-order numbers for each basic block. Used to figure out if memory @@ -1541,6 +1562,18 @@ struct DSEState { /// basic block. DenseMap IOLs; + struct CheckCache { + SmallPtrSet KnownNoReads; + SmallPtrSet KnownReads; + + bool isKnownNoRead(MemoryAccess *A) const { + return KnownNoReads.find(A) != KnownNoReads.end(); + } + bool isKnownRead(MemoryAccess *A) const { + return KnownReads.find(A) != KnownReads.end(); + } + }; + DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT, PostDominatorTree &PDT, const TargetLibraryInfo &TLI) : F(F), AA(AA), BatchAA(AA), MSSA(MSSA), DT(DT), PDT(PDT), TLI(TLI), @@ -1564,26 +1597,6 @@ struct DSEState { if (MD && State.MemDefs.size() < MemorySSADefsPerBlockLimit && (State.getLocForWriteEx(&I) || State.isMemTerminatorInst(&I))) State.MemDefs.push_back(MD); - - // Track whether alloca and alloca-like objects are visible in the - // caller before and after the function returns. Alloca objects are - // invalid in the caller, so they are neither visible before or after - // the function returns. - if (isa(&I)) { - State.InvisibleToCallerBeforeRet.insert(&I); - State.InvisibleToCallerAfterRet.insert(&I); - } - - // For alloca-like objects we need to check if they are captured before - // the function returns and if the return might capture the object. - if (isAllocLikeFn(&I, &TLI)) { - bool CapturesBeforeRet = PointerMayBeCaptured(&I, false, true); - if (!CapturesBeforeRet) { - State.InvisibleToCallerBeforeRet.insert(&I); - if (!PointerMayBeCaptured(&I, true, false)) - State.InvisibleToCallerAfterRet.insert(&I); - } - } } } @@ -1593,13 +1606,45 @@ struct DSEState { if (AI.hasPassPointeeByValueCopyAttr()) { // For byval, the caller doesn't know the address of the allocation. 
if (AI.hasByValAttr()) - State.InvisibleToCallerBeforeRet.insert(&AI); - State.InvisibleToCallerAfterRet.insert(&AI); + State.InvisibleToCallerBeforeRet.insert({&AI, true}); + State.InvisibleToCallerAfterRet.insert({&AI, true}); } return State; } + bool isInvisibleToCallerAfterRet(const Value *V) { + if (isa(V)) + return true; + auto I = InvisibleToCallerAfterRet.insert({V, false}); + if (I.second) { + if (!isInvisibleToCallerBeforeRet(V)) { + I.first->second = false; + } else { + auto *Inst = dyn_cast(V); + if (Inst && isAllocLikeFn(Inst, &TLI)) + I.first->second = !PointerMayBeCaptured(V, true, false); + } + } + return I.first->second; + } + + bool isInvisibleToCallerBeforeRet(const Value *V) { + if (isa(V)) + return true; + auto I = InvisibleToCallerBeforeRet.insert({V, false}); + if (I.second) { + auto *Inst = dyn_cast(V); + if (Inst && isAllocLikeFn(Inst, &TLI)) + // NOTE: This could be made more precise by PointerMayBeCapturedBefore + // with the killing MemoryDef. But we refrain from doing so for now to + // limit compile-time and this does not cause any changes to the number + // of stores removed on a large test set in practice. + I.first->second = !PointerMayBeCaptured(V, false, true); + } + return I.first->second; + } + Optional getLocForWriteEx(Instruction *I) const { if (!I->mayWriteToMemory()) return None; @@ -1749,11 +1794,11 @@ struct DSEState { if (CB->onlyAccessesInaccessibleMemory()) return false; - ModRefInfo MR = BatchAA.getModRefInfo(UseInst, DefLoc); - // If necessary, perform additional analysis. - if (isRefSet(MR)) - MR = AA.callCapturesBefore(UseInst, DefLoc, &DT); - return isRefSet(MR); + // NOTE: For calls, the number of stores removed could be slightly improved + // by using AA.callCapturesBefore(UseInst, DefLoc, &DT), but that showed to + // be expensive compared to the benefits in practice. For now, avoid more + // expensive analysis to limit compile-time. 
+ return isRefSet(BatchAA.getModRefInfo(UseInst, DefLoc)); } // Find a MemoryDef writing to \p DefLoc and dominating \p Current, with no @@ -1764,14 +1809,14 @@ struct DSEState { // (read). Optional getDomMemoryDef(MemoryDef *KillingDef, MemoryAccess *Current, - MemoryLocation DefLoc, bool DefVisibleToCallerBeforeRet, - bool DefVisibleToCallerAfterRet, unsigned &ScanLimit) { - if (ScanLimit == 0) { + MemoryLocation DefLoc, const Value *DefUO, CheckCache &Cache, + unsigned &ScanLimit, unsigned &WalkerStepLimit) { + if (ScanLimit == 0 || WalkerStepLimit == 0) { LLVM_DEBUG(dbgs() << "\n ... hit scan limit\n"); return None; } - MemoryAccess *DomAccess; + MemoryAccess *StartAccess = Current; bool StepAgain; LLVM_DEBUG(dbgs() << " trying to get dominating access for " << *Current << "\n"); @@ -1782,39 +1827,42 @@ struct DSEState { if (MSSA.isLiveOnEntryDef(Current)) return None; - if (isa(Current)) { - DomAccess = Current; - break; - } - MemoryUseOrDef *CurrentUD = cast(Current); - // Look for access that clobber DefLoc. - DomAccess = MSSA.getSkipSelfWalker()->getClobberingMemoryAccess(CurrentUD, - DefLoc); - if (MSSA.isLiveOnEntryDef(DomAccess)) + // Cost of a step. Accesses in the same block are more likely to be valid + // candidates for elimination, hence consider them cheaper. + unsigned StepCost = KillingDef->getBlock() == Current->getBlock() + ? MemorySSASameBBStepCost + : MemorySSAOtherBBStepCost; + if (WalkerStepLimit <= StepCost) return None; + WalkerStepLimit -= StepCost; - if (isa(DomAccess)) + if (isa(Current)) break; - // Check if we can skip DomDef for DSE. - MemoryDef *DomDef = dyn_cast(DomAccess); - if (DomDef && canSkipDef(DomDef, DefVisibleToCallerBeforeRet)) { + // Check if we can skip EarlierDef for DSE. 
+ MemoryDef *CurrentDef = dyn_cast(Current); + if (CurrentDef && + canSkipDef(CurrentDef, !isInvisibleToCallerBeforeRet(DefUO))) { StepAgain = true; - Current = DomDef->getDefiningAccess(); + Current = CurrentDef->getDefiningAccess(); } - } while (StepAgain); + MemoryAccess *EarlierAccess = Current; // Accesses to objects accessible after the function returns can only be // eliminated if the access is killed along all paths to the exit. Collect // the blocks with killing (=completely overwriting MemoryDefs) and check if - // they cover all paths from DomAccess to any function exit. + // they cover all paths from EarlierAccess to any function exit. SmallPtrSet KillingDefs; KillingDefs.insert(KillingDef->getMemoryInst()); + Instruction *EarlierMemInst = + isa(EarlierAccess) + ? cast(EarlierAccess)->getMemoryInst() + : nullptr; LLVM_DEBUG({ - dbgs() << " Checking for reads of " << *DomAccess; - if (isa(DomAccess)) - dbgs() << " (" << *cast(DomAccess)->getMemoryInst() << ")\n"; + dbgs() << " Checking for reads of " << *EarlierAccess; + if (EarlierMemInst) + dbgs() << " (" << *EarlierMemInst << ")\n"; else dbgs() << ")\n"; }); @@ -1824,9 +1872,14 @@ struct DSEState { for (Use &U : Acc->uses()) WorkList.insert(cast(U.getUser())); }; - PushMemUses(DomAccess); - - // Check if DomDef may be read. + PushMemUses(EarlierAccess); + + // Optimistically collect all accesses for reads. If we do not find any + // read clobbers, add them to the cache. + SmallPtrSet KnownNoReads; + if (!EarlierMemInst || !EarlierMemInst->mayReadFromMemory()) + KnownNoReads.insert(EarlierAccess); + // Check if EarlierDef may be read. for (unsigned I = 0; I < WorkList.size(); I++) { MemoryAccess *UseAccess = WorkList[I]; @@ -1837,6 +1890,20 @@ struct DSEState { return None; } --ScanLimit; + NumDomMemDefChecks++; + + // Check if we already visited this access. + if (Cache.isKnownNoRead(UseAccess)) { + LLVM_DEBUG(dbgs() << " ... 
skip, discovered that " << *UseAccess + << " is safe earlier.\n"); + continue; + } + if (Cache.isKnownRead(UseAccess)) { + LLVM_DEBUG(dbgs() << " ... bail out, discovered that " << *UseAccess + << " has a read-clobber earlier.\n"); + return None; + } + KnownNoReads.insert(UseAccess); if (isa(UseAccess)) { if (any_of(KillingDefs, [this, UseAccess](Instruction *KI) { @@ -1876,14 +1943,17 @@ struct DSEState { // original MD. Stop walk. if (isReadClobber(DefLoc, UseInst)) { LLVM_DEBUG(dbgs() << " ... found read clobber\n"); + Cache.KnownReads.insert(UseAccess); + Cache.KnownReads.insert(StartAccess); + Cache.KnownReads.insert(EarlierAccess); return None; } - // For the KillingDef and DomAccess we only have to check if it reads the - // memory location. + // For the KillingDef and EarlierAccess we only have to check if it reads + // the memory location. // TODO: It would probably be better to check for self-reads before // calling the function. - if (KillingDef == UseAccess || DomAccess == UseAccess) { + if (KillingDef == UseAccess || EarlierAccess == UseAccess) { LLVM_DEBUG(dbgs() << " ... skipping killing def/dom access\n"); continue; } @@ -1892,7 +1962,7 @@ struct DSEState { // the original location. Otherwise we have to check uses of *all* // MemoryDefs we discover, including non-aliasing ones. Otherwise we might // miss cases like the following - // 1 = Def(LoE) ; <----- DomDef stores [0,1] + // 1 = Def(LoE) ; <----- EarlierDef stores [0,1] // 2 = Def(1) ; (2, 1) = NoAlias, stores [2,3] // Use(2) ; MayAlias 2 *and* 1, loads [0, 3]. 
// (The Use points to the *first* Def it may alias) @@ -1900,10 +1970,11 @@ struct DSEState { // stores [0,1] if (MemoryDef *UseDef = dyn_cast(UseAccess)) { if (isCompleteOverwrite(DefLoc, UseInst)) { - if (DefVisibleToCallerAfterRet && UseAccess != DomAccess) { + if (!isInvisibleToCallerAfterRet(DefUO) && + UseAccess != EarlierAccess) { BasicBlock *MaybeKillingBlock = UseInst->getParent(); if (PostOrderNumbers.find(MaybeKillingBlock)->second < - PostOrderNumbers.find(DomAccess->getBlock())->second) { + PostOrderNumbers.find(EarlierAccess->getBlock())->second) { LLVM_DEBUG(dbgs() << " ... found killing def " << *UseInst << "\n"); @@ -1916,9 +1987,9 @@ struct DSEState { } // For accesses to locations visible after the function returns, make sure - // that the location is killed (=overwritten) along all paths from DomAccess - // to the exit. - if (DefVisibleToCallerAfterRet) { + // that the location is killed (=overwritten) along all paths from + // EarlierAccess to the exit. + if (!isInvisibleToCallerAfterRet(DefUO)) { SmallPtrSet KillingBlocks; for (Instruction *KD : KillingDefs) KillingBlocks.insert(KD->getParent()); @@ -1935,23 +2006,19 @@ struct DSEState { } // If CommonPred is in the set of killing blocks, just check if it - // post-dominates DomAccess. + // post-dominates EarlierAccess. if (KillingBlocks.count(CommonPred)) { - if (PDT.dominates(CommonPred, DomAccess->getBlock())) - return {DomAccess}; + if (PDT.dominates(CommonPred, EarlierAccess->getBlock())) + return {EarlierAccess}; return None; } - // If the common post-dominator does not post-dominate DomAccess, there - // is a path from DomAccess to an exit not going through a killing block. - if (PDT.dominates(CommonPred, DomAccess->getBlock())) { + // If the common post-dominator does not post-dominate EarlierAccess, + // there is a path from EarlierAccess to an exit not going through a + // killing block. 
+ if (PDT.dominates(CommonPred, EarlierAccess->getBlock())) { SetVector WorkList; - // DomAccess's post-order number provides an upper bound of the blocks - // on a path starting at DomAccess. - unsigned UpperBound = - PostOrderNumbers.find(DomAccess->getBlock())->second; - // If CommonPred is null, there are multiple exits from the function. // They all have to be added to the worklist. if (CommonPred) @@ -1962,24 +2029,20 @@ struct DSEState { NumCFGTries++; // Check if all paths starting from an exit node go through one of the - // killing blocks before reaching DomAccess. + // killing blocks before reaching EarlierAccess. for (unsigned I = 0; I < WorkList.size(); I++) { NumCFGChecks++; BasicBlock *Current = WorkList[I]; if (KillingBlocks.count(Current)) continue; - if (Current == DomAccess->getBlock()) + if (Current == EarlierAccess->getBlock()) return None; - // DomAccess is reachable from the entry, so we don't have to explore - // unreachable blocks further. + // EarlierAccess is reachable from the entry, so we don't have to + // explore unreachable blocks further. if (!DT.isReachableFromEntry(Current)) continue; - unsigned CPO = PostOrderNumbers.find(Current)->second; - // Current block is not on a path starting at DomAccess. - if (CPO > UpperBound) - continue; for (BasicBlock *Pred : predecessors(Current)) WorkList.insert(Pred); @@ -1987,13 +2050,15 @@ struct DSEState { return None; } NumCFGSuccess++; - return {DomAccess}; + return {EarlierAccess}; } return None; } - // No aliasing MemoryUses of DomAccess found, DomAccess is potentially dead. - return {DomAccess}; + // No aliasing MemoryUses of EarlierAccess found, EarlierAccess is + // potentially dead. + Cache.KnownNoReads.insert(KnownNoReads.begin(), KnownNoReads.end()); + return {EarlierAccess}; } // Delete dead memory defs @@ -2038,11 +2103,11 @@ struct DSEState { // checks extra maythrows (those that aren't MemoryDef's). MemoryDef that may // throw are handled during the walk from one def to the next. 
bool mayThrowBetween(Instruction *SI, Instruction *NI, - const Value *SILocUnd) const { + const Value *SILocUnd) { // First see if we can ignore it by using the fact that SI is an // alloca/alloca like object that is not visible to the caller during // execution of the function. - if (SILocUnd && InvisibleToCallerBeforeRet.count(SILocUnd)) + if (SILocUnd && isInvisibleToCallerBeforeRet(SILocUnd)) return false; if (SI->getParent() == NI->getParent()) @@ -2055,10 +2120,10 @@ struct DSEState { // * A memory instruction that may throw and \p SI accesses a non-stack // object. // * Atomic stores stronger that monotonic. - bool isDSEBarrier(const Value *SILocUnd, Instruction *NI) const { + bool isDSEBarrier(const Value *SILocUnd, Instruction *NI) { // If NI may throw it acts as a barrier, unless we are to an alloca/alloca // like object that does not escape. - if (NI->mayThrow() && !InvisibleToCallerBeforeRet.count(SILocUnd)) + if (NI->mayThrow() && !isInvisibleToCallerBeforeRet(SILocUnd)) return true; // If NI is an atomic load/store stronger than monotonic, do not try to @@ -2096,17 +2161,17 @@ struct DSEState { auto DefLoc = getLocForWriteEx(DefI); if (!DefLoc) continue; - getUnderlyingObjects(DefLoc->Ptr, Pointers); - bool CanKill = true; - for (const Value *Pointer : Pointers) { - if (!InvisibleToCallerAfterRet.count(Pointer)) { - CanKill = false; - break; - } - } + // NOTE: Currently eliminating writes at the end of a function is limited + // to MemoryDefs with a single underlying object, to save compile-time. In + // practice it appears the case with multiple underlying objects is very + // uncommon. If it turns out to be important, we can use + // getUnderlyingObjects here instead. + const Value *UO = getUnderlyingObject(DefLoc->Ptr); + if (!UO || !isInvisibleToCallerAfterRet(UO)) + continue; - if (CanKill && isWriteAtEndOfFunction(Def)) { + if (isWriteAtEndOfFunction(Def)) { // See through pointer-to-pointer bitcasts LLVM_DEBUG(dbgs() << " ... 
MemoryDef is not accessed until the end " "of the function\n"); @@ -2189,50 +2254,40 @@ bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA, continue; } - Instruction *DefObj = - const_cast(dyn_cast(SILocUnd)); - bool DefVisibleToCallerBeforeRet = - !State.InvisibleToCallerBeforeRet.count(SILocUnd); - bool DefVisibleToCallerAfterRet = - !State.InvisibleToCallerAfterRet.count(SILocUnd); - if (DefObj && isAllocLikeFn(DefObj, &TLI)) { - if (DefVisibleToCallerBeforeRet) - DefVisibleToCallerBeforeRet = - PointerMayBeCapturedBefore(DefObj, false, true, SI, &DT); - } - MemoryAccess *Current = KillingDef; LLVM_DEBUG(dbgs() << "Trying to eliminate MemoryDefs killed by " << *KillingDef << " (" << *SI << ")\n"); unsigned ScanLimit = MemorySSAScanLimit; + unsigned WalkerStepLimit = MemorySSAUpwardsStepLimit; // Worklist of MemoryAccesses that may be killed by KillingDef. SetVector ToCheck; ToCheck.insert(KillingDef->getDefiningAccess()); + DSEState::CheckCache Cache; // Check if MemoryAccesses in the worklist are killed by KillingDef. for (unsigned I = 0; I < ToCheck.size(); I++) { Current = ToCheck[I]; if (State.SkipStores.count(Current)) continue; - Optional Next = State.getDomMemoryDef( - KillingDef, Current, SILoc, DefVisibleToCallerBeforeRet, - DefVisibleToCallerAfterRet, ScanLimit); + Optional Next = + State.getDomMemoryDef(KillingDef, Current, SILoc, SILocUnd, Cache, + ScanLimit, WalkerStepLimit); if (!Next) { LLVM_DEBUG(dbgs() << " finished walk\n"); continue; } - MemoryAccess *DomAccess = *Next; - LLVM_DEBUG(dbgs() << " Checking if we can kill " << *DomAccess); - if (isa(DomAccess)) { + MemoryAccess *EarlierAccess = *Next; + LLVM_DEBUG(dbgs() << " Checking if we can kill " << *EarlierAccess); + if (isa(EarlierAccess)) { LLVM_DEBUG(dbgs() << "\n ... 
adding incoming values to worklist\n"); - for (Value *V : cast(DomAccess)->incoming_values()) { + for (Value *V : cast(EarlierAccess)->incoming_values()) { MemoryAccess *IncomingAccess = cast(V); BasicBlock *IncomingBlock = IncomingAccess->getBlock(); - BasicBlock *PhiBlock = DomAccess->getBlock(); + BasicBlock *PhiBlock = EarlierAccess->getBlock(); // We only consider incoming MemoryAccesses that come before the // MemoryPhi. Otherwise we could discover candidates that do not @@ -2243,7 +2298,7 @@ bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA, } continue; } - MemoryDef *NextDef = dyn_cast(DomAccess); + MemoryDef *NextDef = dyn_cast(EarlierAccess); Instruction *NI = NextDef->getMemoryInst(); LLVM_DEBUG(dbgs() << " (" << *NI << ")\n"); diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp index 8bc35d5228df8..55b9dd7482cc3 100644 --- a/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -38,7 +38,6 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeAlignmentFromAssumptionsPass(Registry); initializeCallSiteSplittingLegacyPassPass(Registry); initializeConstantHoistingLegacyPassPass(Registry); - initializeConstantPropagationPass(Registry); initializeCorrelatedValuePropagationPass(Registry); initializeDCELegacyPassPass(Registry); initializeDeadInstEliminationPass(Registry); @@ -248,10 +247,6 @@ void LLVMAddTailCallEliminationPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createTailCallEliminationPass()); } -void LLVMAddConstantPropagationPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createConstantPropagationPass()); -} - void LLVMAddDemoteMemoryToRegisterPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createDemoteRegisterToMemoryPass()); } diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp index 851bd79cd6d83..3bc0cbde8c19d 100644 --- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp +++ 
b/llvm/lib/Transforms/Scalar/Scalarizer.cpp @@ -941,13 +941,13 @@ bool ScalarizerVisitor::finish() { for (unsigned I = 0; I < Count; ++I) Res = Builder.CreateInsertElement(Res, CV[I], Builder.getInt32(I), Op->getName() + ".upto" + Twine(I)); + Res->takeName(Op); } else { assert(CV.size() == 1 && Op->getType() == CV[0]->getType()); Res = CV[0]; if (Op == Res) continue; } - Res->takeName(Op); Op->replaceAllUsesWith(Res); } PotentiallyDeadInstrs.emplace_back(Op); diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index 78a52c992e9f4..34eb9e1b8124f 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -693,8 +693,6 @@ Value *LibCallSimplifier::optimizeStringLength(CallInst *CI, IRBuilderBase &B, Offset); } } - - return nullptr; } // strlen(x?"foo":"bars") --> x ? 3 : 4 diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index 26ffb044a35c2..770e6e3037cfa 100644 --- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -1027,8 +1027,8 @@ bool Vectorizer::vectorizeStoreChain( unsigned EltSzInBytes = Sz / 8; unsigned SzInBytes = EltSzInBytes * ChainSize; - VectorType *VecTy; - VectorType *VecStoreTy = dyn_cast(StoreTy); + FixedVectorType *VecTy; + auto *VecStoreTy = dyn_cast(StoreTy); if (VecStoreTy) VecTy = FixedVectorType::get(StoreTy->getScalarType(), Chain.size() * VecStoreTy->getNumElements()); @@ -1180,7 +1180,7 @@ bool Vectorizer::vectorizeLoadChain( unsigned EltSzInBytes = Sz / 8; unsigned SzInBytes = EltSzInBytes * ChainSize; VectorType *VecTy; - VectorType *VecLoadTy = dyn_cast(LoadTy); + auto *VecLoadTy = dyn_cast(LoadTy); if (VecLoadTy) VecTy = FixedVectorType::get(LoadTy->getScalarType(), Chain.size() * VecLoadTy->getNumElements()); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp 
b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index e2a95d6f67a86..157620c30b98f 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -919,7 +919,10 @@ bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { } bool LoopVectorizationLegality::blockCanBePredicated( - BasicBlock *BB, SmallPtrSetImpl &SafePtrs, bool PreserveGuards) { + BasicBlock *BB, SmallPtrSetImpl &SafePtrs, + SmallPtrSetImpl &MaskedOp, + SmallPtrSetImpl &ConditionalAssumes, + bool PreserveGuards) const { const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); for (Instruction &I : *BB) { @@ -1026,7 +1029,8 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { // We must be able to predicate all blocks that need to be predicated. if (blockNeedsPredication(BB)) { - if (!blockCanBePredicated(BB, SafePointers)) { + if (!blockCanBePredicated(BB, SafePointers, MaskedOp, + ConditionalAssumes)) { reportVectorizationFailure( "Control flow cannot be substituted for a select", "control flow cannot be substituted for a select", @@ -1253,10 +1257,10 @@ bool LoopVectorizationLegality::prepareToFoldTailByMasking() { Instruction *UI = cast(U); if (TheLoop->contains(UI)) continue; - reportVectorizationFailure( - "Cannot fold tail by masking, loop has an outside user for", - "Cannot fold tail by masking in the presence of live outs.", - "LiveOutFoldingTailByMasking", ORE, TheLoop, UI); + LLVM_DEBUG( + dbgs() + << "LV: Cannot fold tail by masking, loop has an outside user for " + << *UI << "\n"); return false; } } @@ -1264,20 +1268,26 @@ bool LoopVectorizationLegality::prepareToFoldTailByMasking() { // The list of pointers that we can safely read and write to remains empty. 
SmallPtrSet SafePointers; + SmallPtrSet TmpMaskedOp; + SmallPtrSet TmpConditionalAssumes; + // Check and mark all blocks for predication, including those that ordinarily // do not need predication such as the header block. for (BasicBlock *BB : TheLoop->blocks()) { - if (!blockCanBePredicated(BB, SafePointers, /* MaskAllLoads= */ true)) { - reportVectorizationFailure( - "Cannot fold tail by masking as required", - "control flow cannot be substituted for a select", - "NoCFGForSelect", ORE, TheLoop, - BB->getTerminator()); + if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp, + TmpConditionalAssumes, + /* MaskAllLoads= */ true)) { + LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking as requested.\n"); return false; } } LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n"); + + MaskedOp.insert(TmpMaskedOp.begin(), TmpMaskedOp.end()); + ConditionalAssumes.insert(TmpConditionalAssumes.begin(), + TmpConditionalAssumes.end()); + return true; } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index ecf6c8402cd66..8c3dff69e072c 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -172,12 +172,14 @@ class VPBuilder { /// Information about vectorization costs struct VectorizationFactor { // Vector width with best cost - unsigned Width; + ElementCount Width; // Cost of the loop with that width unsigned Cost; // Width 1 means no vectorization, cost 0 means uncomputed cost. - static VectorizationFactor Disabled() { return {1, 0}; } + static VectorizationFactor Disabled() { + return {ElementCount::getFixed(1), 0}; + } bool operator==(const VectorizationFactor &rhs) const { return Width == rhs.Width && Cost == rhs.Cost; @@ -227,7 +229,10 @@ class LoopVectorizationPlanner { /// A builder used to construct the current plan. 
VPBuilder Builder; - unsigned BestVF = 0; + /// The best number of elements of the vector types used in the + /// transformed loop. BestVF = None means that vectorization is + /// disabled. + Optional BestVF = None; unsigned BestUF = 0; public: @@ -242,14 +247,14 @@ class LoopVectorizationPlanner { /// Plan how to best vectorize, return the best VF and its cost, or None if /// vectorization and interleaving should be avoided up front. - Optional plan(unsigned UserVF, unsigned UserIC); + Optional plan(ElementCount UserVF, unsigned UserIC); /// Use the VPlan-native path to plan how to best vectorize, return the best /// VF and its cost. - VectorizationFactor planInVPlanNativePath(unsigned UserVF); + VectorizationFactor planInVPlanNativePath(ElementCount UserVF); /// Finalize the best decision and dispose of all other VPlans. - void setBestPlan(unsigned VF, unsigned UF); + void setBestPlan(ElementCount VF, unsigned UF); /// Generate the IR code for the body of the vectorized loop according to the /// best selected VPlan. @@ -264,7 +269,7 @@ class LoopVectorizationPlanner { /// \p Predicate on Range.Start, possibly decreasing Range.End such that the /// returned value holds for the entire \p Range. static bool - getDecisionAndClampRange(const std::function &Predicate, + getDecisionAndClampRange(const std::function &Predicate, VFRange &Range); protected: diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 86f15500d8389..f999c5af7f475 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -178,13 +178,36 @@ static cl::opt TinyTripCountVectorThreshold( "value are vectorized only if no scalar iteration overheads " "are incurred.")); -// Indicates that an epilogue is undesired, predication is preferred. -// This means that the vectorizer will try to fold the loop-tail (epilogue) -// into the loop and predicate the loop body accordingly. 
-static cl::opt PreferPredicateOverEpilog( - "prefer-predicate-over-epilog", cl::init(false), cl::Hidden, - cl::desc("Indicate that an epilogue is undesired, predication should be " - "used instead.")); +// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, +// that predication is preferred, and this lists all options. I.e., the +// vectorizer will try to fold the tail-loop (epilogue) into the vector body +// and predicate the instructions accordingly. If tail-folding fails, there are +// different fallback strategies depending on these values: +namespace PreferPredicateTy { + enum Option { + ScalarEpilogue = 0, + PredicateElseScalarEpilogue, + PredicateOrDontVectorize + }; +} + +static cl::opt PreferPredicateOverEpilogue( + "prefer-predicate-over-epilogue", + cl::init(PreferPredicateTy::ScalarEpilogue), + cl::Hidden, + cl::desc("Tail-folding and predication preferences over creating a scalar " + "epilogue loop."), + cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, + "scalar-epilogue", + "Don't tail-predicate loops, create scalar epilogue"), + clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, + "predicate-else-scalar-epilogue", + "prefer tail-folding, create scalar epilogue if tail " + "folding fails."), + clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, + "predicate-dont-vectorize", + "prefers tail-folding, don't attempt vectorization if " + "tail-folding fails."))); static cl::opt MaximizeBandwidth( "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, @@ -196,7 +219,7 @@ static cl::opt EnableInterleavedMemAccesses( cl::desc("Enable vectorization on interleaved memory accesses in a loop")); /// An interleave-group may need masking if it resides in a block that needs -/// predication, or in order to mask away gaps. +/// predication, or in order to mask away gaps. 
static cl::opt EnableMaskedInterleavedMemAccesses( "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); @@ -318,11 +341,12 @@ static Type *getMemInstValueType(Value *I) { /// A helper function that returns true if the given type is irregular. The /// type is irregular if its allocated size doesn't equal the store size of an /// element of the corresponding vector type at the given vectorization factor. -static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) { +static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) { + assert(!VF.Scalable && "scalable vectors not yet supported."); // Determine if an array of VF elements of type Ty is "bitcast compatible" // with a vector. - if (VF > 1) { - auto *VectorTy = FixedVectorType::get(Ty, VF); + if (VF.isVector()) { + auto *VectorTy = VectorType::get(Ty, VF); return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy); } @@ -404,7 +428,7 @@ class InnerLoopVectorizer { LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, - OptimizationRemarkEmitter *ORE, unsigned VecWidth, + OptimizationRemarkEmitter *ORE, ElementCount VecWidth, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) @@ -454,13 +478,13 @@ class InnerLoopVectorizer { /// Vectorize a single GetElementPtrInst based on information gathered and /// decisions taken during planning. void widenGEP(GetElementPtrInst *GEP, VPUser &Indices, unsigned UF, - unsigned VF, bool IsPtrLoopInvariant, + ElementCount VF, bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant, VPTransformState &State); /// Vectorize a single PHINode in a block. This method handles the induction /// variable canonicalization. 
It supports both VF = 1 for unrolled loops and /// arbitrary length vectors. - void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF); + void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF); /// A helper function to scalarize a single Instruction in the innermost loop. /// Generates a sequence of scalar instances for each lane between \p MinLane @@ -748,7 +772,7 @@ class InnerLoopVectorizer { /// The vectorization SIMD factor to use. Each vector will have this many /// vector elements. - unsigned VF; + ElementCount VF; /// The vectorization unroll factor to use. Each scalar is vectorized to this /// many different vector instructions. @@ -837,8 +861,9 @@ class InnerLoopUnroller : public InnerLoopVectorizer { LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) - : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1, - UnrollFactor, LVL, CM, BFI, PSI) {} + : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, + ElementCount::getFixed(1), UnrollFactor, LVL, CM, + BFI, PSI) {} private: Value *getBroadcastInstrs(Value *V) override; @@ -874,7 +899,8 @@ void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) const DILocation *DIL = Inst->getDebugLoc(); if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && !isa(Inst)) { - auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF); + assert(!VF.Scalable && "scalable vectors not yet supported."); + auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF.Min); if (NewDIL) B.SetCurrentDebugLocation(NewDIL.getValue()); else @@ -1039,7 +1065,7 @@ class LoopVectorizationCostModel { VectorizationFactor selectVectorizationFactor(unsigned MaxVF); /// Setup cost-based decisions for user vectorization factor. 
- void selectUserVectorizationFactor(unsigned UserVF) { + void selectUserVectorizationFactor(ElementCount UserVF) { collectUniformsAndScalars(UserVF); collectInstsToScalarize(UserVF); } @@ -1053,7 +1079,7 @@ class LoopVectorizationCostModel { /// If interleave count has been specified by metadata it will be returned. /// Otherwise, the interleave count is computed and returned. VF and LoopCost /// are the selected vectorization factor and the cost of the selected VF. - unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost); + unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost); /// Memory access instruction may be vectorized in more than one way. /// Form of instruction after vectorization depends on cost. @@ -1062,7 +1088,7 @@ class LoopVectorizationCostModel { /// the lists of loop-uniform and loop-scalar instructions. /// The calculated cost is saved with widening decision in order to /// avoid redundant calculations. - void setCostBasedWideningDecision(unsigned VF); + void setCostBasedWideningDecision(ElementCount VF); /// A struct that represents some properties of the register usage /// of a loop. @@ -1077,7 +1103,8 @@ class LoopVectorizationCostModel { /// \return Returns information about the register usages of the loop for the /// given vectorization factors. - SmallVector calculateRegisterUsage(ArrayRef VFs); + SmallVector + calculateRegisterUsage(ArrayRef VFs); /// Collect values we want to ignore in the cost model. void collectValuesToIgnore(); @@ -1095,8 +1122,9 @@ class LoopVectorizationCostModel { /// \returns True if it is more profitable to scalarize instruction \p I for /// vectorization factor \p VF. 
- bool isProfitableToScalarize(Instruction *I, unsigned VF) const { - assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1."); + bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { + assert(VF.isVector() && + "Profitable to scalarize relevant only for VF > 1."); // Cost model is not run in the VPlan-native path - return conservative // result until this changes. @@ -1110,8 +1138,8 @@ class LoopVectorizationCostModel { } /// Returns true if \p I is known to be uniform after vectorization. - bool isUniformAfterVectorization(Instruction *I, unsigned VF) const { - if (VF == 1) + bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { + if (VF.isScalar()) return true; // Cost model is not run in the VPlan-native path - return conservative @@ -1126,8 +1154,8 @@ class LoopVectorizationCostModel { } /// Returns true if \p I is known to be scalar after vectorization. - bool isScalarAfterVectorization(Instruction *I, unsigned VF) const { - if (VF == 1) + bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { + if (VF.isScalar()) return true; // Cost model is not run in the VPlan-native path - return conservative @@ -1143,8 +1171,8 @@ class LoopVectorizationCostModel { /// \returns True if instruction \p I can be truncated to a smaller bitwidth /// for vectorization factor \p VF. - bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const { - return VF > 1 && MinBWs.find(I) != MinBWs.end() && + bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { + return VF.isVector() && MinBWs.find(I) != MinBWs.end() && !isProfitableToScalarize(I, VF) && !isScalarAfterVectorization(I, VF); } @@ -1161,17 +1189,17 @@ class LoopVectorizationCostModel { /// Save vectorization decision \p W and \p Cost taken by the cost model for /// instruction \p I and vector width \p VF. 
- void setWideningDecision(Instruction *I, unsigned VF, InstWidening W, + void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, unsigned Cost) { - assert(VF >= 2 && "Expected VF >=2"); + assert(VF.isVector() && "Expected VF >=2"); WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); } /// Save vectorization decision \p W and \p Cost taken by the cost model for /// interleaving group \p Grp and vector width \p VF. - void setWideningDecision(const InterleaveGroup *Grp, unsigned VF, - InstWidening W, unsigned Cost) { - assert(VF >= 2 && "Expected VF >=2"); + void setWideningDecision(const InterleaveGroup *Grp, + ElementCount VF, InstWidening W, unsigned Cost) { + assert(VF.isVector() && "Expected VF >=2"); /// Broadcast this decicion to all instructions inside the group. /// But the cost will be assigned to one instruction only. for (unsigned i = 0; i < Grp->getFactor(); ++i) { @@ -1187,15 +1215,16 @@ class LoopVectorizationCostModel { /// Return the cost model decision for the given instruction \p I and vector /// width \p VF. Return CM_Unknown if this instruction did not pass /// through the cost modeling. - InstWidening getWideningDecision(Instruction *I, unsigned VF) { - assert(VF >= 2 && "Expected VF >=2"); + InstWidening getWideningDecision(Instruction *I, ElementCount VF) { + assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(VF.isVector() && "Expected VF >=2"); // Cost model is not run in the VPlan-native path - return conservative // result until this changes. if (EnableVPlanNativePath) return CM_GatherScatter; - std::pair InstOnVF = std::make_pair(I, VF); + std::pair InstOnVF = std::make_pair(I, VF); auto Itr = WideningDecisions.find(InstOnVF); if (Itr == WideningDecisions.end()) return CM_Unknown; @@ -1204,9 +1233,9 @@ class LoopVectorizationCostModel { /// Return the vectorization cost for the given instruction \p I and vector /// width \p VF. 
- unsigned getWideningCost(Instruction *I, unsigned VF) { - assert(VF >= 2 && "Expected VF >=2"); - std::pair InstOnVF = std::make_pair(I, VF); + unsigned getWideningCost(Instruction *I, ElementCount VF) { + assert(VF.isVector() && "Expected VF >=2"); + std::pair InstOnVF = std::make_pair(I, VF); assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && "The cost is not calculated"); return WideningDecisions[InstOnVF].second; @@ -1215,7 +1244,7 @@ class LoopVectorizationCostModel { /// Return True if instruction \p I is an optimizable truncate whose operand /// is an induction variable. Such a truncate will be removed by adding a new /// induction variable with the destination type. - bool isOptimizableIVTruncate(Instruction *I, unsigned VF) { + bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { // If the instruction is not a truncate, return false. auto *Trunc = dyn_cast(I); if (!Trunc) @@ -1240,14 +1269,14 @@ class LoopVectorizationCostModel { /// Collects the instructions to scalarize for each predicated instruction in /// the loop. - void collectInstsToScalarize(unsigned VF); + void collectInstsToScalarize(ElementCount VF); /// Collect Uniform and Scalar values for the given \p VF. /// The sets depend on CM decision for Load/Store instructions /// that may be vectorized as interleave, gather-scatter or scalarized. - void collectUniformsAndScalars(unsigned VF) { + void collectUniformsAndScalars(ElementCount VF) { // Do the analysis once. - if (VF == 1 || Uniforms.find(VF) != Uniforms.end()) + if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) return; setCostBasedWideningDecision(VF); collectLoopUniforms(VF); @@ -1298,7 +1327,8 @@ class LoopVectorizationCostModel { /// instructions that may divide by zero. /// If a non-zero VF has been calculated, we check if I will be scalarized /// predication for that VF. 
- bool isScalarWithPredication(Instruction *I, unsigned VF = 1); + bool isScalarWithPredication(Instruction *I, + ElementCount VF = ElementCount::getFixed(1)); // Returns true if \p I is an instruction that will be predicated either // through scalar predication or masked load/store or masked gather/scatter. @@ -1315,12 +1345,16 @@ class LoopVectorizationCostModel { /// Returns true if \p I is a memory instruction with consecutive memory /// access that can be widened. - bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1); + bool + memoryInstructionCanBeWidened(Instruction *I, + ElementCount VF = ElementCount::getFixed(1)); /// Returns true if \p I is a memory instruction in an interleaved-group /// of memory accesses that can be vectorized with wide vector loads/stores /// and shuffles. - bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1); + bool + interleavedAccessCanBeWidened(Instruction *I, + ElementCount VF = ElementCount::getFixed(1)); /// Check if \p Instr belongs to any interleaved access group. bool isAccessInterleaved(Instruction *Instr) { @@ -1372,14 +1406,15 @@ class LoopVectorizationCostModel { /// Estimate cost of an intrinsic call instruction CI if it were vectorized /// with factor VF. Return the cost of the instruction, including /// scalarization overhead if it's needed. - unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF); + unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF); /// Estimate cost of a call instruction CI if it were vectorized with factor /// VF. Return the cost of the instruction, including scalarization overhead /// if it's needed. The flag NeedToScalarize shows if the call needs to be /// scalarized - /// i.e. either vector version isn't available, or is too expensive. 
- unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize); + unsigned getVectorCallCost(CallInst *CI, ElementCount VF, + bool &NeedToScalarize); /// Invalidates decisions already taken by the cost model. void invalidateCostModelingDecisions() { @@ -1409,41 +1444,41 @@ class LoopVectorizationCostModel { /// not matter because we use the 'cost' units to compare different /// vector widths. The cost that is returned is *not* normalized by /// the factor width. - VectorizationCostTy expectedCost(unsigned VF); + VectorizationCostTy expectedCost(ElementCount VF); /// Returns the execution time cost of an instruction for a given vector /// width. Vector width of one means scalar. - VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF); + VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); /// The cost-computation logic from getInstructionCost which provides /// the vector type as an output parameter. - unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy); + unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy); /// Calculate vectorization cost of memory instruction \p I. - unsigned getMemoryInstructionCost(Instruction *I, unsigned VF); + unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF); /// The cost computation for scalarized memory instruction. - unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF); + unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF); /// The cost computation for interleaving group of memory instructions. - unsigned getInterleaveGroupCost(Instruction *I, unsigned VF); + unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF); /// The cost computation for Gather/Scatter instruction. - unsigned getGatherScatterCost(Instruction *I, unsigned VF); + unsigned getGatherScatterCost(Instruction *I, ElementCount VF); /// The cost computation for widening instruction \p I with consecutive /// memory access. 
- unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF); + unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF); /// The cost calculation for Load/Store instruction \p I with uniform pointer - /// Load: scalar load + broadcast. /// Store: scalar store + (loop invariant value stored? 0 : extract of last /// element) - unsigned getUniformMemOpCost(Instruction *I, unsigned VF); + unsigned getUniformMemOpCost(Instruction *I, ElementCount VF); /// Estimate the overhead of scalarizing an instruction. This is a /// convenience wrapper for the type-based getScalarizationOverhead API. - unsigned getScalarizationOverhead(Instruction *I, unsigned VF); + unsigned getScalarizationOverhead(Instruction *I, ElementCount VF); /// Returns whether the instruction is a load or store and will be a emitted /// as a vector operation. @@ -1483,19 +1518,19 @@ class LoopVectorizationCostModel { /// presence of a cost for an instruction in the mapping indicates that the /// instruction will be scalarized when vectorizing with the associated /// vectorization factor. The entries are VF-ScalarCostTy pairs. - DenseMap InstsToScalarize; + DenseMap InstsToScalarize; /// Holds the instructions known to be uniform after vectorization. /// The data is collected per VF. - DenseMap> Uniforms; + DenseMap> Uniforms; /// Holds the instructions known to be scalar after vectorization. /// The data is collected per VF. - DenseMap> Scalars; + DenseMap> Scalars; /// Holds the instructions (address computations) that are forced to be /// scalarized. - DenseMap> ForcedScalars; + DenseMap> ForcedScalars; /// PHINodes of the reductions that should be expanded in-loop along with /// their associated chains of reduction operations, in program order from top @@ -1508,7 +1543,7 @@ class LoopVectorizationCostModel { /// non-negative return value implies the expression will be scalarized. /// Currently, only single-use chains are considered for scalarization. 
int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, - unsigned VF); + ElementCount VF); /// Collect the instructions that are uniform after vectorization. An /// instruction is uniform if we represent it with a single scalar value in @@ -1519,27 +1554,28 @@ class LoopVectorizationCostModel { /// scalarized instruction will be represented by VF scalar values in the /// vectorized loop, each corresponding to an iteration of the original /// scalar loop. - void collectLoopUniforms(unsigned VF); + void collectLoopUniforms(ElementCount VF); /// Collect the instructions that are scalar after vectorization. An /// instruction is scalar if it is known to be uniform or will be scalarized /// during vectorization. Non-uniform scalarized instructions will be /// represented by VF values in the vectorized loop, each corresponding to an /// iteration of the original scalar loop. - void collectLoopScalars(unsigned VF); + void collectLoopScalars(ElementCount VF); /// Keeps cost model vectorization decision and cost for instructions. /// Right now it is used for memory instructions only. - using DecisionList = DenseMap, + using DecisionList = DenseMap, std::pair>; DecisionList WideningDecisions; /// Returns true if \p V is expected to be vectorized and it needs to be /// extracted. - bool needsExtract(Value *V, unsigned VF) const { + bool needsExtract(Value *V, ElementCount VF) const { Instruction *I = dyn_cast(V); - if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I)) + if (VF.isScalar() || !I || !TheLoop->contains(I) || + TheLoop->isLoopInvariant(I)) return false; // Assume we can vectorize V (and hence we need extraction) if the @@ -1554,7 +1590,7 @@ class LoopVectorizationCostModel { /// Returns a range containing only operands needing to be extracted. 
SmallVector filterExtractingOperands(Instruction::op_range Ops, - unsigned VF) { + ElementCount VF) { return SmallVector(make_filter_range( Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); } @@ -1801,7 +1837,7 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( // Multiply the vectorization factor by the step using integer or // floating-point arithmetic as appropriate. - Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF); + Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF.Min); Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); // Create a vector splat to use in the induction update. @@ -1809,9 +1845,9 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( // FIXME: If the step is non-constant, we create the vector splat with // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't // handle a constant vector splat. + assert(!VF.Scalable && "scalable vectors not yet supported."); Value *SplatVF = isa(Mul) - ? ConstantVector::getSplat(ElementCount::getFixed(VF), - cast(Mul)) + ? ConstantVector::getSplat(VF, cast(Mul)) : Builder.CreateVectorSplat(VF, Mul); Builder.restoreIP(CurrIP); @@ -1946,8 +1982,9 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { Value *Broadcasted = getBroadcastInstrs(ScalarIV); for (unsigned Part = 0; Part < UF; ++Part) { - Value *EntryPart = - getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); + assert(!VF.Scalable && "scalable vectors not yet supported."); + Value *EntryPart = getStepVector(Broadcasted, VF.Min * Part, Step, + ID.getInductionOpcode()); VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); if (Trunc) addMetadata(EntryPart, Trunc); @@ -1957,7 +1994,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { // Now do the actual transformations, and start with creating the step value. 
Value *Step = CreateStepValue(ID.getStep()); - if (VF <= 1) { + if (VF.isZero() || VF.isScalar()) { Value *ScalarIV = CreateScalarIV(Step); CreateSplatIV(ScalarIV, Step); return; @@ -1998,7 +2035,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, Instruction::BinaryOps BinOp) { // Create and check the types. - auto *ValVTy = cast(Val->getType()); + auto *ValVTy = cast(Val->getType()); int VLen = ValVTy->getNumElements(); Type *STy = Val->getType()->getScalarType(); @@ -2055,8 +2092,9 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, const InductionDescriptor &ID) { // We shouldn't have to build scalar steps if we aren't vectorizing. - assert(VF > 1 && "VF should be greater than one"); - + assert(VF.isVector() && "VF should be greater than one"); + assert(!VF.Scalable && + "the code below assumes a fixed number of elements at compile time"); // Get the value type and ensure it and the step have the same integer type. Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); assert(ScalarIVTy == Step->getType() && @@ -2078,12 +2116,14 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, // iteration. If EntryVal is uniform, we only need to generate the first // lane. Otherwise, we generate all VF values. unsigned Lanes = - Cost->isUniformAfterVectorization(cast(EntryVal), VF) ? 1 - : VF; + Cost->isUniformAfterVectorization(cast(EntryVal), VF) + ? 1 + : VF.Min; // Compute the scalar steps and save the results in VectorLoopValueMap. 
for (unsigned Part = 0; Part < UF; ++Part) { for (unsigned Lane = 0; Lane < Lanes; ++Lane) { - auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); + auto *StartIdx = + getSignedIntOrFpConstant(ScalarIVTy, VF.Min * Part + Lane); auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); @@ -2126,7 +2166,9 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { // is known to be uniform after vectorization, this corresponds to lane zero // of the Part unroll iteration. Otherwise, the last instruction is the one // we created for the last vector lane of the Part unroll iteration. - unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; + assert(!VF.Scalable && "scalable vectors not yet supported."); + unsigned LastLane = + Cost->isUniformAfterVectorization(I, VF) ? 0 : VF.Min - 1; auto *LastInst = cast( VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); @@ -2148,9 +2190,10 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { VectorLoopValueMap.setVectorValue(V, Part, VectorValue); } else { // Initialize packing with insertelements to start from undef. 
- Value *Undef = UndefValue::get(FixedVectorType::get(V->getType(), VF)); + assert(!VF.Scalable && "VF is assumed to be non scalable."); + Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); VectorLoopValueMap.setVectorValue(V, Part, Undef); - for (unsigned Lane = 0; Lane < VF; ++Lane) + for (unsigned Lane = 0; Lane < VF.Min; ++Lane) packScalarIntoVectorValue(V, {Part, Lane}); VectorValue = VectorLoopValueMap.getVectorValue(V, Part); } @@ -2214,9 +2257,10 @@ void InnerLoopVectorizer::packScalarIntoVectorValue( Value *InnerLoopVectorizer::reverseVector(Value *Vec) { assert(Vec->getType()->isVectorTy() && "Invalid type"); + assert(!VF.Scalable && "Cannot reverse scalable vectors"); SmallVector ShuffleMask; - for (unsigned i = 0; i < VF; ++i) - ShuffleMask.push_back(VF - i - 1); + for (unsigned i = 0; i < VF.Min; ++i) + ShuffleMask.push_back(VF.Min - i - 1); return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), ShuffleMask, "reverse"); @@ -2270,7 +2314,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( // Prepare for the vector type of the interleaved load/store. Type *ScalarTy = getMemInstValueType(Instr); unsigned InterleaveFactor = Group->getFactor(); - auto *VecTy = FixedVectorType::get(ScalarTy, InterleaveFactor * VF); + assert(!VF.Scalable && "scalable vectors not yet supported."); + auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); // Prepare for the new pointers. SmallVector AddrParts; @@ -2286,8 +2331,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( // pointer operand of the interleaved access is supposed to be uniform. For // uniform instructions, we're only required to generate a value for the // first vector lane in each unroll iteration. 
+ assert(!VF.Scalable && + "scalable vector reverse operation is not implemented"); if (Group->isReverse()) - Index += (VF - 1) * Group->getFactor(); + Index += (VF.Min - 1) * Group->getFactor(); for (unsigned Part = 0; Part < UF; Part++) { Value *AddrPart = State.get(Addr, {Part, 0}); @@ -2322,7 +2369,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( Value *MaskForGaps = nullptr; if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { - MaskForGaps = createBitMaskForGaps(Builder, VF, *Group); + assert(!VF.Scalable && "scalable vectors not yet supported."); + MaskForGaps = createBitMaskForGaps(Builder, VF.Min, *Group); assert(MaskForGaps && "Mask for Gaps is required but it is null"); } @@ -2339,9 +2387,11 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( if (BlockInMask) { Value *BlockInMaskPart = State.get(BlockInMask, Part); auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); + assert(!VF.Scalable && "scalable vectors not yet supported."); Value *ShuffledMask = Builder.CreateShuffleVector( BlockInMaskPart, Undefs, - createReplicatedMask(InterleaveFactor, VF), "interleaved.mask"); + createReplicatedMask(InterleaveFactor, VF.Min), + "interleaved.mask"); GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask, MaskForGaps) @@ -2367,14 +2417,16 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( if (!Member) continue; - auto StrideMask = createStrideMask(I, InterleaveFactor, VF); + assert(!VF.Scalable && "scalable vectors not yet supported."); + auto StrideMask = createStrideMask(I, InterleaveFactor, VF.Min); for (unsigned Part = 0; Part < UF; Part++) { Value *StridedVec = Builder.CreateShuffleVector( NewLoads[Part], UndefVec, StrideMask, "strided.vec"); // If this member has different type, cast the result type. 
if (Member->getType() != ScalarTy) { - VectorType *OtherVTy = FixedVectorType::get(Member->getType(), VF); + assert(!VF.Scalable && "VF is assumed to be non scalable."); + VectorType *OtherVTy = VectorType::get(Member->getType(), VF); StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); } @@ -2388,7 +2440,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( } // The sub vector type for current instruction. - auto *SubVT = FixedVectorType::get(ScalarTy, VF); + assert(!VF.Scalable && "VF is assumed to be non scalable."); + auto *SubVT = VectorType::get(ScalarTy, VF); // Vectorize the interleaved store group. for (unsigned Part = 0; Part < UF; Part++) { @@ -2416,8 +2469,9 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( Value *WideVec = concatenateVectors(Builder, StoredVecs); // Interleave the elements in the wide vector. + assert(!VF.Scalable && "scalable vectors not yet supported."); Value *IVec = Builder.CreateShuffleVector( - WideVec, UndefVec, createInterleaveMask(VF, InterleaveFactor), + WideVec, UndefVec, createInterleaveMask(VF.Min, InterleaveFactor), "interleaved.vec"); Instruction *NewStoreInstr; @@ -2425,8 +2479,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( Value *BlockInMaskPart = State.get(BlockInMask, Part); auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); Value *ShuffledMask = Builder.CreateShuffleVector( - BlockInMaskPart, Undefs, createReplicatedMask(InterleaveFactor, VF), - "interleaved.mask"); + BlockInMaskPart, Undefs, + createReplicatedMask(InterleaveFactor, VF.Min), "interleaved.mask"); NewStoreInstr = Builder.CreateMaskedStore( IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); } @@ -2459,7 +2513,9 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, "CM decision is not to widen the memory instruction"); Type *ScalarDataTy = getMemInstValueType(Instr); - auto *DataTy = FixedVectorType::get(ScalarDataTy, VF); + + assert(!VF.Scalable && "scalable vectors not yet supported."); 
+ auto *DataTy = VectorType::get(ScalarDataTy, VF); const Align Alignment = getLoadStoreAlignment(Instr); // Determine if the pointer operand of the access is either consecutive or @@ -2493,17 +2549,17 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, if (Reverse) { // If the address is consecutive but reversed, then the // wide store needs to start at the last vector element. - PartPtr = cast( - Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF))); + PartPtr = cast(Builder.CreateGEP( + ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.Min))); PartPtr->setIsInBounds(InBounds); - PartPtr = cast( - Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF))); + PartPtr = cast(Builder.CreateGEP( + ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.Min))); PartPtr->setIsInBounds(InBounds); if (isMaskRequired) // Reverse of a null all-one mask is a null mask. BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); } else { - PartPtr = cast( - Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF))); + PartPtr = cast(Builder.CreateGEP( + ScalarDataTy, Ptr, Builder.getInt32(Part * VF.Min))); PartPtr->setIsInBounds(InBounds); } @@ -2699,7 +2755,9 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); Type *Ty = TC->getType(); - Constant *Step = ConstantInt::get(Ty, VF * UF); + // This is where we can make the step a runtime constant. + assert(!VF.Scalable && "scalable vectorization is not supported yet"); + Constant *Step = ConstantInt::get(Ty, VF.Min * UF); // If the tail is to be folded by masking, round the number of iterations N // up to a multiple of Step instead of rounding down. 
This is done by first @@ -2708,9 +2766,10 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { // that it starts at zero and its Step is a power of two; the loop will then // exit, with the last early-exit vector comparison also producing all-true. if (Cost->foldTailByMasking()) { - assert(isPowerOf2_32(VF * UF) && + assert(isPowerOf2_32(VF.Min * UF) && "VF*UF must be a power of 2 when folding tail by masking"); - TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up"); + TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF.Min * UF - 1), + "n.rnd.up"); } // Now we need to generate the expression for the part of the loop that the @@ -2727,7 +2786,7 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { // does not evenly divide the trip count, no adjustment is necessary since // there will already be scalar iterations. Note that the minimum iterations // check ensures that N >= Step. - if (VF > 1 && Cost->requiresScalarEpilogue()) { + if (VF.isVector() && Cost->requiresScalarEpilogue()) { auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); R = Builder.CreateSelect(IsZero, Step, R); } @@ -2740,17 +2799,18 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, const DataLayout &DL) { // Verify that V is a vector type with same number of elements as DstVTy. 
- unsigned VF = DstVTy->getNumElements(); - VectorType *SrcVecTy = cast(V->getType()); + auto *DstFVTy = cast(DstVTy); + unsigned VF = DstFVTy->getNumElements(); + auto *SrcVecTy = cast(V->getType()); assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); Type *SrcElemTy = SrcVecTy->getElementType(); - Type *DstElemTy = DstVTy->getElementType(); + Type *DstElemTy = DstFVTy->getElementType(); assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && "Vector elements must have same size"); // Do a direct cast if element types are castable. if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { - return Builder.CreateBitOrPointerCast(V, DstVTy); + return Builder.CreateBitOrPointerCast(V, DstFVTy); } // V cannot be directly casted to desired vector type. // May happen when V is a floating point vector but DstVTy is a vector of @@ -2764,7 +2824,7 @@ Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); auto *VecIntTy = FixedVectorType::get(IntTy, VF); Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); - return Builder.CreateBitOrPointerCast(CastVal, DstVTy); + return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); } void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, @@ -2785,11 +2845,12 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, // If tail is to be folded, vector loop takes care of all iterations. Value *CheckMinIters = Builder.getFalse(); - if (!Cost->foldTailByMasking()) + if (!Cost->foldTailByMasking()) { + assert(!VF.Scalable && "scalable vectors not yet supported."); CheckMinIters = Builder.CreateICmp( - P, Count, ConstantInt::get(Count->getType(), VF * UF), + P, Count, ConstantInt::get(Count->getType(), VF.Min * UF), "min.iters.check"); - + } // Create new preheader for vector loop. 
LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, @@ -3242,7 +3303,8 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { Value *StartIdx = ConstantInt::get(IdxTy, 0); // The loop step is equal to the vectorization factor (num of SIMD elements) // times the unroll factor (num of SIMD instructions). - Constant *Step = ConstantInt::get(IdxTy, VF * UF); + assert(!VF.Scalable && "scalable vectors not yet supported."); + Constant *Step = ConstantInt::get(IdxTy, VF.Min * UF); Value *CountRoundDown = getOrCreateVectorTripCount(Lp); Induction = createInductionVariable(Lp, StartIdx, CountRoundDown, Step, @@ -3374,8 +3436,9 @@ static void cse(BasicBlock *BB) { } unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, - unsigned VF, + ElementCount VF, bool &NeedToScalarize) { + assert(!VF.Scalable && "scalable vectors not yet supported."); Function *F = CI->getCalledFunction(); Type *ScalarRetTy = CI->getType(); SmallVector Tys, ScalarTys; @@ -3388,7 +3451,7 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, // value. unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); - if (VF == 1) + if (VF.isScalar()) return ScalarCallCost; // Compute corresponding vector type for return value and arguments. @@ -3400,13 +3463,12 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, // packing the return values to a vector. unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); - unsigned Cost = ScalarCallCost * VF + ScalarizationCost; + unsigned Cost = ScalarCallCost * VF.Min + ScalarizationCost; // If we can't emit a vector call for this function, then the currently found // cost is the cost we need to return. 
NeedToScalarize = true; - VFShape Shape = - VFShape::get(*CI, ElementCount::getFixed(VF), false /*HasGlobalPred*/); + VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); if (!TLI || CI->isNoBuiltin() || !VecFunc) @@ -3423,7 +3485,7 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, } unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, - unsigned VF) { + ElementCount VF) { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); assert(ID && "Expected intrinsic call!"); @@ -3463,7 +3525,8 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { Type *ScalarTruncatedTy = IntegerType::get(OriginalTy->getContext(), KV.second); auto *TruncatedTy = FixedVectorType::get( - ScalarTruncatedTy, cast(OriginalTy)->getNumElements()); + ScalarTruncatedTy, + cast(OriginalTy)->getNumElements()); if (TruncatedTy == OriginalTy) continue; @@ -3513,13 +3576,13 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { break; } } else if (auto *SI = dyn_cast(I)) { - auto Elements0 = - cast(SI->getOperand(0)->getType())->getNumElements(); + auto Elements0 = cast(SI->getOperand(0)->getType()) + ->getNumElements(); auto *O0 = B.CreateZExtOrTrunc( SI->getOperand(0), FixedVectorType::get(ScalarTruncatedTy, Elements0)); - auto Elements1 = - cast(SI->getOperand(1)->getType())->getNumElements(); + auto Elements1 = cast(SI->getOperand(1)->getType()) + ->getNumElements(); auto *O1 = B.CreateZExtOrTrunc( SI->getOperand(1), FixedVectorType::get(ScalarTruncatedTy, Elements1)); @@ -3529,16 +3592,16 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { // Don't do anything with the operands, just extend the result. 
continue; } else if (auto *IE = dyn_cast(I)) { - auto Elements = - cast(IE->getOperand(0)->getType())->getNumElements(); + auto Elements = cast(IE->getOperand(0)->getType()) + ->getNumElements(); auto *O0 = B.CreateZExtOrTrunc( IE->getOperand(0), FixedVectorType::get(ScalarTruncatedTy, Elements)); auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); } else if (auto *EE = dyn_cast(I)) { - auto Elements = - cast(EE->getOperand(0)->getType())->getNumElements(); + auto Elements = cast(EE->getOperand(0)->getType()) + ->getNumElements(); auto *O0 = B.CreateZExtOrTrunc( EE->getOperand(0), FixedVectorType::get(ScalarTruncatedTy, Elements)); @@ -3580,7 +3643,7 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { void InnerLoopVectorizer::fixVectorizedLoop() { // Insert truncates and extends for any truncated instructions as hints to // InstCombine. - if (VF > 1) + if (VF.isVector()) truncateToMinimalBitwidths(); // Fix widened non-induction PHIs by setting up the PHI operands. @@ -3621,9 +3684,11 @@ void InnerLoopVectorizer::fixVectorizedLoop() { // profile is not inherently precise anyway. Note also possible bypass of // vector code caused by legality checks is ignored, assigning all the weight // to the vector loop, optimistically. + assert(!VF.Scalable && + "cannot use scalable ElementCount to determine unroll factor"); setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), - LI->getLoopFor(LoopScalarBody), VF * UF); + LI->getLoopFor(LoopScalarBody), VF.Min * UF); } void InnerLoopVectorizer::fixCrossIterationPHIs() { @@ -3702,11 +3767,12 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // Create a vector from the initial value. 
auto *VectorInit = ScalarInit; - if (VF > 1) { + if (VF.isVector()) { Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); + assert(!VF.Scalable && "VF is assumed to be non scalable."); VectorInit = Builder.CreateInsertElement( - UndefValue::get(FixedVectorType::get(VectorInit->getType(), VF)), - VectorInit, Builder.getInt32(VF - 1), "vector.recur.init"); + UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, + Builder.getInt32(VF.Min - 1), "vector.recur.init"); } // We constructed a temporary phi node in the first phase of vectorization. @@ -3747,10 +3813,11 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // We will construct a vector for the recurrence by combining the values for // the current and previous iterations. This is the required shuffle mask. - SmallVector ShuffleMask(VF); - ShuffleMask[0] = VF - 1; - for (unsigned I = 1; I < VF; ++I) - ShuffleMask[I] = I + VF - 1; + assert(!VF.Scalable); + SmallVector ShuffleMask(VF.Min); + ShuffleMask[0] = VF.Min - 1; + for (unsigned I = 1; I < VF.Min; ++I) + ShuffleMask[I] = I + VF.Min - 1; // The vector from which to take the initial value for the current iteration // (actual or unrolled). Initially, this is the vector phi node. @@ -3760,9 +3827,10 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { for (unsigned Part = 0; Part < UF; ++Part) { Value *PreviousPart = getOrCreateVectorValue(Previous, Part); Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); - auto *Shuffle = VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart, - ShuffleMask) - : Incoming; + auto *Shuffle = + VF.isVector() + ? 
Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) + : Incoming; PhiPart->replaceAllUsesWith(Shuffle); cast(PhiPart)->eraseFromParent(); VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); @@ -3775,10 +3843,10 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // Extract the last vector element in the middle block. This will be the // initial value for the recurrence when jumping to the scalar loop. auto *ExtractForScalar = Incoming; - if (VF > 1) { + if (VF.isVector()) { Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); ExtractForScalar = Builder.CreateExtractElement( - ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract"); + ExtractForScalar, Builder.getInt32(VF.Min - 1), "vector.recur.extract"); } // Extract the second last element in the middle block if the // Phi is used outside the loop. We need to extract the phi itself @@ -3786,9 +3854,9 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // will be the value when jumping to the exit block from the LoopMiddleBlock, // when the scalar loop is not run at all. Value *ExtractForPhiUsedOutsideLoop = nullptr; - if (VF > 1) + if (VF.isVector()) ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( - Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi"); + Incoming, Builder.getInt32(VF.Min - 2), "vector.recur.extract.for.phi"); // When loop is unrolled without vectorizing, initialize // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of // `Incoming`. This is analogous to the vectorized case above: extracting the @@ -3867,7 +3935,7 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // incoming scalar reduction. VectorStart = ReductionStartValue; } else { - Identity = ConstantVector::getSplat(ElementCount::getFixed(VF), Iden); + Identity = ConstantVector::getSplat(VF, Iden); // This vector is the Identity vector where the first element is the // incoming scalar reduction. 
@@ -3943,9 +4011,10 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // If the vector reduction can be performed in a smaller type, we truncate // then extend the loop exit value to enable InstCombine to evaluate the // entire expression in the smaller type. - if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) { + if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); - Type *RdxVecTy = FixedVectorType::get(RdxDesc.getRecurrenceType(), VF); + assert(!VF.Scalable && "scalable vectors not yet supported."); + Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); Builder.SetInsertPoint( LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); VectorParts RdxParts(UF); @@ -3997,7 +4066,7 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // Create the reduction after the loop. Note that inloop reductions create the // target reduction in the loop using a Reduction recipe. - if (VF > 1 && !IsInLoopReductionPhi) { + if (VF.isVector() && !IsInLoopReductionPhi) { bool NoNaN = Legal->hasFunNoNaNAttr(); ReducedPartRdx = createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); @@ -4076,16 +4145,17 @@ void InnerLoopVectorizer::clearReductionWrapFlags( } void InnerLoopVectorizer::fixLCSSAPHIs() { + assert(!VF.Scalable && "the code below assumes fixed width vectors"); for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { if (LCSSAPhi.getNumIncomingValues() == 1) { auto *IncomingValue = LCSSAPhi.getIncomingValue(0); // Non-instruction incoming values will have only one value. unsigned LastLane = 0; - if (isa(IncomingValue)) - LastLane = Cost->isUniformAfterVectorization( - cast(IncomingValue), VF) - ? 0 - : VF - 1; + if (isa(IncomingValue)) + LastLane = Cost->isUniformAfterVectorization( + cast(IncomingValue), VF) + ? 
0 + : VF.Min - 1; // Can be a loop invariant incoming value or the last scalar value to be // extracted from the vectorized loop. Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); @@ -4197,7 +4267,7 @@ void InnerLoopVectorizer::fixNonInductionPHIs() { } void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands, - unsigned UF, unsigned VF, + unsigned UF, ElementCount VF, bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant, VPTransformState &State) { @@ -4207,7 +4277,7 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands, // is vector-typed. Thus, to keep the representation compact, we only use // vector-typed operands for loop-varying values. - if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { + if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { // If we are vectorizing, but the GEP has only loop-invariant operands, // the GEP we build (by only using vector-typed operands for // loop-varying values) would be a scalar pointer. Thus, to ensure we @@ -4267,7 +4337,8 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands, } void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, - unsigned VF) { + ElementCount VF) { + assert(!VF.Scalable && "scalable vectors not yet supported."); PHINode *P = cast(PN); if (EnableVPlanNativePath) { // Currently we enter here in the VPlan-native path for non-induction @@ -4275,7 +4346,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, // Create a vector phi with no operands - the vector phi operands will be // set at the end of vector code generation. Type *VecTy = - (VF == 1) ? PN->getType() : FixedVectorType::get(PN->getType(), VF); + (VF.isScalar()) ? 
PN->getType() : VectorType::get(PN->getType(), VF); Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); VectorLoopValueMap.setVectorValue(P, 0, VecPhi); OrigPHIsToFix.push_back(P); @@ -4293,9 +4364,10 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { for (unsigned Part = 0; Part < UF; ++Part) { // This is phase one of vectorizing PHIs. - bool ScalarPHI = (VF == 1) || Cost->isInLoopReduction(cast(PN)); + bool ScalarPHI = + (VF.isScalar()) || Cost->isInLoopReduction(cast(PN)); Type *VecTy = - ScalarPHI ? PN->getType() : FixedVectorType::get(PN->getType(), VF); + ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF); Value *EntryPart = PHINode::Create( VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); VectorLoopValueMap.setVectorValue(P, Part, EntryPart); @@ -4331,10 +4403,11 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, // Determine the number of scalars we need to generate for each unroll // iteration. If the instruction is uniform, we only need to generate the // first lane. Otherwise, we generate all VF values. - unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF; + unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 
1 : VF.Min; for (unsigned Part = 0; Part < UF; ++Part) { for (unsigned Lane = 0; Lane < Lanes; ++Lane) { - Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF); + Constant *Idx = + ConstantInt::get(PtrInd->getType(), Lane + Part * VF.Min); Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); @@ -4364,7 +4437,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); Value *InductionGEP = GetElementPtrInst::Create( ScStValueType->getPointerElementType(), NewPointerPhi, - Builder.CreateMul(ScalarStepValue, ConstantInt::get(PhiType, VF * UF)), + Builder.CreateMul(ScalarStepValue, + ConstantInt::get(PhiType, VF.Min * UF)), "ptr.ind", InductionLoc); NewPointerPhi->addIncoming(InductionGEP, LoopLatch); @@ -4374,14 +4448,14 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, for (unsigned Part = 0; Part < UF; ++Part) { SmallVector Indices; // Create a vector of consecutive numbers from zero to VF. 
- for (unsigned i = 0; i < VF; ++i) - Indices.push_back(ConstantInt::get(PhiType, i + Part * VF)); + for (unsigned i = 0; i < VF.Min; ++i) + Indices.push_back(ConstantInt::get(PhiType, i + Part * VF.Min)); Constant *StartOffset = ConstantVector::get(Indices); Value *GEP = Builder.CreateGEP( ScStValueType->getPointerElementType(), NewPointerPhi, Builder.CreateMul(StartOffset, - Builder.CreateVectorSplat(VF, ScalarStepValue), + Builder.CreateVectorSplat(VF.Min, ScalarStepValue), "vector.gep")); VectorLoopValueMap.setVectorValue(P, Part, GEP); } @@ -4409,6 +4483,7 @@ static bool mayDivideByZero(Instruction &I) { void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, VPTransformState &State) { + assert(!VF.Scalable && "scalable vectors not yet supported."); switch (I.getOpcode()) { case Instruction::Call: case Instruction::Br: @@ -4496,8 +4571,9 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, setDebugLocFromInst(Builder, CI); /// Vectorize casts. + assert(!VF.Scalable && "VF is assumed to be non scalable."); Type *DestTy = - (VF == 1) ? CI->getType() : FixedVectorType::get(CI->getType(), VF); + (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); for (unsigned Part = 0; Part < UF; ++Part) { Value *A = State.get(User.getOperand(0), Part); @@ -4525,7 +4601,7 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, SmallVector Tys; for (Value *ArgOperand : CI->arg_operands()) - Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); + Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.Min)); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); @@ -4556,15 +4632,15 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, if (UseVectorIntrinsic) { // Use vector version of the intrinsic. 
Type *TysForDecl[] = {CI->getType()}; - if (VF > 1) - TysForDecl[0] = - FixedVectorType::get(CI->getType()->getScalarType(), VF); + if (VF.isVector()) { + assert(!VF.Scalable && "VF is assumed to be non scalable."); + TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); + } VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); assert(VectorF && "Can't retrieve vector intrinsic."); } else { // Use vector version of the function call. - const VFShape Shape = VFShape::get(*CI, ElementCount::getFixed(VF), - false /*HasGlobalPred*/); + const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); #ifndef NDEBUG assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && "Can't create vector function."); @@ -4607,11 +4683,11 @@ void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, } } -void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { +void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { // We should not collect Scalars more than once per VF. Right now, this // function is called from collectUniformsAndScalars(), which already does // this check. Collecting Scalars for VF=1 does not make any sense. 
- assert(VF >= 2 && Scalars.find(VF) == Scalars.end() && + assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && "This function should not be visited twice for the same VF"); SmallSetVector Worklist; @@ -4794,7 +4870,9 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { Scalars[VF].insert(Worklist.begin(), Worklist.end()); } -bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) { +bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, + ElementCount VF) { + assert(!VF.Scalable && "scalable vectors not yet supported."); if (!blockNeedsPredication(I->getParent())) return false; switch(I->getOpcode()) { @@ -4808,7 +4886,7 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne auto *Ty = getMemInstValueType(I); // We have already decided how to vectorize this instruction, get that // result. - if (VF > 1) { + if (VF.isVector()) { InstWidening WideningDecision = getWideningDecision(I, VF); assert(WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"); @@ -4829,8 +4907,8 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne return false; } -bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, - unsigned VF) { +bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( + Instruction *I, ElementCount VF) { assert(isAccessInterleaved(I) && "Expecting interleaved access."); assert(getWideningDecision(I, VF) == CM_Unknown && "Decision should not be set yet."); @@ -4866,8 +4944,8 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, : TTI.isLegalMaskedStore(Ty, Alignment); } -bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, - unsigned VF) { +bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( + Instruction *I, ElementCount VF) { // Get and ensure we have a valid memory instruction. 
LoadInst *LI = dyn_cast(I); StoreInst *SI = dyn_cast(I); @@ -4894,13 +4972,13 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, return true; } -void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { +void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { // We should not collect Uniforms more than once per VF. Right now, // this function is called from collectUniformsAndScalars(), which // already does this check. Collecting Uniforms for VF=1 does not make any // sense. - assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() && + assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && "This function should not be visited twice for the same VF"); // Visit the list of Uniforms. If we'll not find any uniform value, we'll @@ -4951,7 +5029,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { // Holds pointer operands of instructions that are possibly non-uniform. SmallPtrSet PossibleNonUniformPtrs; - auto isUniformDecision = [&](Instruction *I, unsigned VF) { + auto isUniformDecision = [&](Instruction *I, ElementCount VF) { InstWidening WideningDecision = getWideningDecision(I, VF); assert(WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"); @@ -5186,6 +5264,19 @@ Optional LoopVectorizationCostModel::computeMaxVF(unsigned UserVF, return MaxVF; } + // If there was a tail-folding hint/switch, but we can't fold the tail by + // masking, fallback to a vectorization with a scalar epilogue. 
+ if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { + if (PreferPredicateOverEpilogue == PreferPredicateTy::PredicateOrDontVectorize) { + LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); + return None; + } + LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " + "scalar epilogue instead.\n"); + ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; + return MaxVF; + } + if (TC == 0) { reportVectorizationFailure( "Unable to calculate the loop count due to complex control flow", @@ -5248,10 +5339,10 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { (MaximizeBandwidth && isScalarEpilogueAllowed())) { // Collect all viable vectorization factors larger than the default MaxVF // (i.e. MaxVectorSize). - SmallVector VFs; + SmallVector VFs; unsigned NewMaxVectorSize = WidestRegister / SmallestType; for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) - VFs.push_back(VS); + VFs.push_back(ElementCount::getFixed(VS)); // For each VF calculate its register usage. auto RUs = calculateRegisterUsage(VFs); @@ -5266,7 +5357,7 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { Selected = false; } if (Selected) { - MaxVF = VFs[i]; + MaxVF = VFs[i].Min; break; } } @@ -5283,7 +5374,7 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { - float Cost = expectedCost(1).first; + float Cost = expectedCost(ElementCount::getFixed(1)).first; const float ScalarCost = Cost; unsigned Width = 1; LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); @@ -5300,7 +5391,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { // Notice that the vector loop needs to be executed less times, so // we need to divide the cost of the vector loops by the width of // the vector elements. 
- VectorizationCostTy C = expectedCost(i); + VectorizationCostTy C = expectedCost(ElementCount::getFixed(i)); float VectorCost = C.first / (float)i; LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " << (int)VectorCost << ".\n"); @@ -5328,7 +5419,8 @@ LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { << "LV: Vectorization seems to be not beneficial, " << "but was forced by a user.\n"); LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); - VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)}; + VectorizationFactor Factor = {ElementCount::getFixed(Width), + (unsigned)(Width * Cost)}; return Factor; } @@ -5388,7 +5480,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() { return {MinWidth, MaxWidth}; } -unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, +unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, unsigned LoopCost) { // -- The interleave heuristics -- // We interleave the loop in order to expose ILP and reduce the loop overhead. @@ -5466,7 +5558,8 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, } // Clamp the interleave ranges to reasonable counts. - unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); + assert(!VF.Scalable && "scalable vectors not yet supported."); + unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF.Min); // Check if the user has overridden the max. if (VF == 1) { @@ -5480,7 +5573,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, // If trip count is known or estimated compile time constant, limit the // interleave count to be less than the trip count divided by VF. 
if (BestKnownTC) { - MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount); + MaxInterleaveCount = std::min(*BestKnownTC / VF.Min, MaxInterleaveCount); } // If we did not calculate the cost for VF (because the user selected the VF) @@ -5499,7 +5592,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, // Interleave if we vectorized this loop and there is a reduction that could // benefit from interleaving. - if (VF > 1 && !Legal->getReductionVars().empty()) { + if (VF.isVector() && !Legal->getReductionVars().empty()) { LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); return IC; } @@ -5507,7 +5600,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, // Note that if we've already vectorized the loop we will have done the // runtime check and so interleaving won't require further checks. bool InterleavingRequiresRuntimePointerCheck = - (VF == 1 && Legal->getRuntimePointerChecking()->Need); + (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); // We want to interleave small loops in order to reduce the loop overhead and // potentially expose ILP opportunities. @@ -5561,7 +5654,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, } SmallVector -LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { +LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { // This function calculates the register usage by measuring the highest number // of values that are alive at a single location. Obviously, this is a very // rough estimation. We scan the loop in a topological order in order and @@ -5648,11 +5741,12 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); // A lambda that gets the register usage for the given type and VF. 
- auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { + auto GetRegUsage = [&DL, WidestRegister](Type *Ty, ElementCount VF) { if (Ty->isTokenTy()) return 0U; unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); - return std::max(1, VF * TypeSize / WidestRegister); + assert(!VF.Scalable && "scalable vectors not yet supported."); + return std::max(1, VF.Min * TypeSize / WidestRegister); }; for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { @@ -5676,7 +5770,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { // Count the number of live intervals. SmallMapVector RegUsage; - if (VFs[j] == 1) { + if (VFs[j].isScalar()) { for (auto Inst : OpenIntervals) { unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); if (RegUsage.find(ClassID) == RegUsage.end()) @@ -5725,8 +5819,10 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { SmallMapVector Invariant; for (auto Inst : LoopInvariants) { - unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]); - unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType()); + unsigned Usage = + VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); + unsigned ClassID = + TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); if (Invariant.find(ClassID) == Invariant.end()) Invariant[ClassID] = Usage; else @@ -5774,12 +5870,13 @@ bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ NumPredStores > NumberOfStoresToPredicate); } -void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { +void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { // If we aren't vectorizing the loop, or if we've already collected the // instructions to scalarize, there's nothing to do. Collection may already // have occurred if we have a user-selected VF and are now computing the // expected cost for interleaving. 
- if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end()) + if (VF.isScalar() || VF.isZero() || + InstsToScalarize.find(VF) != InstsToScalarize.end()) return; // Initialize a mapping for VF in InstsToScalalarize. If we find that it's @@ -5809,7 +5906,7 @@ void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { int LoopVectorizationCostModel::computePredInstDiscount( Instruction *PredInst, DenseMap &ScalarCosts, - unsigned VF) { + ElementCount VF) { assert(!isUniformAfterVectorization(PredInst, VF) && "Instruction marked uniform-after-vectorization will be predicated"); @@ -5876,16 +5973,20 @@ int LoopVectorizationCostModel::computePredInstDiscount( // the instruction as if it wasn't if-converted and instead remained in the // predicated block. We will scale this cost by block probability after // computing the scalarization overhead. - unsigned ScalarCost = VF * getInstructionCost(I, 1).first; + assert(!VF.Scalable && "scalable vectors not yet supported."); + unsigned ScalarCost = + VF.Min * getInstructionCost(I, ElementCount::getFixed(1)).first; // Compute the scalarization overhead of needed insertelement instructions // and phi nodes. 
if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { ScalarCost += TTI.getScalarizationOverhead( cast(ToVectorTy(I->getType(), VF)), - APInt::getAllOnesValue(VF), true, false); - ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI, - TTI::TCK_RecipThroughput); + APInt::getAllOnesValue(VF.Min), true, false); + assert(!VF.Scalable && "scalable vectors not yet supported."); + ScalarCost += + VF.Min * + TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); } // Compute the scalarization overhead of needed extractelement @@ -5898,10 +5999,12 @@ int LoopVectorizationCostModel::computePredInstDiscount( "Instruction has non-scalar type"); if (canBeScalarized(J)) Worklist.push_back(J); - else if (needsExtract(J, VF)) + else if (needsExtract(J, VF)) { + assert(!VF.Scalable && "scalable vectors not yet supported."); ScalarCost += TTI.getScalarizationOverhead( cast(ToVectorTy(J->getType(), VF)), - APInt::getAllOnesValue(VF), false, true); + APInt::getAllOnesValue(VF.Min), false, true); + } } // Scale the total scalar cost by block probability. @@ -5917,7 +6020,8 @@ int LoopVectorizationCostModel::computePredInstDiscount( } LoopVectorizationCostModel::VectorizationCostTy -LoopVectorizationCostModel::expectedCost(unsigned VF) { +LoopVectorizationCostModel::expectedCost(ElementCount VF) { + assert(!VF.Scalable && "scalable vectors not yet supported."); VectorizationCostTy Cost; // For each block. @@ -5927,7 +6031,8 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) { // For each instruction in the old loop. for (Instruction &I : BB->instructionsWithoutDebug()) { // Skip ignored values. - if (ValuesToIgnore.count(&I) || (VF > 1 && VecValuesToIgnore.count(&I))) + if (ValuesToIgnore.count(&I) || + (VF.isVector() && VecValuesToIgnore.count(&I))) continue; VectorizationCostTy C = getInstructionCost(&I, VF); @@ -5949,7 +6054,7 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) { // unconditionally executed. 
For the scalar case, we may not always execute // the predicated block. Thus, scale the block's cost by the probability of // executing it. - if (VF == 1 && blockNeedsPredication(BB)) + if (VF.isScalar() && blockNeedsPredication(BB)) BlockCost.first /= getReciprocalPredBlockProb(); Cost.first += BlockCost.first; @@ -5994,9 +6099,12 @@ static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { Legal->hasStride(I->getOperand(1)); } -unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, - unsigned VF) { - assert(VF > 1 && "Scalarization cost of instruction implies vectorization."); +unsigned +LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, + ElementCount VF) { + assert(VF.isVector() && + "Scalarization cost of instruction implies vectorization."); + assert(!VF.Scalable && "scalable vectors not yet supported."); Type *ValTy = getMemInstValueType(I); auto SE = PSE.getSE(); @@ -6009,14 +6117,14 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); // Get the cost of the scalar memory instruction and address computation. - unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); + unsigned Cost = VF.Min * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); // Don't pass *I here, since it is scalar but will actually be part of a // vectorized loop where the user of it is a vectorized instruction. const Align Alignment = getLoadStoreAlignment(I); - Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), - Alignment, AS, - TTI::TCK_RecipThroughput); + Cost += VF.Min * + TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, + AS, TTI::TCK_RecipThroughput); // Get the overhead of the extractelement and insertelement instructions // we might create due to scalarization. 
@@ -6038,7 +6146,7 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, } unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, - unsigned VF) { + ElementCount VF) { Type *ValTy = getMemInstValueType(I); auto *VectorTy = cast(ToVectorTy(ValTy, VF)); Value *Ptr = getLoadStorePointerOperand(I); @@ -6064,7 +6172,7 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, } unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, - unsigned VF) { + ElementCount VF) { Type *ValTy = getMemInstValueType(I); auto *VectorTy = cast(ToVectorTy(ValTy, VF)); const Align Alignment = getLoadStoreAlignment(I); @@ -6082,14 +6190,13 @@ unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, return TTI.getAddressComputationCost(ValTy) + TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, CostKind) + - (isLoopInvariantStoreValue - ? 0 - : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, - VF - 1)); + (isLoopInvariantStoreValue ? 
0 : TTI.getVectorInstrCost( + Instruction::ExtractElement, + VectorTy, VF.Min - 1)); } unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, - unsigned VF) { + ElementCount VF) { Type *ValTy = getMemInstValueType(I); auto *VectorTy = cast(ToVectorTy(ValTy, VF)); const Align Alignment = getLoadStoreAlignment(I); @@ -6102,7 +6209,7 @@ unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, } unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, - unsigned VF) { + ElementCount VF) { Type *ValTy = getMemInstValueType(I); auto *VectorTy = cast(ToVectorTy(ValTy, VF)); unsigned AS = getLoadStoreAddressSpace(I); @@ -6111,7 +6218,8 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, assert(Group && "Fail to get an interleaved access group."); unsigned InterleaveFactor = Group->getFactor(); - auto *WideVecTy = FixedVectorType::get(ValTy, VF * InterleaveFactor); + assert(!VF.Scalable && "scalable vectors not yet supported."); + auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); // Holds the indices of existing members in an interleaved load group. // An interleaved store group doesn't need this as it doesn't allow gaps. @@ -6140,10 +6248,10 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, } unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, - unsigned VF) { + ElementCount VF) { // Calculate scalar cost only. Vectorization cost should be ready at this // moment. 
- if (VF == 1) { + if (VF.isScalar()) { Type *ValTy = getMemInstValueType(I); const Align Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); @@ -6156,35 +6264,42 @@ unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, } LoopVectorizationCostModel::VectorizationCostTy -LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { +LoopVectorizationCostModel::getInstructionCost(Instruction *I, + ElementCount VF) { + assert(!VF.Scalable && + "the cost model is not yet implemented for scalable vectorization"); // If we know that this instruction will remain uniform, check the cost of // the scalar version. if (isUniformAfterVectorization(I, VF)) - VF = 1; + VF = ElementCount::getFixed(1); - if (VF > 1 && isProfitableToScalarize(I, VF)) + if (VF.isVector() && isProfitableToScalarize(I, VF)) return VectorizationCostTy(InstsToScalarize[VF][I], false); // Forced scalars do not have any scalarization overhead. auto ForcedScalar = ForcedScalars.find(VF); - if (VF > 1 && ForcedScalar != ForcedScalars.end()) { + if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { auto InstSet = ForcedScalar->second; if (InstSet.count(I)) - return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false); + return VectorizationCostTy( + (getInstructionCost(I, ElementCount::getFixed(1)).first * VF.Min), + false); } Type *VectorTy; unsigned C = getInstructionCost(I, VF, VectorTy); - bool TypeNotScalarized = - VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF; + bool TypeNotScalarized = VF.isVector() && VectorTy->isVectorTy() && + TTI.getNumberOfParts(VectorTy) < VF.Min; return VectorizationCostTy(C, TypeNotScalarized); } unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, - unsigned VF) { + ElementCount VF) { - if (VF == 1) + assert(!VF.Scalable && + "cannot compute scalarization overhead for scalable vectorization"); + if (VF.isScalar()) return 0; unsigned Cost = 
0; @@ -6192,7 +6307,7 @@ unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, if (!RetTy->isVoidTy() && (!isa(I) || !TTI.supportsEfficientVectorElementLoadStore())) Cost += TTI.getScalarizationOverhead( - cast(RetTy), APInt::getAllOnesValue(VF), true, false); + cast(RetTy), APInt::getAllOnesValue(VF.Min), true, false); // Some targets keep addresses scalar. if (isa(I) && !TTI.prefersVectorizedAddressing()) @@ -6208,12 +6323,14 @@ unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, // Skip operands that do not require extraction/scalarization and do not incur // any overhead. - return Cost + TTI.getOperandsScalarizationOverhead( - filterExtractingOperands(Ops, VF), VF); + return Cost + + TTI.getOperandsScalarizationOverhead(filterExtractingOperands(Ops, VF), + VF.Min); } -void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { - if (VF == 1) +void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { + assert(!VF.Scalable && "scalable vectors not yet supported."); + if (VF.isScalar()) return; NumPredStores = 0; for (BasicBlock *BB : TheLoop->blocks()) { @@ -6347,14 +6464,17 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { InstWidening Decision = getWideningDecision(I, VF); if (Decision == CM_Widen || Decision == CM_Widen_Reverse) // Scalarize a widened load of address. - setWideningDecision(I, VF, CM_Scalarize, - (VF * getMemoryInstructionCost(I, 1))); + setWideningDecision( + I, VF, CM_Scalarize, + (VF.Min * getMemoryInstructionCost(I, ElementCount::getFixed(1)))); else if (auto Group = getInterleavedAccessGroup(I)) { // Scalarize an interleave group of address loads. 
for (unsigned I = 0; I < Group->getFactor(); ++I) { if (Instruction *Member = Group->getMember(I)) - setWideningDecision(Member, VF, CM_Scalarize, - (VF * getMemoryInstructionCost(Member, 1))); + setWideningDecision( + Member, VF, CM_Scalarize, + (VF.Min * + getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); } } } else @@ -6365,7 +6485,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { } unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, - unsigned VF, + ElementCount VF, Type *&VectorTy) { Type *RetTy = I->getType(); if (canTruncateToMinimalBitwidth(I, VF)) @@ -6388,19 +6508,20 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, // blocks requires also an extract of its vector compare i1 element. bool ScalarPredicatedBB = false; BranchInst *BI = cast(I); - if (VF > 1 && BI->isConditional() && + if (VF.isVector() && BI->isConditional() && (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) ScalarPredicatedBB = true; if (ScalarPredicatedBB) { // Return cost for branches around scalarized and predicated blocks. + assert(!VF.Scalable && "scalable vectors not yet supported."); auto *Vec_i1Ty = - FixedVectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); - return (TTI.getScalarizationOverhead(Vec_i1Ty, APInt::getAllOnesValue(VF), - false, true) + - (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF)); - } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1) + VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); + return (TTI.getScalarizationOverhead( + Vec_i1Ty, APInt::getAllOnesValue(VF.Min), false, true) + + (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.Min)); + } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) // The back-edge branch will remain, as will all scalar branches. 
return TTI.getCFInstrCost(Instruction::Br, CostKind); else @@ -6415,15 +6536,15 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, // First-order recurrences are replaced by vector shuffles inside the loop. // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. - if (VF > 1 && Legal->isFirstOrderRecurrence(Phi)) + if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - cast(VectorTy), VF - 1, + cast(VectorTy), VF.Min - 1, FixedVectorType::get(RetTy, 1)); // Phi nodes in non-header blocks (not inductions, reductions, etc.) are // converted into select instructions. We require N - 1 selects per phi // node, where N is the number of incoming values. - if (VF > 1 && Phi->getParent() != TheLoop->getHeader()) + if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) return (Phi->getNumIncomingValues() - 1) * TTI.getCmpSelInstrCost( Instruction::Select, ToVectorTy(Phi->getType(), VF), @@ -6440,17 +6561,18 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, // vector lane. Get the scalarization cost and scale this amount by the // probability of executing the predicated block. If the instruction is not // predicated, we fall through to the next case. - if (VF > 1 && isScalarWithPredication(I)) { + if (VF.isVector() && isScalarWithPredication(I)) { unsigned Cost = 0; // These instructions have a non-void type, so account for the phi nodes // that we will create. This cost is likely to be zero. The phi node // cost, if any, should be scaled by the block probability because it // models a copy at the end of each predicated block. - Cost += VF * TTI.getCFInstrCost(Instruction::PHI, CostKind); + Cost += VF.Min * TTI.getCFInstrCost(Instruction::PHI, CostKind); // The cost of the non-predicated instruction. 
- Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); + Cost += + VF.Min * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); // The cost of insertelement and extractelement instructions needed for // scalarization. @@ -6489,14 +6611,15 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, Op2VK = TargetTransformInfo::OK_UniformValue; SmallVector Operands(I->operand_values()); - unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; + unsigned N = isScalarAfterVectorization(I, VF) ? VF.Min : 1; return N * TTI.getArithmeticInstrCost( I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); } case Instruction::FNeg: { - unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; + assert(!VF.Scalable && "VF is assumed to be non scalable."); + unsigned N = isScalarAfterVectorization(I, VF) ? VF.Min : 1; return N * TTI.getArithmeticInstrCost( I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, @@ -6509,9 +6632,10 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); Type *CondTy = SI->getCondition()->getType(); - if (!ScalarCond) - CondTy = FixedVectorType::get(CondTy, VF); - + if (!ScalarCond) { + assert(!VF.Scalable && "VF is assumed to be non scalable."); + CondTy = VectorType::get(CondTy, VF); + } return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, CostKind, I); } @@ -6527,13 +6651,13 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, } case Instruction::Store: case Instruction::Load: { - unsigned Width = VF; - if (Width > 1) { + ElementCount Width = VF; + if (Width.isVector()) { InstWidening Decision = getWideningDecision(I, Width); assert(Decision != CM_Unknown && "CM decision should be taken at this point"); if (Decision == CM_Scalarize) - Width = 1; + 
Width = ElementCount::getFixed(1); } VectorTy = ToVectorTy(getMemInstValueType(I), Width); return getMemoryInstructionCost(I, VF); @@ -6555,7 +6679,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, assert((isa(I) || isa(I)) && "Expected a load or a store!"); - if (VF == 1 || !TheLoop->contains(I)) + if (VF.isScalar() || !TheLoop->contains(I)) return TTI::CastContextHint::Normal; switch (getWideningDecision(I, VF)) { @@ -6621,7 +6745,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, } } - unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; + assert(!VF.Scalable && "VF is assumed to be non scalable"); + unsigned N = isScalarAfterVectorization(I, VF) ? VF.Min : 1; return N * TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); } @@ -6636,8 +6761,9 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, default: // The cost of executing VF copies of the scalar instruction. This opcode // is unknown. Assume that it is the same as 'mul'. - return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, - CostKind) + + return VF.Min * + TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, + CostKind) + getScalarizationOverhead(I, VF); } // end of switch. } @@ -6743,8 +6869,9 @@ static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, } VectorizationFactor -LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) { - unsigned VF = UserVF; +LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { + assert(!UserVF.Scalable && "scalable vectors not yet supported"); + ElementCount VF = UserVF; // Outer loop handling: They may require CFG and instruction level // transformations before even evaluating whether vectorization is profitable. 
// Since we cannot modify the incoming IR, we need to build VPlan upfront in @@ -6752,28 +6879,29 @@ LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) { if (!OrigLoop->empty()) { // If the user doesn't provide a vectorization factor, determine a // reasonable one. - if (!UserVF) { - VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM); + if (UserVF.isZero()) { + VF = ElementCount::getFixed( + determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM)); LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); // Make sure we have a VF > 1 for stress testing. - if (VPlanBuildStressTest && VF < 2) { + if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " << "overriding computed VF.\n"); - VF = 4; + VF = ElementCount::getFixed(4); } } assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); - assert(isPowerOf2_32(VF) && "VF needs to be a power of two"); - LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF - << " to build VPlans.\n"); - buildVPlans(VF, VF); + assert(isPowerOf2_32(VF.Min) && "VF needs to be a power of two"); + LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") + << "VF " << VF << " to build VPlans.\n"); + buildVPlans(VF.Min, VF.Min); // For VPlan build stress testing, we bail out after VPlan construction. 
if (VPlanBuildStressTest) return VectorizationFactor::Disabled(); - return {VF, 0}; + return {VF, 0 /*Cost*/}; } LLVM_DEBUG( @@ -6782,10 +6910,11 @@ LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) { return VectorizationFactor::Disabled(); } -Optional LoopVectorizationPlanner::plan(unsigned UserVF, - unsigned UserIC) { +Optional +LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { + assert(!UserVF.Scalable && "scalable vectorization not yet handled"); assert(OrigLoop->empty() && "Inner loop expected."); - Optional MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); + Optional MaybeMaxVF = CM.computeMaxVF(UserVF.Min, UserIC); if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. return None; @@ -6803,14 +6932,14 @@ Optional LoopVectorizationPlanner::plan(unsigned UserVF, CM.invalidateCostModelingDecisions(); } - if (UserVF) { + if (!UserVF.isZero()) { LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); - assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); + assert(isPowerOf2_32(UserVF.Min) && "VF needs to be a power of two"); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. CM.selectUserVectorizationFactor(UserVF); CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(UserVF, UserVF); + buildVPlansWithVPRecipes(UserVF.Min, UserVF.Min); LLVM_DEBUG(printPlans(dbgs())); return {{UserVF, 0}}; } @@ -6820,12 +6949,12 @@ Optional LoopVectorizationPlanner::plan(unsigned UserVF, for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { // Collect Uniform and Scalar instructions after vectorization with VF. - CM.collectUniformsAndScalars(VF); + CM.collectUniformsAndScalars(ElementCount::getFixed(VF)); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. 
if (VF > 1) - CM.collectInstsToScalarize(VF); + CM.collectInstsToScalarize(ElementCount::getFixed(VF)); } CM.collectInLoopReductions(); @@ -6839,7 +6968,7 @@ Optional LoopVectorizationPlanner::plan(unsigned UserVF, return CM.selectVectorizationFactor(MaxVF); } -void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) { +void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF << '\n'); BestVF = VF; @@ -6858,9 +6987,11 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, // 1. Create a new empty loop. Unlink the old loop and connect the new one. VPCallbackILV CallbackILV(ILV); - VPTransformState State{BestVF, BestUF, LI, - DT, ILV.Builder, ILV.VectorLoopValueMap, - &ILV, CallbackILV}; + assert(BestVF.hasValue() && "Vectorization Factor is missing"); + + VPTransformState State{*BestVF, BestUF, LI, + DT, ILV.Builder, ILV.VectorLoopValueMap, + &ILV, CallbackILV}; State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); State.TripCount = ILV.getOrCreateTripCount(nullptr); State.CanonicalIV = ILV.Induction; @@ -6974,12 +7105,12 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) { } bool LoopVectorizationPlanner::getDecisionAndClampRange( - const std::function &Predicate, VFRange &Range) { + const std::function &Predicate, VFRange &Range) { assert(Range.End > Range.Start && "Trying to test an empty VF range."); - bool PredicateAtRangeStart = Predicate(Range.Start); + bool PredicateAtRangeStart = Predicate(ElementCount::getFixed(Range.Start)); for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) - if (Predicate(TmpVF) != PredicateAtRangeStart) { + if (Predicate(ElementCount::getFixed(TmpVF)) != PredicateAtRangeStart) { Range.End = TmpVF; break; } @@ -7060,10 +7191,16 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { } VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); bool TailFolded = 
!CM.isScalarEpilogueAllowed(); - if (TailFolded && CM.TTI.emitGetActiveLaneMask()) - BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, BTC}); - else + + if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { + // While ActiveLaneMask is a binary op that consumes the loop tripcount + // as a second argument, we only pass the IV here and extract the + // tripcount from the transform state where codegen of the VP instructions + // happen. + BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); + } else { BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); + } return BlockMaskCache[BB] = BlockMask; } @@ -7090,8 +7227,9 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, assert((isa(I) || isa(I)) && "Must be called with either a load or store"); - auto willWiden = [&](unsigned VF) -> bool { - if (VF == 1) + auto willWiden = [&](ElementCount VF) -> bool { + assert(!VF.Scalable && "unexpected scalable ElementCount"); + if (VF.isScalar()) return false; LoopVectorizationCostModel::InstWidening Decision = CM.getWideningDecision(I, VF); @@ -7144,9 +7282,10 @@ VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, // Determine whether \p K is a truncation based on an induction variable that // can be optimized. 
auto isOptimizableIVTruncate = - [&](Instruction *K) -> std::function { - return - [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; + [&](Instruction *K) -> std::function { + return [=](ElementCount VF) -> bool { + return CM.isOptimizableIVTruncate(K, VF); + }; }; if (LoopVectorizationPlanner::getDecisionAndClampRange( @@ -7181,7 +7320,9 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, VPlan &Plan) const { bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [this, CI](unsigned VF) { return CM.isScalarWithPredication(CI, VF); }, + [this, CI](ElementCount VF) { + return CM.isScalarWithPredication(CI, VF); + }, Range); if (IsPredicated) @@ -7192,7 +7333,7 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) return nullptr; - auto willWiden = [&](unsigned VF) -> bool { + auto willWiden = [&](ElementCount VF) -> bool { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); // The following case may be scalarized depending on the VF. // The flag shows whether we use Intrinsic or a usual Call for vectorized @@ -7216,7 +7357,7 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { !isa(I) && "Instruction should have been handled earlier"); // Instruction should be widened, unless it is scalar after vectorization, // scalarization is profitable or it is predicated. 
- auto WillScalarize = [this, I](unsigned VF) -> bool { + auto WillScalarize = [this, I](ElementCount VF) -> bool { return CM.isScalarAfterVectorization(I, VF) || CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I, VF); @@ -7279,11 +7420,12 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( DenseMap &PredInst2Recipe, VPlanPtr &Plan) { bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, + [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, Range); bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); + [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, + Range); auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), IsUniform, IsPredicated); @@ -7491,8 +7633,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // placeholders for its members' Recipes which we'll be replacing with a // single VPInterleaveRecipe. 
for (InterleaveGroup *IG : IAI.getInterleaveGroups()) { - auto applyIG = [IG, this](unsigned VF) -> bool { - return (VF >= 2 && // Query is illegal for VF == 1 + auto applyIG = [IG, this](ElementCount VF) -> bool { + return (VF.isVector() && // Query is illegal for VF == 1 CM.getWideningDecision(IG->getInsertPos(), VF) == LoopVectorizationCostModel::CM_Interleave); }; @@ -7617,10 +7759,10 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( std::string PlanName; raw_string_ostream RSO(PlanName); - unsigned VF = Range.Start; + ElementCount VF = ElementCount::getFixed(Range.Start); Plan->addVF(VF); RSO << "Initial VPlan for VF={" << VF; - for (VF *= 2; VF < Range.End; VF *= 2) { + for (VF.Min *= 2; VF.Min < Range.End; VF.Min *= 2) { Plan->addVF(VF); RSO << "," << VF; } @@ -7647,7 +7789,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { HCFGBuilder.buildHierarchicalCFG(); for (unsigned VF = Range.Start; VF < Range.End; VF *= 2) - Plan->addVF(VF); + Plan->addVF(ElementCount::getFixed(VF)); if (EnableVPlanPredication) { VPlanPredicator VPP(*Plan); @@ -7841,11 +7983,12 @@ void VPReplicateRecipe::execute(VPTransformState &State) { State.ILV->scalarizeInstruction(Ingredient, User, *State.Instance, IsPredicated, State); // Insert scalar instance packing it into a vector. - if (AlsoPack && State.VF > 1) { + if (AlsoPack && State.VF.isVector()) { // If we're constructing lane 0, initialize to start from undef. 
if (State.Instance->Lane == 0) { - Value *Undef = UndefValue::get( - FixedVectorType::get(Ingredient->getType(), State.VF)); + assert(!State.VF.Scalable && "VF is assumed to be non scalable."); + Value *Undef = + UndefValue::get(VectorType::get(Ingredient->getType(), State.VF)); State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef); } State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance); @@ -7856,7 +7999,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) { // Generate scalar instances for all VF lanes of all UF parts, unless the // instruction is uniform inwhich case generate only the first lane for each // of the UF parts. - unsigned EndLane = IsUniform ? 1 : State.VF; + unsigned EndLane = IsUniform ? 1 : State.VF.Min; for (unsigned Part = 0; Part < State.UF; ++Part) for (unsigned Lane = 0; Lane < EndLane; ++Lane) State.ILV->scalarizeInstruction(Ingredient, User, {Part, Lane}, @@ -7948,8 +8091,8 @@ static ScalarEpilogueLowering getScalarEpilogueLowering( Hints.getForce() != LoopVectorizeHints::FK_Enabled)) return CM_ScalarEpilogueNotAllowedOptSize; - bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() && - !PreferPredicateOverEpilog; + bool PredicateOptDisabled = PreferPredicateOverEpilogue.getNumOccurrences() && + !PreferPredicateOverEpilogue; // 2) Next, if disabling predication is requested on the command line, honour // this and request a scalar epilogue. @@ -7958,8 +8101,8 @@ static ScalarEpilogueLowering getScalarEpilogueLowering( // 3) and 4) look if enabling predication is requested on the command line, // with a loop hint, or if the TTI hook indicates this is profitable, request - // predication . - if (PreferPredicateOverEpilog || + // predication. 
+ if (PreferPredicateOverEpilogue || Hints.getPredicate() == LoopVectorizeHints::FK_Enabled || (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, LVL.getLAI()) && @@ -8002,7 +8145,8 @@ static bool processLoopInVPlanNativePath( const unsigned UserVF = Hints.getWidth(); // Plan how to best vectorize, return the best VF and its cost. - const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); + const VectorizationFactor VF = + LVP.planInVPlanNativePath(ElementCount::getFixed(UserVF)); // If we are stress testing VPlan builds, do not attempt to generate vector // code. Masked vector code generation support will follow soon. @@ -8168,7 +8312,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { unsigned UserIC = Hints.getInterleave(); // Plan how to best vectorize, return the best VF and its cost. - Optional MaybeVF = LVP.plan(UserVF, UserIC); + Optional MaybeVF = + LVP.plan(ElementCount::getFixed(UserVF), UserIC); VectorizationFactor VF = VectorizationFactor::Disabled(); unsigned IC = 1; diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 6d8a12ab25ad5..e5e3d3611d07a 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -286,7 +286,8 @@ static bool isCommutative(Instruction *I) { static Optional isShuffle(ArrayRef VL) { auto *EI0 = cast(VL[0]); - unsigned Size = EI0->getVectorOperandType()->getNumElements(); + unsigned Size = + cast(EI0->getVectorOperandType())->getNumElements(); Value *Vec1 = nullptr; Value *Vec2 = nullptr; enum ShuffleMode { Unknown, Select, Permute }; @@ -295,7 +296,7 @@ isShuffle(ArrayRef VL) { auto *EI = cast(VL[I]); auto *Vec = EI->getVectorOperand(); // All vector operands must have the same number of vector elements. 
- if (cast(Vec->getType())->getNumElements() != Size) + if (cast(Vec->getType())->getNumElements() != Size) return None; auto *Idx = dyn_cast(EI->getIndexOperand()); if (!Idx) @@ -1411,7 +1412,7 @@ class BoUpSLP { /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. - int getGatherCost(VectorType *Ty, + int getGatherCost(FixedVectorType *Ty, const DenseSet &ShuffledIndices) const; /// \returns the scalarization cost for this list of values. Assuming that @@ -1424,7 +1425,7 @@ class BoUpSLP { void setInsertPointAfterBundle(TreeEntry *E); /// \returns a vector from a collection of scalars in \p VL. - Value *Gather(ArrayRef VL, VectorType *Ty); + Value *Gather(ArrayRef VL, FixedVectorType *Ty); /// \returns whether the VectorizableTree is fully vectorizable and will /// be beneficial even the tree height is tiny. @@ -3166,7 +3167,7 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { N *= AT->getNumElements(); EltTy = AT->getElementType(); } else { - auto *VT = cast(EltTy); + auto *VT = cast(EltTy); N *= VT->getNumElements(); EltTy = VT->getElementType(); } @@ -3204,7 +3205,7 @@ bool BoUpSLP::canReuseExtract(ArrayRef VL, Value *OpValue, if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size())) return false; } else { - NElts = cast(Vec->getType())->getNumElements(); + NElts = cast(Vec->getType())->getNumElements(); } if (NElts != VL.size()) @@ -3255,8 +3256,8 @@ bool BoUpSLP::areAllUsersVectorized(Instruction *I) const { } static std::pair -getVectorCallCosts(CallInst *CI, VectorType *VecTy, TargetTransformInfo *TTI, - TargetLibraryInfo *TLI) { +getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, + TargetTransformInfo *TTI, TargetLibraryInfo *TLI) { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); // Calculate the cost of the scalar and vector calls. 
@@ -3928,7 +3929,7 @@ int BoUpSLP::getTreeCost() { return Cost; } -int BoUpSLP::getGatherCost(VectorType *Ty, +int BoUpSLP::getGatherCost(FixedVectorType *Ty, const DenseSet &ShuffledIndices) const { unsigned NumElts = Ty->getNumElements(); APInt DemandedElts = APInt::getNullValue(NumElts); @@ -4041,7 +4042,7 @@ void BoUpSLP::setInsertPointAfterBundle(TreeEntry *E) { Builder.SetCurrentDebugLocation(Front->getDebugLoc()); } -Value *BoUpSLP::Gather(ArrayRef VL, VectorType *Ty) { +Value *BoUpSLP::Gather(ArrayRef VL, FixedVectorType *Ty) { Value *Vec = UndefValue::get(Ty); // Generate the 'InsertElement' instruction. for (unsigned i = 0; i < Ty->getNumElements(); ++i) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 302a4845e9a86..a616de6eb4f07 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -300,7 +300,8 @@ void VPRegionBlock::execute(VPTransformState *State) { for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) { State->Instance->Part = Part; - for (unsigned Lane = 0, VF = State->VF; Lane < VF; ++Lane) { + assert(!State->VF.Scalable && "VF is assumed to be non scalable."); + for (unsigned Lane = 0, VF = State->VF.Min; Lane < VF; ++Lane) { State->Instance->Lane = Lane; // Visit the VPBlocks connected to \p this, starting from it. for (VPBlockBase *Block : RPOT) { @@ -383,14 +384,14 @@ void VPInstruction::generateInstruction(VPTransformState &State, case VPInstruction::ActiveLaneMask: { // Get first lane of vector induction variable. Value *VIVElem0 = State.get(getOperand(0), {Part, 0}); - // Get first lane of backedge-taken-count. - Value *ScalarBTC = State.get(getOperand(1), {Part, 0}); + // Get the original loop tripcount. 
+ Value *ScalarTC = State.TripCount; auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); - auto *PredTy = FixedVectorType::get(Int1Ty, State.VF); + auto *PredTy = FixedVectorType::get(Int1Ty, State.VF.Min); Instruction *Call = Builder.CreateIntrinsic( - Intrinsic::get_active_lane_mask, {PredTy, ScalarBTC->getType()}, - {VIVElem0, ScalarBTC}, nullptr, "active.lane.mask"); + Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()}, + {VIVElem0, ScalarTC}, nullptr, "active.lane.mask"); State.set(this, Call, Part); break; } @@ -838,14 +839,15 @@ void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { Value *CanonicalIV = State.CanonicalIV; Type *STy = CanonicalIV->getType(); IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); - auto VF = State.VF; - Value *VStart = VF == 1 - ? CanonicalIV - : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast"); + ElementCount VF = State.VF; + assert(!VF.Scalable && "the code following assumes non scalables ECs"); + Value *VStart = VF.isScalar() ? CanonicalIV + : Builder.CreateVectorSplat(VF.Min, CanonicalIV, + "broadcast"); for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { SmallVector Indices; - for (unsigned Lane = 0; Lane < VF; ++Lane) - Indices.push_back(ConstantInt::get(STy, Part * VF + Lane)); + for (unsigned Lane = 0; Lane < VF.Min; ++Lane) + Indices.push_back(ConstantInt::get(STy, Part * VF.Min + Lane)); // If VF == 1, there is only one iteration in the loop above, thus the // element pushed back into Indices is ConstantInt::get(STy, Part) Constant *VStep = VF == 1 ? Indices.back() : ConstantVector::get(Indices); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 54700cb488391..6eed236fc1493 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -115,7 +115,7 @@ struct VectorizerValueMap { /// The vectorization factor. Each entry in the scalar map contains UF x VF /// scalar values. 
- unsigned VF; + ElementCount VF; /// The vector and scalar map storage. We use std::map and not DenseMap /// because insertions to DenseMap invalidate its iterators. @@ -126,7 +126,7 @@ struct VectorizerValueMap { public: /// Construct an empty map with the given unroll and vectorization factors. - VectorizerValueMap(unsigned UF, unsigned VF) : UF(UF), VF(VF) {} + VectorizerValueMap(unsigned UF, ElementCount VF) : UF(UF), VF(VF) {} /// \return True if the map has any vector entry for \p Key. bool hasAnyVectorValue(Value *Key) const { @@ -151,12 +151,14 @@ struct VectorizerValueMap { /// \return True if the map has a scalar entry for \p Key and \p Instance. bool hasScalarValue(Value *Key, const VPIteration &Instance) const { assert(Instance.Part < UF && "Queried Scalar Part is too large."); - assert(Instance.Lane < VF && "Queried Scalar Lane is too large."); + assert(Instance.Lane < VF.Min && "Queried Scalar Lane is too large."); + assert(!VF.Scalable && "VF is assumed to be non scalable."); + if (!hasAnyScalarValue(Key)) return false; const ScalarParts &Entry = ScalarMapStorage.find(Key)->second; assert(Entry.size() == UF && "ScalarParts has wrong dimensions."); - assert(Entry[Instance.Part].size() == VF && + assert(Entry[Instance.Part].size() == VF.Min && "ScalarParts has wrong dimensions."); return Entry[Instance.Part][Instance.Lane] != nullptr; } @@ -195,7 +197,7 @@ struct VectorizerValueMap { // TODO: Consider storing uniform values only per-part, as they occupy // lane 0 only, keeping the other VF-1 redundant entries null. for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part].resize(VF, nullptr); + Entry[Part].resize(VF.Min, nullptr); ScalarMapStorage[Key] = Entry; } ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar; @@ -234,14 +236,15 @@ struct VPCallback { /// VPTransformState holds information passed down when "executing" a VPlan, /// needed for generating the output IR. 
struct VPTransformState { - VPTransformState(unsigned VF, unsigned UF, LoopInfo *LI, DominatorTree *DT, - IRBuilder<> &Builder, VectorizerValueMap &ValueMap, - InnerLoopVectorizer *ILV, VPCallback &Callback) + VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI, + DominatorTree *DT, IRBuilder<> &Builder, + VectorizerValueMap &ValueMap, InnerLoopVectorizer *ILV, + VPCallback &Callback) : VF(VF), UF(UF), Instance(), LI(LI), DT(DT), Builder(Builder), ValueMap(ValueMap), ILV(ILV), Callback(Callback) {} /// The chosen Vectorization and Unroll Factors of the loop being vectorized. - unsigned VF; + ElementCount VF; unsigned UF; /// Hold the indices to generate specific scalar instructions. Null indicates @@ -1583,7 +1586,7 @@ class VPlan { VPBlockBase *Entry; /// Holds the VFs applicable to this VPlan. - SmallSet VFs; + SmallSetVector VFs; /// Holds the name of the VPlan, for printing. std::string Name; @@ -1647,9 +1650,9 @@ class VPlan { return BackedgeTakenCount; } - void addVF(unsigned VF) { VFs.insert(VF); } + void addVF(ElementCount VF) { VFs.insert(VF); } - bool hasVF(unsigned VF) { return VFs.count(VF); } + bool hasVF(ElementCount VF) { return VFs.count(VF); } const std::string &getName() const { return Name; } diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 67725e30b8342..1cc0e40da3a2b 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -437,8 +437,10 @@ bool VectorCombine::foldBitcastShuf(Instruction &I) { TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy)) return false; - unsigned DestNumElts = DestTy->getNumElements(); - unsigned SrcNumElts = SrcTy->getNumElements(); + // FIXME: it should be possible to implement the computation of the widened + // shuffle mask in terms of ElementCount to work with scalable shuffles. 
+ unsigned DestNumElts = cast(DestTy)->getNumElements(); + unsigned SrcNumElts = cast(SrcTy)->getNumElements(); SmallVector NewMask; if (SrcNumElts <= DestNumElts) { // The bitcast is from wide to narrow/equal elements. The shuffle mask can diff --git a/llvm/runtimes/CMakeLists.txt b/llvm/runtimes/CMakeLists.txt index b97cc73dd0fcf..04a13e48cff51 100644 --- a/llvm/runtimes/CMakeLists.txt +++ b/llvm/runtimes/CMakeLists.txt @@ -309,7 +309,6 @@ ${error} Set RUNTIMES_BUILD_ALLOW_DARWIN to allow a single darwin triple.") -DCMAKE_ASM_COMPILER_WORKS=ON -DCOMPILER_RT_DEFAULT_TARGET_ONLY=ON ${${target}_extra_args} - TOOLCHAIN_TOOLS clang lld llvm-ar llvm-lipo llvm-ranlib llvm-nm llvm-objcopy llvm-objdump llvm-strip USE_TOOLCHAIN ${EXTRA_ARGS}) endfunction() @@ -529,7 +528,6 @@ ${error} Set RUNTIMES_BUILD_ALLOW_DARWIN to allow a single darwin triple.") -DCOMPILER_RT_DEFAULT_TARGET_ONLY=ON -DLLVM_RUNTIMES_TARGET=${name} ${${name}_extra_args} - TOOLCHAIN_TOOLS clang lld llvm-ar llvm-lipo llvm-ranlib llvm-nm llvm-objcopy llvm-objdump llvm-strip EXTRA_TARGETS ${${name}_extra_targets} ${${name}_test_targets} USE_TOOLCHAIN diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-trunc.ll b/llvm/test/Analysis/CostModel/AArch64/sve-trunc.ll new file mode 100644 index 0000000000000..32e760f2015d7 --- /dev/null +++ b/llvm/test/Analysis/CostModel/AArch64/sve-trunc.ll @@ -0,0 +1,14 @@ +; RUN: opt -mtriple=aarch64-linux-gnu -mattr=+sve -cost-model -analyze < %s 2>%t | FileCheck %s +; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t + +; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. 
+; WARN-NOT: warning + +; CHECK: Found an estimated cost of 0 for instruction: %0 = trunc %v to + +define void @trunc_nxv2i64_to_nxv2i32(* %ptr, %v) { +entry: + %0 = trunc %v to + store %0, * %ptr + ret void +} diff --git a/llvm/test/Analysis/CostModel/ARM/arith.ll b/llvm/test/Analysis/CostModel/ARM/arith.ll index 55b60fb9c2877..8513cefe5c119 100644 --- a/llvm/test/Analysis/CostModel/ARM/arith.ll +++ b/llvm/test/Analysis/CostModel/ARM/arith.ll @@ -5,6 +5,80 @@ ; RUN: opt -cost-model -analyze -mtriple=thumbv8m.main-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-MAIN ; RUN: opt -cost-model -analyze -mtriple=thumbv8m.base-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-BASE ; RUN: opt -cost-model -analyze -mtriple=armv8r-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8R +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=thumbv8.1m.main -mattr=+mve < %s | FileCheck %s --check-prefix=CHECK-MVE-SIZE + +define void @i1() { +; CHECK-LABEL: 'i1' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = add i1 undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = sub i1 undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = mul i1 undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f = ashr i1 undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g = lshr i1 undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h = shl i1 undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i = and i1 undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j = or i1 undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k = xor i1 undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-V8M-MAIN-LABEL: 'i1' +; 
CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = add i1 undef, undef +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = sub i1 undef, undef +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = mul i1 undef, undef +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f = ashr i1 undef, undef +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g = lshr i1 undef, undef +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h = shl i1 undef, undef +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i = and i1 undef, undef +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j = or i1 undef, undef +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k = xor i1 undef, undef +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; CHECK-V8M-BASE-LABEL: 'i1' +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = add i1 undef, undef +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = sub i1 undef, undef +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = mul i1 undef, undef +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f = ashr i1 undef, undef +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g = lshr i1 undef, undef +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h = shl i1 undef, undef +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i = and i1 undef, undef +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j = or i1 undef, undef +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %k = xor i1 undef, undef +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; CHECK-V8R-LABEL: 'i1' +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = add i1 undef, undef +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = sub i1 undef, undef +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = mul i1 undef, undef +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f = ashr i1 undef, undef +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g = lshr i1 undef, undef +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h = shl i1 undef, undef +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i = and i1 undef, undef +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j = or i1 undef, undef +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k = xor i1 undef, undef +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-MVE-SIZE-LABEL: 'i1' +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = add i1 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = sub i1 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = mul i1 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f = ashr i1 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g = lshr i1 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h = shl i1 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i = and i1 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %j = or i1 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k = xor i1 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %c = add i1 undef, undef + %d = sub i1 undef, undef + %e = mul i1 undef, undef + %f = ashr i1 undef, undef + %g = lshr i1 undef, undef + %h = shl i1 undef, undef + %i = and i1 undef, undef + %j = or i1 undef, undef + %k = xor i1 undef, undef + ret void +} define void @i8() { ; CHECK-LABEL: 'i8' @@ -54,6 +128,18 @@ define void @i8() { ; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j = or i8 undef, undef ; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k = xor i8 undef, undef ; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-MVE-SIZE-LABEL: 'i8' +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = add i8 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = sub i8 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = mul i8 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f = ashr i8 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g = lshr i8 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h = shl i8 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i = and i8 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j = or i8 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k = xor i8 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %c = add i8 
undef, undef %d = sub i8 undef, undef @@ -115,6 +201,18 @@ define void @i16() { ; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j = or i16 undef, undef ; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k = xor i16 undef, undef ; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-MVE-SIZE-LABEL: 'i16' +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = add i16 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = sub i16 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = mul i16 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f = ashr i16 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g = lshr i16 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h = shl i16 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i = and i16 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j = or i16 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k = xor i16 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %c = add i16 undef, undef %d = sub i16 undef, undef @@ -176,6 +274,18 @@ define void @i32() { ; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j = or i32 undef, undef ; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k = xor i32 undef, undef ; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-MVE-SIZE-LABEL: 'i32' +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = add i32 undef, undef +; 
CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = sub i32 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = mul i32 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f = ashr i32 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g = lshr i32 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h = shl i32 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i = and i32 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j = or i32 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k = xor i32 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %c = add i32 undef, undef %d = sub i32 undef, undef @@ -237,6 +347,18 @@ define void @i64() { ; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j = or i64 undef, undef ; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k = xor i64 undef, undef ; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-MVE-SIZE-LABEL: 'i64' +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = add i64 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = sub i64 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = mul i64 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f = ashr i64 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g = lshr i64 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h = 
shl i64 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i = and i64 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j = or i64 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k = xor i64 undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %c = add i64 undef, undef %d = sub i64 undef, undef @@ -485,6 +607,45 @@ define void @vi8() { ; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j16 = or <16 x i8> undef, undef ; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k16 = xor <16 x i8> undef, undef ; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-MVE-SIZE-LABEL: 'vi8' +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c2 = add <2 x i8> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d2 = sub <2 x i8> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e2 = mul <2 x i8> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f2 = ashr <2 x i8> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g2 = lshr <2 x i8> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h2 = shl <2 x i8> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i2 = and <2 x i8> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j2 = or <2 x i8> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k2 = xor <2 x i8> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c4 = add <4 x i8> 
undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d4 = sub <4 x i8> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e4 = mul <4 x i8> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f4 = ashr <4 x i8> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g4 = lshr <4 x i8> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h4 = shl <4 x i8> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i4 = and <4 x i8> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j4 = or <4 x i8> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k4 = xor <4 x i8> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c8 = add <8 x i8> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d8 = sub <8 x i8> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e8 = mul <8 x i8> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f8 = ashr <8 x i8> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g8 = lshr <8 x i8> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h8 = shl <8 x i8> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i8 = and <8 x i8> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j8 = or <8 x i8> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k8 = xor <8 x i8> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %c16 = add <16 x i8> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d16 = sub <16 x i8> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e16 = mul <16 x i8> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = ashr <16 x i8> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g16 = lshr <16 x i8> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h16 = shl <16 x i8> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i16 = and <16 x i8> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j16 = or <16 x i8> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k16 = xor <16 x i8> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %c2 = add <2 x i8> undef, undef %d2 = sub <2 x i8> undef, undef @@ -759,6 +920,45 @@ define void @vi16() { ; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j16 = or <16 x i16> undef, undef ; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k16 = xor <16 x i16> undef, undef ; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-MVE-SIZE-LABEL: 'vi16' +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c2 = add <2 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d2 = sub <2 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e2 = mul <2 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f2 = ashr <2 x i16> undef, 
undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g2 = lshr <2 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h2 = shl <2 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i2 = and <2 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j2 = or <2 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k2 = xor <2 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c4 = add <4 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d4 = sub <4 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e4 = mul <4 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f4 = ashr <4 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g4 = lshr <4 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h4 = shl <4 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i4 = and <4 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j4 = or <4 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k4 = xor <4 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c8 = add <8 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d8 = sub <8 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e8 = mul <8 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %f8 = ashr <8 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g8 = lshr <8 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h8 = shl <8 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i8 = and <8 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j8 = or <8 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k8 = xor <8 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c16 = add <16 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d16 = sub <16 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e16 = mul <16 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = ashr <16 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g16 = lshr <16 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h16 = shl <16 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i16 = and <16 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j16 = or <16 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k16 = xor <16 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %c2 = add <2 x i16> undef, undef %d2 = sub <2 x i16> undef, undef @@ -1033,6 +1233,45 @@ define void @vi32() { ; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %j16 = or 
<16 x i32> undef, undef ; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %k16 = xor <16 x i32> undef, undef ; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-MVE-SIZE-LABEL: 'vi32' +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c2 = add <2 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d2 = sub <2 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e2 = mul <2 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f2 = ashr <2 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g2 = lshr <2 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h2 = shl <2 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i2 = and <2 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j2 = or <2 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k2 = xor <2 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c4 = add <4 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d4 = sub <4 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e4 = mul <4 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f4 = ashr <4 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g4 = lshr <4 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h4 = shl <4 x i32> undef, undef +; 
CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i4 = and <4 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j4 = or <4 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k4 = xor <4 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c8 = add <8 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d8 = sub <8 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e8 = mul <8 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f8 = ashr <8 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g8 = lshr <8 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h8 = shl <8 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i8 = and <8 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j8 = or <8 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k8 = xor <8 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c16 = add <16 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d16 = sub <16 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e16 = mul <16 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = ashr <16 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g16 = lshr <16 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %h16 = shl <16 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i16 = and <16 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j16 = or <16 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k16 = xor <16 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %c2 = add <2 x i32> undef, undef %d2 = sub <2 x i32> undef, undef @@ -1307,6 +1546,45 @@ define void @vi64() { ; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %j16 = or <16 x i64> undef, undef ; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %k16 = xor <16 x i64> undef, undef ; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-MVE-SIZE-LABEL: 'vi64' +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c2 = add <2 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d2 = sub <2 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e2 = mul <2 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f2 = ashr <2 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g2 = lshr <2 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h2 = shl <2 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i2 = and <2 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j2 = or <2 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k2 = xor <2 x i64> 
undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c4 = add <4 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d4 = sub <4 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e4 = mul <4 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f4 = ashr <4 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g4 = lshr <4 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h4 = shl <4 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i4 = and <4 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j4 = or <4 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k4 = xor <4 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c8 = add <8 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d8 = sub <8 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e8 = mul <8 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f8 = ashr <8 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g8 = lshr <8 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h8 = shl <8 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i8 = and <8 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j8 = or <8 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %k8 = xor <8 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c16 = add <16 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d16 = sub <16 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e16 = mul <16 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = ashr <16 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g16 = lshr <16 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h16 = shl <16 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i16 = and <16 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j16 = or <16 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k16 = xor <16 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %c2 = add <2 x i64> undef, undef %d2 = sub <2 x i64> undef, undef diff --git a/llvm/test/Analysis/CostModel/ARM/mve-gather-scatter-cost.ll b/llvm/test/Analysis/CostModel/ARM/mve-gather-scatter-cost.ll index 02bb080b65b64..03444997c3035 100644 --- a/llvm/test/Analysis/CostModel/ARM/mve-gather-scatter-cost.ll +++ b/llvm/test/Analysis/CostModel/ARM/mve-gather-scatter-cost.ll @@ -7,7 +7,7 @@ define i32 @masked_gather() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> undef, i32 4, <2 x i1> undef, <2 x double> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 576 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x 
float*> undef, i32 4, <16 x i1> undef, <16 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> undef, i32 4, <8 x i1> undef, <8 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> undef, i32 4, <4 x i1> undef, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> undef, i32 4, <4 x i1> undef, <4 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> undef, i32 4, <2 x i1> undef, <2 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 544 for instruction: %V16F16 = call <16 x half> @llvm.masked.gather.v16f16.v16p0f16(<16 x half*> undef, i32 2, <16 x i1> undef, <16 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V8F16 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> undef, i32 2, <8 x i1> undef, <8 x half> undef) @@ -17,7 +17,7 @@ define i32 @masked_gather() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> undef, i32 4, <2 x i1> undef, <2 x i64> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 576 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> undef, i32 4, <16 x i1> undef, <16 x i32> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 4, <8 x i1> undef, <8 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> 
undef, i32 4, <4 x i1> undef, <4 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> undef, i32 4, <4 x i1> undef, <4 x i32> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> undef, i32 4, <2 x i1> undef, <2 x i32> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 544 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 2, <16 x i1> undef, <16 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> undef, i32 2, <8 x i1> undef, <8 x i16> undef) @@ -71,7 +71,7 @@ define i32 @masked_scatter() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> undef, <2 x double*> undef, i32 4, <2 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 576 for instruction: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> undef, <16 x float*> undef, i32 4, <16 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 144 for instruction: call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> undef, <8 x float*> undef, i32 4, <8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> undef, <4 x float*> undef, i32 4, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> undef, <4 x float*> undef, i32 4, <4 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> undef, <2 x float*> undef, i32 4, <2 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost 
of 544 for instruction: call void @llvm.masked.scatter.v16f16.v16p0f16(<16 x half> undef, <16 x half*> undef, i32 2, <16 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> undef, <8 x half*> undef, i32 2, <8 x i1> undef) @@ -81,7 +81,7 @@ define i32 @masked_scatter() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> undef, <2 x i64*> undef, i32 4, <2 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 576 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> undef, <16 x i32*> undef, i32 4, <16 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 144 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> undef, <8 x i32*> undef, i32 4, <8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> undef, <4 x i32*> undef, i32 4, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> undef, <4 x i32*> undef, i32 4, <4 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> undef, <2 x i32*> undef, i32 4, <2 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 544 for instruction: call void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16> undef, <16 x i16*> undef, i32 2, <16 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> undef, <8 x i16*> undef, i32 2, <8 x i1> undef) @@ -132,27 +132,27 @@ define i32 @masked_scatter() { define void @gep_v4i32(i32* %base, i16* %base16, i8* %base8, <4 x i32> %ind32, <4 x i16> %ind16, <4 x i1> %mask) { ; CHECK-LABEL: 'gep_v4i32' ; CHECK-NEXT: Cost 
Model: Found an estimated cost of 0 for instruction: %gep1 = getelementptr i32, i32* %base, <4 x i32> %ind32 -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res1 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep1, i32 4, <4 x i1> %mask, <4 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %res1, <4 x i32*> %gep1, i32 4, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res1 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep1, i32 4, <4 x i1> %mask, <4 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %res1, <4 x i32*> %gep1, i32 4, <4 x i1> %mask) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %indzext = zext <4 x i16> %ind16 to <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep2 = getelementptr i32, i32* %base, <4 x i32> %indzext -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res2 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep2, i32 4, <4 x i1> %mask, <4 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %res2, <4 x i32*> %gep2, i32 4, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res2 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep2, i32 4, <4 x i1> %mask, <4 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %res2, <4 x i32*> %gep2, i32 4, <4 x i1> %mask) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %indsext = sext <4 x i16> %ind16 to <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for 
instruction: %gep3 = getelementptr i32, i32* %base, <4 x i32> %indsext -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res3 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep3, i32 4, <4 x i1> %mask, <4 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %res3, <4 x i32*> %gep3, i32 4, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res3 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep3, i32 4, <4 x i1> %mask, <4 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %res3, <4 x i32*> %gep3, i32 4, <4 x i1> %mask) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepu = getelementptr i32, i32* %base, <4 x i32> %ind32 ; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %resu = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gepu, i32 1, <4 x i1> %mask, <4 x i32> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %resu, <4 x i32*> %gepu, i32 1, <4 x i1> %mask) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepos = getelementptr i8, i8* %base8, <4 x i32> %indzext ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %geposb = bitcast <4 x i8*> %gepos to <4 x i32*> -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %resos = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %geposb, i32 4, <4 x i1> %mask, <4 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %resos, <4 x i32*> %geposb, i32 4, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %resos = call 
<4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %geposb, i32 4, <4 x i1> %mask, <4 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %resos, <4 x i32*> %geposb, i32 4, <4 x i1> %mask) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i16, i16* %base16, <4 x i32> %indzext ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbsb = bitcast <4 x i16*> %gepbs to <4 x i32*> -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %resbs = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gepbsb, i32 4, <4 x i1> %mask, <4 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %resbs, <4 x i32*> %gepbsb, i32 4, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %resbs = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gepbsb, i32 4, <4 x i1> %mask, <4 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %resbs, <4 x i32*> %gepbsb, i32 4, <4 x i1> %mask) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %gep1 = getelementptr i32, i32* %base, <4 x i32> %ind32 @@ -191,27 +191,27 @@ define void @gep_v4i32(i32* %base, i16* %base16, i8* %base8, <4 x i32> %ind32, < define void @gep_v4f32(float* %base, i16* %base16, i8* %base8, <4 x i32> %ind32, <4 x i16> %ind16, <4 x i1> %mask) { ; CHECK-LABEL: 'gep_v4f32' ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %gep1 = getelementptr float, float* %base, <4 x i32> %ind32 -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res1 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep1, i32 4, <4 x i1> %mask, <4 x float> undef) -; 
CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %res1, <4 x float*> %gep1, i32 4, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res1 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep1, i32 4, <4 x i1> %mask, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %res1, <4 x float*> %gep1, i32 4, <4 x i1> %mask) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %indzext = zext <4 x i16> %ind16 to <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %gep2 = getelementptr float, float* %base, <4 x i32> %indzext -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res2 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep2, i32 4, <4 x i1> %mask, <4 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %res2, <4 x float*> %gep2, i32 4, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res2 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep2, i32 4, <4 x i1> %mask, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %res2, <4 x float*> %gep2, i32 4, <4 x i1> %mask) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %indsext = sext <4 x i16> %ind16 to <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %gep3 = getelementptr float, float* %base, <4 x i32> %indsext -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res3 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep3, i32 4, <4 x i1> %mask, <4 x float> undef) -; 
CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %res3, <4 x float*> %gep3, i32 4, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res3 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep3, i32 4, <4 x i1> %mask, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %res3, <4 x float*> %gep3, i32 4, <4 x i1> %mask) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %gepu = getelementptr float, float* %base, <4 x i32> %ind32 ; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %resu = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gepu, i32 1, <4 x i1> %mask, <4 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %resu, <4 x float*> %gepu, i32 1, <4 x i1> %mask) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepos = getelementptr i8, i8* %base8, <4 x i32> %indzext ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %geposb = bitcast <4 x i8*> %gepos to <4 x float*> -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %resos = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %geposb, i32 4, <4 x i1> %mask, <4 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %resos, <4 x float*> %geposb, i32 4, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %resos = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %geposb, i32 4, <4 x i1> %mask, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void 
@llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %resos, <4 x float*> %geposb, i32 4, <4 x i1> %mask) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i16, i16* %base16, <4 x i32> %indzext ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbsb = bitcast <4 x i16*> %gepbs to <4 x float*> -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %resbs = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gepbsb, i32 4, <4 x i1> %mask, <4 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %resbs, <4 x float*> %gepbsb, i32 4, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %resbs = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gepbsb, i32 4, <4 x i1> %mask, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %resbs, <4 x float*> %gepbsb, i32 4, <4 x i1> %mask) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %gep1 = getelementptr float, float* %base, <4 x i32> %ind32 @@ -261,14 +261,14 @@ define void @gep_v4i16(i16* %base, <4 x i32> %ind32, <4 x i16> %ind16, <4 x i1> ; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %res3 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %gep3, i32 2, <4 x i1> %mask, <4 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %res3, <4 x i16*> %gep3, i32 2, <4 x i1> %mask) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep5 = getelementptr i16, i16* %base, <4 x i16> %ind16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res5 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> 
%gep5, i32 2, <4 x i1> %mask, <4 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res5 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %gep5, i32 2, <4 x i1> %mask, <4 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res5zext = zext <4 x i16> %res5 to <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %res5trunc = trunc <4 x i32> %res5zext to <4 x i16> -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %res5trunc, <4 x i16*> %gep5, i32 4, <4 x i1> %mask) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res6 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %gep5, i32 2, <4 x i1> %mask, <4 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %res5trunc, <4 x i16*> %gep5, i32 4, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res6 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %gep5, i32 2, <4 x i1> %mask, <4 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res6sext = sext <4 x i16> %res6 to <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %res6trunc = trunc <4 x i32> %res6sext to <4 x i16> -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %res6trunc, <4 x i16*> %gep5, i32 4, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %res6trunc, <4 x i16*> %gep5, i32 4, <4 x i1> %mask) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %gep1 = getelementptr i16, i16* %base, <4 x i32> %ind32 @@ -304,14 +304,14 @@ define void 
@gep_v4i16(i16* %base, <4 x i32> %ind32, <4 x i16> %ind16, <4 x i1> define void @gep_v4i8(i8* %base, <4 x i8> %ind8, <4 x i1> %mask) { ; CHECK-LABEL: 'gep_v4i8' ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep5 = getelementptr i8, i8* %base, <4 x i8> %ind8 -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res5 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %gep5, i32 2, <4 x i1> %mask, <4 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res5 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %gep5, i32 2, <4 x i1> %mask, <4 x i8> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res5zext = zext <4 x i8> %res5 to <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %res5trunc = trunc <4 x i32> %res5zext to <4 x i8> -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %res5trunc, <4 x i8*> %gep5, i32 4, <4 x i1> %mask) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res6 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %gep5, i32 2, <4 x i1> %mask, <4 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %res5trunc, <4 x i8*> %gep5, i32 4, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res6 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %gep5, i32 2, <4 x i1> %mask, <4 x i8> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res6sext = sext <4 x i8> %res6 to <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %res6trunc = trunc <4 x i32> %res6sext to <4 x i8> -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %res6trunc, <4 x i8*> 
%gep5, i32 4, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %res6trunc, <4 x i8*> %gep5, i32 4, <4 x i1> %mask) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; result zext @@ -337,8 +337,8 @@ define void @gep_v8i16(i16* %base, i8* %base8, i32* %base32, <8 x i32> %ind32, < ; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %res1, <8 x i16*> %gep1, i32 2, <8 x i1> %mask) ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %indzext = zext <8 x i16> %ind16 to <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep2 = getelementptr i16, i16* %base, <8 x i32> %indzext -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res2 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep2, i32 2, <8 x i1> %mask, <8 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %res2, <8 x i16*> %gep2, i32 2, <8 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res2 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep2, i32 2, <8 x i1> %mask, <8 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %res2, <8 x i16*> %gep2, i32 2, <8 x i1> %mask) ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %indsext = sext <8 x i16> %ind16 to <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep3 = getelementptr i16, i16* %base, <8 x i32> %indsext ; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %res3 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep3, i32 2, <8 x i1> %mask, <8 x i16> undef) 
@@ -347,8 +347,8 @@ define void @gep_v8i16(i16* %base, i8* %base8, i32* %base32, <8 x i32> %ind32, < ; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %resu, <8 x i16*> %gep2, i32 1, <8 x i1> %mask) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepos = getelementptr i8, i8* %base8, <8 x i32> %indzext ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %geposb = bitcast <8 x i8*> %gepos to <8 x i16*> -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %resos = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %geposb, i32 2, <8 x i1> %mask, <8 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %resos, <8 x i16*> %geposb, i32 2, <8 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %resos = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %geposb, i32 2, <8 x i1> %mask, <8 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %resos, <8 x i16*> %geposb, i32 2, <8 x i1> %mask) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i32, i32* %base32, <8 x i32> %indzext ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbsb = bitcast <8 x i32*> %gepbs to <8 x i16*> ; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %resbs = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gepbsb, i32 2, <8 x i1> %mask, <8 x i16> undef) @@ -356,11 +356,11 @@ define void @gep_v8i16(i16* %base, i8* %base8, i32* %base32, <8 x i32> %ind32, < ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %indzext4 = zext <8 x i16> %ind16 to <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for 
instruction: %gep4 = getelementptr i16, i16* %base, <8 x i32> %indzext4 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %indtrunc = trunc <8 x i32> %ind32 to <8 x i16> -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %indtrunc, <8 x i16*> %gep4, i32 2, <8 x i1> %mask) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep4, i32 2, <8 x i1> %mask, <8 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %indtrunc, <8 x i16*> %gep4, i32 2, <8 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep4, i32 2, <8 x i1> %mask, <8 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %ressext = sext <8 x i16> %res to <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %restrunc = trunc <8 x i32> %ressext to <8 x i16> -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %restrunc, <8 x i16*> %gep4, i32 4, <8 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %restrunc, <8 x i16*> %gep4, i32 4, <8 x i1> %mask) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; no offset ext @@ -418,8 +418,8 @@ define void @gep_v8f16(half* %base, i8* %base8, i32* %base32, <8 x i32> %ind32, ; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %res1, <8 x half*> %gep1, i32 2, <8 x i1> %mask) ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %indzext = zext <8 x 
i16> %ind16 to <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %gep2 = getelementptr half, half* %base, <8 x i32> %indzext -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res2 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep2, i32 2, <8 x i1> %mask, <8 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %res2, <8 x half*> %gep2, i32 2, <8 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res2 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep2, i32 2, <8 x i1> %mask, <8 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %res2, <8 x half*> %gep2, i32 2, <8 x i1> %mask) ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %indsext = sext <8 x i16> %ind16 to <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %gep3 = getelementptr half, half* %base, <8 x i32> %indsext ; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %res3 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep3, i32 2, <8 x i1> %mask, <8 x half> undef) @@ -428,8 +428,8 @@ define void @gep_v8f16(half* %base, i8* %base8, i32* %base32, <8 x i32> %ind32, ; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %resu, <8 x half*> %gep2, i32 1, <8 x i1> %mask) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepos = getelementptr i8, i8* %base8, <8 x i32> %indzext ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %geposb = bitcast <8 x i8*> %gepos to <8 x half*> -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %resos = call <8 x half> 
@llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %geposb, i32 2, <8 x i1> %mask, <8 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %resos, <8 x half*> %geposb, i32 2, <8 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %resos = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %geposb, i32 2, <8 x i1> %mask, <8 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %resos, <8 x half*> %geposb, i32 2, <8 x i1> %mask) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i32, i32* %base32, <8 x i32> %indzext ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbsb = bitcast <8 x i32*> %gepbs to <8 x half*> ; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %resbs = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gepbsb, i32 2, <8 x i1> %mask, <8 x half> undef) @@ -476,14 +476,14 @@ define void @gep_v8i8(i8* %base, <8 x i8> %ind8, <8 x i1> %mask) { ; CHECK-LABEL: 'gep_v8i8' ; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %indzext = zext <8 x i8> %ind8 to <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep5 = getelementptr i8, i8* %base, <8 x i32> %indzext -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res5 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %gep5, i32 2, <8 x i1> %mask, <8 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res5 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %gep5, i32 2, <8 x i1> %mask, <8 x i8> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res5zext = zext <8 x i8> %res5 to <8 x i16> ; CHECK-NEXT: Cost Model: Found an estimated cost 
of 0 for instruction: %res5trunc = trunc <8 x i16> %res5zext to <8 x i8> -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> %res5trunc, <8 x i8*> %gep5, i32 4, <8 x i1> %mask) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res6 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %gep5, i32 2, <8 x i1> %mask, <8 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> %res5trunc, <8 x i8*> %gep5, i32 4, <8 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res6 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %gep5, i32 2, <8 x i1> %mask, <8 x i8> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res6sext = sext <8 x i8> %res6 to <8 x i16> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %res6trunc = trunc <8 x i16> %res6sext to <8 x i8> -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> %res6trunc, <8 x i8*> %gep5, i32 4, <8 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> %res6trunc, <8 x i8*> %gep5, i32 4, <8 x i1> %mask) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; result zext @@ -510,8 +510,8 @@ define void @gep_v16i8(i8* %base, i16* %base16, <16 x i8> %ind8, <16 x i32> %ind ; CHECK-NEXT: Cost Model: Found an estimated cost of 528 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %res1, <16 x i8*> %gep1, i32 2, <16 x i1> %mask) ; CHECK-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %indzext = zext <16 x i8> %ind8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep2 = getelementptr i8, i8* %base, <16 x 
i32> %indzext -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res2 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gep2, i32 2, <16 x i1> %mask, <16 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %res2, <16 x i8*> %gep2, i32 2, <16 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %res2 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gep2, i32 2, <16 x i1> %mask, <16 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %res2, <16 x i8*> %gep2, i32 2, <16 x i1> %mask) ; CHECK-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %indsext = sext <16 x i8> %ind8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep3 = getelementptr i8, i8* %base, <16 x i32> %indsext ; CHECK-NEXT: Cost Model: Found an estimated cost of 528 for instruction: %res3 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gep3, i32 2, <16 x i1> %mask, <16 x i8> undef) @@ -523,7 +523,7 @@ define void @gep_v16i8(i8* %base, i16* %base16, <16 x i8> %ind8, <16 x i32> %ind ; CHECK-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %indzext4 = zext <16 x i8> %ind8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep4 = getelementptr i8, i8* %base, <16 x i32> %indzext ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %indtrunc = trunc <16 x i32> %ind32 to <16 x i8> -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %indtrunc, <16 x i8*> %gep4, i32 2, <16 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %indtrunc, <16 x i8*> %gep4, 
i32 2, <16 x i1> %mask) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; no offset ext diff --git a/llvm/test/Analysis/LoopNestAnalysis/nests-with-lcssa.ll b/llvm/test/Analysis/LoopNestAnalysis/nests-with-lcssa.ll new file mode 100644 index 0000000000000..c5d555b8ff326 --- /dev/null +++ b/llvm/test/Analysis/LoopNestAnalysis/nests-with-lcssa.ll @@ -0,0 +1,248 @@ +; RUN: opt -S -passes='print' < %s 2>&1 > /dev/null | FileCheck %s + +; int f(int N, int M) { +; int res = 0; +; for (int i = 0; i < N; ++i) { +; for (int j = 0; j < M; ++j) res += i * j; +; } +; return res; +; } + +define i32 @f(i32 %N, i32 %M) #0 { +; CHECK: IsPerfect=true, Depth=1, OutermostLoop: for.j, Loops: ( for.j ) +; CHECK: IsPerfect=true, Depth=2, OutermostLoop: for.i, Loops: ( for.i for.j ) +entry: + %cmp4 = icmp slt i32 0, %N + br i1 %cmp4, label %for.i.ph, label %for.i.end + +for.i.ph: ; preds = %entry + br label %for.i + +for.i: ; preds = %for.i.ph, %for.i.inc + %i.06 = phi i32 [ 0, %for.i.ph ], [ %inc5, %for.i.inc ] + %res.05 = phi i32 [ 0, %for.i.ph ], [ %res.1.lcssa, %for.i.inc ] + %cmp21 = icmp slt i32 0, %M + br i1 %cmp21, label %for.j.ph, label %for.j.end + +for.j.ph: ; preds = %for.i + br label %for.j + +for.j: ; preds = %for.j.ph, %for.j.inc + %j.03 = phi i32 [ 0, %for.j.ph ], [ %inc, %for.j.inc ] + %res.12 = phi i32 [ %res.05, %for.j.ph ], [ %add, %for.j.inc ] + %mul = mul nsw i32 %i.06, %j.03 + %add = add nsw i32 %res.12, %mul + br label %for.j.inc + +for.j.inc: ; preds = %for.j + %inc = add nsw i32 %j.03, 1 + %cmp2 = icmp slt i32 %inc, %M + br i1 %cmp2, label %for.j, label %for.j.end_crit_edge + +for.j.end_crit_edge: ; preds = %for.j.inc + %split = phi i32 [ %add, %for.j.inc ] + br label %for.j.end + +for.j.end: ; preds = %for.j.end_crit_edge, %for.i + %res.1.lcssa = phi i32 [ %split, %for.j.end_crit_edge ], [ %res.05, %for.i ] + br label %for.i.inc + +for.i.inc: ; preds = %for.j.end + %inc5 = add nsw i32 %i.06, 1 + %cmp = icmp slt i32 %inc5, %N + 
br i1 %cmp, label %for.i, label %for.i.end_crit_edge + +for.i.end_crit_edge: ; preds = %for.i.inc + %split7 = phi i32 [ %res.1.lcssa, %for.i.inc ] + br label %for.i.end + +for.i.end: ; preds = %for.i.end_crit_edge, %entry + %res.0.lcssa = phi i32 [ %split7, %for.i.end_crit_edge ], [ 0, %entry ] + ret i32 %res.0.lcssa +} + +; int g(int N, int M, int K) { +; int sum = 0, prod = 1; +; for (int i = 0; i < N; ++i) { +; for (int j = 0; j < M; ++j) { +; for (int k = 0; k < K; ++k) { +; sum += i * j * k; +; } +; prod *= (i + j); +; } +; } +; return sum + prod; +; } +define i32 @g(i32 %N, i32 %M, i32 %K) #0 { +; CHECK: IsPerfect=true, Depth=1, OutermostLoop: for.k, Loops: ( for.k ) +; CHECK: IsPerfect=false, Depth=2, OutermostLoop: for.j, Loops: ( for.j for.k ) +; CHECK: IsPerfect=false, Depth=3, OutermostLoop: for.i, Loops: ( for.i for.j for.k ) +entry: + %cmp10 = icmp slt i32 0, %N + br i1 %cmp10, label %for.i.ph, label %for.i.end + +for.i.ph: ; preds = %entry + br label %for.i + +for.i: ; preds = %for.i.ph, %for.i.inc + %i.013 = phi i32 [ 0, %for.i.ph ], [ %inc14, %for.i.inc ] + %sum.012 = phi i32 [ 0, %for.i.ph ], [ %sum.1.lcssa, %for.i.inc ] + %prod.011 = phi i32 [ 1, %for.i.ph ], [ %prod.1.lcssa, %for.i.inc ] + %cmp24 = icmp slt i32 0, %M + br i1 %cmp24, label %for.j.ph, label %for.j.end + +for.j.ph: ; preds = %for.i + br label %for.j + +for.j: ; preds = %for.j.ph, %for.j.inc + %j.07 = phi i32 [ 0, %for.j.ph ], [ %inc11, %for.j.inc ] + %sum.16 = phi i32 [ %sum.012, %for.j.ph ], [ %sum.2.lcssa, %for.j.inc ] + %prod.15 = phi i32 [ %prod.011, %for.j.ph ], [ %mul9, %for.j.inc ] + %cmp51 = icmp slt i32 0, %K + br i1 %cmp51, label %for.k.ph, label %for.k.end + +for.k.ph: ; preds = %for.j + br label %for.k + +for.k: ; preds = %for.k.ph, %for.k.inc + %k.03 = phi i32 [ 0, %for.k.ph ], [ %inc, %for.k.inc ] + %sum.22 = phi i32 [ %sum.16, %for.k.ph ], [ %add, %for.k.inc ] + %mul = mul nsw i32 %i.013, %j.07 + %mul7 = mul nsw i32 %mul, %k.03 + %add = add nsw i32 %sum.22, %mul7 + br 
label %for.k.inc + +for.k.inc: ; preds = %for.k + %inc = add nsw i32 %k.03, 1 + %cmp5 = icmp slt i32 %inc, %K + br i1 %cmp5, label %for.k, label %for.k.end_crit_edge + +for.k.end_crit_edge: ; preds = %for.k.inc + %split = phi i32 [ %add, %for.k.inc ] + br label %for.k.end + +for.k.end: ; preds = %for.k.end_crit_edge, %for.j + %sum.2.lcssa = phi i32 [ %split, %for.k.end_crit_edge ], [ %sum.16, %for.j ] + %add8 = add nsw i32 %i.013, %j.07 + %mul9 = mul nsw i32 %prod.15, %add8 + br label %for.j.inc + +for.j.inc: ; preds = %for.k.end + %inc11 = add nsw i32 %j.07, 1 + %cmp2 = icmp slt i32 %inc11, %M + br i1 %cmp2, label %for.j, label %for.j.end_crit_edge + +for.j.end_crit_edge: ; preds = %for.j.inc + %split8 = phi i32 [ %mul9, %for.j.inc ] + %split9 = phi i32 [ %sum.2.lcssa, %for.j.inc ] + br label %for.j.end + +for.j.end: ; preds = %for.j.end_crit_edge, %for.i + %prod.1.lcssa = phi i32 [ %split8, %for.j.end_crit_edge ], [ %prod.011, %for.i ] + %sum.1.lcssa = phi i32 [ %split9, %for.j.end_crit_edge ], [ %sum.012, %for.i ] + br label %for.i.inc + +for.i.inc: ; preds = %for.j.end + %inc14 = add nsw i32 %i.013, 1 + %cmp = icmp slt i32 %inc14, %N + br i1 %cmp, label %for.i, label %for.i.end_crit_edge + +for.i.end_crit_edge: ; preds = %for.i.inc + %split14 = phi i32 [ %prod.1.lcssa, %for.i.inc ] + %split15 = phi i32 [ %sum.1.lcssa, %for.i.inc ] + br label %for.i.end + +for.i.end: ; preds = %for.i.end_crit_edge, %entry + %prod.0.lcssa = phi i32 [ %split14, %for.i.end_crit_edge ], [ 1, %entry ] + %sum.0.lcssa = phi i32 [ %split15, %for.i.end_crit_edge ], [ 0, %entry ] + %add16 = add nsw i32 %sum.0.lcssa, %prod.0.lcssa + ret i32 %add16 +} + +; int h(int N, int M, int K) { +; int sum = 0; +; for (int i = 0; i < N; ++i) { +; for (int j = 0; j < M; ++j) { +; for (int k = 0; k < K; ++k) { +; sum += i * j * k; +; } +; } +; } +; return sum; +; } +define i32 @h(i32 %N, i32 %M, i32 %K) #0 { +; CHECK: IsPerfect=true, Depth=1, OutermostLoop: for.k, Loops: ( for.k ) +; CHECK: 
IsPerfect=true, Depth=2, OutermostLoop: for.j, Loops: ( for.j for.k ) +; CHECK: IsPerfect=true, Depth=3, OutermostLoop: for.i, Loops: ( for.i for.j for.k ) +entry: + %cmp8 = icmp slt i32 0, %N + br i1 %cmp8, label %for.i.ph, label %for.i.end + +for.i.ph: ; preds = %entry + br label %for.i + +for.i: ; preds = %for.i.ph, %for.i.inc + %i.010 = phi i32 [ 0, %for.i.ph ], [ %inc12, %for.i.inc ] + %sum.09 = phi i32 [ 0, %for.i.ph ], [ %sum.1.lcssa, %for.i.inc ] + %cmp24 = icmp slt i32 0, %M + br i1 %cmp24, label %for.j.ph, label %for.j.end + +for.j.ph: ; preds = %for.i + br label %for.j + +for.j: ; preds = %for.j.ph, %for.j.inc + %j.06 = phi i32 [ 0, %for.j.ph ], [ %inc9, %for.j.inc ] + %sum.15 = phi i32 [ %sum.09, %for.j.ph ], [ %sum.2.lcssa, %for.j.inc ] + %cmp51 = icmp slt i32 0, %K + br i1 %cmp51, label %for.k.ph, label %for.k.end + +for.k.ph: ; preds = %for.j + br label %for.k + +for.k: ; preds = %for.k.ph, %for.k.inc + %k.03 = phi i32 [ 0, %for.k.ph ], [ %inc, %for.k.inc ] + %sum.22 = phi i32 [ %sum.15, %for.k.ph ], [ %add, %for.k.inc ] + %mul = mul nsw i32 %i.010, %j.06 + %mul7 = mul nsw i32 %mul, %k.03 + %add = add nsw i32 %sum.22, %mul7 + br label %for.k.inc + +for.k.inc: ; preds = %for.k + %inc = add nsw i32 %k.03, 1 + %cmp5 = icmp slt i32 %inc, %K + br i1 %cmp5, label %for.k, label %for.k.end_crit_edge + +for.k.end_crit_edge: ; preds = %for.k.inc + %split = phi i32 [ %add, %for.k.inc ] + br label %for.k.end + +for.k.end: ; preds = %for.k.end_crit_edge, %for.j + %sum.2.lcssa = phi i32 [ %split, %for.k.end_crit_edge ], [ %sum.15, %for.j ] + br label %for.j.inc + +for.j.inc: ; preds = %for.k.end + %inc9 = add nsw i32 %j.06, 1 + %cmp2 = icmp slt i32 %inc9, %M + br i1 %cmp2, label %for.j, label %for.j.end_crit_edge + +for.j.end_crit_edge: ; preds = %for.j.inc + %split7 = phi i32 [ %sum.2.lcssa, %for.j.inc ] + br label %for.j.end + +for.j.end: ; preds = %for.j.end_crit_edge, %for.i + %sum.1.lcssa = phi i32 [ %split7, %for.j.end_crit_edge ], [ %sum.09, %for.i ] + br 
label %for.i.inc + +for.i.inc: ; preds = %for.j.end + %inc12 = add nsw i32 %i.010, 1 + %cmp = icmp slt i32 %inc12, %N + br i1 %cmp, label %for.i, label %for.i.end_crit_edge + +for.i.end_crit_edge: ; preds = %for.i.inc + %split11 = phi i32 [ %sum.1.lcssa, %for.i.inc ] + br label %for.i.end + +for.i.end: ; preds = %for.i.end_crit_edge, %entry + %sum.0.lcssa = phi i32 [ %split11, %for.i.end_crit_edge ], [ 0, %entry ] + ret i32 %sum.0.lcssa +} diff --git a/llvm/test/Analysis/StackSafetyAnalysis/lifetime.ll b/llvm/test/Analysis/StackSafetyAnalysis/lifetime.ll index d29029530b6cf..45c68b5e3d02c 100644 --- a/llvm/test/Analysis/StackSafetyAnalysis/lifetime.ll +++ b/llvm/test/Analysis/StackSafetyAnalysis/lifetime.ll @@ -648,8 +648,8 @@ entry: ; CHECK: entry: ; CHECK-NEXT: Alive: <> %x = alloca i8, align 4 - call void @llvm.lifetime.start.p0i8(i64 4, i8* %x) -; CHECK: call void @llvm.lifetime.start.p0i8(i64 4, i8* %x) + call void @llvm.lifetime.start.p0i8(i64 1, i8* %x) +; CHECK: call void @llvm.lifetime.start.p0i8(i64 1, i8* %x) ; CHECK-NEXT: Alive: br label %l2 @@ -659,8 +659,8 @@ l2: ; preds = %l2, %entry ; MAY-NEXT: Alive: ; MUST-NEXT: Alive: <> call void @capture8(i8* %x) - call void @llvm.lifetime.end.p0i8(i64 4, i8* %x) -; CHECK: call void @llvm.lifetime.end.p0i8(i64 4, i8* %x) + call void @llvm.lifetime.end.p0i8(i64 1, i8* %x) +; CHECK: call void @llvm.lifetime.end.p0i8(i64 1, i8* %x) ; CHECK-NEXT: Alive: <> br label %l2 @@ -673,8 +673,8 @@ entry: ; CHECK-NEXT: Alive: <> %x = alloca i8, align 4 %y = alloca i8, align 4 - call void @llvm.lifetime.start.p0i8(i64 4, i8* %x) -; CHECK: call void @llvm.lifetime.start.p0i8(i64 4, i8* %x) + call void @llvm.lifetime.start.p0i8(i64 1, i8* %x) +; CHECK: call void @llvm.lifetime.start.p0i8(i64 1, i8* %x) ; CHECK-NEXT: Alive: br label %l2 @@ -682,17 +682,17 @@ entry: l2: ; preds = %l2, %entry ; CHECK: l2: ; CHECK-NEXT: Alive: - call void @llvm.lifetime.start.p0i8(i64 4, i8* %y) -; CHECK: call void @llvm.lifetime.start.p0i8(i64 4, 
i8* %y) + call void @llvm.lifetime.start.p0i8(i64 1, i8* %y) +; CHECK: call void @llvm.lifetime.start.p0i8(i64 1, i8* %y) ; CHECK-NEXT: Alive: call void @capture8(i8* %y) - call void @llvm.lifetime.end.p0i8(i64 4, i8* %y) -; CHECK: call void @llvm.lifetime.end.p0i8(i64 4, i8* %y) + call void @llvm.lifetime.end.p0i8(i64 1, i8* %y) +; CHECK: call void @llvm.lifetime.end.p0i8(i64 1, i8* %y) ; CHECK-NEXT: Alive: - call void @llvm.lifetime.start.p0i8(i64 4, i8* %x) -; CHECK: call void @llvm.lifetime.start.p0i8(i64 4, i8* %x) + call void @llvm.lifetime.start.p0i8(i64 1, i8* %x) +; CHECK: call void @llvm.lifetime.start.p0i8(i64 1, i8* %x) ; CHECK-NEXT: Alive: call void @capture8(i8* %x) @@ -758,8 +758,8 @@ entry: if.then: ; CHECK: if.then: ; CHECK-NEXT: Alive: <> - call void @llvm.lifetime.start.p0i8(i64 4, i8* %y) -; CHECK: call void @llvm.lifetime.start.p0i8(i64 4, i8* %y) + call void @llvm.lifetime.start.p0i8(i64 1, i8* %y) +; CHECK: call void @llvm.lifetime.start.p0i8(i64 1, i8* %y) ; CHECK-NEXT: Alive: br label %if.end @@ -769,12 +769,12 @@ if.then: if.else: ; CHECK: if.else: ; CHECK-NEXT: Alive: <> - call void @llvm.lifetime.start.p0i8(i64 4, i8* %y) -; CHECK: call void @llvm.lifetime.start.p0i8(i64 4, i8* %y) + call void @llvm.lifetime.start.p0i8(i64 1, i8* %y) +; CHECK: call void @llvm.lifetime.start.p0i8(i64 1, i8* %y) ; CHECK-NEXT: Alive: - call void @llvm.lifetime.start.p0i8(i64 4, i8* %x) -; CHECK: call void @llvm.lifetime.start.p0i8(i64 4, i8* %x) + call void @llvm.lifetime.start.p0i8(i64 1, i8* %x) +; CHECK: call void @llvm.lifetime.start.p0i8(i64 1, i8* %x) ; CHECK-NEXT: Alive: br label %if.end @@ -797,12 +797,12 @@ entry: %x = alloca i8, align 4 %y = alloca i8, align 4 - call void @llvm.lifetime.start.p0i8(i64 4, i8* %y) -; CHECK: call void @llvm.lifetime.start.p0i8(i64 4, i8* %y) + call void @llvm.lifetime.start.p0i8(i64 1, i8* %y) +; CHECK: call void @llvm.lifetime.start.p0i8(i64 1, i8* %y) ; CHECK-NEXT: Alive: - call void @llvm.lifetime.start.p0i8(i64 
4, i8* %x) -; CHECK: call void @llvm.lifetime.start.p0i8(i64 4, i8* %x) + call void @llvm.lifetime.start.p0i8(i64 1, i8* %x) +; CHECK: call void @llvm.lifetime.start.p0i8(i64 1, i8* %x) ; CHECK-NEXT: Alive: br label %end @@ -880,8 +880,54 @@ entry: ret void } +define void @alloca_offset() { +; CHECK-LABEL: define void @alloca_offset +entry: +; CHECK: entry: +; MAY-NEXT: Alive: +; MUST-NEXT: Alive: <> + %x = alloca [5 x i32], align 4 + %x2 = getelementptr [5 x i32], [5 x i32]* %x, i64 0, i64 1 + + call void @llvm.lifetime.start.p0i32(i64 20, i32* %x2) +; CHECK: call void @llvm.lifetime.start.p0i32(i64 20, i32* %x2) +; MAY-NEXT: Alive: +; MUST-NEXT: Alive: <> + + call void @llvm.lifetime.end.p0i32(i64 20, i32* %x2) +; CHECK: call void @llvm.lifetime.end.p0i32(i64 20, i32* %x2) +; MAY-NEXT: Alive: +; MUST-NEXT: Alive: <> + + ret void +} + +define void @alloca_size() { +; CHECK-LABEL: define void @alloca_size +entry: +; CHECK: entry: +; MAY-NEXT: Alive: +; MUST-NEXT: Alive: <> + %x = alloca [5 x i32], align 4 + %x2 = getelementptr [5 x i32], [5 x i32]* %x, i64 0, i64 0 + + call void @llvm.lifetime.start.p0i32(i64 15, i32* %x2) +; CHECK: call void @llvm.lifetime.start.p0i32(i64 15, i32* %x2) +; MAY-NEXT: Alive: +; MUST-NEXT: Alive: <> + + call void @llvm.lifetime.end.p0i32(i64 15, i32* %x2) +; CHECK: call void @llvm.lifetime.end.p0i32(i64 15, i32* %x2) +; MAY-NEXT: Alive: +; MUST-NEXT: Alive: <> + + ret void +} + declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) +declare void @llvm.lifetime.start.p0i32(i64, i32* nocapture) +declare void @llvm.lifetime.end.p0i32(i64, i32* nocapture) declare void @capture8(i8*) declare void @capture32(i32*) declare void @capture64(i64*) diff --git a/llvm/test/Assembler/2002-04-07-HexFloatConstants.ll b/llvm/test/Assembler/2002-04-07-HexFloatConstants.ll index 90ee85a2a302c..6bd583eb1a60b 100644 --- a/llvm/test/Assembler/2002-04-07-HexFloatConstants.ll +++ 
b/llvm/test/Assembler/2002-04-07-HexFloatConstants.ll @@ -5,8 +5,8 @@ ; of the bug that was causing the Olden Health benchmark to output incorrect ; results! ; -; RUN: opt -constprop -S > %t.1 < %s -; RUN: llvm-as < %s | llvm-dis | llvm-as | opt -constprop | \ +; RUN: opt -instsimplify -S > %t.1 < %s +; RUN: llvm-as < %s | llvm-dis | llvm-as | opt -instsimplify | \ ; RUN: llvm-dis > %t.2 ; RUN: diff %t.1 %t.2 ; RUN: verify-uselistorder %s diff --git a/llvm/test/Bindings/OCaml/scalar_opts.ml b/llvm/test/Bindings/OCaml/scalar_opts.ml index c75e1c8e5549b..b130fe1254614 100644 --- a/llvm/test/Bindings/OCaml/scalar_opts.ml +++ b/llvm/test/Bindings/OCaml/scalar_opts.ml @@ -70,7 +70,6 @@ let test_transforms () = ++ add_scalar_repl_aggregation_with_threshold 4 ++ add_lib_call_simplification ++ add_tail_call_elimination - ++ add_constant_propagation ++ add_memory_to_register_demotion ++ add_verifier ++ add_correlated_value_propagation diff --git a/llvm/test/Bitcode/aarch64-bf16-upgrade.ll b/llvm/test/Bitcode/aarch64-bf16-upgrade.ll new file mode 100644 index 0000000000000..a1ae9f172994d --- /dev/null +++ b/llvm/test/Bitcode/aarch64-bf16-upgrade.ll @@ -0,0 +1,76 @@ +; RUN: llvm-dis < %s.bc | FileCheck %s + +; Bitcode was generated from file below + +define <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) { +; CHECK-LABEL: @test_vbfdot_f32 +entry: + %0 = bitcast <4 x bfloat> %a to <8 x i8> + %1 = bitcast <4 x bfloat> %b to <8 x i8> + ; CHECK: %2 = bitcast <8 x i8> %0 to <4 x bfloat> + ; CHECK-NEXT: %3 = bitcast <8 x i8> %1 to <4 x bfloat> + ; CHECK-NEXT: %vbfdot1.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %2, <4 x bfloat> %3) + %vbfdot1.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1) + ret <2 x float> %vbfdot1.i +} + +define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { +; CHECK-LABEL: @test_vbfdotq_f32 +entry: + %0 
= bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %b to <16 x i8> + ; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat> + ; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat> + ; CHECK-NEXT: %vbfdot1.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3) + %vbfdot1.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) + ret <4 x float> %vbfdot1.i +} + +define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { +; CHECK-LABEL: @test_vbfmmlaq_f32 +entry: + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %b to <16 x i8> + %vbfmmla1.i = call <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) + ; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat> + ; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat> + ; CHECK-NEXT: %vbfmmla1.i = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3) + ret <4 x float> %vbfmmla1.i +} + +define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { +; CHECK-LABEL: @test_vbfmlalbq_laneq_f32 +entry: + %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> + %vbfmlalb1.i = call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) + ; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat> + ; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat> + ; CHECK-NEXT: %vbfmlalb1.i = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3) + ret <4 x float> %vbfmlalb1.i +} + +define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { +; CHECK-LABEL: @test_vbfmlaltq_laneq_f32 +entry: + %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> 
undef, <8 x i32> + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> + %vbfmlalt1.i = call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) + ; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat> + ; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat> + ; CHECK-NEXT: %vbfmlalt1.i = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3) + ret <4 x float> %vbfmlalt1.i +} + +declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>) +; CHECK: declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float>, <4 x bfloat>, <4 x bfloat>) +declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) +; CHECK: declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float>, <8 x bfloat>, <8 x bfloat>) +declare <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) +; CHECK: declare <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>) +declare <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) +; CHECK: declare <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>) +declare <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) +; CHECK: declare <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float>, <8 x bfloat>, <8 x bfloat>) \ No newline at end of file diff --git a/llvm/test/Bitcode/aarch64-bf16-upgrade.ll.bc b/llvm/test/Bitcode/aarch64-bf16-upgrade.ll.bc new file mode 100644 index 0000000000000..9ef6b07c19ff3 Binary files /dev/null and b/llvm/test/Bitcode/aarch64-bf16-upgrade.ll.bc differ diff --git a/llvm/test/Bitcode/arm-bf16-upgrade.ll b/llvm/test/Bitcode/arm-bf16-upgrade.ll new file mode 100644 index 0000000000000..a8ee8e4ac7e50 --- /dev/null +++ b/llvm/test/Bitcode/arm-bf16-upgrade.ll @@ -0,0 +1,76 @@ +; RUN: llvm-dis < %s.bc | 
FileCheck %s + +; Bitcode was generated from file below + +define arm_aapcs_vfpcc <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) { +; CHECK-LABEL: @test_vbfdot_f32 +entry: + %0 = bitcast <4 x bfloat> %a to <8 x i8> + %1 = bitcast <4 x bfloat> %b to <8 x i8> + %vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1) + ; CHECK: %2 = bitcast <8 x i8> %0 to <4 x bfloat> + ; CHECK-NEXT: %3 = bitcast <8 x i8> %1 to <4 x bfloat> + ; CHECK-NEXT: %vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %2, <4 x bfloat> %3) + ret <2 x float> %vbfdot1.i +} + +define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { +; CHECK-LABEL: @test_vbfdotq_f32 +entry: + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %b to <16 x i8> + %vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) + ; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat> + ; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat> + ; CHECK-NEXT: %vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3) + ret <4 x float> %vbfdot1.i +} + +define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { +; CHECK-LABEL: @test_vbfmmlaq_f32 +entry: + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %b to <16 x i8> + %vbfmmla1.i = call <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) + ; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat> + ; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat> + ; CHECK-NEXT: %vbfmmla1.i = call <4 x float> @llvm.arm.neon.bfmmla(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3) + ret <4 x float> %vbfmmla1.i +} + +define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { +; CHECK-LABEL: 
@test_vbfmlalbq_laneq_f32 +entry: + %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> + %vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) + ; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat> + ; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat> + ; CHECK-NEXT: %vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3) + ret <4 x float> %vbfmlalb1.i +} + +define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { +; CHECK-LABEL: @test_vbfmlaltq_laneq_f32 +entry: + %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> + %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) + ; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat> + ; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat> + ; CHECK-NEXT: %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3) + ret <4 x float> %vbfmlalt1.i +} + +declare <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>) +; CHECK: declare <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float>, <4 x bfloat>, <4 x bfloat>) +declare <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) +; CHECK: declare <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float>, <8 x bfloat>, <8 x bfloat>) +declare <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) +; CHECK: declare <4 x float> @llvm.arm.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>) +declare <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) +; CHECK: declare <4 x float> 
@llvm.arm.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>) +declare <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) +; CHECK: declare <4 x float> @llvm.arm.neon.bfmlalt(<4 x float>, <8 x bfloat>, <8 x bfloat>) \ No newline at end of file diff --git a/llvm/test/Bitcode/arm-bf16-upgrade.ll.bc b/llvm/test/Bitcode/arm-bf16-upgrade.ll.bc new file mode 100644 index 0000000000000..d9d97dbc9881d Binary files /dev/null and b/llvm/test/Bitcode/arm-bf16-upgrade.ll.bc differ diff --git a/llvm/test/Bitcode/extractelement.ll b/llvm/test/Bitcode/extractelement.ll index 90a883d6f02d6..10858a6dd30a5 100644 --- a/llvm/test/Bitcode/extractelement.ll +++ b/llvm/test/Bitcode/extractelement.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -constprop | llvm-dis -disable-output +; RUN: opt < %s -instsimplify | llvm-dis -disable-output ; RUN: verify-uselistorder < %s ; PR3465 diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt index 1242a9fe9d059..a6efd05e21f9b 100644 --- a/llvm/test/CMakeLists.txt +++ b/llvm/test/CMakeLists.txt @@ -157,6 +157,7 @@ if(LLVM_BUILD_EXAMPLES) Kaleidoscope-Ch5 Kaleidoscope-Ch6 Kaleidoscope-Ch7 + LLJITWithThinLTOSummaries ) if (NOT WIN32) list(APPEND LLVM_TEST_DEPENDS diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll index a896b05512dd7..946e12c9e5abe 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll @@ -1138,7 +1138,7 @@ define void @test_memcpy(i8* %dst, i8* %src, i64 %size) { ; CHECK: [[DST:%[0-9]+]]:_(p0) = COPY $x0 ; CHECK: [[SRC:%[0-9]+]]:_(p0) = COPY $x1 ; CHECK: [[SIZE:%[0-9]+]]:_(s64) = COPY $x2 -; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), [[DST]](p0), [[SRC]](p0), [[SIZE]](s64), 0 :: (store 1 into %ir.dst), (load 1 from %ir.src) +; CHECK: G_MEMCPY [[DST]](p0), [[SRC]](p0), [[SIZE]](s64), 0 :: (store 1 into %ir.dst), (load 1 from 
%ir.src) call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %size, i1 0) ret void } @@ -1148,7 +1148,7 @@ define void @test_memcpy_tail(i8* %dst, i8* %src, i64 %size) { ; CHECK: [[DST:%[0-9]+]]:_(p0) = COPY $x0 ; CHECK: [[SRC:%[0-9]+]]:_(p0) = COPY $x1 ; CHECK: [[SIZE:%[0-9]+]]:_(s64) = COPY $x2 -; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), [[DST]](p0), [[SRC]](p0), [[SIZE]](s64), 1 :: (store 1 into %ir.dst), (load 1 from %ir.src) +; CHECK: G_MEMCPY [[DST]](p0), [[SRC]](p0), [[SIZE]](s64), 1 :: (store 1 into %ir.dst), (load 1 from %ir.src) tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %size, i1 0) ret void } @@ -1159,7 +1159,7 @@ define void @test_memcpy_nonzero_as(i8 addrspace(1)* %dst, i8 addrspace(1) * %sr ; CHECK: [[DST:%[0-9]+]]:_(p1) = COPY $x0 ; CHECK: [[SRC:%[0-9]+]]:_(p1) = COPY $x1 ; CHECK: [[SIZE:%[0-9]+]]:_(s64) = COPY $x2 -; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), [[DST]](p1), [[SRC]](p1), [[SIZE]](s64), 0 :: (store 1 into %ir.dst, addrspace 1), (load 1 from %ir.src, addrspace 1) +; CHECK: G_MEMCPY [[DST]](p1), [[SRC]](p1), [[SIZE]](s64), 0 :: (store 1 into %ir.dst, addrspace 1), (load 1 from %ir.src, addrspace 1) call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %size, i1 0) ret void } @@ -1170,7 +1170,7 @@ define void @test_memmove(i8* %dst, i8* %src, i64 %size) { ; CHECK: [[DST:%[0-9]+]]:_(p0) = COPY $x0 ; CHECK: [[SRC:%[0-9]+]]:_(p0) = COPY $x1 ; CHECK: [[SIZE:%[0-9]+]]:_(s64) = COPY $x2 -; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memmove), [[DST]](p0), [[SRC]](p0), [[SIZE]](s64), 0 :: (store 1 into %ir.dst), (load 1 from %ir.src) +; CHECK: G_MEMMOVE [[DST]](p0), [[SRC]](p0), [[SIZE]](s64), 0 :: (store 1 into %ir.dst), (load 1 from %ir.src) call void @llvm.memmove.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %size, i1 0) ret void } @@ -1182,7 +1182,7 @@ define void @test_memset(i8* %dst, i8 %val, i64 %size) { ; CHECK: [[SRC_C:%[0-9]+]]:_(s32) = 
COPY $w1 ; CHECK: [[SRC:%[0-9]+]]:_(s8) = G_TRUNC [[SRC_C]] ; CHECK: [[SIZE:%[0-9]+]]:_(s64) = COPY $x2 -; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memset), [[DST]](p0), [[SRC]](s8), [[SIZE]](s64), 0 :: (store 1 into %ir.dst) +; CHECK: G_MEMSET [[DST]](p0), [[SRC]](s8), [[SIZE]](s64), 0 :: (store 1 into %ir.dst) call void @llvm.memset.p0i8.i64(i8* %dst, i8 %val, i64 %size, i1 0) ret void } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir new file mode 100644 index 0000000000000..83e5abdd736dd --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir @@ -0,0 +1,62 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs -mtriple aarch64-unknown-unknown %s -o - | FileCheck %s +# RUN: llc -debugify-and-strip-all-safe -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs -mtriple aarch64-unknown-unknown %s -o - | FileCheck %s +--- +# select (c, x, x) -> x +name: test_combine_select_same_res +body: | + bb.1: + liveins: $x0, $x1 + ; CHECK-LABEL: name: test_combine_select_same_res + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK: $x0 = COPY [[COPY]](s64) + %0:_(s64) = COPY $x0 + %1:_(s1) = G_TRUNC %0 + %2:_(s64) = G_SELECT %1, %0, %0 + $x0 = COPY %2(s64) +... +--- +# select (undef, x, y) -> y +name: test_combine_select_undef_res0_res1 +body: | + bb.1: + liveins: $x0, $x1 + ; CHECK-LABEL: name: test_combine_select_undef_res0_res1 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK: $x0 = COPY [[COPY]](s64) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s1) = G_IMPLICIT_DEF + %3:_(s64) = G_SELECT %2, %0, %1 + $x0 = COPY %3(s64) +... 
+--- +# select (false, x, y) -> y +name: test_combine_select_false_res0_res1 +body: | + bb.1: + liveins: $x0, $x1 + ; CHECK-LABEL: name: test_combine_select_false_res0_res1 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x1 + ; CHECK: $x0 = COPY [[COPY]](s64) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s1) = G_CONSTANT i1 false + %3:_(s64) = G_SELECT %2, %0, %1 + $x0 = COPY %3(s64) +... +--- +# select (true, x, y) -> x +name: test_combine_select_true_res0_res1 +body: | + bb.1: + liveins: $x0, $x1 + ; CHECK-LABEL: name: test_combine_select_true_res0_res1 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK: $x0 = COPY [[COPY]](s64) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s1) = G_CONSTANT i1 true + %3:_(s64) = G_SELECT %2, %0, %1 + $x0 = COPY %3(s64) +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memcpy.mir b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memcpy.mir index ed39bd46de341..0eeed417cb99c 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memcpy.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memcpy.mir @@ -70,12 +70,12 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $x1 ; CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY $x2 - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), [[COPY]](p0), [[COPY1]](p0), [[COPY2]](s64), 1 :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) + ; CHECK: G_MEMCPY [[COPY]](p0), [[COPY1]](p0), [[COPY2]](s64), 1 :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) ; CHECK: RET_ReallyLR %0:_(p0) = COPY $x0 %1:_(p0) = COPY $x1 %2:_(s64) = COPY $x2 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), %0(p0), %1(p0), %2(s64), 1 :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) + G_MEMCPY %0(p0), %1(p0), %2(s64), 1 :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) RET_ReallyLR ... 
@@ -122,7 +122,7 @@ body: | %0:_(p0) = COPY $x0 %1:_(p0) = COPY $x1 %2:_(s64) = G_CONSTANT i64 72 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), %0(p0), %1(p0), %2(s64), 1 :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) + G_MEMCPY %0(p0), %1(p0), %2(s64), 1 :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) RET_ReallyLR ... @@ -169,7 +169,7 @@ body: | %0:_(p0) = COPY $x0 %1:_(p0) = COPY $x1 %2:_(s64) = G_CONSTANT i64 72 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), %0(p0), %1(p0), %2(s64), 1 :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) + G_MEMCPY %0(p0), %1(p0), %2(s64), 1 :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) RET_ReallyLR ... @@ -191,12 +191,12 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $x1 ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 72 - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), [[COPY]](p0), [[COPY1]](p0), [[C]](s64), 1 :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) + ; CHECK: G_MEMCPY [[COPY]](p0), [[COPY1]](p0), [[C]](s64), 1 :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) ; CHECK: RET_ReallyLR %0:_(p0) = COPY $x0 %1:_(p0) = COPY $x1 %2:_(s64) = G_CONSTANT i64 72 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), %0(p0), %1(p0), %2(s64), 1 :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) + G_MEMCPY %0(p0), %1(p0), %2(s64), 1 :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) RET_ReallyLR ... @@ -263,7 +263,7 @@ body: | %0:_(p0) = COPY $x0 %1:_(p0) = COPY $x1 %2:_(s64) = G_CONSTANT i64 143 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), %0(p0), %1(p0), %2(s64), 1 :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) + G_MEMCPY %0(p0), %1(p0), %2(s64), 1 :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) RET_ReallyLR ... 
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memmove.mir b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memmove.mir index 39384188bb074..c4444731fbc7e 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memmove.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memmove.mir @@ -55,12 +55,12 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $x1 ; CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY $x2 - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memmove), [[COPY]](p0), [[COPY1]](p0), [[COPY2]](s64), 1 :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) + ; CHECK: G_MEMMOVE [[COPY]](p0), [[COPY1]](p0), [[COPY2]](s64), 1 :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) ; CHECK: RET_ReallyLR %0:_(p0) = COPY $x0 %1:_(p0) = COPY $x1 %2:_(s64) = COPY $x2 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memmove), %0(p0), %1(p0), %2(s64), 1 :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) + G_MEMMOVE %0(p0), %1(p0), %2(s64), 1 :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) RET_ReallyLR ... @@ -94,7 +94,7 @@ body: | %0:_(p0) = COPY $x0 %1:_(p0) = COPY $x1 %2:_(s64) = G_CONSTANT i64 48 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memmove), %0(p0), %1(p0), %2(s64), 1 :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) + G_MEMMOVE %0(p0), %1(p0), %2(s64), 1 :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) RET_ReallyLR ... 
@@ -111,12 +111,12 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $x1 ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 96 - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memmove), [[COPY]](p0), [[COPY1]](p0), [[C]](s64), 1 :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) + ; CHECK: G_MEMMOVE [[COPY]](p0), [[COPY1]](p0), [[C]](s64), 1 :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) ; CHECK: RET_ReallyLR %0:_(p0) = COPY $x0 %1:_(p0) = COPY $x1 %2:_(s64) = G_CONSTANT i64 96 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memmove), %0(p0), %1(p0), %2(s64), 1 :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) + G_MEMMOVE %0(p0), %1(p0), %2(s64), 1 :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) RET_ReallyLR ... @@ -156,7 +156,7 @@ body: | %0:_(p0) = COPY $x0 %1:_(p0) = COPY $x1 %2:_(s64) = G_CONSTANT i64 52 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memmove), %0(p0), %1(p0), %2(s64), 1 :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) + G_MEMMOVE %0(p0), %1(p0), %2(s64), 1 :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) RET_ReallyLR ... 
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir index a736f57c4383f..cea0af2ff0af5 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir @@ -67,14 +67,14 @@ body: | ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $w2 ; CHECK: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY1]](s32) ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY2]](s32) - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memset), [[COPY]](p0), [[TRUNC]](s8), [[ZEXT]](s64), 1 :: (store 1 into %ir.dst) + ; CHECK: G_MEMSET [[COPY]](p0), [[TRUNC]](s8), [[ZEXT]](s64), 1 :: (store 1 into %ir.dst) ; CHECK: RET_ReallyLR %0:_(p0) = COPY $x0 %1:_(s32) = COPY $w1 %2:_(s32) = COPY $w2 %3:_(s8) = G_TRUNC %1(s32) %4:_(s64) = G_ZEXT %2(s32) - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memset), %0(p0), %3(s8), %4(s64), 1 :: (store 1 into %ir.dst) + G_MEMSET %0(p0), %3(s8), %4(s64), 1 :: (store 1 into %ir.dst) RET_ReallyLR ... @@ -103,7 +103,7 @@ body: | %1:_(s32) = COPY $w1 %3:_(s64) = G_CONSTANT i64 16 %2:_(s8) = G_TRUNC %1(s32) - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memset), %0(p0), %2(s8), %3(s64), 1 :: (store 1 into %ir.dst) + G_MEMSET %0(p0), %2(s8), %3(s64), 1 :: (store 1 into %ir.dst) RET_ReallyLR ... @@ -135,7 +135,7 @@ body: | %1:_(s32) = G_CONSTANT i32 0 %3:_(s64) = G_CONSTANT i64 64 %2:_(s8) = G_TRUNC %1(s32) - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memset), %0(p0), %2(s8), %3(s64), 1 :: (store 1 into %ir.dst) + G_MEMSET %0(p0), %2(s8), %3(s64), 1 :: (store 1 into %ir.dst) RET_ReallyLR ... @@ -160,7 +160,7 @@ body: | %0:_(p0) = COPY $x0 %1:_(s8) = G_CONSTANT i8 64 %2:_(s64) = G_CONSTANT i64 16 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memset), %0(p0), %1(s8), %2(s64), 1 :: (store 1 into %ir.dst) + G_MEMSET %0(p0), %1(s8), %2(s64), 1 :: (store 1 into %ir.dst) RET_ReallyLR ... 
@@ -196,7 +196,7 @@ body: | %1:_(s32) = COPY $w1 %3:_(s64) = G_CONSTANT i64 60 %2:_(s8) = G_TRUNC %1(s32) - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memset), %0(p0), %2(s8), %3(s64), 1 :: (store 1 into %ir.dst) + G_MEMSET %0(p0), %2(s8), %3(s64), 1 :: (store 1 into %ir.dst) RET_ReallyLR ... @@ -224,7 +224,7 @@ body: | %0:_(p0) = COPY $x0 %1:_(s8) = G_CONSTANT i8 64 %2:_(s64) = G_CONSTANT i64 18 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memset), %0(p0), %1(s8), %2(s64), 1 :: (store 1 into %ir.dst) + G_MEMSET %0(p0), %1(s8), %2(s64), 1 :: (store 1 into %ir.dst) RET_ReallyLR ... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/inline-small-memcpy.mir b/llvm/test/CodeGen/AArch64/GlobalISel/inline-small-memcpy.mir index fb4444124ad96..cf74772a125ec 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/inline-small-memcpy.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/inline-small-memcpy.mir @@ -53,7 +53,7 @@ body: | %0:_(p0) = COPY $x0 %1:_(p0) = COPY $x1 %2:_(s64) = G_CONSTANT i64 32 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), %0(p0), %1(p0), %2(s64), 1 :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) + G_MEMCPY %0(p0), %1(p0), %2(s64), 1 :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) RET_ReallyLR ... 
@@ -75,12 +75,12 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $x1 ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), [[COPY]](p0), [[COPY1]](p0), [[C]](s64), 1 :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) + ; CHECK: G_MEMCPY [[COPY]](p0), [[COPY1]](p0), [[C]](s64), 1 :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) ; CHECK: RET_ReallyLR %0:_(p0) = COPY $x0 %1:_(p0) = COPY $x1 %2:_(s64) = G_CONSTANT i64 36 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), %0(p0), %1(p0), %2(s64), 1 :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) + G_MEMCPY %0(p0), %1(p0), %2(s64), 1 :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) RET_ReallyLR ... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-memcpy-et-al.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-memcpy-et-al.mir index 8bc5bcf118367..91f7d019eb241 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-memcpy-et-al.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-memcpy-et-al.mir @@ -25,7 +25,7 @@ body: | %1:_(p0) = COPY $x1 %2:_(s32) = COPY $w2 %3:_(s64) = G_ZEXT %2(s32) - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), %0(p0), %1(p0), %3(s64), 0 + G_MEMCPY %0(p0), %1(p0), %3(s64), 0 :: (store unknown-size), (load unknown-size) RET_ReallyLR ... @@ -50,7 +50,7 @@ body: | %1:_(p0) = COPY $x1 %2:_(s32) = COPY $w2 %3:_(s64) = G_ZEXT %2(s32) - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), %0(p0), %1(p0), %3(s64), 1 + G_MEMCPY %0(p0), %1(p0), %3(s64), 1 :: (store unknown-size), (load unknown-size) RET_ReallyLR ... @@ -78,7 +78,7 @@ body: | %1:_(p0) = COPY $x1 %2:_(s32) = COPY $w2 %3:_(s64) = G_ZEXT %2(s32) - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memmove), %0(p0), %1(p0), %3(s64), 0 + G_MEMMOVE %0(p0), %1(p0), %3(s64), 0 :: (store unknown-size), (load unknown-size) RET_ReallyLR ... 
@@ -108,7 +108,7 @@ body: | %2:_(s32) = COPY $w2 %3:_(s8) = G_TRUNC %1(s32) %4:_(s64) = G_ZEXT %2(s32) - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memset), %0(p0), %3(s8), %4(s64), 0 + G_MEMSET %0(p0), %3(s8), %4(s64), 0 :: (store unknown-size) RET_ReallyLR ... @@ -137,7 +137,7 @@ body: | %1:_(p0) = COPY $x1 %2:_(s32) = COPY $w2 %3:_(s64) = G_ZEXT %2(s32) - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), %0(p0), %1(p0), %3(s64), 1 + G_MEMCPY %0(p0), %1(p0), %3(s64), 1 :: (store unknown-size), (load unknown-size) $x0 = COPY %3 RET_ReallyLR implicit $x0 @@ -166,5 +166,5 @@ body: | %2:_(s32) = COPY $w2 %4:_(s1) = G_CONSTANT i1 false %3:_(s64) = G_ZEXT %2(s32) - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), %0(p0), %1(p0), %3(s64), 1 + G_MEMCPY %0(p0), %1(p0), %3(s64), 1 :: (store unknown-size), (load unknown-size) TCRETURNdi &memset, 0, csr_aarch64_aapcs, implicit $sp diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-memcpy-with-debug-info.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-memcpy-with-debug-info.mir index 204694e849536..26c9f579b8219 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-memcpy-with-debug-info.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-memcpy-with-debug-info.mir @@ -51,7 +51,7 @@ body: | %1:_(p0) = COPY $x1, debug-location !DILocation(line: 3, column: 1, scope: !11) %2:_(s32) = COPY $w2, debug-location !DILocation(line: 4, column: 1, scope: !11) %3:_(s64) = G_ZEXT %2(s32), debug-location !DILocation(line: 5, column: 1, scope: !11) - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), %0(p0), %1(p0), %3(s64), 1, debug-location !DILocation(line: 6, column: 1, scope: !11) + G_MEMCPY %0(p0), %1(p0), %3(s64), 1, debug-location !DILocation(line: 6, column: 1, scope: !11) :: (store unknown-size), (load unknown-size) DBG_VALUE 0, $noreg, !13, !DIExpression(), debug-location !DILocation(line: 6, column: 1, scope: !11) DBG_VALUE 0, $noreg, !13, !DIExpression(), debug-location !DILocation(line: 6, column: 1, 
scope: !11) RET_ReallyLR debug-location !DILocation(line: 7, column: 1, scope: !11) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-memlib-debug-loc.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-memlib-debug-loc.mir index 6a5df883acd09..e26b037cbefe0 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-memlib-debug-loc.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-memlib-debug-loc.mir @@ -54,7 +54,7 @@ body: | %2:_(s32) = COPY $w2 %3:_(s64) = G_ZEXT %2(s32), debug-location !11 %4:_(s8) = G_TRUNC %1(s32), debug-location !11 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memset), %0(p0), %4(s8), %3(s64), 0, debug-location !11 :: (store 1 into %ir.ptr) + G_MEMSET %0(p0), %4(s8), %3(s64), 0, debug-location !11 :: (store 1 into %ir.ptr) RET_ReallyLR debug-location !12 ... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/lit.local.cfg b/llvm/test/CodeGen/AArch64/GlobalISel/lit.local.cfg deleted file mode 100644 index e99d1bb8446ce..0000000000000 --- a/llvm/test/CodeGen/AArch64/GlobalISel/lit.local.cfg +++ /dev/null @@ -1,2 +0,0 @@ -if not 'global-isel' in config.root.available_features: - config.unsupported = True diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-combiner-and-trivial-mask.mir b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-combiner-and-trivial-mask.mir index 61e789d913ab2..88f0f8d8e03d2 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-combiner-and-trivial-mask.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-combiner-and-trivial-mask.mir @@ -4,6 +4,7 @@ # # RUN: llc -mtriple aarch64 -run-pass=aarch64-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s +--- name: remove_and_with_one_bit legalized: true tracksRegLiveness: true diff --git a/llvm/test/CodeGen/AArch64/aarch64-bf16-dotprod-intrinsics.ll b/llvm/test/CodeGen/AArch64/aarch64-bf16-dotprod-intrinsics.ll index 96513115f2d9f..df35a4f382504 100644 --- 
a/llvm/test/CodeGen/AArch64/aarch64-bf16-dotprod-intrinsics.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-bf16-dotprod-intrinsics.ll @@ -7,10 +7,8 @@ define <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat ; CHECK-NEXT: bfdot v0.2s, v1.4h, v2.4h ; CHECK-NEXT: ret entry: - %0 = bitcast <4 x bfloat> %a to <8 x i8> - %1 = bitcast <4 x bfloat> %b to <8 x i8> - %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1) - ret <2 x float> %vbfdot1.i + %vbfdot3.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) + ret <2 x float> %vbfdot3.i } define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { @@ -19,24 +17,22 @@ define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloa ; CHECK-NEXT: bfdot v0.4s, v1.8h, v2.8h ; CHECK-NEXT: ret entry: - %0 = bitcast <8 x bfloat> %a to <16 x i8> - %1 = bitcast <8 x bfloat> %b to <16 x i8> - %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) - ret <4 x float> %vbfdot1.i + %vbfdot3.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) + ret <4 x float> %vbfdot3.i } define <2 x float> @test_vbfdot_lane_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) { ; CHECK-LABEL: test_vbfdot_lane_f32: ; CHECK: // %bb.0: // %entry -; CHECK: bfdot v0.2s, v1.4h, v2.2h[0] +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: bfdot v0.2s, v1.4h, v2.2h[0] ; CHECK-NEXT: ret entry: - %0 = bitcast <4 x bfloat> %b to <2 x float> - %shuffle = shufflevector <2 x float> %0, <2 x float> undef, <2 x i32> zeroinitializer - %1 = bitcast <4 x bfloat> %a to <8 x i8> - %2 = bitcast <2 x float> %shuffle to <8 x i8> - %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2) - ret <2 x float> 
%vbfdot1.i + %.cast = bitcast <4 x bfloat> %b to <2 x float> + %lane = shufflevector <2 x float> %.cast, <2 x float> undef, <2 x i32> zeroinitializer + %.cast1 = bitcast <2 x float> %lane to <4 x bfloat> + %vbfdot3.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %.cast1) + ret <2 x float> %vbfdot3.i } define <4 x float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { @@ -45,12 +41,11 @@ define <4 x float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x ; CHECK-NEXT: bfdot v0.4s, v1.8h, v2.2h[3] ; CHECK-NEXT: ret entry: - %0 = bitcast <8 x bfloat> %b to <4 x float> - %shuffle = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> - %1 = bitcast <8 x bfloat> %a to <16 x i8> - %2 = bitcast <4 x float> %shuffle to <16 x i8> - %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2) - ret <4 x float> %vbfdot1.i + %.cast = bitcast <8 x bfloat> %b to <4 x float> + %lane = shufflevector <4 x float> %.cast, <4 x float> undef, <4 x i32> + %.cast1 = bitcast <4 x float> %lane to <8 x bfloat> + %vbfdot3.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %.cast1) + ret <4 x float> %vbfdot3.i } define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x bfloat> %b) { @@ -59,26 +54,25 @@ define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x ; CHECK-NEXT: bfdot v0.2s, v1.4h, v2.2h[3] ; CHECK-NEXT: ret entry: - %0 = bitcast <8 x bfloat> %b to <4 x float> - %shuffle = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> - %1 = bitcast <4 x bfloat> %a to <8 x i8> - %2 = bitcast <2 x float> %shuffle to <8 x i8> - %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2) - ret <2 x float> %vbfdot1.i + %.cast = bitcast <8 x bfloat> %b to <4 x float> + %lane = 
shufflevector <4 x float> %.cast, <4 x float> undef, <2 x i32> + %.cast1 = bitcast <2 x float> %lane to <4 x bfloat> + %vbfdot3.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %.cast1) + ret <2 x float> %vbfdot3.i } define <4 x float> @test_vbfdotq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) { ; CHECK-LABEL: test_vbfdotq_lane_f32: ; CHECK: // %bb.0: // %entry -; CHECK: bfdot v0.4s, v1.8h, v2.2h[0] +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: bfdot v0.4s, v1.8h, v2.2h[0] ; CHECK-NEXT: ret entry: - %0 = bitcast <4 x bfloat> %b to <2 x float> - %shuffle = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> zeroinitializer - %1 = bitcast <8 x bfloat> %a to <16 x i8> - %2 = bitcast <4 x float> %shuffle to <16 x i8> - %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2) - ret <4 x float> %vbfdot1.i + %.cast = bitcast <4 x bfloat> %b to <2 x float> + %lane = shufflevector <2 x float> %.cast, <2 x float> undef, <4 x i32> zeroinitializer + %.cast1 = bitcast <4 x float> %lane to <8 x bfloat> + %vbfdot3.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %.cast1) + ret <4 x float> %vbfdot3.i } define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { @@ -87,10 +81,8 @@ define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bflo ; CHECK-NEXT: bfmmla v0.4s, v1.8h, v2.8h ; CHECK-NEXT: ret entry: - %0 = bitcast <8 x bfloat> %a to <16 x i8> - %1 = bitcast <8 x bfloat> %b to <16 x i8> - %vbfmmla1.i = tail call <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) - ret <4 x float> %vbfmmla1.i + %vbfmmlaq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) + ret <4 x float> %vbfmmlaq_v3.i } define <4 x float> @test_vbfmlalbq_f32(<4 
x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { @@ -99,10 +91,8 @@ define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfl ; CHECK-NEXT: bfmlalb v0.4s, v1.8h, v2.8h ; CHECK-NEXT: ret entry: - %0 = bitcast <8 x bfloat> %a to <16 x i8> - %1 = bitcast <8 x bfloat> %b to <16 x i8> - %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) - ret <4 x float> %vbfmlalb1.i + %vbfmlalbq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) + ret <4 x float> %vbfmlalbq_v3.i } define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { @@ -111,23 +101,20 @@ define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfl ; CHECK-NEXT: bfmlalt v0.4s, v1.8h, v2.8h ; CHECK-NEXT: ret entry: - %0 = bitcast <8 x bfloat> %a to <16 x i8> - %1 = bitcast <8 x bfloat> %b to <16 x i8> - %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) - ret <4 x float> %vbfmlalt1.i + %vbfmlaltq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) + ret <4 x float> %vbfmlaltq_v3.i } define <4 x float> @test_vbfmlalbq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) { ; CHECK-LABEL: test_vbfmlalbq_lane_f32: ; CHECK: // %bb.0: // %entry -; CHECK: bfmlalb v0.4s, v1.8h, v2.h[0] +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: bfmlalb v0.4s, v1.8h, v2.h[0] ; CHECK-NEXT: ret entry: %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer - %0 = bitcast <8 x bfloat> %a to <16 x i8> - %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> - %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) - ret <4 x float> %vbfmlalb1.i + %vbfmlalbq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> 
%r, <8 x bfloat> %a, <8 x bfloat> %vecinit35) + ret <4 x float> %vbfmlalbq_v3.i } define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { @@ -137,23 +124,20 @@ define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 ; CHECK-NEXT: ret entry: %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> - %0 = bitcast <8 x bfloat> %a to <16 x i8> - %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> - %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) - ret <4 x float> %vbfmlalb1.i + %vbfmlalbq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35) + ret <4 x float> %vbfmlalbq_v3.i } define <4 x float> @test_vbfmlaltq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) { ; CHECK-LABEL: test_vbfmlaltq_lane_f32: ; CHECK: // %bb.0: // %entry -; CHECK: bfmlalt v0.4s, v1.8h, v2.h[0] +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: bfmlalt v0.4s, v1.8h, v2.h[0] ; CHECK-NEXT: ret entry: %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer - %0 = bitcast <8 x bfloat> %a to <16 x i8> - %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> - %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) - ret <4 x float> %vbfmlalt1.i + %vbfmlaltq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35) + ret <4 x float> %vbfmlaltq_v3.i } define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { @@ -163,14 +147,12 @@ define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 ; CHECK-NEXT: ret entry: %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> - %0 = bitcast <8 x bfloat> %a to <16 x i8> - %1 = bitcast <8 x bfloat> %vecinit35 to <16 
x i8> - %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) - ret <4 x float> %vbfmlalt1.i + %vbfmlaltq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35) + ret <4 x float> %vbfmlaltq_v3.i } -declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>) #2 -declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2 -declare <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2 -declare <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2 -declare <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2 +declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float>, <4 x bfloat>, <4 x bfloat>) +declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float>, <8 x bfloat>, <8 x bfloat>) +declare <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>) +declare <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>) +declare <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float>, <8 x bfloat>, <8 x bfloat>) diff --git a/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll b/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll index b0ed3d0490cc0..e95321582def9 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll @@ -214,11 +214,11 @@ entry: ret void } -; FIXME - The SU(4) and SU(7) can be clustered even with +; Verify that the SU(4) and SU(7) can be clustered even with ; different preds ; CHECK: ********** MI Scheduling ********** ; CHECK-LABEL: cluster_with_different_preds:%bb.0 -; CHECK-NOT:Cluster ld/st SU(4) - SU(7) +; CHECK:Cluster ld/st SU(4) - SU(7) ; CHECK:SU(3): STRWui %2:gpr32, %0:gpr64common, 0 :: ; CHECK:SU(4): %3:gpr32 = LDRWui %1:gpr64common, 0 :: ; 
CHECK:Predecessors: diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll b/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll index eee0d77d98eba..b38b91e9d705e 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll @@ -1966,7 +1966,7 @@ define <4 x i16> @test_vadd_laneq5_i16_bitcast(<4 x i16> %a, <2 x double> %v) { define <4 x i16> @test_vadd_lane2_i16_bitcast_bigger_aligned(<4 x i16> %a, <16 x i8> %v) { ; CHECK-LABEL: test_vadd_lane2_i16_bitcast_bigger_aligned: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.8b, v1.8b, v0.8b, #2 +; CHECK-NEXT: dup v1.4h, v1.h[2] ; CHECK-NEXT: dup v1.4h, v1.h[1] ; CHECK-NEXT: add v0.4h, v1.4h, v0.4h ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/arm64-vext_reverse.ll b/llvm/test/CodeGen/AArch64/arm64-vext_reverse.ll index c45e55edeca52..c51ea172232a7 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vext_reverse.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vext_reverse.ll @@ -14,7 +14,7 @@ entry: define <4 x i16> @vext_6701_12(<4 x i16> %a1, <4 x i16> %a2) { entry: ; CHECK-LABEL: vext_6701_12: -; CHECK: ext v0.8b, v0.8b, v0.8b, #4 +; CHECK: dup v0.2s, v0.s[0] %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> ret <4 x i16> %x } @@ -54,7 +54,7 @@ entry: define <4 x i16> @vext_6701_34(<4 x i16> %a1, <4 x i16> %a2) { entry: ; CHECK-LABEL: vext_6701_34: -; CHECK: ext v0.8b, v1.8b, v0.8b, #4 +; CHECK: dup v0.2s, v1.s[1] %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> ret <4 x i16> %x } diff --git a/llvm/test/CodeGen/AArch64/framelayout-fp-csr.ll b/llvm/test/CodeGen/AArch64/framelayout-fp-csr.ll new file mode 100644 index 0000000000000..3b13dee29f069 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/framelayout-fp-csr.ll @@ -0,0 +1,22 @@ +; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -disable-post-ra --frame-pointer=all < %s | FileCheck %s + +; The purpose of this test is to verify that frame pointer (x29) +; is correctly setup in the presence of 
callee-saved floating +; point registers. The frame pointer should point to the frame +; record, which is located 16 bytes above the end of the CSR +; space when a single FP CSR is in use. +define void @test1(i32) #26 { +entry: + call void asm sideeffect "nop", "~{d8}"() #26 + ret void +} +; CHECK-LABEL: test1: +; CHECK: str d8, [sp, #-32]! +; CHECK-NEXT: stp x29, x30, [sp, #16] +; CHECK-NEXT: add x29, sp, #16 +; CHECK: nop +; CHECK: ldp x29, x30, [sp, #16] +; CHECK-NEXT: ldr d8, [sp], #32 +; CHECK-NEXT: ret + +attributes #26 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/framelayout-frame-record.mir b/llvm/test/CodeGen/AArch64/framelayout-frame-record.mir new file mode 100644 index 0000000000000..ab4af04401c5e --- /dev/null +++ b/llvm/test/CodeGen/AArch64/framelayout-frame-record.mir @@ -0,0 +1,29 @@ +# RUN: llc -mtriple=aarch64-linux-gnu -start-before prologepilog %s -o - | FileCheck %s + +--- +name: TestFrameRecordLocation +tracksRegLiveness: true +frameInfo: + isFrameAddressTaken: true +body: | + bb.0: + $d8 = IMPLICIT_DEF + $d9 = IMPLICIT_DEF + $x19 = IMPLICIT_DEF + RET_ReallyLR + +# CHECK-LABEL: TestFrameRecordLocation + +# CHECK: stp d9, d8, [sp, #-48]! +# CHECK: stp x29, x30, [sp, #16] +# CHECK: str x19, [sp, #32] + +# CHECK: add x29, sp, #16 + +# CHECK: .cfi_def_cfa w29, 32 +# CHECK: .cfi_offset w19, -16 +# CHECK: .cfi_offset w30, -24 +# CHECK: .cfi_offset w29, -32 +# CHECK: .cfi_offset b8, -40 +# CHECK: .cfi_offset b9, -48 +... 
diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir b/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir new file mode 100644 index 0000000000000..a3cbd39c6531f --- /dev/null +++ b/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir @@ -0,0 +1,36 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +# RUN: llc -mattr=+sve -mtriple=aarch64-none-linux-gnu -start-before=prologepilog %s -o - | FileCheck %s + +--- | + define aarch64_sve_vector_pcs void @fix_restorepoint_p4() { entry: unreachable } + ; CHECK-LABEL: fix_restorepoint_p4: + ; CHECK: // %bb.0: // %entry + ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill + ; CHECK-NEXT: addvl sp, sp, #-2 + ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill + ; CHECK-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill + ; CHECK-NEXT: addvl sp, sp, #-1 + ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG + ; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG + ; CHECK-NEXT: .cfi_offset w29, -16 + ; CHECK-NEXT: // implicit-def: $z8 + ; CHECK-NEXT: // implicit-def: $p4 + ; CHECK-NEXT: addvl sp, sp, #1 + ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload + ; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload + ; CHECK-NEXT: addvl sp, sp, #2 + ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload + ; CHECK-NEXT: ret +... 
+name: fix_restorepoint_p4 +stack: + - { id: 0, stack-id: sve-vec, size: 16, alignment: 16 } +body: | + bb.0.entry: + $z8 = IMPLICIT_DEF + $p4 = IMPLICIT_DEF + B %bb.1 + + bb.1.entry: + RET_ReallyLR +--- diff --git a/llvm/test/CodeGen/AArch64/framelayout-unaligned-fp.ll b/llvm/test/CodeGen/AArch64/framelayout-unaligned-fp.ll new file mode 100644 index 0000000000000..160eb2dd099bb --- /dev/null +++ b/llvm/test/CodeGen/AArch64/framelayout-unaligned-fp.ll @@ -0,0 +1,42 @@ +; RUN: llc -verify-machineinstrs < %s | FileCheck %s + +; The purpose of this test is to construct a scenario where an odd number +; of callee-saved GPRs as well as an odd number of callee-saved FPRs are +; used. This caused the frame pointer to be aligned to a multiple of 8 +; on non-Darwin platforms, rather than a multiple of 16 as usual. + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +@a = global i64 0, align 4 + + +define i64 @b() { +entry: + %call = tail call i64 @d() + %0 = alloca i8, i64 ptrtoint (i64 ()* @d to i64), align 16 + %1 = ptrtoint i8* %0 to i64 + store i64 %1, i64* @a, align 4 + %call1 = call i64 @e() + %conv = sitofp i64 %call1 to float + %2 = load i64, i64* @a, align 4 + %call2 = call i64 @f(i64 %2) + %conv3 = fptosi float %conv to i64 + ret i64 %conv3 +} + +; CHECK-LABEL: b: +; CHECK: str d8, [sp, #-32]! 
+; CHECK-NEXT: stp x29, x30, [sp, #8] +; CHECK-NEXT: str x19, [sp, #24] +; CHECK-NEXT: add x29, sp, #8 + +; CHECK: sub sp, x29, #8 +; CHECK-NEXT: ldr x19, [sp, #24] +; CHECK-NEXT: ldp x29, x30, [sp, #8] +; CHECK-NEXT: ldr d8, [sp], #32 +; CHECK-NEXT: ret + +declare i64 @d() +declare i64 @e() +declare i64 @f(i64) diff --git a/llvm/test/CodeGen/AArch64/merge-trunc-store.ll b/llvm/test/CodeGen/AArch64/merge-trunc-store.ll index 9d9ea3ec8951e..3f8fa3e9e3837 100644 --- a/llvm/test/CodeGen/AArch64/merge-trunc-store.ll +++ b/llvm/test/CodeGen/AArch64/merge-trunc-store.ll @@ -200,12 +200,17 @@ define void @be_i32_to_i8_order(i32 %x, i8* %p0) { } define void @le_i32_to_i16(i32 %x, i16* %p0) { -; CHECK-LABEL: le_i32_to_i16: -; CHECK: // %bb.0: -; CHECK-NEXT: lsr w8, w0, #16 -; CHECK-NEXT: strh w0, [x1] -; CHECK-NEXT: strh w8, [x1, #2] -; CHECK-NEXT: ret +; LE-LABEL: le_i32_to_i16: +; LE: // %bb.0: +; LE-NEXT: str w0, [x1] +; LE-NEXT: ret +; +; BE-LABEL: le_i32_to_i16: +; BE: // %bb.0: +; BE-NEXT: lsr w8, w0, #16 +; BE-NEXT: strh w0, [x1] +; BE-NEXT: strh w8, [x1, #2] +; BE-NEXT: ret %sh1 = lshr i32 %x, 16 %t0 = trunc i32 %x to i16 %t1 = trunc i32 %sh1 to i16 @@ -216,12 +221,17 @@ define void @le_i32_to_i16(i32 %x, i16* %p0) { } define void @le_i32_to_i16_order(i32 %x, i16* %p0) { -; CHECK-LABEL: le_i32_to_i16_order: -; CHECK: // %bb.0: -; CHECK-NEXT: lsr w8, w0, #16 -; CHECK-NEXT: strh w8, [x1, #2] -; CHECK-NEXT: strh w0, [x1] -; CHECK-NEXT: ret +; LE-LABEL: le_i32_to_i16_order: +; LE: // %bb.0: +; LE-NEXT: str w0, [x1] +; LE-NEXT: ret +; +; BE-LABEL: le_i32_to_i16_order: +; BE: // %bb.0: +; BE-NEXT: lsr w8, w0, #16 +; BE-NEXT: strh w8, [x1, #2] +; BE-NEXT: strh w0, [x1] +; BE-NEXT: ret %sh1 = lshr i32 %x, 16 %t0 = trunc i32 %x to i16 %t1 = trunc i32 %sh1 to i16 @@ -232,12 +242,17 @@ define void @le_i32_to_i16_order(i32 %x, i16* %p0) { } define void @be_i32_to_i16(i32 %x, i16* %p0) { -; CHECK-LABEL: be_i32_to_i16: -; CHECK: // %bb.0: -; CHECK-NEXT: lsr w8, w0, #16 -; 
CHECK-NEXT: strh w0, [x1, #2] -; CHECK-NEXT: strh w8, [x1] -; CHECK-NEXT: ret +; LE-LABEL: be_i32_to_i16: +; LE: // %bb.0: +; LE-NEXT: lsr w8, w0, #16 +; LE-NEXT: strh w0, [x1, #2] +; LE-NEXT: strh w8, [x1] +; LE-NEXT: ret +; +; BE-LABEL: be_i32_to_i16: +; BE: // %bb.0: +; BE-NEXT: str w0, [x1] +; BE-NEXT: ret %sh1 = lshr i32 %x, 16 %t0 = trunc i32 %x to i16 %t1 = trunc i32 %sh1 to i16 @@ -248,12 +263,17 @@ define void @be_i32_to_i16(i32 %x, i16* %p0) { } define void @be_i32_to_i16_order(i32 %x, i16* %p0) { -; CHECK-LABEL: be_i32_to_i16_order: -; CHECK: // %bb.0: -; CHECK-NEXT: lsr w8, w0, #16 -; CHECK-NEXT: strh w8, [x1] -; CHECK-NEXT: strh w0, [x1, #2] -; CHECK-NEXT: ret +; LE-LABEL: be_i32_to_i16_order: +; LE: // %bb.0: +; LE-NEXT: lsr w8, w0, #16 +; LE-NEXT: strh w8, [x1] +; LE-NEXT: strh w0, [x1, #2] +; LE-NEXT: ret +; +; BE-LABEL: be_i32_to_i16_order: +; BE: // %bb.0: +; BE-NEXT: str w0, [x1] +; BE-NEXT: ret %sh1 = lshr i32 %x, 16 %t0 = trunc i32 %x to i16 %t1 = trunc i32 %sh1 to i16 @@ -440,16 +460,21 @@ define void @be_i64_to_i8_order(i64 %x, i8* %p0) { } define void @le_i64_to_i16(i64 %x, i16* %p0) { -; CHECK-LABEL: le_i64_to_i16: -; CHECK: // %bb.0: -; CHECK-NEXT: lsr x8, x0, #16 -; CHECK-NEXT: lsr x9, x0, #32 -; CHECK-NEXT: lsr x10, x0, #48 -; CHECK-NEXT: strh w0, [x1] -; CHECK-NEXT: strh w8, [x1, #2] -; CHECK-NEXT: strh w9, [x1, #4] -; CHECK-NEXT: strh w10, [x1, #6] -; CHECK-NEXT: ret +; LE-LABEL: le_i64_to_i16: +; LE: // %bb.0: +; LE-NEXT: str x0, [x1] +; LE-NEXT: ret +; +; BE-LABEL: le_i64_to_i16: +; BE: // %bb.0: +; BE-NEXT: lsr x8, x0, #16 +; BE-NEXT: lsr x9, x0, #32 +; BE-NEXT: lsr x10, x0, #48 +; BE-NEXT: strh w0, [x1] +; BE-NEXT: strh w8, [x1, #2] +; BE-NEXT: strh w9, [x1, #4] +; BE-NEXT: strh w10, [x1, #6] +; BE-NEXT: ret %sh1 = lshr i64 %x, 16 %sh2 = lshr i64 %x, 32 %sh3 = lshr i64 %x, 48 @@ -468,16 +493,21 @@ define void @le_i64_to_i16(i64 %x, i16* %p0) { } define void @le_i64_to_i16_order(i64 %x, i16* %p0) { -; CHECK-LABEL: 
le_i64_to_i16_order: -; CHECK: // %bb.0: -; CHECK-NEXT: lsr x8, x0, #16 -; CHECK-NEXT: lsr x9, x0, #32 -; CHECK-NEXT: lsr x10, x0, #48 -; CHECK-NEXT: strh w0, [x1] -; CHECK-NEXT: strh w8, [x1, #2] -; CHECK-NEXT: strh w10, [x1, #6] -; CHECK-NEXT: strh w9, [x1, #4] -; CHECK-NEXT: ret +; LE-LABEL: le_i64_to_i16_order: +; LE: // %bb.0: +; LE-NEXT: str x0, [x1] +; LE-NEXT: ret +; +; BE-LABEL: le_i64_to_i16_order: +; BE: // %bb.0: +; BE-NEXT: lsr x8, x0, #16 +; BE-NEXT: lsr x9, x0, #32 +; BE-NEXT: lsr x10, x0, #48 +; BE-NEXT: strh w0, [x1] +; BE-NEXT: strh w8, [x1, #2] +; BE-NEXT: strh w10, [x1, #6] +; BE-NEXT: strh w9, [x1, #4] +; BE-NEXT: ret %sh1 = lshr i64 %x, 16 %sh2 = lshr i64 %x, 32 %sh3 = lshr i64 %x, 48 @@ -496,16 +526,21 @@ define void @le_i64_to_i16_order(i64 %x, i16* %p0) { } define void @be_i64_to_i16(i64 %x, i16* %p0) { -; CHECK-LABEL: be_i64_to_i16: -; CHECK: // %bb.0: -; CHECK-NEXT: lsr x8, x0, #16 -; CHECK-NEXT: lsr x9, x0, #32 -; CHECK-NEXT: lsr x10, x0, #48 -; CHECK-NEXT: strh w0, [x1, #6] -; CHECK-NEXT: strh w8, [x1, #4] -; CHECK-NEXT: strh w9, [x1, #2] -; CHECK-NEXT: strh w10, [x1] -; CHECK-NEXT: ret +; LE-LABEL: be_i64_to_i16: +; LE: // %bb.0: +; LE-NEXT: lsr x8, x0, #16 +; LE-NEXT: lsr x9, x0, #32 +; LE-NEXT: lsr x10, x0, #48 +; LE-NEXT: strh w0, [x1, #6] +; LE-NEXT: strh w8, [x1, #4] +; LE-NEXT: strh w9, [x1, #2] +; LE-NEXT: strh w10, [x1] +; LE-NEXT: ret +; +; BE-LABEL: be_i64_to_i16: +; BE: // %bb.0: +; BE-NEXT: str x0, [x1] +; BE-NEXT: ret %sh1 = lshr i64 %x, 16 %sh2 = lshr i64 %x, 32 %sh3 = lshr i64 %x, 48 @@ -524,16 +559,21 @@ define void @be_i64_to_i16(i64 %x, i16* %p0) { } define void @be_i64_to_i16_order(i64 %x, i16* %p0) { -; CHECK-LABEL: be_i64_to_i16_order: -; CHECK: // %bb.0: -; CHECK-NEXT: lsr x8, x0, #16 -; CHECK-NEXT: lsr x9, x0, #32 -; CHECK-NEXT: lsr x10, x0, #48 -; CHECK-NEXT: strh w0, [x1, #6] -; CHECK-NEXT: strh w10, [x1] -; CHECK-NEXT: strh w9, [x1, #2] -; CHECK-NEXT: strh w8, [x1, #4] -; CHECK-NEXT: ret +; LE-LABEL: 
be_i64_to_i16_order: +; LE: // %bb.0: +; LE-NEXT: lsr x8, x0, #16 +; LE-NEXT: lsr x9, x0, #32 +; LE-NEXT: lsr x10, x0, #48 +; LE-NEXT: strh w0, [x1, #6] +; LE-NEXT: strh w10, [x1] +; LE-NEXT: strh w9, [x1, #2] +; LE-NEXT: strh w8, [x1, #4] +; LE-NEXT: ret +; +; BE-LABEL: be_i64_to_i16_order: +; BE: // %bb.0: +; BE-NEXT: str x0, [x1] +; BE-NEXT: ret %sh1 = lshr i64 %x, 16 %sh2 = lshr i64 %x, 32 %sh3 = lshr i64 %x, 48 @@ -552,11 +592,16 @@ define void @be_i64_to_i16_order(i64 %x, i16* %p0) { } define void @le_i64_to_i32(i64 %x, i32* %p0) { -; CHECK-LABEL: le_i64_to_i32: -; CHECK: // %bb.0: -; CHECK-NEXT: lsr x8, x0, #32 -; CHECK-NEXT: stp w0, w8, [x1] -; CHECK-NEXT: ret +; LE-LABEL: le_i64_to_i32: +; LE: // %bb.0: +; LE-NEXT: str x0, [x1] +; LE-NEXT: ret +; +; BE-LABEL: le_i64_to_i32: +; BE: // %bb.0: +; BE-NEXT: lsr x8, x0, #32 +; BE-NEXT: stp w0, w8, [x1] +; BE-NEXT: ret %sh1 = lshr i64 %x, 32 %t0 = trunc i64 %x to i32 %t1 = trunc i64 %sh1 to i32 @@ -567,11 +612,16 @@ define void @le_i64_to_i32(i64 %x, i32* %p0) { } define void @le_i64_to_i32_order(i64 %x, i32* %p0) { -; CHECK-LABEL: le_i64_to_i32_order: -; CHECK: // %bb.0: -; CHECK-NEXT: lsr x8, x0, #32 -; CHECK-NEXT: stp w0, w8, [x1] -; CHECK-NEXT: ret +; LE-LABEL: le_i64_to_i32_order: +; LE: // %bb.0: +; LE-NEXT: str x0, [x1] +; LE-NEXT: ret +; +; BE-LABEL: le_i64_to_i32_order: +; BE: // %bb.0: +; BE-NEXT: lsr x8, x0, #32 +; BE-NEXT: stp w0, w8, [x1] +; BE-NEXT: ret %sh1 = lshr i64 %x, 32 %t0 = trunc i64 %x to i32 %t1 = trunc i64 %sh1 to i32 @@ -582,11 +632,16 @@ define void @le_i64_to_i32_order(i64 %x, i32* %p0) { } define void @be_i64_to_i32(i64 %x, i32* %p0) { -; CHECK-LABEL: be_i64_to_i32: -; CHECK: // %bb.0: -; CHECK-NEXT: lsr x8, x0, #32 -; CHECK-NEXT: stp w8, w0, [x1] -; CHECK-NEXT: ret +; LE-LABEL: be_i64_to_i32: +; LE: // %bb.0: +; LE-NEXT: lsr x8, x0, #32 +; LE-NEXT: stp w8, w0, [x1] +; LE-NEXT: ret +; +; BE-LABEL: be_i64_to_i32: +; BE: // %bb.0: +; BE-NEXT: str x0, [x1] +; BE-NEXT: ret %sh1 = lshr i64 
%x, 32 %t0 = trunc i64 %x to i32 %t1 = trunc i64 %sh1 to i32 @@ -597,11 +652,16 @@ define void @be_i64_to_i32(i64 %x, i32* %p0) { } define void @be_i64_to_i32_order(i64 %x, i32* %p0) { -; CHECK-LABEL: be_i64_to_i32_order: -; CHECK: // %bb.0: -; CHECK-NEXT: lsr x8, x0, #32 -; CHECK-NEXT: stp w8, w0, [x1] -; CHECK-NEXT: ret +; LE-LABEL: be_i64_to_i32_order: +; LE: // %bb.0: +; LE-NEXT: lsr x8, x0, #32 +; LE-NEXT: stp w8, w0, [x1] +; LE-NEXT: ret +; +; BE-LABEL: be_i64_to_i32_order: +; BE: // %bb.0: +; BE-NEXT: str x0, [x1] +; BE-NEXT: ret %sh1 = lshr i64 %x, 32 %t0 = trunc i64 %x to i32 %t1 = trunc i64 %sh1 to i32 @@ -611,6 +671,8 @@ define void @be_i64_to_i32_order(i64 %x, i32* %p0) { ret void } +; Negative test - not consecutive addresses + define void @i64_to_i32_wrong_addr(i64 %x, i32* %p0) { ; CHECK-LABEL: i64_to_i32_wrong_addr: ; CHECK: // %bb.0: @@ -627,6 +689,8 @@ define void @i64_to_i32_wrong_addr(i64 %x, i32* %p0) { ret void } +; Negative test - addresses don't line up with shift amounts + define void @i64_to_i16_wrong_order(i64 %x, i16* %p0) { ; CHECK-LABEL: i64_to_i16_wrong_order: ; CHECK: // %bb.0: @@ -655,6 +719,8 @@ define void @i64_to_i16_wrong_order(i64 %x, i16* %p0) { ret void } +; Negative test - no store of 't1' + define void @i32_to_i8_incomplete(i32 %x, i8* %p0) { ; CHECK-LABEL: i32_to_i8_incomplete: ; CHECK: // %bb.0: @@ -680,6 +746,8 @@ define void @i32_to_i8_incomplete(i32 %x, i8* %p0) { ret void } +; Negative test - no store of 't3' + define void @i64_to_i8_incomplete(i64 %x, i8* %p0) { ; CHECK-LABEL: i64_to_i8_incomplete: ; CHECK: // %bb.0: @@ -729,6 +797,8 @@ define void @i64_to_i8_incomplete(i64 %x, i8* %p0) { ret void } +; Negative test - not consecutive addresses + define void @i32_to_i16_wrong_addr(i32 %x, i16* %p0) { ; CHECK-LABEL: i32_to_i16_wrong_addr: ; CHECK: // %bb.0: @@ -745,6 +815,8 @@ define void @i32_to_i16_wrong_addr(i32 %x, i16* %p0) { ret void } +; Negative test - addresses don't line up with shift amounts + define void 
@i32_to_i8_wrong_order(i32 %x, i8* %p0) { ; CHECK-LABEL: i32_to_i8_wrong_order: ; CHECK: // %bb.0: diff --git a/llvm/test/CodeGen/AArch64/neon-extract.ll b/llvm/test/CodeGen/AArch64/neon-extract.ll index c159da1e9d18a..0cac894246422 100644 --- a/llvm/test/CodeGen/AArch64/neon-extract.ll +++ b/llvm/test/CodeGen/AArch64/neon-extract.ll @@ -209,7 +209,7 @@ entry: define <4 x i16> @test_undef_vext_s16(<4 x i16> %a) { ; CHECK-LABEL: test_undef_vext_s16: -; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x4|4}} +; CHECK: dup v{{[0-9]+}}.2s, {{v[0-9]+}}.s[1] entry: %vext = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> ret <4 x i16> %vext diff --git a/llvm/test/CodeGen/AArch64/neon-wide-splat.ll b/llvm/test/CodeGen/AArch64/neon-wide-splat.ll new file mode 100644 index 0000000000000..6290f85dc1cec --- /dev/null +++ b/llvm/test/CodeGen/AArch64/neon-wide-splat.ll @@ -0,0 +1,122 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +define <4 x i16> @shuffle1(<4 x i16> %v) { +; CHECK-LABEL: shuffle1: +; CHECK: dup v0.2s, v0.s[0] +; CHECK-NEXT: ret +entry: + %res = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> + ret <4 x i16> %res +} + +define <4 x i16> @shuffle2(<4 x i16> %v) { +; CHECK-LABEL: shuffle2: +; CHECK: dup v0.2s, v0.s[1] +; CHECK-NEXT: ret +entry: + %res = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> + ret <4 x i16> %res +} + +define <8 x i16> @shuffle3(<8 x i16> %v) { +; CHECK-LABEL: shuffle3: +; CHECK: dup v0.2d, v0.d[0] +; CHECK-NEXT: ret +entry: + %res = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> + ret <8 x i16> %res +} + +define <4 x i32> @shuffle4(<4 x i32> %v) { +; CHECK-LABEL: shuffle4: +; CHECK: dup v0.2d, v0.d[0] +; CHECK-NEXT: ret +entry: + %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> + ret <4 x i32> %res +} + +define <16 x i8> @shuffle5(<16 x i8> %v) { +; CHECK-LABEL: shuffle5: +; CHECK: dup v0.4s, v0.s[2] +; CHECK-NEXT: ret +entry: + 
%res = shufflevector <16 x i8> %v, <16 x i8> undef, <16 x i32> + ret <16 x i8> %res +} + +define <16 x i8> @shuffle6(<16 x i8> %v) { +; CHECK-LABEL: shuffle6: +; CHECK: dup v0.2d, v0.d[1] +; CHECK-NEXT: ret +entry: + %res = shufflevector <16 x i8> %v, <16 x i8> undef, <16 x i32> + ret <16 x i8> %res +} + +define <8 x i8> @shuffle7(<8 x i8> %v) { +; CHECK-LABEL: shuffle7: +; CHECK: dup v0.2s, v0.s[1] +; CHECK-NEXT: ret +entry: + %res = shufflevector <8 x i8> %v, <8 x i8> undef, <8 x i32> + ret <8 x i8> %res +} + +define <8 x i8> @shuffle8(<8 x i8> %v) { +; CHECK-LABEL: shuffle8: +; CHECK: dup v0.4h, v0.h[3] +; CHECK-NEXT: ret +entry: + %res = shufflevector <8 x i8> %v, <8 x i8> undef, <8 x i32> + ret <8 x i8> %res +} + +; No blocks +define <8 x i8> @shuffle_not1(<16 x i8> %v) { +; CHECK-LABEL: shuffle_not1: +; CHECK: ext v0.16b, v0.16b, v0.16b, #2 + %res = shufflevector <16 x i8> %v, <16 x i8> undef, <8 x i32> + ret <8 x i8> %res +} + +; Block is not a proper lane +define <4 x i32> @shuffle_not2(<4 x i32> %v) { +; CHECK-LABEL: shuffle_not2: +; CHECK-NOT: dup +; CHECK: ext +; CHECK: ret +entry: + %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> + ret <4 x i32> %res +} + +; Block size is equal to vector size +define <4 x i16> @shuffle_not3(<4 x i16> %v) { +; CHECK-LABEL: shuffle_not3: +; CHECK-NOT: dup +; CHECK: ret +entry: + %res = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> + ret <4 x i16> %res +} + +; Blocks mismatch +define <8 x i8> @shuffle_not4(<8 x i8> %v) { +; CHECK-LABEL: shuffle_not4: +; CHECK-NOT: dup +; CHECK: ret +entry: + %res = shufflevector <8 x i8> %v, <8 x i8> undef, <8 x i32> + ret <8 x i8> %res +} diff --git a/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll b/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll index be13a2ec84cbe..3a2a340f4553d 100644 --- a/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll +++ b/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll @@ -448,8 +448,7 @@ define <4 x double> @d4rsqrt(<4 x double> %a) #0 { define double 
@sqrt_fdiv_common_operand(double %x) nounwind { ; FAULT-LABEL: sqrt_fdiv_common_operand: ; FAULT: // %bb.0: -; FAULT-NEXT: fsqrt d1, d0 -; FAULT-NEXT: fdiv d0, d0, d1 +; FAULT-NEXT: fsqrt d0, d0 ; FAULT-NEXT: ret ; ; CHECK-LABEL: sqrt_fdiv_common_operand: @@ -474,8 +473,7 @@ define double @sqrt_fdiv_common_operand(double %x) nounwind { define <2 x double> @sqrt_fdiv_common_operand_vec(<2 x double> %x) nounwind { ; FAULT-LABEL: sqrt_fdiv_common_operand_vec: ; FAULT: // %bb.0: -; FAULT-NEXT: fsqrt v1.2d, v0.2d -; FAULT-NEXT: fdiv v0.2d, v0.2d, v1.2d +; FAULT-NEXT: fsqrt v0.2d, v0.2d ; FAULT-NEXT: ret ; ; CHECK-LABEL: sqrt_fdiv_common_operand_vec: @@ -493,16 +491,15 @@ define <2 x double> @sqrt_fdiv_common_operand_vec(<2 x double> %x) nounwind { ; CHECK-NEXT: fmul v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %sqrt = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %x) - %r = fdiv arcp reassoc <2 x double> %x, %sqrt + %r = fdiv arcp nsz reassoc <2 x double> %x, %sqrt ret <2 x double> %r } define double @sqrt_fdiv_common_operand_extra_use(double %x, double* %p) nounwind { ; FAULT-LABEL: sqrt_fdiv_common_operand_extra_use: ; FAULT: // %bb.0: -; FAULT-NEXT: fsqrt d1, d0 -; FAULT-NEXT: fdiv d0, d0, d1 -; FAULT-NEXT: str d1, [x0] +; FAULT-NEXT: fsqrt d0, d0 +; FAULT-NEXT: str d0, [x0] ; FAULT-NEXT: ret ; ; CHECK-LABEL: sqrt_fdiv_common_operand_extra_use: diff --git a/llvm/test/CodeGen/AArch64/stack-guard-reassign.ll b/llvm/test/CodeGen/AArch64/stack-guard-reassign.ll index 6b00eda6233a2..71d3d27a1b649 100644 --- a/llvm/test/CodeGen/AArch64/stack-guard-reassign.ll +++ b/llvm/test/CodeGen/AArch64/stack-guard-reassign.ll @@ -3,6 +3,6 @@ ; Verify that the offset assigned to the stack protector is at the top of the ; frame, covering the locals. 
; CHECK-LABEL: fn: -; CHECK: adrp x8, __stack_chk_guard -; CHECK-NEXT: ldr x8, [x8, :lo12:__stack_chk_guard] -; CHECK-NEXT: stur x8, [x29, #-8] +; CHECK: adrp [[REG:x[0-9]+]], __stack_chk_guard +; CHECK-NEXT: ldr [[REG]], {{\[}}[[REG]], :lo12:__stack_chk_guard] +; CHECK-NEXT: stur [[REG]], [x29, #-8] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll index cdeac8008133e..3e009391c3ae3 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll @@ -20,6 +20,48 @@ target triple = "aarch64-unknown-linux-gnu" ; Don't use SVE when its registers are no bigger than NEON. ; NO_SVE-NOT: z{0-9} +; +; sext i1 -> i32 +; + +; NOTE: Covers the scenario where a SIGN_EXTEND_INREG is required, whose inreg +; type's element type is not byte based and thus cannot be lowered directly to +; an SVE instruction. +define void @sext_v8i1_v8i32(<8 x i1> %a, <8 x i32>* %out) #0 { +; CHECK-LABEL: sext_v8i1_v8i32: +; CHECK: ptrue [[PG:p[0-9]+]].s, vl8 +; CHECK-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, z0.b +; CHECK-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h +; CHECK-NEXT: lsl [[A_WORDS]].s, [[PG]]/m, [[A_WORDS]].s, #31 +; CHECK-NEXT: asr [[A_WORDS]].s, [[PG]]/m, [[A_WORDS]].s, #31 +; CHECK-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x0] +; CHECK-NEXT: ret + %b = sext <8 x i1> %a to <8 x i32> + store <8 x i32> %b, <8 x i32>* %out + ret void +} + +; +; sext i3 -> i64 +; + +; NOTE: Covers the scenario where a SIGN_EXTEND_INREG is required, whose inreg +; type's element type is not power-of-2 based and thus cannot be lowered +; directly to an SVE instruction. 
+define void @sext_v4i3_v4i64(<4 x i3> %a, <4 x i64>* %out) #0 { +; CHECK-LABEL: sext_v4i3_v4i64: +; CHECK: ptrue [[PG:p[0-9]+]].d, vl4 +; CHECK-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, z0.h +; CHECK-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s +; CHECK-NEXT: lsl [[A_DWORDS]].d, [[PG]]/m, [[A_DWORDS]].d, #61 +; CHECK-NEXT: asr [[A_DWORDS]].d, [[PG]]/m, [[A_DWORDS]].d, #61 +; CHECK-NEXT: st1d { [[A_WORDS]].d }, [[PG]], [x0] +; CHECK-NEXT: ret + %b = sext <4 x i3> %a to <4 x i64> + store <4 x i64> %b, <4 x i64>* %out + ret void +} + ; ; sext i8 -> i16 ; diff --git a/llvm/test/CodeGen/AArch64/sve-fp.ll b/llvm/test/CodeGen/AArch64/sve-fp.ll index 891a5c144234d..2afecdfc826d4 100644 --- a/llvm/test/CodeGen/AArch64/sve-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-fp.ll @@ -323,6 +323,78 @@ define @fma_nxv2f64_3( %a, %r } +define @fneg_nxv8f16( %a) { +; CHECK-LABEL: fneg_nxv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: fneg z0.h, p0/m, z0.h +; CHECK-NEXT: ret + %minus.one = insertelement undef, half -1.0, i64 0 + %minus.one.vec = shufflevector %minus.one, undef, zeroinitializer + %neg = fmul %a, %minus.one.vec + ret %neg +} + +define @fneg_nxv4f16( %a) { +; CHECK-LABEL: fneg_nxv4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fneg z0.h, p0/m, z0.h +; CHECK-NEXT: ret + %minus.one = insertelement undef, half -1.0, i64 0 + %minus.one.vec = shufflevector %minus.one, undef, zeroinitializer + %neg = fmul %a, %minus.one.vec + ret %neg +} + +define @fneg_nxv2f16( %a) { +; CHECK-LABEL: fneg_nxv2f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fneg z0.h, p0/m, z0.h +; CHECK-NEXT: ret + %minus.one = insertelement undef, half -1.0, i64 0 + %minus.one.vec = shufflevector %minus.one, undef, zeroinitializer + %neg = fmul %a, %minus.one.vec + ret %neg +} + +define @fneg_nxv4f32( %a) { +; CHECK-LABEL: fneg_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fneg z0.s, p0/m, z0.s +; CHECK-NEXT: ret + %minus.one = 
insertelement undef, float -1.0, i64 0 + %minus.one.vec = shufflevector %minus.one, undef, zeroinitializer + %neg = fmul %a, %minus.one.vec + ret %neg +} + +define @fneg_nxv2f32( %a) { +; CHECK-LABEL: fneg_nxv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fneg z0.s, p0/m, z0.s +; CHECK-NEXT: ret + %minus.one = insertelement undef, float -1.0, i64 0 + %minus.one.vec = shufflevector %minus.one, undef, zeroinitializer + %neg = fmul %a, %minus.one.vec + ret %neg +} + +define @fneg_nxv2f64( %a) { +; CHECK-LABEL: fneg_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fneg z0.d, p0/m, z0.d +; CHECK-NEXT: ret + %minus.one = insertelement undef, double -1.0, i64 0 + %minus.one.vec = shufflevector %minus.one, undef, zeroinitializer + %neg = fmul %a, %minus.one.vec + ret %neg +} + define @frecps_h( %a, %b) { ; CHECK-LABEL: frecps_h: ; CHECK: // %bb.0: @@ -408,6 +480,62 @@ define void @float_copy(* %P1, * %P2) { ret void } +; FCEIL + +define @frintp_nxv8f16( %a) { +; CHECK-LABEL: frintp_nxv8f16: +; CHECK: ptrue p0.h +; CHECK-NEXT: frintp z0.h, p0/m, z0.h +; CHECK-NEXT: ret + %res = call @llvm.ceil.nxv8f16( %a) + ret %res +} + +define @frintp_nxv4f16( %a) { +; CHECK-LABEL: frintp_nxv4f16: +; CHECK: ptrue p0.s +; CHECK-NEXT: frintp z0.h, p0/m, z0.h +; CHECK-NEXT: ret + %res = call @llvm.ceil.nxv4f16( %a) + ret %res +} + +define @frintp_nxv2f16( %a) { +; CHECK-LABEL: frintp_nxv2f16: +; CHECK: ptrue p0.d +; CHECK-NEXT: frintp z0.h, p0/m, z0.h +; CHECK-NEXT: ret + %res = call @llvm.ceil.nxv2f16( %a) + ret %res +} + +define @frintp_nxv4f32( %a) { +; CHECK-LABEL: frintp_nxv4f32: +; CHECK: ptrue p0.s +; CHECK-NEXT: frintp z0.s, p0/m, z0.s +; CHECK-NEXT: ret + %res = call @llvm.ceil.nxv4f32( %a) + ret %res +} + +define @frintp_nxv2f32( %a) { +; CHECK-LABEL: frintp_nxv2f32: +; CHECK: ptrue p0.d +; CHECK-NEXT: frintp z0.s, p0/m, z0.s +; CHECK-NEXT: ret + %res = call @llvm.ceil.nxv2f32( %a) + ret %res +} + +define @frintp_nxv2f64( %a) { +; 
CHECK-LABEL: frintp_nxv2f64: +; CHECK: ptrue p0.d +; CHECK-NEXT: frintp z0.d, p0/m, z0.d +; CHECK-NEXT: ret + %res = call @llvm.ceil.nxv2f64( %a) + ret %res +} + declare @llvm.aarch64.sve.frecps.x.nxv8f16(, ) declare @llvm.aarch64.sve.frecps.x.nxv4f32( , ) declare @llvm.aarch64.sve.frecps.x.nxv2f64(, ) @@ -423,5 +551,12 @@ declare @llvm.fma.nxv8f16(, @llvm.fma.nxv4f16(, , ) declare @llvm.fma.nxv2f16(, , ) +declare @llvm.ceil.nxv8f16( ) +declare @llvm.ceil.nxv4f16( ) +declare @llvm.ceil.nxv2f16( ) +declare @llvm.ceil.nxv4f32() +declare @llvm.ceil.nxv2f32() +declare @llvm.ceil.nxv2f64() + ; Function Attrs: nounwind readnone declare double @llvm.aarch64.sve.faddv.nxv2f64(, ) #2 diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll index c669a55519d86..4d888317b343e 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll @@ -77,7 +77,7 @@ define float @test_v16f32(<16 x float> %a) nounwind { ; CHECK-NEXT: fmaxnm v1.4s, v1.4s, v3.4s ; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v2.4s ; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: dup v1.2d, v0.d[1] ; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v1.4s ; CHECK-NEXT: dup v1.4s, v0.s[1] ; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v1.4s diff --git a/llvm/test/CodeGen/AArch64/wineh5.mir b/llvm/test/CodeGen/AArch64/wineh5.mir index 1280997fbb3c4..6fe6b66fc2e54 100644 --- a/llvm/test/CodeGen/AArch64/wineh5.mir +++ b/llvm/test/CodeGen/AArch64/wineh5.mir @@ -15,7 +15,7 @@ # CHECK-NEXT: 0xe3 ; nop # CHECK-NEXT: 0xe3 ; nop # CHECK-NEXT: 0x42 ; stp x29, x30, [sp, #16] -# CHECK-NEXT: 0xd53f ; str x28, [sp, #256]! +# CHECK-NEXT: 0xd53f ; str x28, [sp, #-256]! 
# CHECK-NEXT: 0xe4 ; end # CHECK-NEXT: ] # CHECK-NEXT: EpilogueScopes [ diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll index 2205bfe3c71d7..721269072fa84 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll @@ -159,11 +159,10 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) { ; ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_splat: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_mov_b32 s3, 0xffff +; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_mov_b32 s1, 0xffc0 ; GFX8-NEXT: s_and_b32 s0, s0, s3 -; GFX8-NEXT: s_and_b32 s2, s2, s3 ; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_add_i32 s2, s2, s1 ; GFX8-NEXT: s_lshl_b32 s1, s2, 16 @@ -186,10 +185,9 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) { ; ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_lo: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: s_mov_b32 s2, 0xffff +; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s1, s1, s2 ; GFX8-NEXT: s_add_i32 s0, s0, 0xffc0 ; GFX8-NEXT: s_add_i32 s1, s1, 4 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 @@ -212,10 +210,9 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) { ; ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_hi: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: s_mov_b32 s2, 0xffff +; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s1, s1, s2 ; GFX8-NEXT: s_add_i32 s0, s0, 4 ; GFX8-NEXT: s_add_i32 s1, s1, 0xffc0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 @@ -239,13 +236,11 @@ define amdgpu_ps i32 @s_add_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) { ; ; GFX8-LABEL: s_add_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_mov_b32 s3, 0xffff +; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_lshr_b32 s4, s1, 16 ; GFX8-NEXT: 
s_and_b32 s0, s0, s3 ; GFX8-NEXT: s_and_b32 s1, s1, s3 -; GFX8-NEXT: s_and_b32 s2, s2, s3 -; GFX8-NEXT: s_and_b32 s4, s4, s3 ; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_add_i32 s2, s2, s4 ; GFX8-NEXT: s_lshl_b32 s1, s2, 16 @@ -271,13 +266,11 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs(<2 x half> inreg %a, <2 x i16> inreg ; GFX8-LABEL: s_add_v2i16_fneg_lhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_xor_b32 s0, s0, 0x80008000 -; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_mov_b32 s3, 0xffff +; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_lshr_b32 s4, s1, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s3 ; GFX8-NEXT: s_and_b32 s1, s1, s3 -; GFX8-NEXT: s_and_b32 s2, s2, s3 -; GFX8-NEXT: s_and_b32 s4, s4, s3 ; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_add_i32 s2, s2, s4 ; GFX8-NEXT: s_lshl_b32 s1, s2, 16 @@ -305,13 +298,11 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_rhs(<2 x i16> inreg %a, <2 x half> inreg ; GFX8-LABEL: s_add_v2i16_fneg_rhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_xor_b32 s1, s1, 0x80008000 -; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_mov_b32 s3, 0xffff +; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_lshr_b32 s4, s1, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s3 ; GFX8-NEXT: s_and_b32 s1, s1, s3 -; GFX8-NEXT: s_and_b32 s2, s2, s3 -; GFX8-NEXT: s_and_b32 s4, s4, s3 ; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_add_i32 s2, s2, s4 ; GFX8-NEXT: s_lshl_b32 s1, s2, 16 @@ -343,13 +334,11 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x ha ; GFX8-NEXT: s_mov_b32 s2, 0x80008000 ; GFX8-NEXT: s_xor_b32 s1, s1, s2 ; GFX8-NEXT: s_xor_b32 s0, s0, s2 -; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_mov_b32 s3, 0xffff +; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_lshr_b32 s4, s1, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s3 ; GFX8-NEXT: s_and_b32 s1, s1, s3 -; GFX8-NEXT: s_and_b32 s2, s2, s3 -; GFX8-NEXT: s_and_b32 s4, s4, s3 ; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_add_i32 s2, s2, s4 ; GFX8-NEXT: s_lshl_b32 s1, s2, 16 diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll index 4fb1b2b8e725e..c01dd73e1e3c3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s +; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s define amdgpu_ps i32 @s_andn2_i32(i32 inreg %src0, i32 inreg %src1) { ; GCN-LABEL: s_andn2_i32: @@ -196,58 +196,31 @@ define amdgpu_ps <2 x i32> @s_andn2_v2i32_commute(<2 x i32> inreg %src0, <2 x i3 } define amdgpu_ps i16 @s_andn2_i16(i16 inreg %src0, i16 inreg %src1) { -; GFX6-LABEL: s_andn2_i16: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_andn2_b32 s0, s2, s3 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_andn2_i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NEXT: s_and_b32 s1, s3, s0 -; GFX9-NEXT: s_xor_b32 s0, s1, s0 -; GFX9-NEXT: s_and_b32 s0, s2, s0 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX9-NEXT: ; return to shader part epilog +; GCN-LABEL: s_andn2_i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_andn2_b32 s0, s2, s3 +; GCN-NEXT: ; return to shader part epilog %not.src1 = xor i16 %src1, -1 %and = and i16 %src0, %not.src1 ret i16 %and } define amdgpu_ps i16 @s_andn2_i16_commute(i16 inreg %src0, i16 inreg %src1) { -; GFX6-LABEL: s_andn2_i16_commute: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_andn2_b32 s0, s2, s3 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_andn2_i16_commute: -; GFX9: ; 
%bb.0: -; GFX9-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NEXT: s_and_b32 s1, s3, s0 -; GFX9-NEXT: s_xor_b32 s0, s1, s0 -; GFX9-NEXT: s_and_b32 s0, s0, s2 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX9-NEXT: ; return to shader part epilog +; GCN-LABEL: s_andn2_i16_commute: +; GCN: ; %bb.0: +; GCN-NEXT: s_andn2_b32 s0, s2, s3 +; GCN-NEXT: ; return to shader part epilog %not.src1 = xor i16 %src1, -1 %and = and i16 %not.src1, %src0 ret i16 %and } define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_use(i16 inreg %src0, i16 inreg %src1) { -; GFX6-LABEL: s_andn2_i16_multi_use: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_xor_b32 s1, s3, -1 -; GFX6-NEXT: s_andn2_b32 s0, s2, s3 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_andn2_i16_multi_use: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NEXT: s_and_b32 s1, s3, s0 -; GFX9-NEXT: s_xor_b32 s1, s1, s0 -; GFX9-NEXT: s_and_b32 s0, s2, s1 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX9-NEXT: ; return to shader part epilog +; GCN-LABEL: s_andn2_i16_multi_use: +; GCN: ; %bb.0: +; GCN-NEXT: s_xor_b32 s1, s3, -1 +; GCN-NEXT: s_andn2_b32 s0, s2, s3 +; GCN-NEXT: ; return to shader part epilog %not.src1 = xor i16 %src1, -1 %and = and i16 %src0, %not.src1 %insert.0 = insertvalue { i16, i16 } undef, i16 %and, 0 @@ -256,23 +229,11 @@ define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_use(i16 inreg %src0, i16 inreg } define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_foldable_use(i16 inreg %src0, i16 inreg %src1, i16 inreg %src2) { -; GFX6-LABEL: s_andn2_i16_multi_foldable_use: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_andn2_b32 s0, s2, s4 -; GFX6-NEXT: s_andn2_b32 s1, s3, s4 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_andn2_i16_multi_foldable_use: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s1, 0xffff -; GFX9-NEXT: s_and_b32 s0, s4, s1 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s2, s2, s1 -; GFX9-NEXT: s_and_b32 s4, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s3, s1 -; GFX9-NEXT: s_and_b32 s0, s2, s4 -; 
GFX9-NEXT: s_and_b32 s1, s1, s4 -; GFX9-NEXT: ; return to shader part epilog +; GCN-LABEL: s_andn2_i16_multi_foldable_use: +; GCN: ; %bb.0: +; GCN-NEXT: s_andn2_b32 s0, s2, s4 +; GCN-NEXT: s_andn2_b32 s1, s3, s4 +; GCN-NEXT: ; return to shader part epilog %not.src2 = xor i16 %src2, -1 %and0 = and i16 %src0, %not.src2 %and1 = and i16 %src1, %not.src2 @@ -308,21 +269,12 @@ define amdgpu_ps float @v_andn2_i16_sv(i16 inreg %src0, i16 %src1) { } define amdgpu_ps float @v_andn2_i16_vs(i16 %src0, i16 inreg %src1) { -; GFX6-LABEL: v_andn2_i16_vs: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_xor_b32 s0, s2, -1 -; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: v_andn2_i16_vs: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NEXT: s_and_b32 s1, s2, s0 -; GFX9-NEXT: s_xor_b32 s0, s1, s0 -; GFX9-NEXT: v_and_b32_e32 v0, s0, v0 -; GFX9-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX9-NEXT: ; return to shader part epilog +; GCN-LABEL: v_andn2_i16_vs: +; GCN: ; %bb.0: +; GCN-NEXT: s_xor_b32 s0, s2, -1 +; GCN-NEXT: v_and_b32_e32 v0, s0, v0 +; GCN-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GCN-NEXT: ; return to shader part epilog %not.src1 = xor i16 %src1, -1 %and = and i16 %src0, %not.src1 %zext = zext i16 %and to i32 @@ -346,8 +298,7 @@ define amdgpu_ps i32 @s_andn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1 ; ; GFX9-LABEL: s_andn2_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_xor_b32 s0, s3, -1 -; GFX9-NEXT: s_and_b32 s0, s2, s0 +; GFX9-NEXT: s_andn2_b32 s0, s2, s3 ; GFX9-NEXT: ; return to shader part epilog %not.src1 = xor <2 x i16> %src1, %and = and <2 x i16> %src0, %not.src1 @@ -371,8 +322,7 @@ define amdgpu_ps i32 @s_andn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inr ; ; GFX9-LABEL: s_andn2_v2i16_commute: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_xor_b32 s0, s3, -1 -; GFX9-NEXT: s_and_b32 s0, s0, s2 +; GFX9-NEXT: s_andn2_b32 s0, s2, s3 ; GFX9-NEXT: ; return to shader part epilog %not.src1 = xor <2 x i16> 
%src1, %and = and <2 x i16> %not.src1, %src0 @@ -397,7 +347,7 @@ define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_use(<2 x i16> inreg %src0, <2 ; GFX9-LABEL: s_andn2_v2i16_multi_use: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_xor_b32 s1, s3, -1 -; GFX9-NEXT: s_and_b32 s0, s2, s1 +; GFX9-NEXT: s_andn2_b32 s0, s2, s3 ; GFX9-NEXT: ; return to shader part epilog %not.src1 = xor <2 x i16> %src1, %and = and <2 x i16> %src0, %not.src1 @@ -429,9 +379,8 @@ define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_foldable_use(<2 x i16> inreg ; ; GFX9-LABEL: s_andn2_v2i16_multi_foldable_use: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_xor_b32 s1, s4, -1 -; GFX9-NEXT: s_and_b32 s0, s2, s1 -; GFX9-NEXT: s_and_b32 s1, s3, s1 +; GFX9-NEXT: s_andn2_b32 s0, s2, s4 +; GFX9-NEXT: s_andn2_b32 s1, s3, s4 ; GFX9-NEXT: ; return to shader part epilog %not.src2 = xor <2 x i16> %src2, %and0 = and <2 x i16> %src0, %not.src2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-unmerge-values.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-unmerge-values.mir index 137c785231b8e..6313d5820a420 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-unmerge-values.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-unmerge-values.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -o - %s | FileCheck %s +# RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck %s --- name: test_unmerge_values_s1_trunc_v2s1_of_build_vector_v2s32 @@ -583,13 +583,59 @@ body: | ; CHECK: [[COPY2:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr4_vgpr5 ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[COPY]](<4 x s16>), [[COPY1]](<4 x s16>), [[COPY2]](<4 x s16>) ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES 
[[CONCAT_VECTORS]](<12 x s16>) - ; CHECK: S_ENDPGM 0, implicit [[UV]](<3 x s16>), implicit [[UV1]](<3 x s16>), implicit [[UV2]](<3 x s16>), implicit [[UV3]](<3 x s16>) + ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; CHECK: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32) + ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV1]](<3 x s16>), 0 + ; CHECK: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; CHECK: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; CHECK: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32) + ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; CHECK: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES 
[[INSERT2]](<4 x s16>) + ; CHECK: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) + ; CHECK: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; CHECK: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) + ; CHECK: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; CHECK: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; CHECK: [[COPY11:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; CHECK: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; CHECK: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV3]](<3 x s16>), 0 + ; CHECK: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT3]](<4 x s16>) + ; CHECK: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) + ; CHECK: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; CHECK: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) + ; CHECK: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; CHECK: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; CHECK: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; CHECK: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; CHECK: [[BUILD_VECTOR3:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32) + ; CHECK: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<3 x s32>), implicit [[BUILD_VECTOR1]](<3 x s32>), implicit [[BUILD_VECTOR2]](<3 x s32>), implicit [[BUILD_VECTOR3]](<3 x s32>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 %2:_(<4 x s16>) = COPY $vgpr4_vgpr5 %3:_(<12 x s16>) = G_CONCAT_VECTORS %0, %1, %2 %4:_(<3 x s16>), %5:_(<3 x s16>), %6:_(<3 x s16>), %7:_(<3 x s16>) = G_UNMERGE_VALUES %3 - S_ENDPGM 0, implicit %4, implicit %5, implicit %6, implicit %7 + %8:_(<3 x s32>) = G_ANYEXT %4 + %9:_(<3 x s32>) = G_ANYEXT %5 + %10:_(<3 x s32>) = G_ANYEXT %6 + %11:_(<3 x s32>) = G_ANYEXT %7 
+ S_ENDPGM 0, implicit %8, implicit %9, implicit %10, implicit %11 ... --- @@ -1080,13 +1126,15 @@ body: | ; CHECK-LABEL: name: test_unmerge_values_v2s8_v4s8_trunc_v4s32 ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK: [[UV:%[0-9]+]]:_(<2 x s32>), [[UV1:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) - ; CHECK: [[TRUNC:%[0-9]+]]:_(<2 x s8>) = G_TRUNC [[UV]](<2 x s32>) - ; CHECK: [[TRUNC1:%[0-9]+]]:_(<2 x s8>) = G_TRUNC [[UV1]](<2 x s32>) - ; CHECK: S_ENDPGM 0, implicit [[TRUNC]](<2 x s8>), implicit [[TRUNC1]](<2 x s8>) + ; CHECK: [[TRUNC:%[0-9]+]]:_(<2 x s16>) = G_TRUNC [[UV]](<2 x s32>) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_TRUNC [[UV1]](<2 x s32>) + ; CHECK: S_ENDPGM 0, implicit [[TRUNC]](<2 x s16>), implicit [[TRUNC1]](<2 x s16>) %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:_(<4 x s8>) = G_TRUNC %0 %2:_(<2 x s8>), %3:_(<2 x s8>) = G_UNMERGE_VALUES %1 - S_ENDPGM 0, implicit %2, implicit %3 + %4:_(<2 x s16>) = G_ANYEXT %2 + %5:_(<2 x s16>) = G_ANYEXT %3 + S_ENDPGM 0, implicit %4, implicit %5 ... 
@@ -1231,13 +1279,15 @@ body: | ; CHECK-LABEL: name: test_unmerge_values_v2s8_v4s8_trunc_v4s16 ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 ; CHECK: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) - ; CHECK: [[TRUNC:%[0-9]+]]:_(<2 x s8>) = G_TRUNC [[UV]](<2 x s16>) - ; CHECK: [[TRUNC1:%[0-9]+]]:_(<2 x s8>) = G_TRUNC [[UV1]](<2 x s16>) - ; CHECK: S_ENDPGM 0, implicit [[TRUNC]](<2 x s8>), implicit [[TRUNC1]](<2 x s8>) + ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY [[UV]](<2 x s16>) + ; CHECK: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY [[UV1]](<2 x s16>) + ; CHECK: S_ENDPGM 0, implicit [[COPY1]](<2 x s16>), implicit [[COPY2]](<2 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s8>) = G_TRUNC %0 %2:_(<2 x s8>), %3:_(<2 x s8>) = G_UNMERGE_VALUES %1 - S_ENDPGM 0, implicit %2, implicit %3 + %4:_(<2 x s16>) = G_ANYEXT %2 + %5:_(<2 x s16>) = G_ANYEXT %3 + S_ENDPGM 0, implicit %4, implicit %5 ... @@ -1342,3 +1392,80 @@ body: | S_ENDPGM 0, implicit %2, implicit %3 ... 
+ +--- +name: test_unmerge_values_s16_from_v3s16_from_v6s16 +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2 + + ; CHECK-LABEL: name: test_unmerge_values_s16_from_v3s16_from_v6s16 + ; CHECK: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; CHECK: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; CHECK: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV1]](<3 x s16>), 0 + ; CHECK: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; CHECK: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) + ; CHECK: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; CHECK: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; CHECK: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) + ; CHECK: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; CHECK: S_ENDPGM 0, implicit [[TRUNC]](s16), implicit [[TRUNC1]](s16), implicit [[TRUNC2]](s16), implicit [[TRUNC3]](s16), implicit [[TRUNC4]](s16), implicit [[TRUNC5]](s16) + %0:_(<6 x s16>) = COPY 
$vgpr0_vgpr1_vgpr2 + %1:_(<3 x s16>), %2:_(<3 x s16>) = G_UNMERGE_VALUES %0 + %3:_(s16), %4:_(s16), %5:_(s16) = G_UNMERGE_VALUES %1 + %6:_(s16), %7:_(s16), %8:_(s16) = G_UNMERGE_VALUES %2 + S_ENDPGM 0, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7, implicit %8 + +... + +--- +name: test_unmerge_values_s16_from_v3s16_from_v6s16_other_def_use +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2 + + ; CHECK-LABEL: name: test_unmerge_values_s16_from_v3s16_from_v6s16_other_def_use + ; CHECK: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; CHECK: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; CHECK: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV1]](<3 x s16>), 0 + ; CHECK: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; CHECK: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; CHECK: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; CHECK: 
[[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: S_ENDPGM 0, implicit [[TRUNC]](s16), implicit [[TRUNC1]](s16), implicit [[TRUNC2]](s16), implicit [[BUILD_VECTOR]](<3 x s32>) + %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + %1:_(<3 x s16>), %2:_(<3 x s16>) = G_UNMERGE_VALUES %0 + %3:_(s16), %4:_(s16), %5:_(s16) = G_UNMERGE_VALUES %1 + %6:_(<3 x s32>) = G_ANYEXT %2 + S_ENDPGM 0, implicit %3, implicit %4, implicit %5, implicit %6 + +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll index c1896f81ef296..12f4d441f7cc3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -47,8 +47,8 @@ define i8 @v_ashr_i8_7(i8 %value) { ; GFX9-LABEL: v_ashr_i8_7: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 7 -; GFX9-NEXT: v_ashrrev_i16_sdwa v0, s4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_mov_b32_e32 v1, 7 +; GFX9-NEXT: v_ashrrev_i16_sdwa v0, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = ashr i8 %value, 7 ret i8 %result diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll index a6c2524f1962b..e3775b62a4d2a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll @@ -514,7 +514,6 @@ define i64 @v_bswap_i48(i64 %src) { ; GFX7-NEXT: v_alignbit_b32 v0, v0, v0, 24 ; GFX7-NEXT: v_bfi_b32 v2, s4, v0, v2 ; GFX7-NEXT: v_lshr_b64 v[0:1], v[1:2], 16 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_bswap_i48: @@ -524,7 +523,6 @@ define i64 @v_bswap_i48(i64 %src) { ; GFX8-NEXT: 
v_perm_b32 v1, 0, v1, s4 ; GFX8-NEXT: v_perm_b32 v2, 0, v0, s4 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[1:2] -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_bswap_i48: @@ -534,7 +532,6 @@ define i64 @v_bswap_i48(i64 %src) { ; GFX9-NEXT: v_perm_b32 v1, 0, v1, s4 ; GFX9-NEXT: v_perm_b32 v2, 0, v0, s4 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[1:2] -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %trunc = trunc i64 %src to i48 %bswap = call i48 @llvm.bswap.i48(i48 %trunc) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-add-to-ptradd.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-add-to-ptradd.mir new file mode 100644 index 0000000000000..1508d03a1da26 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-add-to-ptradd.mir @@ -0,0 +1,180 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s + +--- +name: add_ptrtoint_p1_to_s64_lhs +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + + ; GCN-LABEL: name: add_ptrtoint_p1_to_s64_lhs + ; GCN: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GCN: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; GCN: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[COPY1]](s64) + ; GCN: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[PTR_ADD]](p1) + ; GCN: $vgpr0_vgpr1 = COPY [[PTRTOINT]](s64) + %0:_(p1) = COPY $vgpr0_vgpr1 + %1:_(s64) = COPY $vgpr2_vgpr3 + %2:_(s64) = G_PTRTOINT %0 + %3:_(s64) = G_ADD %2, %1 + $vgpr0_vgpr1 = COPY %3 + +... 
+ +--- +name: add_ptrtoint_p1_to_s32_lhs +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; GCN-LABEL: name: add_ptrtoint_p1_to_s32_lhs + ; GCN: liveins: $vgpr0_vgpr1, $vgpr2 + ; GCN: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p1) + ; GCN: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PTRTOINT]], [[COPY1]] + ; GCN: $vgpr0 = COPY [[ADD]](s32) + %0:_(p1) = COPY $vgpr0_vgpr1 + %1:_(s32) = COPY $vgpr2 + %2:_(s32) = G_PTRTOINT %0 + %3:_(s32) = G_ADD %2, %1 + $vgpr0 = COPY %3 + +... + +--- +name: add_ptrtoint_p3_to_s32_lhs +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GCN-LABEL: name: add_ptrtoint_p3_to_s32_lhs + ; GCN: liveins: $vgpr0, $vgpr1 + ; GCN: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[COPY1]](s32) + ; GCN: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[PTR_ADD]](p3) + ; GCN: $vgpr0 = COPY [[PTRTOINT]](s32) + %0:_(p3) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = G_PTRTOINT %0 + %3:_(s32) = G_ADD %2, %1 + $vgpr0 = COPY %3 + +... + +--- +name: inttoptr_add_ptrtoint_p1_to_s64_lhs +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + + ; GCN-LABEL: name: inttoptr_add_ptrtoint_p1_to_s64_lhs + ; GCN: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GCN: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; GCN: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[COPY1]](s64) + ; GCN: $vgpr0_vgpr1 = COPY [[PTR_ADD]](p1) + %0:_(p1) = COPY $vgpr0_vgpr1 + %1:_(s64) = COPY $vgpr2_vgpr3 + %2:_(s64) = G_PTRTOINT %0 + %3:_(s64) = G_ADD %2, %1 + %4:_(p1) = G_INTTOPTR %3 + $vgpr0_vgpr1 = COPY %4 + +... 
+ +--- +name: add_ptrtoint_v2p3_to_v2s32_lhs +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + + ; GCN-LABEL: name: add_ptrtoint_v2p3_to_v2s32_lhs + ; GCN: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GCN: [[COPY:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr0_vgpr1 + ; GCN: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(<2 x p3>) = G_PTR_ADD [[COPY]], [[COPY1]](<2 x s32>) + ; GCN: [[PTRTOINT:%[0-9]+]]:_(<2 x s32>) = G_PTRTOINT [[PTR_ADD]](<2 x p3>) + ; GCN: $vgpr0_vgpr1 = COPY [[PTRTOINT]](<2 x s32>) + %0:_(<2 x p3>) = COPY $vgpr0_vgpr1 + %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 + %2:_(<2 x s32>) = G_PTRTOINT %0 + %3:_(<2 x s32>) = G_ADD %2, %1 + $vgpr0_vgpr1 = COPY %3 + +... + +--- +name: add_ptrtoint_v2p1_to_v2s32_lhs +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5 + + ; GCN-LABEL: name: add_ptrtoint_v2p1_to_v2s32_lhs + ; GCN: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5 + ; GCN: [[COPY:%[0-9]+]]:_(<2 x p1>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; GCN: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr4_vgpr5 + ; GCN: [[PTRTOINT:%[0-9]+]]:_(<2 x s32>) = G_PTRTOINT [[COPY]](<2 x p1>) + ; GCN: [[ADD:%[0-9]+]]:_(<2 x s32>) = G_ADD [[PTRTOINT]], [[COPY1]] + ; GCN: $vgpr0_vgpr1 = COPY [[ADD]](<2 x s32>) + %0:_(<2 x p1>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:_(<2 x s32>) = COPY $vgpr4_vgpr5 + %2:_(<2 x s32>) = G_PTRTOINT %0 + %3:_(<2 x s32>) = G_ADD %2, %1 + $vgpr0_vgpr1 = COPY %3 + +... 
+ +--- +name: add_ptrtoint_p1_to_s64_rhs +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + + ; GCN-LABEL: name: add_ptrtoint_p1_to_s64_rhs + ; GCN: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GCN: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; GCN: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[COPY1]](s64) + ; GCN: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[PTR_ADD]](p1) + ; GCN: $vgpr0_vgpr1 = COPY [[PTRTOINT]](s64) + %0:_(p1) = COPY $vgpr0_vgpr1 + %1:_(s64) = COPY $vgpr2_vgpr3 + %2:_(s64) = G_PTRTOINT %0 + %3:_(s64) = G_ADD %1, %2 + $vgpr0_vgpr1 = COPY %3 + +... + +--- +name: add_ptrtoint_p1_to_s64_lhs_rhs +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + + ; GCN-LABEL: name: add_ptrtoint_p1_to_s64_lhs_rhs + ; GCN: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GCN: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; GCN: [[COPY1:%[0-9]+]]:_(p1) = COPY $vgpr2_vgpr3 + ; GCN: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p1) + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[PTRTOINT]](s64) + ; GCN: [[PTRTOINT1:%[0-9]+]]:_(s64) = G_PTRTOINT [[PTR_ADD]](p1) + ; GCN: $vgpr0_vgpr1 = COPY [[PTRTOINT1]](s64) + %0:_(p1) = COPY $vgpr0_vgpr1 + %1:_(p1) = COPY $vgpr2_vgpr3 + %2:_(s64) = G_PTRTOINT %0 + %3:_(s64) = G_PTRTOINT %1 + %4:_(s64) = G_ADD %2, %3 + $vgpr0_vgpr1 = COPY %4 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir new file mode 100644 index 0000000000000..ca498f5c0b7f4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir @@ -0,0 +1,311 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX6 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX9 %s + +# Can't narrow this; need known bits +--- +name: shl_s64_by_2_from_anyext_s32 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: shl_s64_by_2_from_anyext_s32 + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX6: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[COPY]](s32) + ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX6: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C]](s32) + ; GFX6: $vgpr0_vgpr1 = COPY [[SHL]](s64) + ; GFX9-LABEL: name: shl_s64_by_2_from_anyext_s32 + ; GFX9: liveins: $vgpr0 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[COPY]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C]](s32) + ; GFX9: $vgpr0_vgpr1 = COPY [[SHL]](s64) + %0:_(s32) = COPY $vgpr0 + %1:_(s64) = G_ANYEXT %0 + %2:_(s32) = G_CONSTANT i32 2 + %3:_(s64) = G_SHL %1, %2 + $vgpr0_vgpr1 = COPY %3 +... 
+ +# Can't narrow this; need known bits +--- +name: shl_s64_by_2_from_sext_s32 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: shl_s64_by_2_from_sext_s32 + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX6: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY]](s32) + ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX6: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C]](s32) + ; GFX6: $vgpr0_vgpr1 = COPY [[SHL]](s64) + ; GFX9-LABEL: name: shl_s64_by_2_from_sext_s32 + ; GFX9: liveins: $vgpr0 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C]](s32) + ; GFX9: $vgpr0_vgpr1 = COPY [[SHL]](s64) + %0:_(s32) = COPY $vgpr0 + %1:_(s64) = G_SEXT %0 + %2:_(s32) = G_CONSTANT i32 2 + %3:_(s64) = G_SHL %1, %2 + $vgpr0_vgpr1 = COPY %3 +... + +# Can't narrow this; need known bits +--- +name: shl_s64_by_2_from_zext_s32 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: shl_s64_by_2_from_zext_s32 + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX6: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32) + ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX6: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[ZEXT]], [[C]](s32) + ; GFX6: $vgpr0_vgpr1 = COPY [[SHL]](s64) + ; GFX9-LABEL: name: shl_s64_by_2_from_zext_s32 + ; GFX9: liveins: $vgpr0 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[ZEXT]], [[C]](s32) + ; GFX9: $vgpr0_vgpr1 = COPY [[SHL]](s64) + %0:_(s32) = COPY $vgpr0 + %1:_(s64) = G_ZEXT %0 + %2:_(s32) = G_CONSTANT i32 2 + %3:_(s64) = G_SHL %1, %2 + $vgpr0_vgpr1 = COPY %3 +... 
+ +--- +name: narrow_shl_s64_by_2_from_anyext_s32 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: narrow_shl_s64_by_2_from_anyext_s32 + ; GFX6: liveins: $vgpr0 + ; GFX6: %narrow:_(s32) = COPY $vgpr0 + ; GFX6: %masklow30:_(s32) = G_CONSTANT i32 1073741823 + ; GFX6: %masked:_(s32) = G_AND %narrow, %masklow30 + ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL %masked, [[C]](s32) + ; GFX6: %shl:_(s64) = G_ZEXT [[SHL]](s32) + ; GFX6: $vgpr0_vgpr1 = COPY %shl(s64) + ; GFX9-LABEL: name: narrow_shl_s64_by_2_from_anyext_s32 + ; GFX9: liveins: $vgpr0 + ; GFX9: %narrow:_(s32) = COPY $vgpr0 + ; GFX9: %masklow30:_(s32) = G_CONSTANT i32 1073741823 + ; GFX9: %masked:_(s32) = G_AND %narrow, %masklow30 + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL %masked, [[C]](s32) + ; GFX9: %shl:_(s64) = G_ZEXT [[SHL]](s32) + ; GFX9: $vgpr0_vgpr1 = COPY %shl(s64) + %narrow:_(s32) = COPY $vgpr0 + %masklow30:_(s32) = G_CONSTANT i32 1073741823 + %masked:_(s32) = G_AND %narrow, %masklow30 + %extend:_(s64) = G_ANYEXT %masked + %shiftamt:_(s32) = G_CONSTANT i32 2 + %shl:_(s64) = G_SHL %extend, %shiftamt + $vgpr0_vgpr1 = COPY %shl +... 
+ +--- +name: narrow_shl_s64_by_2_from_zext_s32 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: narrow_shl_s64_by_2_from_zext_s32 + ; GFX6: liveins: $vgpr0 + ; GFX6: %narrow:_(s32) = COPY $vgpr0 + ; GFX6: %masklow30:_(s32) = G_CONSTANT i32 1073741823 + ; GFX6: %masked:_(s32) = G_AND %narrow, %masklow30 + ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL %masked, [[C]](s32) + ; GFX6: %shl:_(s64) = G_ZEXT [[SHL]](s32) + ; GFX6: $vgpr0_vgpr1 = COPY %shl(s64) + ; GFX9-LABEL: name: narrow_shl_s64_by_2_from_zext_s32 + ; GFX9: liveins: $vgpr0 + ; GFX9: %narrow:_(s32) = COPY $vgpr0 + ; GFX9: %masklow30:_(s32) = G_CONSTANT i32 1073741823 + ; GFX9: %masked:_(s32) = G_AND %narrow, %masklow30 + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL %masked, [[C]](s32) + ; GFX9: %shl:_(s64) = G_ZEXT [[SHL]](s32) + ; GFX9: $vgpr0_vgpr1 = COPY %shl(s64) + %narrow:_(s32) = COPY $vgpr0 + %masklow30:_(s32) = G_CONSTANT i32 1073741823 + %masked:_(s32) = G_AND %narrow, %masklow30 + %extend:_(s64) = G_ZEXT %masked + %shiftamt:_(s32) = G_CONSTANT i32 2 + %shl:_(s64) = G_SHL %extend, %shiftamt + $vgpr0_vgpr1 = COPY %shl +... 
+ +--- +name: narrow_shl_s64_by_2_from_sext_s32 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: narrow_shl_s64_by_2_from_sext_s32 + ; GFX6: liveins: $vgpr0 + ; GFX6: %narrow:_(s32) = COPY $vgpr0 + ; GFX6: %masklow30:_(s32) = G_CONSTANT i32 1073741823 + ; GFX6: %masked:_(s32) = G_AND %narrow, %masklow30 + ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL %masked, [[C]](s32) + ; GFX6: %shl:_(s64) = G_ZEXT [[SHL]](s32) + ; GFX6: $vgpr0_vgpr1 = COPY %shl(s64) + ; GFX9-LABEL: name: narrow_shl_s64_by_2_from_sext_s32 + ; GFX9: liveins: $vgpr0 + ; GFX9: %narrow:_(s32) = COPY $vgpr0 + ; GFX9: %masklow30:_(s32) = G_CONSTANT i32 1073741823 + ; GFX9: %masked:_(s32) = G_AND %narrow, %masklow30 + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL %masked, [[C]](s32) + ; GFX9: %shl:_(s64) = G_ZEXT [[SHL]](s32) + ; GFX9: $vgpr0_vgpr1 = COPY %shl(s64) + %narrow:_(s32) = COPY $vgpr0 + %masklow30:_(s32) = G_CONSTANT i32 1073741823 + %masked:_(s32) = G_AND %narrow, %masklow30 + %extend:_(s64) = G_SEXT %masked + %shiftamt:_(s32) = G_CONSTANT i32 2 + %shl:_(s64) = G_SHL %extend, %shiftamt + $vgpr0_vgpr1 = COPY %shl +... 
+ +--- +name: narrow_shl_s64_by_2_from_zext_s32_lookthrough_amount +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: narrow_shl_s64_by_2_from_zext_s32_lookthrough_amount + ; GFX6: liveins: $vgpr0 + ; GFX6: %narrow:_(s32) = COPY $vgpr0 + ; GFX6: %masklow30:_(s32) = G_CONSTANT i32 1073741823 + ; GFX6: %masked:_(s32) = G_AND %narrow, %masklow30 + ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL %masked, [[C]](s32) + ; GFX6: %shl:_(s64) = G_ZEXT [[SHL]](s32) + ; GFX6: $vgpr0_vgpr1 = COPY %shl(s64) + ; GFX9-LABEL: name: narrow_shl_s64_by_2_from_zext_s32_lookthrough_amount + ; GFX9: liveins: $vgpr0 + ; GFX9: %narrow:_(s32) = COPY $vgpr0 + ; GFX9: %masklow30:_(s32) = G_CONSTANT i32 1073741823 + ; GFX9: %masked:_(s32) = G_AND %narrow, %masklow30 + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL %masked, [[C]](s32) + ; GFX9: %shl:_(s64) = G_ZEXT [[SHL]](s32) + ; GFX9: $vgpr0_vgpr1 = COPY %shl(s64) + %narrow:_(s32) = COPY $vgpr0 + %masklow30:_(s32) = G_CONSTANT i32 1073741823 + %masked:_(s32) = G_AND %narrow, %masklow30 + %extend:_(s64) = G_ZEXT %masked + %shiftamt64:_(s64) = G_CONSTANT i64 2 + %shiftamt:_(s32) = G_TRUNC %shiftamt64 + %shl:_(s64) = G_SHL %extend, %shiftamt + $vgpr0_vgpr1 = COPY %shl +... 
+ +# Can't introduce a 16-bit shift before gfx8 +--- +name: narrow_shl_s32_by_2_from_zext_s16 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: narrow_shl_s32_by_2_from_zext_s16 + ; GFX6: liveins: $vgpr0 + ; GFX6: %argument:_(s32) = COPY $vgpr0 + ; GFX6: %narrow:_(s16) = G_TRUNC %argument(s32) + ; GFX6: %masklow14:_(s16) = G_CONSTANT i16 16383 + ; GFX6: %masked:_(s16) = G_AND %narrow, %masklow14 + ; GFX6: %extend:_(s32) = G_ZEXT %masked(s16) + ; GFX6: %shiftamt:_(s32) = G_CONSTANT i32 2 + ; GFX6: %shl:_(s32) = G_SHL %extend, %shiftamt(s32) + ; GFX6: $vgpr0 = COPY %shl(s32) + ; GFX9-LABEL: name: narrow_shl_s32_by_2_from_zext_s16 + ; GFX9: liveins: $vgpr0 + ; GFX9: %argument:_(s32) = COPY $vgpr0 + ; GFX9: %narrow:_(s16) = G_TRUNC %argument(s32) + ; GFX9: %masklow14:_(s16) = G_CONSTANT i16 16383 + ; GFX9: %masked:_(s16) = G_AND %narrow, %masklow14 + ; GFX9: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 2 + ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL %masked, [[C]](s16) + ; GFX9: %shl:_(s32) = G_ZEXT [[SHL]](s16) + ; GFX9: $vgpr0 = COPY %shl(s32) + %argument:_(s32) = COPY $vgpr0 + %narrow:_(s16) = G_TRUNC %argument + %masklow14:_(s16) = G_CONSTANT i16 16383 + %masked:_(s16) = G_AND %narrow, %masklow14 + %extend:_(s32) = G_ZEXT %masked + %shiftamt:_(s32) = G_CONSTANT i32 2 + %shl:_(s32) = G_SHL %extend, %shiftamt + $vgpr0 = COPY %shl +... 
+ +--- +name: narrow_shl_s64_by_2_from_zext_s16 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: narrow_shl_s64_by_2_from_zext_s16 + ; GFX6: liveins: $vgpr0 + ; GFX6: %argument:_(s32) = COPY $vgpr0 + ; GFX6: %narrow:_(s16) = G_TRUNC %argument(s32) + ; GFX6: %masklow14:_(s16) = G_CONSTANT i16 16383 + ; GFX6: %masked:_(s16) = G_AND %narrow, %masklow14 + ; GFX6: %extend:_(s64) = G_ZEXT %masked(s16) + ; GFX6: %shiftamt:_(s32) = G_CONSTANT i32 2 + ; GFX6: %shl:_(s64) = G_SHL %extend, %shiftamt(s32) + ; GFX6: $vgpr0_vgpr1 = COPY %shl(s64) + ; GFX9-LABEL: name: narrow_shl_s64_by_2_from_zext_s16 + ; GFX9: liveins: $vgpr0 + ; GFX9: %argument:_(s32) = COPY $vgpr0 + ; GFX9: %narrow:_(s16) = G_TRUNC %argument(s32) + ; GFX9: %masklow14:_(s16) = G_CONSTANT i16 16383 + ; GFX9: %masked:_(s16) = G_AND %narrow, %masklow14 + ; GFX9: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 2 + ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL %masked, [[C]](s16) + ; GFX9: %shl:_(s64) = G_ZEXT [[SHL]](s16) + ; GFX9: $vgpr0_vgpr1 = COPY %shl(s64) + %argument:_(s32) = COPY $vgpr0 + %narrow:_(s16) = G_TRUNC %argument + %masklow14:_(s16) = G_CONSTANT i16 16383 + %masked:_(s16) = G_AND %narrow, %masklow14 + %extend:_(s64) = G_ZEXT %masked + %shiftamt:_(s32) = G_CONSTANT i32 2 + %shl:_(s64) = G_SHL %extend, %shiftamt + $vgpr0_vgpr1 = COPY %shl +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir new file mode 100644 index 0000000000000..0164a5879b5b6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir @@ -0,0 +1,197 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX6 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX9 %s + +--- +name: narrow_shl_s32_by_2_from_zext_s16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: narrow_shl_s32_by_2_from_zext_s16 + ; GFX6: liveins: $vgpr0 + ; GFX6: %argument:_(s32) = COPY $vgpr0 + ; GFX6: %narrow:_(s16) = G_TRUNC %argument(s32) + ; GFX6: %masklow14:_(s16) = G_CONSTANT i16 16383 + ; GFX6: %masked:_(s16) = G_AND %narrow, %masklow14 + ; GFX6: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 2 + ; GFX6: [[SHL:%[0-9]+]]:_(s16) = G_SHL %masked, [[C]](s16) + ; GFX6: %shl:_(s32) = G_ZEXT [[SHL]](s16) + ; GFX6: $vgpr0 = COPY %shl(s32) + ; GFX9-LABEL: name: narrow_shl_s32_by_2_from_zext_s16 + ; GFX9: liveins: $vgpr0 + ; GFX9: %argument:_(s32) = COPY $vgpr0 + ; GFX9: %narrow:_(s16) = G_TRUNC %argument(s32) + ; GFX9: %masklow14:_(s16) = G_CONSTANT i16 16383 + ; GFX9: %masked:_(s16) = G_AND %narrow, %masklow14 + ; GFX9: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 2 + ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL %masked, [[C]](s16) + ; GFX9: %shl:_(s32) = G_ZEXT [[SHL]](s16) + ; GFX9: $vgpr0 = COPY %shl(s32) + %argument:_(s32) = COPY $vgpr0 + %narrow:_(s16) = G_TRUNC %argument + %masklow14:_(s16) = G_CONSTANT i16 16383 + %masked:_(s16) = G_AND %narrow, %masklow14 + %extend:_(s32) = G_ZEXT %masked + %shiftamt:_(s32) = 
G_CONSTANT i32 2 + %shl:_(s32) = G_SHL %extend, %shiftamt + $vgpr0 = COPY %shl +... + +--- +name: narrow_shl_s64_by_2_from_zext_s16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: narrow_shl_s64_by_2_from_zext_s16 + ; GFX6: liveins: $vgpr0 + ; GFX6: %argument:_(s32) = COPY $vgpr0 + ; GFX6: %narrow:_(s16) = G_TRUNC %argument(s32) + ; GFX6: %masklow14:_(s16) = G_CONSTANT i16 16383 + ; GFX6: %masked:_(s16) = G_AND %narrow, %masklow14 + ; GFX6: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 2 + ; GFX6: [[SHL:%[0-9]+]]:_(s16) = G_SHL %masked, [[C]](s16) + ; GFX6: %shl:_(s64) = G_ZEXT [[SHL]](s16) + ; GFX6: $vgpr0_vgpr1 = COPY %shl(s64) + ; GFX9-LABEL: name: narrow_shl_s64_by_2_from_zext_s16 + ; GFX9: liveins: $vgpr0 + ; GFX9: %argument:_(s32) = COPY $vgpr0 + ; GFX9: %narrow:_(s16) = G_TRUNC %argument(s32) + ; GFX9: %masklow14:_(s16) = G_CONSTANT i16 16383 + ; GFX9: %masked:_(s16) = G_AND %narrow, %masklow14 + ; GFX9: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 2 + ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL %masked, [[C]](s16) + ; GFX9: %shl:_(s64) = G_ZEXT [[SHL]](s16) + ; GFX9: $vgpr0_vgpr1 = COPY %shl(s64) + %argument:_(s32) = COPY $vgpr0 + %narrow:_(s16) = G_TRUNC %argument + %masklow14:_(s16) = G_CONSTANT i16 16383 + %masked:_(s16) = G_AND %narrow, %masklow14 + %extend:_(s64) = G_ZEXT %masked + %shiftamt:_(s32) = G_CONSTANT i32 2 + %shl:_(s64) = G_SHL %extend, %shiftamt + $vgpr0_vgpr1 = COPY %shl +... 
+ +--- +name: narrow_shl_s16_by_2_from_zext_s8 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: narrow_shl_s16_by_2_from_zext_s8 + ; GFX6: liveins: $vgpr0 + ; GFX6: %argument:_(s32) = COPY $vgpr0 + ; GFX6: %narrow:_(s8) = G_TRUNC %argument(s32) + ; GFX6: %masklow6:_(s8) = G_CONSTANT i8 63 + ; GFX6: %masked:_(s8) = G_AND %narrow, %masklow6 + ; GFX6: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 2 + ; GFX6: [[SHL:%[0-9]+]]:_(s8) = G_SHL %masked, [[C]](s8) + ; GFX6: %shl:_(s16) = G_ZEXT [[SHL]](s8) + ; GFX6: %result:_(s32) = G_ANYEXT %shl(s16) + ; GFX6: $vgpr0 = COPY %result(s32) + ; GFX9-LABEL: name: narrow_shl_s16_by_2_from_zext_s8 + ; GFX9: liveins: $vgpr0 + ; GFX9: %argument:_(s32) = COPY $vgpr0 + ; GFX9: %narrow:_(s8) = G_TRUNC %argument(s32) + ; GFX9: %masklow6:_(s8) = G_CONSTANT i8 63 + ; GFX9: %masked:_(s8) = G_AND %narrow, %masklow6 + ; GFX9: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 2 + ; GFX9: [[SHL:%[0-9]+]]:_(s8) = G_SHL %masked, [[C]](s8) + ; GFX9: %shl:_(s16) = G_ZEXT [[SHL]](s8) + ; GFX9: %result:_(s32) = G_ANYEXT %shl(s16) + ; GFX9: $vgpr0 = COPY %result(s32) + %argument:_(s32) = COPY $vgpr0 + %narrow:_(s8) = G_TRUNC %argument + %masklow6:_(s8) = G_CONSTANT i8 63 + %masked:_(s8) = G_AND %narrow, %masklow6 + %extend:_(s16) = G_ZEXT %masked + %shiftamt:_(s16) = G_CONSTANT i16 2 + %shl:_(s16) = G_SHL %extend, %shiftamt + %result:_(s32) = G_ANYEXT %shl + $vgpr0 = COPY %result +... 
+ +--- +name: narrow_shl_v2s32_by_2_from_zext_v2s16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: narrow_shl_v2s32_by_2_from_zext_v2s16 + ; GFX6: liveins: $vgpr0 + ; GFX6: %narrow:_(<2 x s16>) = COPY $vgpr0 + ; GFX6: %masklow14:_(s16) = G_CONSTANT i16 16383 + ; GFX6: %masklow14vec:_(<2 x s16>) = G_BUILD_VECTOR %masklow14(s16), %masklow14(s16) + ; GFX6: %masked:_(<2 x s16>) = G_AND %narrow, %masklow14vec + ; GFX6: %extend:_(<2 x s32>) = G_ZEXT %masked(<2 x s16>) + ; GFX6: %shiftamt:_(s32) = G_CONSTANT i32 2 + ; GFX6: %shiftamtvec:_(<2 x s32>) = G_BUILD_VECTOR %shiftamt(s32), %shiftamt(s32) + ; GFX6: %shl:_(<2 x s32>) = G_SHL %extend, %shiftamtvec(<2 x s32>) + ; GFX6: $vgpr0_vgpr1 = COPY %shl(<2 x s32>) + ; GFX9-LABEL: name: narrow_shl_v2s32_by_2_from_zext_v2s16 + ; GFX9: liveins: $vgpr0 + ; GFX9: %narrow:_(<2 x s16>) = COPY $vgpr0 + ; GFX9: %masklow14:_(s16) = G_CONSTANT i16 16383 + ; GFX9: %masklow14vec:_(<2 x s16>) = G_BUILD_VECTOR %masklow14(s16), %masklow14(s16) + ; GFX9: %masked:_(<2 x s16>) = G_AND %narrow, %masklow14vec + ; GFX9: %extend:_(<2 x s32>) = G_ZEXT %masked(<2 x s16>) + ; GFX9: %shiftamt:_(s32) = G_CONSTANT i32 2 + ; GFX9: %shiftamtvec:_(<2 x s32>) = G_BUILD_VECTOR %shiftamt(s32), %shiftamt(s32) + ; GFX9: %shl:_(<2 x s32>) = G_SHL %extend, %shiftamtvec(<2 x s32>) + ; GFX9: $vgpr0_vgpr1 = COPY %shl(<2 x s32>) + %narrow:_(<2 x s16>) = COPY $vgpr0 + %masklow14:_(s16) = G_CONSTANT i16 16383 + %masklow14vec:_(<2 x s16>) = G_BUILD_VECTOR %masklow14, %masklow14 + %masked:_(<2 x s16>) = G_AND %narrow, %masklow14vec + %extend:_(<2 x s32>) = G_ZEXT %masked + %shiftamt:_(s32) = G_CONSTANT i32 2 + %shiftamtvec:_(<2 x s32>) = G_BUILD_VECTOR %shiftamt, %shiftamt + %shl:_(<2 x s32>) = G_SHL %extend, %shiftamtvec + $vgpr0_vgpr1 = COPY %shl +... 
+ +--- +name: narrow_shl_v2s64_by_2_from_anyext_v2s32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; GFX6-LABEL: name: narrow_shl_v2s64_by_2_from_anyext_v2s32 + ; GFX6: liveins: $vgpr0_vgpr1 + ; GFX6: %narrow:_(<2 x s32>) = COPY $vgpr0_vgpr1 + ; GFX6: %masklow30:_(s32) = G_CONSTANT i32 1073741823 + ; GFX6: %masklow30vec:_(<2 x s32>) = G_BUILD_VECTOR %masklow30(s32), %masklow30(s32) + ; GFX6: %masked:_(<2 x s32>) = G_AND %narrow, %masklow30vec + ; GFX6: %extend:_(<2 x s64>) = G_ANYEXT %masked(<2 x s32>) + ; GFX6: %shiftamt:_(s32) = G_CONSTANT i32 2 + ; GFX6: %shiftamtvec:_(<2 x s32>) = G_BUILD_VECTOR %shiftamt(s32), %shiftamt(s32) + ; GFX6: %shl:_(<2 x s64>) = G_SHL %extend, %shiftamtvec(<2 x s32>) + ; GFX6: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %shl(<2 x s64>) + ; GFX9-LABEL: name: narrow_shl_v2s64_by_2_from_anyext_v2s32 + ; GFX9: liveins: $vgpr0_vgpr1 + ; GFX9: %narrow:_(<2 x s32>) = COPY $vgpr0_vgpr1 + ; GFX9: %masklow30:_(s32) = G_CONSTANT i32 1073741823 + ; GFX9: %masklow30vec:_(<2 x s32>) = G_BUILD_VECTOR %masklow30(s32), %masklow30(s32) + ; GFX9: %masked:_(<2 x s32>) = G_AND %narrow, %masklow30vec + ; GFX9: %extend:_(<2 x s64>) = G_ANYEXT %masked(<2 x s32>) + ; GFX9: %shiftamt:_(s32) = G_CONSTANT i32 2 + ; GFX9: %shiftamtvec:_(<2 x s32>) = G_BUILD_VECTOR %shiftamt(s32), %shiftamt(s32) + ; GFX9: %shl:_(<2 x s64>) = G_SHL %extend, %shiftamtvec(<2 x s32>) + ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %shl(<2 x s64>) + %narrow:_(<2 x s32>) = COPY $vgpr0_vgpr1 + %masklow30:_(s32) = G_CONSTANT i32 1073741823 + %masklow30vec:_(<2 x s32>) = G_BUILD_VECTOR %masklow30, %masklow30 + %masked:_(<2 x s32>) = G_AND %narrow, %masklow30vec + %extend:_(<2 x s64>) = G_ANYEXT %masked + %shiftamt:_(s32) = G_CONSTANT i32 2 + %shiftamtvec:_(<2 x s32>) = G_BUILD_VECTOR %shiftamt, %shiftamt + %shl:_(<2 x s64>) = G_SHL %extend, %shiftamtvec + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %shl +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll index d5d991288ccee..f0f0edcb57f84 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll @@ -224,30 +224,28 @@ define <4 x float> @v_uitofp_v4i8_to_v4f32(i32 %arg0) nounwind { ; SI-LABEL: v_uitofp_v4i8_to_v4f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; SI-NEXT: s_movk_i32 s4, 0xff +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; SI-NEXT: v_and_b32_e32 v0, s4, v0 +; SI-NEXT: v_and_b32_e32 v3, s4, v0 ; SI-NEXT: v_and_b32_e32 v1, s4, v1 ; SI-NEXT: v_and_b32_e32 v2, s4, v2 -; SI-NEXT: v_and_b32_e32 v3, s4, v3 -; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v3 +; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 -; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v3 +; SI-NEXT: v_mov_b32_e32 v0, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_uitofp_v4i8_to_v4f32: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_movk_i32 s4, 0xff -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; VI-NEXT: v_and_b32_sdwa v2, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 -; VI-NEXT: v_and_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v0 +; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; VI-NEXT: v_cvt_f32_ubyte0_e32 
v2, v2 ; VI-NEXT: v_mov_b32_e32 v0, v4 @@ -581,8 +579,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -612,13 +610,12 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1) ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0 @@ -688,8 +685,8 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -707,13 +704,12 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, 
v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 @@ -737,8 +733,8 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -755,13 +751,12 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, 0xff00, v0 @@ -830,8 +825,8 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* no ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0xb -; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -861,13 +856,12 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* no ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0 @@ -904,8 +898,8 @@ define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -922,13 +916,12 @@ define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; 
VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 @@ -950,8 +943,8 @@ define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -969,13 +962,12 @@ define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v0 @@ -999,8 +991,8 @@ define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: 
s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1018,13 +1010,12 @@ define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, 0xff ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1049,8 +1040,8 @@ define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1066,13 +1057,12 @@ define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, 
vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte3_e32 v2, v0 @@ -1094,8 +1084,8 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(i32 addrspace(1)* %in, float a ; SI-LABEL: cvt_ubyte0_or_multiuse: ; SI: ; %bb.0: ; %bb ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1114,13 +1104,12 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(i32 addrspace(1)* %in, float a ; VI-LABEL: cvt_ubyte0_or_multiuse: ; VI: ; %bb.0: ; %bb ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_or_b32_e32 v0, 0x80000001, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll index 95b4177abbcab..cc383ff8806d0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll @@ -19,8 +19,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_sgpr_idx(<4 x i8> addrspace(4)* in ; GCN-NEXT: s_and_b32 s1, s2, s5 ; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_and_b32 s1, s3, s5 -; GCN-NEXT: s_lshl_b32 s1, s1, 24 +; GCN-NEXT: s_lshl_b32 s1, s3, 24 ; GCN-NEXT: 
s_or_b32 s0, s0, s1 ; GCN-NEXT: s_and_b32 s1, s4, 3 ; GCN-NEXT: s_lshl_b32 s1, s1, 3 @@ -40,11 +39,12 @@ define amdgpu_ps i8 @extractelement_vgpr_v4i8_sgpr_idx(<4 x i8> addrspace(1)* %p ; GFX9-NEXT: s_and_b32 s2, s2, 3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v3, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v3, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v1 -; GFX9-NEXT: v_or3_b32 v0, v0, v2, v3 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v2 ; GFX9-NEXT: s_lshl_b32 s0, s2, 3 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 @@ -61,11 +61,12 @@ define amdgpu_ps i8 @extractelement_vgpr_v4i8_sgpr_idx(<4 x i8> addrspace(1)* %p ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX8-NEXT: 
v_lshrrev_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog @@ -86,7 +87,6 @@ define amdgpu_ps i8 @extractelement_vgpr_v4i8_sgpr_idx(<4 x i8> addrspace(1)* %p ; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -112,11 +112,12 @@ define i8 @extractelement_vgpr_v4i8_vgpr_idx(<4 x i8> addrspace(1)* %ptr, i32 %i ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v4, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v4, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v2 -; GFX9-NEXT: v_or3_b32 v0, v0, v3, v4 +; GFX9-NEXT: v_or3_b32 v0, v0, v4, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -131,11 +132,12 @@ define i8 @extractelement_vgpr_v4i8_vgpr_idx(<4 x i8> addrspace(1)* %ptr, i32 %i ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v5, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v3, v0, v3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX8-NEXT: 
v_and_b32_sdwa v3, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v5 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -158,7 +160,6 @@ define i8 @extractelement_vgpr_v4i8_vgpr_idx(<4 x i8> addrspace(1)* %ptr, i32 %i ; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 @@ -189,8 +190,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_vgpr_idx(<4 x i8> addrspace(4)* in ; GFX9-NEXT: s_and_b32 s1, s2, s4 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s3, s4 -; GFX9-NEXT: s_lshl_b32 s1, s1, 24 +; GFX9-NEXT: s_lshl_b32 s1, s3, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, v0, s0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 @@ -213,8 +213,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_vgpr_idx(<4 x i8> addrspace(4)* in ; GFX8-NEXT: s_and_b32 s1, s2, s4 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s3, s4 -; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: s_lshl_b32 s1, s3, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: v_lshrrev_b32_e64 v0, v0, s0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 @@ -237,8 +236,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_vgpr_idx(<4 x i8> addrspace(4)* in ; GFX7-NEXT: s_and_b32 s1, s2, s4 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_and_b32 s1, s3, s4 -; GFX7-NEXT: s_lshl_b32 s1, s1, 24 +; GFX7-NEXT: s_lshl_b32 
s1, s3, 24 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: v_lshr_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 @@ -259,14 +257,13 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx0(<4 x i8> addrspace(4)* inreg ; GCN-NEXT: s_lshr_b32 s3, s1, 16 ; GCN-NEXT: s_lshr_b32 s4, s1, 24 ; GCN-NEXT: s_and_b32 s1, s1, s0 +; GCN-NEXT: s_and_b32 s0, s3, s0 ; GCN-NEXT: s_lshl_b32 s2, s2, 8 ; GCN-NEXT: s_or_b32 s1, s1, s2 -; GCN-NEXT: s_and_b32 s2, s3, s0 -; GCN-NEXT: s_and_b32 s0, s4, s0 -; GCN-NEXT: s_lshl_b32 s2, s2, 16 -; GCN-NEXT: s_or_b32 s1, s1, s2 -; GCN-NEXT: s_lshl_b32 s0, s0, 24 +; GCN-NEXT: s_lshl_b32 s0, s0, 16 ; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: s_lshl_b32 s1, s4, 24 +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr %element = extractelement <4 x i8> %vector, i32 0 @@ -284,14 +281,13 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx1(<4 x i8> addrspace(4)* inreg ; GCN-NEXT: s_lshr_b32 s3, s1, 16 ; GCN-NEXT: s_lshr_b32 s4, s1, 24 ; GCN-NEXT: s_and_b32 s1, s1, s0 +; GCN-NEXT: s_and_b32 s0, s3, s0 ; GCN-NEXT: s_lshl_b32 s2, s2, 8 ; GCN-NEXT: s_or_b32 s1, s1, s2 -; GCN-NEXT: s_and_b32 s2, s3, s0 -; GCN-NEXT: s_and_b32 s0, s4, s0 -; GCN-NEXT: s_lshl_b32 s2, s2, 16 -; GCN-NEXT: s_or_b32 s1, s1, s2 -; GCN-NEXT: s_lshl_b32 s0, s0, 24 +; GCN-NEXT: s_lshl_b32 s0, s0, 16 ; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: s_lshl_b32 s1, s4, 24 +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_lshr_b32 s0, s0, 8 ; GCN-NEXT: ; return to shader part epilog %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr @@ -310,14 +306,13 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx2(<4 x i8> addrspace(4)* inreg ; GCN-NEXT: s_lshr_b32 s3, s1, 16 ; GCN-NEXT: s_lshr_b32 s4, s1, 24 ; GCN-NEXT: s_and_b32 s1, s1, s0 +; GCN-NEXT: s_and_b32 s0, s3, s0 ; GCN-NEXT: s_lshl_b32 s2, s2, 8 ; GCN-NEXT: s_or_b32 s1, s1, s2 -; GCN-NEXT: s_and_b32 s2, s3, s0 -; GCN-NEXT: s_and_b32 s0, s4, s0 -; GCN-NEXT: 
s_lshl_b32 s2, s2, 16 -; GCN-NEXT: s_or_b32 s1, s1, s2 -; GCN-NEXT: s_lshl_b32 s0, s0, 24 +; GCN-NEXT: s_lshl_b32 s0, s0, 16 ; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: s_lshl_b32 s1, s4, 24 +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_lshr_b32 s0, s0, 16 ; GCN-NEXT: ; return to shader part epilog %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr @@ -336,14 +331,13 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx3(<4 x i8> addrspace(4)* inreg ; GCN-NEXT: s_lshr_b32 s3, s1, 16 ; GCN-NEXT: s_lshr_b32 s4, s1, 24 ; GCN-NEXT: s_and_b32 s1, s1, s0 +; GCN-NEXT: s_and_b32 s0, s3, s0 ; GCN-NEXT: s_lshl_b32 s2, s2, 8 ; GCN-NEXT: s_or_b32 s1, s1, s2 -; GCN-NEXT: s_and_b32 s2, s3, s0 -; GCN-NEXT: s_and_b32 s0, s4, s0 -; GCN-NEXT: s_lshl_b32 s2, s2, 16 -; GCN-NEXT: s_or_b32 s1, s1, s2 -; GCN-NEXT: s_lshl_b32 s0, s0, 24 +; GCN-NEXT: s_lshl_b32 s0, s0, 16 ; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: s_lshl_b32 s1, s4, 24 +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_lshr_b32 s0, s0, 24 ; GCN-NEXT: ; return to shader part epilog %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr @@ -360,11 +354,12 @@ define i8 @extractelement_vgpr_v4i8_idx0(<4 x i8> addrspace(1)* %ptr) { ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: s_setpc_b64 
s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v4i8_idx0: @@ -377,11 +372,12 @@ define i8 @extractelement_vgpr_v4i8_idx0(<4 x i8> addrspace(1)* %ptr) { ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v4i8_idx0: @@ -396,11 +392,10 @@ define i8 @extractelement_vgpr_v4i8_idx0(<4 x i8> addrspace(1)* %ptr) { ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -421,11 +416,12 @@ define i8 @extractelement_vgpr_v4i8_idx1(<4 x i8> addrspace(1)* %ptr) { ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -439,11 +435,12 @@ define i8 @extractelement_vgpr_v4i8_idx1(<4 x i8> addrspace(1)* %ptr) { ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -459,11 +456,10 @@ define i8 @extractelement_vgpr_v4i8_idx1(<4 x i8> addrspace(1)* %ptr) { ; GFX7-NEXT: 
v_lshrrev_b32_e32 v1, 8, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -485,11 +481,12 @@ define i8 @extractelement_vgpr_v4i8_idx2(<4 x i8> addrspace(1)* %ptr) { ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -503,11 +500,12 @@ define i8 @extractelement_vgpr_v4i8_idx2(<4 x i8> addrspace(1)* %ptr) { ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_and_b32_sdwa v0, 
v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -523,11 +521,10 @@ define i8 @extractelement_vgpr_v4i8_idx2(<4 x i8> addrspace(1)* %ptr) { ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -549,11 +546,12 @@ define i8 @extractelement_vgpr_v4i8_idx3(<4 x i8> addrspace(1)* %ptr) { ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: 
v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -567,11 +565,12 @@ define i8 @extractelement_vgpr_v4i8_idx3(<4 x i8> addrspace(1)* %ptr) { ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -587,11 +586,10 @@ define i8 @extractelement_vgpr_v4i8_idx3(<4 x i8> addrspace(1)* %ptr) { ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -620,8 +618,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_sgpr_idx(<8 x i8> addrspace(4)* in ; GCN-NEXT: s_and_b32 s2, s3, s9 ; GCN-NEXT: s_lshl_b32 s2, s2, 16 ; GCN-NEXT: 
s_or_b32 s0, s0, s2 -; GCN-NEXT: s_and_b32 s2, s5, s9 -; GCN-NEXT: s_lshl_b32 s2, s2, 24 +; GCN-NEXT: s_lshl_b32 s2, s5, 24 ; GCN-NEXT: s_lshr_b32 s6, s1, 8 ; GCN-NEXT: s_or_b32 s0, s0, s2 ; GCN-NEXT: s_and_b32 s2, s6, s9 @@ -633,8 +630,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_sgpr_idx(<8 x i8> addrspace(4)* in ; GCN-NEXT: s_and_b32 s2, s7, s9 ; GCN-NEXT: s_lshl_b32 s2, s2, 16 ; GCN-NEXT: s_or_b32 s1, s1, s2 -; GCN-NEXT: s_and_b32 s2, s8, s9 -; GCN-NEXT: s_lshl_b32 s2, s2, 24 +; GCN-NEXT: s_lshl_b32 s2, s8, 24 ; GCN-NEXT: s_or_b32 s1, s1, s2 ; GCN-NEXT: s_lshr_b32 s2, s4, 2 ; GCN-NEXT: s_cmp_eq_u32 s2, 1 @@ -653,24 +649,25 @@ define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(<8 x i8> addrspace(1)* %p ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 -; GFX9-NEXT: v_mov_b32_e32 v2, 8 ; GFX9-NEXT: s_movk_i32 s1, 0xff ; GFX9-NEXT: s_lshr_b32 s3, s2, 2 ; GFX9-NEXT: s_and_b32 s2, s2, 3 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v5, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v6, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v6, v0, s1 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_sdwa v7, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v8, v1, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v1, s1, v2 -; GFX9-NEXT: v_or3_b32 v0, v0, v5, v6 -; GFX9-NEXT: v_or3_b32 v1, v1, v7, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX9-NEXT: v_and_or_b32 v1, v1, s1, v4 +; GFX9-NEXT: v_or3_b32 v0, v0, v6, v3 +; GFX9-NEXT: v_or3_b32 v1, v1, v7, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX9-NEXT: s_lshl_b32 s0, s2, 3 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, s0, v0 @@ -690,19 +687,21 @@ define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(<8 x i8> addrspace(1)* %p ; GFX8-NEXT: s_lshl_b32 s0, s1, 3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v7, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v8, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v9, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v9, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: 
v_and_b32_sdwa v4, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v4, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v9 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v6 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v8 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v6 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 @@ -730,11 +729,9 @@ define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(<8 x i8> addrspace(1)* %p ; GFX7-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX7-NEXT: v_and_b32_e32 v6, s0, v6 ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX7-NEXT: v_and_b32_e32 v4, s0, v4 -; GFX7-NEXT: v_and_b32_e32 v7, s0, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 @@ -761,24 +758,25 @@ define i8 @extractelement_vgpr_v8i8_vgpr_idx(<8 x i8> addrspace(1)* %ptr, i32 %i ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_mov_b32 s4, 8 -; GFX9-NEXT: v_mov_b32_e32 v3, 8 ; GFX9-NEXT: s_movk_i32 s5, 0xff -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 2, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 2, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, s4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v7, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v8, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v8, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_sdwa v9, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v10, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v1, s5, v3 -; GFX9-NEXT: v_or3_b32 v0, v0, v7, v8 -; GFX9-NEXT: v_or3_b32 v1, v1, v9, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX9-NEXT: v_and_or_b32 v1, v1, s5, v6 +; GFX9-NEXT: v_or3_b32 v0, v0, v8, v5 +; GFX9-NEXT: v_or3_b32 v1, v1, v9, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, v1, v0 @@ -797,19 +795,21 @@ define i8 @extractelement_vgpr_v8i8_vgpr_idx(<8 x i8> addrspace(1)* %ptr, i32 %i ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, 
v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v9, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v10, v0, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v11, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v11, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v5, v1, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v5, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v11 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 24, v8 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v10 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v0 @@ -841,8 +841,6 @@ define i8 @extractelement_vgpr_v8i8_vgpr_idx(<8 x i8> addrspace(1)* %ptr, i32 %i ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; 
GFX7-NEXT: v_and_b32_e32 v6, s4, v6 -; GFX7-NEXT: v_and_b32_e32 v9, s4, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 @@ -881,8 +879,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_vgpr_idx(<8 x i8> addrspace(4)* in ; GCN-NEXT: s_and_b32 s2, s3, s8 ; GCN-NEXT: s_lshl_b32 s2, s2, 16 ; GCN-NEXT: s_or_b32 s0, s0, s2 -; GCN-NEXT: s_and_b32 s2, s4, s8 -; GCN-NEXT: s_lshl_b32 s2, s2, 24 +; GCN-NEXT: s_lshl_b32 s2, s4, 24 ; GCN-NEXT: s_lshr_b32 s5, s1, 8 ; GCN-NEXT: s_or_b32 s0, s0, s2 ; GCN-NEXT: s_and_b32 s2, s5, s8 @@ -894,8 +891,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_vgpr_idx(<8 x i8> addrspace(4)* in ; GCN-NEXT: s_and_b32 s2, s6, s8 ; GCN-NEXT: s_lshl_b32 s2, s2, 16 ; GCN-NEXT: s_or_b32 s1, s1, s2 -; GCN-NEXT: s_and_b32 s2, s7, s8 -; GCN-NEXT: s_lshl_b32 s2, s2, 24 +; GCN-NEXT: s_lshl_b32 s2, s7, 24 ; GCN-NEXT: s_or_b32 s1, s1, s2 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: v_mov_b32_e32 v3, s1 @@ -925,8 +921,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx0(<8 x i8> addrspace(4)* inreg ; GCN-NEXT: s_and_b32 s1, s2, s4 ; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_and_b32 s1, s3, s4 -; GCN-NEXT: s_lshl_b32 s1, s1, 24 +; GCN-NEXT: s_lshl_b32 s1, s3, 24 ; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr @@ -950,8 +945,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx1(<8 x i8> addrspace(4)* inreg ; GCN-NEXT: s_and_b32 s1, s2, s4 ; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_and_b32 s1, s3, s4 -; GCN-NEXT: s_lshl_b32 s1, s1, 24 +; GCN-NEXT: s_lshl_b32 s1, s3, 24 ; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_lshr_b32 s0, s0, 8 ; GCN-NEXT: ; return to shader part epilog @@ -976,8 +970,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx2(<8 x i8> addrspace(4)* inreg ; GCN-NEXT: s_and_b32 s1, s2, s4 ; GCN-NEXT: s_lshl_b32 
s1, s1, 16 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_and_b32 s1, s3, s4 -; GCN-NEXT: s_lshl_b32 s1, s1, 24 +; GCN-NEXT: s_lshl_b32 s1, s3, 24 ; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_lshr_b32 s0, s0, 16 ; GCN-NEXT: ; return to shader part epilog @@ -1002,8 +995,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx3(<8 x i8> addrspace(4)* inreg ; GCN-NEXT: s_and_b32 s1, s2, s4 ; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_and_b32 s1, s3, s4 -; GCN-NEXT: s_lshl_b32 s1, s1, 24 +; GCN-NEXT: s_lshl_b32 s1, s3, 24 ; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_lshr_b32 s0, s0, 24 ; GCN-NEXT: ; return to shader part epilog @@ -1028,8 +1020,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx4(<8 x i8> addrspace(4)* inreg ; GCN-NEXT: s_and_b32 s1, s2, s4 ; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_and_b32 s1, s3, s4 -; GCN-NEXT: s_lshl_b32 s1, s1, 24 +; GCN-NEXT: s_lshl_b32 s1, s3, 24 ; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr @@ -1053,8 +1044,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx5(<8 x i8> addrspace(4)* inreg ; GCN-NEXT: s_and_b32 s1, s2, s4 ; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_and_b32 s1, s3, s4 -; GCN-NEXT: s_lshl_b32 s1, s1, 24 +; GCN-NEXT: s_lshl_b32 s1, s3, 24 ; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_lshr_b32 s0, s0, 8 ; GCN-NEXT: ; return to shader part epilog @@ -1079,8 +1069,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx6(<8 x i8> addrspace(4)* inreg ; GCN-NEXT: s_and_b32 s1, s2, s4 ; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_and_b32 s1, s3, s4 -; GCN-NEXT: s_lshl_b32 s1, s1, 24 +; GCN-NEXT: s_lshl_b32 s1, s3, 24 ; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_lshr_b32 s0, s0, 16 ; GCN-NEXT: ; return to shader part epilog @@ -1105,8 +1094,7 @@ define amdgpu_ps i8 
@extractelement_sgpr_v8i8_idx7(<8 x i8> addrspace(4)* inreg ; GCN-NEXT: s_and_b32 s1, s2, s4 ; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_and_b32 s1, s3, s4 -; GCN-NEXT: s_lshl_b32 s1, s1, 24 +; GCN-NEXT: s_lshl_b32 s1, s3, 24 ; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_lshr_b32 s0, s0, 24 ; GCN-NEXT: ; return to shader part epilog @@ -1124,11 +1112,12 @@ define i8 @extractelement_vgpr_v8i8_idx0(<8 x i8> addrspace(1)* %ptr) { ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v8i8_idx0: @@ -1141,11 +1130,12 @@ define i8 @extractelement_vgpr_v8i8_idx0(<8 x i8> addrspace(1)* %ptr) { ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: 
v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v8i8_idx0: @@ -1160,11 +1150,10 @@ define i8 @extractelement_vgpr_v8i8_idx0(<8 x i8> addrspace(1)* %ptr) { ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1185,11 +1174,12 @@ define i8 @extractelement_vgpr_v8i8_idx1(<8 x i8> addrspace(1)* %ptr) { ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: 
s_setpc_b64 s[30:31] ; @@ -1203,11 +1193,12 @@ define i8 @extractelement_vgpr_v8i8_idx1(<8 x i8> addrspace(1)* %ptr) { ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1223,11 +1214,10 @@ define i8 @extractelement_vgpr_v8i8_idx1(<8 x i8> addrspace(1)* %ptr) { ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1249,11 +1239,12 @@ define i8 @extractelement_vgpr_v8i8_idx2(<8 x i8> addrspace(1)* %ptr) { ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: 
v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1267,11 +1258,12 @@ define i8 @extractelement_vgpr_v8i8_idx2(<8 x i8> addrspace(1)* %ptr) { ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1287,11 +1279,10 @@ define i8 @extractelement_vgpr_v8i8_idx2(<8 x i8> addrspace(1)* %ptr) { ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX7-NEXT: 
v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1313,11 +1304,12 @@ define i8 @extractelement_vgpr_v8i8_idx3(<8 x i8> addrspace(1)* %ptr) { ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1331,11 +1323,12 @@ define i8 @extractelement_vgpr_v8i8_idx3(<8 x i8> addrspace(1)* %ptr) { ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD 
src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1351,11 +1344,10 @@ define i8 @extractelement_vgpr_v8i8_idx3(<8 x i8> addrspace(1)* %ptr) { ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1378,10 +1370,11 @@ define i8 @extractelement_vgpr_v8i8_idx4(<8 x i8> addrspace(1)* %ptr) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v0, v1, s5, v0 -; GFX9-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v8i8_idx4: @@ -1394,10 
+1387,11 @@ define i8 @extractelement_vgpr_v8i8_idx4(<8 x i8> addrspace(1)* %ptr) { ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1413,11 +1407,10 @@ define i8 @extractelement_vgpr_v8i8_idx4(<8 x i8> addrspace(1)* %ptr) { ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1439,10 +1432,11 @@ define i8 @extractelement_vgpr_v8i8_idx5(<8 x i8> addrspace(1)* %ptr) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v1, s5 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v0, v1, s5, v0 -; GFX9-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1456,10 +1450,11 @@ define i8 @extractelement_vgpr_v8i8_idx5(<8 x i8> addrspace(1)* %ptr) { ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1476,11 +1471,10 @@ define i8 @extractelement_vgpr_v8i8_idx5(<8 x i8> addrspace(1)* %ptr) { ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1503,10 +1497,11 @@ define i8 
@extractelement_vgpr_v8i8_idx6(<8 x i8> addrspace(1)* %ptr) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v0, v1, s5, v0 -; GFX9-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1520,10 +1515,11 @@ define i8 @extractelement_vgpr_v8i8_idx6(<8 x i8> addrspace(1)* %ptr) { ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1540,11 +1536,10 @@ define i8 @extractelement_vgpr_v8i8_idx6(<8 x i8> addrspace(1)* %ptr) { ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX7-NEXT: 
v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1567,10 +1562,11 @@ define i8 @extractelement_vgpr_v8i8_idx7(<8 x i8> addrspace(1)* %ptr) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v0, v1, s5, v0 -; GFX9-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1584,10 +1580,11 @@ define i8 @extractelement_vgpr_v8i8_idx7(<8 x i8> addrspace(1)* %ptr) { ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; 
GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1604,11 +1601,10 @@ define i8 @extractelement_vgpr_v8i8_idx7(<8 x i8> addrspace(1)* %ptr) { ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1637,8 +1633,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_sgpr_idx(<16 x i8> addrspace(4)* ; GCN-NEXT: s_and_b32 s5, s6, s17 ; GCN-NEXT: s_lshl_b32 s5, s5, 16 ; GCN-NEXT: s_or_b32 s0, s0, s5 -; GCN-NEXT: s_and_b32 s5, s7, s17 -; GCN-NEXT: s_lshl_b32 s5, s5, 24 +; GCN-NEXT: s_lshl_b32 s5, s7, 24 ; GCN-NEXT: s_lshr_b32 s8, s1, 8 ; GCN-NEXT: s_or_b32 s0, s0, s5 ; GCN-NEXT: s_and_b32 s5, s8, s17 @@ -1650,8 +1645,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_sgpr_idx(<16 x i8> addrspace(4)* ; GCN-NEXT: s_and_b32 s5, s9, s17 ; GCN-NEXT: s_lshl_b32 s5, s5, 16 ; GCN-NEXT: s_or_b32 s1, s1, s5 -; GCN-NEXT: s_and_b32 s5, s10, s17 -; GCN-NEXT: s_lshl_b32 s5, s5, 24 +; GCN-NEXT: s_lshl_b32 s5, s10, 24 ; GCN-NEXT: s_lshr_b32 s11, s2, 8 ; GCN-NEXT: s_or_b32 s1, s1, s5 ; GCN-NEXT: s_and_b32 s5, s11, s17 @@ -1663,8 +1657,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_sgpr_idx(<16 x i8> addrspace(4)* ; GCN-NEXT: s_and_b32 s5, s12, s17 ; GCN-NEXT: s_lshl_b32 s5, s5, 16 ; GCN-NEXT: s_or_b32 s2, s2, s5 -; GCN-NEXT: s_and_b32 s5, s13, s17 -; GCN-NEXT: s_lshl_b32 s5, s5, 24 +; GCN-NEXT: s_lshl_b32 s5, s13, 24 
; GCN-NEXT: s_lshr_b32 s14, s3, 8 ; GCN-NEXT: s_or_b32 s2, s2, s5 ; GCN-NEXT: s_and_b32 s5, s14, s17 @@ -1676,8 +1669,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_sgpr_idx(<16 x i8> addrspace(4)* ; GCN-NEXT: s_and_b32 s5, s15, s17 ; GCN-NEXT: s_lshl_b32 s5, s5, 16 ; GCN-NEXT: s_or_b32 s3, s3, s5 -; GCN-NEXT: s_and_b32 s5, s16, s17 -; GCN-NEXT: s_lshl_b32 s5, s5, 24 +; GCN-NEXT: s_lshl_b32 s5, s16, 24 ; GCN-NEXT: s_or_b32 s3, s3, s5 ; GCN-NEXT: s_lshr_b32 s5, s4, 2 ; GCN-NEXT: s_cmp_eq_u32 s5, 1 @@ -1702,38 +1694,42 @@ define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(<16 x i8> addrspace(1)* ; GFX9-NEXT: s_mov_b32 s0, 8 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: s_movk_i32 s1, 0xff -; GFX9-NEXT: v_mov_b32_e32 v4, 0xff ; GFX9-NEXT: s_lshr_b32 s3, s2, 2 +; GFX9-NEXT: v_mov_b32_e32 v4, 0xff ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 1 ; GFX9-NEXT: s_and_b32 s2, s2, 3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v10, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v11, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX9-NEXT: v_and_b32_sdwa v14, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v15, v1, 
s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v10, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v6 -; GFX9-NEXT: v_and_b32_sdwa v12, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v13, v1, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v1, s1, v7 -; GFX9-NEXT: v_and_b32_sdwa v14, v2, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v15, v2, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v2, v2, s1, v8 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or3_b32 v0, v0, v10, v11 -; GFX9-NEXT: v_or3_b32 v1, v1, v12, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX9-NEXT: v_and_or_b32 v1, v1, s1, v8 +; GFX9-NEXT: v_and_b32_sdwa v16, v2, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 24, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX9-NEXT: v_and_or_b32 v2, v2, s1, v10 +; GFX9-NEXT: v_or3_b32 v0, v0, v14, v7 +; GFX9-NEXT: v_or3_b32 v1, v1, v15, v9 +; GFX9-NEXT: v_and_b32_sdwa v17, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX9-NEXT: v_and_b32_sdwa v16, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v17, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 24, v13 ; GFX9-NEXT: v_and_or_b32 v3, v3, v4, 
v5 -; GFX9-NEXT: v_or3_b32 v2, v2, v14, v15 +; GFX9-NEXT: v_or3_b32 v2, v2, v16, v11 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_or3_b32 v3, v3, v16, v17 +; GFX9-NEXT: v_or3_b32 v3, v3, v17, v12 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: s_lshl_b32 s0, s2, 3 @@ -1754,37 +1750,41 @@ define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(<16 x i8> addrspace(1)* ; GFX8-NEXT: s_and_b32 s1, s2, 3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v12, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v13, v0, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 8, v2 +; GFX8-NEXT: v_and_b32_sdwa v16, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v14, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v7, v1, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v12 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v14 -; GFX8-NEXT: v_and_b32_sdwa v15, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v16, v2, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v15 -; GFX8-NEXT: v_and_b32_sdwa v17, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v4, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v13 +; GFX8-NEXT: v_and_b32_sdwa v7, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v11 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v3 +; GFX8-NEXT: v_lshlrev_b32_sdwa v11, v6, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v16 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v2 +; GFX8-NEXT: v_and_b32_sdwa v17, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; GFX8-NEXT: v_and_b32_sdwa v4, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 
+; GFX8-NEXT: v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v13 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v17 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v10 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX8-NEXT: v_or_b32_e32 v2, v2, v16 +; GFX8-NEXT: v_lshlrev_b32_e32 v13, 24, v15 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v12 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v13 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX8-NEXT: s_lshl_b32 s0, s1, 3 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, s0, v0 @@ -1816,27 +1816,24 @@ define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(<16 x i8> addrspace(1)* ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 8, v3 ; GFX7-NEXT: v_and_b32_e32 v6, s0, v6 ; GFX7-NEXT: v_and_b32_e32 v9, s0, v9 -; GFX7-NEXT: v_and_b32_e32 v11, v11, v4 +; GFX7-NEXT: v_and_b32_e32 v11, s0, v11 ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 ; GFX7-NEXT: v_lshrrev_b32_e32 v13, 24, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v7, s0, v7 -; GFX7-NEXT: v_and_b32_e32 v10, s0, v10 ; GFX7-NEXT: v_and_b32_e32 v12, v12, v4 ; GFX7-NEXT: v_and_b32_e32 v14, v14, v4 ; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v8 -; GFX7-NEXT: v_and_b32_e32 v13, v13, v4 -; GFX7-NEXT: v_and_b32_e32 v15, v15, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v16, 24, v3 ; GFX7-NEXT: v_and_b32_e32 v3, v3, v4 +; GFX7-NEXT: v_and_b32_e32 v4, v15, v4 
; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 @@ -1844,20 +1841,19 @@ define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(<16 x i8> addrspace(1)* ; GFX7-NEXT: v_or_b32_e32 v1, v1, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v11 -; GFX7-NEXT: v_and_b32_e32 v4, v16, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v13, 24, v13 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v12 -; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v14 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v7 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v10 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 24, v16 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v13 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 2 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v15 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 3 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX7-NEXT: s_lshl_b32 s0, s2, 3 @@ -1877,38 +1873,42 @@ define i8 @extractelement_vgpr_v16i8_vgpr_idx(<16 x i8> addrspace(1)* %ptr, i32 ; GFX9-NEXT: s_mov_b32 s4, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: s_movk_i32 s5, 0xff -; GFX9-NEXT: v_mov_b32_e32 v0, 0xff ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 2, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xff ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 ; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 8, v5 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v14, 8, v6 ; GFX9-NEXT: v_lshlrev_b32_sdwa v8, s4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v10, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v12, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v13, v3, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v14, v4, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v15, v4, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 24, v5 +; GFX9-NEXT: v_and_b32_sdwa v16, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v17, v4, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_and_or_b32 v3, v3, s5, v8 -; GFX9-NEXT: v_and_or_b32 v4, v4, s5, v9 -; GFX9-NEXT: v_and_b32_sdwa v16, v5, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v17, v5, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v18, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v19, v6, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 
src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v4, v4, s5, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v6 +; GFX9-NEXT: v_and_b32_sdwa v18, v5, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v19, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v0, v6, v0, v1 -; GFX9-NEXT: v_or3_b32 v1, v3, v12, v13 -; GFX9-NEXT: v_or3_b32 v3, v4, v14, v15 -; GFX9-NEXT: v_and_or_b32 v5, v5, s5, v10 +; GFX9-NEXT: v_or3_b32 v1, v3, v16, v9 +; GFX9-NEXT: v_or3_b32 v3, v4, v17, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; GFX9-NEXT: v_and_or_b32 v5, v5, s5, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 24, v15 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_or3_b32 v4, v5, v16, v17 +; GFX9-NEXT: v_or3_b32 v4, v5, v18, v13 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_or3_b32 v0, v0, v18, v19 +; GFX9-NEXT: v_or3_b32 v0, v0, v19, v14 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v2 @@ -1929,36 +1929,40 @@ define i8 @extractelement_vgpr_v16i8_vgpr_idx(<16 x i8> addrspace(1)* %ptr, i32 ; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 8, v4 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 8, v5 -; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v7, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 8, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v11 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v4 ; GFX8-NEXT: v_lshlrev_b32_sdwa v11, v7, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; 
GFX8-NEXT: v_and_b32_sdwa v14, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v5 +; GFX8-NEXT: v_and_b32_sdwa v18, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v15, v3, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v16, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v14 -; GFX8-NEXT: v_and_b32_sdwa v8, v4, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v3, v3, v16 -; GFX8-NEXT: v_and_b32_sdwa v17, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v4, v5, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v18, v5, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v4, v4, v17 -; GFX8-NEXT: v_and_b32_sdwa v19, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v15 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v13 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 8, v6 +; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v7, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v8, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v1, v18 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 24, v5 +; GFX8-NEXT: v_and_b32_sdwa v19, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v5, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 24, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 24, v15 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v19 +; GFX8-NEXT: v_and_b32_sdwa v0, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v1, v10 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v12 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX8-NEXT: v_and_b32_sdwa v0, v6, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v5, v5, v19 -; GFX8-NEXT: v_or_b32_e32 v4, v4, v18 +; GFX8-NEXT: v_lshlrev_b32_e32 v15, 24, v17 +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v14 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v9 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v15 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v9 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v2 @@ -1991,27 +1995,24 @@ define i8 @extractelement_vgpr_v16i8_vgpr_idx(<16 x i8> addrspace(1)* %ptr, i32 ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 8, v6 ; GFX7-NEXT: v_and_b32_e32 v7, s4, v7 ; GFX7-NEXT: v_and_b32_e32 v10, s4, v10 -; GFX7-NEXT: v_and_b32_e32 v12, v12, v0 +; GFX7-NEXT: v_and_b32_e32 v12, s4, v12 ; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 ; GFX7-NEXT: 
v_lshlrev_b32_e32 v9, 8, v9 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 24, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; GFX7-NEXT: v_and_b32_e32 v8, s4, v8 -; GFX7-NEXT: v_and_b32_e32 v11, s4, v11 ; GFX7-NEXT: v_and_b32_e32 v13, v13, v0 ; GFX7-NEXT: v_and_b32_e32 v15, v15, v0 ; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v6 ; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; GFX7-NEXT: v_and_b32_e32 v14, v14, v0 -; GFX7-NEXT: v_and_b32_e32 v16, v16, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v17, 24, v6 ; GFX7-NEXT: v_and_b32_e32 v6, v6, v0 +; GFX7-NEXT: v_and_b32_e32 v0, v16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v11, 24, v11 @@ -2019,20 +2020,19 @@ define i8 @extractelement_vgpr_v16i8_vgpr_idx(<16 x i8> addrspace(1)* %ptr, i32 ; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; GFX7-NEXT: v_or_b32_e32 v4, v5, v12 ; GFX7-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX7-NEXT: v_and_b32_e32 v0, v17, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 24, v14 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v13 -; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_or_b32_e32 v5, v6, v15 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v8 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v11 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v16 +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 24, v17 +; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v14 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v18 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v16 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v18 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v2 @@ -2062,8 
+2062,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_vgpr_idx(<16 x i8> addrspace(4)* ; GCN-NEXT: s_and_b32 s4, s5, s16 ; GCN-NEXT: s_lshl_b32 s4, s4, 16 ; GCN-NEXT: s_or_b32 s0, s0, s4 -; GCN-NEXT: s_and_b32 s4, s6, s16 -; GCN-NEXT: s_lshl_b32 s4, s4, 24 +; GCN-NEXT: s_lshl_b32 s4, s6, 24 ; GCN-NEXT: s_lshr_b32 s7, s1, 8 ; GCN-NEXT: s_or_b32 s0, s0, s4 ; GCN-NEXT: s_and_b32 s4, s7, s16 @@ -2075,8 +2074,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_vgpr_idx(<16 x i8> addrspace(4)* ; GCN-NEXT: s_and_b32 s4, s8, s16 ; GCN-NEXT: s_lshl_b32 s4, s4, 16 ; GCN-NEXT: s_or_b32 s1, s1, s4 -; GCN-NEXT: s_and_b32 s4, s9, s16 -; GCN-NEXT: s_lshl_b32 s4, s4, 24 +; GCN-NEXT: s_lshl_b32 s4, s9, 24 ; GCN-NEXT: s_lshr_b32 s10, s2, 8 ; GCN-NEXT: s_or_b32 s1, s1, s4 ; GCN-NEXT: s_and_b32 s4, s10, s16 @@ -2088,8 +2086,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_vgpr_idx(<16 x i8> addrspace(4)* ; GCN-NEXT: s_and_b32 s4, s11, s16 ; GCN-NEXT: s_lshl_b32 s4, s4, 16 ; GCN-NEXT: s_or_b32 s2, s2, s4 -; GCN-NEXT: s_and_b32 s4, s12, s16 -; GCN-NEXT: s_lshl_b32 s4, s4, 24 +; GCN-NEXT: s_lshl_b32 s4, s12, 24 ; GCN-NEXT: s_lshr_b32 s13, s3, 8 ; GCN-NEXT: s_or_b32 s2, s2, s4 ; GCN-NEXT: s_and_b32 s4, s13, s16 @@ -2101,8 +2098,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_vgpr_idx(<16 x i8> addrspace(4)* ; GCN-NEXT: s_and_b32 s4, s14, s16 ; GCN-NEXT: s_lshl_b32 s4, s4, 16 ; GCN-NEXT: s_or_b32 s3, s3, s4 -; GCN-NEXT: s_and_b32 s4, s15, s16 -; GCN-NEXT: s_lshl_b32 s4, s4, 24 +; GCN-NEXT: s_lshl_b32 s4, s15, 24 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: s_or_b32 s3, s3, s4 @@ -2131,11 +2127,12 @@ define i8 @extractelement_vgpr_v16i8_idx0(<16 x i8> addrspace(1)* %ptr) { ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: 
v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v16i8_idx0: @@ -2148,11 +2145,12 @@ define i8 @extractelement_vgpr_v16i8_idx0(<16 x i8> addrspace(1)* %ptr) { ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v16i8_idx0: @@ -2167,11 +2165,10 @@ define i8 @extractelement_vgpr_v16i8_idx0(<16 x i8> addrspace(1)* %ptr) { ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; 
GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -2192,11 +2189,12 @@ define i8 @extractelement_vgpr_v16i8_idx1(<16 x i8> addrspace(1)* %ptr) { ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2210,11 +2208,12 @@ define i8 @extractelement_vgpr_v16i8_idx1(<16 x i8> addrspace(1)* %ptr) { ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 
+; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -2230,11 +2229,10 @@ define i8 @extractelement_vgpr_v16i8_idx1(<16 x i8> addrspace(1)* %ptr) { ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -2256,11 +2254,12 @@ define i8 @extractelement_vgpr_v16i8_idx2(<16 x i8> addrspace(1)* %ptr) { ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2274,11 +2273,12 @@ define i8 
@extractelement_vgpr_v16i8_idx2(<16 x i8> addrspace(1)* %ptr) { ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -2294,11 +2294,10 @@ define i8 @extractelement_vgpr_v16i8_idx2(<16 x i8> addrspace(1)* %ptr) { ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -2320,11 +2319,12 @@ define i8 @extractelement_vgpr_v16i8_idx3(<16 x i8> addrspace(1)* %ptr) { ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2338,11 +2338,12 @@ define i8 @extractelement_vgpr_v16i8_idx3(<16 x i8> addrspace(1)* %ptr) { ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -2358,11 +2359,10 @@ define i8 @extractelement_vgpr_v16i8_idx3(<16 x i8> addrspace(1)* %ptr) { ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: 
v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -2385,10 +2385,11 @@ define i8 @extractelement_vgpr_v16i8_idx4(<16 x i8> addrspace(1)* %ptr) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v0, v1, s5, v0 -; GFX9-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v16i8_idx4: @@ -2401,10 +2402,11 @@ define i8 @extractelement_vgpr_v16i8_idx4(<16 x i8> addrspace(1)* %ptr) { ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 
dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -2420,11 +2422,10 @@ define i8 @extractelement_vgpr_v16i8_idx4(<16 x i8> addrspace(1)* %ptr) { ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -2446,10 +2447,11 @@ define i8 @extractelement_vgpr_v16i8_idx5(<16 x i8> addrspace(1)* %ptr) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v0, v1, s5, v0 -; GFX9-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2463,10 +2465,11 @@ define i8 @extractelement_vgpr_v16i8_idx5(<16 x i8> addrspace(1)* %ptr) { ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa 
v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -2483,11 +2486,10 @@ define i8 @extractelement_vgpr_v16i8_idx5(<16 x i8> addrspace(1)* %ptr) { ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -2510,10 +2512,11 @@ define i8 @extractelement_vgpr_v16i8_idx6(<16 x i8> addrspace(1)* %ptr) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v0, v1, s5, v0 -; GFX9-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: 
v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2527,10 +2530,11 @@ define i8 @extractelement_vgpr_v16i8_idx6(<16 x i8> addrspace(1)* %ptr) { ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -2547,11 +2551,10 @@ define i8 @extractelement_vgpr_v16i8_idx6(<16 x i8> addrspace(1)* %ptr) { ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -2574,10 +2577,11 @@ define i8 @extractelement_vgpr_v16i8_idx7(<16 x i8> addrspace(1)* %ptr) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; 
GFX9-NEXT: v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v0, v1, s5, v0 -; GFX9-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2591,10 +2595,11 @@ define i8 @extractelement_vgpr_v16i8_idx7(<16 x i8> addrspace(1)* %ptr) { ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -2611,11 +2616,10 @@ define i8 @extractelement_vgpr_v16i8_idx7(<16 x i8> addrspace(1)* %ptr) { ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX7-NEXT: 
v_lshlrev_b32_e32 v0, 8, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -2637,11 +2641,12 @@ define i8 @extractelement_vgpr_v16i8_idx8(<16 x i8> addrspace(1)* %ptr) { ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v1, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v3, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v0, v2, s5, v0 -; GFX9-NEXT: v_and_b32_sdwa v2, v2, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v16i8_idx8: @@ -2654,10 +2659,11 @@ define i8 @extractelement_vgpr_v16i8_idx8(<16 x i8> addrspace(1)* %ptr) { ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v2 +; GFX8-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; GFX8-NEXT: 
v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -2673,11 +2679,10 @@ define i8 @extractelement_vgpr_v16i8_idx8(<16 x i8> addrspace(1)* %ptr) { ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 @@ -2698,11 +2703,12 @@ define i8 @extractelement_vgpr_v16i8_idx9(<16 x i8> addrspace(1)* %ptr) { ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v1, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v3, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v0, v2, s5, v0 -; GFX9-NEXT: v_and_b32_sdwa v2, v2, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2716,10 +2722,11 @@ define i8 @extractelement_vgpr_v16i8_idx9(<16 x i8> addrspace(1)* %ptr) { ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v2 +; 
GFX8-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -2736,11 +2743,10 @@ define i8 @extractelement_vgpr_v16i8_idx9(<16 x i8> addrspace(1)* %ptr) { ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 @@ -2762,11 +2768,12 @@ define i8 @extractelement_vgpr_v16i8_idx10(<16 x i8> addrspace(1)* %ptr) { ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v1, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v3, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v0, v2, s5, v0 -; GFX9-NEXT: v_and_b32_sdwa v2, v2, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: 
v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2780,10 +2787,11 @@ define i8 @extractelement_vgpr_v16i8_idx10(<16 x i8> addrspace(1)* %ptr) { ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v2 +; GFX8-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -2800,11 +2808,10 @@ define i8 @extractelement_vgpr_v16i8_idx10(<16 x i8> addrspace(1)* %ptr) { ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 @@ -2826,11 +2833,12 @@ define i8 @extractelement_vgpr_v16i8_idx11(<16 x i8> addrspace(1)* %ptr) { ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; 
GFX9-NEXT: v_and_b32_sdwa v1, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v3, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v0, v2, s5, v0 -; GFX9-NEXT: v_and_b32_sdwa v2, v2, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2844,10 +2852,11 @@ define i8 @extractelement_vgpr_v16i8_idx11(<16 x i8> addrspace(1)* %ptr) { ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v2 +; GFX8-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -2864,11 +2873,10 @@ define i8 @extractelement_vgpr_v16i8_idx11(<16 x i8> addrspace(1)* %ptr) { ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX7-NEXT: v_and_b32_e32 
v3, s4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 @@ -2890,11 +2898,12 @@ define i8 @extractelement_vgpr_v16i8_idx12(<16 x i8> addrspace(1)* %ptr) { ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v1, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v2, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v0, v3, s5, v0 -; GFX9-NEXT: v_and_b32_sdwa v2, v3, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v16i8_idx12: @@ -2907,10 +2916,11 @@ define i8 @extractelement_vgpr_v16i8_idx12(<16 x i8> addrspace(1)* %ptr) { ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v3 +; GFX8-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] 
; @@ -2926,11 +2936,10 @@ define i8 @extractelement_vgpr_v16i8_idx12(<16 x i8> addrspace(1)* %ptr) { ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v3 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v3 ; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 @@ -2951,11 +2960,12 @@ define i8 @extractelement_vgpr_v16i8_idx13(<16 x i8> addrspace(1)* %ptr) { ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v1, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v2, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v0, v3, s5, v0 -; GFX9-NEXT: v_and_b32_sdwa v2, v3, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2969,10 +2979,11 @@ define i8 @extractelement_vgpr_v16i8_idx13(<16 x i8> addrspace(1)* %ptr) { ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v3 +; GFX8-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -2989,11 +3000,10 @@ define i8 @extractelement_vgpr_v16i8_idx13(<16 x i8> addrspace(1)* %ptr) { ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v3 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v3 ; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3015,11 +3025,12 @@ define i8 @extractelement_vgpr_v16i8_idx14(<16 x i8> addrspace(1)* %ptr) { ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v1, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v2, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v0, v3, s5, v0 -; GFX9-NEXT: v_and_b32_sdwa v2, v3, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ 
-3033,10 +3044,11 @@ define i8 @extractelement_vgpr_v16i8_idx14(<16 x i8> addrspace(1)* %ptr) { ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v3 +; GFX8-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -3053,11 +3065,10 @@ define i8 @extractelement_vgpr_v16i8_idx14(<16 x i8> addrspace(1)* %ptr) { ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v3 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v3 ; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3079,11 +3090,12 @@ define i8 @extractelement_vgpr_v16i8_idx15(<16 x i8> addrspace(1)* %ptr) { ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v1, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v2, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v0, v3, s5, v0 -; GFX9-NEXT: v_and_b32_sdwa v2, v3, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3097,10 +3109,11 @@ define i8 @extractelement_vgpr_v16i8_idx15(<16 x i8> addrspace(1)* %ptr) { ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v3 +; GFX8-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -3117,11 +3130,10 @@ define i8 @extractelement_vgpr_v16i8_idx15(<16 x i8> addrspace(1)* %ptr) { ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v3 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v3 ; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: 
v_or_b32_e32 v0, v3, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll index 7c3e74dfcf695..2951bb86a16fd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll @@ -596,18 +596,20 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(<4 x i8> addrspace(1)* %ptr, i8 ; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v4, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v5, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v5, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v3 -; GFX9-NEXT: v_or3_b32 v0, v0, v4, v5 +; GFX9-NEXT: v_or3_b32 v0, v0, v5, v4 ; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v3, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v4, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v4, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v1 -; GFX9-NEXT: v_or3_b32 v2, v0, v3, v4 +; GFX9-NEXT: v_or3_b32 v2, v0, v4, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; 
GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -618,8 +620,8 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(<4 x i8> addrspace(1)* %ptr, i8 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: v_mov_b32_e32 v1, 8 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NEXT: s_and_b32 s1, s3, 3 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 3 ; GFX8-NEXT: s_and_b32 s2, s2, s0 ; GFX8-NEXT: s_lshl_b32 s0, s0, s1 @@ -629,20 +631,22 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(<4 x i8> addrspace(1)* %ptr, i8 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v5, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v6, v0, v3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v6, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v5 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v3, v0, v3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v3, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: 
v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_or_b32_e32 v2, v0, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -670,7 +674,6 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(<4 x i8> addrspace(1)* %ptr, i8 ; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -685,7 +688,6 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(<4 x i8> addrspace(1)* %ptr, i8 ; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -717,8 +719,7 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt ; GFX9-NEXT: s_and_b32 s2, s3, s6 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9-NEXT: s_or_b32 s1, s1, s2 -; GFX9-NEXT: s_and_b32 s2, s5, s6 -; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_lshl_b32 s2, s5, 24 ; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: s_and_b32 s2, s4, 3 ; GFX9-NEXT: s_lshl_b32 s2, s2, 3 @@ -727,11 +728,12 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, s2, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_and_or_b32 v1, v0, s6, v1 -; GFX9-NEXT: 
v_and_b32_sdwa v2, v0, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v2, v1, v2, v0 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX9-NEXT: v_or3_b32 v2, v1, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -741,7 +743,7 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX8-NEXT: s_movk_i32 s5, 0xff -; GFX8-NEXT: v_mov_b32_e32 v2, 8 +; GFX8-NEXT: v_mov_b32_e32 v3, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s1, s0, 8 ; GFX8-NEXT: s_and_b32 s1, s1, s5 @@ -753,8 +755,7 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt ; GFX8-NEXT: s_and_b32 s1, s2, s5 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s3, s5 -; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: s_lshl_b32 s1, s3, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s4, 3 ; GFX8-NEXT: s_lshl_b32 s1, s1, 3 @@ -764,13 +765,14 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt ; GFX8-NEXT: s_andn2_b32 s0, s0, s1 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: 
v_and_b32_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v1, v0 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -792,8 +794,7 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt ; GFX7-NEXT: s_and_b32 s1, s2, s5 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_and_b32 s1, s3, s5 -; GFX7-NEXT: s_lshl_b32 s1, s1, 24 +; GFX7-NEXT: s_lshl_b32 s1, s3, 24 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: s_and_b32 s1, s4, 3 ; GFX7-NEXT: s_lshl_b32 s1, s1, 3 @@ -811,8 +812,7 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt ; GFX7-NEXT: v_and_b32_e32 v1, s5, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, s5, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -844,8 +844,7 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %pt ; GFX9-NEXT: s_and_b32 s2, s3, s6 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9-NEXT: s_or_b32 s1, s1, s2 -; GFX9-NEXT: s_and_b32 s2, s5, s6 -; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_lshl_b32 s2, s5, 24 ; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: s_and_b32 s2, s4, s6 ; GFX9-NEXT: v_lshlrev_b32_e64 v1, v0, s2 @@ -853,11 +852,12 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %pt ; GFX9-NEXT: 
v_xor_b32_e32 v0, -1, v0 ; GFX9-NEXT: v_and_or_b32 v0, s1, v0, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_and_or_b32 v1, v0, s6, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v0, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v2, v1, v2, v0 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX9-NEXT: v_or3_b32 v2, v1, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -869,7 +869,7 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %pt ; GFX8-NEXT: s_movk_i32 s5, 0xff ; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 8 +; GFX8-NEXT: v_mov_b32_e32 v3, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s1, s0, 8 ; GFX8-NEXT: s_and_b32 s1, s1, s5 @@ -881,8 +881,7 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %pt ; GFX8-NEXT: s_and_b32 s1, s2, s5 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s3, s5 -; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: s_lshl_b32 s1, s3, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s4, s5 ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v0, s1 @@ -891,13 +890,14 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %pt ; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; 
GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v1, v0 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -920,8 +920,7 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %pt ; GFX7-NEXT: s_and_b32 s1, s2, s5 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_and_b32 s1, s3, s5 -; GFX7-NEXT: s_lshl_b32 s1, s1, 24 +; GFX7-NEXT: s_lshl_b32 s1, s3, 24 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: s_and_b32 s1, s4, s5 ; GFX7-NEXT: v_lshl_b32_e32 v1, s1, v0 @@ -939,8 +938,7 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %pt ; GFX7-NEXT: v_and_b32_e32 v1, s5, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, s5, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -971,20 +969,20 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(<4 x i8> addrspace(4)* inreg %pt ; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: s_and_b32 s2, s3, s5 ; GFX9-NEXT: 
s_lshl_b32 s2, s2, 16 -; GFX9-NEXT: s_or_b32 s1, s1, s2 -; GFX9-NEXT: s_and_b32 s2, s4, s5 ; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s5 -; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_or_b32 s1, s1, s2 +; GFX9-NEXT: s_lshl_b32 s2, s4, 24 ; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX9-NEXT: v_and_or_b32 v0, s1, v1, v0 ; GFX9-NEXT: s_mov_b32 s0, 8 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v2, v1, v2, v0 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX9-NEXT: v_or3_b32 v2, v1, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -1007,23 +1005,23 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(<4 x i8> addrspace(4)* inreg %pt ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s2, s4 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s3, s4 ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s4 -; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_lshl_b32 s1, s3, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX8-NEXT: v_and_b32_e32 v1, s0, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 8 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, 8 
+; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v1, v0 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1047,8 +1045,7 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(<4 x i8> addrspace(4)* inreg %pt ; GFX7-NEXT: s_and_b32 s1, s2, s4 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_and_b32 s1, s3, s4 -; GFX7-NEXT: s_lshl_b32 s1, s1, 24 +; GFX7-NEXT: s_lshl_b32 s1, s3, 24 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v1, s4, v1 ; GFX7-NEXT: s_or_b32 s0, s0, s1 @@ -1065,8 +1062,7 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(<4 x i8> addrspace(4)* inreg %pt ; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -1094,18 +1090,20 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(<4 x i8> addrspace(1)* %ptr, i8 ; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, 
v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v5, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v6, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v6, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v4 -; GFX9-NEXT: v_or3_b32 v0, v0, v5, v6 +; GFX9-NEXT: v_or3_b32 v0, v0, v6, v5 ; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v3, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v4, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v4, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v1 -; GFX9-NEXT: v_or3_b32 v2, v0, v3, v4 +; GFX9-NEXT: v_or3_b32 v2, v0, v4, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -1116,8 +1114,8 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(<4 x i8> addrspace(1)* %ptr, i8 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: v_mov_b32_e32 v1, 8 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: s_and_b32 s1, s2, s0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2 ; GFX8-NEXT: v_lshlrev_b32_e64 v5, v2, s1 @@ -1127,20 +1125,22 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(<4 x i8> 
addrspace(1)* %ptr, i8 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v7, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v8, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v8, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v7 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX8-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v2, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v4, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v4, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v0, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1168,7 +1168,6 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(<4 x i8> addrspace(1)* %ptr, i8 ; GFX7-NEXT: v_and_b32_e32 v4, s0, v4 ; GFX7-NEXT: 
v_and_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX7-NEXT: v_and_b32_e32 v5, s0, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 @@ -1183,7 +1182,6 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(<4 x i8> addrspace(1)* %ptr, i8 ; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -1211,18 +1209,20 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(<4 x i8> addrspace(1)* %ptr, i8 ; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v4, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v5, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v5, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v3 -; GFX9-NEXT: v_or3_b32 v0, v0, v4, v5 +; GFX9-NEXT: v_or3_b32 v0, v0, v5, v4 ; GFX9-NEXT: v_and_or_b32 v0, v0, s2, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v3, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v4, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v4, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v1 -; GFX9-NEXT: v_or3_b32 v2, v0, v3, v4 +; GFX9-NEXT: v_or3_b32 v2, v0, v4, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -1244,20 +1244,22 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(<4 x i8> addrspace(1)* %ptr, i8 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v6, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v7, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v7, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 ; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v2, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v4, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v4, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v0, v4 +; 
GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1285,7 +1287,6 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(<4 x i8> addrspace(1)* %ptr, i8 ; GFX7-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX7-NEXT: v_and_b32_e32 v4, s0, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 @@ -1300,7 +1301,6 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(<4 x i8> addrspace(1)* %ptr, i8 ; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -1329,18 +1329,20 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(<4 x i8> addrspace(1)* %ptr, i8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v5, s0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v6, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v7, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v7, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v5 -; GFX9-NEXT: v_or3_b32 v0, v0, v6, v7 +; GFX9-NEXT: v_or3_b32 v0, v0, v7, v6 ; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v4, 
v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v5, v0, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX9-NEXT: v_and_or_b32 v0, v0, v1, v2 -; GFX9-NEXT: v_or3_b32 v2, v0, v3, v5 +; GFX9-NEXT: v_or3_b32 v2, v0, v5, v3 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -1351,8 +1353,8 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(<4 x i8> addrspace(1)* %ptr, i8 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: v_mov_b32_e32 v4, 8 -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX8-NEXT: v_mov_b32_e32 v6, s0 +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v3, s0 @@ -1362,20 +1364,22 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(<4 x i8> addrspace(1)* %ptr, i8 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v8, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v6, v0, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v6, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; 
GFX8-NEXT: v_lshlrev_b32_e32 v7, 24, v8 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 ; GFX8-NEXT: v_and_b32_e32 v0, v0, v3 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_or_b32_e32 v2, v0, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v0, v3 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1403,7 +1407,6 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(<4 x i8> addrspace(1)* %ptr, i8 ; GFX7-NEXT: v_and_b32_e32 v5, s2, v5 ; GFX7-NEXT: v_and_b32_e32 v0, s2, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GFX7-NEXT: v_and_b32_e32 v6, s2, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 @@ -1412,18 +1415,17 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(<4 x i8> addrspace(1)* %ptr, i8 ; GFX7-NEXT: v_and_b32_e32 v0, v0, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v2, v2, v1 -; GFX7-NEXT: v_and_b32_e32 v3, v3, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v1, v3, v1 ; 
GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX7-NEXT: v_and_b32_e32 v1, v4, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v4 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -1451,8 +1453,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt ; GFX9-NEXT: s_and_b32 s2, s3, s10 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s2 -; GFX9-NEXT: s_and_b32 s2, s6, s10 -; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_lshl_b32 s2, s6, 24 ; GFX9-NEXT: s_lshr_b32 s7, s1, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s2 ; GFX9-NEXT: s_and_b32 s2, s7, s10 @@ -1464,8 +1465,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt ; GFX9-NEXT: s_and_b32 s2, s8, s10 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9-NEXT: s_or_b32 s1, s1, s2 -; GFX9-NEXT: s_and_b32 s2, s9, s10 -; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_lshl_b32 s2, s9, 24 ; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: s_lshr_b32 s2, s5, 2 ; GFX9-NEXT: s_cmp_eq_u32 s2, 1 @@ -1491,8 +1491,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt ; GFX9-NEXT: s_and_b32 s2, s3, s10 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s2 -; GFX9-NEXT: s_and_b32 s2, s4, s10 -; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_lshl_b32 s2, s4, 24 ; GFX9-NEXT: s_lshr_b32 s5, s1, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s2 ; GFX9-NEXT: s_and_b32 s2, s5, s10 @@ -1504,8 +1503,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt ; GFX9-NEXT: s_and_b32 s2, s6, s10 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9-NEXT: s_or_b32 s1, s1, s2 -; GFX9-NEXT: s_and_b32 s2, s7, s10 -; 
GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_lshl_b32 s2, s7, 24 ; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -1529,8 +1527,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt ; GFX8-NEXT: s_and_b32 s2, s3, s10 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s6, s10 -; GFX8-NEXT: s_lshl_b32 s2, s2, 24 +; GFX8-NEXT: s_lshl_b32 s2, s6, 24 ; GFX8-NEXT: s_lshr_b32 s7, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: s_and_b32 s2, s7, s10 @@ -1542,8 +1539,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt ; GFX8-NEXT: s_and_b32 s2, s8, s10 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s1, s1, s2 -; GFX8-NEXT: s_and_b32 s2, s9, s10 -; GFX8-NEXT: s_lshl_b32 s2, s2, 24 +; GFX8-NEXT: s_lshl_b32 s2, s9, 24 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: s_lshr_b32 s2, s5, 2 ; GFX8-NEXT: s_cmp_eq_u32 s2, 1 @@ -1569,8 +1565,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt ; GFX8-NEXT: s_and_b32 s2, s3, s10 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s4, s10 -; GFX8-NEXT: s_lshl_b32 s2, s2, 24 +; GFX8-NEXT: s_lshl_b32 s2, s4, 24 ; GFX8-NEXT: s_lshr_b32 s5, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: s_and_b32 s2, s5, s10 @@ -1582,8 +1577,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt ; GFX8-NEXT: s_and_b32 s2, s6, s10 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s1, s1, s2 -; GFX8-NEXT: s_and_b32 s2, s7, s10 -; GFX8-NEXT: s_lshl_b32 s2, s2, 24 +; GFX8-NEXT: s_lshl_b32 s2, s7, 24 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1605,8 +1599,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt ; GFX7-NEXT: s_and_b32 s2, s3, s10 ; GFX7-NEXT: s_lshl_b32 s2, s2, 
16 ; GFX7-NEXT: s_or_b32 s0, s0, s2 -; GFX7-NEXT: s_and_b32 s2, s6, s10 -; GFX7-NEXT: s_lshl_b32 s2, s2, 24 +; GFX7-NEXT: s_lshl_b32 s2, s6, 24 ; GFX7-NEXT: s_lshr_b32 s7, s1, 8 ; GFX7-NEXT: s_or_b32 s0, s0, s2 ; GFX7-NEXT: s_and_b32 s2, s7, s10 @@ -1618,8 +1611,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt ; GFX7-NEXT: s_and_b32 s2, s8, s10 ; GFX7-NEXT: s_lshl_b32 s2, s2, 16 ; GFX7-NEXT: s_or_b32 s1, s1, s2 -; GFX7-NEXT: s_and_b32 s2, s9, s10 -; GFX7-NEXT: s_lshl_b32 s2, s2, 24 +; GFX7-NEXT: s_lshl_b32 s2, s9, 24 ; GFX7-NEXT: s_or_b32 s1, s1, s2 ; GFX7-NEXT: s_lshr_b32 s2, s5, 2 ; GFX7-NEXT: s_cmp_eq_u32 s2, 1 @@ -1645,8 +1637,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt ; GFX7-NEXT: s_and_b32 s4, s5, s10 ; GFX7-NEXT: s_lshl_b32 s4, s4, 16 ; GFX7-NEXT: s_or_b32 s2, s2, s4 -; GFX7-NEXT: s_and_b32 s4, s6, s10 -; GFX7-NEXT: s_lshl_b32 s4, s4, 24 +; GFX7-NEXT: s_lshl_b32 s4, s6, 24 ; GFX7-NEXT: s_lshr_b32 s7, s3, 8 ; GFX7-NEXT: s_or_b32 s2, s2, s4 ; GFX7-NEXT: s_and_b32 s4, s7, s10 @@ -1658,8 +1649,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt ; GFX7-NEXT: s_and_b32 s4, s8, s10 ; GFX7-NEXT: s_lshl_b32 s4, s4, 16 ; GFX7-NEXT: s_or_b32 s3, s3, s4 -; GFX7-NEXT: s_and_b32 s4, s9, s10 -; GFX7-NEXT: s_lshl_b32 s4, s4, 24 +; GFX7-NEXT: s_lshl_b32 s4, s9, 24 ; GFX7-NEXT: s_or_b32 s3, s3, s4 ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 @@ -1679,7 +1669,6 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8 ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 -; GFX9-NEXT: v_mov_b32_e32 v2, 8 ; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: s_lshr_b32 s1, s3, 2 ; GFX9-NEXT: s_and_b32 s3, s3, 3 @@ -1690,37 +1679,42 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 ; GFX9-NEXT: s_not_b32 s3, s3 
; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, 8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v6, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v7, v0, s4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v8, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v9, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v4 -; GFX9-NEXT: v_and_b32_sdwa v8, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v9, v1, s4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v5 -; GFX9-NEXT: v_or3_b32 v0, v0, v6, v7 -; GFX9-NEXT: v_or3_b32 v1, v1, v8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v6 +; GFX9-NEXT: v_or3_b32 v0, v0, v8, v5 +; GFX9-NEXT: v_or3_b32 v1, v1, v9, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc ; GFX9-NEXT: v_and_or_b32 v3, v4, s3, v3 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v5, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v6, v0, s4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v7, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v8, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v3 -; GFX9-NEXT: v_and_b32_sdwa v7, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v8, v1, s4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_or3_b32 v0, v0, v5, v6 -; GFX9-NEXT: v_or3_b32 v1, v1, v7, v8 +; GFX9-NEXT: v_or3_b32 v0, v0, v7, v4 +; GFX9-NEXT: v_or3_b32 v1, v1, v8, v5 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm @@ -1742,19 +1736,21 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8 ; GFX8-NEXT: s_lshl_b32 s2, s2, s3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v7, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v8, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v6 +; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v9, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v9, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v10, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v9 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_and_b32_sdwa v10, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 24, v8 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v10 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX8-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX8-NEXT: v_or_b32_e32 v2, s2, v2 @@ -1762,20 +1758,22 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX8-NEXT: 
v_lshrrev_b32_e32 v5, 8, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v6, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v7, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v8, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v8, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v4, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v4, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v7 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v6 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm @@ -1810,8 +1808,6 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX7-NEXT: 
v_and_b32_e32 v1, s6, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX7-NEXT: v_and_b32_e32 v4, s6, v4 -; GFX7-NEXT: v_and_b32_e32 v7, s6, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 @@ -1839,11 +1835,9 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8 ; GFX7-NEXT: v_and_b32_e32 v3, s6, v3 ; GFX7-NEXT: v_and_b32_e32 v6, s6, v6 ; GFX7-NEXT: v_and_b32_e32 v0, s6, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX7-NEXT: v_and_b32_e32 v1, s6, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX7-NEXT: v_and_b32_e32 v4, s6, v4 -; GFX7-NEXT: v_and_b32_e32 v7, s6, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 @@ -1881,8 +1875,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt ; GFX9-NEXT: s_and_b32 s2, s3, s10 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s2 -; GFX9-NEXT: s_and_b32 s2, s6, s10 -; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_lshl_b32 s2, s6, 24 ; GFX9-NEXT: s_lshr_b32 s7, s1, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s2 ; GFX9-NEXT: s_and_b32 s2, s7, s10 @@ -1894,8 +1887,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt ; GFX9-NEXT: s_and_b32 s2, s8, s10 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9-NEXT: s_or_b32 s1, s1, s2 -; GFX9-NEXT: s_and_b32 s2, s9, s10 -; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_lshl_b32 s2, s9, 24 ; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: s_lshr_b32 s2, s4, 2 ; GFX9-NEXT: s_cmp_eq_u32 s2, 1 @@ -1913,17 +1905,19 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s5, v2 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX9-NEXT: v_and_or_b32 v2, v0, s10, v2 -; GFX9-NEXT: v_and_b32_sdwa v4, v0, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s10 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v2, v4, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX9-NEXT: v_or3_b32 v0, v2, v0, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 ; GFX9-NEXT: v_and_or_b32 v2, v1, s10, v2 -; GFX9-NEXT: v_and_b32_sdwa v3, v1, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v1, v1, s10 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v1, v2, v3, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v5 +; GFX9-NEXT: v_or3_b32 v1, v2, v1, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off @@ -1933,7 +1927,8 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX8-NEXT: s_movk_i32 s9, 0xff -; GFX8-NEXT: v_mov_b32_e32 v4, 8 +; GFX8-NEXT: v_mov_b32_e32 v6, 8 +; GFX8-NEXT: v_mov_b32_e32 v7, s9 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s2, s0, 8 ; GFX8-NEXT: s_and_b32 s2, s2, s9 @@ -1945,8 +1940,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x 
i8> addrspace(4)* inreg %pt ; GFX8-NEXT: s_and_b32 s2, s3, s9 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s5, s9 -; GFX8-NEXT: s_lshl_b32 s2, s2, 24 +; GFX8-NEXT: s_lshl_b32 s2, s5, 24 ; GFX8-NEXT: s_lshr_b32 s6, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: s_and_b32 s2, s6, s9 @@ -1958,8 +1952,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt ; GFX8-NEXT: s_and_b32 s2, s7, s9 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s1, s1, s2 -; GFX8-NEXT: s_and_b32 s2, s8, s9 -; GFX8-NEXT: s_lshl_b32 s2, s2, 24 +; GFX8-NEXT: s_lshl_b32 s2, s8, 24 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: s_lshr_b32 s2, s4, 2 ; GFX8-NEXT: s_cmp_eq_u32 s2, 1 @@ -1978,21 +1971,21 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_mov_b32_e32 v4, s9 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX8-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v5, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, 8 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: 
v_lshlrev_b32_e32 v2, 24, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v1 ; GFX8-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -2014,8 +2007,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt ; GFX7-NEXT: s_and_b32 s2, s3, s9 ; GFX7-NEXT: s_lshl_b32 s2, s2, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s2 -; GFX7-NEXT: s_and_b32 s2, s5, s9 -; GFX7-NEXT: s_lshl_b32 s2, s2, 24 +; GFX7-NEXT: s_lshl_b32 s2, s5, 24 ; GFX7-NEXT: s_lshr_b32 s6, s1, 8 ; GFX7-NEXT: s_or_b32 s0, s0, s2 ; GFX7-NEXT: s_and_b32 s2, s6, s9 @@ -2027,8 +2019,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt ; GFX7-NEXT: s_and_b32 s2, s7, s9 ; GFX7-NEXT: s_lshl_b32 s2, s2, 16 ; GFX7-NEXT: s_or_b32 s1, s1, s2 -; GFX7-NEXT: s_and_b32 s2, s8, s9 -; GFX7-NEXT: s_lshl_b32 s2, s2, 24 +; GFX7-NEXT: s_lshl_b32 s2, s8, 24 ; GFX7-NEXT: s_or_b32 s1, s1, s2 ; GFX7-NEXT: s_lshr_b32 s2, s4, 2 ; GFX7-NEXT: s_cmp_eq_u32 s2, 1 @@ -2055,8 +2046,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt ; GFX7-NEXT: v_and_b32_e32 v2, s9, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, 
v0, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s9, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_and_b32_e32 v2, s9, v5 @@ -2068,8 +2058,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt ; GFX7-NEXT: v_and_b32_e32 v2, s9, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s9, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v7 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -2101,8 +2090,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt ; GFX9-NEXT: s_and_b32 s2, s3, s10 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s2 -; GFX9-NEXT: s_and_b32 s2, s6, s10 -; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_lshl_b32 s2, s6, 24 ; GFX9-NEXT: s_lshr_b32 s7, s1, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s2 ; GFX9-NEXT: s_and_b32 s2, s7, s10 @@ -2114,8 +2102,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt ; GFX9-NEXT: s_and_b32 s2, s8, s10 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9-NEXT: s_or_b32 s1, s1, s2 -; GFX9-NEXT: s_and_b32 s2, s9, s10 -; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_lshl_b32 s2, s9, 24 ; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 @@ -2133,17 +2120,19 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt ; GFX9-NEXT: s_mov_b32 s5, 8 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX9-NEXT: v_and_or_b32 v2, v0, s10, v2 -; GFX9-NEXT: 
v_and_b32_sdwa v4, v0, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s10 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v2, v4, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX9-NEXT: v_or3_b32 v0, v2, v0, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 ; GFX9-NEXT: v_and_or_b32 v2, v1, s10, v2 -; GFX9-NEXT: v_and_b32_sdwa v3, v1, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v1, v1, s10 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v1, v2, v3, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v5 +; GFX9-NEXT: v_or3_b32 v1, v2, v1, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off @@ -2167,8 +2156,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt ; GFX8-NEXT: s_and_b32 s2, s3, s9 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s5, s9 -; GFX8-NEXT: s_lshl_b32 s2, s2, 24 +; GFX8-NEXT: s_lshl_b32 s2, s5, 24 ; GFX8-NEXT: s_lshr_b32 s6, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: s_and_b32 s2, s6, s9 @@ -2180,8 +2168,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt ; GFX8-NEXT: s_and_b32 s2, s7, s9 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s1, s1, s2 -; GFX8-NEXT: s_and_b32 s2, 
s8, s9 -; GFX8-NEXT: s_lshl_b32 s2, s2, 24 +; GFX8-NEXT: s_lshl_b32 s2, s8, 24 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 @@ -2198,23 +2185,24 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, 8 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_mov_b32_e32 v4, s9 -; GFX8-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v5, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v6, 8 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_mov_b32_e32 v7, s9 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX8-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, 8 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v1 ; GFX8-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -2238,8 +2226,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt ; GFX7-NEXT: s_and_b32 s2, s3, s9 ; GFX7-NEXT: s_lshl_b32 s2, s2, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s2 -; GFX7-NEXT: s_and_b32 s2, s5, s9 -; GFX7-NEXT: s_lshl_b32 s2, s2, 24 +; GFX7-NEXT: s_lshl_b32 s2, s5, 24 ; GFX7-NEXT: s_lshr_b32 s6, s1, 8 ; GFX7-NEXT: s_or_b32 s0, s0, s2 ; GFX7-NEXT: s_and_b32 s2, s6, s9 @@ -2251,8 +2238,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt ; GFX7-NEXT: s_and_b32 s2, s7, s9 ; GFX7-NEXT: s_lshl_b32 s2, s2, 16 ; GFX7-NEXT: s_or_b32 s1, s1, s2 -; GFX7-NEXT: s_and_b32 s2, s8, s9 -; GFX7-NEXT: s_lshl_b32 s2, s2, 24 +; GFX7-NEXT: s_lshl_b32 s2, s8, 24 ; GFX7-NEXT: s_or_b32 s1, s1, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 @@ -2279,8 +2265,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt ; GFX7-NEXT: v_and_b32_e32 v2, s9, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s9, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_and_b32_e32 v2, s9, v5 @@ -2292,8 +2277,7 @@ define amdgpu_ps void 
@insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt ; GFX7-NEXT: v_and_b32_e32 v2, s9, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s9, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v7 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -2325,8 +2309,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt ; GFX9-NEXT: s_and_b32 s2, s3, s9 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s2 -; GFX9-NEXT: s_and_b32 s2, s5, s9 -; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_lshl_b32 s2, s5, 24 ; GFX9-NEXT: s_lshr_b32 s6, s1, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s2 ; GFX9-NEXT: s_and_b32 s2, s6, s9 @@ -2338,8 +2321,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt ; GFX9-NEXT: s_and_b32 s2, s7, s9 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9-NEXT: s_or_b32 s1, s1, s2 -; GFX9-NEXT: s_and_b32 s2, s8, s9 -; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_lshl_b32 s2, s8, 24 ; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s9 @@ -2356,17 +2338,19 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt ; GFX9-NEXT: s_mov_b32 s4, 8 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX9-NEXT: v_and_or_b32 v2, v0, s9, v2 -; GFX9-NEXT: v_and_b32_sdwa v4, v0, s9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s9 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: 
v_or3_b32 v0, v2, v4, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX9-NEXT: v_or3_b32 v0, v2, v0, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 ; GFX9-NEXT: v_and_or_b32 v2, v1, s9, v2 -; GFX9-NEXT: v_and_b32_sdwa v3, v1, s9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v1, v1, s9 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v1, v2, v3, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v5 +; GFX9-NEXT: v_or3_b32 v1, v2, v1, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off @@ -2390,8 +2374,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt ; GFX8-NEXT: s_and_b32 s2, s3, s8 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s4, s8 -; GFX8-NEXT: s_lshl_b32 s2, s2, 24 +; GFX8-NEXT: s_lshl_b32 s2, s4, 24 ; GFX8-NEXT: s_lshr_b32 s5, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: s_and_b32 s2, s5, s8 @@ -2403,8 +2386,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt ; GFX8-NEXT: s_and_b32 s2, s6, s8 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s1, s1, s2 -; GFX8-NEXT: s_and_b32 s2, s7, s8 -; GFX8-NEXT: s_lshl_b32 s2, s2, 24 +; GFX8-NEXT: s_lshl_b32 s2, s7, 24 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; 
GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s8 @@ -2420,23 +2402,24 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, 8 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_mov_b32_e32 v4, s8 -; GFX8-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v5, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v6, 8 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_mov_b32_e32 v7, s8 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX8-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, 8 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v1 ; GFX8-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -2460,8 +2443,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt ; GFX7-NEXT: s_and_b32 s2, s3, s8 ; GFX7-NEXT: s_lshl_b32 s2, s2, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s2 -; GFX7-NEXT: s_and_b32 s2, s4, s8 -; GFX7-NEXT: s_lshl_b32 s2, s2, 24 +; GFX7-NEXT: s_lshl_b32 s2, s4, 24 ; GFX7-NEXT: s_lshr_b32 s5, s1, 8 ; GFX7-NEXT: s_or_b32 s0, s0, s2 ; GFX7-NEXT: s_and_b32 s2, s5, s8 @@ -2473,8 +2455,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt ; GFX7-NEXT: s_and_b32 s2, s6, s8 ; GFX7-NEXT: s_lshl_b32 s2, s2, 16 ; GFX7-NEXT: s_or_b32 s1, s1, s2 -; GFX7-NEXT: s_and_b32 s2, s7, s8 -; GFX7-NEXT: s_lshl_b32 s2, s2, 24 +; GFX7-NEXT: s_lshl_b32 s2, s7, 24 ; GFX7-NEXT: s_or_b32 s1, s1, s2 ; GFX7-NEXT: v_and_b32_e32 v0, s8, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 @@ -2501,8 +2482,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt ; GFX7-NEXT: v_and_b32_e32 v2, s8, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s8, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_and_b32_e32 v2, s8, v5 @@ -2514,8 +2494,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt ; GFX7-NEXT: v_and_b32_e32 v2, s8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; 
GFX7-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s8, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v7 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -2533,7 +2512,6 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8 ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 -; GFX9-NEXT: v_mov_b32_e32 v3, 8 ; GFX9-NEXT: s_movk_i32 s3, 0xff ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 2, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 @@ -2543,37 +2521,42 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8 ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, 8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v8, v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v9, v0, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_sdwa v8, s0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v10, v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v11, v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v6 -; GFX9-NEXT: v_and_b32_sdwa v10, v1, s3 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v11, v1, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v7 -; GFX9-NEXT: v_or3_b32 v0, v0, v8, v9 -; GFX9-NEXT: v_or3_b32 v1, v1, v10, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v8 +; GFX9-NEXT: v_or3_b32 v0, v0, v10, v7 +; GFX9-NEXT: v_or3_b32 v1, v1, v11, v9 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc ; GFX9-NEXT: v_and_or_b32 v2, v6, v2, v5 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v5, v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v6, v0, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v7, v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v8, v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v2 -; GFX9-NEXT: v_and_b32_sdwa v7, v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v8, v1, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v1, 
v1, s3, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_or3_b32 v0, v0, v5, v6 -; GFX9-NEXT: v_or3_b32 v1, v1, v7, v8 +; GFX9-NEXT: v_or3_b32 v0, v0, v7, v4 +; GFX9-NEXT: v_or3_b32 v1, v1, v8, v5 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm @@ -2596,39 +2579,43 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v10, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v11, v0, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v9 +; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v12, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v12, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v13, v1, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v10 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v12 -; GFX8-NEXT: v_or_b32_e32 v0, 
v0, v11 +; GFX8-NEXT: v_and_b32_sdwa v13, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v12 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v11 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v13 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v10 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; GFX8-NEXT: v_and_b32_e32 v2, v3, v2 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v6, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v7, v0, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v8, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v8, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v5, v1, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 
v7, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v5, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v7 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v6 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm @@ -2665,8 +2652,6 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_and_b32_e32 v1, s3, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; GFX7-NEXT: v_and_b32_e32 v7, s3, v7 -; GFX7-NEXT: v_and_b32_e32 v10, s3, v10 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -2696,8 +2681,6 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX7-NEXT: v_and_b32_e32 v1, s3, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX7-NEXT: v_and_b32_e32 v4, s3, v4 -; GFX7-NEXT: v_and_b32_e32 v7, s3, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 @@ -2721,7 +2704,6 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8 ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 -; GFX9-NEXT: v_mov_b32_e32 v3, 8 ; GFX9-NEXT: s_movk_i32 s3, 0xff ; GFX9-NEXT: s_lshr_b32 s1, s2, 2 ; GFX9-NEXT: s_and_b32 s2, s2, 3 @@ -2730,37 +2712,42 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8 ; GFX9-NEXT: s_lshl_b32 s2, s3, s2 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 ; 
GFX9-NEXT: s_not_b32 s2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, 8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v6, v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v7, v0, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v8, v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v9, v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v4 -; GFX9-NEXT: v_and_b32_sdwa v8, v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v9, v1, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v5 -; GFX9-NEXT: v_or3_b32 v0, v0, v6, v7 -; GFX9-NEXT: v_or3_b32 v1, v1, v8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v6 +; GFX9-NEXT: v_or3_b32 v0, v0, v8, v5 +; GFX9-NEXT: v_or3_b32 v1, v1, v9, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc ; GFX9-NEXT: v_and_or_b32 v2, v4, s2, v2 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 
8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v5, v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v6, v0, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v7, v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v8, v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v2 -; GFX9-NEXT: v_and_b32_sdwa v7, v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v8, v1, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_or3_b32 v0, v0, v5, v6 -; GFX9-NEXT: v_or3_b32 v1, v1, v7, v8 +; GFX9-NEXT: v_or3_b32 v0, v0, v7, v4 +; GFX9-NEXT: v_or3_b32 v1, v1, v8, v5 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm @@ -2782,19 +2769,21 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8 ; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; 
GFX8-NEXT: v_lshlrev_b32_sdwa v6, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v8, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v9, v0, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v7 +; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v10, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v10, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v11, v1, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v10 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 +; GFX8-NEXT: v_and_b32_sdwa v11, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v9 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v11 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; GFX8-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 @@ -2802,20 +2791,22 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; 
GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v6, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v7, v0, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v8, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v8, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v5, v1, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v5, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v7 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v6 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm @@ -2851,8 +2842,6 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> 
addrspace(1)* %ptr, i8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-NEXT: v_and_b32_e32 v1, s3, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; GFX7-NEXT: v_and_b32_e32 v5, s3, v5 -; GFX7-NEXT: v_and_b32_e32 v8, s3, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 @@ -2883,8 +2872,6 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX7-NEXT: v_and_b32_e32 v1, s3, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX7-NEXT: v_and_b32_e32 v4, s3, v4 -; GFX7-NEXT: v_and_b32_e32 v7, s3, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 @@ -2908,7 +2895,6 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(<8 x i8> addrspace(1)* %ptr, i8 ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 -; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: s_movk_i32 s1, 0xff ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 2, v3 ; GFX9-NEXT: v_and_b32_e32 v3, 3, v3 @@ -2918,37 +2904,42 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(<8 x i8> addrspace(1)* %ptr, i8 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 ; GFX9-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v7, s0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v9, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v10, v0, s1 dst_sel:BYTE_3 
dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, s0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v11, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v12, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v7 -; GFX9-NEXT: v_and_b32_sdwa v11, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v12, v1, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v1, s1, v8 -; GFX9-NEXT: v_or3_b32 v0, v0, v9, v10 -; GFX9-NEXT: v_or3_b32 v1, v1, v11, v12 +; GFX9-NEXT: v_and_or_b32 v1, v1, s1, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX9-NEXT: v_or3_b32 v0, v0, v11, v8 +; GFX9-NEXT: v_or3_b32 v1, v1, v12, v10 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc ; GFX9-NEXT: v_and_or_b32 v2, v7, v3, v2 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v6, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v7, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v8, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 
v7, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX9-NEXT: v_and_or_b32 v0, v0, v4, v2 -; GFX9-NEXT: v_and_b32_sdwa v8, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v9, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v3 +; GFX9-NEXT: v_and_b32_sdwa v9, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v8, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_or3_b32 v0, v0, v6, v7 -; GFX9-NEXT: v_or3_b32 v1, v1, v8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v7 +; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v5 +; GFX9-NEXT: v_or3_b32 v1, v1, v9, v6 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm @@ -2971,39 +2962,43 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(<8 x i8> addrspace(1)* %ptr, i8 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v8 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v11, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v12, v0, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v10 +; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v6, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v13, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v13, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v7, v1, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v13 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v12 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v7, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v12 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v11 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc ; GFX8-NEXT: v_and_b32_e32 v3, v5, v3 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v5, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v7, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v8, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, 
v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v8, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v4, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v4, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v7 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v6 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm @@ -3037,11 +3032,9 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(<8 x i8> addrspace(1)* %ptr, i8 ; GFX7-NEXT: v_and_b32_e32 v7, s0, v7 ; GFX7-NEXT: v_and_b32_e32 v10, s0, v10 ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GFX7-NEXT: v_and_b32_e32 v8, s0, v8 -; GFX7-NEXT: v_and_b32_e32 v11, s0, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10 @@ -3061,29 +3054,27 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(<8 x i8> addrspace(1)* %ptr, i8 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX7-NEXT: 
v_lshrrev_b32_e32 v6, 8, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v2, v2, v4 ; GFX7-NEXT: v_and_b32_e32 v6, v6, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v3, v3, v4 -; GFX7-NEXT: v_and_b32_e32 v7, v7, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v1 ; GFX7-NEXT: v_and_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX7-NEXT: v_and_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_and_b32_e32 v4, v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; GFX7-NEXT: v_and_b32_e32 v5, v5, v4 -; GFX7-NEXT: v_and_b32_e32 v4, v8, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v7 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v8 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v7 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(1)* %ptr @@ -3110,8 +3101,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg % ; GFX9-NEXT: s_and_b32 s6, s7, s18 ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s6 -; GFX9-NEXT: s_and_b32 s6, s8, s18 -; GFX9-NEXT: s_lshl_b32 s6, s6, 24 +; GFX9-NEXT: s_lshl_b32 s6, s8, 24 ; GFX9-NEXT: s_lshr_b32 s9, s1, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s6 ; GFX9-NEXT: s_and_b32 s6, s9, s18 @@ -3123,8 +3113,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg % ; GFX9-NEXT: s_and_b32 s6, s10, s18 ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 ; GFX9-NEXT: 
s_or_b32 s1, s1, s6 -; GFX9-NEXT: s_and_b32 s6, s11, s18 -; GFX9-NEXT: s_lshl_b32 s6, s6, 24 +; GFX9-NEXT: s_lshl_b32 s6, s11, 24 ; GFX9-NEXT: s_lshr_b32 s12, s2, 8 ; GFX9-NEXT: s_or_b32 s1, s1, s6 ; GFX9-NEXT: s_and_b32 s6, s12, s18 @@ -3136,8 +3125,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg % ; GFX9-NEXT: s_and_b32 s6, s13, s18 ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 ; GFX9-NEXT: s_or_b32 s2, s2, s6 -; GFX9-NEXT: s_and_b32 s6, s14, s18 -; GFX9-NEXT: s_lshl_b32 s6, s6, 24 +; GFX9-NEXT: s_lshl_b32 s6, s14, 24 ; GFX9-NEXT: s_lshr_b32 s15, s3, 8 ; GFX9-NEXT: s_or_b32 s2, s2, s6 ; GFX9-NEXT: s_and_b32 s6, s15, s18 @@ -3149,8 +3137,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg % ; GFX9-NEXT: s_and_b32 s6, s16, s18 ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 ; GFX9-NEXT: s_or_b32 s3, s3, s6 -; GFX9-NEXT: s_and_b32 s6, s17, s18 -; GFX9-NEXT: s_lshl_b32 s6, s6, 24 +; GFX9-NEXT: s_lshl_b32 s6, s17, 24 ; GFX9-NEXT: s_or_b32 s3, s3, s6 ; GFX9-NEXT: s_lshr_b32 s6, s5, 2 ; GFX9-NEXT: s_cmp_eq_u32 s6, 1 @@ -3184,8 +3171,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg % ; GFX9-NEXT: s_and_b32 s4, s5, s18 ; GFX9-NEXT: s_lshl_b32 s4, s4, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s4 -; GFX9-NEXT: s_and_b32 s4, s6, s18 -; GFX9-NEXT: s_lshl_b32 s4, s4, 24 +; GFX9-NEXT: s_lshl_b32 s4, s6, 24 ; GFX9-NEXT: s_lshr_b32 s7, s1, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s4 ; GFX9-NEXT: s_and_b32 s4, s7, s18 @@ -3197,8 +3183,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg % ; GFX9-NEXT: s_and_b32 s4, s8, s18 ; GFX9-NEXT: s_lshl_b32 s4, s4, 16 ; GFX9-NEXT: s_or_b32 s1, s1, s4 -; GFX9-NEXT: s_and_b32 s4, s9, s18 -; GFX9-NEXT: s_lshl_b32 s4, s4, 24 +; GFX9-NEXT: s_lshl_b32 s4, s9, 24 ; GFX9-NEXT: s_lshr_b32 s10, s2, 8 ; GFX9-NEXT: s_or_b32 s1, s1, s4 ; GFX9-NEXT: s_and_b32 s4, s10, s18 @@ -3210,8 +3195,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* 
inreg % ; GFX9-NEXT: s_and_b32 s4, s11, s18 ; GFX9-NEXT: s_lshl_b32 s4, s4, 16 ; GFX9-NEXT: s_or_b32 s2, s2, s4 -; GFX9-NEXT: s_and_b32 s4, s12, s18 -; GFX9-NEXT: s_lshl_b32 s4, s4, 24 +; GFX9-NEXT: s_lshl_b32 s4, s12, 24 ; GFX9-NEXT: s_lshr_b32 s13, s3, 8 ; GFX9-NEXT: s_or_b32 s2, s2, s4 ; GFX9-NEXT: s_and_b32 s4, s13, s18 @@ -3223,8 +3207,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg % ; GFX9-NEXT: s_and_b32 s4, s14, s18 ; GFX9-NEXT: s_lshl_b32 s4, s4, 16 ; GFX9-NEXT: s_or_b32 s3, s3, s4 -; GFX9-NEXT: s_and_b32 s4, s15, s18 -; GFX9-NEXT: s_lshl_b32 s4, s4, 24 +; GFX9-NEXT: s_lshl_b32 s4, s15, 24 ; GFX9-NEXT: s_or_b32 s3, s3, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -3250,8 +3233,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg % ; GFX8-NEXT: s_and_b32 s6, s7, s18 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s6 -; GFX8-NEXT: s_and_b32 s6, s8, s18 -; GFX8-NEXT: s_lshl_b32 s6, s6, 24 +; GFX8-NEXT: s_lshl_b32 s6, s8, 24 ; GFX8-NEXT: s_lshr_b32 s9, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s6 ; GFX8-NEXT: s_and_b32 s6, s9, s18 @@ -3263,8 +3245,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg % ; GFX8-NEXT: s_and_b32 s6, s10, s18 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 ; GFX8-NEXT: s_or_b32 s1, s1, s6 -; GFX8-NEXT: s_and_b32 s6, s11, s18 -; GFX8-NEXT: s_lshl_b32 s6, s6, 24 +; GFX8-NEXT: s_lshl_b32 s6, s11, 24 ; GFX8-NEXT: s_lshr_b32 s12, s2, 8 ; GFX8-NEXT: s_or_b32 s1, s1, s6 ; GFX8-NEXT: s_and_b32 s6, s12, s18 @@ -3276,8 +3257,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg % ; GFX8-NEXT: s_and_b32 s6, s13, s18 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 ; GFX8-NEXT: s_or_b32 s2, s2, s6 -; GFX8-NEXT: s_and_b32 s6, s14, s18 -; GFX8-NEXT: s_lshl_b32 s6, s6, 24 +; GFX8-NEXT: s_lshl_b32 s6, s14, 24 ; GFX8-NEXT: s_lshr_b32 s15, s3, 8 ; GFX8-NEXT: s_or_b32 s2, s2, s6 ; GFX8-NEXT: s_and_b32 s6, s15, 
s18 @@ -3289,8 +3269,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg % ; GFX8-NEXT: s_and_b32 s6, s16, s18 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 ; GFX8-NEXT: s_or_b32 s3, s3, s6 -; GFX8-NEXT: s_and_b32 s6, s17, s18 -; GFX8-NEXT: s_lshl_b32 s6, s6, 24 +; GFX8-NEXT: s_lshl_b32 s6, s17, 24 ; GFX8-NEXT: s_or_b32 s3, s3, s6 ; GFX8-NEXT: s_lshr_b32 s6, s5, 2 ; GFX8-NEXT: s_cmp_eq_u32 s6, 1 @@ -3324,8 +3303,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg % ; GFX8-NEXT: s_and_b32 s4, s5, s18 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s4 -; GFX8-NEXT: s_and_b32 s4, s6, s18 -; GFX8-NEXT: s_lshl_b32 s4, s4, 24 +; GFX8-NEXT: s_lshl_b32 s4, s6, 24 ; GFX8-NEXT: s_lshr_b32 s7, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s4 ; GFX8-NEXT: s_and_b32 s4, s7, s18 @@ -3337,8 +3315,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg % ; GFX8-NEXT: s_and_b32 s4, s8, s18 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 ; GFX8-NEXT: s_or_b32 s1, s1, s4 -; GFX8-NEXT: s_and_b32 s4, s9, s18 -; GFX8-NEXT: s_lshl_b32 s4, s4, 24 +; GFX8-NEXT: s_lshl_b32 s4, s9, 24 ; GFX8-NEXT: s_lshr_b32 s10, s2, 8 ; GFX8-NEXT: s_or_b32 s1, s1, s4 ; GFX8-NEXT: s_and_b32 s4, s10, s18 @@ -3350,8 +3327,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg % ; GFX8-NEXT: s_and_b32 s4, s11, s18 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 ; GFX8-NEXT: s_or_b32 s2, s2, s4 -; GFX8-NEXT: s_and_b32 s4, s12, s18 -; GFX8-NEXT: s_lshl_b32 s4, s4, 24 +; GFX8-NEXT: s_lshl_b32 s4, s12, 24 ; GFX8-NEXT: s_lshr_b32 s13, s3, 8 ; GFX8-NEXT: s_or_b32 s2, s2, s4 ; GFX8-NEXT: s_and_b32 s4, s13, s18 @@ -3363,8 +3339,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg % ; GFX8-NEXT: s_and_b32 s4, s14, s18 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 ; GFX8-NEXT: s_or_b32 s3, s3, s4 -; GFX8-NEXT: s_and_b32 s4, s15, s18 -; GFX8-NEXT: s_lshl_b32 s4, s4, 24 +; GFX8-NEXT: s_lshl_b32 s4, s15, 24 ; 
GFX8-NEXT: s_or_b32 s3, s3, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -3388,8 +3363,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg % ; GFX7-NEXT: s_and_b32 s6, s7, s18 ; GFX7-NEXT: s_lshl_b32 s6, s6, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s6 -; GFX7-NEXT: s_and_b32 s6, s8, s18 -; GFX7-NEXT: s_lshl_b32 s6, s6, 24 +; GFX7-NEXT: s_lshl_b32 s6, s8, 24 ; GFX7-NEXT: s_lshr_b32 s9, s1, 8 ; GFX7-NEXT: s_or_b32 s0, s0, s6 ; GFX7-NEXT: s_and_b32 s6, s9, s18 @@ -3401,8 +3375,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg % ; GFX7-NEXT: s_and_b32 s6, s10, s18 ; GFX7-NEXT: s_lshl_b32 s6, s6, 16 ; GFX7-NEXT: s_or_b32 s1, s1, s6 -; GFX7-NEXT: s_and_b32 s6, s11, s18 -; GFX7-NEXT: s_lshl_b32 s6, s6, 24 +; GFX7-NEXT: s_lshl_b32 s6, s11, 24 ; GFX7-NEXT: s_lshr_b32 s12, s2, 8 ; GFX7-NEXT: s_or_b32 s1, s1, s6 ; GFX7-NEXT: s_and_b32 s6, s12, s18 @@ -3414,8 +3387,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg % ; GFX7-NEXT: s_and_b32 s6, s13, s18 ; GFX7-NEXT: s_lshl_b32 s6, s6, 16 ; GFX7-NEXT: s_or_b32 s2, s2, s6 -; GFX7-NEXT: s_and_b32 s6, s14, s18 -; GFX7-NEXT: s_lshl_b32 s6, s6, 24 +; GFX7-NEXT: s_lshl_b32 s6, s14, 24 ; GFX7-NEXT: s_lshr_b32 s15, s3, 8 ; GFX7-NEXT: s_or_b32 s2, s2, s6 ; GFX7-NEXT: s_and_b32 s6, s15, s18 @@ -3427,8 +3399,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg % ; GFX7-NEXT: s_and_b32 s6, s16, s18 ; GFX7-NEXT: s_lshl_b32 s6, s6, 16 ; GFX7-NEXT: s_or_b32 s3, s3, s6 -; GFX7-NEXT: s_and_b32 s6, s17, s18 -; GFX7-NEXT: s_lshl_b32 s6, s6, 24 +; GFX7-NEXT: s_lshl_b32 s6, s17, 24 ; GFX7-NEXT: s_or_b32 s3, s3, s6 ; GFX7-NEXT: s_lshr_b32 s6, s5, 2 ; GFX7-NEXT: s_cmp_eq_u32 s6, 1 @@ -3460,12 +3431,11 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg % ; GFX7-NEXT: s_lshl_b32 s4, s4, 8 ; GFX7-NEXT: s_or_b32 s4, s5, s4 ; GFX7-NEXT: s_and_b32 s5, s6, s18 -; GFX7-NEXT: 
s_lshl_b32 s5, s5, 16 ; GFX7-NEXT: s_lshr_b32 s9, s7, 8 -; GFX7-NEXT: s_or_b32 s4, s4, s5 -; GFX7-NEXT: s_and_b32 s5, s8, s18 +; GFX7-NEXT: s_lshl_b32 s5, s5, 16 ; GFX7-NEXT: s_and_b32 s6, s9, s18 -; GFX7-NEXT: s_lshl_b32 s5, s5, 24 +; GFX7-NEXT: s_or_b32 s4, s4, s5 +; GFX7-NEXT: s_lshl_b32 s5, s8, 24 ; GFX7-NEXT: s_or_b32 s4, s4, s5 ; GFX7-NEXT: s_lshr_b32 s10, s7, 16 ; GFX7-NEXT: s_and_b32 s5, s7, s18 @@ -3475,8 +3445,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg % ; GFX7-NEXT: s_lshl_b32 s6, s6, 16 ; GFX7-NEXT: s_lshr_b32 s11, s7, 24 ; GFX7-NEXT: s_or_b32 s5, s5, s6 -; GFX7-NEXT: s_and_b32 s6, s11, s18 -; GFX7-NEXT: s_lshl_b32 s6, s6, 24 +; GFX7-NEXT: s_lshl_b32 s6, s11, 24 ; GFX7-NEXT: s_lshr_b32 s12, s2, 8 ; GFX7-NEXT: s_or_b32 s5, s5, s6 ; GFX7-NEXT: s_and_b32 s6, s12, s18 @@ -3488,8 +3457,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg % ; GFX7-NEXT: s_and_b32 s6, s13, s18 ; GFX7-NEXT: s_lshl_b32 s6, s6, 16 ; GFX7-NEXT: s_or_b32 s2, s2, s6 -; GFX7-NEXT: s_and_b32 s6, s14, s18 -; GFX7-NEXT: s_lshl_b32 s6, s6, 24 +; GFX7-NEXT: s_lshl_b32 s6, s14, 24 ; GFX7-NEXT: s_lshr_b32 s15, s3, 8 ; GFX7-NEXT: s_or_b32 s6, s2, s6 ; GFX7-NEXT: s_lshr_b32 s16, s3, 16 @@ -3501,8 +3469,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg % ; GFX7-NEXT: s_and_b32 s3, s16, s18 ; GFX7-NEXT: s_lshl_b32 s3, s3, 16 ; GFX7-NEXT: s_or_b32 s2, s2, s3 -; GFX7-NEXT: s_and_b32 s3, s17, s18 -; GFX7-NEXT: s_lshl_b32 s3, s3, 24 +; GFX7-NEXT: s_lshl_b32 s3, s17, 24 ; GFX7-NEXT: s_or_b32 s7, s2, s3 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 @@ -3538,31 +3505,35 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v8, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v3 -; GFX9-NEXT: v_and_b32_sdwa v10, v0, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v11, v0, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v0, v0, s6, v6 -; GFX9-NEXT: v_and_b32_sdwa v12, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v13, v1, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v1, s6, v7 ; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v14, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v15, v2, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v2, v2, s6, v8 -; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or3_b32 v0, v0, v10, v11 -; GFX9-NEXT: v_or3_b32 v1, v1, v12, v13 -; GFX9-NEXT: v_and_b32_sdwa v16, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v17, v3, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v9 -; GFX9-NEXT: v_or3_b32 v2, v2, v14, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 8, v3 +; GFX9-NEXT: v_and_b32_sdwa v14, v0, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 
+; GFX9-NEXT: v_and_b32_sdwa v15, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX9-NEXT: v_and_or_b32 v0, v0, s6, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX9-NEXT: v_and_or_b32 v1, v1, s6, v8 +; GFX9-NEXT: v_lshlrev_b32_sdwa v10, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 24, v3 +; GFX9-NEXT: v_and_b32_sdwa v16, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX9-NEXT: v_and_or_b32 v2, v2, s6, v10 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or3_b32 v0, v0, v14, v7 +; GFX9-NEXT: v_or3_b32 v1, v1, v15, v9 +; GFX9-NEXT: v_and_b32_sdwa v17, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v12 +; GFX9-NEXT: v_or3_b32 v2, v2, v16, v11 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 -; GFX9-NEXT: v_or3_b32 v3, v3, v16, v17 +; GFX9-NEXT: v_or3_b32 v3, v3, v17, v13 ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[2:3] ; GFX9-NEXT: v_and_or_b32 v5, v6, s5, v5 @@ -3572,30 +3543,34 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 
24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v9, v0, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v10, v0, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v13, v0, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v14, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v15, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v16, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX9-NEXT: v_and_or_b32 v0, v0, s6, v5 -; GFX9-NEXT: v_and_b32_sdwa v11, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v12, v1, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v1, v1, s6, v6 -; GFX9-NEXT: v_and_b32_sdwa v13, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v14, v2, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v2, v2, s6, v7 -; GFX9-NEXT: 
v_and_b32_sdwa v15, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v16, v3, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v12 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_or3_b32 v0, v0, v9, v10 -; GFX9-NEXT: v_or3_b32 v1, v1, v11, v12 -; GFX9-NEXT: v_or3_b32 v2, v2, v13, v14 -; GFX9-NEXT: v_or3_b32 v3, v3, v15, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX9-NEXT: v_and_or_b32 v1, v1, s6, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX9-NEXT: v_and_or_b32 v2, v2, s6, v9 +; GFX9-NEXT: v_or3_b32 v0, v0, v13, v6 +; GFX9-NEXT: v_or3_b32 v1, v1, v14, v8 +; GFX9-NEXT: v_or3_b32 v2, v2, v15, v10 +; GFX9-NEXT: v_or3_b32 v3, v3, v16, v11 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm @@ -3619,34 +3594,38 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 24, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v11, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v12, v0, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 
src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v2 +; GFX8-NEXT: v_and_b32_sdwa v15, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v13, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v14, v1, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v13 -; GFX8-NEXT: v_and_b32_sdwa v15, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v16, v2, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v15 -; GFX8-NEXT: v_and_b32_sdwa v17, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v18, v3, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v12 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v14 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v17 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX8-NEXT: v_and_b32_sdwa v16, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v2 +; GFX8-NEXT: 
v_lshlrev_b32_sdwa v10, v5, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v15 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v16 +; GFX8-NEXT: v_and_b32_sdwa v17, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v12 +; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v5, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 24, v3 +; GFX8-NEXT: v_and_b32_sdwa v18, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v13, 24, v14 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v18 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v11 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX8-NEXT: v_or_b32_e32 v3, v3, v13 ; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v3, s[2:3] ; GFX8-NEXT: v_and_b32_e32 v4, s6, v4 @@ -3657,34 +3636,38 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[2:3] ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 8, v3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v5, v7 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v10, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v11, v0, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v3 +; GFX8-NEXT: v_and_b32_sdwa v14, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v12, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v13, v1, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v14, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v15, v2, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v16, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v6, v3, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa 
v15, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v16, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v6, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v10 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v12 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v14 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v16 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v13 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v15 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v14 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v15 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v16 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v13 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v6 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v9 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v11 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v12 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm @@ -3728,8 +3711,6 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX7-NEXT: v_lshrrev_b32_e32 v12, 24, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v6, s6, v6 -; GFX7-NEXT: v_and_b32_e32 v9, s6, v9 ; GFX7-NEXT: v_and_b32_e32 v11, s6, v11 ; GFX7-NEXT: v_and_b32_e32 v13, s6, v13 ; GFX7-NEXT: v_and_b32_e32 v2, s6, v2 @@ -3739,7 +3720,6 @@ define amdgpu_ps void 
@insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v7 ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 24, v3 -; GFX7-NEXT: v_and_b32_e32 v12, s6, v12 ; GFX7-NEXT: v_and_b32_e32 v14, s6, v14 ; GFX7-NEXT: v_and_b32_e32 v3, s6, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 @@ -3749,7 +3729,6 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i ; GFX7-NEXT: v_or_b32_e32 v1, v1, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v10 -; GFX7-NEXT: v_and_b32_e32 v15, s6, v15 ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 24, v12 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14 @@ -3774,16 +3753,10 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i ; GFX7-NEXT: v_and_b32_e32 v4, s6, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v1 ; GFX7-NEXT: v_and_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s6, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s6, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s6, v7 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v1 @@ -3793,8 +3766,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i ; GFX7-NEXT: v_and_b32_e32 v4, s6, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s6, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v9 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 8, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s6, v10 @@ -3806,8 
+3778,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i ; GFX7-NEXT: v_and_b32_e32 v4, s6, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s6, v12 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v12 ; GFX7-NEXT: v_lshrrev_b32_e32 v13, 8, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s6, v13 @@ -3815,12 +3786,16 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 24, v3 ; GFX7-NEXT: v_and_b32_e32 v3, s6, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_and_b32_e32 v5, s6, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s6, v14 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s6, v15 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm @@ -3848,8 +3823,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg % ; GFX9-NEXT: s_and_b32 s6, s7, s18 ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s6 -; GFX9-NEXT: s_and_b32 s6, s8, s18 -; GFX9-NEXT: s_lshl_b32 s6, s6, 24 +; GFX9-NEXT: s_lshl_b32 s6, s8, 24 ; GFX9-NEXT: s_lshr_b32 s9, s1, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s6 ; GFX9-NEXT: s_and_b32 s6, s9, s18 @@ -3861,8 +3835,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg % ; GFX9-NEXT: s_and_b32 s6, s10, s18 ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 ; GFX9-NEXT: s_or_b32 s1, s1, s6 -; GFX9-NEXT: s_and_b32 s6, s11, s18 -; GFX9-NEXT: s_lshl_b32 s6, s6, 24 +; GFX9-NEXT: 
s_lshl_b32 s6, s11, 24 ; GFX9-NEXT: s_lshr_b32 s12, s2, 8 ; GFX9-NEXT: s_or_b32 s1, s1, s6 ; GFX9-NEXT: s_and_b32 s6, s12, s18 @@ -3874,8 +3847,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg % ; GFX9-NEXT: s_and_b32 s6, s13, s18 ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 ; GFX9-NEXT: s_or_b32 s2, s2, s6 -; GFX9-NEXT: s_and_b32 s6, s14, s18 -; GFX9-NEXT: s_lshl_b32 s6, s6, 24 +; GFX9-NEXT: s_lshl_b32 s6, s14, 24 ; GFX9-NEXT: s_lshr_b32 s15, s3, 8 ; GFX9-NEXT: s_or_b32 s2, s2, s6 ; GFX9-NEXT: s_and_b32 s6, s15, s18 @@ -3887,8 +3859,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg % ; GFX9-NEXT: s_and_b32 s6, s16, s18 ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 ; GFX9-NEXT: s_or_b32 s3, s3, s6 -; GFX9-NEXT: s_and_b32 s6, s17, s18 -; GFX9-NEXT: s_lshl_b32 s6, s6, 24 +; GFX9-NEXT: s_lshl_b32 s6, s17, 24 ; GFX9-NEXT: s_or_b32 s3, s3, s6 ; GFX9-NEXT: s_lshr_b32 s6, s4, 2 ; GFX9-NEXT: s_cmp_eq_u32 s6, 1 @@ -3916,30 +3887,34 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg % ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s6, 3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_and_or_b32 v4, v0, s18, v4 -; GFX9-NEXT: v_and_b32_sdwa v8, v0, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v4, v8, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX9-NEXT: v_or3_b32 v0, v4, v0, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 8 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, 
v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v2 -; GFX9-NEXT: v_and_or_b32 v5, v1, s18, v5 -; GFX9-NEXT: v_and_b32_sdwa v8, v1, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v1, v1, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v1, v5, v8, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_and_or_b32 v5, v1, s18, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v7 +; GFX9-NEXT: v_or3_b32 v1, v5, v1, v6 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_and_or_b32 v5, v2, s18, v5 -; GFX9-NEXT: v_and_b32_sdwa v6, v2, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v2, v2, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v2, v5, v6, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v3 +; GFX9-NEXT: v_and_b32_sdwa v2, v2, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v9 +; GFX9-NEXT: v_or3_b32 v2, v5, v2, v6 ; GFX9-NEXT: v_and_or_b32 v4, v3, s18, v4 -; GFX9-NEXT: v_and_b32_sdwa v5, v3, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; 
GFX9-NEXT: v_and_b32_sdwa v3, v3, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v3, v4, v5, v3 +; GFX9-NEXT: v_and_b32_sdwa v3, v3, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v11 +; GFX9-NEXT: v_or3_b32 v3, v4, v3, v5 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off @@ -3949,7 +3924,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg % ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX8-NEXT: s_movk_i32 s17, 0xff -; GFX8-NEXT: v_mov_b32_e32 v8, 8 +; GFX8-NEXT: v_mov_b32_e32 v12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s5, s0, 8 ; GFX8-NEXT: s_and_b32 s5, s5, s17 @@ -3961,8 +3936,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg % ; GFX8-NEXT: s_and_b32 s5, s6, s17 ; GFX8-NEXT: s_lshl_b32 s5, s5, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s5 -; GFX8-NEXT: s_and_b32 s5, s7, s17 -; GFX8-NEXT: s_lshl_b32 s5, s5, 24 +; GFX8-NEXT: s_lshl_b32 s5, s7, 24 ; GFX8-NEXT: s_lshr_b32 s8, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s5 ; GFX8-NEXT: s_and_b32 s5, s8, s17 @@ -3974,8 +3948,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg % ; GFX8-NEXT: s_and_b32 s5, s9, s17 ; GFX8-NEXT: s_lshl_b32 s5, s5, 16 ; GFX8-NEXT: s_or_b32 s1, s1, s5 -; GFX8-NEXT: s_and_b32 s5, s10, s17 -; GFX8-NEXT: s_lshl_b32 s5, s5, 24 +; GFX8-NEXT: s_lshl_b32 s5, s10, 24 ; GFX8-NEXT: s_lshr_b32 s11, s2, 8 ; GFX8-NEXT: s_or_b32 s1, s1, s5 ; GFX8-NEXT: s_and_b32 s5, s11, s17 @@ -3987,8 +3960,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg % ; GFX8-NEXT: s_and_b32 s5, s12, s17 ; GFX8-NEXT: s_lshl_b32 s5, s5, 16 ; GFX8-NEXT: s_or_b32 s2, s2, s5 -; GFX8-NEXT: s_and_b32 s5, s13, s17 -; GFX8-NEXT: s_lshl_b32 s5, s5, 24 +; GFX8-NEXT: s_lshl_b32 s5, s13, 24 ; GFX8-NEXT: 
s_lshr_b32 s14, s3, 8 ; GFX8-NEXT: s_or_b32 s2, s2, s5 ; GFX8-NEXT: s_and_b32 s5, s14, s17 @@ -4000,8 +3972,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg % ; GFX8-NEXT: s_and_b32 s5, s15, s17 ; GFX8-NEXT: s_lshl_b32 s5, s5, 16 ; GFX8-NEXT: s_or_b32 s3, s3, s5 -; GFX8-NEXT: s_and_b32 s5, s16, s17 -; GFX8-NEXT: s_lshl_b32 s5, s5, 24 +; GFX8-NEXT: s_lshl_b32 s5, s16, 24 ; GFX8-NEXT: s_or_b32 s3, s3, s5 ; GFX8-NEXT: s_lshr_b32 s5, s4, 2 ; GFX8-NEXT: s_cmp_eq_u32 s5, 1 @@ -4030,35 +4001,39 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg % ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_mov_b32_e32 v8, s17 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v12, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_mov_b32_e32 v12, s17 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX8-NEXT: v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v9, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v4, v4, v9 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v5 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, 8 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX8-NEXT: v_or_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v9, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v9 -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v2 ; GFX8-NEXT: v_or_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v6, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX8-NEXT: v_and_b32_sdwa v2, v2, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_and_b32_sdwa v2, v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v3 ; GFX8-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v5, v3, v8 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX8-NEXT: v_and_b32_sdwa v3, v3, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v3, v3, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v9 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v11 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -4080,8 +4055,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg % ; GFX7-NEXT: s_and_b32 s5, s6, s17 ; GFX7-NEXT: s_lshl_b32 s5, s5, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s5 -; GFX7-NEXT: s_and_b32 s5, s7, s17 -; GFX7-NEXT: s_lshl_b32 s5, s5, 24 +; GFX7-NEXT: s_lshl_b32 s5, s7, 24 ; GFX7-NEXT: s_lshr_b32 s8, s1, 8 ; GFX7-NEXT: s_or_b32 s0, s0, s5 ; GFX7-NEXT: s_and_b32 s5, s8, s17 @@ -4093,8 +4067,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg % ; GFX7-NEXT: s_and_b32 s5, s9, s17 ; GFX7-NEXT: s_lshl_b32 s5, s5, 16 ; GFX7-NEXT: s_or_b32 s1, s1, s5 -; GFX7-NEXT: s_and_b32 s5, s10, s17 -; GFX7-NEXT: s_lshl_b32 s5, s5, 24 +; GFX7-NEXT: s_lshl_b32 s5, s10, 24 ; GFX7-NEXT: s_lshr_b32 s11, s2, 8 ; GFX7-NEXT: s_or_b32 s1, s1, s5 ; GFX7-NEXT: s_and_b32 s5, s11, s17 @@ -4106,8 +4079,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg % ; GFX7-NEXT: s_and_b32 s5, s12, s17 ; GFX7-NEXT: s_lshl_b32 s5, s5, 16 ; GFX7-NEXT: s_or_b32 s2, s2, s5 -; GFX7-NEXT: s_and_b32 s5, s13, s17 -; GFX7-NEXT: s_lshl_b32 s5, s5, 24 +; GFX7-NEXT: s_lshl_b32 s5, s13, 24 ; GFX7-NEXT: s_lshr_b32 s14, s3, 8 ; GFX7-NEXT: s_or_b32 s2, s2, s5 ; GFX7-NEXT: s_and_b32 s5, s14, s17 @@ -4119,8 +4091,7 @@ define amdgpu_ps void 
@insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg % ; GFX7-NEXT: s_and_b32 s5, s15, s17 ; GFX7-NEXT: s_lshl_b32 s5, s5, 16 ; GFX7-NEXT: s_or_b32 s3, s3, s5 -; GFX7-NEXT: s_and_b32 s5, s16, s17 -; GFX7-NEXT: s_lshl_b32 s5, s5, 24 +; GFX7-NEXT: s_lshl_b32 s5, s16, 24 ; GFX7-NEXT: s_or_b32 s3, s3, s5 ; GFX7-NEXT: s_lshr_b32 s5, s4, 2 ; GFX7-NEXT: s_cmp_eq_u32 s5, 1 @@ -4157,8 +4128,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg % ; GFX7-NEXT: v_and_b32_e32 v4, s17, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s17, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s17, v7 @@ -4170,8 +4140,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg % ; GFX7-NEXT: v_and_b32_e32 v4, s17, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s17, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v9 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 8, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s17, v10 @@ -4183,8 +4152,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg % ; GFX7-NEXT: v_and_b32_e32 v4, s17, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s17, v12 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v12 ; GFX7-NEXT: v_lshrrev_b32_e32 v13, 8, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s17, v13 @@ -4196,8 +4164,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg % ; GFX7-NEXT: v_and_b32_e32 v4, s17, v14 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 -; 
GFX7-NEXT: v_and_b32_e32 v4, s17, v15 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v15 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -4229,8 +4196,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg % ; GFX9-NEXT: s_and_b32 s5, s7, s18 ; GFX9-NEXT: s_lshl_b32 s5, s5, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s5 -; GFX9-NEXT: s_and_b32 s5, s8, s18 -; GFX9-NEXT: s_lshl_b32 s5, s5, 24 +; GFX9-NEXT: s_lshl_b32 s5, s8, 24 ; GFX9-NEXT: s_lshr_b32 s9, s1, 8 ; GFX9-NEXT: s_or_b32 s8, s0, s5 ; GFX9-NEXT: s_lshr_b32 s10, s1, 16 @@ -4242,8 +4208,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg % ; GFX9-NEXT: s_and_b32 s1, s10, s18 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s11, s18 -; GFX9-NEXT: s_lshl_b32 s1, s1, 24 +; GFX9-NEXT: s_lshl_b32 s1, s11, 24 ; GFX9-NEXT: s_lshr_b32 s12, s2, 8 ; GFX9-NEXT: s_or_b32 s9, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s12, s18 @@ -4255,8 +4220,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg % ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_lshr_b32 s14, s2, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s14, s18 -; GFX9-NEXT: s_lshl_b32 s1, s1, 24 +; GFX9-NEXT: s_lshl_b32 s1, s14, 24 ; GFX9-NEXT: s_lshr_b32 s15, s3, 8 ; GFX9-NEXT: s_or_b32 s10, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s15, s18 @@ -4268,8 +4232,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg % ; GFX9-NEXT: s_lshr_b32 s17, s3, 24 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s17, s18 -; GFX9-NEXT: s_lshl_b32 s1, s1, 24 +; GFX9-NEXT: s_lshl_b32 s1, s17, 24 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: v_mov_b32_e32 v2, s9 ; GFX9-NEXT: s_or_b32 s11, s0, s1 @@ -4289,38 +4252,42 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> 
addrspace(4)* inreg % ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] -; GFX9-NEXT: s_mov_b32 s6, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: s_mov_b32 s6, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX9-NEXT: v_and_or_b32 v4, v0, s18, v4 -; GFX9-NEXT: v_and_b32_sdwa v8, v0, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v4, v8, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_or_b32 v4, v0, s18, v4 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX9-NEXT: v_or3_b32 v0, v4, v0, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 8 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v2 -; GFX9-NEXT: v_and_or_b32 v5, v1, s18, v5 -; GFX9-NEXT: v_and_b32_sdwa v8, v1, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v1, v1, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v1, v5, v8, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_and_or_b32 v5, v1, s18, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v7 +; GFX9-NEXT: v_or3_b32 v1, v5, v1, v6 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_and_or_b32 v5, v2, s18, v5 -; GFX9-NEXT: v_and_b32_sdwa v6, v2, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v2, v2, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v2, v5, v6, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v3 +; GFX9-NEXT: v_and_b32_sdwa v2, v2, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v9 +; GFX9-NEXT: v_or3_b32 v2, v5, v2, v6 ; GFX9-NEXT: v_and_or_b32 v4, v3, s18, v4 -; GFX9-NEXT: v_and_b32_sdwa v5, v3, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v3, v3, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v3, v4, v5, v3 +; GFX9-NEXT: v_and_b32_sdwa v3, v3, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v11 +; GFX9-NEXT: v_or3_b32 v3, v4, v3, v5 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off @@ -4344,8 +4311,7 @@ define amdgpu_ps 
void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg % ; GFX8-NEXT: s_and_b32 s5, s6, s18 ; GFX8-NEXT: s_lshl_b32 s5, s5, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s5 -; GFX8-NEXT: s_and_b32 s5, s7, s18 -; GFX8-NEXT: s_lshl_b32 s5, s5, 24 +; GFX8-NEXT: s_lshl_b32 s5, s7, 24 ; GFX8-NEXT: s_lshr_b32 s9, s1, 8 ; GFX8-NEXT: s_or_b32 s8, s0, s5 ; GFX8-NEXT: s_lshr_b32 s10, s1, 16 @@ -4357,8 +4323,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg % ; GFX8-NEXT: s_and_b32 s1, s10, s18 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s11, s18 -; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: s_lshl_b32 s1, s11, 24 ; GFX8-NEXT: s_lshr_b32 s12, s2, 8 ; GFX8-NEXT: s_or_b32 s9, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s12, s18 @@ -4370,8 +4335,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg % ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_lshr_b32 s14, s2, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s14, s18 -; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: s_lshl_b32 s1, s14, 24 ; GFX8-NEXT: s_lshr_b32 s15, s3, 8 ; GFX8-NEXT: s_or_b32 s10, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s15, s18 @@ -4383,8 +4347,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg % ; GFX8-NEXT: s_lshr_b32 s17, s3, 24 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s17, s18 -; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: s_lshl_b32 s1, s17, 24 ; GFX8-NEXT: v_mov_b32_e32 v1, s8 ; GFX8-NEXT: v_mov_b32_e32 v2, s9 ; GFX8-NEXT: s_or_b32 s11, s0, s1 @@ -4406,42 +4369,46 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg % ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v0 -; GFX8-NEXT: v_mov_b32_e32 v8, 8 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_mov_b32_e32 v8, s18 +; GFX8-NEXT: v_mov_b32_e32 v12, 8 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v12, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NEXT: v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v9, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v2, s10 ; GFX8-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NEXT: v_mov_b32_e32 v12, s18 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX8-NEXT: v_or_b32_e32 v4, v4, v9 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX8-NEXT: v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v5 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, 8 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX8-NEXT: v_or_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v9, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v9 -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD 
src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v2 ; GFX8-NEXT: v_or_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v6, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX8-NEXT: v_and_b32_sdwa v2, v2, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_and_b32_sdwa v2, v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v3 ; GFX8-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v5, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX8-NEXT: v_and_b32_sdwa v3, v3, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v3, v3, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v9 ; GFX8-NEXT: v_or_b32_e32 
v3, v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v11 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -4465,8 +4432,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg % ; GFX7-NEXT: s_and_b32 s5, s6, s18 ; GFX7-NEXT: s_lshl_b32 s5, s5, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s5 -; GFX7-NEXT: s_and_b32 s5, s7, s18 -; GFX7-NEXT: s_lshl_b32 s5, s5, 24 +; GFX7-NEXT: s_lshl_b32 s5, s7, 24 ; GFX7-NEXT: s_lshr_b32 s9, s1, 8 ; GFX7-NEXT: s_or_b32 s8, s0, s5 ; GFX7-NEXT: s_lshr_b32 s10, s1, 16 @@ -4478,8 +4444,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg % ; GFX7-NEXT: s_and_b32 s1, s10, s18 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_and_b32 s1, s11, s18 -; GFX7-NEXT: s_lshl_b32 s1, s1, 24 +; GFX7-NEXT: s_lshl_b32 s1, s11, 24 ; GFX7-NEXT: s_lshr_b32 s12, s2, 8 ; GFX7-NEXT: s_or_b32 s9, s0, s1 ; GFX7-NEXT: s_and_b32 s1, s12, s18 @@ -4491,8 +4456,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg % ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_lshr_b32 s14, s2, 24 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_and_b32 s1, s14, s18 -; GFX7-NEXT: s_lshl_b32 s1, s1, 24 +; GFX7-NEXT: s_lshl_b32 s1, s14, 24 ; GFX7-NEXT: s_lshr_b32 s15, s3, 8 ; GFX7-NEXT: s_or_b32 s10, s0, s1 ; GFX7-NEXT: s_and_b32 s1, s15, s18 @@ -4504,8 +4468,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg % ; GFX7-NEXT: s_lshr_b32 s17, s3, 24 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_and_b32 s1, s17, s18 -; GFX7-NEXT: s_lshl_b32 s1, s1, 24 +; GFX7-NEXT: s_lshl_b32 s1, s17, 24 ; GFX7-NEXT: v_mov_b32_e32 v1, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s9 ; GFX7-NEXT: s_or_b32 s11, s0, s1 @@ -4542,8 +4505,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x 
i8> addrspace(4)* inreg % ; GFX7-NEXT: v_and_b32_e32 v4, s18, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s18, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s18, v7 @@ -4555,8 +4517,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg % ; GFX7-NEXT: v_and_b32_e32 v4, s18, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s18, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v9 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 8, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s18, v10 @@ -4568,8 +4529,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg % ; GFX7-NEXT: v_and_b32_e32 v4, s18, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s18, v12 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v12 ; GFX7-NEXT: v_lshrrev_b32_e32 v13, 8, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s18, v13 @@ -4581,8 +4541,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg % ; GFX7-NEXT: v_and_b32_e32 v4, s18, v14 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s18, v15 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v15 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -4614,8 +4573,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg % ; GFX9-NEXT: s_and_b32 s4, s5, s17 ; GFX9-NEXT: s_lshl_b32 s4, s4, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s4 -; GFX9-NEXT: 
s_and_b32 s4, s6, s17 -; GFX9-NEXT: s_lshl_b32 s4, s4, 24 +; GFX9-NEXT: s_lshl_b32 s4, s6, 24 ; GFX9-NEXT: s_lshr_b32 s7, s1, 8 ; GFX9-NEXT: s_or_b32 s4, s0, s4 ; GFX9-NEXT: s_lshr_b32 s9, s1, 16 @@ -4627,8 +4585,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg % ; GFX9-NEXT: s_and_b32 s1, s9, s17 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s10, s17 -; GFX9-NEXT: s_lshl_b32 s1, s1, 24 +; GFX9-NEXT: s_lshl_b32 s1, s10, 24 ; GFX9-NEXT: s_lshr_b32 s11, s2, 8 ; GFX9-NEXT: s_or_b32 s5, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s11, s17 @@ -4640,8 +4597,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg % ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_lshr_b32 s13, s2, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s13, s17 -; GFX9-NEXT: s_lshl_b32 s1, s1, 24 +; GFX9-NEXT: s_lshl_b32 s1, s13, 24 ; GFX9-NEXT: s_lshr_b32 s14, s3, 8 ; GFX9-NEXT: s_or_b32 s6, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s14, s17 @@ -4653,8 +4609,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg % ; GFX9-NEXT: s_lshr_b32 s16, s3, 24 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s16, s17 -; GFX9-NEXT: s_lshl_b32 s1, s1, 24 +; GFX9-NEXT: s_lshl_b32 s1, s16, 24 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: s_or_b32 s7, s0, s1 @@ -4678,33 +4633,37 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg % ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] ; GFX9-NEXT: s_mov_b32 s8, 8 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX9-NEXT: v_and_or_b32 v4, v0, s17, v4 -; GFX9-NEXT: v_and_b32_sdwa v8, v0, s17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; 
GFX9-NEXT: v_and_b32_sdwa v0, v0, s17 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v0, v4, v8, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_or_b32 v4, v0, s17, v4 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX9-NEXT: v_or3_b32 v0, v4, v0, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 8 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v2 -; GFX9-NEXT: v_and_or_b32 v5, v1, s17, v5 -; GFX9-NEXT: v_and_b32_sdwa v8, v1, s17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v1, v1, s17 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v1, v5, v8, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_and_or_b32 v5, v1, s17, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v7 +; GFX9-NEXT: v_or3_b32 v1, v5, v1, v6 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v3 +; GFX9-NEXT: 
v_lshlrev_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_and_or_b32 v5, v2, s17, v5 -; GFX9-NEXT: v_and_b32_sdwa v6, v2, s17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v2, v2, s17 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v2, v5, v6, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v3 +; GFX9-NEXT: v_and_b32_sdwa v2, v2, s17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v9 +; GFX9-NEXT: v_or3_b32 v2, v5, v2, v6 ; GFX9-NEXT: v_and_or_b32 v4, v3, s17, v4 -; GFX9-NEXT: v_and_b32_sdwa v5, v3, s17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v3, v3, s17 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v3, v4, v5, v3 +; GFX9-NEXT: v_and_b32_sdwa v3, v3, s17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v11 +; GFX9-NEXT: v_or3_b32 v3, v4, v3, v5 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off @@ -4728,8 +4687,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg % ; GFX8-NEXT: s_and_b32 s4, s5, s16 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s4 -; GFX8-NEXT: s_and_b32 s4, s6, s16 -; GFX8-NEXT: s_lshl_b32 s4, s4, 24 +; GFX8-NEXT: s_lshl_b32 s4, s6, 24 ; GFX8-NEXT: s_lshr_b32 s7, s1, 8 ; GFX8-NEXT: s_or_b32 s4, s0, s4 ; GFX8-NEXT: s_lshr_b32 s8, s1, 16 @@ -4741,8 +4699,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg % ; GFX8-NEXT: s_and_b32 s1, s8, s16 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s9, s16 -; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: s_lshl_b32 s1, s9, 24 ; GFX8-NEXT: s_lshr_b32 
s10, s2, 8 ; GFX8-NEXT: s_or_b32 s5, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s10, s16 @@ -4754,8 +4711,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg % ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_lshr_b32 s12, s2, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s12, s16 -; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: s_lshl_b32 s1, s12, 24 ; GFX8-NEXT: s_lshr_b32 s13, s3, 8 ; GFX8-NEXT: s_or_b32 s6, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s13, s16 @@ -4767,8 +4723,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg % ; GFX8-NEXT: s_lshr_b32 s15, s3, 24 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s15, s16 -; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: s_lshl_b32 s1, s15, 24 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: s_or_b32 s7, s0, s1 @@ -4792,39 +4747,43 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg % ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v0 -; GFX8-NEXT: v_mov_b32_e32 v8, 8 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_mov_b32_e32 v8, s16 -; GFX8-NEXT: v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v9, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v12, 8 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v12, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_mov_b32_e32 v12, s16 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX8-NEXT: v_or_b32_e32 v4, v4, v9 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX8-NEXT: 
v_cndmask_b32_e64 v3, v3, v5, s[2:3] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX8-NEXT: v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v5 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, 8 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX8-NEXT: v_or_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v9, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v9 -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v2 ; GFX8-NEXT: v_or_b32_sdwa v5, v2, v5 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v6, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX8-NEXT: v_and_b32_sdwa v2, v2, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_and_b32_sdwa v2, v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v3 ; GFX8-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v5, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX8-NEXT: v_and_b32_sdwa v3, v3, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v3, v3, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v9 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v11 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -4848,8 +4807,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg % ; GFX7-NEXT: s_and_b32 s4, s5, s16 ; GFX7-NEXT: s_lshl_b32 s4, s4, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s4 -; GFX7-NEXT: s_and_b32 s4, s6, s16 -; GFX7-NEXT: s_lshl_b32 s4, s4, 24 +; GFX7-NEXT: s_lshl_b32 s4, s6, 24 ; GFX7-NEXT: s_lshr_b32 s7, s1, 8 ; GFX7-NEXT: s_or_b32 s4, s0, s4 ; GFX7-NEXT: s_lshr_b32 s8, s1, 16 @@ -4861,8 +4819,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg % ; GFX7-NEXT: s_and_b32 s1, s8, s16 ; GFX7-NEXT: 
s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_and_b32 s1, s9, s16 -; GFX7-NEXT: s_lshl_b32 s1, s1, 24 +; GFX7-NEXT: s_lshl_b32 s1, s9, 24 ; GFX7-NEXT: s_lshr_b32 s10, s2, 8 ; GFX7-NEXT: s_or_b32 s5, s0, s1 ; GFX7-NEXT: s_and_b32 s1, s10, s16 @@ -4874,8 +4831,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg % ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_lshr_b32 s12, s2, 24 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_and_b32 s1, s12, s16 -; GFX7-NEXT: s_lshl_b32 s1, s1, 24 +; GFX7-NEXT: s_lshl_b32 s1, s12, 24 ; GFX7-NEXT: s_lshr_b32 s13, s3, 8 ; GFX7-NEXT: s_or_b32 s6, s0, s1 ; GFX7-NEXT: s_and_b32 s1, s13, s16 @@ -4887,8 +4843,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg % ; GFX7-NEXT: s_lshr_b32 s15, s3, 24 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_and_b32 s1, s15, s16 -; GFX7-NEXT: s_lshl_b32 s1, s1, 24 +; GFX7-NEXT: s_lshl_b32 s1, s15, 24 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: s_or_b32 s7, s0, s1 @@ -4925,8 +4880,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg % ; GFX7-NEXT: v_and_b32_e32 v4, s16, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s16, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s16, v7 @@ -4938,8 +4892,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg % ; GFX7-NEXT: v_and_b32_e32 v4, s16, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s16, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v9 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 8, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; 
GFX7-NEXT: v_and_b32_e32 v4, s16, v10 @@ -4951,8 +4904,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg % ; GFX7-NEXT: v_and_b32_e32 v4, s16, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s16, v12 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v12 ; GFX7-NEXT: v_lshrrev_b32_e32 v13, 8, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s16, v13 @@ -4964,8 +4916,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg % ; GFX7-NEXT: v_and_b32_e32 v4, s16, v14 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s16, v15 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v15 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -4997,31 +4948,35 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 8, v5 ; GFX9-NEXT: v_lshlrev_b32_sdwa v8, s0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 8, v6 -; GFX9-NEXT: v_and_b32_sdwa v12, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v13, v3, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v8 -; GFX9-NEXT: v_and_b32_sdwa v14, v4, 
s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v15, v4, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v9 ; GFX9-NEXT: v_lshlrev_b32_sdwa v10, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v16, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v17, v5, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v10 -; GFX9-NEXT: v_lshlrev_b32_sdwa v11, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or3_b32 v3, v3, v12, v13 -; GFX9-NEXT: v_or3_b32 v4, v4, v14, v15 -; GFX9-NEXT: v_and_b32_sdwa v18, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v19, v6, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v6, v6, s6, v11 -; GFX9-NEXT: v_or3_b32 v5, v5, v16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 24, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 8, v6 +; GFX9-NEXT: v_and_b32_sdwa v16, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v17, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v10 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v6 +; GFX9-NEXT: v_and_b32_sdwa v18, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v12 +; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v0, v14 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or3_b32 v3, v3, v16, v9 +; GFX9-NEXT: v_or3_b32 v4, v4, v17, v11 +; GFX9-NEXT: v_and_b32_sdwa v19, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 24, v15 +; GFX9-NEXT: v_and_or_b32 v6, v6, s6, v14 +; GFX9-NEXT: v_or3_b32 v5, v5, v18, v13 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1 -; GFX9-NEXT: v_or3_b32 v6, v6, v18, v19 +; GFX9-NEXT: v_or3_b32 v6, v6, v19, v15 ; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v5, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v6, s[2:3] ; GFX9-NEXT: v_and_or_b32 v2, v8, v2, v7 @@ -5030,29 +4985,33 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i ; GFX9-NEXT: v_cndmask_b32_e64 v4, v5, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[2:3] ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v9, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v10, v1, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD 
src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v13, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v14, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v15, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v1, v1, s6, v5 +; GFX9-NEXT: v_and_b32_sdwa v16, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX9-NEXT: v_and_or_b32 v5, v2, s6, v0 -; GFX9-NEXT: v_and_b32_sdwa v11, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v12, v3, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v13, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v14, v4, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v15, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v16, v2, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v7 -; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v6 -; GFX9-NEXT: v_or3_b32 v0, v1, v9, v10 -; GFX9-NEXT: v_or3_b32 v1, v3, v11, v12 -; GFX9-NEXT: v_or3_b32 v2, v4, v13, v14 -; GFX9-NEXT: v_or3_b32 v3, v5, v15, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v12 +; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v7 +; GFX9-NEXT: v_or3_b32 v0, v1, v13, v6 +; GFX9-NEXT: v_or3_b32 
v1, v3, v14, v8 +; GFX9-NEXT: v_or3_b32 v2, v4, v15, v10 +; GFX9-NEXT: v_or3_b32 v3, v5, v16, v11 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off @@ -5077,35 +5036,39 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i ; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 8, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 8, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 8, v6 -; GFX8-NEXT: v_lshlrev_b32_sdwa v11, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v14, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v15, v3, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v16, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v5 ; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v14 -; GFX8-NEXT: v_and_b32_sdwa v17, v4, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v3, v3, v16 -; GFX8-NEXT: v_and_b32_sdwa v18, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; 
GFX8-NEXT: v_or_b32_sdwa v4, v5, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v19, v5, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v10, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v5, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v4, v4, v18 -; GFX8-NEXT: v_and_b32_sdwa v6, v6, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v5, v5, v10 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v15 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 8, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v4 +; GFX8-NEXT: v_and_b32_sdwa v18, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v19, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 24, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v13 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v19 +; GFX8-NEXT: v_and_b32_sdwa v10, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v5, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 24, v6 +; GFX8-NEXT: v_or_b32_sdwa v5, v6, v16 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v13, 24, v15 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v10 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v12 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX8-NEXT: v_or_b32_e32 v4, v4, v19 +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 24, v17 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v13 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v3, vcc +; GFX8-NEXT: v_or_b32_e32 v5, v5, v14 ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v4, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v5, s[2:3] ; GFX8-NEXT: v_and_b32_e32 v2, v6, v2 @@ -5115,34 +5078,38 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[2:3] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 8, v2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v12, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v13, v3, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v14, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; 
GFX8-NEXT: v_and_b32_sdwa v15, v4, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v16, v2, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v2 +; GFX8-NEXT: v_and_b32_sdwa v15, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v16, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v14, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v7, v2, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v10, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v11, v0, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v0 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v7, v2, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 
src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v3, v12 -; GFX8-NEXT: v_or_b32_e32 v3, v4, v14 -; GFX8-NEXT: v_or_b32_e32 v4, v1, v16 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v10 -; GFX8-NEXT: v_or_b32_e32 v1, v2, v13 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v15 -; GFX8-NEXT: v_or_b32_e32 v3, v4, v7 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v16 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX8-NEXT: v_or_b32_e32 v4, v1, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v13 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v9 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v11 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v12 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v14 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm @@ -5187,8 +5154,6 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 ; GFX7-NEXT: v_lshrrev_b32_e32 v13, 24, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v6 -; GFX7-NEXT: v_and_b32_e32 v7, s6, v7 -; GFX7-NEXT: v_and_b32_e32 v10, s6, v10 ; GFX7-NEXT: v_and_b32_e32 v12, s6, v12 ; GFX7-NEXT: v_and_b32_e32 v14, s6, v14 ; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 @@ -5198,7 +5163,6 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i ; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v8 ; GFX7-NEXT: v_lshrrev_b32_e32 v16, 24, v6 -; GFX7-NEXT: v_and_b32_e32 v13, s6, v13 ; GFX7-NEXT: v_and_b32_e32 v15, s6, v15 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 @@ -5208,7 +5172,6 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; GFX7-NEXT: v_or_b32_e32 v4, v5, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX7-NEXT: v_and_b32_e32 v16, s6, v16 ; 
GFX7-NEXT: v_lshlrev_b32_e32 v13, 24, v13 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v12 ; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 @@ -5232,16 +5195,10 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i ; GFX7-NEXT: v_and_b32_e32 v2, s6, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v1 ; GFX7-NEXT: v_and_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s6, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s6, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_and_b32_e32 v2, s6, v7 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v1 @@ -5251,8 +5208,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i ; GFX7-NEXT: v_and_b32_e32 v2, s6, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s6, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v9 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 8, v3 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v3 @@ -5264,8 +5220,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i ; GFX7-NEXT: v_and_b32_e32 v3, s6, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_and_b32_e32 v3, s6, v12 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v12 ; GFX7-NEXT: v_lshrrev_b32_e32 v13, 8, v4 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v4 @@ -5273,12 +5228,16 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i ; GFX7-NEXT: v_and_b32_e32 v3, s6, v4 ; 
GFX7-NEXT: v_and_b32_e32 v4, s6, v13 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_and_b32_e32 v5, s6, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s6, v14 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s6, v15 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm @@ -5305,31 +5264,35 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v4 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v6 -; GFX9-NEXT: v_and_b32_sdwa v10, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v11, v3, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v2, v3, s6, v2 -; GFX9-NEXT: v_and_b32_sdwa v12, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v13, v4, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v3, v4, s6, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v5 ; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v0, v8 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v14, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v15, v5, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v4, v5, s6, v8 -; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or3_b32 v2, v2, v10, v11 -; GFX9-NEXT: v_or3_b32 v3, v3, v12, v13 -; GFX9-NEXT: v_and_b32_sdwa v16, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v17, v6, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v5, v6, s6, v9 -; GFX9-NEXT: v_or3_b32 v4, v4, v14, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 8, v6 +; GFX9-NEXT: v_and_b32_sdwa v14, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v2, v3, s6, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX9-NEXT: v_and_b32_sdwa v15, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX9-NEXT: v_and_or_b32 v3, v4, s6, v8 +; GFX9-NEXT: v_lshlrev_b32_sdwa v10, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 24, v6 +; GFX9-NEXT: v_and_b32_sdwa v16, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX9-NEXT: v_and_or_b32 v4, v5, s6, v10 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or3_b32 v2, v2, v14, v7 +; GFX9-NEXT: v_or3_b32 v3, v3, v15, v9 +; GFX9-NEXT: v_and_b32_sdwa v17, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v5, v6, s6, v12 +; GFX9-NEXT: 
v_lshlrev_b32_e32 v13, 24, v13 +; GFX9-NEXT: v_or3_b32 v4, v4, v16, v11 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v3, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 -; GFX9-NEXT: v_or3_b32 v5, v5, v16, v17 +; GFX9-NEXT: v_or3_b32 v5, v5, v17, v13 ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v4, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v5, s[2:3] ; GFX9-NEXT: v_and_or_b32 v1, v6, s5, v1 @@ -5339,29 +5302,33 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[2:3] ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v9, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v10, v2, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v13, v2, s6 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v14, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v15, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v2, v2, s6, v5 +; GFX9-NEXT: v_and_b32_sdwa v16, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX9-NEXT: v_and_or_b32 v5, v1, s6, v0 -; GFX9-NEXT: v_and_b32_sdwa v11, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v12, v3, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v13, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v14, v4, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v15, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v16, v1, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v6 -; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v7 -; GFX9-NEXT: v_or3_b32 v0, v2, v9, v10 -; GFX9-NEXT: v_or3_b32 v1, v3, v11, v12 -; GFX9-NEXT: v_or3_b32 v2, v4, v13, v14 -; GFX9-NEXT: v_or3_b32 v3, v5, v15, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v12 +; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v9 +; GFX9-NEXT: v_or3_b32 v0, v2, v13, v6 +; GFX9-NEXT: v_or3_b32 v1, v3, v14, v8 +; GFX9-NEXT: v_or3_b32 v2, v4, v15, v10 +; GFX9-NEXT: v_or3_b32 v3, v5, v16, v11 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off @@ -5386,34 +5353,38 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> 
addrspace(1)* %ptr, i ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v4 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v5 -; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v4 ; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v12, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 8, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v11 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v11, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v6 +; GFX8-NEXT: v_and_b32_sdwa v16, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v13, v3, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v14, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v12 -; GFX8-NEXT: v_and_b32_sdwa v15, v4, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v3, v3, v14 -; GFX8-NEXT: v_and_b32_sdwa v16, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v4, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v17, v5, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v4, v4, v16 -; GFX8-NEXT: v_and_b32_sdwa v18, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v5, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v13 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v15 -; GFX8-NEXT: v_and_b32_sdwa v19, v6, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v5, v5, v18 -; GFX8-NEXT: v_or_b32_e32 v4, v4, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v3, vcc +; GFX8-NEXT: v_and_b32_sdwa v17, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v16 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v17 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v13 +; GFX8-NEXT: v_and_b32_sdwa v18, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v5, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 24, v6 +; GFX8-NEXT: v_and_b32_sdwa v19, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v5, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 24, v15 
; GFX8-NEXT: v_or_b32_e32 v5, v5, v19 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v12 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v3, vcc +; GFX8-NEXT: v_or_b32_e32 v5, v5, v14 ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v4, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v5, s[2:3] ; GFX8-NEXT: v_and_b32_e32 v6, s5, v6 @@ -5424,34 +5395,38 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[2:3] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 8, v2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v12, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v13, v3, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v14, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v15, v4, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v16, v2, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v2 +; GFX8-NEXT: v_and_b32_sdwa v15, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v16, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v14, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v7, v2, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v10, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v11, v0, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v0 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v7, v2, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v3, v12 -; GFX8-NEXT: v_or_b32_e32 v3, v4, v14 -; GFX8-NEXT: v_or_b32_e32 v4, v1, v16 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v10 -; GFX8-NEXT: v_or_b32_e32 v1, v2, v13 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v15 -; GFX8-NEXT: v_or_b32_e32 v3, v4, v7 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v16 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX8-NEXT: v_or_b32_e32 v4, v1, v7 +; 
GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v13 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v9 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v11 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v12 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v14 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm @@ -5495,8 +5470,6 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 ; GFX7-NEXT: v_lshrrev_b32_e32 v13, 24, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v6 -; GFX7-NEXT: v_and_b32_e32 v7, s6, v7 -; GFX7-NEXT: v_and_b32_e32 v10, s6, v10 ; GFX7-NEXT: v_and_b32_e32 v12, s6, v12 ; GFX7-NEXT: v_and_b32_e32 v14, s6, v14 ; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 @@ -5506,7 +5479,6 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i ; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v8 ; GFX7-NEXT: v_lshrrev_b32_e32 v16, 24, v6 -; GFX7-NEXT: v_and_b32_e32 v13, s6, v13 ; GFX7-NEXT: v_and_b32_e32 v15, s6, v15 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 @@ -5516,7 +5488,6 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; GFX7-NEXT: v_or_b32_e32 v4, v5, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX7-NEXT: v_and_b32_e32 v16, s6, v16 ; GFX7-NEXT: v_lshlrev_b32_e32 v13, 24, v13 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v12 ; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 @@ -5541,27 +5512,20 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i ; GFX7-NEXT: v_and_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v2 ; GFX7-NEXT: 
v_and_b32_e32 v1, s6, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_and_b32_e32 v1, s6, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, s6, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v1, s6, v7 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v2 -; GFX7-NEXT: v_and_b32_e32 v1, s6, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s6, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s6, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX7-NEXT: v_and_b32_e32 v2, s6, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s6, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v9 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 8, v3 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v3 @@ -5573,8 +5537,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i ; GFX7-NEXT: v_and_b32_e32 v3, s6, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_and_b32_e32 v3, s6, v12 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v12 ; GFX7-NEXT: v_lshrrev_b32_e32 v13, 8, v4 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v4 @@ -5582,12 +5545,16 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i ; GFX7-NEXT: v_and_b32_e32 v3, s6, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s6, v13 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_and_b32_e32 v5, s6, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s6, v14 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s6, v15 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm @@ -5616,29 +5583,33 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v5 ; GFX9-NEXT: v_lshlrev_b32_sdwa v9, s0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v10, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v13, v4, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v14, v4, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v4, v4, s1, v9 -; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v15, v5, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v16, v5, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v5, v5, s1, v10 ; GFX9-NEXT: v_lshlrev_b32_sdwa v11, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: 
v_and_b32_sdwa v17, v6, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v18, v6, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v6, v6, s1, v11 -; GFX9-NEXT: v_and_b32_sdwa v19, v7, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v9, v7, v0, v12 -; GFX9-NEXT: v_and_b32_sdwa v7, v7, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v4, v4, v13, v14 -; GFX9-NEXT: v_or3_b32 v5, v5, v15, v16 -; GFX9-NEXT: v_or3_b32 v7, v9, v19, v7 -; GFX9-NEXT: v_or3_b32 v6, v6, v17, v18 +; GFX9-NEXT: v_and_b32_sdwa v17, v4, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v18, v5, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v5, v5, s1, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 24, v7 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_or_b32 v4, v4, s1, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX9-NEXT: v_and_b32_sdwa v19, v6, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v6, v6, s1, v13 +; GFX9-NEXT: v_and_or_b32 v9, v7, v0, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 24, v14 +; GFX9-NEXT: v_and_b32_sdwa v7, v7, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 24, v16 +; GFX9-NEXT: v_or3_b32 v4, v4, v17, v10 +; GFX9-NEXT: v_or3_b32 v5, v5, v18, v11 +; GFX9-NEXT: v_or3_b32 v7, v9, v7, v13 +; GFX9-NEXT: v_or3_b32 v6, v6, v19, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v4, v5, vcc ; GFX9-NEXT: 
v_cmp_eq_u32_e64 s[0:1], 2, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1] @@ -5648,31 +5619,35 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v5, v6, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[2:3] -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 8, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v10, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX9-NEXT: v_and_b32_sdwa v15, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v16, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_sdwa v12, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v13, v4, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v14, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v15, v5, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_or_b32 v4, 
v4, v0, v7 -; GFX9-NEXT: v_and_or_b32 v5, v5, v0, v8 -; GFX9-NEXT: v_and_b32_sdwa v10, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v11, v3, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 24, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX9-NEXT: v_and_or_b32 v4, v4, v0, v8 +; GFX9-NEXT: v_and_or_b32 v5, v5, v0, v10 +; GFX9-NEXT: v_and_b32_sdwa v14, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v3, v3, v0, v6 ; GFX9-NEXT: v_and_or_b32 v6, v2, v0, v1 -; GFX9-NEXT: v_and_b32_sdwa v16, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v17, v2, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or3_b32 v1, v4, v12, v13 -; GFX9-NEXT: v_or3_b32 v2, v5, v14, v15 +; GFX9-NEXT: v_and_b32_sdwa v17, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 24, v13 +; GFX9-NEXT: v_or3_b32 v1, v4, v15, v9 +; GFX9-NEXT: v_or3_b32 v2, v5, v16, v11 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_or3_b32 v0, v3, v10, v11 -; GFX9-NEXT: v_or3_b32 v3, v6, v16, v17 +; GFX9-NEXT: v_or3_b32 v0, v3, v14, v7 +; GFX9-NEXT: v_or3_b32 v3, v6, v17, v12 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm @@ -5696,35 +5671,39 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i ; GFX8-NEXT: v_xor_b32_e32 v3, -1, v3 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 8, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 8, v5 ; 
GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v4 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 8, v6 -; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v7 -; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v8, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_sdwa v15, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v16, v4, v9 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v17, v5, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v4, v5, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 8, v6 ; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v8, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v8, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v15 -; GFX8-NEXT: v_and_b32_sdwa v9, v5, v9 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v4, v4, v17 -; GFX8-NEXT: v_and_b32_sdwa v18, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v5, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v19, v6, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v11, v7, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v6, v7, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v5, v5, v18 -; GFX8-NEXT: v_and_b32_sdwa v7, v7, v0 dst_sel:BYTE_3 
dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v6, v6, v11 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v16 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 8, v7 +; GFX8-NEXT: v_and_b32_sdwa v19, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 24, v5 +; GFX8-NEXT: v_or_b32_sdwa v4, v5, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v9, v5, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v8, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 24, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v13, 24, v14 ; GFX8-NEXT: v_or_b32_e32 v4, v4, v9 +; GFX8-NEXT: v_and_b32_sdwa v11, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v5, v6, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v8, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v19 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 24, v7 +; GFX8-NEXT: v_or_b32_sdwa v6, v7, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v7, v7, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 24, v16 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v11 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v12 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v13 ; GFX8-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v19 +; GFX8-NEXT: v_lshlrev_b32_e32 v15, 24, v18 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v14 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc +; GFX8-NEXT: v_or_b32_e32 
v6, v6, v15 ; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[2:3] ; GFX8-NEXT: v_and_b32_e32 v3, v7, v3 @@ -5734,34 +5713,38 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[2:3] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v2 -; GFX8-NEXT: v_and_b32_sdwa v10, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v11, v1, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v12, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v13, v3, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v14, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v15, v4, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v16, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v17, v2, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v3, v5 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v3, v3, v14 -; GFX8-NEXT: v_or_b32_e32 v4, v2, v16 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v10 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v12 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v15 -; GFX8-NEXT: v_or_b32_e32 v3, v4, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 8, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v8, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v2 +; GFX8-NEXT: v_and_b32_sdwa v14, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v15, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v16, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v13 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v14 +; GFX8-NEXT: v_or_b32_e32 v5, v2, v0 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v16 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v15 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v6 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v9 +; GFX8-NEXT: v_or_b32_e32 v2, v4, v11 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v12 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v13 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm @@ -5799,15 +5782,13 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i ; GFX7-NEXT: v_lshrrev_b32_e32 v16, 8, v7 ; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 ; GFX7-NEXT: v_and_b32_e32 v11, s0, v11 -; GFX7-NEXT: v_and_b32_e32 v13, v13, v8 +; GFX7-NEXT: v_and_b32_e32 v13, s0, v13 ; GFX7-NEXT: v_and_b32_e32 v4, s0, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX7-NEXT: v_and_b32_e32 v5, s0, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 24, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; GFX7-NEXT: v_and_b32_e32 v9, s0, v9 -; GFX7-NEXT: v_and_b32_e32 v12, s0, v12 ; GFX7-NEXT: v_and_b32_e32 v14, v14, v8 ; GFX7-NEXT: v_and_b32_e32 v16, v16, v8 ; GFX7-NEXT: v_and_b32_e32 v6, s0, v6 @@ -5817,7 +5798,6 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i ; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; GFX7-NEXT: v_or_b32_e32 v4, v5, v10 ; GFX7-NEXT: v_lshrrev_b32_e32 v18, 24, v7 -; GFX7-NEXT: v_and_b32_e32 v15, v15, v8 ; GFX7-NEXT: v_and_b32_e32 v17, v17, v8 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 24, v9 @@ -5827,7 +5807,6 @@ define amdgpu_ps 
void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; GFX7-NEXT: v_or_b32_e32 v5, v6, v13 ; GFX7-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GFX7-NEXT: v_and_b32_e32 v18, v18, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v15, 24, v15 ; GFX7-NEXT: v_or_b32_e32 v4, v5, v14 ; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v17 @@ -5852,16 +5831,10 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i ; GFX7-NEXT: v_and_b32_e32 v2, v2, v8 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v1 ; GFX7-NEXT: v_and_b32_e32 v0, v0, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v2, v5, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v2, v6, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_and_b32_e32 v2, v7, v8 ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 24, v1 @@ -5871,8 +5844,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i ; GFX7-NEXT: v_and_b32_e32 v2, v9, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX7-NEXT: v_and_b32_e32 v2, v10, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v10 ; GFX7-NEXT: v_lshrrev_b32_e32 v11, 8, v3 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v3 @@ -5884,8 +5856,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i ; GFX7-NEXT: v_and_b32_e32 v3, v12, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_and_b32_e32 v3, v13, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v13 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 
8, v4 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v4 @@ -5893,12 +5864,16 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i ; GFX7-NEXT: v_and_b32_e32 v3, v4, v8 ; GFX7-NEXT: v_and_b32_e32 v4, v14, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v5, v8 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: v_and_b32_e32 v4, v15, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX7-NEXT: v_and_b32_e32 v4, v16, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v16 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll index 5d1468eba04ea..43692dc81535e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll @@ -5,122 +5,61 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in, ; GCN-LABEL: v_insert_v64i32_37: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: v_lshlrev_b64 v[0:1], 8, v[0:1] +; GCN-NEXT: v_lshlrev_b32_e32 v64, 8, v0 +; GCN-NEXT: s_movk_i32 s4, 0x80 +; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: v_add_co_u32_e32 v8, vcc, v2, v0 -; GCN-NEXT: s_mov_b32 s1, 0 -; GCN-NEXT: v_addc_co_u32_e32 v9, vcc, v3, v1, vcc -; GCN-NEXT: s_movk_i32 s0, 0x80 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: v_add_co_u32_e32 v12, 
vcc, v8, v2 -; GCN-NEXT: s_movk_i32 s0, 0xc0 -; GCN-NEXT: v_mov_b32_e32 v65, s1 -; GCN-NEXT: v_mov_b32_e32 v5, s3 -; GCN-NEXT: v_mov_b32_e32 v64, s0 -; GCN-NEXT: s_movk_i32 s0, 0x50 -; GCN-NEXT: v_mov_b32_e32 v69, s1 -; GCN-NEXT: v_addc_co_u32_e32 v13, vcc, v9, v3, vcc -; GCN-NEXT: v_mov_b32_e32 v4, s2 -; GCN-NEXT: v_add_co_u32_e32 v66, vcc, v4, v0 -; GCN-NEXT: v_mov_b32_e32 v68, s0 -; GCN-NEXT: s_movk_i32 s0, 0x60 -; GCN-NEXT: v_mov_b32_e32 v71, s1 -; GCN-NEXT: v_addc_co_u32_e32 v67, vcc, v5, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v70, s0 -; GCN-NEXT: s_movk_i32 s0, 0x70 -; GCN-NEXT: v_mov_b32_e32 v73, s1 -; GCN-NEXT: v_add_co_u32_e32 v74, vcc, v66, v2 -; GCN-NEXT: v_mov_b32_e32 v72, s0 -; GCN-NEXT: s_movk_i32 s0, 0x90 ; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_addc_co_u32_e32 v75, vcc, v67, v3, vcc ; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_co_u32_e32 v76, vcc, v66, v0 -; GCN-NEXT: v_addc_co_u32_e32 v77, vcc, v67, v1, vcc -; GCN-NEXT: global_load_dwordx4 v[4:7], v[12:13], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[0:3], v[12:13], off -; GCN-NEXT: v_add_co_u32_e32 v10, vcc, 64, v8 -; GCN-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v9, vcc -; GCN-NEXT: v_add_co_u32_e32 v28, vcc, v8, v64 -; GCN-NEXT: v_addc_co_u32_e32 v29, vcc, v9, v65, vcc -; GCN-NEXT: global_load_dwordx4 v[32:35], v[8:9], off -; GCN-NEXT: global_load_dwordx4 v[36:39], v[8:9], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[40:43], v[8:9], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[44:47], v[8:9], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[48:51], v[10:11], off -; GCN-NEXT: global_load_dwordx4 v[52:55], v[10:11], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[56:59], v[10:11], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[60:63], v[10:11], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[8:11], v[12:13], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[12:15], v[12:13], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[16:19], v[28:29], off -; GCN-NEXT: global_load_dwordx4 
v[20:23], v[28:29], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[24:27], v[28:29], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[28:31], v[28:29], off offset:48 -; GCN-NEXT: s_movk_i32 s0, 0xa0 +; GCN-NEXT: v_add_co_u32_e32 v6, vcc, v0, v64 +; GCN-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc +; GCN-NEXT: v_add_co_u32_e32 v0, vcc, 64, v6 +; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc +; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v6, v2 +; GCN-NEXT: s_movk_i32 s4, 0xc0 +; GCN-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v3, vcc +; GCN-NEXT: v_mov_b32_e32 v5, s5 +; GCN-NEXT: v_add_co_u32_e32 v16, vcc, v6, v4 +; GCN-NEXT: v_addc_co_u32_e32 v17, vcc, v7, v5, vcc +; GCN-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[44:47], v64, s[0:1] +; GCN-NEXT: global_load_dwordx4 v[48:51], v64, s[0:1] offset:16 +; GCN-NEXT: global_load_dwordx4 v[52:55], v64, s[0:1] offset:32 +; GCN-NEXT: global_load_dwordx4 v[56:59], v64, s[0:1] offset:48 +; GCN-NEXT: global_load_dwordx4 v[60:63], v64, s[0:1] offset:64 +; GCN-NEXT: global_load_dwordx4 v[12:15], v[2:3], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[20:23], v[16:17], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[24:27], v[16:17], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[28:31], v[16:17], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[0:3], v64, s[0:1] offset:128 +; GCN-NEXT: global_load_dwordx4 v[16:19], v64, s[0:1] offset:192 ; GCN-NEXT: s_waitcnt vmcnt(15) ; GCN-NEXT: v_mov_b32_e32 v5, 0x3e7 -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: global_store_dwordx4 v[74:75], v[0:3], off -; GCN-NEXT: global_store_dwordx4 v[76:77], v[4:7], off -; GCN-NEXT: 
v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v66, v0 -; GCN-NEXT: s_movk_i32 s0, 0xb0 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v67, v1, vcc -; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v66, v2 -; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v67, v3, vcc -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: global_store_dwordx4 v[0:1], v[8:11], off -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: global_store_dwordx4 v[2:3], v[12:15], off -; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v66, v64 -; GCN-NEXT: s_movk_i32 s0, 0xd0 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v67, v65, vcc -; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v66, v2 -; GCN-NEXT: s_movk_i32 s0, 0xe0 -; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v67, v3, vcc -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: global_store_dwordx4 v[0:1], v[16:19], off -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: global_store_dwordx4 v[2:3], v[20:23], off -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v66, v0 -; GCN-NEXT: s_movk_i32 s0, 0xf0 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v67, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v66, v2 -; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v67, v3, vcc -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: global_store_dwordx4 v[0:1], v[24:27], off -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: global_store_dwordx4 v[2:3], v[28:31], off -; GCN-NEXT: v_add_co_u32_e32 v0, vcc, 64, v66 -; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v67, vcc -; GCN-NEXT: global_store_dwordx4 v[0:1], v[36:39], off offset:-48 -; GCN-NEXT: global_store_dwordx4 v[0:1], v[40:43], off offset:-32 -; GCN-NEXT: global_store_dwordx4 v[0:1], v[44:47], off offset:-16 -; GCN-NEXT: global_store_dwordx4 v[0:1], v[48:51], off -; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v66, v68 -; 
GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v67, v69, vcc -; GCN-NEXT: global_store_dwordx4 v[66:67], v[32:35], off -; GCN-NEXT: global_store_dwordx4 v[0:1], v[52:55], off -; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v66, v70 -; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v67, v71, vcc -; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v66, v72 -; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v67, v73, vcc -; GCN-NEXT: global_store_dwordx4 v[0:1], v[56:59], off -; GCN-NEXT: global_store_dwordx4 v[2:3], v[60:63], off +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: global_store_dwordx4 v64, v[0:3], s[2:3] offset:128 +; GCN-NEXT: global_store_dwordx4 v64, v[4:7], s[2:3] offset:144 +; GCN-NEXT: global_store_dwordx4 v64, v[8:11], s[2:3] offset:160 +; GCN-NEXT: global_store_dwordx4 v64, v[12:15], s[2:3] offset:176 +; GCN-NEXT: s_waitcnt vmcnt(4) +; GCN-NEXT: global_store_dwordx4 v64, v[16:19], s[2:3] offset:192 +; GCN-NEXT: global_store_dwordx4 v64, v[20:23], s[2:3] offset:208 +; GCN-NEXT: global_store_dwordx4 v64, v[24:27], s[2:3] offset:224 +; GCN-NEXT: global_store_dwordx4 v64, v[44:47], s[2:3] +; GCN-NEXT: global_store_dwordx4 v64, v[48:51], s[2:3] offset:16 +; GCN-NEXT: global_store_dwordx4 v64, v[52:55], s[2:3] offset:32 +; GCN-NEXT: global_store_dwordx4 v64, v[56:59], s[2:3] offset:48 +; GCN-NEXT: global_store_dwordx4 v64, v[28:31], s[2:3] offset:240 +; GCN-NEXT: global_store_dwordx4 v64, v[60:63], s[2:3] offset:64 +; GCN-NEXT: global_store_dwordx4 v64, v[32:35], s[2:3] offset:80 +; GCN-NEXT: global_store_dwordx4 v64, v[36:39], s[2:3] offset:96 +; GCN-NEXT: global_store_dwordx4 v64, v[40:43], s[2:3] offset:112 ; GCN-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.in = getelementptr <64 x i32>, <64 x i32> addrspace(1)* %ptr.in, i32 %id diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-trunc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-trunc.mir index a9b9464944ab2..9e4d9edd9e52c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-trunc.mir +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-trunc.mir @@ -381,6 +381,12 @@ regBankSelected: true body: | bb.0: liveins: $sgpr0, $sgpr1 + ; GCN-LABEL: name: trunc_sgpr_s32_to_s1_use + ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GCN: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GCN: $scc = COPY [[COPY]] + ; GCN: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[COPY]], [[COPY1]], implicit $scc + ; GCN: S_ENDPGM 0, implicit [[S_CSELECT_B32_]] %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s32) = COPY $sgpr1 %2:sgpr(s1) = G_TRUNC %0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-sendmsg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-sendmsg.ll index 0934375dc8034..6c6a6680d60f1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-sendmsg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-sendmsg.ll @@ -9,7 +9,7 @@ define amdgpu_ps void @test_sendmsg(i32 inreg %m0) { ; CHECK: liveins: $sgpr0 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.sendmsg), 12, [[COPY]](s32) - ; CHECK: S_ENDPGM + ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.s.sendmsg(i32 12, i32 %m0) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-memory-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-memory-intrinsics.ll new file mode 100644 index 0000000000000..f0e79511d9597 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-memory-intrinsics.ll @@ -0,0 +1,288 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -global-isel -march=amdgcn -O0 -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s + +; Size operand should be the minimum of the two pointer sizes. 
+ +define void @test_memcpy_p1_p3_i64(i8 addrspace(1)* %dst, i8 addrspace(3)* %src) { + ; CHECK-LABEL: name: test_memcpy_p1_p3_i64 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:_(p3) = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 256 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64) + ; CHECK: G_MEMCPY [[MV]](p1), [[COPY2]](p3), [[TRUNC]](s32), 0 :: (store 1 into %ir.dst, addrspace 1), (load 1 from %ir.src, addrspace 3) + ; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]] + ; CHECK: S_SETPC_B64_return [[COPY4]] + call void @llvm.memcpy.p1i8.p3i8.i64(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i64 256, i1 false) + ret void +} + +define void @test_memcpy_p1_p3_i32(i8 addrspace(1)* %dst, i8 addrspace(3)* %src) { + ; CHECK-LABEL: name: test_memcpy_p1_p3_i32 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:_(p3) = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 256 + ; CHECK: G_MEMCPY [[MV]](p1), [[COPY2]](p3), [[C]](s32), 0 :: (store 1 into %ir.dst, addrspace 1), (load 1 from %ir.src, addrspace 3) + ; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]] + ; CHECK: S_SETPC_B64_return [[COPY4]] + call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 256, i1 false) + ret void +} + +define void @test_memcpy_p1_p3_i16(i8 addrspace(1)* %dst, i8 addrspace(3)* %src) { + ; CHECK-LABEL: name: 
test_memcpy_p1_p3_i16 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:_(p3) = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 256 + ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[C]](s16) + ; CHECK: G_MEMCPY [[MV]](p1), [[COPY2]](p3), [[ZEXT]](s32), 0 :: (store 1 into %ir.dst, addrspace 1), (load 1 from %ir.src, addrspace 3) + ; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]] + ; CHECK: S_SETPC_B64_return [[COPY4]] + call void @llvm.memcpy.p1i8.p3i8.i16(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i16 256, i1 false) + ret void +} + +define void @test_memcpy_p3_p1_i64(i8 addrspace(3)* %dst, i8 addrspace(1)* %src) { + ; CHECK-LABEL: name: test_memcpy_p3_p1_i64 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 256 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64) + ; CHECK: G_MEMCPY [[COPY]](p3), [[MV]](p1), [[TRUNC]](s32), 0 :: (store 1 into %ir.dst, addrspace 3), (load 1 from %ir.src, addrspace 1) + ; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]] + ; CHECK: S_SETPC_B64_return [[COPY4]] + call void @llvm.memcpy.p3i8.p1i8.i64(i8 addrspace(3)* %dst, i8 addrspace(1)* %src, i64 256, i1 false) + ret void +} + +define void @test_memcpy_p3_p1_i32(i8 addrspace(3)* %dst, i8 addrspace(1)* %src) { + ; CHECK-LABEL: name: test_memcpy_p3_p1_i32 + ; CHECK: bb.1 (%ir-block.0): + ; 
CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 256 + ; CHECK: G_MEMCPY [[COPY]](p3), [[MV]](p1), [[C]](s32), 0 :: (store 1 into %ir.dst, addrspace 3), (load 1 from %ir.src, addrspace 1) + ; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]] + ; CHECK: S_SETPC_B64_return [[COPY4]] + call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %dst, i8 addrspace(1)* %src, i32 256, i1 false) + ret void +} + +define void @test_memcpy_p3_p1_i16(i8 addrspace(3)* %dst, i8 addrspace(1)* %src) { + ; CHECK-LABEL: name: test_memcpy_p3_p1_i16 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 256 + ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[C]](s16) + ; CHECK: G_MEMCPY [[COPY]](p3), [[MV]](p1), [[ZEXT]](s32), 0 :: (store 1 into %ir.dst, addrspace 3), (load 1 from %ir.src, addrspace 1) + ; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]] + ; CHECK: S_SETPC_B64_return [[COPY4]] + call void @llvm.memcpy.p3i8.p1i8.i16(i8 addrspace(3)* %dst, i8 addrspace(1)* %src, i16 256, i1 false) + ret void +} + +define void @test_memmove_p1_p3_i64(i8 addrspace(1)* %dst, i8 addrspace(3)* %src) { + ; CHECK-LABEL: name: test_memmove_p1_p3_i64 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: 
[[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:_(p3) = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 256 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64) + ; CHECK: G_MEMMOVE [[MV]](p1), [[COPY2]](p3), [[TRUNC]](s32), 0 :: (store 1 into %ir.dst, addrspace 1), (load 1 from %ir.src, addrspace 3) + ; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]] + ; CHECK: S_SETPC_B64_return [[COPY4]] + call void @llvm.memmove.p1i8.p3i8.i64(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i64 256, i1 false) + ret void +} + +define void @test_memmove_p1_p3_i32(i8 addrspace(1)* %dst, i8 addrspace(3)* %src) { + ; CHECK-LABEL: name: test_memmove_p1_p3_i32 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:_(p3) = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 256 + ; CHECK: G_MEMMOVE [[MV]](p1), [[COPY2]](p3), [[C]](s32), 0 :: (store 1 into %ir.dst, addrspace 1), (load 1 from %ir.src, addrspace 3) + ; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]] + ; CHECK: S_SETPC_B64_return [[COPY4]] + call void @llvm.memmove.p1i8.p3i8.i32(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 256, i1 false) + ret void +} + +define void @test_memmove_p1_p3_i16(i8 addrspace(1)* %dst, i8 addrspace(3)* %src) { + ; CHECK-LABEL: name: test_memmove_p1_p3_i16 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:_(p3) = COPY $vgpr2 + ; CHECK: 
[[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 256 + ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[C]](s16) + ; CHECK: G_MEMMOVE [[MV]](p1), [[COPY2]](p3), [[ZEXT]](s32), 0 :: (store 1 into %ir.dst, addrspace 1), (load 1 from %ir.src, addrspace 3) + ; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]] + ; CHECK: S_SETPC_B64_return [[COPY4]] + call void @llvm.memmove.p1i8.p3i8.i16(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i16 256, i1 false) + ret void +} + +define void @test_memset_p1_i64(i8 addrspace(1)* %dst, i8 %val) { + ; CHECK-LABEL: name: test_memset_p1_i64 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY2]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 256 + ; CHECK: G_MEMSET [[MV]](p1), [[TRUNC]](s8), [[C]](s64), 0 :: (store 1 into %ir.dst, addrspace 1) + ; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]] + ; CHECK: S_SETPC_B64_return [[COPY4]] + call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 256, i1 false) + ret void +} + +define void @test_memset_p1_i32(i8 addrspace(1)* %dst, i8 %val) { + ; CHECK-LABEL: name: test_memset_p1_i32 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY2]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[MV:%[0-9]+]]:_(p1) = 
G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 256 + ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[C]](s32) + ; CHECK: G_MEMSET [[MV]](p1), [[TRUNC]](s8), [[ZEXT]](s64), 0 :: (store 1 into %ir.dst, addrspace 1) + ; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]] + ; CHECK: S_SETPC_B64_return [[COPY4]] + call void @llvm.memset.p1i8.i32(i8 addrspace(1)* %dst, i8 %val, i32 256, i1 false) + ret void +} + +define void @test_memset_p1_i16(i8 addrspace(1)* %dst, i8 %val) { + ; CHECK-LABEL: name: test_memset_p1_i16 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY2]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 256 + ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[C]](s16) + ; CHECK: G_MEMSET [[MV]](p1), [[TRUNC]](s8), [[ZEXT]](s64), 0 :: (store 1 into %ir.dst, addrspace 1) + ; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]] + ; CHECK: S_SETPC_B64_return [[COPY4]] + call void @llvm.memset.p1i8.i16(i8 addrspace(1)* %dst, i8 %val, i16 256, i1 false) + ret void +} + +define void @test_memset_p3_i64(i8 addrspace(3)* %dst, i8 %val) { + ; CHECK-LABEL: name: test_memset_p3_i64 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 256 + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64) + ; CHECK: G_MEMSET [[COPY]](p3), [[TRUNC]](s8), [[TRUNC1]](s32), 0 
:: (store 1 into %ir.dst, addrspace 3) + ; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]] + ; CHECK: S_SETPC_B64_return [[COPY3]] + call void @llvm.memset.p3i8.i64(i8 addrspace(3)* %dst, i8 %val, i64 256, i1 false) + ret void +} + +define void @test_memset_p3_i32(i8 addrspace(3)* %dst, i8 %val) { + ; CHECK-LABEL: name: test_memset_p3_i32 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 256 + ; CHECK: G_MEMSET [[COPY]](p3), [[TRUNC]](s8), [[C]](s32), 0 :: (store 1 into %ir.dst, addrspace 3) + ; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]] + ; CHECK: S_SETPC_B64_return [[COPY3]] + call void @llvm.memset.p3i8.i32(i8 addrspace(3)* %dst, i8 %val, i32 256, i1 false) + ret void +} + +define void @test_memset_p3_i16(i8 addrspace(3)* %dst, i8 %val) { + ; CHECK-LABEL: name: test_memset_p3_i16 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 256 + ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[C]](s16) + ; CHECK: G_MEMSET [[COPY]](p3), [[TRUNC]](s8), [[ZEXT]](s32), 0 :: (store 1 into %ir.dst, addrspace 3) + ; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]] + ; CHECK: S_SETPC_B64_return [[COPY3]] + call void @llvm.memset.p3i8.i16(i8 addrspace(3)* %dst, i8 %val, i16 256, i1 false) + ret void +} + +declare void @llvm.memcpy.p1i8.p3i8.i64(i8 addrspace(1)* noalias nocapture writeonly, i8 addrspace(3)* noalias nocapture readonly, i64, i1 immarg) #0 +declare void 
@llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* noalias nocapture writeonly, i8 addrspace(3)* noalias nocapture readonly, i32, i1 immarg) #0 +declare void @llvm.memcpy.p1i8.p3i8.i16(i8 addrspace(1)* noalias nocapture writeonly, i8 addrspace(3)* noalias nocapture readonly, i16, i1 immarg) #0 +declare void @llvm.memcpy.p3i8.p1i8.i64(i8 addrspace(3)* noalias nocapture writeonly, i8 addrspace(1)* noalias nocapture readonly, i64, i1 immarg) #0 +declare void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* noalias nocapture writeonly, i8 addrspace(1)* noalias nocapture readonly, i32, i1 immarg) #0 +declare void @llvm.memcpy.p3i8.p1i8.i16(i8 addrspace(3)* noalias nocapture writeonly, i8 addrspace(1)* noalias nocapture readonly, i16, i1 immarg) #0 +declare void @llvm.memmove.p1i8.p3i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i64, i1 immarg) #0 +declare void @llvm.memmove.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i1 immarg) #0 +declare void @llvm.memmove.p1i8.p3i8.i16(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i16, i1 immarg) #0 +declare void @llvm.memset.p1i8.i64(i8 addrspace(1)* nocapture writeonly, i8, i64, i1 immarg) #1 +declare void @llvm.memset.p1i8.i32(i8 addrspace(1)* nocapture writeonly, i8, i32, i1 immarg) #1 +declare void @llvm.memset.p1i8.i16(i8 addrspace(1)* nocapture writeonly, i8, i16, i1 immarg) #1 +declare void @llvm.memset.p3i8.i64(i8 addrspace(3)* nocapture writeonly, i8, i64, i1 immarg) #1 +declare void @llvm.memset.p3i8.i32(i8 addrspace(3)* nocapture writeonly, i8, i32, i1 immarg) #1 +declare void @llvm.memset.p3i8.i16(i8 addrspace(3)* nocapture writeonly, i8, i16, i1 immarg) #1 + +attributes #0 = { argmemonly nounwind willreturn } +attributes #1 = { argmemonly nounwind willreturn writeonly } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sat.ll index 5bb4b633a841b..18e10a1fc3909 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sat.ll @@ -326,7 +326,7 @@ define i16 @ushlsat_i16(i16 %lhs, i16 %rhs) { ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) ; CHECK: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: [[USHLSAT:%[0-9]+]]:_(s16) = G_USHLSAT [[TRUNC]], [[TRUNC1]] + ; CHECK: [[USHLSAT:%[0-9]+]]:_(s16) = G_USHLSAT [[TRUNC]], [[TRUNC1]](s16) ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[USHLSAT]](s16) ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32) ; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]] @@ -343,7 +343,7 @@ define i32 @ushlsat_i32(i32 %lhs, i32 %rhs) { ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: [[USHLSAT:%[0-9]+]]:_(s32) = G_USHLSAT [[COPY]], [[COPY1]] + ; CHECK: [[USHLSAT:%[0-9]+]]:_(s32) = G_USHLSAT [[COPY]], [[COPY1]](s32) ; CHECK: $vgpr0 = COPY [[USHLSAT]](s32) ; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]] ; CHECK: S_SETPC_B64_return [[COPY3]], implicit $vgpr0 @@ -363,7 +363,7 @@ define i64 @ushlsat_i64(i64 %lhs, i64 %rhs) { ; CHECK: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) ; CHECK: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; CHECK: [[USHLSAT:%[0-9]+]]:_(s64) = G_USHLSAT [[MV]], [[MV1]] + ; CHECK: [[USHLSAT:%[0-9]+]]:_(s64) = G_USHLSAT [[MV]], [[MV1]](s64) ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[USHLSAT]](s64) ; CHECK: $vgpr0 = COPY [[UV]](s32) ; CHECK: $vgpr1 = COPY [[UV1]](s32) @@ -385,7 +385,7 @@ define <2 x i32> @ushlsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; CHECK: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32) ; CHECK: 
[[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32) - ; CHECK: [[USHLSAT:%[0-9]+]]:_(<2 x s32>) = G_USHLSAT [[BUILD_VECTOR]], [[BUILD_VECTOR1]] + ; CHECK: [[USHLSAT:%[0-9]+]]:_(<2 x s32>) = G_USHLSAT [[BUILD_VECTOR]], [[BUILD_VECTOR1]](<2 x s32>) ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[USHLSAT]](<2 x s32>) ; CHECK: $vgpr0 = COPY [[UV]](s32) ; CHECK: $vgpr1 = COPY [[UV1]](s32) @@ -405,7 +405,7 @@ define i16 @sshlsat_i16(i16 %lhs, i16 %rhs) { ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) ; CHECK: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: [[SSHLSAT:%[0-9]+]]:_(s16) = G_SSHLSAT [[TRUNC]], [[TRUNC1]] + ; CHECK: [[SSHLSAT:%[0-9]+]]:_(s16) = G_SSHLSAT [[TRUNC]], [[TRUNC1]](s16) ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SSHLSAT]](s16) ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32) ; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]] @@ -422,7 +422,7 @@ define i32 @sshlsat_i32(i32 %lhs, i32 %rhs) { ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: [[SSHLSAT:%[0-9]+]]:_(s32) = G_SSHLSAT [[COPY]], [[COPY1]] + ; CHECK: [[SSHLSAT:%[0-9]+]]:_(s32) = G_SSHLSAT [[COPY]], [[COPY1]](s32) ; CHECK: $vgpr0 = COPY [[SSHLSAT]](s32) ; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]] ; CHECK: S_SETPC_B64_return [[COPY3]], implicit $vgpr0 @@ -442,7 +442,7 @@ define i64 @sshlsat_i64(i64 %lhs, i64 %rhs) { ; CHECK: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) ; CHECK: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; CHECK: [[SSHLSAT:%[0-9]+]]:_(s64) = G_SSHLSAT [[MV]], [[MV1]] + ; CHECK: [[SSHLSAT:%[0-9]+]]:_(s64) = G_SSHLSAT [[MV]], [[MV1]](s64) ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES 
[[SSHLSAT]](s64) ; CHECK: $vgpr0 = COPY [[UV]](s32) ; CHECK: $vgpr1 = COPY [[UV1]](s32) @@ -464,7 +464,7 @@ define <2 x i32> @sshlsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; CHECK: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32) ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32) - ; CHECK: [[SSHLSAT:%[0-9]+]]:_(<2 x s32>) = G_SSHLSAT [[BUILD_VECTOR]], [[BUILD_VECTOR1]] + ; CHECK: [[SSHLSAT:%[0-9]+]]:_(<2 x s32>) = G_SSHLSAT [[BUILD_VECTOR]], [[BUILD_VECTOR1]](<2 x s32>) ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SSHLSAT]](<2 x s32>) ; CHECK: $vgpr0 = COPY [[UV]](s32) ; CHECK: $vgpr1 = COPY [[UV1]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.if-invalid.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.if-invalid.mir index 5b3d79141b877..b7e52cadd8cd1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.if-invalid.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.if-invalid.mir @@ -5,7 +5,9 @@ # ERR: remark: :0:0: unable to legalize instruction: %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2:_(s1) (in function: brcond_si_if_different_block) # ERR-NEXT: remark: :0:0: unable to legalize instruction: %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2:_(s1) (in function: si_if_not_brcond_user) # ERR-NEXT: remark: :0:0: unable to legalize instruction: %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2:_(s1) (in function: si_if_multi_user) -# ERR-NEXT: remark: :0:0: unable to legalize instruction: %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2:_(s1) (in function: si_if_not_condition) +# ERR-NEXT: remark: :0:0: unable to legalize instruction: %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), 
%2:_(s1) (in function: brcond_si_if_xor_0) +# ERR-NEXT: remark: :0:0: unable to legalize instruction: %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2:_(s1) (in function: brcond_si_if_or_neg1) +# ERR-NEXT: remark: :0:0: unable to legalize instruction: %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2:_(s1) (in function: brcond_si_if_negated_multi_use) --- @@ -55,10 +57,63 @@ body: | ... +# Make sure we only match G_XOR (if), -1 --- -name: si_if_not_condition +name: brcond_si_if_xor_0 body: | bb.0: + successors: %bb.1 + liveins: $vgpr0, $vgpr1 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s1) = G_ICMP intpred(ne), %0, %1 + %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2 + %5:_(s1) = G_CONSTANT i1 false + %6:_(s1) = G_XOR %3, %5 + G_BRCOND %6, %bb.2 + G_BR %bb.3 + + bb.1: + S_NOP 0 + + bb.2: + S_NOP 1 + + bb.3: + S_NOP 2 +... + +# Make sure we only match G_XOR (if), -1 +--- +name: brcond_si_if_or_neg1 +body: | + bb.0: + successors: %bb.1 + liveins: $vgpr0, $vgpr1 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s1) = G_ICMP intpred(ne), %0, %1 + %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2 + %5:_(s1) = G_CONSTANT i1 true + %6:_(s1) = G_OR %3, %5 + G_BRCOND %6, %bb.2 + G_BR %bb.3 + + bb.1: + S_NOP 0 + + bb.2: + S_NOP 1 + + bb.3: + S_NOP 2 +... + +--- +name: brcond_si_if_negated_multi_use +body: | + bb.0: + successors: %bb.1 liveins: $vgpr0, $vgpr1 %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 @@ -66,8 +121,16 @@ body: | %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2 %5:_(s1) = G_CONSTANT i1 true %6:_(s1) = G_XOR %3, %5 - G_BRCOND %6, %bb.1 + S_NOP 0, implicit %6 + G_BRCOND %6, %bb.2 + G_BR %bb.3 bb.1: + S_NOP 0 + + bb.2: + S_NOP 1 + bb.3: + S_NOP 2 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir index c5a357732fc32..38f959145048d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir @@ -435,28 +435,67 @@ body: | name: test_and_v3s16 body: | bb.0: - + liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5 ; CHECK-LABEL: name: test_and_v3s16 + ; CHECK: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; CHECK: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) - ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) - ; CHECK: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) - ; CHECK: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) - ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 - ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 + ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 ; CHECK: [[AND:%[0-9]+]]:_(<4 x s16>) = G_AND [[INSERT]], [[INSERT1]] - ; 
CHECK: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[AND]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) - ; CHECK: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) - ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 - ; CHECK: $vgpr0_vgpr1 = COPY [[INSERT2]](<4 x s16>) - %0:_(<3 x s16>) = G_IMPLICIT_DEF - %1:_(<3 x s16>) = G_IMPLICIT_DEF - %2:_(<3 x s16>) = G_AND %0, %1 - %4:_(<4 x s16>) = G_IMPLICIT_DEF - %5:_(<4 x s16>) = G_INSERT %4, %2, 0 - $vgpr0_vgpr1 = COPY %5 + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[AND]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; CHECK: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; CHECK: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; CHECK: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF1]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; CHECK: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV4]](<3 x s16>), 0 + ; CHECK: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; CHECK: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV8]](<3 x s16>), 0 + ; CHECK: [[UV14:%[0-9]+]]:_(<2 x s16>), 
[[UV15:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT3]](<4 x s16>) + ; CHECK: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV14]](<2 x s16>) + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; CHECK: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV15]](<2 x s16>) + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[SHL]] + ; CHECK: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND4]], [[C]](s32) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND3]], [[SHL1]] + ; CHECK: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; CHECK: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND5]], [[SHL2]] + ; CHECK: [[BITCAST6:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; CHECK: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[BITCAST6]](<2 x s16>) + ; CHECK: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) + %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + %1:_(<6 x s16>) = COPY 
$vgpr3_vgpr4_vgpr5 + %2:_(<3 x s16>), %3:_(<3 x s16>) = G_UNMERGE_VALUES %0 + %4:_(<3 x s16>), %5:_(<3 x s16>) = G_UNMERGE_VALUES %1 + %6:_(<3 x s16>) = G_AND %2, %4 + %7:_(<3 x s16>) = G_IMPLICIT_DEF + %8:_(<6 x s16>) = G_CONCAT_VECTORS %6, %7 + $vgpr0_vgpr1_vgpr2 = COPY %8 + ... --- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ashr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ashr.mir index 697ab25b4b1f4..2f8bc0832b17c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ashr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ashr.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=SI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=VI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=SI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=VI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s --- name: test_ashr_s32_s32 @@ -750,26 +750,25 @@ body: | name: test_ashr_v3s16_v3s16 body: | bb.0: - liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - + liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5 ; SI-LABEL: name: test_ashr_v3s16_v3s16 - ; SI: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 - ; SI: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 - ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY]](<4 x s16>), 0 - ; SI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY1]](<4 x s16>), 0 + ; SI: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; 
SI: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; SI: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) ; SI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; SI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; SI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT1]](<3 x s16>), 0 - ; SI: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; SI: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; SI: 
[[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) @@ -802,32 +801,68 @@ body: | ; SI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; SI: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[DEF1]](<2 x s16>) - ; SI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 - ; SI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT2]](<3 x s16>), 0 - ; SI: $vgpr0_vgpr1 = COPY [[INSERT2]](<4 x s16>) + ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 + ; SI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; SI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; SI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; SI: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; SI: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; SI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; SI: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; SI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; SI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV8]](<3 x s16>), 0 + ; SI: [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT3]](<4 x s16>) + ; SI: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV14]](<2 x s16>) + ; SI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32) + ; SI: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[UV15]](<2 x s16>) + ; SI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST9]], [[C]](s32) + ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) 
+ ; SI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C1]] + ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]] + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL2]] + ; SI: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; SI: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C1]] + ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST8]](s32) + ; SI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C1]] + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL3]] + ; SI: [[BITCAST11:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; SI: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C1]] + ; SI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST9]](s32) + ; SI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C1]] + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL4]] + ; SI: [[BITCAST12:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; SI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST10]](<2 x s16>), [[BITCAST11]](<2 x s16>), [[BITCAST12]](<2 x s16>) + ; SI: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) ; VI-LABEL: name: test_ashr_v3s16_v3s16 - ; VI: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 - ; VI: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 - ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY]](<4 x s16>), 0 - ; VI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY1]](<4 x s16>), 0 + ; VI: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; VI: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; VI: [[UV2:%[0-9]+]]:_(<3 x s16>), 
[[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; VI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; VI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT1]](<3 x s16>), 0 - ; VI: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; VI: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) - ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC 
[[BITCAST3]](s32) ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; VI: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[TRUNC]], [[TRUNC3]](s16) @@ -845,46 +880,109 @@ body: | ; VI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; VI: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[DEF1]](<2 x s16>) - ; VI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 - ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT2]](<3 x s16>), 0 - ; VI: $vgpr0_vgpr1 = COPY [[INSERT2]](<4 x s16>) + ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 + ; VI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; VI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; VI: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; VI: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; VI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; VI: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; VI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; VI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV8]](<3 x s16>), 0 + ; VI: [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT3]](<4 x s16>) + ; VI: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV14]](<2 x s16>) + ; VI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32) + ; VI: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[UV15]](<2 x s16>) + ; VI: 
[[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST9]], [[C]](s32) + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL2]] + ; VI: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]] + ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST8]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C2]] + ; VI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) + ; VI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL3]] + ; VI: [[BITCAST11:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C2]] + ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST9]](s32) + ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]] + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL4]] + ; VI: [[BITCAST12:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST10]](<2 x s16>), [[BITCAST11]](<2 x s16>), [[BITCAST12]](<2 x s16>) + ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) ; GFX9-LABEL: name: test_ashr_v3s16_v3s16 - ; GFX9: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 - ; GFX9: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 - ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY]](<4 x s16>), 0 - ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY1]](<4 x s16>), 0 + ; GFX9: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(<6 x s16>) = 
COPY $vgpr3_vgpr4_vgpr5 + ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; GFX9: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; GFX9: [[EXTRACT2:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 0 - ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; GFX9: [[EXTRACT3:%[0-9]+]]:_(s16) = G_EXTRACT [[INSERT1]](<4 x s16>), 32 - ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT1]](<3 x s16>), 0 - ; GFX9: [[EXTRACT4:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT2]](<4 x s16>), 0 - ; GFX9: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT1]](<3 x s16>), 0 - ; GFX9: [[EXTRACT5:%[0-9]+]]:_(s16) = G_EXTRACT [[INSERT3]](<4 x s16>), 32 - ; GFX9: [[ASHR:%[0-9]+]]:_(<2 x s16>) = G_ASHR [[EXTRACT2]], [[EXTRACT4]](<2 x s16>) - ; GFX9: [[ASHR1:%[0-9]+]]:_(s16) = G_ASHR [[EXTRACT3]], [[EXTRACT5]](s16) + ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; GFX9: [[EXTRACT:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 0 + ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; GFX9: [[EXTRACT1:%[0-9]+]]:_(s16) = G_EXTRACT [[INSERT1]](<4 x s16>), 32 + ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; GFX9: [[EXTRACT2:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT2]](<4 x s16>), 0 + ; GFX9: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; GFX9: [[EXTRACT3:%[0-9]+]]:_(s16) = G_EXTRACT [[INSERT3]](<4 x s16>), 32 + ; GFX9: [[ASHR:%[0-9]+]]:_(<2 x s16>) = G_ASHR [[EXTRACT]], [[EXTRACT2]](<2 x s16>) + ; GFX9: [[ASHR1:%[0-9]+]]:_(s16) = G_ASHR [[EXTRACT1]], [[EXTRACT3]](s16) ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = 
G_IMPLICIT_DEF ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF1]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) - ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) - ; GFX9: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; GFX9: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; GFX9: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV4]](<3 x s16>), 0 ; GFX9: [[INSERT5:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT4]], [[ASHR]](<2 x s16>), 0 ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT5]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) - ; GFX9: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) - ; GFX9: [[INSERT6:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV4]](<3 x s16>), 0 + ; GFX9: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX9: [[INSERT6:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV8]](<3 x s16>), 0 ; GFX9: [[INSERT7:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT6]], [[ASHR1]](s16), 32 ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT7]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) - ; GFX9: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) - ; GFX9: [[INSERT8:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV8]](<3 x s16>), 0 - ; GFX9: $vgpr0_vgpr1 = COPY [[INSERT8]](<4 x s16>) - %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 - 
%1:_(<4 x s16>) = COPY $vgpr2_vgpr3 - %2:_(<3 x s16>) = G_EXTRACT %0, 0 - %3:_(<3 x s16>) = G_EXTRACT %1, 0 - %4:_(<3 x s16>) = G_ASHR %2, %3 - %5:_(<4 x s16>) = G_IMPLICIT_DEF - %6:_(<4 x s16>) = G_INSERT %5, %4, 0 - $vgpr0_vgpr1 = COPY %6 + ; GFX9: [[UV12:%[0-9]+]]:_(<3 x s16>), [[UV13:%[0-9]+]]:_(<3 x s16>), [[UV14:%[0-9]+]]:_(<3 x s16>), [[UV15:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; GFX9: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[CONCAT_VECTORS3:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; GFX9: [[UV16:%[0-9]+]]:_(<3 x s16>), [[UV17:%[0-9]+]]:_(<3 x s16>), [[UV18:%[0-9]+]]:_(<3 x s16>), [[UV19:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<12 x s16>) + ; GFX9: [[INSERT8:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV12]](<3 x s16>), 0 + ; GFX9: [[UV20:%[0-9]+]]:_(<2 x s16>), [[UV21:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT8]](<4 x s16>) + ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV20]](<2 x s16>) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV21]](<2 x s16>) + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[INSERT9:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV16]](<3 x s16>), 0 + ; GFX9: [[UV22:%[0-9]+]]:_(<2 x s16>), [[UV23:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT9]](<4 x s16>) + ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV22]](<2 x s16>) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV23]](<2 x s16>) + ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC 
[[COPY2]](s32), [[COPY3]](s32) + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[COPY5]](s32) + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[CONCAT_VECTORS4:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS4]](<6 x s16>) + %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + %1:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + %2:_(<3 x s16>), %3:_(<3 x s16>) = G_UNMERGE_VALUES %0 + %4:_(<3 x s16>), %5:_(<3 x s16>) = G_UNMERGE_VALUES %1 + %6:_(<3 x s16>) = G_ASHR %2, %4 + %7:_(<3 x s16>) = G_IMPLICIT_DEF + %8:_(<6 x s16>) = G_CONCAT_VECTORS %6, %7 + $vgpr0_vgpr1_vgpr2 = COPY %8 + ... 
--- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-bitcast.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-bitcast.mir index de65d83f11597..79c78d29e6729 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-bitcast.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-bitcast.mir @@ -1557,12 +1557,49 @@ body: | ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[DEF]](<2 x s16>) ; CHECK: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 - ; CHECK: S_ENDPGM 0, implicit [[EXTRACT]](<3 x s16>) + ; CHECK: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; CHECK: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; CHECK: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C2]](s32) + ; CHECK: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C2]](s32) + ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; CHECK: [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; CHECK: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV14]](<2 x s16>) + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C2]](s32) + ; CHECK: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV15]](<2 x s16>) + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C2]](s32) + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; CHECK: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C4]] + ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C4]] + ; 
CHECK: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C2]](s32) + ; CHECK: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL5]] + ; CHECK: [[BITCAST6:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32) + ; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; CHECK: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C4]] + ; CHECK: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; CHECK: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C4]] + ; CHECK: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C2]](s32) + ; CHECK: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL6]] + ; CHECK: [[BITCAST7:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR6]](s32) + ; CHECK: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; CHECK: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C4]] + ; CHECK: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; CHECK: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C4]] + ; CHECK: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C2]](s32) + ; CHECK: [[OR7:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL7]] + ; CHECK: [[BITCAST8:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR7]](s32) + ; CHECK: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST6]](<2 x s16>), [[BITCAST7]](<2 x s16>), [[BITCAST8]](<2 x s16>) + ; CHECK: S_ENDPGM 0, implicit [[CONCAT_VECTORS1]](<6 x s16>) %0:_(<6 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 %1:_(<6 x s8>) = G_TRUNC %0 %2:_(<6 x s8>) = G_ADD %1, %1 %3:_(<3 x s16>) = G_BITCAST %2 - S_ENDPGM 0, implicit %3 + %4:_(<6 x s16>) = G_CONCAT_VECTORS %3, %3 + S_ENDPGM 0, implicit %4 ... --- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-brcond.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-brcond.mir index 068ad6780a427..f5c7adcea576c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-brcond.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-brcond.mir @@ -406,3 +406,216 @@ body: | bb.2: S_NOP 0 ... 
+ +--- +name: brcond_si_if_negated +body: | + ; WAVE64-LABEL: name: brcond_si_if_negated + ; WAVE64: bb.0: + ; WAVE64: successors: %bb.1(0x80000000) + ; WAVE64: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; WAVE64: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; WAVE64: [[ICMP:%[0-9]+]]:sreg_64_xexec(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]] + ; WAVE64: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true + ; WAVE64: [[SI_IF:%[0-9]+]]:sreg_64_xexec(s64) = SI_IF [[ICMP]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec + ; WAVE64: G_BR %bb.1 + ; WAVE64: bb.1: + ; WAVE64: successors: %bb.2(0x80000000) + ; WAVE64: S_NOP 0 + ; WAVE64: bb.2: + ; WAVE64: S_NOP 1 + ; WAVE32-LABEL: name: brcond_si_if_negated + ; WAVE32: bb.0: + ; WAVE32: successors: %bb.1(0x80000000) + ; WAVE32: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; WAVE32: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; WAVE32: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]] + ; WAVE32: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true + ; WAVE32: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s64) = SI_IF [[ICMP]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec + ; WAVE32: G_BR %bb.1 + ; WAVE32: bb.1: + ; WAVE32: successors: %bb.2(0x80000000) + ; WAVE32: S_NOP 0 + ; WAVE32: bb.2: + ; WAVE32: S_NOP 1 + bb.0: + successors: %bb.1 + liveins: $vgpr0, $vgpr1 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s1) = G_ICMP intpred(ne), %0, %1 + %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2 + %5:_(s1) = G_CONSTANT i1 true + %6:_(s1) = G_XOR %3, %5 + G_BRCOND %6, %bb.2 + + bb.1: + S_NOP 0 + + bb.2: + S_NOP 1 +... 
+ +--- +name: brcond_si_if_br_negated +body: | + ; WAVE64-LABEL: name: brcond_si_if_br_negated + ; WAVE64: bb.0: + ; WAVE64: successors: %bb.1(0x80000000) + ; WAVE64: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; WAVE64: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; WAVE64: [[ICMP:%[0-9]+]]:sreg_64_xexec(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]] + ; WAVE64: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true + ; WAVE64: [[SI_IF:%[0-9]+]]:sreg_64_xexec(s64) = SI_IF [[ICMP]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec + ; WAVE64: G_BR %bb.3 + ; WAVE64: bb.1: + ; WAVE64: successors: %bb.2(0x80000000) + ; WAVE64: S_NOP 0 + ; WAVE64: bb.2: + ; WAVE64: successors: %bb.3(0x80000000) + ; WAVE64: S_NOP 1 + ; WAVE64: bb.3: + ; WAVE64: S_NOP 2 + ; WAVE32-LABEL: name: brcond_si_if_br_negated + ; WAVE32: bb.0: + ; WAVE32: successors: %bb.1(0x80000000) + ; WAVE32: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; WAVE32: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; WAVE32: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]] + ; WAVE32: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true + ; WAVE32: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s64) = SI_IF [[ICMP]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec + ; WAVE32: G_BR %bb.3 + ; WAVE32: bb.1: + ; WAVE32: successors: %bb.2(0x80000000) + ; WAVE32: S_NOP 0 + ; WAVE32: bb.2: + ; WAVE32: successors: %bb.3(0x80000000) + ; WAVE32: S_NOP 1 + ; WAVE32: bb.3: + ; WAVE32: S_NOP 2 + bb.0: + successors: %bb.1 + liveins: $vgpr0, $vgpr1 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s1) = G_ICMP intpred(ne), %0, %1 + %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2 + %5:_(s1) = G_CONSTANT i1 true + %6:_(s1) = G_XOR %3, %5 + G_BRCOND %6, %bb.2 + G_BR %bb.3 + + bb.1: + S_NOP 0 + + bb.2: + S_NOP 1 + + bb.3: + S_NOP 2 +... 
+ +--- +name: brcond_si_loop_brcond_negated +tracksRegLiveness: true +body: | + ; WAVE64-LABEL: name: brcond_si_loop_brcond_negated + ; WAVE64: bb.0: + ; WAVE64: successors: %bb.1(0x80000000) + ; WAVE64: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1 + ; WAVE64: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; WAVE64: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; WAVE64: [[COPY2:%[0-9]+]]:sreg_64_xexec(s64) = COPY $sgpr0_sgpr1 + ; WAVE64: bb.1: + ; WAVE64: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; WAVE64: S_NOP 0 + ; WAVE64: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true + ; WAVE64: SI_LOOP [[COPY2]](s64), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec + ; WAVE64: G_BR %bb.2 + ; WAVE64: bb.2: + ; WAVE64: S_NOP 0 + ; WAVE32-LABEL: name: brcond_si_loop_brcond_negated + ; WAVE32: bb.0: + ; WAVE32: successors: %bb.1(0x80000000) + ; WAVE32: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1 + ; WAVE32: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; WAVE32: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; WAVE32: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec(s64) = COPY $sgpr0_sgpr1 + ; WAVE32: bb.1: + ; WAVE32: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; WAVE32: S_NOP 0 + ; WAVE32: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true + ; WAVE32: SI_LOOP [[COPY2]](s64), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec + ; WAVE32: G_BR %bb.2 + ; WAVE32: bb.2: + ; WAVE32: S_NOP 0 + bb.0: + liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s64) = COPY $sgpr0_sgpr1 + + bb.1: + successors: %bb.1, %bb.2 + S_NOP 0 + %3:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), %2 + %4:_(s1) = G_CONSTANT i1 true + %5:_(s1) = G_XOR %3, %4 + G_BRCOND %5, %bb.1 + + bb.2: + S_NOP 0 +... 
+ +--- +name: brcond_si_loop_brcond_br_negated +tracksRegLiveness: true +body: | + ; WAVE64-LABEL: name: brcond_si_loop_brcond_br_negated + ; WAVE64: bb.0: + ; WAVE64: successors: %bb.1(0x80000000) + ; WAVE64: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1 + ; WAVE64: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; WAVE64: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; WAVE64: [[COPY2:%[0-9]+]]:sreg_64_xexec(s64) = COPY $sgpr0_sgpr1 + ; WAVE64: bb.1: + ; WAVE64: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; WAVE64: S_NOP 0 + ; WAVE64: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true + ; WAVE64: SI_LOOP [[COPY2]](s64), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec + ; WAVE64: G_BR %bb.1 + ; WAVE64: bb.2: + ; WAVE64: S_NOP 0 + ; WAVE32-LABEL: name: brcond_si_loop_brcond_br_negated + ; WAVE32: bb.0: + ; WAVE32: successors: %bb.1(0x80000000) + ; WAVE32: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1 + ; WAVE32: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; WAVE32: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; WAVE32: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec(s64) = COPY $sgpr0_sgpr1 + ; WAVE32: bb.1: + ; WAVE32: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; WAVE32: S_NOP 0 + ; WAVE32: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true + ; WAVE32: SI_LOOP [[COPY2]](s64), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec + ; WAVE32: G_BR %bb.1 + ; WAVE32: bb.2: + ; WAVE32: S_NOP 0 + bb.0: + liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s64) = COPY $sgpr0_sgpr1 + + bb.1: + successors: %bb.1, %bb.2 + S_NOP 0 + %3:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), %2 + %4:_(s1) = G_CONSTANT i1 true + %5:_(s1) = G_XOR %3, %4 + G_BRCOND %5, %bb.2 + G_BR %bb.1 + + bb.2: + S_NOP 0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-build-vector.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-build-vector.s16.mir index 57bd59f6233a4..bfc4df09b275c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-build-vector.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-build-vector.s16.mir @@ -65,7 +65,42 @@ body: | ; GFX78: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; GFX78: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[DEF]](<2 x s16>) ; GFX78: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 - ; GFX78: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; GFX78: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX78: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; GFX78: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; GFX78: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX78: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C1]](s32) + ; GFX78: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX78: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32) + ; GFX78: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; GFX78: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; GFX78: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; GFX78: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C1]](s32) + ; GFX78: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; GFX78: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C1]](s32) + ; GFX78: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX78: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C]] + ; GFX78: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX78: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C]] + ; GFX78: [[SHL2:%[0-9]+]]:_(s32) = G_SHL 
[[AND4]], [[C1]](s32) + ; GFX78: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND3]], [[SHL2]] + ; GFX78: [[BITCAST6:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; GFX78: [[COPY8:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; GFX78: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C]] + ; GFX78: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; GFX78: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C]] + ; GFX78: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C1]](s32) + ; GFX78: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND5]], [[SHL3]] + ; GFX78: [[BITCAST7:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; GFX78: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX78: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]] + ; GFX78: [[COPY11:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; GFX78: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C]] + ; GFX78: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND8]], [[C1]](s32) + ; GFX78: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND7]], [[SHL4]] + ; GFX78: [[BITCAST8:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; GFX78: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST6]](<2 x s16>), [[BITCAST7]](<2 x s16>), [[BITCAST8]](<2 x s16>) + ; GFX78: S_NOP 0, implicit [[CONCAT_VECTORS1]](<6 x s16>) ; GFX9-LABEL: name: build_vector_v3s16 ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 @@ -79,7 +114,31 @@ body: | ; GFX9: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[DEF1]](<2 x s16>) ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 - ; GFX9: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; GFX9: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[EXTRACT]](<3 x s16>), 0 + ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + 
; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[EXTRACT]](<3 x s16>), 0 + ; GFX9: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY8]](s32), [[COPY9]](s32) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), [[BUILD_VECTOR_TRUNC4]](<2 x s16>) + ; GFX9: S_NOP 0, implicit [[CONCAT_VECTORS1]](<6 x s16>) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = COPY $vgpr2 @@ -87,7 +146,8 @@ body: | %4:_(s16) = G_TRUNC %1 %5:_(s16) = G_TRUNC %2 %6:_(<3 x s16>) = G_BUILD_VECTOR %3, %4, %5 - S_NOP 0, implicit %6 + %7:_(<6 x s16>) = G_CONCAT_VECTORS %6, %6 + S_NOP 0, implicit %7 ... 
--- @@ -181,7 +241,60 @@ body: | ; GFX78: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; GFX78: [[CONCAT_VECTORS:%[0-9]+]]:_(<10 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>) ; GFX78: [[EXTRACT:%[0-9]+]]:_(<5 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<10 x s16>), 0 - ; GFX78: S_NOP 0, implicit [[EXTRACT]](<5 x s16>) + ; GFX78: [[DEF1:%[0-9]+]]:_(<6 x s16>) = G_IMPLICIT_DEF + ; GFX78: [[INSERT:%[0-9]+]]:_(<6 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<5 x s16>), 0 + ; GFX78: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<6 x s16>) + ; GFX78: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX78: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32) + ; GFX78: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX78: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C1]](s32) + ; GFX78: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; GFX78: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C1]](s32) + ; GFX78: [[INSERT1:%[0-9]+]]:_(<6 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<5 x s16>), 0 + ; GFX78: [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<6 x s16>) + ; GFX78: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; GFX78: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C1]](s32) + ; GFX78: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) + ; GFX78: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C1]](s32) + ; GFX78: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) + ; GFX78: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C1]](s32) + ; GFX78: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; GFX78: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]] + ; GFX78: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX78: [[AND6:%[0-9]+]]:_(s32) = G_AND 
[[COPY11]], [[C]] + ; GFX78: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C1]](s32) + ; GFX78: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND5]], [[SHL3]] + ; GFX78: [[BITCAST9:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; GFX78: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; GFX78: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C]] + ; GFX78: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX78: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C]] + ; GFX78: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND8]], [[C1]](s32) + ; GFX78: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND7]], [[SHL4]] + ; GFX78: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; GFX78: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; GFX78: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C]] + ; GFX78: [[COPY15:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; GFX78: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C]] + ; GFX78: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C1]](s32) + ; GFX78: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND9]], [[SHL5]] + ; GFX78: [[BITCAST11:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32) + ; GFX78: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; GFX78: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C]] + ; GFX78: [[COPY17:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; GFX78: [[AND12:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C]] + ; GFX78: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND12]], [[C1]](s32) + ; GFX78: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND11]], [[SHL6]] + ; GFX78: [[BITCAST12:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR6]](s32) + ; GFX78: [[COPY18:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; GFX78: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY18]], [[C]] + ; GFX78: [[COPY19:%[0-9]+]]:_(s32) = COPY [[BITCAST8]](s32) + ; GFX78: [[AND14:%[0-9]+]]:_(s32) = G_AND [[COPY19]], [[C]] + ; GFX78: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND14]], [[C1]](s32) + ; GFX78: [[OR7:%[0-9]+]]:_(s32) = G_OR [[AND13]], [[SHL7]] + ; GFX78: [[BITCAST13:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR7]](s32) + ; GFX78: 
[[CONCAT_VECTORS1:%[0-9]+]]:_(<10 x s16>) = G_CONCAT_VECTORS [[BITCAST9]](<2 x s16>), [[BITCAST10]](<2 x s16>), [[BITCAST11]](<2 x s16>), [[BITCAST12]](<2 x s16>), [[BITCAST13]](<2 x s16>) + ; GFX78: S_NOP 0, implicit [[CONCAT_VECTORS1]](<10 x s16>) ; GFX9-LABEL: name: build_vector_v5s16 ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 @@ -200,7 +313,41 @@ body: | ; GFX9: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<10 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[DEF1]](<2 x s16>), [[DEF1]](<2 x s16>) ; GFX9: [[EXTRACT:%[0-9]+]]:_(<5 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<10 x s16>), 0 - ; GFX9: S_NOP 0, implicit [[EXTRACT]](<5 x s16>) + ; GFX9: [[DEF2:%[0-9]+]]:_(<6 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[INSERT:%[0-9]+]]:_(<6 x s16>) = G_INSERT [[DEF2]], [[EXTRACT]](<5 x s16>), 0 + ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<6 x s16>) + ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[INSERT1:%[0-9]+]]:_(<6 x s16>) = G_INSERT [[DEF2]], [[EXTRACT]](<5 x s16>), 0 + ; GFX9: [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<6 x s16>) + ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX9: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x 
s16>) + ; GFX9: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; GFX9: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) + ; GFX9: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32) + ; GFX9: [[COPY18:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; GFX9: [[COPY19:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC7:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[COPY19]](s32) + ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<10 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC3]](<2 x s16>), [[BUILD_VECTOR_TRUNC4]](<2 x s16>), [[BUILD_VECTOR_TRUNC5]](<2 x s16>), [[BUILD_VECTOR_TRUNC6]](<2 x s16>), [[BUILD_VECTOR_TRUNC7]](<2 x s16>) + ; GFX9: S_NOP 0, implicit [[CONCAT_VECTORS1]](<10 x s16>) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = COPY $vgpr2 @@ -212,7 +359,8 @@ body: | %8:_(s16) = G_TRUNC %3 %9:_(s16) = G_TRUNC %4 %10:_(<5 x s16>) = G_BUILD_VECTOR %5, %6, %7, %8, %9 - S_NOP 0, implicit %10 + %11:_(<10 x s16>) = G_CONCAT_VECTORS %10, %10 + S_NOP 0, implicit %11 ... 
--- @@ -261,7 +409,78 @@ body: | ; GFX78: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; GFX78: [[CONCAT_VECTORS:%[0-9]+]]:_(<14 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>) ; GFX78: [[EXTRACT:%[0-9]+]]:_(<7 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<14 x s16>), 0 - ; GFX78: S_NOP 0, implicit [[EXTRACT]](<7 x s16>) + ; GFX78: [[DEF1:%[0-9]+]]:_(<8 x s16>) = G_IMPLICIT_DEF + ; GFX78: [[INSERT:%[0-9]+]]:_(<8 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<7 x s16>), 0 + ; GFX78: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<8 x s16>) + ; GFX78: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX78: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C1]](s32) + ; GFX78: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX78: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C1]](s32) + ; GFX78: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; GFX78: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C1]](s32) + ; GFX78: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; GFX78: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C1]](s32) + ; GFX78: [[INSERT1:%[0-9]+]]:_(<8 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<7 x s16>), 0 + ; GFX78: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>), [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<8 x s16>) + ; GFX78: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) + ; GFX78: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C1]](s32) + ; GFX78: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) + ; GFX78: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST9]], [[C1]](s32) + ; GFX78: [[BITCAST10:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; GFX78: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR 
[[BITCAST10]], [[C1]](s32) + ; GFX78: [[BITCAST11:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) + ; GFX78: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST11]], [[C1]](s32) + ; GFX78: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; GFX78: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C]] + ; GFX78: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX78: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C]] + ; GFX78: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND8]], [[C1]](s32) + ; GFX78: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND7]], [[SHL4]] + ; GFX78: [[BITCAST12:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; GFX78: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; GFX78: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C]] + ; GFX78: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX78: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C]] + ; GFX78: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C1]](s32) + ; GFX78: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND9]], [[SHL5]] + ; GFX78: [[BITCAST13:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32) + ; GFX78: [[COPY18:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; GFX78: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY18]], [[C]] + ; GFX78: [[COPY19:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX78: [[AND12:%[0-9]+]]:_(s32) = G_AND [[COPY19]], [[C]] + ; GFX78: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND12]], [[C1]](s32) + ; GFX78: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND11]], [[SHL6]] + ; GFX78: [[BITCAST14:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR6]](s32) + ; GFX78: [[COPY20:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; GFX78: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY20]], [[C]] + ; GFX78: [[COPY21:%[0-9]+]]:_(s32) = COPY [[BITCAST8]](s32) + ; GFX78: [[AND14:%[0-9]+]]:_(s32) = G_AND [[COPY21]], [[C]] + ; GFX78: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND14]], [[C1]](s32) + ; GFX78: [[OR7:%[0-9]+]]:_(s32) = G_OR [[AND13]], [[SHL7]] + ; GFX78: [[BITCAST15:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR7]](s32) + ; GFX78: [[COPY22:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + 
; GFX78: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY22]], [[C]] + ; GFX78: [[COPY23:%[0-9]+]]:_(s32) = COPY [[BITCAST9]](s32) + ; GFX78: [[AND16:%[0-9]+]]:_(s32) = G_AND [[COPY23]], [[C]] + ; GFX78: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND16]], [[C1]](s32) + ; GFX78: [[OR8:%[0-9]+]]:_(s32) = G_OR [[AND15]], [[SHL8]] + ; GFX78: [[BITCAST16:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR8]](s32) + ; GFX78: [[COPY24:%[0-9]+]]:_(s32) = COPY [[LSHR5]](s32) + ; GFX78: [[AND17:%[0-9]+]]:_(s32) = G_AND [[COPY24]], [[C]] + ; GFX78: [[COPY25:%[0-9]+]]:_(s32) = COPY [[BITCAST10]](s32) + ; GFX78: [[AND18:%[0-9]+]]:_(s32) = G_AND [[COPY25]], [[C]] + ; GFX78: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND18]], [[C1]](s32) + ; GFX78: [[OR9:%[0-9]+]]:_(s32) = G_OR [[AND17]], [[SHL9]] + ; GFX78: [[BITCAST17:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR9]](s32) + ; GFX78: [[COPY26:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; GFX78: [[AND19:%[0-9]+]]:_(s32) = G_AND [[COPY26]], [[C]] + ; GFX78: [[COPY27:%[0-9]+]]:_(s32) = COPY [[BITCAST11]](s32) + ; GFX78: [[AND20:%[0-9]+]]:_(s32) = G_AND [[COPY27]], [[C]] + ; GFX78: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND20]], [[C1]](s32) + ; GFX78: [[OR10:%[0-9]+]]:_(s32) = G_OR [[AND19]], [[SHL10]] + ; GFX78: [[BITCAST18:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR10]](s32) + ; GFX78: [[CONCAT_VECTORS1:%[0-9]+]]:_(<14 x s16>) = G_CONCAT_VECTORS [[BITCAST12]](<2 x s16>), [[BITCAST13]](<2 x s16>), [[BITCAST14]](<2 x s16>), [[BITCAST15]](<2 x s16>), [[BITCAST16]](<2 x s16>), [[BITCAST17]](<2 x s16>), [[BITCAST18]](<2 x s16>) + ; GFX78: S_NOP 0, implicit [[CONCAT_VECTORS1]](<14 x s16>) ; GFX9-LABEL: name: build_vector_v7s16 ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 @@ -285,7 +504,51 @@ body: | ; GFX9: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<14 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), 
[[BUILD_VECTOR_TRUNC3]](<2 x s16>), [[DEF1]](<2 x s16>), [[DEF1]](<2 x s16>), [[DEF1]](<2 x s16>) ; GFX9: [[EXTRACT:%[0-9]+]]:_(<7 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<14 x s16>), 0 - ; GFX9: S_NOP 0, implicit [[EXTRACT]](<7 x s16>) + ; GFX9: [[DEF2:%[0-9]+]]:_(<8 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[INSERT:%[0-9]+]]:_(<8 x s16>) = G_INSERT [[DEF2]], [[EXTRACT]](<7 x s16>), 0 + ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<8 x s16>) + ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX9: [[INSERT1:%[0-9]+]]:_(<8 x s16>) = G_INSERT [[DEF2]], [[EXTRACT]](<7 x s16>), 0 + ; GFX9: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>), [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<8 x s16>) + ; GFX9: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) + ; GFX9: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; GFX9: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) + ; GFX9: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; GFX9: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; GFX9: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; GFX9: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) + ; GFX9: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) 
= COPY [[BITCAST]](s32) + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32) + ; GFX9: [[COPY18:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[COPY19:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[COPY19]](s32) + ; GFX9: [[COPY20:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; GFX9: [[COPY21:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC7:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY20]](s32), [[COPY21]](s32) + ; GFX9: [[COPY22:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; GFX9: [[COPY23:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC8:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY22]](s32), [[COPY23]](s32) + ; GFX9: [[COPY24:%[0-9]+]]:_(s32) = COPY [[LSHR5]](s32) + ; GFX9: [[COPY25:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC9:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY24]](s32), [[COPY25]](s32) + ; GFX9: [[COPY26:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; GFX9: [[COPY27:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC10:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY26]](s32), [[COPY27]](s32) + ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<14 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC4]](<2 x s16>), [[BUILD_VECTOR_TRUNC5]](<2 x s16>), [[BUILD_VECTOR_TRUNC6]](<2 x s16>), [[BUILD_VECTOR_TRUNC7]](<2 x s16>), [[BUILD_VECTOR_TRUNC8]](<2 x s16>), [[BUILD_VECTOR_TRUNC9]](<2 x s16>), [[BUILD_VECTOR_TRUNC10]](<2 x s16>) + ; GFX9: S_NOP 0, implicit [[CONCAT_VECTORS1]](<14 x s16>) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY 
$vgpr1 %2:_(s32) = COPY $vgpr2 @@ -301,7 +564,8 @@ body: | %12:_(s16) = G_TRUNC %5 %13:_(s16) = G_TRUNC %6 %14:_(<7 x s16>) = G_BUILD_VECTOR %7, %8, %9, %10, %11, %12, %13 - S_NOP 0, implicit %14 + %15:_(<14 x s16>) = G_CONCAT_VECTORS %14, %14 + S_NOP 0, implicit %15 ... --- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-concat-vectors.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-concat-vectors.mir index e2724ffeb9c94..30df6596f8764 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-concat-vectors.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-concat-vectors.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck %s --- name: concat_vectors_v2s32_v2s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir index f3c82289c2398..d9cf582b0c296 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir @@ -1476,21 +1476,25 @@ body: | ; CHECK-LABEL: name: extract_vector_elt_33_v64p3 ; CHECK: [[COPY:%[0-9]+]]:_(p1) = COPY $sgpr0_sgpr1 - ; CHECK: [[LOAD:%[0-9]+]]:_(<16 x p3>) = G_LOAD [[COPY]](p1) :: (load 64, align 4, addrspace 4) + ; CHECK: [[LOAD:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[COPY]](p1) :: (load 64, align 4, addrspace 4) + ; CHECK: [[BITCAST:%[0-9]+]]:_(<16 x p3>) = G_BITCAST [[LOAD]](<16 x s32>) ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 64 ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) - ; CHECK: [[LOAD1:%[0-9]+]]:_(<16 x p3>) = G_LOAD [[PTR_ADD]](p1) :: (load 64 + 64, align 4, addrspace 4) + ; CHECK: [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD]](p1) :: (load 
64 + 64, align 4, addrspace 4) + ; CHECK: [[BITCAST1:%[0-9]+]]:_(<16 x p3>) = G_BITCAST [[LOAD1]](<16 x s32>) ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 128 ; CHECK: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; CHECK: [[LOAD2:%[0-9]+]]:_(<16 x p3>) = G_LOAD [[PTR_ADD1]](p1) :: (load 64 + 128, align 4, addrspace 4) + ; CHECK: [[LOAD2:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD1]](p1) :: (load 64 + 128, align 4, addrspace 4) + ; CHECK: [[BITCAST2:%[0-9]+]]:_(<16 x p3>) = G_BITCAST [[LOAD2]](<16 x s32>) ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 192 ; CHECK: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; CHECK: [[LOAD3:%[0-9]+]]:_(<16 x p3>) = G_LOAD [[PTR_ADD2]](p1) :: (load 64 + 192, align 4, addrspace 4) + ; CHECK: [[LOAD3:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD2]](p1) :: (load 64 + 192, align 4, addrspace 4) + ; CHECK: [[BITCAST3:%[0-9]+]]:_(<16 x p3>) = G_BITCAST [[LOAD3]](<16 x s32>) ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0 - ; CHECK: [[UV:%[0-9]+]]:_(p3), [[UV1:%[0-9]+]]:_(p3), [[UV2:%[0-9]+]]:_(p3), [[UV3:%[0-9]+]]:_(p3), [[UV4:%[0-9]+]]:_(p3), [[UV5:%[0-9]+]]:_(p3), [[UV6:%[0-9]+]]:_(p3), [[UV7:%[0-9]+]]:_(p3), [[UV8:%[0-9]+]]:_(p3), [[UV9:%[0-9]+]]:_(p3), [[UV10:%[0-9]+]]:_(p3), [[UV11:%[0-9]+]]:_(p3), [[UV12:%[0-9]+]]:_(p3), [[UV13:%[0-9]+]]:_(p3), [[UV14:%[0-9]+]]:_(p3), [[UV15:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[LOAD]](<16 x p3>) - ; CHECK: [[UV16:%[0-9]+]]:_(p3), [[UV17:%[0-9]+]]:_(p3), [[UV18:%[0-9]+]]:_(p3), [[UV19:%[0-9]+]]:_(p3), [[UV20:%[0-9]+]]:_(p3), [[UV21:%[0-9]+]]:_(p3), [[UV22:%[0-9]+]]:_(p3), [[UV23:%[0-9]+]]:_(p3), [[UV24:%[0-9]+]]:_(p3), [[UV25:%[0-9]+]]:_(p3), [[UV26:%[0-9]+]]:_(p3), [[UV27:%[0-9]+]]:_(p3), [[UV28:%[0-9]+]]:_(p3), [[UV29:%[0-9]+]]:_(p3), [[UV30:%[0-9]+]]:_(p3), [[UV31:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[LOAD1]](<16 x p3>) - ; CHECK: [[UV32:%[0-9]+]]:_(p3), [[UV33:%[0-9]+]]:_(p3), [[UV34:%[0-9]+]]:_(p3), [[UV35:%[0-9]+]]:_(p3), 
[[UV36:%[0-9]+]]:_(p3), [[UV37:%[0-9]+]]:_(p3), [[UV38:%[0-9]+]]:_(p3), [[UV39:%[0-9]+]]:_(p3), [[UV40:%[0-9]+]]:_(p3), [[UV41:%[0-9]+]]:_(p3), [[UV42:%[0-9]+]]:_(p3), [[UV43:%[0-9]+]]:_(p3), [[UV44:%[0-9]+]]:_(p3), [[UV45:%[0-9]+]]:_(p3), [[UV46:%[0-9]+]]:_(p3), [[UV47:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[LOAD2]](<16 x p3>) - ; CHECK: [[UV48:%[0-9]+]]:_(p3), [[UV49:%[0-9]+]]:_(p3), [[UV50:%[0-9]+]]:_(p3), [[UV51:%[0-9]+]]:_(p3), [[UV52:%[0-9]+]]:_(p3), [[UV53:%[0-9]+]]:_(p3), [[UV54:%[0-9]+]]:_(p3), [[UV55:%[0-9]+]]:_(p3), [[UV56:%[0-9]+]]:_(p3), [[UV57:%[0-9]+]]:_(p3), [[UV58:%[0-9]+]]:_(p3), [[UV59:%[0-9]+]]:_(p3), [[UV60:%[0-9]+]]:_(p3), [[UV61:%[0-9]+]]:_(p3), [[UV62:%[0-9]+]]:_(p3), [[UV63:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[LOAD3]](<16 x p3>) + ; CHECK: [[UV:%[0-9]+]]:_(p3), [[UV1:%[0-9]+]]:_(p3), [[UV2:%[0-9]+]]:_(p3), [[UV3:%[0-9]+]]:_(p3), [[UV4:%[0-9]+]]:_(p3), [[UV5:%[0-9]+]]:_(p3), [[UV6:%[0-9]+]]:_(p3), [[UV7:%[0-9]+]]:_(p3), [[UV8:%[0-9]+]]:_(p3), [[UV9:%[0-9]+]]:_(p3), [[UV10:%[0-9]+]]:_(p3), [[UV11:%[0-9]+]]:_(p3), [[UV12:%[0-9]+]]:_(p3), [[UV13:%[0-9]+]]:_(p3), [[UV14:%[0-9]+]]:_(p3), [[UV15:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[BITCAST]](<16 x p3>) + ; CHECK: [[UV16:%[0-9]+]]:_(p3), [[UV17:%[0-9]+]]:_(p3), [[UV18:%[0-9]+]]:_(p3), [[UV19:%[0-9]+]]:_(p3), [[UV20:%[0-9]+]]:_(p3), [[UV21:%[0-9]+]]:_(p3), [[UV22:%[0-9]+]]:_(p3), [[UV23:%[0-9]+]]:_(p3), [[UV24:%[0-9]+]]:_(p3), [[UV25:%[0-9]+]]:_(p3), [[UV26:%[0-9]+]]:_(p3), [[UV27:%[0-9]+]]:_(p3), [[UV28:%[0-9]+]]:_(p3), [[UV29:%[0-9]+]]:_(p3), [[UV30:%[0-9]+]]:_(p3), [[UV31:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[BITCAST1]](<16 x p3>) + ; CHECK: [[UV32:%[0-9]+]]:_(p3), [[UV33:%[0-9]+]]:_(p3), [[UV34:%[0-9]+]]:_(p3), [[UV35:%[0-9]+]]:_(p3), [[UV36:%[0-9]+]]:_(p3), [[UV37:%[0-9]+]]:_(p3), [[UV38:%[0-9]+]]:_(p3), [[UV39:%[0-9]+]]:_(p3), [[UV40:%[0-9]+]]:_(p3), [[UV41:%[0-9]+]]:_(p3), [[UV42:%[0-9]+]]:_(p3), [[UV43:%[0-9]+]]:_(p3), [[UV44:%[0-9]+]]:_(p3), [[UV45:%[0-9]+]]:_(p3), [[UV46:%[0-9]+]]:_(p3), 
[[UV47:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[BITCAST2]](<16 x p3>) + ; CHECK: [[UV48:%[0-9]+]]:_(p3), [[UV49:%[0-9]+]]:_(p3), [[UV50:%[0-9]+]]:_(p3), [[UV51:%[0-9]+]]:_(p3), [[UV52:%[0-9]+]]:_(p3), [[UV53:%[0-9]+]]:_(p3), [[UV54:%[0-9]+]]:_(p3), [[UV55:%[0-9]+]]:_(p3), [[UV56:%[0-9]+]]:_(p3), [[UV57:%[0-9]+]]:_(p3), [[UV58:%[0-9]+]]:_(p3), [[UV59:%[0-9]+]]:_(p3), [[UV60:%[0-9]+]]:_(p3), [[UV61:%[0-9]+]]:_(p3), [[UV62:%[0-9]+]]:_(p3), [[UV63:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[BITCAST3]](<16 x p3>) ; CHECK: G_STORE [[UV]](p3), [[FRAME_INDEX]](p5) :: (store 4 into %stack.0, align 256, addrspace 5) ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; CHECK: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C3]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fabs.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fabs.mir index 0857d286ff5c9..3bb0c0ce74bf5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fabs.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fabs.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -o - %s | FileCheck -check-prefix=SI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -o - %s | FileCheck -check-prefix=VI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=SI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=VI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s --- @@ -238,7 +238,41 @@ body: | ; SI: [[FABS1:%[0-9]+]]:_(<2 x s16>) = G_FABS [[BITCAST3]] ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = 
G_CONCAT_VECTORS [[FABS]](<2 x s16>), [[FABS1]](<2 x s16>), [[DEF2]](<2 x s16>) ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<6 x s16>), 0 - ; SI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; SI: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; SI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; SI: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) + ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; SI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; SI: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; SI: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) + ; SI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; SI: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) + ; SI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; SI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND4]], [[C]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND3]], [[SHL2]] + ; SI: [[BITCAST8:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; SI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND5]], [[SHL3]] + ; SI: [[BITCAST9:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; SI: 
[[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] + ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; SI: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C1]] + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND8]], [[C]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND7]], [[SHL4]] + ; SI: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; SI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST8]](<2 x s16>), [[BITCAST9]](<2 x s16>), [[BITCAST10]](<2 x s16>) + ; SI: S_NOP 0, implicit [[CONCAT_VECTORS2]](<6 x s16>) ; VI-LABEL: name: test_fabs_v3s16 ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF @@ -270,7 +304,41 @@ body: | ; VI: [[FABS1:%[0-9]+]]:_(<2 x s16>) = G_FABS [[BITCAST3]] ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FABS]](<2 x s16>), [[FABS1]](<2 x s16>), [[DEF2]](<2 x s16>) ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<6 x s16>), 0 - ; VI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; VI: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; VI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; VI: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) + ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; VI: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; VI: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) + ; VI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; VI: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) + ; VI: [[LSHR5:%[0-9]+]]:_(s32) = 
G_LSHR [[BITCAST7]], [[C]](s32) + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND4]], [[C]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND3]], [[SHL2]] + ; VI: [[BITCAST8:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; VI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; VI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C]](s32) + ; VI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND5]], [[SHL3]] + ; VI: [[BITCAST9:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; VI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] + ; VI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; VI: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C1]] + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND8]], [[C]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND7]], [[SHL4]] + ; VI: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST8]](<2 x s16>), [[BITCAST9]](<2 x s16>), [[BITCAST10]](<2 x s16>) + ; VI: S_NOP 0, implicit [[CONCAT_VECTORS2]](<6 x s16>) ; GFX9-LABEL: name: test_fabs_v3s16 ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF @@ -294,10 +362,33 @@ body: | ; GFX9: [[FABS1:%[0-9]+]]:_(<2 x s16>) = G_FABS [[BUILD_VECTOR_TRUNC1]] ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FABS]](<2 x s16>), [[FABS1]](<2 x s16>), [[DEF3]](<2 x s16>) ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<6 x s16>), 0 - ; GFX9: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; 
GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; GFX9: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) + ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; GFX9: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; GFX9: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) + ; GFX9: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; GFX9: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) + ; GFX9: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[COPY4]](s32) + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[COPY6]](s32) + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY7]](s32), [[COPY8]](s32) + ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), [[BUILD_VECTOR_TRUNC4]](<2 x s16>) + ; GFX9: S_NOP 0, implicit [[CONCAT_VECTORS2]](<6 x s16>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = G_FABS %0 - S_NOP 0, implicit %1 + %2:_(<6 x s16>) = G_CONCAT_VECTORS %1, %1 + 
S_NOP 0, implicit %2 ... --- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fadd.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fadd.mir index edd80e142a1d6..5508e598cc346 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fadd.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fadd.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=SI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefix=VI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=SI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_fadd_s32 @@ -325,34 +325,44 @@ body: | --- name: test_fadd_v3s16 body: | - bb.0.entry: - liveins: $vgpr0, $vgpr1 - + bb.0: + liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5 + ; CHECK-LABEL: name: test_or_v3s16 + ; CHECK: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; CHECK: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) + ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; CHECK: [[OR:%[0-9]+]]:_(<4 x s16>) = G_OR 
[[INSERT]], [[INSERT1]] + ; CHECK: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[OR]](<4 x s16>), 0 + ; CHECK: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[EXTRACT]](<3 x s16>), [[EXTRACT1]](<3 x s16>) + ; CHECK: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) ; SI-LABEL: name: test_fadd_v3s16 + ; SI: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; SI: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; SI: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) ; SI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; SI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) - ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) - ; SI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) - ; SI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) - ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 - ; SI: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) + ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; SI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; SI: [[BITCAST:%[0-9]+]]:_(s32) = 
G_BITCAST [[UV4]](<2 x s16>) ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) + ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 - ; SI: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) + ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; SI: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; SI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) - ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) + ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; SI: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) @@ -377,35 +387,71 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32) ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; SI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; SI: [[DEF3:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; SI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[DEF3]](<2 x s16>) - ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT 
[[CONCAT_VECTORS2]](<6 x s16>), 0 - ; SI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; SI: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[DEF1]](<2 x s16>) + ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 + ; SI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; SI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; SI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; SI: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; SI: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; SI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; SI: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; SI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; SI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV8]](<3 x s16>), 0 + ; SI: [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT3]](<4 x s16>) + ; SI: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV14]](<2 x s16>) + ; SI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32) + ; SI: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[UV15]](<2 x s16>) + ; SI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST9]], [[C]](s32) + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) 
+ ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL2]] + ; SI: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]] + ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST8]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C2]] + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL3]] + ; SI: [[BITCAST11:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; SI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C2]] + ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST9]](s32) + ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]] + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL4]] + ; SI: [[BITCAST12:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; SI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST10]](<2 x s16>), [[BITCAST11]](<2 x s16>), [[BITCAST12]](<2 x s16>) + ; SI: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) ; VI-LABEL: name: test_fadd_v3s16 + ; VI: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; VI: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; VI: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) - ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) - ; VI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = 
G_IMPLICIT_DEF - ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) - ; VI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) - ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 - ; VI: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) + ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; VI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) + ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 - ; VI: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) + ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; VI: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = 
G_TRUNC [[LSHR2]](s32) - ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) + ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; VI: [[FADD:%[0-9]+]]:_(s16) = G_FADD [[TRUNC]], [[TRUNC3]] @@ -421,52 +467,118 @@ body: | ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32) ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; VI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; VI: [[DEF3:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[DEF3]](<2 x s16>) - ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<6 x s16>), 0 - ; VI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; VI: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[DEF1]](<2 x s16>) + ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 + ; VI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; VI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; VI: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; VI: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; VI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; VI: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; VI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; VI: 
[[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV8]](<3 x s16>), 0 + ; VI: [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT3]](<4 x s16>) + ; VI: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV14]](<2 x s16>) + ; VI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32) + ; VI: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[UV15]](<2 x s16>) + ; VI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST9]], [[C]](s32) + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL2]] + ; VI: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]] + ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST8]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C2]] + ; VI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) + ; VI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL3]] + ; VI: [[BITCAST11:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C2]] + ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST9]](s32) + ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]] + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL4]] + ; VI: [[BITCAST12:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST10]](<2 x s16>), [[BITCAST11]](<2 x s16>), [[BITCAST12]](<2 x s16>) + ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) ; GFX9-LABEL: name: 
test_fadd_v3s16 + ; GFX9: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; GFX9: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) - ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) - ; GFX9: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) - ; GFX9: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) - ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 - ; GFX9: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) + ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; GFX9: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) + ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - 
; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY]](s32), [[COPY1]](s32) - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) - ; GFX9: [[DEF3:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[DEF3]](s32) - ; GFX9: [[DEF4:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 - ; GFX9: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32) + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; GFX9: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[DEF1]](s32) + ; GFX9: [[DEF2:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; GFX9: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) + ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC 
[[COPY3]](s32), [[COPY4]](s32) - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[DEF3]](s32) + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[COPY6]](s32) + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY7]](s32), [[DEF1]](s32) ; GFX9: [[FADD:%[0-9]+]]:_(<2 x s16>) = G_FADD [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]] ; GFX9: [[FADD1:%[0-9]+]]:_(<2 x s16>) = G_FADD [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]] - ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FADD]](<2 x s16>), [[FADD1]](<2 x s16>), [[DEF4]](<2 x s16>) - ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<6 x s16>), 0 - ; GFX9: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) - %0:_(<3 x s16>) = G_IMPLICIT_DEF - %1:_(<3 x s16>) = G_IMPLICIT_DEF - %2:_(<3 x s16>) = G_FADD %0, %1 - S_NOP 0, implicit %2 + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FADD]](<2 x s16>), [[FADD1]](<2 x s16>), [[DEF2]](<2 x s16>) + ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 + ; GFX9: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF3]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; GFX9: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; GFX9: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; GFX9: 
[[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; GFX9: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; GFX9: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; GFX9: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; GFX9: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV8]](<3 x s16>), 0 + ; GFX9: [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT3]](<4 x s16>) + ; GFX9: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV14]](<2 x s16>) + ; GFX9: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; GFX9: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV15]](<2 x s16>) + ; GFX9: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY8]](s32), [[COPY9]](s32) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) + ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC4]](<2 x s16>), [[BUILD_VECTOR_TRUNC5]](<2 x s16>), [[BUILD_VECTOR_TRUNC6]](<2 x s16>) + ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) + %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + %1:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + %2:_(<3 x s16>), %3:_(<3 x s16>) = G_UNMERGE_VALUES %0 + %4:_(<3 x s16>), %5:_(<3 x s16>) = G_UNMERGE_VALUES %1 + %6:_(<3 x s16>) = G_FADD %2, %4 + %7:_(<3 x s16>) = G_IMPLICIT_DEF + %8:_(<6 x s16>) = G_CONCAT_VECTORS %6, %7 + 
$vgpr0_vgpr1_vgpr2 = COPY %8 + ... --- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcanonicalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcanonicalize.mir index 03e92158259ef..6cfd9ae5f4830 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcanonicalize.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcanonicalize.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=SI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefix=VI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=SI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- @@ -257,7 +257,17 @@ body: | ; SI: [[DEF2:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[DEF2]](<2 x s16>) ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<6 x s16>), 0 - ; SI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; SI: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; SI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; SI: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) + ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR 
[[BITCAST5]], [[C]](s32) + ; SI: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; SI: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) ; VI-LABEL: name: test_fcanonicalize_v3s16 ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF @@ -289,7 +299,17 @@ body: | ; VI: [[DEF2:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[DEF2]](<2 x s16>) ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<6 x s16>), 0 - ; VI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; VI: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; VI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; VI: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) + ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; VI: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) ; GFX9-LABEL: name: test_fcanonicalize_v3s16 ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF @@ -313,10 +333,21 @@ body: | ; GFX9: [[FCANONICALIZE1:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[BUILD_VECTOR_TRUNC1]] ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS 
[[FCANONICALIZE]](<2 x s16>), [[FCANONICALIZE1]](<2 x s16>), [[DEF3]](<2 x s16>) ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<6 x s16>), 0 - ; GFX9: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; GFX9: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) + ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32) + ; GFX9: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = G_FCANONICALIZE %0 - S_NOP 0, implicit %1 + %2:_(<3 x s32>) = G_ANYEXT %1 + S_NOP 0, implicit %2 ... 
--- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcmp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcmp.mir index cba2aef2e8d51..16e0c52988b0a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcmp.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcmp.mir @@ -116,9 +116,10 @@ body: | ; GFX7: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(oeq), [[C]](s32), [[UV1]] ; GFX7: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FCMP]](s1) ; GFX7: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FCMP1]](s1) - ; GFX7: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX7: [[TRUNC:%[0-9]+]]:_(<2 x s1>) = G_TRUNC [[BUILD_VECTOR]](<2 x s32>) - ; GFX7: S_NOP 0, implicit [[TRUNC]](<2 x s1>) + ; GFX7: [[COPY1:%[0-9]+]]:_(s32) = COPY [[ANYEXT]](s32) + ; GFX7: [[COPY2:%[0-9]+]]:_(s32) = COPY [[ANYEXT1]](s32) + ; GFX7: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY1]](s32), [[COPY2]](s32) + ; GFX7: S_NOP 0, implicit [[BUILD_VECTOR]](<2 x s32>) ; GFX8-LABEL: name: test_fcmp_v2s32 ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 @@ -127,9 +128,10 @@ body: | ; GFX8: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(oeq), [[C]](s32), [[UV1]] ; GFX8: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FCMP]](s1) ; GFX8: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FCMP1]](s1) - ; GFX8: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX8: [[TRUNC:%[0-9]+]]:_(<2 x s1>) = G_TRUNC [[BUILD_VECTOR]](<2 x s32>) - ; GFX8: S_NOP 0, implicit [[TRUNC]](<2 x s1>) + ; GFX8: [[COPY1:%[0-9]+]]:_(s32) = COPY [[ANYEXT]](s32) + ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[ANYEXT1]](s32) + ; GFX8: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY1]](s32), [[COPY2]](s32) + ; GFX8: S_NOP 0, implicit [[BUILD_VECTOR]](<2 x s32>) ; GFX9-LABEL: name: test_fcmp_v2s32 ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY 
$vgpr0_vgpr1 @@ -138,14 +140,14 @@ body: | ; GFX9: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(oeq), [[C]](s32), [[UV1]] ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FCMP]](s1) ; GFX9: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FCMP1]](s1) - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX9: [[TRUNC:%[0-9]+]]:_(<2 x s1>) = G_TRUNC [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX9: S_NOP 0, implicit [[TRUNC]](<2 x s1>) + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32) + ; GFX9: S_NOP 0, implicit [[BUILD_VECTOR]](<2 x s32>) %0:_(s32) = G_CONSTANT i32 0 %1:_(<2 x s32>) = G_BUILD_VECTOR %0, %0 %2:_(<2 x s32>) = COPY $vgpr0_vgpr1 %3:_(<2 x s1>) = G_FCMP floatpred(oeq), %1, %2 - S_NOP 0, implicit %3 + %4:_(<2 x s32>) = G_ANYEXT %3 + S_NOP 0, implicit %4 ... --- @@ -161,9 +163,10 @@ body: | ; GFX7: [[FCMP1:%[0-9]+]]:_(s1) = nnan G_FCMP floatpred(oeq), [[C]](s32), [[UV1]] ; GFX7: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FCMP]](s1) ; GFX7: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FCMP1]](s1) - ; GFX7: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX7: [[TRUNC:%[0-9]+]]:_(<2 x s1>) = G_TRUNC [[BUILD_VECTOR]](<2 x s32>) - ; GFX7: S_NOP 0, implicit [[TRUNC]](<2 x s1>) + ; GFX7: [[COPY1:%[0-9]+]]:_(s32) = COPY [[ANYEXT]](s32) + ; GFX7: [[COPY2:%[0-9]+]]:_(s32) = COPY [[ANYEXT1]](s32) + ; GFX7: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY1]](s32), [[COPY2]](s32) + ; GFX7: S_NOP 0, implicit [[BUILD_VECTOR]](<2 x s32>) ; GFX8-LABEL: name: test_fcmp_v2s32_flags ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 @@ -172,9 +175,10 @@ body: | ; GFX8: [[FCMP1:%[0-9]+]]:_(s1) = nnan G_FCMP floatpred(oeq), [[C]](s32), [[UV1]] ; GFX8: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FCMP]](s1) ; GFX8: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FCMP1]](s1) - ; GFX8: 
[[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX8: [[TRUNC:%[0-9]+]]:_(<2 x s1>) = G_TRUNC [[BUILD_VECTOR]](<2 x s32>) - ; GFX8: S_NOP 0, implicit [[TRUNC]](<2 x s1>) + ; GFX8: [[COPY1:%[0-9]+]]:_(s32) = COPY [[ANYEXT]](s32) + ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[ANYEXT1]](s32) + ; GFX8: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY1]](s32), [[COPY2]](s32) + ; GFX8: S_NOP 0, implicit [[BUILD_VECTOR]](<2 x s32>) ; GFX9-LABEL: name: test_fcmp_v2s32_flags ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 @@ -183,14 +187,14 @@ body: | ; GFX9: [[FCMP1:%[0-9]+]]:_(s1) = nnan G_FCMP floatpred(oeq), [[C]](s32), [[UV1]] ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FCMP]](s1) ; GFX9: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FCMP1]](s1) - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX9: [[TRUNC:%[0-9]+]]:_(<2 x s1>) = G_TRUNC [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX9: S_NOP 0, implicit [[TRUNC]](<2 x s1>) + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32) + ; GFX9: S_NOP 0, implicit [[BUILD_VECTOR]](<2 x s32>) %0:_(s32) = G_CONSTANT i32 0 %1:_(<2 x s32>) = G_BUILD_VECTOR %0, %0 %2:_(<2 x s32>) = COPY $vgpr0_vgpr1 %3:_(<2 x s1>) = nnan G_FCMP floatpred(oeq), %1, %2 - S_NOP 0, implicit %3 + %4:_(<2 x s32>) = G_ANYEXT %3 + S_NOP 0, implicit %4 ... 
--- @@ -210,9 +214,11 @@ body: | ; GFX7: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FCMP]](s1) ; GFX7: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FCMP1]](s1) ; GFX7: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[FCMP2]](s1) - ; GFX7: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32) - ; GFX7: [[TRUNC:%[0-9]+]]:_(<3 x s1>) = G_TRUNC [[BUILD_VECTOR]](<3 x s32>) - ; GFX7: S_NOP 0, implicit [[TRUNC]](<3 x s1>) + ; GFX7: [[COPY1:%[0-9]+]]:_(s32) = COPY [[ANYEXT]](s32) + ; GFX7: [[COPY2:%[0-9]+]]:_(s32) = COPY [[ANYEXT1]](s32) + ; GFX7: [[COPY3:%[0-9]+]]:_(s32) = COPY [[ANYEXT2]](s32) + ; GFX7: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX7: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) ; GFX8-LABEL: name: test_fcmp_v3s32 ; GFX8: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF ; GFX8: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 @@ -224,9 +230,11 @@ body: | ; GFX8: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FCMP]](s1) ; GFX8: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FCMP1]](s1) ; GFX8: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[FCMP2]](s1) - ; GFX8: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32) - ; GFX8: [[TRUNC:%[0-9]+]]:_(<3 x s1>) = G_TRUNC [[BUILD_VECTOR]](<3 x s32>) - ; GFX8: S_NOP 0, implicit [[TRUNC]](<3 x s1>) + ; GFX8: [[COPY1:%[0-9]+]]:_(s32) = COPY [[ANYEXT]](s32) + ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[ANYEXT1]](s32) + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[ANYEXT2]](s32) + ; GFX8: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX8: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) ; GFX9-LABEL: name: test_fcmp_v3s32 ; GFX9: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF ; GFX9: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 @@ -237,19 +245,15 @@ body: | ; GFX9: [[FCMP2:%[0-9]+]]:_(s1) = G_FCMP floatpred(oeq), 
[[UV2]](s32), [[UV5]] ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FCMP]](s1) ; GFX9: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FCMP1]](s1) - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) ; GFX9: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[FCMP2]](s1) - ; GFX9: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[DEF1]](s32) - ; GFX9: [[DEF2:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[DEF2]](<2 x s16>) - ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 - ; GFX9: [[TRUNC:%[0-9]+]]:_(<3 x s1>) = G_TRUNC [[EXTRACT]](<3 x s16>) - ; GFX9: S_NOP 0, implicit [[TRUNC]](<3 x s1>) + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32) + ; GFX9: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) %0:_(<3 x s32>) = G_IMPLICIT_DEF %1:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 %2:_(<3 x s1>) = G_FCMP floatpred(oeq), %0, %1 - S_NOP 0, implicit %2 + %3:_(<3 x s32>) = G_ANYEXT %2 + S_NOP 0, implicit %3 + ... 
--- @@ -272,9 +276,12 @@ body: | ; GFX7: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FCMP1]](s1) ; GFX7: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[FCMP2]](s1) ; GFX7: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[FCMP3]](s1) - ; GFX7: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32) - ; GFX7: [[TRUNC:%[0-9]+]]:_(<4 x s1>) = G_TRUNC [[BUILD_VECTOR]](<4 x s32>) - ; GFX7: S_NOP 0, implicit [[TRUNC]](<4 x s1>) + ; GFX7: [[COPY1:%[0-9]+]]:_(s32) = COPY [[ANYEXT]](s32) + ; GFX7: [[COPY2:%[0-9]+]]:_(s32) = COPY [[ANYEXT1]](s32) + ; GFX7: [[COPY3:%[0-9]+]]:_(s32) = COPY [[ANYEXT2]](s32) + ; GFX7: [[COPY4:%[0-9]+]]:_(s32) = COPY [[ANYEXT3]](s32) + ; GFX7: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32) + ; GFX7: S_NOP 0, implicit [[BUILD_VECTOR]](<4 x s32>) ; GFX8-LABEL: name: test_fcmp_v4s32 ; GFX8: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GFX8: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[DEF]](p1) :: (volatile load 16) @@ -289,9 +296,12 @@ body: | ; GFX8: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FCMP1]](s1) ; GFX8: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[FCMP2]](s1) ; GFX8: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[FCMP3]](s1) - ; GFX8: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32) - ; GFX8: [[TRUNC:%[0-9]+]]:_(<4 x s1>) = G_TRUNC [[BUILD_VECTOR]](<4 x s32>) - ; GFX8: S_NOP 0, implicit [[TRUNC]](<4 x s1>) + ; GFX8: [[COPY1:%[0-9]+]]:_(s32) = COPY [[ANYEXT]](s32) + ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[ANYEXT1]](s32) + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[ANYEXT2]](s32) + ; GFX8: [[COPY4:%[0-9]+]]:_(s32) = COPY [[ANYEXT3]](s32) + ; GFX8: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32) + ; GFX8: S_NOP 0, implicit [[BUILD_VECTOR]](<4 x s32>) ; GFX9-LABEL: name: test_fcmp_v4s32 ; GFX9: 
[[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GFX9: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[DEF]](p1) :: (volatile load 16) @@ -304,18 +314,16 @@ body: | ; GFX9: [[FCMP3:%[0-9]+]]:_(s1) = G_FCMP floatpred(oeq), [[UV3]](s32), [[UV7]] ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FCMP]](s1) ; GFX9: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FCMP1]](s1) - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) ; GFX9: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[FCMP2]](s1) ; GFX9: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[FCMP3]](s1) - ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[ANYEXT3]](s32) - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9: [[TRUNC:%[0-9]+]]:_(<4 x s1>) = G_TRUNC [[CONCAT_VECTORS]](<4 x s16>) - ; GFX9: S_NOP 0, implicit [[TRUNC]](<4 x s1>) + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32) + ; GFX9: S_NOP 0, implicit [[BUILD_VECTOR]](<4 x s32>) %0:_(p1) = G_IMPLICIT_DEF %1:_(<4 x s32>) = G_LOAD %0 :: (volatile load 16) %2:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %3:_(<4 x s1>) = G_FCMP floatpred(oeq) , %1, %2 - S_NOP 0, implicit %3 + %4:_(<4 x s32>) = G_ANYEXT %3 + S_NOP 0, implicit %4 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcos.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcos.mir index 54f6bd787a820..6b0b3eb6385e3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcos.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcos.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -o - %s | FileCheck -check-prefix=SI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -o - %s | FileCheck -check-prefix=VI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=SI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=VI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s --- name: test_fcos_s32 @@ -368,7 +368,17 @@ body: | ; SI: [[DEF2:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[DEF2]](<2 x s16>) ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<6 x s16>), 0 - ; SI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; SI: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; SI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; SI: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) + ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; SI: [[COPY:%[0-9]+]]:_(s32) = COPY 
[[BITCAST4]](s32) + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; SI: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) ; VI-LABEL: name: test_fcos_v3s16 ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF @@ -407,7 +417,17 @@ body: | ; VI: [[DEF2:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[DEF2]](<2 x s16>) ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<6 x s16>), 0 - ; VI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; VI: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; VI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; VI: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) + ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; VI: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) ; GFX9-LABEL: name: test_fcos_v3s16 ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF @@ -439,10 +459,21 @@ body: | ; GFX9: [[DEF3:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[DEF3]](<2 x s16>) ; GFX9: 
[[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<6 x s16>), 0 - ; GFX9: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; GFX9: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) + ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; GFX9: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = G_FCOS %0 - S_NOP 0, implicit %1 + %2:_(<3 x s32>) = G_ANYEXT %1 + S_NOP 0, implicit %2 ... 
--- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir index 8d85c78ef5ac8..a1f82f8577389 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir @@ -1,9 +1,9 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=SI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefix=VI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -enable-unsafe-fp-math -o - %s | FileCheck -check-prefix=GFX9-UNSAFE %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX10 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=SI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 -enable-unsafe-fp-math -o - %s | FileCheck -check-prefix=GFX9-UNSAFE %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s --- name: test_fdiv_s16 @@ -1391,7 +1391,17 @@ body: | ; SI: [[DEF3:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; SI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[DEF3]](<2 x s16>) ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT 
[[CONCAT_VECTORS2]](<6 x s16>), 0 - ; SI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; SI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; SI: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; SI: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; SI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; SI: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; SI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; SI: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; SI: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) ; VI-LABEL: name: test_fdiv_v3s16 ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF @@ -1450,7 +1460,17 @@ body: | ; VI: [[DEF3:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[DEF3]](<2 x s16>) ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<6 x s16>), 0 - ; VI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; VI: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; VI: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; VI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; VI: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; VI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY 
[[BITCAST7]](s32) + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; VI: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) ; GFX9-LABEL: name: test_fdiv_v3s16 ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF @@ -1505,7 +1525,17 @@ body: | ; GFX9: [[DEF4:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[DEF4]](<2 x s16>) ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<6 x s16>), 0 - ; GFX9: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; GFX9: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; GFX9: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; GFX9: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; GFX9: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; GFX9: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; GFX9: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) ; GFX9-UNSAFE-LABEL: name: test_fdiv_v3s16 ; GFX9-UNSAFE: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-UNSAFE: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF @@ -1548,7 +1578,17 @@ body: | ; GFX9-UNSAFE: [[DEF4:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; GFX9-UNSAFE: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[DEF4]](<2 x s16>) ; GFX9-UNSAFE: [[EXTRACT:%[0-9]+]]:_(<3 x 
s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<6 x s16>), 0 - ; GFX9-UNSAFE: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; GFX9-UNSAFE: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; GFX9-UNSAFE: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; GFX9-UNSAFE: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; GFX9-UNSAFE: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; GFX9-UNSAFE: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; GFX9-UNSAFE: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; GFX9-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; GFX9-UNSAFE: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; GFX9-UNSAFE: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; GFX9-UNSAFE: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) ; GFX10-LABEL: name: test_fdiv_v3s16 ; GFX10: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX10: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF @@ -1603,11 +1643,22 @@ body: | ; GFX10: [[DEF4:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; GFX10: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[DEF4]](<2 x s16>) ; GFX10: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<6 x s16>), 0 - ; GFX10: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; GFX10: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; GFX10: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; GFX10: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; GFX10: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; GFX10: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; GFX10: [[LSHR5:%[0-9]+]]:_(s32) 
= G_LSHR [[BITCAST5]], [[C]](s32) + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; GFX10: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = G_IMPLICIT_DEF %2:_(<3 x s16>) = G_FDIV %0, %1 - S_NOP 0, implicit %2 + %3:_(<3 x s32>) = G_ANYEXT %2 + S_NOP 0, implicit %3 ... --- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ffloor.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ffloor.mir index a7d45beee984f..7f29187d072bb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ffloor.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ffloor.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=SI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefix=VI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=SI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_ffloor_s32 @@ -357,7 +357,17 @@ body: | ; SI: [[DEF2:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[DEF2]](<2 x s16>) ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT 
[[CONCAT_VECTORS1]](<6 x s16>), 0 - ; SI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; SI: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; SI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; SI: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) + ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; SI: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; SI: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) ; VI-LABEL: name: test_ffloor_v3s16 ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF @@ -389,7 +399,17 @@ body: | ; VI: [[DEF2:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[DEF2]](<2 x s16>) ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<6 x s16>), 0 - ; VI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; VI: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; VI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; VI: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) + ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY 
[[BITCAST5]](s32) + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; VI: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) ; GFX9-LABEL: name: test_ffloor_v3s16 ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF @@ -417,10 +437,21 @@ body: | ; GFX9: [[DEF3:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[DEF3]](<2 x s16>) ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<6 x s16>), 0 - ; GFX9: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; GFX9: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) + ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; GFX9: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = G_FFLOOR %0 - S_NOP 0, implicit %1 + %2:_(<3 x s32>) = G_ANYEXT %1 + S_NOP 0, implicit %2 ... 
--- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fma.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fma.mir index 6ec0fa70046ee..d3ca2704ba701 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fma.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fma.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=SI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefix=VI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=SI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_fma_s32 @@ -390,44 +390,42 @@ body: | name: test_fma_v3s16 body: | bb.0: + liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5, $vgpr6_vgpr7_vgpr8 ; SI-LABEL: name: test_fma_v3s16 + ; SI: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; SI: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + ; SI: [[COPY2:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr6_vgpr7_vgpr8 + ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; SI: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) + ; SI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY2]](<6 x s16>) ; SI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; SI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS 
[[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) - ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) - ; SI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) - ; SI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) - ; SI: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; SI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) - ; SI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) - ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 - ; SI: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; SI: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; SI: [[INSERT1:%[0-9]+]]:_(<4 x 
s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 - ; SI: [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV14]](<2 x s16>) + ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; SI: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; SI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) - ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV15]](<2 x s16>) + ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; SI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 - ; SI: [[UV16:%[0-9]+]]:_(<2 x s16>), [[UV17:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) - ; SI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV16]](<2 x s16>) + ; SI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV4]](<3 x s16>), 0 + ; SI: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; SI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) ; SI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32) - ; SI: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV17]](<2 x s16>) + ; SI: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) ; SI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32) ; SI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) ; SI: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) @@ -455,47 +453,82 @@ body: | ; SI: 
[[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32) ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; SI: [[BITCAST7:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; SI: [[DEF4:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; SI: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST6]](<2 x s16>), [[BITCAST7]](<2 x s16>), [[DEF4]](<2 x s16>) - ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS3]](<6 x s16>), 0 - ; SI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; SI: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST6]](<2 x s16>), [[BITCAST7]](<2 x s16>), [[DEF1]](<2 x s16>) + ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 + ; SI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; SI: [[UV12:%[0-9]+]]:_(<3 x s16>), [[UV13:%[0-9]+]]:_(<3 x s16>), [[UV14:%[0-9]+]]:_(<3 x s16>), [[UV15:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; SI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; SI: [[UV16:%[0-9]+]]:_(<2 x s16>), [[UV17:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT3]](<4 x s16>) + ; SI: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV16]](<2 x s16>) + ; SI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32) + ; SI: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[UV17]](<2 x s16>) + ; SI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST9]], [[C]](s32) + ; SI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV12]](<3 x s16>), 0 + ; SI: [[UV18:%[0-9]+]]:_(<2 x s16>), [[UV19:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT4]](<4 x s16>) + ; SI: [[BITCAST10:%[0-9]+]]:_(s32) = G_BITCAST [[UV18]](<2 x s16>) + ; SI: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST10]], [[C]](s32) + ; SI: [[BITCAST11:%[0-9]+]]:_(s32) = G_BITCAST 
[[UV19]](<2 x s16>) + ; SI: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST11]], [[C]](s32) + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST8]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]] + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL2]] + ; SI: [[BITCAST12:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST9]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C2]] + ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST10]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C2]] + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL3]] + ; SI: [[BITCAST13:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR8]](s32) + ; SI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]] + ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[BITCAST11]](s32) + ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C2]] + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL4]] + ; SI: [[BITCAST14:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; SI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST12]](<2 x s16>), [[BITCAST13]](<2 x s16>), [[BITCAST14]](<2 x s16>) + ; SI: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) ; VI-LABEL: name: test_fma_v3s16 + ; VI: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; VI: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + ; VI: [[COPY2:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr6_vgpr7_vgpr8 + ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; VI: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = 
G_UNMERGE_VALUES [[COPY1]](<6 x s16>) + ; VI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY2]](<6 x s16>) ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) - ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) - ; VI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) - ; VI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) - ; VI: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) - ; VI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) - ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 - ; VI: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; VI: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; VI: 
[[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 - ; VI: [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV14]](<2 x s16>) + ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; VI: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) - ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV15]](<2 x s16>) + ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 - ; VI: [[UV16:%[0-9]+]]:_(<2 x s16>), [[UV17:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) - ; VI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV16]](<2 x s16>) + ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV4]](<3 x s16>), 0 + ; VI: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; VI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) ; VI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32) - ; VI: 
[[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV17]](<2 x s16>) + ; VI: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) ; VI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32) ; VI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) ; VI: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC3]], [[TRUNC6]] @@ -511,67 +544,133 @@ body: | ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32) ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; VI: [[BITCAST7:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; VI: [[DEF4:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; VI: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST6]](<2 x s16>), [[BITCAST7]](<2 x s16>), [[DEF4]](<2 x s16>) - ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS3]](<6 x s16>), 0 - ; VI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; VI: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST6]](<2 x s16>), [[BITCAST7]](<2 x s16>), [[DEF1]](<2 x s16>) + ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 + ; VI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; VI: [[UV12:%[0-9]+]]:_(<3 x s16>), [[UV13:%[0-9]+]]:_(<3 x s16>), [[UV14:%[0-9]+]]:_(<3 x s16>), [[UV15:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; VI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; VI: [[UV16:%[0-9]+]]:_(<2 x s16>), [[UV17:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT3]](<4 x s16>) + ; VI: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV16]](<2 x s16>) + ; VI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32) + ; VI: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[UV17]](<2 x s16>) + ; VI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST9]], [[C]](s32) + ; VI: 
[[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV12]](<3 x s16>), 0 + ; VI: [[UV18:%[0-9]+]]:_(<2 x s16>), [[UV19:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT4]](<4 x s16>) + ; VI: [[BITCAST10:%[0-9]+]]:_(s32) = G_BITCAST [[UV18]](<2 x s16>) + ; VI: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST10]], [[C]](s32) + ; VI: [[BITCAST11:%[0-9]+]]:_(s32) = G_BITCAST [[UV19]](<2 x s16>) + ; VI: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST11]], [[C]](s32) + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST8]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]] + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL2]] + ; VI: [[BITCAST12:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST9]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C2]] + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST10]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C2]] + ; VI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) + ; VI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL3]] + ; VI: [[BITCAST13:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR8]](s32) + ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]] + ; VI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[BITCAST11]](s32) + ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C2]] + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL4]] + ; VI: [[BITCAST14:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST12]](<2 x s16>), [[BITCAST13]](<2 x s16>), [[BITCAST14]](<2 x s16>) + ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) ; GFX9-LABEL: name: 
test_fma_v3s16 + ; GFX9: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + ; GFX9: [[COPY2:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr6_vgpr7_vgpr8 + ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; GFX9: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) + ; GFX9: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY2]](<6 x s16>) ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) - ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) - ; GFX9: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) - ; GFX9: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) - ; GFX9: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) - ; GFX9: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) - ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 - ; GFX9: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; GFX9: 
[[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; GFX9: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY]](s32), [[COPY1]](s32) - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) - ; GFX9: [[DEF4:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[DEF4]](s32) - ; GFX9: [[DEF5:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 - ; GFX9: [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV14]](<2 x s16>) + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[COPY4]](s32) + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; GFX9: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[DEF1]](s32) + ; GFX9: [[DEF2:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; GFX9: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = 
G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV15]](<2 x s16>) + ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[COPY4]](s32) - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[DEF4]](s32) - ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 - ; GFX9: [[UV16:%[0-9]+]]:_(<2 x s16>), [[UV17:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) - ; GFX9: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV16]](<2 x s16>) + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY8]](s32), [[DEF1]](s32) + ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV4]](<3 x s16>), 0 + ; GFX9: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; GFX9: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) ; GFX9: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) - ; GFX9: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV17]](<2 x s16>) + ; GFX9: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) ; GFX9: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) - ; GFX9: 
[[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) - ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[COPY7]](s32) - ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY8]](s32), [[DEF4]](s32) + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32) + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY11]](s32), [[DEF1]](s32) ; GFX9: [[FMA:%[0-9]+]]:_(<2 x s16>) = G_FMA [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]], [[BUILD_VECTOR_TRUNC4]] ; GFX9: [[FMA1:%[0-9]+]]:_(<2 x s16>) = G_FMA [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]], [[BUILD_VECTOR_TRUNC5]] - ; GFX9: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<2 x s16>), [[FMA1]](<2 x s16>), [[DEF5]](<2 x s16>) - ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS3]](<6 x s16>), 0 - ; GFX9: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) - %0:_(<3 x s16>) = G_IMPLICIT_DEF - %1:_(<3 x s16>) = G_IMPLICIT_DEF - %2:_(<3 x s16>) = G_IMPLICIT_DEF - %3:_(<3 x s16>) = G_FMA %0, %1, %2 - S_NOP 0, implicit %3 + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<2 x s16>), [[FMA1]](<2 x s16>), [[DEF2]](<2 x s16>) + ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 + ; GFX9: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF3]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; GFX9: [[UV12:%[0-9]+]]:_(<3 x s16>), [[UV13:%[0-9]+]]:_(<3 x s16>), [[UV14:%[0-9]+]]:_(<3 x s16>), [[UV15:%[0-9]+]]:_(<3 x s16>) = 
G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX9: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; GFX9: [[UV16:%[0-9]+]]:_(<2 x s16>), [[UV17:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT3]](<4 x s16>) + ; GFX9: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV16]](<2 x s16>) + ; GFX9: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; GFX9: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV17]](<2 x s16>) + ; GFX9: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; GFX9: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV12]](<3 x s16>), 0 + ; GFX9: [[UV18:%[0-9]+]]:_(<2 x s16>), [[UV19:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT4]](<4 x s16>) + ; GFX9: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV18]](<2 x s16>) + ; GFX9: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32) + ; GFX9: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[UV19]](<2 x s16>) + ; GFX9: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST9]], [[C]](s32) + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[BITCAST8]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC7:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR8]](s32) + ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[BITCAST9]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC8:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32) + ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC6]](<2 x s16>), [[BUILD_VECTOR_TRUNC7]](<2 x s16>), [[BUILD_VECTOR_TRUNC8]](<2 x s16>) + ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) + %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + %1:_(<6 x 
s16>) = COPY $vgpr3_vgpr4_vgpr5 + %2:_(<6 x s16>) = COPY $vgpr6_vgpr7_vgpr8 + %3:_(<3 x s16>), %4:_(<3 x s16>) = G_UNMERGE_VALUES %0 + %5:_(<3 x s16>), %6:_(<3 x s16>) = G_UNMERGE_VALUES %1 + %7:_(<3 x s16>), %8:_(<3 x s16>) = G_UNMERGE_VALUES %2 + %9:_(<3 x s16>) = G_FMA %3, %5, %7 + %10:_(<3 x s16>) = G_IMPLICIT_DEF + %11:_(<6 x s16>) = G_CONCAT_VECTORS %9, %10 + $vgpr0_vgpr1_vgpr2 = COPY %11 + ... --- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaxnum.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaxnum.mir index 1d5c7289134e2..167e011956f8c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaxnum.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaxnum.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=SI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefix=VI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=SI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_fmaxnum_s32_ieee_mode_on @@ -388,31 +388,30 @@ body: | name: test_fmaxnum_v3s16 body: | bb.0: - liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - + liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5 ; SI-LABEL: name: test_fmaxnum_v3s16 - ; SI: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 - ; SI: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 - ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY]](<4 x s16>), 0 - ; SI: 
[[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY1]](<4 x s16>), 0 + ; SI: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; SI: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; SI: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) ; SI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; SI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; SI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT1]](<3 x s16>), 0 - ; SI: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; SI: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) ; SI: 
[[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; SI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) - ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; SI: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) @@ -439,32 +438,69 @@ body: | ; SI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; SI: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[DEF1]](<2 x s16>) - ; SI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 - ; SI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT2]](<3 x s16>), 0 - ; SI: $vgpr0_vgpr1 = COPY [[INSERT2]](<4 x s16>) + ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 + ; SI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; SI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; SI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; SI: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; SI: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; SI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; SI: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; SI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; SI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV8]](<3 x s16>), 
0 + ; SI: [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT3]](<4 x s16>) + ; SI: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV14]](<2 x s16>) + ; SI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32) + ; SI: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[UV15]](<2 x s16>) + ; SI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST9]], [[C]](s32) + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL2]] + ; SI: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]] + ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST8]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C2]] + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL3]] + ; SI: [[BITCAST11:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; SI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C2]] + ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST9]](s32) + ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]] + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL4]] + ; SI: [[BITCAST12:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; SI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST10]](<2 x s16>), [[BITCAST11]](<2 x s16>), [[BITCAST12]](<2 x s16>) + ; SI: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) ; VI-LABEL: name: test_fmaxnum_v3s16 - ; VI: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 - ; VI: 
[[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 - ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY]](<4 x s16>), 0 - ; VI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY1]](<4 x s16>), 0 + ; VI: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; VI: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; VI: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; VI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; VI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT1]](<3 x s16>), 0 - ; VI: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; VI: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 
x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) - ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; VI: [[FCANONICALIZE:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC]] @@ -488,38 +524,101 @@ body: | ; VI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; VI: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[DEF1]](<2 x s16>) - ; VI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 - ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT2]](<3 x s16>), 0 - ; VI: $vgpr0_vgpr1 = COPY [[INSERT2]](<4 x s16>) + ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 + ; VI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; VI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; VI: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; VI: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; VI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; VI: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; VI: 
[[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; VI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV8]](<3 x s16>), 0 + ; VI: [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT3]](<4 x s16>) + ; VI: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV14]](<2 x s16>) + ; VI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32) + ; VI: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[UV15]](<2 x s16>) + ; VI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST9]], [[C]](s32) + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL2]] + ; VI: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]] + ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST8]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C2]] + ; VI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) + ; VI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL3]] + ; VI: [[BITCAST11:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C2]] + ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST9]](s32) + ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]] + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL4]] + ; VI: [[BITCAST12:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST10]](<2 x s16>), [[BITCAST11]](<2 x s16>), [[BITCAST12]](<2 x s16>) + ; VI: $vgpr0_vgpr1_vgpr2 = 
COPY [[CONCAT_VECTORS2]](<6 x s16>) ; GFX9-LABEL: name: test_fmaxnum_v3s16 - ; GFX9: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 - ; GFX9: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 - ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY]](<4 x s16>), 0 - ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY1]](<4 x s16>), 0 + ; GFX9: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; GFX9: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT1]](<3 x s16>), 0 - ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; GFX9: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; GFX9: [[FCANONICALIZE:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[UV]] - ; GFX9: [[FCANONICALIZE1:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[UV2]] + ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; GFX9: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; GFX9: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; GFX9: [[FCANONICALIZE:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[UV4]] + ; GFX9: [[FCANONICALIZE1:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[UV6]] ; GFX9: [[FMAXNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]] - ; GFX9: [[FCANONICALIZE2:%[0-9]+]]:_(<2 x s16>) = 
G_FCANONICALIZE [[UV1]] - ; GFX9: [[FCANONICALIZE3:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[UV3]] + ; GFX9: [[FCANONICALIZE2:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[UV5]] + ; GFX9: [[FCANONICALIZE3:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[UV7]] ; GFX9: [[FMAXNUM_IEEE1:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE [[FCANONICALIZE2]], [[FCANONICALIZE3]] ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[FMAXNUM_IEEE]](<2 x s16>), [[FMAXNUM_IEEE1]](<2 x s16>) ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[CONCAT_VECTORS]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) - ; GFX9: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) - ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV4]](<3 x s16>), 0 - ; GFX9: $vgpr0_vgpr1 = COPY [[INSERT2]](<4 x s16>) - %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 - %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 - %2:_(<3 x s16>) = G_EXTRACT %0, 0 - %3:_(<3 x s16>) = G_EXTRACT %1, 0 - %4:_(<3 x s16>) = G_FMAXNUM %2, %3 - %5:_(<4 x s16>) = G_IMPLICIT_DEF - %6:_(<4 x s16>) = G_INSERT %5, %4, 0 - $vgpr0_vgpr1 = COPY %6 + ; GFX9: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF1]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; GFX9: [[UV12:%[0-9]+]]:_(<3 x s16>), [[UV13:%[0-9]+]]:_(<3 x s16>), [[UV14:%[0-9]+]]:_(<3 x s16>), [[UV15:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV8]](<3 x s16>), 0 + ; GFX9: [[UV16:%[0-9]+]]:_(<2 x s16>), [[UV17:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; GFX9: 
[[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV16]](<2 x s16>) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV17]](<2 x s16>) + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV12]](<3 x s16>), 0 + ; GFX9: [[UV18:%[0-9]+]]:_(<2 x s16>), [[UV19:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT3]](<4 x s16>) + ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV18]](<2 x s16>) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV19]](<2 x s16>) + ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32) + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[COPY5]](s32) + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS3]](<6 x s16>) + %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + %1:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + %2:_(<3 x s16>), %3:_(<3 x s16>) = G_UNMERGE_VALUES %0 + %4:_(<3 x s16>), %5:_(<3 x s16>) = G_UNMERGE_VALUES %1 + %6:_(<3 x s16>) = G_FMAXNUM %2, %4 + %7:_(<3 x s16>) = G_IMPLICIT_DEF + %8:_(<6 x 
s16>) = G_CONCAT_VECTORS %6, %7 + $vgpr0_vgpr1_vgpr2 = COPY %8 + ... --- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminnum.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminnum.mir index 523fbefe0723e..2ba3345eb6cbe 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminnum.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminnum.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=SI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefix=VI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=SI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_fminnum_s32_ieee_mode_on @@ -388,31 +388,30 @@ body: | name: test_fminnum_v3s16 body: | bb.0: - liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - + liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5 ; SI-LABEL: name: test_fminnum_v3s16 - ; SI: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 - ; SI: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 - ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY]](<4 x s16>), 0 - ; SI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY1]](<4 x s16>), 0 + ; SI: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; SI: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; SI: [[UV2:%[0-9]+]]:_(<3 x s16>), 
[[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) ; SI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; SI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; SI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT1]](<3 x s16>), 0 - ; SI: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; SI: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; SI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) - ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC 
[[BITCAST3]](s32) ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; SI: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) @@ -439,32 +438,69 @@ body: | ; SI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; SI: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[DEF1]](<2 x s16>) - ; SI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 - ; SI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT2]](<3 x s16>), 0 - ; SI: $vgpr0_vgpr1 = COPY [[INSERT2]](<4 x s16>) + ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 + ; SI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; SI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; SI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; SI: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; SI: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; SI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; SI: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; SI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; SI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV8]](<3 x s16>), 0 + ; SI: [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT3]](<4 x s16>) + ; SI: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV14]](<2 x s16>) + ; SI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32) + ; SI: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[UV15]](<2 x s16>) + ; SI: [[LSHR7:%[0-9]+]]:_(s32) = 
G_LSHR [[BITCAST9]], [[C]](s32) + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL2]] + ; SI: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]] + ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST8]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C2]] + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL3]] + ; SI: [[BITCAST11:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; SI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C2]] + ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST9]](s32) + ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]] + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL4]] + ; SI: [[BITCAST12:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; SI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST10]](<2 x s16>), [[BITCAST11]](<2 x s16>), [[BITCAST12]](<2 x s16>) + ; SI: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) ; VI-LABEL: name: test_fminnum_v3s16 - ; VI: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 - ; VI: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 - ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY]](<4 x s16>), 0 - ; VI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY1]](<4 x s16>), 0 + ; VI: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; VI: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + ; VI: 
[[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; VI: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; VI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; VI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT1]](<3 x s16>), 0 - ; VI: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; VI: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) - ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = 
G_BITCAST [[UV3]](<2 x s16>) + ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; VI: [[FCANONICALIZE:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC]] @@ -488,38 +524,101 @@ body: | ; VI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; VI: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[DEF1]](<2 x s16>) - ; VI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 - ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT2]](<3 x s16>), 0 - ; VI: $vgpr0_vgpr1 = COPY [[INSERT2]](<4 x s16>) + ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 + ; VI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; VI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; VI: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; VI: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; VI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; VI: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; VI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; VI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV8]](<3 x s16>), 0 + ; VI: [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT3]](<4 x s16>) + ; VI: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV14]](<2 x s16>) + ; VI: 
[[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32) + ; VI: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[UV15]](<2 x s16>) + ; VI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST9]], [[C]](s32) + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL2]] + ; VI: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]] + ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST8]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C2]] + ; VI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) + ; VI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL3]] + ; VI: [[BITCAST11:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C2]] + ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST9]](s32) + ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]] + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL4]] + ; VI: [[BITCAST12:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST10]](<2 x s16>), [[BITCAST11]](<2 x s16>), [[BITCAST12]](<2 x s16>) + ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) ; GFX9-LABEL: name: test_fminnum_v3s16 - ; GFX9: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 - ; GFX9: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 - ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY]](<4 x s16>), 0 - ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT 
[[COPY1]](<4 x s16>), 0 + ; GFX9: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; GFX9: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT1]](<3 x s16>), 0 - ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; GFX9: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; GFX9: [[FCANONICALIZE:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[UV]] - ; GFX9: [[FCANONICALIZE1:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[UV2]] + ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; GFX9: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; GFX9: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; GFX9: [[FCANONICALIZE:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[UV4]] + ; GFX9: [[FCANONICALIZE1:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[UV6]] ; GFX9: [[FMINNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]] - ; GFX9: [[FCANONICALIZE2:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[UV1]] - ; GFX9: [[FCANONICALIZE3:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[UV3]] + ; GFX9: [[FCANONICALIZE2:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[UV5]] + ; GFX9: [[FCANONICALIZE3:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[UV7]] ; GFX9: [[FMINNUM_IEEE1:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM_IEEE 
[[FCANONICALIZE2]], [[FCANONICALIZE3]] ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[FMINNUM_IEEE]](<2 x s16>), [[FMINNUM_IEEE1]](<2 x s16>) ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[CONCAT_VECTORS]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) - ; GFX9: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) - ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV4]](<3 x s16>), 0 - ; GFX9: $vgpr0_vgpr1 = COPY [[INSERT2]](<4 x s16>) - %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 - %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 - %2:_(<3 x s16>) = G_EXTRACT %0, 0 - %3:_(<3 x s16>) = G_EXTRACT %1, 0 - %4:_(<3 x s16>) = G_FMINNUM %2, %3 - %5:_(<4 x s16>) = G_IMPLICIT_DEF - %6:_(<4 x s16>) = G_INSERT %5, %4, 0 - $vgpr0_vgpr1 = COPY %6 + ; GFX9: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF1]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; GFX9: [[UV12:%[0-9]+]]:_(<3 x s16>), [[UV13:%[0-9]+]]:_(<3 x s16>), [[UV14:%[0-9]+]]:_(<3 x s16>), [[UV15:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV8]](<3 x s16>), 0 + ; GFX9: [[UV16:%[0-9]+]]:_(<2 x s16>), [[UV17:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV16]](<2 x s16>) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV17]](<2 x s16>) + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: 
[[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV12]](<3 x s16>), 0 + ; GFX9: [[UV18:%[0-9]+]]:_(<2 x s16>), [[UV19:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT3]](<4 x s16>) + ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV18]](<2 x s16>) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV19]](<2 x s16>) + ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32) + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[COPY5]](s32) + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS3]](<6 x s16>) + %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + %1:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + %2:_(<3 x s16>), %3:_(<3 x s16>) = G_UNMERGE_VALUES %0 + %4:_(<3 x s16>), %5:_(<3 x s16>) = G_UNMERGE_VALUES %1 + %6:_(<3 x s16>) = G_FMINNUM %2, %4 + %7:_(<3 x s16>) = G_IMPLICIT_DEF + %8:_(<6 x s16>) = G_CONCAT_VECTORS %6, %7 + $vgpr0_vgpr1_vgpr2 = COPY %8 + ... 
--- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmul.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmul.mir index 3dcf8e9336c60..9e3007d070f03 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmul.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmul.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=SI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefix=VI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=SI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_fmul_s32 @@ -325,33 +325,31 @@ body: | name: test_fmul_v3s16 body: | bb.0: - liveins: $vgpr0, $vgpr1 + liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5 ; SI-LABEL: name: test_fmul_v3s16 + ; SI: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; SI: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; SI: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) ; SI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; SI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) - ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), 
[[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) - ; SI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) - ; SI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) - ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 - ; SI: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) + ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; SI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) + ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 - ; SI: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) + ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; SI: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = 
G_TRUNC [[BITCAST2]](s32) ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; SI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) - ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) + ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; SI: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) @@ -376,35 +374,71 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32) ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; SI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; SI: [[DEF3:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; SI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[DEF3]](<2 x s16>) - ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<6 x s16>), 0 - ; SI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; SI: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[DEF1]](<2 x s16>) + ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 + ; SI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; SI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; SI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; SI: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; SI: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; SI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; SI: 
[[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; SI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; SI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV8]](<3 x s16>), 0 + ; SI: [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT3]](<4 x s16>) + ; SI: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV14]](<2 x s16>) + ; SI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32) + ; SI: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[UV15]](<2 x s16>) + ; SI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST9]], [[C]](s32) + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL2]] + ; SI: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]] + ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST8]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C2]] + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL3]] + ; SI: [[BITCAST11:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; SI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C2]] + ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST9]](s32) + ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]] + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL4]] + ; SI: [[BITCAST12:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; SI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST10]](<2 x s16>), 
[[BITCAST11]](<2 x s16>), [[BITCAST12]](<2 x s16>) + ; SI: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) ; VI-LABEL: name: test_fmul_v3s16 + ; VI: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; VI: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; VI: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) - ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) - ; VI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) - ; VI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) - ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 - ; VI: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) + ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; VI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC 
[[LSHR]](s32) - ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) + ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 - ; VI: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) + ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; VI: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) - ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) + ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; VI: [[FMUL:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC]], [[TRUNC3]] @@ -420,52 +454,118 @@ body: | ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32) ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; VI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; VI: [[DEF3:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[DEF3]](<2 x s16>) - ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<6 x s16>), 0 - ; VI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; VI: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 
x s16>), [[DEF1]](<2 x s16>) + ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 + ; VI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; VI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; VI: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; VI: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; VI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; VI: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; VI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; VI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV8]](<3 x s16>), 0 + ; VI: [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT3]](<4 x s16>) + ; VI: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV14]](<2 x s16>) + ; VI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32) + ; VI: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[UV15]](<2 x s16>) + ; VI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST9]], [[C]](s32) + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL2]] + ; VI: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]] + ; 
VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST8]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C2]] + ; VI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) + ; VI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL3]] + ; VI: [[BITCAST11:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C2]] + ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST9]](s32) + ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]] + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL4]] + ; VI: [[BITCAST12:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST10]](<2 x s16>), [[BITCAST11]](<2 x s16>), [[BITCAST12]](<2 x s16>) + ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) ; GFX9-LABEL: name: test_fmul_v3s16 + ; GFX9: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; GFX9: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) - ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) - ; GFX9: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) - ; GFX9: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), 
[[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) - ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 - ; GFX9: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) + ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; GFX9: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) + ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY]](s32), [[COPY1]](s32) - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) - ; GFX9: [[DEF3:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[DEF3]](s32) - ; GFX9: [[DEF4:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 - ; GFX9: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32) + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY 
[[BITCAST1]](s32) + ; GFX9: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[DEF1]](s32) + ; GFX9: [[DEF2:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; GFX9: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) + ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[COPY4]](s32) - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[DEF3]](s32) + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[COPY6]](s32) + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY7]](s32), [[DEF1]](s32) ; GFX9: [[FMUL:%[0-9]+]]:_(<2 x s16>) = G_FMUL [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]] ; GFX9: [[FMUL1:%[0-9]+]]:_(<2 x s16>) = G_FMUL [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]] - ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMUL]](<2 x s16>), [[FMUL1]](<2 x s16>), [[DEF4]](<2 x s16>) - ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<6 x s16>), 0 - ; GFX9: S_NOP 0, 
implicit [[EXTRACT]](<3 x s16>) - %0:_(<3 x s16>) = G_IMPLICIT_DEF - %1:_(<3 x s16>) = G_IMPLICIT_DEF - %2:_(<3 x s16>) = G_FMUL %0, %1 - S_NOP 0, implicit %2 + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMUL]](<2 x s16>), [[FMUL1]](<2 x s16>), [[DEF2]](<2 x s16>) + ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 + ; GFX9: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF3]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; GFX9: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; GFX9: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; GFX9: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; GFX9: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; GFX9: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; GFX9: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; GFX9: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV8]](<3 x s16>), 0 + ; GFX9: [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT3]](<4 x s16>) + ; GFX9: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV14]](<2 x s16>) + ; GFX9: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; GFX9: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV15]](<2 x s16>) + ; GFX9: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY8]](s32), [[COPY9]](s32) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; 
GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) + ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC4]](<2 x s16>), [[BUILD_VECTOR_TRUNC5]](<2 x s16>), [[BUILD_VECTOR_TRUNC6]](<2 x s16>) + ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) + %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + %1:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + %2:_(<3 x s16>), %3:_(<3 x s16>) = G_UNMERGE_VALUES %0 + %4:_(<3 x s16>), %5:_(<3 x s16>) = G_UNMERGE_VALUES %1 + %6:_(<3 x s16>) = G_FMUL %2, %4 + %7:_(<3 x s16>) = G_IMPLICIT_DEF + %8:_(<6 x s16>) = G_CONCAT_VECTORS %6, %7 + $vgpr0_vgpr1_vgpr2 = COPY %8 + ... --- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fneg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fneg.mir index 35f229088167c..0f5ff912984f2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fneg.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fneg.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=SI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefix=VI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=SI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s +# RUN: llc 
-mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_fneg_s32 @@ -236,7 +236,20 @@ body: | ; SI: [[FNEG1:%[0-9]+]]:_(<2 x s16>) = G_FNEG [[BITCAST3]] ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FNEG]](<2 x s16>), [[FNEG1]](<2 x s16>), [[DEF2]](<2 x s16>) ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<6 x s16>), 0 - ; SI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; SI: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; SI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; SI: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) + ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; SI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[AND3]](s32), [[AND4]](s32), [[AND5]](s32) + ; SI: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) ; VI-LABEL: name: test_fneg_v3s16 ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF @@ -268,7 +281,20 @@ body: | ; VI: [[FNEG1:%[0-9]+]]:_(<2 x s16>) = G_FNEG [[BITCAST3]] ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FNEG]](<2 x s16>), [[FNEG1]](<2 x s16>), [[DEF2]](<2 x s16>) ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<6 x s16>), 0 - ; VI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; VI: [[INSERT1:%[0-9]+]]:_(<4 x 
s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; VI: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; VI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; VI: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) + ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[AND3]](s32), [[AND4]](s32), [[AND5]](s32) + ; VI: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) ; GFX9-LABEL: name: test_fneg_v3s16 ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF @@ -292,10 +318,25 @@ body: | ; GFX9: [[FNEG1:%[0-9]+]]:_(<2 x s16>) = G_FNEG [[BUILD_VECTOR_TRUNC1]] ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FNEG]](<2 x s16>), [[FNEG1]](<2 x s16>), [[DEF3]](<2 x s16>) ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<6 x s16>), 0 - ; GFX9: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; GFX9: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) + ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9: 
[[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[AND]](s32), [[AND1]](s32), [[AND2]](s32) + ; GFX9: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = G_FNEG %0 - S_NOP 0, implicit %1 + %2:_(<3 x s32>) = G_ZEXT %1 + S_NOP 0, implicit %2 ... --- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpext.mir index 844c972a6dfab..d5b1c180fe0ef 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpext.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpext.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck %s --- name: test_fpext_f16_to_f32 @@ -66,21 +66,19 @@ body: | name: test_fpext_v3f16_to_v3f32 body: | bb.0: - liveins: $vgpr0 - + liveins: $vgpr0_vgpr1_vgpr2 ; CHECK-LABEL: name: test_fpext_v3f16_to_v3f32 + ; CHECK: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) - ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) 
= G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) - ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 - ; CHECK: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) + ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; CHECK: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) + ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) ; CHECK: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; CHECK: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) @@ -88,9 +86,10 @@ body: | ; CHECK: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16) ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FPEXT]](s32), [[FPEXT1]](s32), [[FPEXT2]](s32) ; CHECK: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) - %0:_(<3 x s16>) = G_IMPLICIT_DEF - %1:_(<3 x s32>) = G_FPEXT %0 - $vgpr0_vgpr1_vgpr2 = COPY %1 + %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + %1:_(<3 x s16>), %2:_(<3 x s16>) = G_UNMERGE_VALUES %0 + %3:_(<3 x s32>) = G_FPEXT %1 + $vgpr0_vgpr1_vgpr2 = COPY %3 ... 
--- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsin.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsin.mir index a62da1ad54b78..8b40eabef07f5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsin.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsin.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -o - %s | FileCheck -check-prefix=SI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -o - %s | FileCheck -check-prefix=VI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=SI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=VI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s --- name: test_fsin_s32 @@ -368,7 +368,17 @@ body: | ; SI: [[DEF2:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[DEF2]](<2 x s16>) ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<6 x s16>), 0 - ; SI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; SI: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; SI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; SI: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) + ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; SI: [[COPY:%[0-9]+]]:_(s32) = 
COPY [[BITCAST4]](s32) + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; SI: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) ; VI-LABEL: name: test_fsin_v3s16 ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF @@ -407,7 +417,17 @@ body: | ; VI: [[DEF2:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[DEF2]](<2 x s16>) ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<6 x s16>), 0 - ; VI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; VI: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; VI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; VI: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) + ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; VI: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) ; GFX9-LABEL: name: test_fsin_v3s16 ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF @@ -439,10 +459,21 @@ body: | ; GFX9: [[DEF3:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[DEF3]](<2 x s16>) ; GFX9: 
[[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<6 x s16>), 0 - ; GFX9: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; GFX9: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) + ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; GFX9: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = G_FSIN %0 - S_NOP 0, implicit %1 + %2:_(<3 x s32>) = G_ANYEXT %1 + S_NOP 0, implicit %2 ... 
--- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir index 033f8dd2dd18b..120599d775963 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=SI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefix=VI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=SI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_fsqrt_s32 @@ -281,7 +281,17 @@ body: | ; SI: [[DEF2:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[DEF2]](<2 x s16>) ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<6 x s16>), 0 - ; SI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; SI: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; SI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; SI: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) + ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; SI: 
[[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; SI: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) ; VI-LABEL: name: test_fsqrt_v3s16 ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF @@ -313,7 +323,17 @@ body: | ; VI: [[DEF2:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[DEF2]](<2 x s16>) ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<6 x s16>), 0 - ; VI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; VI: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; VI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; VI: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) + ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; VI: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) ; GFX9-LABEL: name: test_fsqrt_v3s16 ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF @@ -341,10 +361,21 @@ body: | ; GFX9: [[DEF3:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[DEF3]](<2 x 
s16>) ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<6 x s16>), 0 - ; GFX9: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; GFX9: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) + ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; GFX9: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = G_FSQRT %0 - S_NOP 0, implicit %1 + %2:_(<3 x s32>) = G_ANYEXT %1 + S_NOP 0, implicit %2 ... 
--- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsub.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsub.mir index f5ad411f2b5c8..7ed82fc5614f3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsub.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsub.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=SI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefix=VI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=SI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_fsub_s32 @@ -369,34 +369,44 @@ body: | --- name: test_fsub_v3s16 body: | - bb.0.entry: - liveins: $vgpr0, $vgpr1 - + bb.0: + liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5 + ; CHECK-LABEL: name: test_or_v3s16 + ; CHECK: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; CHECK: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) + ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; CHECK: [[OR:%[0-9]+]]:_(<4 x s16>) = G_OR [[INSERT]], [[INSERT1]] 
+ ; CHECK: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[OR]](<4 x s16>), 0 + ; CHECK: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[EXTRACT]](<3 x s16>), [[EXTRACT1]](<3 x s16>) + ; CHECK: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) ; SI-LABEL: name: test_fsub_v3s16 + ; SI: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; SI: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; SI: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) ; SI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; SI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) - ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) - ; SI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) - ; SI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) - ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 - ; SI: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) + ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; SI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x 
s16>) ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) + ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 - ; SI: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) + ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; SI: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; SI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) - ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) + ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; SI: [[FNEG:%[0-9]+]]:_(s16) = G_FNEG [[TRUNC3]] @@ -424,35 +434,71 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32) ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; SI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; SI: [[DEF3:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; SI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[DEF3]](<2 x s16>) - ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<6 x s16>), 0 - ; SI: 
S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; SI: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[DEF1]](<2 x s16>) + ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 + ; SI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; SI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; SI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; SI: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; SI: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; SI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; SI: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; SI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; SI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV8]](<3 x s16>), 0 + ; SI: [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT3]](<4 x s16>) + ; SI: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV14]](<2 x s16>) + ; SI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32) + ; SI: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[UV15]](<2 x s16>) + ; SI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST9]], [[C]](s32) + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR 
[[AND]], [[SHL2]] + ; SI: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]] + ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST8]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C2]] + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL3]] + ; SI: [[BITCAST11:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; SI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C2]] + ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST9]](s32) + ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]] + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL4]] + ; SI: [[BITCAST12:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; SI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST10]](<2 x s16>), [[BITCAST11]](<2 x s16>), [[BITCAST12]](<2 x s16>) + ; SI: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) ; VI-LABEL: name: test_fsub_v3s16 + ; VI: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; VI: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; VI: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) - ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) - ; VI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: 
[[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) - ; VI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) - ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 - ; VI: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) + ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; VI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) + ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 - ; VI: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) + ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; VI: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) 
- ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) + ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; VI: [[FNEG:%[0-9]+]]:_(s16) = G_FNEG [[TRUNC3]] @@ -471,35 +517,71 @@ body: | ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32) ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; VI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; VI: [[DEF3:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[DEF3]](<2 x s16>) - ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<6 x s16>), 0 - ; VI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; VI: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[DEF1]](<2 x s16>) + ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 + ; VI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; VI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; VI: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; VI: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; VI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; VI: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; VI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; VI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = 
G_INSERT [[DEF]], [[UV8]](<3 x s16>), 0 + ; VI: [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT3]](<4 x s16>) + ; VI: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV14]](<2 x s16>) + ; VI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32) + ; VI: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[UV15]](<2 x s16>) + ; VI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST9]], [[C]](s32) + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL2]] + ; VI: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]] + ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST8]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C2]] + ; VI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) + ; VI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL3]] + ; VI: [[BITCAST11:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C2]] + ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST9]](s32) + ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]] + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL4]] + ; VI: [[BITCAST12:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST10]](<2 x s16>), [[BITCAST11]](<2 x s16>), [[BITCAST12]](<2 x s16>) + ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) ; GFX9-LABEL: name: test_fsub_v3s16 + ; GFX9: [[COPY:%[0-9]+]]:_(<6 x 
s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; GFX9: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) - ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) - ; GFX9: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) - ; GFX9: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) - ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 - ; GFX9: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) + ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; GFX9: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) + ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; 
GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 - ; GFX9: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) + ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; GFX9: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX9: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) - ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) + ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; GFX9: [[FNEG:%[0-9]+]]:_(s16) = G_FNEG [[TRUNC3]] @@ -512,16 +594,46 @@ body: | ; GFX9: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD1]](s16) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) ; GFX9: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD2]](s16) - ; GFX9: [[DEF3:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[DEF3]](s32) - ; GFX9: [[DEF4:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[DEF4]](<2 x s16>) - ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<6 x s16>), 0 - ; GFX9: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) - %0:_(<3 x s16>) = G_IMPLICIT_DEF - %1:_(<3 x 
s16>) = G_IMPLICIT_DEF - %2:_(<3 x s16>) = G_FSUB %0, %1 - S_NOP 0, implicit %2 + ; GFX9: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[DEF1]](s32) + ; GFX9: [[DEF2:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[DEF2]](<2 x s16>) + ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 + ; GFX9: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF3]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; GFX9: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; GFX9: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; GFX9: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; GFX9: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; GFX9: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; GFX9: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; GFX9: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV8]](<3 x s16>), 0 + ; GFX9: [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT3]](<4 x s16>) + ; GFX9: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV14]](<2 x s16>) + ; GFX9: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; GFX9: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV15]](<2 x s16>) + ; GFX9: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; GFX9: 
[[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32) + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[COPY5]](s32) + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), [[BUILD_VECTOR_TRUNC4]](<2 x s16>) + ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) + %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + %1:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + %2:_(<3 x s16>), %3:_(<3 x s16>) = G_UNMERGE_VALUES %0 + %4:_(<3 x s16>), %5:_(<3 x s16>) = G_UNMERGE_VALUES %1 + %6:_(<3 x s16>) = G_FSUB %2, %4 + %7:_(<3 x s16>) = G_IMPLICIT_DEF + %8:_(<6 x s16>) = G_CONCAT_VECTORS %6, %7 + $vgpr0_vgpr1_vgpr2 = COPY %8 + ... 
--- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-insert.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-insert.mir index 99ff3fff27dda..d461c72c6e141 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-insert.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-insert.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -run-pass=legalizer %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck %s --- name: test_insert_s64_s32_offset0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-intrinsic-round.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-intrinsic-round.mir index dd3a230fc7434..124f7a173cce8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-intrinsic-round.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-intrinsic-round.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX6 %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX8 %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX6 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX8 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_intrinsic_round_s32 @@ -616,20 +616,19 @@ body: | name: test_intrinsic_round_v3s16 body: | bb.0: - liveins: $vgpr0_vgpr1 - + liveins: $vgpr0_vgpr1_vgpr2 ; GFX6-LABEL: name: 
test_intrinsic_round_v3s16 - ; GFX6: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 - ; GFX6: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY]](<4 x s16>), 0 + ; GFX6: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; GFX6: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) ; GFX6: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX6: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; GFX6: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; GFX6: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX6: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; GFX6: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; GFX6: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) ; GFX6: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX6: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX6: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX6: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX6: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) ; GFX6: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX6: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; GFX6: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) @@ -705,21 +704,58 @@ body: | ; GFX6: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) ; GFX6: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; GFX6: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[DEF1]](<2 x s16>) - ; GFX6: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 - ; GFX6: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT1]](<3 x s16>), 0 - ; GFX6: $vgpr0_vgpr1 = COPY [[INSERT1]](<4 x s16>) + ; 
GFX6: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 + ; GFX6: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX6: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; GFX6: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX6: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; GFX6: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; GFX6: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) + ; GFX6: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; GFX6: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) + ; GFX6: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; GFX6: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV4]](<3 x s16>), 0 + ; GFX6: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; GFX6: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) + ; GFX6: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; GFX6: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) + ; GFX6: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; GFX6: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; GFX6: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C6]] + ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX6: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C6]] + ; GFX6: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND4]], [[C]](s32) + ; GFX6: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND3]], [[SHL2]] + ; GFX6: [[BITCAST8:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32) + ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; GFX6: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY3]], 
[[C6]] + ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; GFX6: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C6]] + ; GFX6: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C]](s32) + ; GFX6: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND5]], [[SHL3]] + ; GFX6: [[BITCAST9:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR6]](s32) + ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; GFX6: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C6]] + ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; GFX6: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C6]] + ; GFX6: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND8]], [[C]](s32) + ; GFX6: [[OR7:%[0-9]+]]:_(s32) = G_OR [[AND7]], [[SHL4]] + ; GFX6: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR7]](s32) + ; GFX6: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST8]](<2 x s16>), [[BITCAST9]](<2 x s16>), [[BITCAST10]](<2 x s16>) + ; GFX6: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) ; GFX8-LABEL: name: test_intrinsic_round_v3s16 - ; GFX8: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 - ; GFX8: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY]](<4 x s16>), 0 + ; GFX8: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; GFX8: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) ; GFX8: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX8: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; GFX8: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX8: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; GFX8: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) ; GFX8: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT 
i32 16 ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX8: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) ; GFX8: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; GFX8: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s16) = G_INTRINSIC_TRUNC [[TRUNC]] @@ -765,21 +801,58 @@ body: | ; GFX8: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) ; GFX8: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; GFX8: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[DEF1]](<2 x s16>) - ; GFX8: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 - ; GFX8: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT1]](<3 x s16>), 0 - ; GFX8: $vgpr0_vgpr1 = COPY [[INSERT1]](<4 x s16>) + ; GFX8: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 + ; GFX8: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX8: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; GFX8: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX8: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; GFX8: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; GFX8: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) + ; GFX8: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; GFX8: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) + ; GFX8: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; GFX8: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV4]](<3 x 
s16>), 0 + ; GFX8: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; GFX8: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) + ; GFX8: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; GFX8: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) + ; GFX8: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; GFX8: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX8: [[COPY1:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; GFX8: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C6]] + ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX8: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C6]] + ; GFX8: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND4]], [[C]](s32) + ; GFX8: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND3]], [[SHL2]] + ; GFX8: [[BITCAST8:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32) + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; GFX8: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C6]] + ; GFX8: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; GFX8: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C6]] + ; GFX8: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C]](s32) + ; GFX8: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND5]], [[SHL3]] + ; GFX8: [[BITCAST9:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR6]](s32) + ; GFX8: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; GFX8: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C6]] + ; GFX8: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; GFX8: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C6]] + ; GFX8: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND8]], [[C]](s32) + ; GFX8: [[OR7:%[0-9]+]]:_(s32) = G_OR [[AND7]], [[SHL4]] + ; GFX8: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR7]](s32) + ; GFX8: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST8]](<2 x s16>), [[BITCAST9]](<2 x s16>), [[BITCAST10]](<2 x s16>) + ; GFX8: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) ; GFX9-LABEL: name: 
test_intrinsic_round_v3s16 - ; GFX9: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 - ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY]](<4 x s16>), 0 + ; GFX9: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; GFX9: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; GFX9: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s16) = G_INTRINSIC_TRUNC [[TRUNC]] @@ -821,15 +894,39 @@ body: | ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[DEF1]](s32) ; GFX9: [[DEF2:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[DEF2]](<2 x s16>) - ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 - ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], 
[[EXTRACT1]](<3 x s16>), 0 - ; GFX9: $vgpr0_vgpr1 = COPY [[INSERT1]](<4 x s16>) - %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 - %1:_(<3 x s16>) = G_EXTRACT %0, 0 - %2:_(<3 x s16>) = G_INTRINSIC_ROUND %1 - %3:_(<4 x s16>) = G_IMPLICIT_DEF - %4:_(<4 x s16>) = G_INSERT %3, %2, 0 - $vgpr0_vgpr1 = COPY %4 + ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 + ; GFX9: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF3]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; GFX9: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; GFX9: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) + ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV4]](<3 x s16>), 0 + ; GFX9: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; GFX9: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) + ; GFX9: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; GFX9: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) + ; GFX9: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY1]](s32), [[COPY2]](s32) + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; GFX9: 
[[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[COPY4]](s32) + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[COPY6]](s32) + ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), [[BUILD_VECTOR_TRUNC4]](<2 x s16>) + ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) + %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + %1:_(<3 x s16>), %2:_(<3 x s16>) = G_UNMERGE_VALUES %0 + %3:_(<3 x s16>) = G_INTRINSIC_ROUND %1 + %4:_(<3 x s16>) = G_IMPLICIT_DEF + %5:_(<6 x s16>) = G_CONCAT_VECTORS %3, %4 + $vgpr0_vgpr1_vgpr2 = COPY %5 ... --- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll index a1fa9e0cd1546..61e3204d5e7f1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -stop-after=legalizer -o - %s | FileCheck -check-prefix=UNPACKED %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=legalizer -o - %s | FileCheck -check-prefix=PACKED %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -stop-after=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=UNPACKED %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=PACKED %s define amdgpu_ps half @image_load_f16(<8 x i32> inreg %rsrc, 
i32 %s, i32 %t) { ; UNPACKED-LABEL: name: image_load_f16 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.ll index 335f84f3e7bb4..ffbc1c04ac1d1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -stop-after=legalizer -o - %s | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -stop-after=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GCN %s define amdgpu_ps float @image_load_f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { ; GCN-LABEL: name: image_load_f32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll index e9b61ff2065cd..166adf83f7274 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -stop-after=legalizer -o - %s | FileCheck -check-prefix=UNPACKED %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=legalizer -o - %s | FileCheck -check-prefix=PACKED %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -stop-after=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=UNPACKED %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=PACKED %s define amdgpu_ps void @image_store_f16(<8 
x i32> inreg %rsrc, i32 %s, i32 %t, half %data) { ; UNPACKED-LABEL: name: image_store_f16 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir index b75ec76d7ff44..2b23f5c28b86d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir @@ -50,11 +50,12 @@ body: | ; GCN-LABEL: name: s_buffer_load_v3p3 ; GCN: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GCN: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x p3>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load 12, align 4) - ; GCN: [[DEF:%[0-9]+]]:_(<4 x p3>) = G_IMPLICIT_DEF - ; GCN: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x p3>) = G_CONCAT_VECTORS [[AMDGPU_S_BUFFER_LOAD]](<4 x p3>), [[DEF]](<4 x p3>), [[DEF]](<4 x p3>) - ; GCN: [[UV:%[0-9]+]]:_(<3 x p3>), [[UV1:%[0-9]+]]:_(<3 x p3>), [[UV2:%[0-9]+]]:_(<3 x p3>), [[UV3:%[0-9]+]]:_(<3 x p3>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x p3>) - ; GCN: S_ENDPGM 0, implicit [[UV]](<3 x p3>) + ; GCN: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load 12, align 4) + ; GCN: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF + ; GCN: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s32>) = G_CONCAT_VECTORS [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>), [[DEF]](<4 x s32>), [[DEF]](<4 x s32>) + ; GCN: [[UV:%[0-9]+]]:_(<3 x s32>), [[UV1:%[0-9]+]]:_(<3 x s32>), [[UV2:%[0-9]+]]:_(<3 x s32>), [[UV3:%[0-9]+]]:_(<3 x s32>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s32>) + ; GCN: [[BITCAST:%[0-9]+]]:_(<3 x p3>) = G_BITCAST [[UV]](<3 x s32>) + ; GCN: S_ENDPGM 0, implicit [[BITCAST]](<3 x p3>) %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = G_CONSTANT i32 0 %2:_(<3 x p3>) = G_INTRINSIC 
intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant.mir index 3e19d5c9d9a05..131a81f8fe58f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant.mir @@ -11159,24 +11159,29 @@ body: | ; CI-LABEL: name: test_load_constant_v2p1_align16 ; CI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 - ; CI: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p4) :: (load 16, addrspace 4) - ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; CI: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load 16, addrspace 4) + ; CI: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; VI-LABEL: name: test_load_constant_v2p1_align16 ; VI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 - ; VI: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p4) :: (load 16, addrspace 4) - ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; VI: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load 16, addrspace 4) + ; VI: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; GFX9-LABEL: name: test_load_constant_v2p1_align16 ; GFX9: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 - ; GFX9: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p4) :: (load 16, addrspace 4) - ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; GFX9: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load 16, addrspace 4) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; CI-MESA-LABEL: name: test_load_constant_v2p1_align16 ; CI-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 - ; CI-MESA: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p4) :: (load 16, 
addrspace 4) - ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; CI-MESA: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load 16, addrspace 4) + ; CI-MESA: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; GFX9-MESA-LABEL: name: test_load_constant_v2p1_align16 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 - ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p4) :: (load 16, addrspace 4) - ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load 16, addrspace 4) + ; GFX9-MESA: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) %0:_(p4) = COPY $vgpr0_vgpr1 %1:_(<2 x p1>) = G_LOAD %0 :: (load 16, align 16, addrspace 4) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -11190,24 +11195,29 @@ body: | ; CI-LABEL: name: test_load_constant_v2p1_align8 ; CI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 - ; CI: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p4) :: (load 16, align 8, addrspace 4) - ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; CI: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load 16, align 8, addrspace 4) + ; CI: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; VI-LABEL: name: test_load_constant_v2p1_align8 ; VI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 - ; VI: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p4) :: (load 16, align 8, addrspace 4) - ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; VI: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load 16, align 8, addrspace 4) + ; VI: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; GFX9-LABEL: name: test_load_constant_v2p1_align8 ; 
GFX9: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 - ; GFX9: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p4) :: (load 16, align 8, addrspace 4) - ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; GFX9: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load 16, align 8, addrspace 4) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; CI-MESA-LABEL: name: test_load_constant_v2p1_align8 ; CI-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 - ; CI-MESA: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p4) :: (load 16, align 8, addrspace 4) - ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; CI-MESA: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load 16, align 8, addrspace 4) + ; CI-MESA: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; GFX9-MESA-LABEL: name: test_load_constant_v2p1_align8 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 - ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p4) :: (load 16, align 8, addrspace 4) - ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load 16, align 8, addrspace 4) + ; GFX9-MESA: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) %0:_(p4) = COPY $vgpr0_vgpr1 %1:_(<2 x p1>) = G_LOAD %0 :: (load 16, align 8, addrspace 4) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -11221,24 +11231,29 @@ body: | ; CI-LABEL: name: test_load_constant_v2p1_align4 ; CI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 - ; CI: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p4) :: (load 16, align 4, addrspace 4) - ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; CI: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load 16, align 4, addrspace 4) + ; CI: 
[[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; VI-LABEL: name: test_load_constant_v2p1_align4 ; VI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 - ; VI: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p4) :: (load 16, align 4, addrspace 4) - ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; VI: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load 16, align 4, addrspace 4) + ; VI: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; GFX9-LABEL: name: test_load_constant_v2p1_align4 ; GFX9: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 - ; GFX9: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p4) :: (load 16, align 4, addrspace 4) - ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; GFX9: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load 16, align 4, addrspace 4) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; CI-MESA-LABEL: name: test_load_constant_v2p1_align4 ; CI-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 - ; CI-MESA: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p4) :: (load 16, align 4, addrspace 4) - ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; CI-MESA: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load 16, align 4, addrspace 4) + ; CI-MESA: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; GFX9-MESA-LABEL: name: test_load_constant_v2p1_align4 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 - ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p4) :: (load 16, align 4, addrspace 4) - ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load 16, align 4, addrspace 4) + ; 
GFX9-MESA: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) %0:_(p4) = COPY $vgpr0_vgpr1 %1:_(<2 x p1>) = G_LOAD %0 :: (load 16, align 4, addrspace 4) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -11262,64 +11277,49 @@ body: | ; CI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 ; CI: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) ; CI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (load 1 + 3, addrspace 4) - ; CI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CI: [[PTR_ADD3:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C3]](s64) + ; CI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; CI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C5]](s32) + ; CI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; CI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; CI: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CI: [[PTR_ADD3:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C7]](s64) ; CI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p4) :: (load 1 + 4, addrspace 4) - ; CI: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 5 - ; CI: [[PTR_ADD4:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C4]](s64) + ; CI: [[PTR_ADD4:%[0-9]+]]:_(p4) = G_PTR_ADD 
[[PTR_ADD3]], [[C]](s64) ; CI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p4) :: (load 1 + 5, addrspace 4) - ; CI: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; CI: [[PTR_ADD5:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C5]](s64) + ; CI: [[PTR_ADD5:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s64) ; CI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p4) :: (load 1 + 6, addrspace 4) - ; CI: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 7 - ; CI: [[PTR_ADD6:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C6]](s64) + ; CI: [[PTR_ADD6:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) ; CI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p4) :: (load 1 + 7, addrspace 4) - ; CI: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; CI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C7]] - ; CI: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C8]](s32) - ; CI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 - ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) - ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C9]] - ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[COPY1]](s32) - ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32) - ; CI: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[TRUNC1]] - ; CI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) - ; CI: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C7]] - ; CI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C8]](s32) - ; CI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) - ; CI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C9]] - ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[COPY3]](s32) - ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) - ; CI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) - ; CI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C7]] - ; CI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C8]](s32) + ; CI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; CI: [[AND4:%[0-9]+]]:_(s32) = 
G_AND [[COPY5]], [[C3]] ; CI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) - ; CI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C9]] - ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) - ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; CI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] - ; CI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) - ; CI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C7]] - ; CI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) - ; CI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C9]] - ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) - ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) - ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) - ; CI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) - ; CI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) - ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] - ; CI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) - ; CI: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; CI: [[PTR_ADD7:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C11]](s64) + ; CI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; CI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]] + ; CI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; CI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C5]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]] + ; CI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; CI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL 
[[AND7]], [[C6]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]] + ; CI: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CI: [[PTR_ADD7:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C8]](s64) ; CI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p4) :: (load 1 + 8, addrspace 4) ; CI: [[PTR_ADD8:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) ; CI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p4) :: (load 1 + 9, addrspace 4) @@ -11327,57 +11327,46 @@ body: | ; CI: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p4) :: (load 1 + 10, addrspace 4) ; CI: [[PTR_ADD10:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) ; CI: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p4) :: (load 1 + 11, addrspace 4) - ; CI: [[PTR_ADD11:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD7]], [[C3]](s64) + ; CI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; CI: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; CI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; CI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C4]](s32) + ; CI: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL6]] + ; CI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; CI: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]] + ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C5]](s32) + ; CI: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]] + ; CI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; CI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]] + ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; CI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[OR7]], [[SHL8]] + ; CI: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; CI: [[PTR_ADD11:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C9]](s64) ; CI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p4) :: (load 1 + 12, addrspace 4) - ; CI: [[PTR_ADD12:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) + ; CI: [[PTR_ADD12:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) ; CI: 
[[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p4) :: (load 1 + 13, addrspace 4) - ; CI: [[PTR_ADD13:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD7]], [[C5]](s64) + ; CI: [[PTR_ADD13:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s64) ; CI: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p4) :: (load 1 + 14, addrspace 4) - ; CI: [[PTR_ADD14:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD7]], [[C6]](s64) + ; CI: [[PTR_ADD14:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) ; CI: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p4) :: (load 1 + 15, addrspace 4) - ; CI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD8]](s32) - ; CI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] - ; CI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C8]](s32) - ; CI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) - ; CI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C9]] - ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) - ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] - ; CI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) - ; CI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] - ; CI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C8]](s32) - ; CI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) - ; CI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C9]] - ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) - ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; CI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) - ; CI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] - ; CI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C8]](s32) - ; CI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) - ; CI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C9]] - ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) - ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) - ; CI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] - ; CI: 
[[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) - ; CI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] - ; CI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C8]](s32) - ; CI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) - ; CI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C9]] - ; CI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) - ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) - ; CI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) - ; CI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) - ; CI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) - ; CI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] - ; CI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) - ; CI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) - ; CI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) - ; CI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] - ; CI: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) - ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1) - ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) + ; CI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD12]](s32) + ; CI: [[AND12:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C3]] + ; CI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) + ; CI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C3]] + ; CI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[C4]](s32) + ; CI: [[OR9:%[0-9]+]]:_(s32) = G_OR [[AND12]], [[SHL9]] + ; CI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD14]](s32) + ; CI: [[AND14:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C3]] + ; CI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND14]], [[C5]](s32) + ; CI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[SHL10]] + ; CI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) + ; CI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C3]] + ; CI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[C6]](s32) + ; CI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[OR10]], 
[[SHL11]] + ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; CI: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; VI-LABEL: name: test_load_constant_v2p1_align1 ; VI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 1, addrspace 4) @@ -11390,106 +11379,96 @@ body: | ; VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 ; VI: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (load 1 + 3, addrspace 4) - ; VI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; VI: [[PTR_ADD3:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C3]](s64) + ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; VI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C5]](s32) + ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; VI: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; VI: [[PTR_ADD3:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C7]](s64) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p4) :: (load 1 + 4, addrspace 
4) - ; VI: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 5 - ; VI: [[PTR_ADD4:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C4]](s64) + ; VI: [[PTR_ADD4:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p4) :: (load 1 + 5, addrspace 4) - ; VI: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; VI: [[PTR_ADD5:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C5]](s64) + ; VI: [[PTR_ADD5:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s64) ; VI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p4) :: (load 1 + 6, addrspace 4) - ; VI: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 7 - ; VI: [[PTR_ADD6:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C6]](s64) + ; VI: [[PTR_ADD6:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) ; VI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p4) :: (load 1 + 7, addrspace 4) - ; VI: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; VI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C7]] - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; VI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C7]] - ; VI: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; VI: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C8]](s16) - ; VI: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL]] - ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) - ; VI: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C7]] - ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C7]] - ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C8]](s16) - ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) - ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C7]] - ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C7]] - ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C8]](s16) - ; VI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] - ; VI: 
[[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) - ; VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C7]] - ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] - ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) - ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) - ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) - ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) - ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) - ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] - ; VI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) - ; VI: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; VI: [[PTR_ADD7:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C10]](s64) - ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p4) :: (load 1 + 8, addrspace 4) - ; VI: [[PTR_ADD8:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p4) :: (load 1 + 9, addrspace 4) + ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; VI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; VI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]] + ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; VI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C5]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]] + ; VI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; VI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; VI: 
[[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]] + ; VI: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; VI: [[PTR_ADD7:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C8]](s64) + ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p4) :: (load 1 + 8, addrspace 4) + ; VI: [[PTR_ADD8:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) + ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p4) :: (load 1 + 9, addrspace 4) ; VI: [[PTR_ADD9:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD7]], [[C1]](s64) ; VI: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p4) :: (load 1 + 10, addrspace 4) ; VI: [[PTR_ADD10:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) ; VI: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p4) :: (load 1 + 11, addrspace 4) - ; VI: [[PTR_ADD11:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD7]], [[C3]](s64) + ; VI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; VI: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; VI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; VI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; VI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C4]](s32) + ; VI: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL6]] + ; VI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; VI: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]] + ; VI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C5]](s32) + ; VI: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]] + ; VI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; VI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]] + ; VI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; VI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[OR7]], [[SHL8]] + ; VI: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; VI: [[PTR_ADD11:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C9]](s64) ; VI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p4) :: (load 1 + 12, addrspace 4) - ; VI: [[PTR_ADD12:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) + ; VI: 
[[PTR_ADD12:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) ; VI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p4) :: (load 1 + 13, addrspace 4) - ; VI: [[PTR_ADD13:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD7]], [[C5]](s64) + ; VI: [[PTR_ADD13:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s64) ; VI: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p4) :: (load 1 + 14, addrspace 4) - ; VI: [[PTR_ADD14:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD7]], [[C6]](s64) + ; VI: [[PTR_ADD14:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) ; VI: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p4) :: (load 1 + 15, addrspace 4) - ; VI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD8]](s32) - ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] - ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) - ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] - ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) - ; VI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] - ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) - ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] - ; VI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) - ; VI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] - ; VI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) - ; VI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; VI: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; VI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] - ; VI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) - ; VI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] - ; VI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) - ; VI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL 
[[AND15]], [[C8]](s16) - ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] - ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) - ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) - ; VI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) - ; VI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] - ; VI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) - ; VI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) - ; VI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) - ; VI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] - ; VI: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) - ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1) - ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) + ; VI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD12]](s32) + ; VI: [[AND12:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C3]] + ; VI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) + ; VI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C3]] + ; VI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[C4]](s32) + ; VI: [[OR9:%[0-9]+]]:_(s32) = G_OR [[AND12]], [[SHL9]] + ; VI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD14]](s32) + ; VI: [[AND14:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C3]] + ; VI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND14]], [[C5]](s32) + ; VI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[SHL10]] + ; VI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) + ; VI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C3]] + ; VI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[C6]](s32) + ; VI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[OR10]], [[SHL11]] + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; VI: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; GFX9-LABEL: name: test_load_constant_v2p1_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; GFX9: 
[[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 1, addrspace 4) @@ -11502,56 +11481,49 @@ body: | ; GFX9: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 ; GFX9: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) ; GFX9: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (load 1 + 3, addrspace 4) - ; GFX9: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9: [[PTR_ADD3:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C3]](s64) + ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX9: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C5]](s32) + ; GFX9: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; GFX9: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX9: [[PTR_ADD3:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C7]](s64) ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p4) :: (load 1 + 4, addrspace 4) - ; GFX9: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 5 - ; GFX9: [[PTR_ADD4:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C4]](s64) + ; GFX9: [[PTR_ADD4:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p4) :: (load 1 + 5, addrspace 4) - ; GFX9: [[C5:%[0-9]+]]:_(s64) = 
G_CONSTANT i64 6 - ; GFX9: [[PTR_ADD5:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C5]](s64) + ; GFX9: [[PTR_ADD5:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s64) ; GFX9: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p4) :: (load 1 + 6, addrspace 4) - ; GFX9: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 7 - ; GFX9: [[PTR_ADD6:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C6]](s64) + ; GFX9: [[PTR_ADD6:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) ; GFX9: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p4) :: (load 1 + 7, addrspace 4) - ; GFX9: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX9: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C7]] - ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C7]] - ; GFX9: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C8]](s16) - ; GFX9: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL]] - ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) - ; GFX9: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C7]] - ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C7]] - ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C8]](s16) - ; GFX9: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) - ; GFX9: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C7]] - ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; GFX9: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C7]] - ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C8]](s16) - ; GFX9: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] - ; GFX9: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) - ; GFX9: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C7]] - ; GFX9: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] - ; GFX9: 
[[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) - ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) - ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) - ; GFX9: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) - ; GFX9: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) - ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] - ; GFX9: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) - ; GFX9: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9: [[PTR_ADD7:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C10]](s64) + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX9: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX9: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; GFX9: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; GFX9: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]] + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX9: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; GFX9: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C5]](s32) + ; GFX9: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]] + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX9: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]] + ; GFX9: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; GFX9: [[PTR_ADD7:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C8]](s64) ; GFX9: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p4) :: (load 1 + 8, addrspace 4) ; GFX9: [[PTR_ADD8:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) ; GFX9: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD8]](p4) :: (load 1 + 9, addrspace 4) @@ -11559,49 +11531,46 @@ body: | ; GFX9: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p4) :: (load 1 + 10, addrspace 4) ; GFX9: [[PTR_ADD10:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) ; GFX9: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p4) :: (load 1 + 11, addrspace 4) - ; GFX9: [[PTR_ADD11:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD7]], [[C3]](s64) + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; GFX9: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; GFX9: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; GFX9: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C4]](s32) + ; GFX9: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL6]] + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; GFX9: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]] + ; GFX9: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C5]](s32) + ; GFX9: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]] + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; GFX9: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]] + ; GFX9: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; GFX9: [[OR8:%[0-9]+]]:_(s32) = G_OR [[OR7]], [[SHL8]] + ; GFX9: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; GFX9: [[PTR_ADD11:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C9]](s64) ; GFX9: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p4) :: (load 1 + 12, addrspace 4) - ; GFX9: [[PTR_ADD12:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) + ; GFX9: [[PTR_ADD12:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) ; GFX9: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p4) :: (load 1 + 13, addrspace 4) - ; GFX9: [[PTR_ADD13:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD7]], [[C5]](s64) + ; GFX9: [[PTR_ADD13:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s64) ; GFX9: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p4) :: (load 1 + 14, addrspace 4) - ; GFX9: [[PTR_ADD14:%[0-9]+]]:_(p4) = 
G_PTR_ADD [[PTR_ADD7]], [[C6]](s64) + ; GFX9: [[PTR_ADD14:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) ; GFX9: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p4) :: (load 1 + 15, addrspace 4) - ; GFX9: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD8]](s32) - ; GFX9: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] - ; GFX9: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) - ; GFX9: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] - ; GFX9: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) - ; GFX9: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] - ; GFX9: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) - ; GFX9: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] - ; GFX9: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) - ; GFX9: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] - ; GFX9: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) - ; GFX9: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; GFX9: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; GFX9: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] - ; GFX9: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) - ; GFX9: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] - ; GFX9: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) - ; GFX9: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; GFX9: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; GFX9: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] - ; GFX9: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) - ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) - ; GFX9: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) - ; GFX9: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] - ; GFX9: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT 
[[OR8]](s16) - ; GFX9: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) - ; GFX9: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) - ; GFX9: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] - ; GFX9: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1) - ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD12]](s32) + ; GFX9: [[AND12:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C3]] + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) + ; GFX9: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C3]] + ; GFX9: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[C4]](s32) + ; GFX9: [[OR9:%[0-9]+]]:_(s32) = G_OR [[AND12]], [[SHL9]] + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD14]](s32) + ; GFX9: [[AND14:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C3]] + ; GFX9: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND14]], [[C5]](s32) + ; GFX9: [[OR10:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[SHL10]] + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) + ; GFX9: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C3]] + ; GFX9: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[C6]](s32) + ; GFX9: [[OR11:%[0-9]+]]:_(s32) = G_OR [[OR10]], [[SHL11]] + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; CI-MESA-LABEL: name: test_load_constant_v2p1_align1 ; CI-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; CI-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 1, addrspace 4) @@ -11614,64 +11583,49 @@ body: | ; CI-MESA: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 ; CI-MESA: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) ; CI-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (load 1 + 3, addrspace 
4) - ; CI-MESA: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CI-MESA: [[PTR_ADD3:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; CI-MESA: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p4) :: (load 1 + 4, addrspace 4) - ; CI-MESA: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 5 - ; CI-MESA: [[PTR_ADD4:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; CI-MESA: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p4) :: (load 1 + 5, addrspace 4) - ; CI-MESA: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; CI-MESA: [[PTR_ADD5:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; CI-MESA: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p4) :: (load 1 + 6, addrspace 4) - ; CI-MESA: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 7 - ; CI-MESA: [[PTR_ADD6:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; CI-MESA: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p4) :: (load 1 + 7, addrspace 4) - ; CI-MESA: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; CI-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; CI-MESA: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C7]] - ; CI-MESA: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; CI-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C8]](s32) - ; CI-MESA: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CI-MESA: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CI-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] ; CI-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) - ; CI-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C9]] - ; CI-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[COPY1]](s32) - ; CI-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32) - ; CI-MESA: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[TRUNC1]] - ; CI-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) - ; CI-MESA: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C7]] - ; CI-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C8]](s32) + ; CI-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CI-MESA: 
[[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CI-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; CI-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; CI-MESA: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C5]](s32) + ; CI-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CI-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) - ; CI-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C9]] - ; CI-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[COPY3]](s32) - ; CI-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) - ; CI-MESA: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI-MESA: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) - ; CI-MESA: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C7]] - ; CI-MESA: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C8]](s32) + ; CI-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; CI-MESA: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CI-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; CI-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; CI-MESA: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CI-MESA: [[PTR_ADD3:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C7]](s64) + ; CI-MESA: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p4) :: (load 1 + 4, addrspace 4) + ; CI-MESA: [[PTR_ADD4:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) + ; CI-MESA: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p4) :: (load 1 + 5, addrspace 4) + ; CI-MESA: [[PTR_ADD5:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s64) + ; CI-MESA: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p4) :: (load 1 + 6, addrspace 4) + ; CI-MESA: [[PTR_ADD6:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) + ; CI-MESA: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p4) :: (load 1 + 7, addrspace 4) + ; CI-MESA: [[COPY5:%[0-9]+]]:_(s32) = COPY 
[[LOAD4]](s32) + ; CI-MESA: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] ; CI-MESA: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) - ; CI-MESA: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C9]] - ; CI-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) - ; CI-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; CI-MESA: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] - ; CI-MESA: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) - ; CI-MESA: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C7]] - ; CI-MESA: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) - ; CI-MESA: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C9]] - ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) - ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; CI-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) - ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) - ; CI-MESA: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) - ; CI-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; CI-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; CI-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) - ; CI-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] - ; CI-MESA: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) - ; CI-MESA: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; CI-MESA: [[PTR_ADD7:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C11]](s64) + ; CI-MESA: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; CI-MESA: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]] + ; CI-MESA: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; CI-MESA: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C5]](s32) + ; 
CI-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]] + ; CI-MESA: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; CI-MESA: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; CI-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]] + ; CI-MESA: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CI-MESA: [[PTR_ADD7:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C8]](s64) ; CI-MESA: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p4) :: (load 1 + 8, addrspace 4) ; CI-MESA: [[PTR_ADD8:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) ; CI-MESA: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p4) :: (load 1 + 9, addrspace 4) @@ -11679,57 +11633,46 @@ body: | ; CI-MESA: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p4) :: (load 1 + 10, addrspace 4) ; CI-MESA: [[PTR_ADD10:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) ; CI-MESA: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p4) :: (load 1 + 11, addrspace 4) - ; CI-MESA: [[PTR_ADD11:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD7]], [[C3]](s64) + ; CI-MESA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; CI-MESA: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; CI-MESA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; CI-MESA: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; CI-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C4]](s32) + ; CI-MESA: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL6]] + ; CI-MESA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; CI-MESA: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]] + ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C5]](s32) + ; CI-MESA: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]] + ; CI-MESA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; CI-MESA: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]] + ; CI-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; CI-MESA: [[OR8:%[0-9]+]]:_(s32) = G_OR [[OR7]], [[SHL8]] + ; CI-MESA: [[C9:%[0-9]+]]:_(s64) = 
G_CONSTANT i64 12 + ; CI-MESA: [[PTR_ADD11:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C9]](s64) ; CI-MESA: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p4) :: (load 1 + 12, addrspace 4) - ; CI-MESA: [[PTR_ADD12:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) + ; CI-MESA: [[PTR_ADD12:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) ; CI-MESA: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p4) :: (load 1 + 13, addrspace 4) - ; CI-MESA: [[PTR_ADD13:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD7]], [[C5]](s64) + ; CI-MESA: [[PTR_ADD13:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s64) ; CI-MESA: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p4) :: (load 1 + 14, addrspace 4) - ; CI-MESA: [[PTR_ADD14:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD7]], [[C6]](s64) + ; CI-MESA: [[PTR_ADD14:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) ; CI-MESA: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p4) :: (load 1 + 15, addrspace 4) - ; CI-MESA: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD8]](s32) - ; CI-MESA: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] - ; CI-MESA: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C8]](s32) - ; CI-MESA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) - ; CI-MESA: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C9]] - ; CI-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) - ; CI-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] - ; CI-MESA: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) - ; CI-MESA: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] - ; CI-MESA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C8]](s32) - ; CI-MESA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) - ; CI-MESA: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C9]] - ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) - ; CI-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; CI-MESA: 
[[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) - ; CI-MESA: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] - ; CI-MESA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C8]](s32) - ; CI-MESA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) - ; CI-MESA: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C9]] - ; CI-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) - ; CI-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) - ; CI-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] - ; CI-MESA: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) - ; CI-MESA: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] - ; CI-MESA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C8]](s32) - ; CI-MESA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) - ; CI-MESA: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C9]] - ; CI-MESA: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) - ; CI-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) - ; CI-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) - ; CI-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) - ; CI-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) - ; CI-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] - ; CI-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) - ; CI-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) - ; CI-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) - ; CI-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] - ; CI-MESA: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) - ; CI-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1) - ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) + ; CI-MESA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD12]](s32) + ; CI-MESA: [[AND12:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C3]] + ; CI-MESA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) + ; CI-MESA: 
[[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C3]] + ; CI-MESA: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[C4]](s32) + ; CI-MESA: [[OR9:%[0-9]+]]:_(s32) = G_OR [[AND12]], [[SHL9]] + ; CI-MESA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD14]](s32) + ; CI-MESA: [[AND14:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C3]] + ; CI-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND14]], [[C5]](s32) + ; CI-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[SHL10]] + ; CI-MESA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) + ; CI-MESA: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C3]] + ; CI-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[C6]](s32) + ; CI-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[OR10]], [[SHL11]] + ; CI-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; CI-MESA: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; GFX9-MESA-LABEL: name: test_load_constant_v2p1_align1 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 1, addrspace 4) @@ -11742,56 +11685,49 @@ body: | ; GFX9-MESA: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 ; GFX9-MESA: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) ; GFX9-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (load 1 + 3, addrspace 4) - ; GFX9-MESA: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9-MESA: [[PTR_ADD3:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C3]](s64) + ; GFX9-MESA: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; GFX9-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX9-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX9-MESA: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; 
GFX9-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX9-MESA: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C5]](s32) + ; GFX9-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GFX9-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX9-MESA: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX9-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; GFX9-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; GFX9-MESA: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX9-MESA: [[PTR_ADD3:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C7]](s64) ; GFX9-MESA: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p4) :: (load 1 + 4, addrspace 4) - ; GFX9-MESA: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 5 - ; GFX9-MESA: [[PTR_ADD4:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C4]](s64) + ; GFX9-MESA: [[PTR_ADD4:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) ; GFX9-MESA: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p4) :: (load 1 + 5, addrspace 4) - ; GFX9-MESA: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; GFX9-MESA: [[PTR_ADD5:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C5]](s64) + ; GFX9-MESA: [[PTR_ADD5:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s64) ; GFX9-MESA: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p4) :: (load 1 + 6, addrspace 4) - ; GFX9-MESA: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 7 - ; GFX9-MESA: [[PTR_ADD6:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C6]](s64) + ; GFX9-MESA: [[PTR_ADD6:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) ; GFX9-MESA: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p4) :: (load 1 + 7, addrspace 4) - ; GFX9-MESA: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX9-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX9-MESA: [[AND:%[0-9]+]]:_(s16) = G_AND 
[[TRUNC]], [[C7]] - ; GFX9-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9-MESA: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C7]] - ; GFX9-MESA: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; GFX9-MESA: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C8]](s16) - ; GFX9-MESA: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL]] - ; GFX9-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) - ; GFX9-MESA: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C7]] - ; GFX9-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9-MESA: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C7]] - ; GFX9-MESA: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C8]](s16) - ; GFX9-MESA: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9-MESA: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) - ; GFX9-MESA: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C7]] - ; GFX9-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; GFX9-MESA: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C7]] - ; GFX9-MESA: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C8]](s16) - ; GFX9-MESA: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] - ; GFX9-MESA: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) - ; GFX9-MESA: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C7]] - ; GFX9-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; GFX9-MESA: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] - ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) - ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) - ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) - ; GFX9-MESA: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) - ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; GFX9-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; GFX9-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; GFX9-MESA: 
[[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) - ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) - ; GFX9-MESA: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9-MESA: [[PTR_ADD7:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C10]](s64) + ; GFX9-MESA: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX9-MESA: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; GFX9-MESA: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX9-MESA: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]] + ; GFX9-MESA: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX9-MESA: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C5]](s32) + ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]] + ; GFX9-MESA: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX9-MESA: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]] + ; GFX9-MESA: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; GFX9-MESA: [[PTR_ADD7:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C8]](s64) ; GFX9-MESA: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p4) :: (load 1 + 8, addrspace 4) ; GFX9-MESA: [[PTR_ADD8:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) ; GFX9-MESA: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p4) :: (load 1 + 9, addrspace 4) @@ -11799,49 +11735,46 @@ body: | ; GFX9-MESA: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p4) :: (load 1 + 10, addrspace 4) ; GFX9-MESA: [[PTR_ADD10:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) ; GFX9-MESA: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p4) :: (load 1 + 11, addrspace 4) - ; GFX9-MESA: [[PTR_ADD11:%[0-9]+]]:_(p4) = G_PTR_ADD 
[[PTR_ADD7]], [[C3]](s64) + ; GFX9-MESA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; GFX9-MESA: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; GFX9-MESA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; GFX9-MESA: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; GFX9-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C4]](s32) + ; GFX9-MESA: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL6]] + ; GFX9-MESA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; GFX9-MESA: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]] + ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C5]](s32) + ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]] + ; GFX9-MESA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; GFX9-MESA: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]] + ; GFX9-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; GFX9-MESA: [[OR8:%[0-9]+]]:_(s32) = G_OR [[OR7]], [[SHL8]] + ; GFX9-MESA: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; GFX9-MESA: [[PTR_ADD11:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C9]](s64) ; GFX9-MESA: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p4) :: (load 1 + 12, addrspace 4) - ; GFX9-MESA: [[PTR_ADD12:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) + ; GFX9-MESA: [[PTR_ADD12:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) ; GFX9-MESA: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p4) :: (load 1 + 13, addrspace 4) - ; GFX9-MESA: [[PTR_ADD13:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD7]], [[C5]](s64) + ; GFX9-MESA: [[PTR_ADD13:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s64) ; GFX9-MESA: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p4) :: (load 1 + 14, addrspace 4) - ; GFX9-MESA: [[PTR_ADD14:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD7]], [[C6]](s64) + ; GFX9-MESA: [[PTR_ADD14:%[0-9]+]]:_(p4) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) ; GFX9-MESA: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p4) :: (load 1 + 15, addrspace 4) - ; GFX9-MESA: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC 
[[LOAD8]](s32) - ; GFX9-MESA: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] - ; GFX9-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) - ; GFX9-MESA: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; GFX9-MESA: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; GFX9-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] - ; GFX9-MESA: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) - ; GFX9-MESA: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] - ; GFX9-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) - ; GFX9-MESA: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] - ; GFX9-MESA: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) - ; GFX9-MESA: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] - ; GFX9-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) - ; GFX9-MESA: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; GFX9-MESA: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; GFX9-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] - ; GFX9-MESA: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) - ; GFX9-MESA: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] - ; GFX9-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) - ; GFX9-MESA: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; GFX9-MESA: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; GFX9-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] - ; GFX9-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) - ; GFX9-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) - ; GFX9-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) - ; GFX9-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] - ; GFX9-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) - ; GFX9-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) - ; GFX9-MESA: [[SHL11:%[0-9]+]]:_(s32) = 
G_SHL [[ZEXT7]], [[C9]](s32) - ; GFX9-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] - ; GFX9-MESA: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) - ; GFX9-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1) - ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) + ; GFX9-MESA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD12]](s32) + ; GFX9-MESA: [[AND12:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C3]] + ; GFX9-MESA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) + ; GFX9-MESA: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C3]] + ; GFX9-MESA: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[C4]](s32) + ; GFX9-MESA: [[OR9:%[0-9]+]]:_(s32) = G_OR [[AND12]], [[SHL9]] + ; GFX9-MESA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD14]](s32) + ; GFX9-MESA: [[AND14:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C3]] + ; GFX9-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND14]], [[C5]](s32) + ; GFX9-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[SHL10]] + ; GFX9-MESA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) + ; GFX9-MESA: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C3]] + ; GFX9-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[C6]](s32) + ; GFX9-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[OR10]], [[SHL11]] + ; GFX9-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX9-MESA: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) %0:_(p4) = COPY $vgpr0_vgpr1 %1:_(<2 x p1>) = G_LOAD %0 :: (load 16, align 1, addrspace 4) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir index 54def72ad9da8..6aa49dae11b87 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir @@ 
-10670,24 +10670,29 @@ body: | ; CI-LABEL: name: test_load_flat_v2p1_align16 ; CI: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; CI: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p0) :: (load 16) - ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; CI: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load 16) + ; CI: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; VI-LABEL: name: test_load_flat_v2p1_align16 ; VI: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; VI: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p0) :: (load 16) - ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; VI: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load 16) + ; VI: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; GFX9-LABEL: name: test_load_flat_v2p1_align16 ; GFX9: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p0) :: (load 16) - ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; GFX9: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load 16) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; CI-MESA-LABEL: name: test_load_flat_v2p1_align16 ; CI-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; CI-MESA: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p0) :: (load 16) - ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; CI-MESA: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load 16) + ; CI-MESA: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; GFX9-MESA-LABEL: name: test_load_flat_v2p1_align16 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p0) :: (load 16) 
- ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load 16) + ; GFX9-MESA: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) %0:_(p0) = COPY $vgpr0_vgpr1 %1:_(<2 x p1>) = G_LOAD %0 :: (load 16, align 16, addrspace 0) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -10701,24 +10706,29 @@ body: | ; CI-LABEL: name: test_load_flat_v2p1_align8 ; CI: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; CI: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p0) :: (load 16, align 8) - ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; CI: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load 16, align 8) + ; CI: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; VI-LABEL: name: test_load_flat_v2p1_align8 ; VI: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; VI: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p0) :: (load 16, align 8) - ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; VI: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load 16, align 8) + ; VI: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; GFX9-LABEL: name: test_load_flat_v2p1_align8 ; GFX9: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p0) :: (load 16, align 8) - ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; GFX9: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load 16, align 8) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; CI-MESA-LABEL: name: test_load_flat_v2p1_align8 ; CI-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; CI-MESA: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p0) :: (load 16, 
align 8) - ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; CI-MESA: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load 16, align 8) + ; CI-MESA: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; GFX9-MESA-LABEL: name: test_load_flat_v2p1_align8 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p0) :: (load 16, align 8) - ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load 16, align 8) + ; GFX9-MESA: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) %0:_(p0) = COPY $vgpr0_vgpr1 %1:_(<2 x p1>) = G_LOAD %0 :: (load 16, align 8, addrspace 0) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -10732,24 +10742,29 @@ body: | ; CI-LABEL: name: test_load_flat_v2p1_align4 ; CI: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; CI: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p0) :: (load 16, align 4) - ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; CI: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load 16, align 4) + ; CI: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; VI-LABEL: name: test_load_flat_v2p1_align4 ; VI: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; VI: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p0) :: (load 16, align 4) - ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; VI: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load 16, align 4) + ; VI: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; GFX9-LABEL: name: test_load_flat_v2p1_align4 ; GFX9: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9: [[LOAD:%[0-9]+]]:_(<2 x p1>) 
= G_LOAD [[COPY]](p0) :: (load 16, align 4) - ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; GFX9: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load 16, align 4) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; CI-MESA-LABEL: name: test_load_flat_v2p1_align4 ; CI-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; CI-MESA: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p0) :: (load 16, align 4) - ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; CI-MESA: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load 16, align 4) + ; CI-MESA: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; GFX9-MESA-LABEL: name: test_load_flat_v2p1_align4 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p0) :: (load 16, align 4) - ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load 16, align 4) + ; GFX9-MESA: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) %0:_(p0) = COPY $vgpr0_vgpr1 %1:_(<2 x p1>) = G_LOAD %0 :: (load 16, align 4, addrspace 0) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -10773,64 +10788,49 @@ body: | ; CI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 ; CI: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) ; CI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load 1 + 3) - ; CI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CI: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) + ; CI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; 
CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; CI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C5]](s32) + ; CI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; CI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; CI: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CI: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64) ; CI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p0) :: (load 1 + 4) - ; CI: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 5 - ; CI: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) + ; CI: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) ; CI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load 1 + 5) - ; CI: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; CI: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) + ; CI: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s64) ; CI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p0) :: (load 1 + 6) - ; CI: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 7 - ; CI: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) + ; CI: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) ; CI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load 1 + 7) - ; CI: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; CI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C7]] - ; CI: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; CI: 
[[COPY1:%[0-9]+]]:_(s32) = COPY [[C8]](s32) - ; CI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 - ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) - ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C9]] - ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[COPY1]](s32) - ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32) - ; CI: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[TRUNC1]] - ; CI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) - ; CI: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C7]] - ; CI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C8]](s32) - ; CI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) - ; CI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C9]] - ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[COPY3]](s32) - ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) - ; CI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) - ; CI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C7]] - ; CI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C8]](s32) + ; CI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; CI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] ; CI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) - ; CI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C9]] - ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) - ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; CI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] - ; CI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) - ; CI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C7]] - ; CI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) - ; CI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C9]] - ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) - ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) - ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) - ; CI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; 
CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) - ; CI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) - ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] - ; CI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) - ; CI: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; CI: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C11]](s64) + ; CI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; CI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]] + ; CI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; CI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C5]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]] + ; CI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; CI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]] + ; CI: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CI: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C8]](s64) ; CI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p0) :: (load 1 + 8) ; CI: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) ; CI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p0) :: (load 1 + 9) @@ -10838,57 +10838,46 @@ body: | ; CI: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p0) :: (load 1 + 10) ; CI: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) ; CI: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load 1 + 11) - ; CI: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C3]](s64) + ; CI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; CI: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; CI: [[COPY10:%[0-9]+]]:_(s32) = COPY 
[[LOAD9]](s32) + ; CI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C4]](s32) + ; CI: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL6]] + ; CI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; CI: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]] + ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C5]](s32) + ; CI: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]] + ; CI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; CI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]] + ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; CI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[OR7]], [[SHL8]] + ; CI: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; CI: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C9]](s64) ; CI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p0) :: (load 1 + 12) - ; CI: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) + ; CI: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) ; CI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p0) :: (load 1 + 13) - ; CI: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C5]](s64) + ; CI: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s64) ; CI: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p0) :: (load 1 + 14) - ; CI: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C6]](s64) + ; CI: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) ; CI: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load 1 + 15) - ; CI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD8]](s32) - ; CI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] - ; CI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C8]](s32) - ; CI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) - ; CI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C9]] - ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) - ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] 
- ; CI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) - ; CI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] - ; CI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C8]](s32) - ; CI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) - ; CI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C9]] - ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) - ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; CI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) - ; CI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] - ; CI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C8]](s32) - ; CI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) - ; CI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C9]] - ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) - ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) - ; CI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] - ; CI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) - ; CI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] - ; CI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C8]](s32) - ; CI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) - ; CI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C9]] - ; CI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) - ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) - ; CI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) - ; CI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) - ; CI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) - ; CI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] - ; CI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) - ; CI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) - ; CI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) - ; CI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] - ; CI: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) - ; CI: 
[[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1) - ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) + ; CI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD12]](s32) + ; CI: [[AND12:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C3]] + ; CI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) + ; CI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C3]] + ; CI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[C4]](s32) + ; CI: [[OR9:%[0-9]+]]:_(s32) = G_OR [[AND12]], [[SHL9]] + ; CI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD14]](s32) + ; CI: [[AND14:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C3]] + ; CI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND14]], [[C5]](s32) + ; CI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[SHL10]] + ; CI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) + ; CI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C3]] + ; CI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[C6]](s32) + ; CI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[OR10]], [[SHL11]] + ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; CI: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; VI-LABEL: name: test_load_flat_v2p1_align1 ; VI: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 1) @@ -10901,106 +10890,96 @@ body: | ; VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 ; VI: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load 1 + 3) - ; VI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; VI: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) + ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND 
[[COPY2]], [[C3]] + ; VI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C5]](s32) + ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; VI: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; VI: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p0) :: (load 1 + 4) - ; VI: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 5 - ; VI: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) + ; VI: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load 1 + 5) - ; VI: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; VI: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) + ; VI: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s64) ; VI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p0) :: (load 1 + 6) - ; VI: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 7 - ; VI: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) + ; VI: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) ; VI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load 1 + 7) - ; VI: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; VI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C7]] - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; VI: [[AND1:%[0-9]+]]:_(s16) = G_AND 
[[TRUNC1]], [[C7]] - ; VI: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; VI: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C8]](s16) - ; VI: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL]] - ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) - ; VI: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C7]] - ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C7]] - ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C8]](s16) - ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) - ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C7]] - ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C7]] - ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C8]](s16) - ; VI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] - ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) - ; VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C7]] - ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] - ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) - ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) - ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) - ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) - ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) - ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] - ; VI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) - ; VI: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; VI: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C10]](s64) - ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD7]](p0) :: (load 1 + 8) - ; VI: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p0) :: (load 1 + 9) + ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; VI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; VI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]] + ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; VI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C5]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]] + ; VI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; VI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]] + ; VI: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; VI: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C8]](s64) + ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p0) :: (load 1 + 8) + ; VI: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) + ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p0) :: (load 1 + 9) ; VI: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C1]](s64) ; VI: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p0) :: (load 1 + 10) ; VI: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) ; VI: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load 1 + 11) - ; VI: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C3]](s64) + ; VI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; VI: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; VI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; VI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; VI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C4]](s32) + ; VI: 
[[OR6:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL6]] + ; VI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; VI: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]] + ; VI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C5]](s32) + ; VI: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]] + ; VI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; VI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]] + ; VI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; VI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[OR7]], [[SHL8]] + ; VI: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; VI: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C9]](s64) ; VI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p0) :: (load 1 + 12) - ; VI: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) + ; VI: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) ; VI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p0) :: (load 1 + 13) - ; VI: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C5]](s64) + ; VI: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s64) ; VI: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p0) :: (load 1 + 14) - ; VI: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C6]](s64) + ; VI: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) ; VI: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load 1 + 15) - ; VI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD8]](s32) - ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] - ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) - ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] - ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) - ; VI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] - ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) - ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; VI: 
[[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] - ; VI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) - ; VI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] - ; VI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) - ; VI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; VI: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; VI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] - ; VI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) - ; VI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] - ; VI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) - ; VI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] - ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) - ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) - ; VI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) - ; VI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] - ; VI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) - ; VI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) - ; VI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) - ; VI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] - ; VI: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) - ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1) - ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) + ; VI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD12]](s32) + ; VI: [[AND12:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C3]] + ; VI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) + ; VI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C3]] + ; VI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[C4]](s32) + ; VI: [[OR9:%[0-9]+]]:_(s32) = G_OR [[AND12]], [[SHL9]] + ; VI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD14]](s32) + ; VI: [[AND14:%[0-9]+]]:_(s32) = G_AND 
[[COPY15]], [[C3]] + ; VI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND14]], [[C5]](s32) + ; VI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[SHL10]] + ; VI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) + ; VI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C3]] + ; VI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[C6]](s32) + ; VI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[OR10]], [[SHL11]] + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; VI: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; GFX9-LABEL: name: test_load_flat_v2p1_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 1) @@ -11013,56 +10992,49 @@ body: | ; GFX9: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 ; GFX9: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) ; GFX9: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load 1 + 3) - ; GFX9: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) + ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX9: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C5]](s32) + ; GFX9: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND 
[[COPY4]], [[C3]] + ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; GFX9: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX9: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64) ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p0) :: (load 1 + 4) - ; GFX9: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 5 - ; GFX9: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) + ; GFX9: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load 1 + 5) - ; GFX9: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; GFX9: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) + ; GFX9: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s64) ; GFX9: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p0) :: (load 1 + 6) - ; GFX9: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 7 - ; GFX9: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) + ; GFX9: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) ; GFX9: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load 1 + 7) - ; GFX9: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX9: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C7]] - ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C7]] - ; GFX9: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C8]](s16) - ; GFX9: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL]] - ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) - ; GFX9: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C7]] - ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C7]] - ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C8]](s16) - ; GFX9: 
[[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) - ; GFX9: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C7]] - ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; GFX9: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C7]] - ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C8]](s16) - ; GFX9: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] - ; GFX9: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) - ; GFX9: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C7]] - ; GFX9: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] - ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) - ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) - ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) - ; GFX9: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) - ; GFX9: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) - ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] - ; GFX9: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) - ; GFX9: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C10]](s64) + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX9: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX9: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; GFX9: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; GFX9: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]] + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX9: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; GFX9: 
[[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C5]](s32) + ; GFX9: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]] + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX9: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]] + ; GFX9: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; GFX9: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C8]](s64) ; GFX9: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p0) :: (load 1 + 8) ; GFX9: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) ; GFX9: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p0) :: (load 1 + 9) @@ -11070,49 +11042,46 @@ body: | ; GFX9: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p0) :: (load 1 + 10) ; GFX9: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) ; GFX9: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load 1 + 11) - ; GFX9: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C3]](s64) + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; GFX9: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; GFX9: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; GFX9: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C4]](s32) + ; GFX9: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL6]] + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; GFX9: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]] + ; GFX9: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C5]](s32) + ; GFX9: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]] + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; GFX9: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]] + ; GFX9: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; GFX9: [[OR8:%[0-9]+]]:_(s32) = G_OR [[OR7]], [[SHL8]] + ; GFX9: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; GFX9: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], 
[[C9]](s64) ; GFX9: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p0) :: (load 1 + 12) - ; GFX9: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) + ; GFX9: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) ; GFX9: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p0) :: (load 1 + 13) - ; GFX9: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C5]](s64) + ; GFX9: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s64) ; GFX9: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p0) :: (load 1 + 14) - ; GFX9: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C6]](s64) + ; GFX9: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) ; GFX9: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load 1 + 15) - ; GFX9: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD8]](s32) - ; GFX9: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] - ; GFX9: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) - ; GFX9: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] - ; GFX9: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) - ; GFX9: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] - ; GFX9: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) - ; GFX9: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] - ; GFX9: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) - ; GFX9: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] - ; GFX9: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) - ; GFX9: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; GFX9: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; GFX9: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] - ; GFX9: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) - ; GFX9: 
[[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] - ; GFX9: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) - ; GFX9: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; GFX9: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; GFX9: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] - ; GFX9: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) - ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) - ; GFX9: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) - ; GFX9: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] - ; GFX9: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) - ; GFX9: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) - ; GFX9: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) - ; GFX9: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] - ; GFX9: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1) - ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD12]](s32) + ; GFX9: [[AND12:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C3]] + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) + ; GFX9: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C3]] + ; GFX9: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[C4]](s32) + ; GFX9: [[OR9:%[0-9]+]]:_(s32) = G_OR [[AND12]], [[SHL9]] + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD14]](s32) + ; GFX9: [[AND14:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C3]] + ; GFX9: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND14]], [[C5]](s32) + ; GFX9: [[OR10:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[SHL10]] + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) + ; GFX9: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C3]] + ; GFX9: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[C6]](s32) + ; GFX9: [[OR11:%[0-9]+]]:_(s32) = G_OR [[OR10]], [[SHL11]] + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), 
[[OR11]](s32) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; CI-MESA-LABEL: name: test_load_flat_v2p1_align1 ; CI-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; CI-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 1) @@ -11125,64 +11094,49 @@ body: | ; CI-MESA: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 ; CI-MESA: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) ; CI-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load 1 + 3) - ; CI-MESA: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CI-MESA: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; CI-MESA: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p0) :: (load 1 + 4) - ; CI-MESA: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 5 - ; CI-MESA: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; CI-MESA: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load 1 + 5) - ; CI-MESA: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; CI-MESA: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; CI-MESA: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p0) :: (load 1 + 6) - ; CI-MESA: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 7 - ; CI-MESA: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; CI-MESA: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load 1 + 7) - ; CI-MESA: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; CI-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; CI-MESA: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C7]] - ; CI-MESA: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; CI-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C8]](s32) - ; CI-MESA: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CI-MESA: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CI-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] ; CI-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) - ; CI-MESA: 
[[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C9]] - ; CI-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[COPY1]](s32) - ; CI-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32) - ; CI-MESA: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[TRUNC1]] - ; CI-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) - ; CI-MESA: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C7]] - ; CI-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C8]](s32) + ; CI-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CI-MESA: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CI-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; CI-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; CI-MESA: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C5]](s32) + ; CI-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CI-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) - ; CI-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C9]] - ; CI-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[COPY3]](s32) - ; CI-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) - ; CI-MESA: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI-MESA: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) - ; CI-MESA: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C7]] - ; CI-MESA: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C8]](s32) + ; CI-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; CI-MESA: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CI-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; CI-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; CI-MESA: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CI-MESA: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64) + ; CI-MESA: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p0) :: (load 1 + 4) + ; CI-MESA: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], 
[[C]](s64) + ; CI-MESA: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load 1 + 5) + ; CI-MESA: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s64) + ; CI-MESA: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p0) :: (load 1 + 6) + ; CI-MESA: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) + ; CI-MESA: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load 1 + 7) + ; CI-MESA: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; CI-MESA: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] ; CI-MESA: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) - ; CI-MESA: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C9]] - ; CI-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) - ; CI-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; CI-MESA: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] - ; CI-MESA: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) - ; CI-MESA: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C7]] - ; CI-MESA: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) - ; CI-MESA: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C9]] - ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) - ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; CI-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) - ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) - ; CI-MESA: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) - ; CI-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; CI-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; CI-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) - ; CI-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] - ; CI-MESA: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) - ; CI-MESA: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; CI-MESA: 
[[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C11]](s64) + ; CI-MESA: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; CI-MESA: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]] + ; CI-MESA: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; CI-MESA: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C5]](s32) + ; CI-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]] + ; CI-MESA: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; CI-MESA: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; CI-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]] + ; CI-MESA: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CI-MESA: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C8]](s64) ; CI-MESA: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p0) :: (load 1 + 8) ; CI-MESA: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) ; CI-MESA: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p0) :: (load 1 + 9) @@ -11190,57 +11144,46 @@ body: | ; CI-MESA: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p0) :: (load 1 + 10) ; CI-MESA: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) ; CI-MESA: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load 1 + 11) - ; CI-MESA: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C3]](s64) + ; CI-MESA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; CI-MESA: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; CI-MESA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; CI-MESA: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; CI-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C4]](s32) + ; CI-MESA: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL6]] + ; CI-MESA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; CI-MESA: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]] + ; CI-MESA: 
[[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C5]](s32) + ; CI-MESA: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]] + ; CI-MESA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; CI-MESA: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]] + ; CI-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; CI-MESA: [[OR8:%[0-9]+]]:_(s32) = G_OR [[OR7]], [[SHL8]] + ; CI-MESA: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; CI-MESA: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C9]](s64) ; CI-MESA: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p0) :: (load 1 + 12) - ; CI-MESA: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) + ; CI-MESA: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) ; CI-MESA: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p0) :: (load 1 + 13) - ; CI-MESA: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C5]](s64) + ; CI-MESA: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s64) ; CI-MESA: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p0) :: (load 1 + 14) - ; CI-MESA: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C6]](s64) + ; CI-MESA: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) ; CI-MESA: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load 1 + 15) - ; CI-MESA: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD8]](s32) - ; CI-MESA: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] - ; CI-MESA: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C8]](s32) - ; CI-MESA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) - ; CI-MESA: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C9]] - ; CI-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) - ; CI-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] - ; CI-MESA: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) - ; CI-MESA: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] - ; CI-MESA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C8]](s32) - 
; CI-MESA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) - ; CI-MESA: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C9]] - ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) - ; CI-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; CI-MESA: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) - ; CI-MESA: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] - ; CI-MESA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C8]](s32) - ; CI-MESA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) - ; CI-MESA: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C9]] - ; CI-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) - ; CI-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) - ; CI-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] - ; CI-MESA: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) - ; CI-MESA: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] - ; CI-MESA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C8]](s32) - ; CI-MESA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) - ; CI-MESA: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C9]] - ; CI-MESA: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) - ; CI-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) - ; CI-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) - ; CI-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) - ; CI-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) - ; CI-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] - ; CI-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) - ; CI-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) - ; CI-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) - ; CI-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] - ; CI-MESA: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) - ; CI-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 
x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1) - ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) + ; CI-MESA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD12]](s32) + ; CI-MESA: [[AND12:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C3]] + ; CI-MESA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) + ; CI-MESA: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C3]] + ; CI-MESA: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[C4]](s32) + ; CI-MESA: [[OR9:%[0-9]+]]:_(s32) = G_OR [[AND12]], [[SHL9]] + ; CI-MESA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD14]](s32) + ; CI-MESA: [[AND14:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C3]] + ; CI-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND14]], [[C5]](s32) + ; CI-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[SHL10]] + ; CI-MESA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) + ; CI-MESA: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C3]] + ; CI-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[C6]](s32) + ; CI-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[OR10]], [[SHL11]] + ; CI-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; CI-MESA: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; GFX9-MESA-LABEL: name: test_load_flat_v2p1_align1 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 1) @@ -11253,56 +11196,49 @@ body: | ; GFX9-MESA: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 ; GFX9-MESA: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) ; GFX9-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load 1 + 3) - ; GFX9-MESA: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9-MESA: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) + ; GFX9-MESA: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; GFX9-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9-MESA: 
[[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX9-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX9-MESA: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX9-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX9-MESA: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C5]](s32) + ; GFX9-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GFX9-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX9-MESA: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX9-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; GFX9-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; GFX9-MESA: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX9-MESA: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64) ; GFX9-MESA: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p0) :: (load 1 + 4) - ; GFX9-MESA: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 5 - ; GFX9-MESA: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) + ; GFX9-MESA: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) ; GFX9-MESA: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load 1 + 5) - ; GFX9-MESA: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; GFX9-MESA: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) + ; GFX9-MESA: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s64) ; GFX9-MESA: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p0) :: (load 1 + 6) - ; GFX9-MESA: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 7 - ; GFX9-MESA: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) + ; GFX9-MESA: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) 
; GFX9-MESA: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load 1 + 7) - ; GFX9-MESA: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX9-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX9-MESA: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C7]] - ; GFX9-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9-MESA: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C7]] - ; GFX9-MESA: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; GFX9-MESA: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C8]](s16) - ; GFX9-MESA: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL]] - ; GFX9-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) - ; GFX9-MESA: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C7]] - ; GFX9-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9-MESA: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C7]] - ; GFX9-MESA: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C8]](s16) - ; GFX9-MESA: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9-MESA: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) - ; GFX9-MESA: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C7]] - ; GFX9-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; GFX9-MESA: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C7]] - ; GFX9-MESA: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C8]](s16) - ; GFX9-MESA: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] - ; GFX9-MESA: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) - ; GFX9-MESA: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C7]] - ; GFX9-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; GFX9-MESA: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] - ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) - ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) - ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) - ; GFX9-MESA: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL 
[[ZEXT1]], [[C9]](s32) - ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; GFX9-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; GFX9-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) - ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) - ; GFX9-MESA: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9-MESA: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C10]](s64) + ; GFX9-MESA: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX9-MESA: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; GFX9-MESA: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX9-MESA: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]] + ; GFX9-MESA: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX9-MESA: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C5]](s32) + ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]] + ; GFX9-MESA: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX9-MESA: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]] + ; GFX9-MESA: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; GFX9-MESA: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C8]](s64) ; GFX9-MESA: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p0) :: (load 1 + 8) ; GFX9-MESA: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) ; GFX9-MESA: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p0) :: (load 1 + 9) @@ -11310,49 +11246,46 @@ body: | ; GFX9-MESA: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p0) :: (load 1 + 10) ; GFX9-MESA: [[PTR_ADD10:%[0-9]+]]:_(p0) = 
G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) ; GFX9-MESA: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load 1 + 11) - ; GFX9-MESA: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C3]](s64) + ; GFX9-MESA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; GFX9-MESA: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; GFX9-MESA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; GFX9-MESA: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; GFX9-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C4]](s32) + ; GFX9-MESA: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL6]] + ; GFX9-MESA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; GFX9-MESA: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]] + ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C5]](s32) + ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]] + ; GFX9-MESA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; GFX9-MESA: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]] + ; GFX9-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; GFX9-MESA: [[OR8:%[0-9]+]]:_(s32) = G_OR [[OR7]], [[SHL8]] + ; GFX9-MESA: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; GFX9-MESA: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C9]](s64) ; GFX9-MESA: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p0) :: (load 1 + 12) - ; GFX9-MESA: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) + ; GFX9-MESA: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) ; GFX9-MESA: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p0) :: (load 1 + 13) - ; GFX9-MESA: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C5]](s64) + ; GFX9-MESA: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s64) ; GFX9-MESA: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p0) :: (load 1 + 14) - ; GFX9-MESA: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C6]](s64) + ; GFX9-MESA: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) ; 
GFX9-MESA: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load 1 + 15) - ; GFX9-MESA: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD8]](s32) - ; GFX9-MESA: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] - ; GFX9-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) - ; GFX9-MESA: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; GFX9-MESA: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; GFX9-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] - ; GFX9-MESA: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) - ; GFX9-MESA: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] - ; GFX9-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) - ; GFX9-MESA: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] - ; GFX9-MESA: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) - ; GFX9-MESA: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] - ; GFX9-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) - ; GFX9-MESA: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; GFX9-MESA: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; GFX9-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] - ; GFX9-MESA: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) - ; GFX9-MESA: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] - ; GFX9-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) - ; GFX9-MESA: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; GFX9-MESA: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; GFX9-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] - ; GFX9-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) - ; GFX9-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) - ; GFX9-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) - ; GFX9-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] - ; GFX9-MESA: 
[[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) - ; GFX9-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) - ; GFX9-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) - ; GFX9-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] - ; GFX9-MESA: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) - ; GFX9-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1) - ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) + ; GFX9-MESA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD12]](s32) + ; GFX9-MESA: [[AND12:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C3]] + ; GFX9-MESA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) + ; GFX9-MESA: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C3]] + ; GFX9-MESA: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[C4]](s32) + ; GFX9-MESA: [[OR9:%[0-9]+]]:_(s32) = G_OR [[AND12]], [[SHL9]] + ; GFX9-MESA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD14]](s32) + ; GFX9-MESA: [[AND14:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C3]] + ; GFX9-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND14]], [[C5]](s32) + ; GFX9-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[SHL10]] + ; GFX9-MESA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) + ; GFX9-MESA: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C3]] + ; GFX9-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[C6]](s32) + ; GFX9-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[OR10]], [[SHL11]] + ; GFX9-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX9-MESA: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) %0:_(p0) = COPY $vgpr0_vgpr1 %1:_(<2 x p1>) = G_LOAD %0 :: (load 16, align 1, addrspace 0) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir index 
0abd624724a1f..9a5c6fd00ed52 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir @@ -12654,6 +12654,47 @@ body: | $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 ... +--- +name: test_load_global_v2sp1_align16 +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; SI-LABEL: name: test_load_global_v2sp1_align16 + ; SI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; SI: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load 16, addrspace 1) + ; SI: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) + ; CI-HSA-LABEL: name: test_load_global_v2sp1_align16 + ; CI-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; CI-HSA: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load 16, addrspace 1) + ; CI-HSA: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; CI-HSA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) + ; CI-MESA-LABEL: name: test_load_global_v2sp1_align16 + ; CI-MESA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; CI-MESA: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load 16, addrspace 1) + ; CI-MESA: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) + ; VI-LABEL: name: test_load_global_v2sp1_align16 + ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; VI: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load 16, addrspace 1) + ; VI: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) + ; GFX9-HSA-LABEL: name: test_load_global_v2sp1_align16 + ; GFX9-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; GFX9-HSA: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load 16, addrspace 1) + ; GFX9-HSA: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; GFX9-HSA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY 
[[BITCAST]](<2 x p1>) + ; GFX9-MESA-LABEL: name: test_load_global_v2sp1_align16 + ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load 16, addrspace 1) + ; GFX9-MESA: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) + %0:_(p1) = COPY $vgpr0_vgpr1 + %1:_(<2 x p1>) = G_LOAD %0 :: (load 16, align 16, addrspace 1) + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 +... + --- name: test_load_global_v3s64_align32 body: | @@ -14597,28 +14638,34 @@ body: | ; SI-LABEL: name: test_load_global_v2p1_align16 ; SI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; SI: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load 16, addrspace 1) - ; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; SI: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load 16, addrspace 1) + ; SI: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; CI-HSA-LABEL: name: test_load_global_v2p1_align16 ; CI-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; CI-HSA: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load 16, addrspace 1) - ; CI-HSA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; CI-HSA: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load 16, addrspace 1) + ; CI-HSA: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; CI-HSA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; CI-MESA-LABEL: name: test_load_global_v2p1_align16 ; CI-MESA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; CI-MESA: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load 16, addrspace 1) - ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; CI-MESA: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load 16, addrspace 1) + ; CI-MESA: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; CI-MESA: 
$vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; VI-LABEL: name: test_load_global_v2p1_align16 ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; VI: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load 16, addrspace 1) - ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; VI: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load 16, addrspace 1) + ; VI: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; GFX9-HSA-LABEL: name: test_load_global_v2p1_align16 ; GFX9-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; GFX9-HSA: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load 16, addrspace 1) - ; GFX9-HSA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; GFX9-HSA: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load 16, addrspace 1) + ; GFX9-HSA: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; GFX9-HSA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; GFX9-MESA-LABEL: name: test_load_global_v2p1_align16 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load 16, addrspace 1) - ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load 16, addrspace 1) + ; GFX9-MESA: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<2 x p1>) = G_LOAD %0 :: (load 16, align 16, addrspace 1) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -14632,28 +14679,34 @@ body: | ; SI-LABEL: name: test_load_global_v2p1_align8 ; SI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; SI: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load 16, align 8, addrspace 1) - ; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; SI: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p1) 
:: (load 16, align 8, addrspace 1) + ; SI: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; CI-HSA-LABEL: name: test_load_global_v2p1_align8 ; CI-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; CI-HSA: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load 16, align 8, addrspace 1) - ; CI-HSA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; CI-HSA: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load 16, align 8, addrspace 1) + ; CI-HSA: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; CI-HSA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; CI-MESA-LABEL: name: test_load_global_v2p1_align8 ; CI-MESA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; CI-MESA: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load 16, align 8, addrspace 1) - ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; CI-MESA: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load 16, align 8, addrspace 1) + ; CI-MESA: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; VI-LABEL: name: test_load_global_v2p1_align8 ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; VI: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load 16, align 8, addrspace 1) - ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; VI: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load 16, align 8, addrspace 1) + ; VI: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; GFX9-HSA-LABEL: name: test_load_global_v2p1_align8 ; GFX9-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; GFX9-HSA: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load 16, align 8, addrspace 1) - ; GFX9-HSA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; GFX9-HSA: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD 
[[COPY]](p1) :: (load 16, align 8, addrspace 1) + ; GFX9-HSA: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; GFX9-HSA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; GFX9-MESA-LABEL: name: test_load_global_v2p1_align8 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load 16, align 8, addrspace 1) - ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load 16, align 8, addrspace 1) + ; GFX9-MESA: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<2 x p1>) = G_LOAD %0 :: (load 16, align 8, addrspace 1) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -14667,28 +14720,34 @@ body: | ; SI-LABEL: name: test_load_global_v2p1_align4 ; SI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; SI: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load 16, align 4, addrspace 1) - ; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; SI: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load 16, align 4, addrspace 1) + ; SI: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; CI-HSA-LABEL: name: test_load_global_v2p1_align4 ; CI-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; CI-HSA: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load 16, align 4, addrspace 1) - ; CI-HSA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; CI-HSA: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load 16, align 4, addrspace 1) + ; CI-HSA: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; CI-HSA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; CI-MESA-LABEL: name: test_load_global_v2p1_align4 ; CI-MESA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; CI-MESA: 
[[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load 16, align 4, addrspace 1) - ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; CI-MESA: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load 16, align 4, addrspace 1) + ; CI-MESA: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; VI-LABEL: name: test_load_global_v2p1_align4 ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; VI: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load 16, align 4, addrspace 1) - ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; VI: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load 16, align 4, addrspace 1) + ; VI: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; GFX9-HSA-LABEL: name: test_load_global_v2p1_align4 ; GFX9-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; GFX9-HSA: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load 16, align 4, addrspace 1) - ; GFX9-HSA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; GFX9-HSA: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load 16, align 4, addrspace 1) + ; GFX9-HSA: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; GFX9-HSA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; GFX9-MESA-LABEL: name: test_load_global_v2p1_align4 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load 16, align 4, addrspace 1) - ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load 16, align 4, addrspace 1) + ; GFX9-MESA: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<2 x p1>) = G_LOAD %0 :: (load 16, align 4, 
addrspace 1) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -14712,64 +14771,49 @@ body: | ; SI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 ; SI: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) ; SI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load 1 + 3, addrspace 1) - ; SI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; SI: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C3]](s64) + ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; SI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; SI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C5]](s32) + ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; SI: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; SI: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C7]](s64) ; SI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p1) :: (load 1 + 4, addrspace 1) - ; SI: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 5 - ; SI: [[PTR_ADD4:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C4]](s64) + ; SI: [[PTR_ADD4:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) ; SI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p1) :: (load 1 + 5, addrspace 1) - ; SI: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; SI: [[PTR_ADD5:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C5]](s64) + 
; SI: [[PTR_ADD5:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s64) ; SI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p1) :: (load 1 + 6, addrspace 1) - ; SI: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 7 - ; SI: [[PTR_ADD6:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C6]](s64) + ; SI: [[PTR_ADD6:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) ; SI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p1) :: (load 1 + 7, addrspace 1) - ; SI: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; SI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C7]] - ; SI: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C8]](s32) - ; SI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 - ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) - ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C9]] - ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[COPY1]](s32) - ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32) - ; SI: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[TRUNC1]] - ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) - ; SI: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C7]] - ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C8]](s32) - ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) - ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C9]] - ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[COPY3]](s32) - ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) - ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; SI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) - ; SI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C7]] - ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C8]](s32) + ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; SI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) - ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C9]] - ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) - ; SI: [[TRUNC5:%[0-9]+]]:_(s16) 
= G_TRUNC [[SHL2]](s32) - ; SI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] - ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) - ; SI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C7]] - ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) - ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C9]] - ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) - ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) - ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) - ; SI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) - ; SI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) - ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] - ; SI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) - ; SI: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; SI: [[PTR_ADD7:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C11]](s64) + ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]] + ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; SI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C5]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]] + ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]] + ; SI: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; SI: [[PTR_ADD7:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C8]](s64) ; SI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD7]](p1) :: (load 1 + 8, addrspace 1) ; SI: [[PTR_ADD8:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) ; SI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p1) :: (load 1 + 9, addrspace 1) @@ -14777,61 +14821,51 @@ body: | ; SI: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p1) :: (load 1 + 10, addrspace 1) ; SI: [[PTR_ADD10:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) ; SI: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p1) :: (load 1 + 11, addrspace 1) - ; SI: [[PTR_ADD11:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C3]](s64) + ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; SI: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; SI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C4]](s32) + ; SI: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL6]] + ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; SI: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]] + ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C5]](s32) + ; SI: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]] + ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; SI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]] + ; SI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; SI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[OR7]], [[SHL8]] + ; SI: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; SI: [[PTR_ADD11:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C9]](s64) ; SI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p1) :: (load 1 + 12, addrspace 1) - ; SI: [[PTR_ADD12:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) + ; SI: [[PTR_ADD12:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) ; SI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p1) :: (load 1 + 13, addrspace 1) - ; SI: [[PTR_ADD13:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C5]](s64) + ; SI: [[PTR_ADD13:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s64) ; SI: 
[[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p1) :: (load 1 + 14, addrspace 1) - ; SI: [[PTR_ADD14:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C6]](s64) + ; SI: [[PTR_ADD14:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) ; SI: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p1) :: (load 1 + 15, addrspace 1) - ; SI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD8]](s32) - ; SI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] - ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C8]](s32) - ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) - ; SI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C9]] - ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) - ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] - ; SI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) - ; SI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] - ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C8]](s32) - ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) - ; SI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C9]] - ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) - ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; SI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) - ; SI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] - ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C8]](s32) - ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) - ; SI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C9]] - ; SI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) - ; SI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) - ; SI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] - ; SI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) - ; SI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] - ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C8]](s32) - ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) - ; SI: 
[[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C9]] - ; SI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) - ; SI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) - ; SI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; SI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) - ; SI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) - ; SI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) - ; SI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] - ; SI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) - ; SI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) - ; SI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) - ; SI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] - ; SI: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) - ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1) - ; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) + ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD12]](s32) + ; SI: [[AND12:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C3]] + ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) + ; SI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C3]] + ; SI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[C4]](s32) + ; SI: [[OR9:%[0-9]+]]:_(s32) = G_OR [[AND12]], [[SHL9]] + ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD14]](s32) + ; SI: [[AND14:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C3]] + ; SI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND14]], [[C5]](s32) + ; SI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[SHL10]] + ; SI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) + ; SI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C3]] + ; SI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[C6]](s32) + ; SI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[OR10]], [[SHL11]] + ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; SI: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; SI: 
$vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; CI-HSA-LABEL: name: test_load_global_v2p1_align1 ; CI-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; CI-HSA: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load 16, align 1, addrspace 1) - ; CI-HSA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; CI-HSA: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load 16, align 1, addrspace 1) + ; CI-HSA: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; CI-HSA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; CI-MESA-LABEL: name: test_load_global_v2p1_align1 ; CI-MESA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; CI-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 1, addrspace 1) @@ -14844,64 +14878,49 @@ body: | ; CI-MESA: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 ; CI-MESA: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) ; CI-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load 1 + 3, addrspace 1) - ; CI-MESA: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CI-MESA: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C3]](s64) + ; CI-MESA: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CI-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; CI-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CI-MESA: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CI-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; CI-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; CI-MESA: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C5]](s32) + ; CI-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; CI-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI-MESA: 
[[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; CI-MESA: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CI-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; CI-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; CI-MESA: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CI-MESA: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C7]](s64) ; CI-MESA: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p1) :: (load 1 + 4, addrspace 1) - ; CI-MESA: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 5 - ; CI-MESA: [[PTR_ADD4:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C4]](s64) + ; CI-MESA: [[PTR_ADD4:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) ; CI-MESA: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p1) :: (load 1 + 5, addrspace 1) - ; CI-MESA: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; CI-MESA: [[PTR_ADD5:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C5]](s64) + ; CI-MESA: [[PTR_ADD5:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s64) ; CI-MESA: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p1) :: (load 1 + 6, addrspace 1) - ; CI-MESA: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 7 - ; CI-MESA: [[PTR_ADD6:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C6]](s64) + ; CI-MESA: [[PTR_ADD6:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) ; CI-MESA: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p1) :: (load 1 + 7, addrspace 1) - ; CI-MESA: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; CI-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; CI-MESA: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C7]] - ; CI-MESA: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; CI-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C8]](s32) - ; CI-MESA: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 - ; CI-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) - ; CI-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C9]] - ; CI-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[COPY1]](s32) - ; CI-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32) - ; CI-MESA: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], 
[[TRUNC1]] - ; CI-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) - ; CI-MESA: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C7]] - ; CI-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C8]](s32) - ; CI-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) - ; CI-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C9]] - ; CI-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[COPY3]](s32) - ; CI-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) - ; CI-MESA: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI-MESA: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) - ; CI-MESA: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C7]] - ; CI-MESA: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C8]](s32) + ; CI-MESA: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; CI-MESA: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] ; CI-MESA: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) - ; CI-MESA: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C9]] - ; CI-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) - ; CI-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; CI-MESA: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] - ; CI-MESA: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) - ; CI-MESA: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C7]] - ; CI-MESA: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) - ; CI-MESA: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C9]] - ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) - ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; CI-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) - ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) - ; CI-MESA: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) - ; CI-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; CI-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; CI-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT 
[[OR3]](s16) - ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) - ; CI-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] - ; CI-MESA: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) - ; CI-MESA: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; CI-MESA: [[PTR_ADD7:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C11]](s64) + ; CI-MESA: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; CI-MESA: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]] + ; CI-MESA: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; CI-MESA: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C5]](s32) + ; CI-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]] + ; CI-MESA: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; CI-MESA: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; CI-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]] + ; CI-MESA: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CI-MESA: [[PTR_ADD7:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C8]](s64) ; CI-MESA: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p1) :: (load 1 + 8, addrspace 1) ; CI-MESA: [[PTR_ADD8:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) ; CI-MESA: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p1) :: (load 1 + 9, addrspace 1) @@ -14909,57 +14928,46 @@ body: | ; CI-MESA: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p1) :: (load 1 + 10, addrspace 1) ; CI-MESA: [[PTR_ADD10:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) ; CI-MESA: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p1) :: (load 1 + 11, addrspace 1) - ; CI-MESA: [[PTR_ADD11:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C3]](s64) + ; CI-MESA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; CI-MESA: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; CI-MESA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; 
CI-MESA: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; CI-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C4]](s32) + ; CI-MESA: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL6]] + ; CI-MESA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; CI-MESA: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]] + ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C5]](s32) + ; CI-MESA: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]] + ; CI-MESA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; CI-MESA: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]] + ; CI-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; CI-MESA: [[OR8:%[0-9]+]]:_(s32) = G_OR [[OR7]], [[SHL8]] + ; CI-MESA: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; CI-MESA: [[PTR_ADD11:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C9]](s64) ; CI-MESA: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p1) :: (load 1 + 12, addrspace 1) - ; CI-MESA: [[PTR_ADD12:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) + ; CI-MESA: [[PTR_ADD12:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) ; CI-MESA: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p1) :: (load 1 + 13, addrspace 1) - ; CI-MESA: [[PTR_ADD13:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C5]](s64) + ; CI-MESA: [[PTR_ADD13:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s64) ; CI-MESA: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p1) :: (load 1 + 14, addrspace 1) - ; CI-MESA: [[PTR_ADD14:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C6]](s64) + ; CI-MESA: [[PTR_ADD14:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) ; CI-MESA: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p1) :: (load 1 + 15, addrspace 1) - ; CI-MESA: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD8]](s32) - ; CI-MESA: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] - ; CI-MESA: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C8]](s32) - ; CI-MESA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) - ; CI-MESA: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C9]] - ; 
CI-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) - ; CI-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] - ; CI-MESA: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) - ; CI-MESA: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] - ; CI-MESA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C8]](s32) - ; CI-MESA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) - ; CI-MESA: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C9]] - ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) - ; CI-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; CI-MESA: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) - ; CI-MESA: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] - ; CI-MESA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C8]](s32) - ; CI-MESA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) - ; CI-MESA: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C9]] - ; CI-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) - ; CI-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) - ; CI-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] - ; CI-MESA: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) - ; CI-MESA: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] - ; CI-MESA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C8]](s32) - ; CI-MESA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) - ; CI-MESA: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C9]] - ; CI-MESA: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) - ; CI-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) - ; CI-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) - ; CI-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) - ; CI-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) - ; CI-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], 
[[SHL10]] - ; CI-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) - ; CI-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) - ; CI-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) - ; CI-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] - ; CI-MESA: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) - ; CI-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1) - ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) + ; CI-MESA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD12]](s32) + ; CI-MESA: [[AND12:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C3]] + ; CI-MESA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) + ; CI-MESA: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C3]] + ; CI-MESA: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[C4]](s32) + ; CI-MESA: [[OR9:%[0-9]+]]:_(s32) = G_OR [[AND12]], [[SHL9]] + ; CI-MESA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD14]](s32) + ; CI-MESA: [[AND14:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C3]] + ; CI-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND14]], [[C5]](s32) + ; CI-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[SHL10]] + ; CI-MESA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) + ; CI-MESA: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C3]] + ; CI-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[C6]](s32) + ; CI-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[OR10]], [[SHL11]] + ; CI-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; CI-MESA: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; VI-LABEL: name: test_load_global_v2p1_align1 ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 1, addrspace 1) @@ -14972,56 +14980,49 @@ body: | ; VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 ; VI: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], 
[[C2]](s64) ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load 1 + 3, addrspace 1) - ; VI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; VI: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C3]](s64) + ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; VI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C5]](s32) + ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; VI: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; VI: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C7]](s64) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p1) :: (load 1 + 4, addrspace 1) - ; VI: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 5 - ; VI: [[PTR_ADD4:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C4]](s64) + ; VI: [[PTR_ADD4:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p1) :: (load 1 + 5, addrspace 1) - ; VI: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; VI: [[PTR_ADD5:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C5]](s64) + ; VI: [[PTR_ADD5:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s64) ; VI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p1) :: (load 1 + 6, addrspace 1) - ; VI: 
[[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 7 - ; VI: [[PTR_ADD6:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C6]](s64) + ; VI: [[PTR_ADD6:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) ; VI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p1) :: (load 1 + 7, addrspace 1) - ; VI: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; VI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C7]] - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; VI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C7]] - ; VI: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; VI: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C8]](s16) - ; VI: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL]] - ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) - ; VI: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C7]] - ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C7]] - ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C8]](s16) - ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) - ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C7]] - ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C7]] - ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C8]](s16) - ; VI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] - ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) - ; VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C7]] - ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] - ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) - ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) - ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) - ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], 
[[C9]](s32) - ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) - ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] - ; VI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) - ; VI: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; VI: [[PTR_ADD7:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C10]](s64) + ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; VI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; VI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]] + ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; VI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C5]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]] + ; VI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; VI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]] + ; VI: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; VI: [[PTR_ADD7:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C8]](s64) ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p1) :: (load 1 + 8, addrspace 1) ; VI: [[PTR_ADD8:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p1) :: (load 1 + 9, addrspace 1) @@ -15029,53 +15030,51 @@ body: | ; VI: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p1) :: (load 1 + 10, addrspace 1) ; VI: [[PTR_ADD10:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) ; VI: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p1) :: (load 1 + 11, addrspace 1) - ; VI: [[PTR_ADD11:%[0-9]+]]:_(p1) = G_PTR_ADD 
[[PTR_ADD7]], [[C3]](s64) + ; VI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; VI: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; VI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; VI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; VI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C4]](s32) + ; VI: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL6]] + ; VI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; VI: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]] + ; VI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C5]](s32) + ; VI: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]] + ; VI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; VI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]] + ; VI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; VI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[OR7]], [[SHL8]] + ; VI: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; VI: [[PTR_ADD11:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C9]](s64) ; VI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p1) :: (load 1 + 12, addrspace 1) - ; VI: [[PTR_ADD12:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) + ; VI: [[PTR_ADD12:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) ; VI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p1) :: (load 1 + 13, addrspace 1) - ; VI: [[PTR_ADD13:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C5]](s64) + ; VI: [[PTR_ADD13:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s64) ; VI: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p1) :: (load 1 + 14, addrspace 1) - ; VI: [[PTR_ADD14:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C6]](s64) + ; VI: [[PTR_ADD14:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) ; VI: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p1) :: (load 1 + 15, addrspace 1) - ; VI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD8]](s32) - ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] - ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) - ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], 
[[C7]] - ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] - ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) - ; VI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] - ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) - ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] - ; VI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) - ; VI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] - ; VI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) - ; VI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; VI: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; VI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] - ; VI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) - ; VI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] - ; VI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) - ; VI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] - ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) - ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) - ; VI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) - ; VI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] - ; VI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) - ; VI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) - ; VI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) - ; VI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] - ; VI: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) - ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1) - ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) + ; VI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD12]](s32) + ; VI: 
[[AND12:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C3]] + ; VI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) + ; VI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C3]] + ; VI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[C4]](s32) + ; VI: [[OR9:%[0-9]+]]:_(s32) = G_OR [[AND12]], [[SHL9]] + ; VI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD14]](s32) + ; VI: [[AND14:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C3]] + ; VI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND14]], [[C5]](s32) + ; VI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[SHL10]] + ; VI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) + ; VI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C3]] + ; VI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[C6]](s32) + ; VI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[OR10]], [[SHL11]] + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; VI: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; GFX9-HSA-LABEL: name: test_load_global_v2p1_align1 ; GFX9-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; GFX9-HSA: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load 16, align 1, addrspace 1) - ; GFX9-HSA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; GFX9-HSA: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load 16, align 1, addrspace 1) + ; GFX9-HSA: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; GFX9-HSA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; GFX9-MESA-LABEL: name: test_load_global_v2p1_align1 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 1, addrspace 1) @@ -15088,56 +15087,49 @@ body: | ; GFX9-MESA: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 ; GFX9-MESA: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) ; GFX9-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load 1 + 3, addrspace 1) 
- ; GFX9-MESA: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9-MESA: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C3]](s64) + ; GFX9-MESA: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; GFX9-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX9-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX9-MESA: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX9-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX9-MESA: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C5]](s32) + ; GFX9-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GFX9-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX9-MESA: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX9-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; GFX9-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; GFX9-MESA: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX9-MESA: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C7]](s64) ; GFX9-MESA: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p1) :: (load 1 + 4, addrspace 1) - ; GFX9-MESA: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 5 - ; GFX9-MESA: [[PTR_ADD4:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C4]](s64) + ; GFX9-MESA: [[PTR_ADD4:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) ; GFX9-MESA: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p1) :: (load 1 + 5, addrspace 1) - ; GFX9-MESA: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; GFX9-MESA: [[PTR_ADD5:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C5]](s64) + ; GFX9-MESA: [[PTR_ADD5:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s64) 
; GFX9-MESA: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p1) :: (load 1 + 6, addrspace 1) - ; GFX9-MESA: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 7 - ; GFX9-MESA: [[PTR_ADD6:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C6]](s64) + ; GFX9-MESA: [[PTR_ADD6:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) ; GFX9-MESA: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p1) :: (load 1 + 7, addrspace 1) - ; GFX9-MESA: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX9-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX9-MESA: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C7]] - ; GFX9-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9-MESA: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C7]] - ; GFX9-MESA: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; GFX9-MESA: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C8]](s16) - ; GFX9-MESA: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL]] - ; GFX9-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) - ; GFX9-MESA: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C7]] - ; GFX9-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9-MESA: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C7]] - ; GFX9-MESA: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C8]](s16) - ; GFX9-MESA: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9-MESA: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) - ; GFX9-MESA: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C7]] - ; GFX9-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; GFX9-MESA: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C7]] - ; GFX9-MESA: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C8]](s16) - ; GFX9-MESA: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] - ; GFX9-MESA: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) - ; GFX9-MESA: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C7]] - ; GFX9-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; GFX9-MESA: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] - ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s16) = 
G_SHL [[AND7]], [[C8]](s16) - ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) - ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) - ; GFX9-MESA: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) - ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; GFX9-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; GFX9-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) - ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) - ; GFX9-MESA: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9-MESA: [[PTR_ADD7:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C10]](s64) + ; GFX9-MESA: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX9-MESA: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; GFX9-MESA: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX9-MESA: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]] + ; GFX9-MESA: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX9-MESA: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C5]](s32) + ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]] + ; GFX9-MESA: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX9-MESA: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]] + ; GFX9-MESA: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; GFX9-MESA: [[PTR_ADD7:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C8]](s64) ; GFX9-MESA: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p1) :: (load 1 + 8, 
addrspace 1) ; GFX9-MESA: [[PTR_ADD8:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) ; GFX9-MESA: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p1) :: (load 1 + 9, addrspace 1) @@ -15145,49 +15137,46 @@ body: | ; GFX9-MESA: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p1) :: (load 1 + 10, addrspace 1) ; GFX9-MESA: [[PTR_ADD10:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) ; GFX9-MESA: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p1) :: (load 1 + 11, addrspace 1) - ; GFX9-MESA: [[PTR_ADD11:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C3]](s64) + ; GFX9-MESA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; GFX9-MESA: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; GFX9-MESA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; GFX9-MESA: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; GFX9-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C4]](s32) + ; GFX9-MESA: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL6]] + ; GFX9-MESA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; GFX9-MESA: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]] + ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C5]](s32) + ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]] + ; GFX9-MESA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; GFX9-MESA: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]] + ; GFX9-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; GFX9-MESA: [[OR8:%[0-9]+]]:_(s32) = G_OR [[OR7]], [[SHL8]] + ; GFX9-MESA: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; GFX9-MESA: [[PTR_ADD11:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C9]](s64) ; GFX9-MESA: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p1) :: (load 1 + 12, addrspace 1) - ; GFX9-MESA: [[PTR_ADD12:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) + ; GFX9-MESA: [[PTR_ADD12:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) ; GFX9-MESA: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p1) :: (load 1 + 13, addrspace 1) - ; GFX9-MESA: 
[[PTR_ADD13:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C5]](s64) + ; GFX9-MESA: [[PTR_ADD13:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s64) ; GFX9-MESA: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p1) :: (load 1 + 14, addrspace 1) - ; GFX9-MESA: [[PTR_ADD14:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C6]](s64) + ; GFX9-MESA: [[PTR_ADD14:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) ; GFX9-MESA: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p1) :: (load 1 + 15, addrspace 1) - ; GFX9-MESA: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD8]](s32) - ; GFX9-MESA: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] - ; GFX9-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) - ; GFX9-MESA: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; GFX9-MESA: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; GFX9-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] - ; GFX9-MESA: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) - ; GFX9-MESA: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] - ; GFX9-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) - ; GFX9-MESA: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] - ; GFX9-MESA: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) - ; GFX9-MESA: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] - ; GFX9-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) - ; GFX9-MESA: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; GFX9-MESA: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; GFX9-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] - ; GFX9-MESA: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) - ; GFX9-MESA: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] - ; GFX9-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) - ; GFX9-MESA: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; 
GFX9-MESA: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; GFX9-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] - ; GFX9-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) - ; GFX9-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) - ; GFX9-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) - ; GFX9-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] - ; GFX9-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) - ; GFX9-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) - ; GFX9-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) - ; GFX9-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] - ; GFX9-MESA: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) - ; GFX9-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1) - ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) + ; GFX9-MESA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD12]](s32) + ; GFX9-MESA: [[AND12:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C3]] + ; GFX9-MESA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) + ; GFX9-MESA: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C3]] + ; GFX9-MESA: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[C4]](s32) + ; GFX9-MESA: [[OR9:%[0-9]+]]:_(s32) = G_OR [[AND12]], [[SHL9]] + ; GFX9-MESA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD14]](s32) + ; GFX9-MESA: [[AND14:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C3]] + ; GFX9-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND14]], [[C5]](s32) + ; GFX9-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[SHL10]] + ; GFX9-MESA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) + ; GFX9-MESA: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C3]] + ; GFX9-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[C6]](s32) + ; GFX9-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[OR10]], [[SHL11]] + ; GFX9-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX9-MESA: 
[[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<2 x p1>) = G_LOAD %0 :: (load 16, align 1, addrspace 1) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -15201,28 +15190,34 @@ body: | ; SI-LABEL: name: test_load_global_v4p1_align8 ; SI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; SI: [[LOAD:%[0-9]+]]:_(<4 x p1>) = G_LOAD [[COPY]](p1) :: (load 32, align 8, addrspace 1) - ; SI: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[LOAD]](<4 x p1>) + ; SI: [[LOAD:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[COPY]](p1) :: (load 32, align 8, addrspace 1) + ; SI: [[BITCAST:%[0-9]+]]:_(<4 x p1>) = G_BITCAST [[LOAD]](<8 x s32>) + ; SI: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BITCAST]](<4 x p1>) ; CI-HSA-LABEL: name: test_load_global_v4p1_align8 ; CI-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; CI-HSA: [[LOAD:%[0-9]+]]:_(<4 x p1>) = G_LOAD [[COPY]](p1) :: (load 32, align 8, addrspace 1) - ; CI-HSA: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[LOAD]](<4 x p1>) + ; CI-HSA: [[LOAD:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[COPY]](p1) :: (load 32, align 8, addrspace 1) + ; CI-HSA: [[BITCAST:%[0-9]+]]:_(<4 x p1>) = G_BITCAST [[LOAD]](<8 x s32>) + ; CI-HSA: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BITCAST]](<4 x p1>) ; CI-MESA-LABEL: name: test_load_global_v4p1_align8 ; CI-MESA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; CI-MESA: [[LOAD:%[0-9]+]]:_(<4 x p1>) = G_LOAD [[COPY]](p1) :: (load 32, align 8, addrspace 1) - ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[LOAD]](<4 x p1>) + ; CI-MESA: [[LOAD:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[COPY]](p1) :: (load 32, align 8, addrspace 1) + ; CI-MESA: [[BITCAST:%[0-9]+]]:_(<4 x p1>) = G_BITCAST [[LOAD]](<8 x s32>) + ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BITCAST]](<4 x p1>) ; VI-LABEL: name: test_load_global_v4p1_align8 ; VI: 
[[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; VI: [[LOAD:%[0-9]+]]:_(<4 x p1>) = G_LOAD [[COPY]](p1) :: (load 32, align 8, addrspace 1) - ; VI: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[LOAD]](<4 x p1>) + ; VI: [[LOAD:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[COPY]](p1) :: (load 32, align 8, addrspace 1) + ; VI: [[BITCAST:%[0-9]+]]:_(<4 x p1>) = G_BITCAST [[LOAD]](<8 x s32>) + ; VI: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BITCAST]](<4 x p1>) ; GFX9-HSA-LABEL: name: test_load_global_v4p1_align8 ; GFX9-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; GFX9-HSA: [[LOAD:%[0-9]+]]:_(<4 x p1>) = G_LOAD [[COPY]](p1) :: (load 32, align 8, addrspace 1) - ; GFX9-HSA: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[LOAD]](<4 x p1>) + ; GFX9-HSA: [[LOAD:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[COPY]](p1) :: (load 32, align 8, addrspace 1) + ; GFX9-HSA: [[BITCAST:%[0-9]+]]:_(<4 x p1>) = G_BITCAST [[LOAD]](<8 x s32>) + ; GFX9-HSA: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BITCAST]](<4 x p1>) ; GFX9-MESA-LABEL: name: test_load_global_v4p1_align8 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(<4 x p1>) = G_LOAD [[COPY]](p1) :: (load 32, align 8, addrspace 1) - ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[LOAD]](<4 x p1>) + ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[COPY]](p1) :: (load 32, align 8, addrspace 1) + ; GFX9-MESA: [[BITCAST:%[0-9]+]]:_(<4 x p1>) = G_BITCAST [[LOAD]](<8 x s32>) + ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BITCAST]](<4 x p1>) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<4 x p1>) = G_LOAD %0 :: (load 32, align 8, addrspace 1) $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir index ce0ceadfadaed..88ab22bd5df68 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir @@ -10145,40 +10145,57 @@ body: | ; SI-LABEL: name: test_load_local_v2p1_align4 ; SI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; SI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) + ; SI: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; SI: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) - ; SI: [[LOAD1:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p3) :: (load 8 + 8, align 4, addrspace 3) - ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[LOAD]](p1), [[LOAD1]](p1) - ; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) + ; SI: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD]](p3) :: (load 8 + 8, align 4, addrspace 3) + ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[LOAD]](<2 x s32>), [[LOAD1]](<2 x s32>) + ; SI: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s32>) + ; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; CI-LABEL: name: test_load_local_v2p1_align4 ; CI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; CI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) + ; CI: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) ; CI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; CI: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) - ; CI: [[LOAD1:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p3) :: (load 8 + 8, align 4, addrspace 3) - ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[LOAD]](p1), [[LOAD1]](p1) - ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) + ; CI: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD]](p3) :: (load 8 + 8, align 4, addrspace 3) + ; CI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[LOAD]](<2 x s32>), [[LOAD1]](<2 x s32>) + ; 
CI: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s32>) + ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; CI-DS128-LABEL: name: test_load_local_v2p1_align4 ; CI-DS128: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; CI-DS128: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) - ; CI-DS128: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CI-DS128: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3) + ; CI-DS128: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; CI-DS128: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) - ; CI-DS128: [[LOAD1:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p3) :: (load 8 + 8, align 4, addrspace 3) - ; CI-DS128: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[LOAD]](p1), [[LOAD1]](p1) - ; CI-DS128: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) + ; CI-DS128: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 + 4, addrspace 3) + ; CI-DS128: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CI-DS128: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; CI-DS128: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 4 + 8, addrspace 3) + ; CI-DS128: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CI-DS128: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; CI-DS128: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 4 + 12, addrspace 3) + ; CI-DS128: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) + ; CI-DS128: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; CI-DS128: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; VI-LABEL: name: test_load_local_v2p1_align4 ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) - ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3) 
+ ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; VI: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) - ; VI: [[LOAD1:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p3) :: (load 8 + 8, align 4, addrspace 3) - ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[LOAD]](p1), [[LOAD1]](p1) - ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) + ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 + 4, addrspace 3) + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 4 + 8, addrspace 3) + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; VI: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 4 + 12, addrspace 3) + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) + ; VI: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; GFX9-LABEL: name: test_load_local_v2p1_align4 ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; GFX9: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3) - ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; GFX9: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) + ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) %0:_(p3) = COPY $vgpr0 %1:_(<2 x p1>) = G_LOAD %0 :: (load 16, align 4, addrspace 3) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir index 1659ce532ade3..d85d450dba84e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir @@ -8879,60 +8879,60 @@ body: | ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; SI: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) ; SI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load 4 + 4, addrspace 5) - ; SI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; SI: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) ; SI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load 4 + 8, addrspace 5) - ; SI: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; SI: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) ; SI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load 4 + 12, addrspace 5) - ; SI: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD2]](s32), [[LOAD3]](s32) - ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1) - ; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) + ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) + ; SI: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; CI-LABEL: name: test_load_private_v2p1_align4 ; CI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 4, addrspace 5) ; CI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; CI: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load 4 + 4, addrspace 5) - ; CI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; CI: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) ; CI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load 4 + 8, addrspace 5) - ; CI: 
[[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; CI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CI: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) ; CI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load 4 + 12, addrspace 5) - ; CI: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD2]](s32), [[LOAD3]](s32) - ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1) - ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) + ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) + ; CI: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; VI-LABEL: name: test_load_private_v2p1_align4 ; VI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 4, addrspace 5) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; VI: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load 4 + 4, addrspace 5) - ; VI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; VI: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load 4 + 8, addrspace 5) - ; VI: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; VI: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load 4 + 12, addrspace 5) - ; VI: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD2]](s32), [[LOAD3]](s32) - ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1) - ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), 
[[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) + ; VI: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; GFX9-LABEL: name: test_load_private_v2p1_align4 ; GFX9: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 4, addrspace 5) ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load 4 + 4, addrspace 5) - ; GFX9: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; GFX9: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) ; GFX9: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load 4 + 8, addrspace 5) - ; GFX9: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) ; GFX9: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load 4 + 12, addrspace 5) - ; GFX9: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD2]](s32), [[LOAD3]](s32) - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1) - ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) %0:_(p5) = COPY $vgpr0 %1:_(<2 x p1>) = G_LOAD %0 :: (load 16, align 4, addrspace 5) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -8950,108 +8950,108 @@ body: | ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; SI: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) ; SI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load 4 + 4, addrspace 5) - ; 
SI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; SI: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) ; SI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load 4 + 8, align 8, addrspace 5) - ; SI: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; SI: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) ; SI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load 4 + 12, addrspace 5) - ; SI: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD2]](s32), [[LOAD3]](s32) - ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; SI: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) ; SI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p5) :: (load 4 + 16, align 8, addrspace 5) - ; SI: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; SI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; SI: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) ; SI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load 4 + 20, addrspace 5) - ; SI: [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD4]](s32), [[LOAD5]](s32) - ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 - ; SI: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; SI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; SI: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) ; SI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p5) :: (load 4 + 24, align 8, addrspace 5) - ; SI: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 28 + ; SI: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C6]](s32) ; SI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load 4 + 28, addrspace 5) - ; SI: [[MV3:%[0-9]+]]:_(p1) = G_MERGE_VALUES 
[[LOAD6]](s32), [[LOAD7]](s32) - ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1), [[MV2]](p1), [[MV3]](p1) - ; SI: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x p1>) + ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32), [[LOAD6]](s32), [[LOAD7]](s32) + ; SI: [[BITCAST:%[0-9]+]]:_(<4 x p1>) = G_BITCAST [[BUILD_VECTOR]](<8 x s32>) + ; SI: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BITCAST]](<4 x p1>) ; CI-LABEL: name: test_load_private_v4p1_align8 ; CI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 4, align 8, addrspace 5) ; CI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; CI: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load 4 + 4, addrspace 5) - ; CI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; CI: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) ; CI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load 4 + 8, align 8, addrspace 5) - ; CI: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; CI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CI: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) ; CI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load 4 + 12, addrspace 5) - ; CI: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD2]](s32), [[LOAD3]](s32) - ; CI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; CI: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; CI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) ; CI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p5) :: (load 4 + 16, align 8, addrspace 5) - ; CI: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], 
[[C]](s32) + ; CI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; CI: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) ; CI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load 4 + 20, addrspace 5) - ; CI: [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD4]](s32), [[LOAD5]](s32) - ; CI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 - ; CI: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; CI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CI: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) ; CI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p5) :: (load 4 + 24, align 8, addrspace 5) - ; CI: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 28 + ; CI: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C6]](s32) ; CI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load 4 + 28, addrspace 5) - ; CI: [[MV3:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD6]](s32), [[LOAD7]](s32) - ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1), [[MV2]](p1), [[MV3]](p1) - ; CI: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x p1>) + ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32), [[LOAD6]](s32), [[LOAD7]](s32) + ; CI: [[BITCAST:%[0-9]+]]:_(<4 x p1>) = G_BITCAST [[BUILD_VECTOR]](<8 x s32>) + ; CI: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BITCAST]](<4 x p1>) ; VI-LABEL: name: test_load_private_v4p1_align8 ; VI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 4, align 8, addrspace 5) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; VI: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load 4 + 4, addrspace 5) - ; VI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; VI: 
[[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; VI: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load 4 + 8, align 8, addrspace 5) - ; VI: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; VI: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load 4 + 12, addrspace 5) - ; VI: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD2]](s32), [[LOAD3]](s32) - ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; VI: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p5) :: (load 4 + 16, align 8, addrspace 5) - ; VI: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; VI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; VI: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load 4 + 20, addrspace 5) - ; VI: [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD4]](s32), [[LOAD5]](s32) - ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 - ; VI: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; VI: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) ; VI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p5) :: (load 4 + 24, align 8, addrspace 5) - ; VI: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 28 + ; VI: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C6]](s32) ; VI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load 4 + 28, addrspace 5) - ; VI: [[MV3:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD6]](s32), [[LOAD7]](s32) - ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x p1>) = 
G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1), [[MV2]](p1), [[MV3]](p1) - ; VI: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x p1>) + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32), [[LOAD6]](s32), [[LOAD7]](s32) + ; VI: [[BITCAST:%[0-9]+]]:_(<4 x p1>) = G_BITCAST [[BUILD_VECTOR]](<8 x s32>) + ; VI: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BITCAST]](<4 x p1>) ; GFX9-LABEL: name: test_load_private_v4p1_align8 ; GFX9: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 4, align 8, addrspace 5) ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load 4 + 4, addrspace 5) - ; GFX9: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; GFX9: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) ; GFX9: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load 4 + 8, align 8, addrspace 5) - ; GFX9: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) ; GFX9: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load 4 + 12, addrspace 5) - ; GFX9: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD2]](s32), [[LOAD3]](s32) - ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p5) :: (load 4 + 16, align 8, addrspace 5) - ; GFX9: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; GFX9: 
[[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GFX9: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load 4 + 20, addrspace 5) - ; GFX9: [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD4]](s32), [[LOAD5]](s32) - ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 - ; GFX9: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX9: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) ; GFX9: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p5) :: (load 4 + 24, align 8, addrspace 5) - ; GFX9: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 28 + ; GFX9: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C6]](s32) ; GFX9: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load 4 + 28, addrspace 5) - ; GFX9: [[MV3:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD6]](s32), [[LOAD7]](s32) - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1), [[MV2]](p1), [[MV3]](p1) - ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x p1>) + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32), [[LOAD6]](s32), [[LOAD7]](s32) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<4 x p1>) = G_BITCAST [[BUILD_VECTOR]](<8 x s32>) + ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BITCAST]](<4 x p1>) %0:_(p5) = COPY $vgpr0 %1:_(<4 x p1>) = G_LOAD %0 :: (load 32, align 8, addrspace 5) $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-lshr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-lshr.mir index fb74dc12dfef1..544fa75d2ab46 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-lshr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-lshr.mir @@ -1,7 
+1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=SI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=VI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=SI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=VI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s --- name: test_lshr_s32_s32 @@ -750,6 +750,244 @@ body: | $vgpr0_vgpr1 = COPY %6 ... +--- +name: test_ashr_v3s16_v3s16 +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5 + ; SI-LABEL: name: test_ashr_v3s16_v3s16 + ; SI: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; SI: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; SI: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) + ; SI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; SI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) + ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) + ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; SI: 
[[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; SI: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) + ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; SI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[AND1]], [[AND]](s32) + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; SI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[AND3]], [[AND2]](s32) + ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; SI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] + ; SI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[AND5]], [[AND4]](s32) + ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; SI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C1]] + ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR5]](s32) + ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C1]] + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C]](s32) + ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL]] + ; SI: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; SI: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C1]] + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL 
[[C2]], [[C]](s32) + ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL1]] + ; SI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) + ; SI: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[DEF1]](<2 x s16>) + ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 + ; SI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; SI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; SI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; SI: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; SI: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; SI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; SI: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; SI: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; SI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV8]](<3 x s16>), 0 + ; SI: [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT3]](<4 x s16>) + ; SI: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV14]](<2 x s16>) + ; SI: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32) + ; SI: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[UV15]](<2 x s16>) + ; SI: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST9]], [[C]](s32) + ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; SI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C1]] + ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR7]](s32) + ; SI: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]] + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL 
[[AND10]], [[C]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND9]], [[SHL2]] + ; SI: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; SI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C1]] + ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST8]](s32) + ; SI: [[AND12:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C1]] + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND12]], [[C]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND11]], [[SHL3]] + ; SI: [[BITCAST11:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR9]](s32) + ; SI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C1]] + ; SI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST9]](s32) + ; SI: [[AND14:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C1]] + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND14]], [[C]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND13]], [[SHL4]] + ; SI: [[BITCAST12:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; SI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST10]](<2 x s16>), [[BITCAST11]](<2 x s16>), [[BITCAST12]](<2 x s16>) + ; SI: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) + ; VI-LABEL: name: test_ashr_v3s16_v3s16 + ; VI: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; VI: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; VI: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) + ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; VI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) + ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT 
i32 16 + ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) + ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; VI: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) + ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) + ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; VI: [[LSHR4:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC]], [[TRUNC3]](s16) + ; VI: [[LSHR5:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[TRUNC4]](s16) + ; VI: [[LSHR6:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC2]], [[TRUNC5]](s16) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LSHR4]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[LSHR5]](s16) + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] + ; VI: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[LSHR6]](s16) + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32) + ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] + ; VI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) + ; VI: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[DEF1]](<2 x s16>) + ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = 
G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 + ; VI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; VI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; VI: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; VI: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; VI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; VI: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; VI: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; VI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV8]](<3 x s16>), 0 + ; VI: [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT3]](<4 x s16>) + ; VI: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV14]](<2 x s16>) + ; VI: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32) + ; VI: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[UV15]](<2 x s16>) + ; VI: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST9]], [[C]](s32) + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR7]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL2]] + ; VI: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]] + ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST8]](s32) + ; VI: 
[[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C2]] + ; VI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) + ; VI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL3]] + ; VI: [[BITCAST11:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR9]](s32) + ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C2]] + ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST9]](s32) + ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]] + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL4]] + ; VI: [[BITCAST12:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST10]](<2 x s16>), [[BITCAST11]](<2 x s16>), [[BITCAST12]](<2 x s16>) + ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) + ; GFX9-LABEL: name: test_ashr_v3s16_v3s16 + ; GFX9: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; GFX9: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) + ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; GFX9: [[EXTRACT:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 0 + ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; GFX9: [[EXTRACT1:%[0-9]+]]:_(s16) = G_EXTRACT [[INSERT1]](<4 x s16>), 32 + ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; GFX9: [[EXTRACT2:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT2]](<4 x s16>), 0 + ; GFX9: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; GFX9: [[EXTRACT3:%[0-9]+]]:_(s16) = G_EXTRACT [[INSERT3]](<4 x s16>), 32 + ; GFX9: [[LSHR:%[0-9]+]]:_(<2 x 
s16>) = G_LSHR [[EXTRACT]], [[EXTRACT2]](<2 x s16>) + ; GFX9: [[LSHR1:%[0-9]+]]:_(s16) = G_LSHR [[EXTRACT1]], [[EXTRACT3]](s16) + ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF1]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; GFX9: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; GFX9: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV4]](<3 x s16>), 0 + ; GFX9: [[INSERT5:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT4]], [[LSHR]](<2 x s16>), 0 + ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT5]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; GFX9: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX9: [[INSERT6:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV8]](<3 x s16>), 0 + ; GFX9: [[INSERT7:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT6]], [[LSHR1]](s16), 32 + ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT7]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; GFX9: [[UV12:%[0-9]+]]:_(<3 x s16>), [[UV13:%[0-9]+]]:_(<3 x s16>), [[UV14:%[0-9]+]]:_(<3 x s16>), [[UV15:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; GFX9: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[CONCAT_VECTORS3:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; GFX9: [[UV16:%[0-9]+]]:_(<3 x s16>), [[UV17:%[0-9]+]]:_(<3 x s16>), [[UV18:%[0-9]+]]:_(<3 x s16>), [[UV19:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<12 x s16>) + ; GFX9: [[INSERT8:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV12]](<3 x s16>), 0 + ; GFX9: [[UV20:%[0-9]+]]:_(<2 x s16>), [[UV21:%[0-9]+]]:_(<2 x 
s16>) = G_UNMERGE_VALUES [[INSERT8]](<4 x s16>) + ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV20]](<2 x s16>) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV21]](<2 x s16>) + ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[INSERT9:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV16]](<3 x s16>), 0 + ; GFX9: [[UV22:%[0-9]+]]:_(<2 x s16>), [[UV23:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT9]](<4 x s16>) + ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV22]](<2 x s16>) + ; GFX9: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV23]](<2 x s16>) + ; GFX9: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32) + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[COPY5]](s32) + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[CONCAT_VECTORS4:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS4]](<6 x s16>) + %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + %1:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + %2:_(<3 x s16>), %3:_(<3 x s16>) = G_UNMERGE_VALUES %0 + %4:_(<3 x s16>), %5:_(<3 x s16>) = G_UNMERGE_VALUES %1 + %6:_(<3 x s16>) = 
G_LSHR %2, %4 + %7:_(<3 x s16>) = G_IMPLICIT_DEF + %8:_(<6 x s16>) = G_CONCAT_VECTORS %6, %7 + $vgpr0_vgpr1_vgpr2 = COPY %8 +... + --- name: test_lshr_v4s16_v4s16 body: | diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-merge-values-build-vector.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-merge-values-build-vector.mir index df966affcf264..ae0573ab8272b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-merge-values-build-vector.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-merge-values-build-vector.mir @@ -23,8 +23,8 @@ body: | ; CHECK-LABEL: name: test_merge_s32_s32_v2s32 ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; CHECK: [[MV:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C1]](s32) - ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](<2 x s32>) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C1]](s32) + ; CHECK: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(s32) = G_CONSTANT i32 0 %1:_(s32) = G_CONSTANT i32 1 %2:_(<2 x s32>) = G_BUILD_VECTOR %0:_(s32), %1:_(s32) @@ -39,8 +39,8 @@ body: | ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; CHECK: [[MV:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C1]](s32), [[C2]](s32) - ; CHECK: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](<3 x s32>) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C1]](s32), [[C2]](s32) + ; CHECK: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) %0:_(s32) = G_CONSTANT i32 0 %1:_(s32) = G_CONSTANT i32 1 %2:_(s32) = G_CONSTANT i32 2 @@ -55,8 +55,8 @@ body: | ; CHECK-LABEL: name: test_merge_s64_s64_s128 ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; CHECK: [[MV:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C1]](s64) - ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](<2 x s64>) + ; 
CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C1]](s64) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) %0:_(s64) = G_CONSTANT i64 0 %1:_(s64) = G_CONSTANT i64 1 %2:_(<2 x s64>) = G_BUILD_VECTOR %0(s64), %1(s64) @@ -72,8 +72,8 @@ body: | ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CHECK: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 - ; CHECK: [[MV:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C1]](s64), [[C2]](s64), [[C3]](s64) - ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[MV]](<4 x s64>) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C1]](s64), [[C2]](s64), [[C3]](s64) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>) %0:_(s64) = G_CONSTANT i64 0 %1:_(s64) = G_CONSTANT i64 1 %2:_(s64) = G_CONSTANT i64 2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-or.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-or.mir index 1cfc08d677fa6..7cb80a7a62cbf 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-or.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-or.mir @@ -435,28 +435,67 @@ body: | name: test_or_v3s16 body: | bb.0: - + liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5 ; CHECK-LABEL: name: test_or_v3s16 + ; CHECK: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; CHECK: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) - ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), 
[[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) - ; CHECK: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) - ; CHECK: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) - ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 - ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 + ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 ; CHECK: [[OR:%[0-9]+]]:_(<4 x s16>) = G_OR [[INSERT]], [[INSERT1]] - ; CHECK: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[OR]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) - ; CHECK: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) - ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 - ; CHECK: $vgpr0_vgpr1 = COPY [[INSERT2]](<4 x s16>) - %0:_(<3 x s16>) = G_IMPLICIT_DEF - %1:_(<3 x s16>) = G_IMPLICIT_DEF - %2:_(<3 x s16>) = G_OR %0, %1 - %4:_(<4 x s16>) = G_IMPLICIT_DEF - %5:_(<4 x s16>) = G_INSERT %4, %2, 0 - $vgpr0_vgpr1 = COPY %5 + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[OR]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; CHECK: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; CHECK: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; CHECK: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) 
= G_CONCAT_VECTORS [[DEF1]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; CHECK: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV4]](<3 x s16>), 0 + ; CHECK: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; CHECK: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV8]](<3 x s16>), 0 + ; CHECK: [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT3]](<4 x s16>) + ; CHECK: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV14]](<2 x s16>) + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; CHECK: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV15]](<2 x s16>) + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], 
[[C1]] + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CHECK: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C]](s32) + ; CHECK: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; CHECK: [[BITCAST6:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; CHECK: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[BITCAST6]](<2 x s16>) + ; CHECK: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) + %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + %1:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + %2:_(<3 x s16>), %3:_(<3 x s16>) = G_UNMERGE_VALUES %0 + %4:_(<3 x s16>), %5:_(<3 x s16>) = G_UNMERGE_VALUES %1 + %6:_(<3 x s16>) = G_OR %2, %4 + %7:_(<3 x s16>) = G_IMPLICIT_DEF + %8:_(<6 x s16>) = G_CONCAT_VECTORS %6, %7 + $vgpr0_vgpr1_vgpr2 = COPY %8 + ... 
--- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-phi.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-phi.mir index fa68d25b8ebbb..974c33a4f0040 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-phi.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-phi.mir @@ -175,20 +175,20 @@ body: | ; CHECK: G_BR %bb.2 ; CHECK: bb.2: ; CHECK: [[PHI:%[0-9]+]]:_(<4 x s16>) = G_PHI [[INSERT]](<4 x s16>), %bb.0, [[INSERT3]](<4 x s16>), %bb.1 - ; CHECK: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[PHI]](<4 x s16>), [[DEF2]](<4 x s16>), [[DEF2]](<4 x s16>) - ; CHECK: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) ; CHECK: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF3]](<4 x s16>), [[DEF2]](<4 x s16>), [[DEF2]](<4 x s16>) + ; CHECK: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[PHI]](<4 x s16>), [[DEF3]](<4 x s16>), [[DEF3]](<4 x s16>) + ; CHECK: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; CHECK: [[DEF4:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; CHECK: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF4]](<4 x s16>), [[DEF3]](<4 x s16>), [[DEF3]](<4 x s16>) ; CHECK: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) - ; CHECK: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[UV4]](<3 x s16>), 0 + ; CHECK: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF3]], [[UV4]](<3 x s16>), 0 ; CHECK: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT4]](<4 x s16>) ; CHECK: 
[[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; CHECK: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C4]](s32) ; CHECK: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) ; CHECK: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C4]](s32) - ; CHECK: [[INSERT5:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[UV8]](<3 x s16>), 0 + ; CHECK: [[INSERT5:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF3]], [[UV8]](<3 x s16>), 0 ; CHECK: [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT5]](<4 x s16>) ; CHECK: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV14]](<2 x s16>) ; CHECK: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C4]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddo.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddo.mir index 0b26ba2858b43..ea6f916885cd1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddo.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddo.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck %s --- name: test_saddo_s7 @@ -209,26 +209,25 @@ body: | name: test_saddo_v3s16 body: | bb.0: - liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - + liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5 ; CHECK-LABEL: name: test_saddo_v3s16 - ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 - ; CHECK: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr1_vgpr2 - ; CHECK: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY]](<4 x s16>), 0 - ; CHECK: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY]](<4 x s16>), 0 + ; CHECK: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + ; CHECK: 
[[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; CHECK: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; CHECK: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; CHECK: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT1]](<3 x s16>), 0 - ; CHECK: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; CHECK: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; CHECK: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; CHECK: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; CHECK: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; CHECK: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) ; 
CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) @@ -255,38 +254,110 @@ body: | ; CHECK: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; CHECK: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[DEF1]](<2 x s16>) - ; CHECK: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 + ; CHECK: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 ; CHECK: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 0 - ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT1]](<3 x s16>), 0 - ; CHECK: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) - ; CHECK: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) + ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; CHECK: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; CHECK: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) ; CHECK: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) - ; CHECK: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) + ; CHECK: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) ; CHECK: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) - ; CHECK: [[COPY11:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) - ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C1]] - ; CHECK: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) - ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]] - ; CHECK: [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) - ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C1]] - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[AND3]](s32), [[AND4]](s32), [[AND5]](s32) - ; CHECK: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT2]](<3 x s16>), 0 - ; CHECK: [[DEF2:%[0-9]+]]:_(<4 x 
s32>) = G_IMPLICIT_DEF - ; CHECK: [[INSERT4:%[0-9]+]]:_(<4 x s32>) = G_INSERT [[DEF2]], [[BUILD_VECTOR]](<3 x s32>), 0 - ; CHECK: $vgpr0_vgpr1 = COPY [[INSERT3]](<4 x s16>) - ; CHECK: $vgpr2_vgpr3_vgpr4_vgpr5 = COPY [[INSERT4]](<4 x s32>) - %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 - %1:_(<4 x s16>) = COPY $vgpr1_vgpr2 - %2:_(<3 x s16>) = G_EXTRACT %0(<4 x s16>), 0 - %3:_(<3 x s16>) = G_EXTRACT %0(<4 x s16>), 0 - %4:_(<3 x s16>), %5:_(<3 x s1>) = G_SADDO %2, %3 - %6:_(<3 x s32>) = G_ZEXT %3 - %7:_(<4 x s16>) = G_IMPLICIT_DEF - %8:_(<4 x s16>) = G_INSERT %7, %4(<3 x s16>), 0 - %9:_(<4 x s32>) = G_IMPLICIT_DEF - %10:_(<4 x s32>) = G_INSERT %9, %6(<3 x s32>), 0 - $vgpr0_vgpr1 = COPY %8 - $vgpr2_vgpr3_vgpr4_vgpr5 = COPY %10 + ; CHECK: [[COPY11:%[0-9]+]]:_(s32) = COPY [[ADD]](s32) + ; CHECK: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY11]], 16 + ; CHECK: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; CHECK: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY12]], 16 + ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG]](s32), [[SEXT_INREG1]] + ; CHECK: [[COPY13:%[0-9]+]]:_(s32) = COPY [[ADD1]](s32) + ; CHECK: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY13]], 16 + ; CHECK: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; CHECK: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY14]], 16 + ; CHECK: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG2]](s32), [[SEXT_INREG3]] + ; CHECK: [[COPY15:%[0-9]+]]:_(s32) = COPY [[ADD2]](s32) + ; CHECK: [[SEXT_INREG4:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY15]], 16 + ; CHECK: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; CHECK: [[SEXT_INREG5:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY16]], 16 + ; CHECK: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG4]](s32), [[SEXT_INREG5]] + ; CHECK: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; CHECK: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT3]](<4 x s16>) + 
; CHECK: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) + ; CHECK: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32) + ; CHECK: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) + ; CHECK: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST9]], [[C]](s32) + ; CHECK: [[COPY17:%[0-9]+]]:_(s32) = COPY [[BITCAST8]](s32) + ; CHECK: [[SEXT_INREG6:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY17]], 16 + ; CHECK: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[C3]](s16) + ; CHECK: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG6]](s32), [[SEXT]] + ; CHECK: [[COPY18:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; CHECK: [[SEXT_INREG7:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY18]], 16 + ; CHECK: [[SEXT1:%[0-9]+]]:_(s32) = G_SEXT [[C3]](s16) + ; CHECK: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG7]](s32), [[SEXT1]] + ; CHECK: [[COPY19:%[0-9]+]]:_(s32) = COPY [[BITCAST9]](s32) + ; CHECK: [[SEXT_INREG8:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY19]], 16 + ; CHECK: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[C3]](s16) + ; CHECK: [[ICMP5:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG8]](s32), [[SEXT2]] + ; CHECK: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP]] + ; CHECK: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP4]], [[ICMP1]] + ; CHECK: [[XOR2:%[0-9]+]]:_(s1) = G_XOR [[ICMP5]], [[ICMP2]] + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR]](s1) + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR1]](s1) + ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR2]](s1) + ; CHECK: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; CHECK: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; CHECK: [[UV12:%[0-9]+]]:_(<3 x s16>), [[UV13:%[0-9]+]]:_(<3 x s16>), [[UV14:%[0-9]+]]:_(<3 x s16>), [[UV15:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; CHECK: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; CHECK: [[UV16:%[0-9]+]]:_(<2 x s16>), 
[[UV17:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT4]](<4 x s16>) + ; CHECK: [[BITCAST10:%[0-9]+]]:_(s32) = G_BITCAST [[UV16]](<2 x s16>) + ; CHECK: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST10]], [[C]](s32) + ; CHECK: [[BITCAST11:%[0-9]+]]:_(s32) = G_BITCAST [[UV17]](<2 x s16>) + ; CHECK: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST11]], [[C]](s32) + ; CHECK: [[INSERT5:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV12]](<3 x s16>), 0 + ; CHECK: [[UV18:%[0-9]+]]:_(<2 x s16>), [[UV19:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT5]](<4 x s16>) + ; CHECK: [[BITCAST12:%[0-9]+]]:_(s32) = G_BITCAST [[UV18]](<2 x s16>) + ; CHECK: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST12]], [[C]](s32) + ; CHECK: [[BITCAST13:%[0-9]+]]:_(s32) = G_BITCAST [[UV19]](<2 x s16>) + ; CHECK: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST13]], [[C]](s32) + ; CHECK: [[COPY20:%[0-9]+]]:_(s32) = COPY [[BITCAST10]](s32) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY20]], [[C1]] + ; CHECK: [[COPY21:%[0-9]+]]:_(s32) = COPY [[LSHR8]](s32) + ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY21]], [[C1]] + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND4]], [[C]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND3]], [[SHL2]] + ; CHECK: [[BITCAST14:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; CHECK: [[COPY22:%[0-9]+]]:_(s32) = COPY [[BITCAST11]](s32) + ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY22]], [[C1]] + ; CHECK: [[COPY23:%[0-9]+]]:_(s32) = COPY [[BITCAST12]](s32) + ; CHECK: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY23]], [[C1]] + ; CHECK: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C]](s32) + ; CHECK: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND5]], [[SHL3]] + ; CHECK: [[BITCAST15:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; CHECK: [[COPY24:%[0-9]+]]:_(s32) = COPY [[LSHR10]](s32) + ; CHECK: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY24]], [[C1]] + ; CHECK: [[COPY25:%[0-9]+]]:_(s32) = COPY [[BITCAST13]](s32) + ; CHECK: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY25]], [[C1]] + ; CHECK: 
[[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND8]], [[C]](s32) + ; CHECK: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND7]], [[SHL4]] + ; CHECK: [[BITCAST16:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; CHECK: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST14]](<2 x s16>), [[BITCAST15]](<2 x s16>), [[BITCAST16]](<2 x s16>) + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK: [[COPY26:%[0-9]+]]:_(s32) = COPY [[ANYEXT]](s32) + ; CHECK: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY26]], [[C4]] + ; CHECK: [[COPY27:%[0-9]+]]:_(s32) = COPY [[ANYEXT1]](s32) + ; CHECK: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY27]], [[C4]] + ; CHECK: [[COPY28:%[0-9]+]]:_(s32) = COPY [[ANYEXT2]](s32) + ; CHECK: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY28]], [[C4]] + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[AND9]](s32), [[AND10]](s32), [[AND11]](s32) + ; CHECK: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) + ; CHECK: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) + %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + %1:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + %2:_(<3 x s16>), %3:_(<3 x s16>) = G_UNMERGE_VALUES %0 + %4:_(<3 x s16>), %5:_(<3 x s16>) = G_UNMERGE_VALUES %1 + %6:_(<3 x s16>), %7:_(<3 x s1>) = G_SADDO %2, %4 + %8:_(<3 x s16>) = G_IMPLICIT_DEF + %9:_(<6 x s16>) = G_CONCAT_VECTORS %6, %8 + %10:_(<3 x s32>) = G_ZEXT %7 + $vgpr0_vgpr1_vgpr2 = COPY %9 + $vgpr0_vgpr1_vgpr2 = COPY %10 ... 
--- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-select.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-select.mir index 094a38ed5fca7..521fc08e12e24 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-select.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-select.mir @@ -387,35 +387,72 @@ body: | name: test_select_v3s16 body: | bb.0: - liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 + liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5, $vgpr6 ; CHECK-LABEL: name: test_select_v3s16 - ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; CHECK: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr1_vgpr2 - ; CHECK: [[COPY2:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr3_vgpr4 + ; CHECK: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr6 ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[C]] - ; CHECK: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY1]](<4 x s16>), 0 - ; CHECK: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY2]](<4 x s16>), 0 + ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY2]](s32), [[C]] + ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; CHECK: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT1]](<3 x s16>), 0 + ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 ; CHECK: [[SELECT:%[0-9]+]]:_(<4 x s16>) = G_SELECT [[ICMP]](s1), [[INSERT]], [[INSERT1]] ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = 
G_CONCAT_VECTORS [[SELECT]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) - ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) - ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 - ; CHECK: $vgpr0_vgpr1 = COPY [[INSERT2]](<4 x s16>) - %0:_(s32) = COPY $vgpr0 - %1:_(<4 x s16>) = COPY $vgpr1_vgpr2 - %2:_(<4 x s16>) = COPY $vgpr3_vgpr4 - %4:_(s32) = G_CONSTANT i32 0 - %5:_(s1) = G_ICMP intpred(ne), %0, %4 - - %6:_(<3 x s16>) = G_EXTRACT %1, 0 - %7:_(<3 x s16>) = G_EXTRACT %2, 0 - %8:_(<3 x s16>) = G_SELECT %5, %6, %7 - %9:_(<4 x s16>) = G_IMPLICIT_DEF - %10:_(<4 x s16>) = G_INSERT %9, %8, 0 - $vgpr0_vgpr1 = COPY %10 + ; CHECK: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; CHECK: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; CHECK: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF1]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; CHECK: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV4]](<3 x s16>), 0 + ; CHECK: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) + ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; CHECK: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV8]](<3 x s16>), 0 + ; CHECK: [[UV14:%[0-9]+]]:_(<2 x s16>), 
[[UV15:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT3]](<4 x s16>) + ; CHECK: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV14]](<2 x s16>) + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C1]](s32) + ; CHECK: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV15]](<2 x s16>) + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]] + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C2]] + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C2]] + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C1]](s32) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CHECK: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]] + ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C2]] + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C1]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; CHECK: [[BITCAST6:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; CHECK: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[BITCAST6]](<2 x s16>) + ; CHECK: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) + %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + %1:_(<6 x s16>) = COPY 
$vgpr3_vgpr4_vgpr5 + %2:_(s32) = COPY $vgpr6 + %3:_(s32) = G_CONSTANT i32 0 + %4:_(s1) = G_ICMP intpred(ne), %2, %3 + %5:_(<3 x s16>), %6:_(<3 x s16>) = G_UNMERGE_VALUES %0 + %7:_(<3 x s16>), %8:_(<3 x s16>) = G_UNMERGE_VALUES %1 + %9:_(<3 x s16>) = G_SELECT %4, %5, %7 + %10:_(<3 x s16>) = G_IMPLICIT_DEF + %11:_(<6 x s16>) = G_CONCAT_VECTORS %9, %10 + $vgpr0_vgpr1_vgpr2 = COPY %11 ... @@ -1247,63 +1284,103 @@ body: | bb.0: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5, $vgpr6_vgpr7_vgpr8, $vgpr9_vgpr10_vgpr11 ; CHECK-LABEL: name: test_vselect_v3s16 - ; CHECK: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 - ; CHECK: [[COPY1:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5 + ; CHECK: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 ; CHECK: [[COPY2:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr6_vgpr7_vgpr8 ; CHECK: [[COPY3:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr9_vgpr10_vgpr11 - ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) - ; CHECK: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<3 x s32>) + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<3 x s32>) + ; CHECK: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY3]](<3 x s32>) ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV]](s32), [[UV3]] ; CHECK: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV1]](s32), [[UV4]] ; CHECK: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV2]](s32), [[UV5]] - ; CHECK: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<3 x s32>) - ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV6]](s32) - ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[UV7]](s32) - ; CHECK: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[UV8]](s32) - ; CHECK: 
[[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY3]](<3 x s32>) - ; CHECK: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[UV9]](s32) - ; CHECK: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[UV10]](s32) - ; CHECK: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[UV11]](s32) + ; CHECK: [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; CHECK: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) + ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV6]](<3 x s16>), 0 + ; CHECK: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) + ; CHECK: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV8]](<3 x s16>), 0 + ; CHECK: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; CHECK: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; CHECK: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; CHECK: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; CHECK: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; CHECK: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; CHECK: [[SELECT:%[0-9]+]]:_(s16) = 
G_SELECT [[ICMP]](s1), [[TRUNC]], [[TRUNC3]] ; CHECK: [[SELECT1:%[0-9]+]]:_(s16) = G_SELECT [[ICMP1]](s1), [[TRUNC1]], [[TRUNC4]] ; CHECK: [[SELECT2:%[0-9]+]]:_(s16) = G_SELECT [[ICMP2]](s1), [[TRUNC2]], [[TRUNC5]] ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[SELECT]](s16) ; CHECK: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[SELECT1]](s16) - ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] - ; CHECK: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; CHECK: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; CHECK: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[SELECT2]](s16) ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32) ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] - ; CHECK: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[DEF]](<2 x s16>) + ; CHECK: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) + ; CHECK: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[DEF1]](<2 x s16>) ; CHECK: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 - ; CHECK: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 - ; CHECK: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; CHECK: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) - ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; CHECK: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) - ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = 
G_LSHR [[BITCAST3]], [[C]](s32) - ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32) - ; CHECK: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) - %0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 - %1:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5 + ; CHECK: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; CHECK: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; CHECK: [[UV14:%[0-9]+]]:_(<3 x s16>), [[UV15:%[0-9]+]]:_(<3 x s16>), [[UV16:%[0-9]+]]:_(<3 x s16>), [[UV17:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; CHECK: [[UV18:%[0-9]+]]:_(<2 x s16>), [[UV19:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; CHECK: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV18]](<2 x s16>) + ; CHECK: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; CHECK: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV19]](<2 x s16>) + ; CHECK: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; CHECK: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV14]](<3 x s16>), 0 + ; CHECK: [[UV20:%[0-9]+]]:_(<2 x s16>), [[UV21:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT3]](<4 x s16>) + ; CHECK: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV20]](<2 x s16>) + ; CHECK: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32) + ; CHECK: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[UV21]](<2 x s16>) + ; CHECK: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST9]], [[C]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]] + 
; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C2]] + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL2]] + ; CHECK: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C2]] + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST8]](s32) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]] + ; CHECK: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) + ; CHECK: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL3]] + ; CHECK: [[BITCAST11:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C2]] + ; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST9]](s32) + ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C2]] + ; CHECK: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C]](s32) + ; CHECK: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL4]] + ; CHECK: [[BITCAST12:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; CHECK: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST10]](<2 x s16>), [[BITCAST11]](<2 x s16>), [[BITCAST12]](<2 x s16>) + ; CHECK: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) + %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + %1:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 %2:_(<3 x s32>) = COPY $vgpr6_vgpr7_vgpr8 %3:_(<3 x s32>) = COPY $vgpr9_vgpr10_vgpr11 - %4:_(<3 x s1>) = G_ICMP intpred(ne), %0, %1 - %5:_(<3 x s16>) = G_TRUNC %2 - %6:_(<3 x s16>) = G_TRUNC %3 - %7:_(<3 x s16>) = G_SELECT %4, %5, %6 - %8:_(<3 x s32>) = G_ANYEXT %7 - $vgpr0_vgpr1_vgpr2 = COPY %8 - + %4:_(<3 x s1>) = G_ICMP intpred(ne), %2, %3 + %5:_(<3 x s16>), %6:_(<3 x s16>) = G_UNMERGE_VALUES %0 + %7:_(<3 x s16>), %8:_(<3 x s16>) = G_UNMERGE_VALUES %1 + %9:_(<3 x s16>) = G_SELECT %4, %5, %7 + %10:_(<3 
x s16>) = G_IMPLICIT_DEF + %11:_(<6 x s16>) = G_CONCAT_VECTORS %9, %10 + $vgpr0_vgpr1_vgpr2 = COPY %11 ... --- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sext-inreg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sext-inreg.mir index b6abe04c34266..01a32f3bb9068 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sext-inreg.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sext-inreg.mir @@ -691,18 +691,17 @@ body: | name: test_sext_inreg_v3s16_1 body: | bb.0: - liveins: $vgpr0_vgpr1 - + liveins: $vgpr0_vgpr1_vgpr2 ; GFX9-LABEL: name: test_sext_inreg_v3s16_1 - ; GFX9: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 - ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY]](<4 x s16>), 0 + ; GFX9: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; GFX9: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) @@ -727,21 +726,45 @@ body: | ; GFX9: [[SHL2:%[0-9]+]]:_(<2 x s16>) = G_SHL [[DEF2]], 
[[BUILD_VECTOR_TRUNC4]](<2 x s16>) ; GFX9: [[ASHR2:%[0-9]+]]:_(<2 x s16>) = G_ASHR [[SHL2]], [[BUILD_VECTOR_TRUNC4]](<2 x s16>) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[ASHR]](<2 x s16>), [[ASHR1]](<2 x s16>), [[ASHR2]](<2 x s16>) - ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 - ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[COPY]], [[EXTRACT1]](<3 x s16>), 0 - ; GFX9: $vgpr0_vgpr1 = COPY [[INSERT1]](<4 x s16>) + ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 + ; GFX9: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF3]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; GFX9: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; GFX9: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) + ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV4]](<3 x s16>), 0 + ; GFX9: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; GFX9: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) + ; GFX9: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; GFX9: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) + ; GFX9: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = 
COPY [[LSHR2]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32) + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY11]](s32), [[COPY12]](s32) + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC7:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) + ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC5]](<2 x s16>), [[BUILD_VECTOR_TRUNC6]](<2 x s16>), [[BUILD_VECTOR_TRUNC7]](<2 x s16>) + ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) ; GFX8-LABEL: name: test_sext_inreg_v3s16_1 - ; GFX8: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 - ; GFX8: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY]](<4 x s16>), 0 + ; GFX8: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; GFX8: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) ; GFX8: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX8: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; GFX8: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX8: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; GFX8: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) ; GFX8: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX8: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC 
[[LSHR]](s32) - ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) ; GFX8: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; GFX8: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 @@ -763,19 +786,56 @@ body: | ; GFX8: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; GFX8: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; GFX8: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[DEF1]](<2 x s16>) - ; GFX8: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 - ; GFX8: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[COPY]], [[EXTRACT1]](<3 x s16>), 0 - ; GFX8: $vgpr0_vgpr1 = COPY [[INSERT1]](<4 x s16>) + ; GFX8: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 + ; GFX8: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX8: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; GFX8: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX8: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; GFX8: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; GFX8: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) + ; GFX8: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; GFX8: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) + ; GFX8: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; GFX8: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV4]](<3 x s16>), 0 + ; GFX8: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; GFX8: 
[[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) + ; GFX8: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; GFX8: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) + ; GFX8: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; GFX8: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX8: [[COPY1:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX8: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; GFX8: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL5]] + ; GFX8: [[BITCAST8:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; GFX8: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX8: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; GFX8: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX8: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) + ; GFX8: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL6]] + ; GFX8: [[BITCAST9:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; GFX8: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; GFX8: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; GFX8: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; GFX8: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; GFX8: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C]](s32) + ; GFX8: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL7]] + ; GFX8: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; GFX8: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST8]](<2 x s16>), [[BITCAST9]](<2 x s16>), [[BITCAST10]](<2 x s16>) + ; GFX8: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) ; GFX6-LABEL: name: test_sext_inreg_v3s16_1 - ; GFX6: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 - ; GFX6: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY]](<4 x 
s16>), 0 + ; GFX6: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; GFX6: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) ; GFX6: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX6: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; GFX6: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; GFX6: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX6: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; GFX6: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; GFX6: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX6: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX6: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX6: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) ; GFX6: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) ; GFX6: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 1 @@ -799,14 +859,51 @@ body: | ; GFX6: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; GFX6: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; GFX6: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[DEF1]](<2 x s16>) - ; GFX6: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 - ; GFX6: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[COPY]], [[EXTRACT1]](<3 x s16>), 0 - ; GFX6: $vgpr0_vgpr1 = COPY [[INSERT1]](<4 x s16>) - %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 - %1:_(<3 x s16>) = G_EXTRACT %0, 0 - %2:_(<3 x s16>) = G_SEXT_INREG %1, 1 - %3:_(<4 x s16>) = G_INSERT %0, %2, 0 - $vgpr0_vgpr1 = COPY %3 + ; GFX6: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 + ; GFX6: 
[[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX6: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; GFX6: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX6: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; GFX6: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; GFX6: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) + ; GFX6: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; GFX6: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) + ; GFX6: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; GFX6: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV4]](<3 x s16>), 0 + ; GFX6: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; GFX6: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) + ; GFX6: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; GFX6: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) + ; GFX6: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; GFX6: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] + ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX6: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C1]] + ; GFX6: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND4]], [[C]](s32) + ; GFX6: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND3]], [[SHL2]] + ; GFX6: [[BITCAST8:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; GFX6: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C1]] + ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; GFX6: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C1]] + ; GFX6: 
[[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C]](s32) + ; GFX6: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND5]], [[SHL3]] + ; GFX6: [[BITCAST9:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; GFX6: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C1]] + ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; GFX6: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]] + ; GFX6: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND8]], [[C]](s32) + ; GFX6: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND7]], [[SHL4]] + ; GFX6: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; GFX6: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST8]](<2 x s16>), [[BITCAST9]](<2 x s16>), [[BITCAST10]](<2 x s16>) + ; GFX6: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) + %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + %1:_(<3 x s16>), %2:_(<3 x s16>) = G_UNMERGE_VALUES %0 + %3:_(<3 x s16>) = G_SEXT_INREG %1, 1 + %4:_(<3 x s16>) = G_IMPLICIT_DEF + %5:_(<6 x s16>) = G_CONCAT_VECTORS %3, %4 + $vgpr0_vgpr1_vgpr2 = COPY %5 ... 
--- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shl.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shl.mir index d44700a84232f..b45f164a86267 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shl.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shl.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer %s -o - | FileCheck -check-prefix=SI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer %s -o - | FileCheck -check-prefix=VI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=SI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_shl_s32_s32 @@ -600,26 +600,25 @@ body: | name: test_shl_v3s16_v3s16 body: | bb.0: - liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - + liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5 ; SI-LABEL: name: test_shl_v3s16_v3s16 - ; SI: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 - ; SI: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 - ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY]](<4 x s16>), 0 - ; SI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY1]](<4 x s16>), 0 + ; SI: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; SI: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; SI: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) ; SI: 
[[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; SI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; SI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT1]](<3 x s16>), 0 - ; SI: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; SI: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) @@ -649,32 +648,68 @@ body: | ; SI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; SI: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), 
[[BITCAST5]](<2 x s16>), [[DEF1]](<2 x s16>) - ; SI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 - ; SI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT2]](<3 x s16>), 0 - ; SI: $vgpr0_vgpr1 = COPY [[INSERT2]](<4 x s16>) + ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 + ; SI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; SI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; SI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; SI: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; SI: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; SI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; SI: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; SI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; SI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV8]](<3 x s16>), 0 + ; SI: [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT3]](<4 x s16>) + ; SI: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV14]](<2 x s16>) + ; SI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32) + ; SI: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[UV15]](<2 x s16>) + ; SI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST9]], [[C]](s32) + ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; SI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C1]] + ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]] + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND6]], 
[[SHL5]] + ; SI: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; SI: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C1]] + ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST8]](s32) + ; SI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C1]] + ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL6]] + ; SI: [[BITCAST11:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; SI: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C1]] + ; SI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST9]](s32) + ; SI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C1]] + ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL7]] + ; SI: [[BITCAST12:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; SI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST10]](<2 x s16>), [[BITCAST11]](<2 x s16>), [[BITCAST12]](<2 x s16>) + ; SI: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) ; VI-LABEL: name: test_shl_v3s16_v3s16 - ; VI: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 - ; VI: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 - ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY]](<4 x s16>), 0 - ; VI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY1]](<4 x s16>), 0 + ; VI: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; VI: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; VI: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; VI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = 
G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; VI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT1]](<3 x s16>), 0 - ; VI: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; VI: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) - ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; VI: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[TRUNC3]](s16) @@ -692,46 +727,109 @@ body: | ; VI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; VI: [[DEF1:%[0-9]+]]:_(<2 x 
s16>) = G_IMPLICIT_DEF ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[DEF1]](<2 x s16>) - ; VI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 - ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT2]](<3 x s16>), 0 - ; VI: $vgpr0_vgpr1 = COPY [[INSERT2]](<4 x s16>) + ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 + ; VI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; VI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; VI: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; VI: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; VI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; VI: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; VI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; VI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV8]](<3 x s16>), 0 + ; VI: [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT3]](<4 x s16>) + ; VI: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV14]](<2 x s16>) + ; VI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32) + ; VI: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[UV15]](<2 x s16>) + ; VI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST9]], [[C]](s32) + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; VI: 
[[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL5]] + ; VI: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]] + ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST8]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C2]] + ; VI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) + ; VI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL6]] + ; VI: [[BITCAST11:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C2]] + ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST9]](s32) + ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]] + ; VI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL7]] + ; VI: [[BITCAST12:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST10]](<2 x s16>), [[BITCAST11]](<2 x s16>), [[BITCAST12]](<2 x s16>) + ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) ; GFX9-LABEL: name: test_shl_v3s16_v3s16 - ; GFX9: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 - ; GFX9: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 - ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY]](<4 x s16>), 0 - ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY1]](<4 x s16>), 0 + ; GFX9: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; GFX9: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = 
G_IMPLICIT_DEF - ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; GFX9: [[EXTRACT2:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 0 - ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; GFX9: [[EXTRACT3:%[0-9]+]]:_(s16) = G_EXTRACT [[INSERT1]](<4 x s16>), 32 - ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT1]](<3 x s16>), 0 - ; GFX9: [[EXTRACT4:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT2]](<4 x s16>), 0 - ; GFX9: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT1]](<3 x s16>), 0 - ; GFX9: [[EXTRACT5:%[0-9]+]]:_(s16) = G_EXTRACT [[INSERT3]](<4 x s16>), 32 - ; GFX9: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[EXTRACT2]], [[EXTRACT4]](<2 x s16>) - ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[EXTRACT3]], [[EXTRACT5]](s16) + ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; GFX9: [[EXTRACT:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 0 + ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; GFX9: [[EXTRACT1:%[0-9]+]]:_(s16) = G_EXTRACT [[INSERT1]](<4 x s16>), 32 + ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; GFX9: [[EXTRACT2:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT2]](<4 x s16>), 0 + ; GFX9: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; GFX9: [[EXTRACT3:%[0-9]+]]:_(s16) = G_EXTRACT [[INSERT3]](<4 x s16>), 32 + ; GFX9: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[EXTRACT]], [[EXTRACT2]](<2 x s16>) + ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[EXTRACT1]], [[EXTRACT3]](s16) ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF1]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) - ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES 
[[CONCAT_VECTORS]](<12 x s16>) - ; GFX9: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; GFX9: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; GFX9: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV4]](<3 x s16>), 0 ; GFX9: [[INSERT5:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT4]], [[SHL]](<2 x s16>), 0 ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT5]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) - ; GFX9: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) - ; GFX9: [[INSERT6:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV4]](<3 x s16>), 0 + ; GFX9: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX9: [[INSERT6:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV8]](<3 x s16>), 0 ; GFX9: [[INSERT7:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT6]], [[SHL1]](s16), 32 ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT7]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) - ; GFX9: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) - ; GFX9: [[INSERT8:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV8]](<3 x s16>), 0 - ; GFX9: $vgpr0_vgpr1 = COPY [[INSERT8]](<4 x s16>) - %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 - %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 - %2:_(<3 x s16>) = G_EXTRACT %0, 0 - %3:_(<3 x s16>) = G_EXTRACT %1, 0 - %4:_(<3 x s16>) = G_SHL %2, %3 - %5:_(<4 x s16>) = G_IMPLICIT_DEF - %6:_(<4 x s16>) = G_INSERT %5, %4, 0 - $vgpr0_vgpr1 = COPY %6 + ; GFX9: [[UV12:%[0-9]+]]:_(<3 x s16>), 
[[UV13:%[0-9]+]]:_(<3 x s16>), [[UV14:%[0-9]+]]:_(<3 x s16>), [[UV15:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; GFX9: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[CONCAT_VECTORS3:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; GFX9: [[UV16:%[0-9]+]]:_(<3 x s16>), [[UV17:%[0-9]+]]:_(<3 x s16>), [[UV18:%[0-9]+]]:_(<3 x s16>), [[UV19:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<12 x s16>) + ; GFX9: [[INSERT8:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV12]](<3 x s16>), 0 + ; GFX9: [[UV20:%[0-9]+]]:_(<2 x s16>), [[UV21:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT8]](<4 x s16>) + ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV20]](<2 x s16>) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV21]](<2 x s16>) + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[INSERT9:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV16]](<3 x s16>), 0 + ; GFX9: [[UV22:%[0-9]+]]:_(<2 x s16>), [[UV23:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT9]](<4 x s16>) + ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV22]](<2 x s16>) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV23]](<2 x s16>) + ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32) + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[COPY5]](s32) + ; GFX9: 
[[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[CONCAT_VECTORS4:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS4]](<6 x s16>) + %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + %1:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + %2:_(<3 x s16>), %3:_(<3 x s16>) = G_UNMERGE_VALUES %0 + %4:_(<3 x s16>), %5:_(<3 x s16>) = G_UNMERGE_VALUES %1 + %6:_(<3 x s16>) = G_SHL %2, %4 + %7:_(<3 x s16>) = G_IMPLICIT_DEF + %8:_(<6 x s16>) = G_CONCAT_VECTORS %6, %7 + $vgpr0_vgpr1_vgpr2 = COPY %8 + ... --- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smax.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smax.mir index 5b4f188a6334a..ea4a1a6a063a1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smax.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smax.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=SI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefix=VI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=SI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_smax_s32 @@ -400,7 +400,17 @@ body: | ; SI: 
[[DEF3:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; SI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[DEF3]](<2 x s16>) ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<6 x s16>), 0 - ; SI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; SI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; SI: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; SI: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; SI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; SI: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; SI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; SI: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) ; VI-LABEL: name: test_smax_v3s16 ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF @@ -444,7 +454,17 @@ body: | ; VI: [[DEF3:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[DEF3]](<2 x s16>) ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<6 x s16>), 0 - ; VI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; VI: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; VI: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; VI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; VI: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST 
[[UV13]](<2 x s16>) + ; VI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; VI: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) ; GFX9-LABEL: name: test_smax_v3s16 ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF @@ -462,11 +482,23 @@ body: | ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[SMAX]](<2 x s16>), [[SMAX1]](<2 x s16>) ; GFX9: [[CONCAT_VECTORS3:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[CONCAT_VECTORS2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) ; GFX9: [[UV12:%[0-9]+]]:_(<3 x s16>), [[UV13:%[0-9]+]]:_(<3 x s16>), [[UV14:%[0-9]+]]:_(<3 x s16>), [[UV15:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<12 x s16>) - ; GFX9: S_NOP 0, implicit [[UV12]](<3 x s16>) + ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV12]](<3 x s16>), 0 + ; GFX9: [[UV16:%[0-9]+]]:_(<2 x s16>), [[UV17:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV16]](<2 x s16>) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV17]](<2 x s16>) + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; GFX9: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = 
G_IMPLICIT_DEF %2:_(<3 x s16>) = G_SMAX %0, %1 - S_NOP 0, implicit %2 + %3:_(<3 x s32>) = G_ANYEXT %2 + S_NOP 0, implicit %3 ... --- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smin.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smin.mir index dcb548d7dcc08..6cef6f4950ca9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smin.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smin.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=SI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefix=VI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=SI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_smin_s32 @@ -400,7 +400,17 @@ body: | ; SI: [[DEF3:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; SI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[DEF3]](<2 x s16>) ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<6 x s16>), 0 - ; SI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; SI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; SI: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; SI: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; SI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; SI: [[BITCAST7:%[0-9]+]]:_(s32) 
= G_BITCAST [[UV13]](<2 x s16>) + ; SI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; SI: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) ; VI-LABEL: name: test_smin_v3s16 ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF @@ -444,7 +454,17 @@ body: | ; VI: [[DEF3:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[DEF3]](<2 x s16>) ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<6 x s16>), 0 - ; VI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; VI: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; VI: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; VI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; VI: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; VI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; VI: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) ; GFX9-LABEL: name: test_smin_v3s16 ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF @@ -462,11 +482,23 @@ body: | ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[SMIN]](<2 x s16>), 
[[SMIN1]](<2 x s16>) ; GFX9: [[CONCAT_VECTORS3:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[CONCAT_VECTORS2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) ; GFX9: [[UV12:%[0-9]+]]:_(<3 x s16>), [[UV13:%[0-9]+]]:_(<3 x s16>), [[UV14:%[0-9]+]]:_(<3 x s16>), [[UV15:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<12 x s16>) - ; GFX9: S_NOP 0, implicit [[UV12]](<3 x s16>) + ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV12]](<3 x s16>), 0 + ; GFX9: [[UV16:%[0-9]+]]:_(<2 x s16>), [[UV17:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV16]](<2 x s16>) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV17]](<2 x s16>) + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; GFX9: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = G_IMPLICIT_DEF %2:_(<3 x s16>) = G_SMIN %0, %1 - S_NOP 0, implicit %2 + %3:_(<3 x s32>) = G_ANYEXT %2 + S_NOP 0, implicit %3 ... 
--- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubo.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubo.mir index 3f955b624f880..7d917b1a3452d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubo.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubo.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck %s --- name: test_ssubo_s7 @@ -209,26 +209,25 @@ body: | name: test_ssubo_v3s16 body: | bb.0: - liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - + liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5 ; CHECK-LABEL: name: test_ssubo_v3s16 - ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 - ; CHECK: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr1_vgpr2 - ; CHECK: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY]](<4 x s16>), 0 - ; CHECK: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY]](<4 x s16>), 0 + ; CHECK: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; CHECK: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; CHECK: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; CHECK: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; CHECK: 
[[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT1]](<3 x s16>), 0 - ; CHECK: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; CHECK: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; CHECK: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; CHECK: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; CHECK: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; CHECK: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) @@ -255,38 +254,110 @@ body: | ; CHECK: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; CHECK: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[DEF1]](<2 x s16>) - ; CHECK: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 + ; CHECK: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 ; CHECK: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 0 - ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT1]](<3 x s16>), 0 - ; CHECK: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) 
= G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) - ; CHECK: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) + ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; CHECK: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; CHECK: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) ; CHECK: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) - ; CHECK: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) + ; CHECK: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) ; CHECK: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) - ; CHECK: [[COPY11:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) - ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C1]] - ; CHECK: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) - ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]] - ; CHECK: [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) - ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C1]] - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[AND3]](s32), [[AND4]](s32), [[AND5]](s32) - ; CHECK: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT2]](<3 x s16>), 0 - ; CHECK: [[DEF2:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[INSERT4:%[0-9]+]]:_(<4 x s32>) = G_INSERT [[DEF2]], [[BUILD_VECTOR]](<3 x s32>), 0 - ; CHECK: $vgpr0_vgpr1 = COPY [[INSERT3]](<4 x s16>) - ; CHECK: $vgpr2_vgpr3_vgpr4_vgpr5 = COPY [[INSERT4]](<4 x s32>) - %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 - %1:_(<4 x s16>) = COPY $vgpr1_vgpr2 - %2:_(<3 x s16>) = G_EXTRACT %0(<4 x s16>), 0 - %3:_(<3 x s16>) = G_EXTRACT %0(<4 x s16>), 0 - %4:_(<3 x s16>), %5:_(<3 x s1>) = G_SSUBO %2, %3 - %6:_(<3 x s32>) = G_ZEXT %3 - %7:_(<4 x s16>) = G_IMPLICIT_DEF - %8:_(<4 x s16>) = G_INSERT %7, %4(<3 x s16>), 0 - %9:_(<4 x s32>) = G_IMPLICIT_DEF - %10:_(<4 x s32>) = G_INSERT %9, %6(<3 x s32>), 0 - $vgpr0_vgpr1 = COPY %8 - $vgpr2_vgpr3_vgpr4_vgpr5 = COPY %10 + ; CHECK: 
[[COPY11:%[0-9]+]]:_(s32) = COPY [[SUB]](s32) + ; CHECK: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY11]], 16 + ; CHECK: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; CHECK: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY12]], 16 + ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG]](s32), [[SEXT_INREG1]] + ; CHECK: [[COPY13:%[0-9]+]]:_(s32) = COPY [[SUB1]](s32) + ; CHECK: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY13]], 16 + ; CHECK: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; CHECK: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY14]], 16 + ; CHECK: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG2]](s32), [[SEXT_INREG3]] + ; CHECK: [[COPY15:%[0-9]+]]:_(s32) = COPY [[SUB2]](s32) + ; CHECK: [[SEXT_INREG4:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY15]], 16 + ; CHECK: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; CHECK: [[SEXT_INREG5:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY16]], 16 + ; CHECK: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG4]](s32), [[SEXT_INREG5]] + ; CHECK: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; CHECK: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT3]](<4 x s16>) + ; CHECK: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) + ; CHECK: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32) + ; CHECK: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) + ; CHECK: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST9]], [[C]](s32) + ; CHECK: [[COPY17:%[0-9]+]]:_(s32) = COPY [[BITCAST8]](s32) + ; CHECK: [[SEXT_INREG6:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY17]], 16 + ; CHECK: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[C3]](s16) + ; CHECK: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[SEXT_INREG6]](s32), [[SEXT]] + ; CHECK: [[COPY18:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; CHECK: [[SEXT_INREG7:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY18]], 16 + ; CHECK: [[SEXT1:%[0-9]+]]:_(s32) = G_SEXT 
[[C3]](s16) + ; CHECK: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[SEXT_INREG7]](s32), [[SEXT1]] + ; CHECK: [[COPY19:%[0-9]+]]:_(s32) = COPY [[BITCAST9]](s32) + ; CHECK: [[SEXT_INREG8:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY19]], 16 + ; CHECK: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[C3]](s16) + ; CHECK: [[ICMP5:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[SEXT_INREG8]](s32), [[SEXT2]] + ; CHECK: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP]] + ; CHECK: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP4]], [[ICMP1]] + ; CHECK: [[XOR2:%[0-9]+]]:_(s1) = G_XOR [[ICMP5]], [[ICMP2]] + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR]](s1) + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR1]](s1) + ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR2]](s1) + ; CHECK: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; CHECK: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; CHECK: [[UV12:%[0-9]+]]:_(<3 x s16>), [[UV13:%[0-9]+]]:_(<3 x s16>), [[UV14:%[0-9]+]]:_(<3 x s16>), [[UV15:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; CHECK: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; CHECK: [[UV16:%[0-9]+]]:_(<2 x s16>), [[UV17:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT4]](<4 x s16>) + ; CHECK: [[BITCAST10:%[0-9]+]]:_(s32) = G_BITCAST [[UV16]](<2 x s16>) + ; CHECK: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST10]], [[C]](s32) + ; CHECK: [[BITCAST11:%[0-9]+]]:_(s32) = G_BITCAST [[UV17]](<2 x s16>) + ; CHECK: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST11]], [[C]](s32) + ; CHECK: [[INSERT5:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV12]](<3 x s16>), 0 + ; CHECK: [[UV18:%[0-9]+]]:_(<2 x s16>), [[UV19:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT5]](<4 x s16>) + ; CHECK: [[BITCAST12:%[0-9]+]]:_(s32) = G_BITCAST [[UV18]](<2 x s16>) + ; CHECK: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST12]], [[C]](s32) + ; CHECK: 
[[BITCAST13:%[0-9]+]]:_(s32) = G_BITCAST [[UV19]](<2 x s16>) + ; CHECK: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST13]], [[C]](s32) + ; CHECK: [[COPY20:%[0-9]+]]:_(s32) = COPY [[BITCAST10]](s32) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY20]], [[C1]] + ; CHECK: [[COPY21:%[0-9]+]]:_(s32) = COPY [[LSHR8]](s32) + ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY21]], [[C1]] + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND4]], [[C]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND3]], [[SHL2]] + ; CHECK: [[BITCAST14:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; CHECK: [[COPY22:%[0-9]+]]:_(s32) = COPY [[BITCAST11]](s32) + ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY22]], [[C1]] + ; CHECK: [[COPY23:%[0-9]+]]:_(s32) = COPY [[BITCAST12]](s32) + ; CHECK: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY23]], [[C1]] + ; CHECK: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C]](s32) + ; CHECK: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND5]], [[SHL3]] + ; CHECK: [[BITCAST15:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; CHECK: [[COPY24:%[0-9]+]]:_(s32) = COPY [[LSHR10]](s32) + ; CHECK: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY24]], [[C1]] + ; CHECK: [[COPY25:%[0-9]+]]:_(s32) = COPY [[BITCAST13]](s32) + ; CHECK: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY25]], [[C1]] + ; CHECK: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND8]], [[C]](s32) + ; CHECK: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND7]], [[SHL4]] + ; CHECK: [[BITCAST16:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; CHECK: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST14]](<2 x s16>), [[BITCAST15]](<2 x s16>), [[BITCAST16]](<2 x s16>) + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK: [[COPY26:%[0-9]+]]:_(s32) = COPY [[ANYEXT]](s32) + ; CHECK: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY26]], [[C4]] + ; CHECK: [[COPY27:%[0-9]+]]:_(s32) = COPY [[ANYEXT1]](s32) + ; CHECK: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY27]], [[C4]] + ; CHECK: [[COPY28:%[0-9]+]]:_(s32) = COPY [[ANYEXT2]](s32) + ; CHECK: 
[[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY28]], [[C4]] + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[AND9]](s32), [[AND10]](s32), [[AND11]](s32) + ; CHECK: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) + ; CHECK: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) + %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + %1:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + %2:_(<3 x s16>), %3:_(<3 x s16>) = G_UNMERGE_VALUES %0 + %4:_(<3 x s16>), %5:_(<3 x s16>) = G_UNMERGE_VALUES %1 + %6:_(<3 x s16>), %7:_(<3 x s1>) = G_SSUBO %2, %4 + %8:_(<3 x s16>) = G_IMPLICIT_DEF + %9:_(<6 x s16>) = G_CONCAT_VECTORS %6, %8 + %10:_(<3 x s32>) = G_ZEXT %7 + $vgpr0_vgpr1_vgpr2 = COPY %9 + $vgpr0_vgpr1_vgpr2 = COPY %10 ... --- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir index 1e46b8f8acf0a..aa24ab6100e47 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir @@ -3590,193 +3590,159 @@ body: | ; SI-LABEL: name: test_store_global_v2p0_align1 ; SI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; SI: [[COPY1:%[0-9]+]]:_(<2 x p0>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; SI: [[UV:%[0-9]+]]:_(p0), [[UV1:%[0-9]+]]:_(p0) = G_UNMERGE_VALUES [[COPY1]](<2 x p0>) - ; SI: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](p0) - ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C]](s32) - ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C]](s32) - ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C1]](s32) - ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) - ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] - ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[AND]], [[COPY2]](s32) - ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C1]](s32) - ; SI: 
[[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C2]] - ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[AND1]], [[COPY4]](s32) - ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C1]](s32) - ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) - ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]] - ; SI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[COPY6]](s32) - ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) - ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C2]] - ; SI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[AND3]], [[C1]](s32) - ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) - ; SI: G_STORE [[COPY9]](s32), [[COPY]](p1) :: (store 1, addrspace 1) + ; SI: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x p0>) + ; SI: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<4 x s32>) + ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32) + ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C1]](s32) + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C2]](s32) + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; SI: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store 1, addrspace 1) ; SI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; SI: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) - ; SI: G_STORE [[COPY10]](s32), [[PTR_ADD]](p1) :: (store 1 + 1, addrspace 1) + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; SI: G_STORE [[COPY3]](s32), [[PTR_ADD]](p1) :: (store 1 + 1, addrspace 1) ; SI: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; SI: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; SI: G_STORE [[COPY11]](s32), [[PTR_ADD1]](p1) :: (store 1 + 2, addrspace 
1) + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; SI: G_STORE [[COPY4]](s32), [[PTR_ADD1]](p1) :: (store 1 + 2, addrspace 1) ; SI: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 ; SI: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) - ; SI: G_STORE [[COPY12]](s32), [[PTR_ADD2]](p1) :: (store 1 + 3, addrspace 1) + ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; SI: G_STORE [[COPY5]](s32), [[PTR_ADD2]](p1) :: (store 1 + 3, addrspace 1) ; SI: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; SI: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) - ; SI: G_STORE [[COPY13]](s32), [[PTR_ADD3]](p1) :: (store 1 + 4, addrspace 1) - ; SI: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 5 - ; SI: [[PTR_ADD4:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C7]](s64) - ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) - ; SI: G_STORE [[COPY14]](s32), [[PTR_ADD4]](p1) :: (store 1 + 5, addrspace 1) - ; SI: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; SI: [[PTR_ADD5:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C8]](s64) - ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) - ; SI: G_STORE [[COPY15]](s32), [[PTR_ADD5]](p1) :: (store 1 + 6, addrspace 1) - ; SI: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 7 - ; SI: [[PTR_ADD6:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C9]](s64) - ; SI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR5]](s32) - ; SI: G_STORE [[COPY16]](s32), [[PTR_ADD6]](p1) :: (store 1 + 7, addrspace 1) - ; SI: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; SI: [[PTR_ADD7:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C10]](s64) - ; SI: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](p0) - ; SI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[UV4]], [[C]](s32) - ; SI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C]](s32) - ; SI: [[COPY17:%[0-9]+]]:_(s32) = COPY [[C1]](s32) - ; SI: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV4]](s32) - ; SI: [[AND4:%[0-9]+]]:_(s32) 
= G_AND [[COPY18]], [[C2]] - ; SI: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[AND4]], [[COPY17]](s32) - ; SI: [[COPY19:%[0-9]+]]:_(s32) = COPY [[C1]](s32) - ; SI: [[COPY20:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) - ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY20]], [[C2]] - ; SI: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[AND5]], [[COPY19]](s32) - ; SI: [[COPY21:%[0-9]+]]:_(s32) = COPY [[C1]](s32) - ; SI: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV5]](s32) - ; SI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY22]], [[C2]] - ; SI: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[AND6]], [[COPY21]](s32) - ; SI: [[COPY23:%[0-9]+]]:_(s32) = COPY [[C1]](s32) - ; SI: [[COPY24:%[0-9]+]]:_(s32) = COPY [[LSHR7]](s32) - ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY24]], [[C2]] - ; SI: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[AND7]], [[COPY23]](s32) - ; SI: [[COPY25:%[0-9]+]]:_(s32) = COPY [[UV4]](s32) - ; SI: G_STORE [[COPY25]](s32), [[PTR_ADD7]](p1) :: (store 1 + 8, addrspace 1) + ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) + ; SI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C1]](s32) + ; SI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C2]](s32) + ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; SI: G_STORE [[COPY6]](s32), [[PTR_ADD3]](p1) :: (store 1 + 4, addrspace 1) + ; SI: [[PTR_ADD4:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD3]], [[C3]](s64) + ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; SI: G_STORE [[COPY7]](s32), [[PTR_ADD4]](p1) :: (store 1 + 5, addrspace 1) + ; SI: [[PTR_ADD5:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD3]], [[C4]](s64) + ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; SI: G_STORE [[COPY8]](s32), [[PTR_ADD5]](p1) :: (store 1 + 6, addrspace 1) + ; SI: [[PTR_ADD6:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD3]], [[C5]](s64) + ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR5]](s32) + ; SI: G_STORE [[COPY9]](s32), [[PTR_ADD6]](p1) :: (store 1 + 7, addrspace 1) + ; SI: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; SI: [[PTR_ADD7:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C7]](s64) + ; 
SI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C]](s32) + ; SI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C1]](s32) + ; SI: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C2]](s32) + ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; SI: G_STORE [[COPY10]](s32), [[PTR_ADD7]](p1) :: (store 1 + 8, addrspace 1) ; SI: [[PTR_ADD8:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C3]](s64) - ; SI: [[COPY26:%[0-9]+]]:_(s32) = COPY [[LSHR8]](s32) - ; SI: G_STORE [[COPY26]](s32), [[PTR_ADD8]](p1) :: (store 1 + 9, addrspace 1) + ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; SI: G_STORE [[COPY11]](s32), [[PTR_ADD8]](p1) :: (store 1 + 9, addrspace 1) ; SI: [[PTR_ADD9:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; SI: [[COPY27:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) - ; SI: G_STORE [[COPY27]](s32), [[PTR_ADD9]](p1) :: (store 1 + 10, addrspace 1) + ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR7]](s32) + ; SI: G_STORE [[COPY12]](s32), [[PTR_ADD9]](p1) :: (store 1 + 10, addrspace 1) ; SI: [[PTR_ADD10:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C5]](s64) - ; SI: [[COPY28:%[0-9]+]]:_(s32) = COPY [[LSHR9]](s32) - ; SI: G_STORE [[COPY28]](s32), [[PTR_ADD10]](p1) :: (store 1 + 11, addrspace 1) - ; SI: [[PTR_ADD11:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C6]](s64) - ; SI: [[COPY29:%[0-9]+]]:_(s32) = COPY [[UV5]](s32) - ; SI: G_STORE [[COPY29]](s32), [[PTR_ADD11]](p1) :: (store 1 + 12, addrspace 1) - ; SI: [[PTR_ADD12:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C7]](s64) - ; SI: [[COPY30:%[0-9]+]]:_(s32) = COPY [[LSHR10]](s32) - ; SI: G_STORE [[COPY30]](s32), [[PTR_ADD12]](p1) :: (store 1 + 13, addrspace 1) - ; SI: [[PTR_ADD13:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C8]](s64) - ; SI: [[COPY31:%[0-9]+]]:_(s32) = COPY [[LSHR7]](s32) - ; SI: G_STORE [[COPY31]](s32), [[PTR_ADD13]](p1) :: (store 1 + 14, addrspace 1) - ; SI: [[PTR_ADD14:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C9]](s64) - ; SI: [[COPY32:%[0-9]+]]:_(s32) = COPY [[LSHR11]](s32) - ; SI: G_STORE 
[[COPY32]](s32), [[PTR_ADD14]](p1) :: (store 1 + 15, addrspace 1) + ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR8]](s32) + ; SI: G_STORE [[COPY13]](s32), [[PTR_ADD10]](p1) :: (store 1 + 11, addrspace 1) + ; SI: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; SI: [[PTR_ADD11:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C8]](s64) + ; SI: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C]](s32) + ; SI: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C1]](s32) + ; SI: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C2]](s32) + ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; SI: G_STORE [[COPY14]](s32), [[PTR_ADD11]](p1) :: (store 1 + 12, addrspace 1) + ; SI: [[PTR_ADD12:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD11]], [[C3]](s64) + ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR9]](s32) + ; SI: G_STORE [[COPY15]](s32), [[PTR_ADD12]](p1) :: (store 1 + 13, addrspace 1) + ; SI: [[PTR_ADD13:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD11]], [[C4]](s64) + ; SI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR10]](s32) + ; SI: G_STORE [[COPY16]](s32), [[PTR_ADD13]](p1) :: (store 1 + 14, addrspace 1) + ; SI: [[PTR_ADD14:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD11]], [[C5]](s64) + ; SI: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR11]](s32) + ; SI: G_STORE [[COPY17]](s32), [[PTR_ADD14]](p1) :: (store 1 + 15, addrspace 1) ; CI-LABEL: name: test_store_global_v2p0_align1 ; CI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; CI: [[COPY1:%[0-9]+]]:_(<2 x p0>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; CI: G_STORE [[COPY1]](<2 x p0>), [[COPY]](p1) :: (store 16, align 1, addrspace 1) + ; CI: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x p0>) + ; CI: G_STORE [[BITCAST]](<4 x s32>), [[COPY]](p1) :: (store 16, align 1, addrspace 1) ; VI-LABEL: name: test_store_global_v2p0_align1 ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; VI: [[COPY1:%[0-9]+]]:_(<2 x p0>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; VI: [[UV:%[0-9]+]]:_(p0), [[UV1:%[0-9]+]]:_(p0) = G_UNMERGE_VALUES [[COPY1]](<2 x p0>) - ; VI: [[UV2:%[0-9]+]]:_(s32), 
[[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](p0) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV2]](s32) - ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C]](s32) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[UV3]](s32) - ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C]](s32) - ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; VI: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; VI: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC]], [[C1]](s16) - ; VI: [[LSHR3:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[C1]](s16) - ; VI: [[LSHR4:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC2]], [[C1]](s16) - ; VI: [[LSHR5:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC3]], [[C1]](s16) - ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; VI: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x p0>) + ; VI: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<4 x s32>) + ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32) + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C1]](s32) + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C2]](s32) + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV]](s32) ; VI: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store 1, addrspace 1) - ; VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; VI: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; VI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR2]](s16) - ; VI: G_STORE [[ANYEXT]](s32), [[PTR_ADD]](p1) :: (store 1 + 1, addrspace 1) - ; VI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; VI: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C3]](s64) + ; VI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; VI: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C3]](s64) ; VI: 
[[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; VI: G_STORE [[COPY3]](s32), [[PTR_ADD1]](p1) :: (store 1 + 2, addrspace 1) - ; VI: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 - ; VI: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; VI: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR3]](s16) - ; VI: G_STORE [[ANYEXT1]](s32), [[PTR_ADD2]](p1) :: (store 1 + 3, addrspace 1) - ; VI: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; VI: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) - ; VI: G_STORE [[COPY4]](s32), [[PTR_ADD3]](p1) :: (store 1 + 4, addrspace 1) - ; VI: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 5 - ; VI: [[PTR_ADD4:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; VI: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR4]](s16) - ; VI: G_STORE [[ANYEXT2]](s32), [[PTR_ADD4]](p1) :: (store 1 + 5, addrspace 1) - ; VI: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; VI: [[PTR_ADD5:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C7]](s64) - ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) - ; VI: G_STORE [[COPY5]](s32), [[PTR_ADD5]](p1) :: (store 1 + 6, addrspace 1) - ; VI: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 7 - ; VI: [[PTR_ADD6:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C8]](s64) - ; VI: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR5]](s16) - ; VI: G_STORE [[ANYEXT3]](s32), [[PTR_ADD6]](p1) :: (store 1 + 7, addrspace 1) - ; VI: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; VI: [[PTR_ADD7:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C9]](s64) - ; VI: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](p0) - ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[UV4]](s32) - ; VI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[UV4]], [[C]](s32) - ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR6]](s32) - ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[UV5]](s32) - ; VI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C]](s32) - ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR7]](s32) - ; VI: [[LSHR8:%[0-9]+]]:_(s16) 
= G_LSHR [[TRUNC4]], [[C1]](s16) - ; VI: [[LSHR9:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC5]], [[C1]](s16) - ; VI: [[LSHR10:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC6]], [[C1]](s16) - ; VI: [[LSHR11:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC7]], [[C1]](s16) - ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[UV4]](s32) - ; VI: G_STORE [[COPY6]](s32), [[PTR_ADD7]](p1) :: (store 1 + 8, addrspace 1) - ; VI: [[PTR_ADD8:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; VI: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR8]](s16) - ; VI: G_STORE [[ANYEXT4]](s32), [[PTR_ADD8]](p1) :: (store 1 + 9, addrspace 1) - ; VI: [[PTR_ADD9:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C3]](s64) - ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) - ; VI: G_STORE [[COPY7]](s32), [[PTR_ADD9]](p1) :: (store 1 + 10, addrspace 1) - ; VI: [[PTR_ADD10:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; VI: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR9]](s16) - ; VI: G_STORE [[ANYEXT5]](s32), [[PTR_ADD10]](p1) :: (store 1 + 11, addrspace 1) - ; VI: [[PTR_ADD11:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C5]](s64) - ; VI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[UV5]](s32) - ; VI: G_STORE [[COPY8]](s32), [[PTR_ADD11]](p1) :: (store 1 + 12, addrspace 1) - ; VI: [[PTR_ADD12:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C6]](s64) - ; VI: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR10]](s16) - ; VI: G_STORE [[ANYEXT6]](s32), [[PTR_ADD12]](p1) :: (store 1 + 13, addrspace 1) - ; VI: [[PTR_ADD13:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C7]](s64) - ; VI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR7]](s32) - ; VI: G_STORE [[COPY9]](s32), [[PTR_ADD13]](p1) :: (store 1 + 14, addrspace 1) - ; VI: [[PTR_ADD14:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C8]](s64) - ; VI: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR11]](s16) - ; VI: G_STORE [[ANYEXT7]](s32), [[PTR_ADD14]](p1) :: (store 1 + 15, addrspace 1) + ; VI: G_STORE [[COPY3]](s32), [[PTR_ADD]](p1) :: (store 1 + 1, addrspace 1) + ; VI: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; VI: 
[[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C4]](s64) + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; VI: G_STORE [[COPY4]](s32), [[PTR_ADD1]](p1) :: (store 1 + 2, addrspace 1) + ; VI: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 + ; VI: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C5]](s64) + ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; VI: G_STORE [[COPY5]](s32), [[PTR_ADD2]](p1) :: (store 1 + 3, addrspace 1) + ; VI: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; VI: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C6]](s64) + ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) + ; VI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C1]](s32) + ; VI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C2]](s32) + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; VI: G_STORE [[COPY6]](s32), [[PTR_ADD3]](p1) :: (store 1 + 4, addrspace 1) + ; VI: [[PTR_ADD4:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD3]], [[C3]](s64) + ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; VI: G_STORE [[COPY7]](s32), [[PTR_ADD4]](p1) :: (store 1 + 5, addrspace 1) + ; VI: [[PTR_ADD5:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD3]], [[C4]](s64) + ; VI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; VI: G_STORE [[COPY8]](s32), [[PTR_ADD5]](p1) :: (store 1 + 6, addrspace 1) + ; VI: [[PTR_ADD6:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD3]], [[C5]](s64) + ; VI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR5]](s32) + ; VI: G_STORE [[COPY9]](s32), [[PTR_ADD6]](p1) :: (store 1 + 7, addrspace 1) + ; VI: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; VI: [[PTR_ADD7:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C7]](s64) + ; VI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C]](s32) + ; VI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C1]](s32) + ; VI: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C2]](s32) + ; VI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; VI: G_STORE [[COPY10]](s32), [[PTR_ADD7]](p1) :: (store 1 + 8, addrspace 1) + ; VI: [[PTR_ADD8:%[0-9]+]]:_(p1) = G_PTR_ADD 
[[PTR_ADD7]], [[C3]](s64) + ; VI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; VI: G_STORE [[COPY11]](s32), [[PTR_ADD8]](p1) :: (store 1 + 9, addrspace 1) + ; VI: [[PTR_ADD9:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) + ; VI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR7]](s32) + ; VI: G_STORE [[COPY12]](s32), [[PTR_ADD9]](p1) :: (store 1 + 10, addrspace 1) + ; VI: [[PTR_ADD10:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C5]](s64) + ; VI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR8]](s32) + ; VI: G_STORE [[COPY13]](s32), [[PTR_ADD10]](p1) :: (store 1 + 11, addrspace 1) + ; VI: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; VI: [[PTR_ADD11:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C8]](s64) + ; VI: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C]](s32) + ; VI: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C1]](s32) + ; VI: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C2]](s32) + ; VI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; VI: G_STORE [[COPY14]](s32), [[PTR_ADD11]](p1) :: (store 1 + 12, addrspace 1) + ; VI: [[PTR_ADD12:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD11]], [[C3]](s64) + ; VI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR9]](s32) + ; VI: G_STORE [[COPY15]](s32), [[PTR_ADD12]](p1) :: (store 1 + 13, addrspace 1) + ; VI: [[PTR_ADD13:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD11]], [[C4]](s64) + ; VI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR10]](s32) + ; VI: G_STORE [[COPY16]](s32), [[PTR_ADD13]](p1) :: (store 1 + 14, addrspace 1) + ; VI: [[PTR_ADD14:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD11]], [[C5]](s64) + ; VI: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR11]](s32) + ; VI: G_STORE [[COPY17]](s32), [[PTR_ADD14]](p1) :: (store 1 + 15, addrspace 1) ; GFX9-LABEL: name: test_store_global_v2p0_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x p0>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX9: G_STORE [[COPY1]](<2 x p0>), [[COPY]](p1) :: (store 16, align 1, addrspace 1) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x 
p0>) + ; GFX9: G_STORE [[BITCAST]](<4 x s32>), [[COPY]](p1) :: (store 16, align 1, addrspace 1) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<2 x p0>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 G_STORE %1, %0 :: (store 16, align 1, addrspace 1) @@ -3791,11 +3757,11 @@ body: | ; SI-LABEL: name: test_store_global_v2p0_align2 ; SI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; SI: [[COPY1:%[0-9]+]]:_(<2 x p0>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; SI: [[UV:%[0-9]+]]:_(p0), [[UV1:%[0-9]+]]:_(p0) = G_UNMERGE_VALUES [[COPY1]](<2 x p0>) - ; SI: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](p0) + ; SI: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x p0>) + ; SI: [[UV:%[0-9]+]]:_(<2 x s32>), [[UV1:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[BITCAST]](<4 x s32>) + ; SI: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](<2 x s32>) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C]](s32) - ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C]](s32) ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) ; SI: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store 2, addrspace 1) ; SI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 @@ -3804,40 +3770,41 @@ body: | ; SI: G_STORE [[COPY3]](s32), [[PTR_ADD]](p1) :: (store 2 + 2, addrspace 1) ; SI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; SI: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C]](s32) ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) ; SI: G_STORE [[COPY4]](s32), [[PTR_ADD1]](p1) :: (store 2 + 4, addrspace 1) - ; SI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; SI: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C3]](s64) + ; SI: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD1]], [[C1]](s64) ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; SI: G_STORE [[COPY5]](s32), [[PTR_ADD2]](p1) :: (store 2 + 6, addrspace 1) - ; SI: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; SI: 
[[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; SI: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](p0) + ; SI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; SI: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C3]](s64) + ; SI: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](<2 x s32>) ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV4]], [[C]](s32) - ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C]](s32) ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[UV4]](s32) ; SI: G_STORE [[COPY6]](s32), [[PTR_ADD3]](p1) :: (store 2 + 8, addrspace 1) ; SI: [[PTR_ADD4:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s64) ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; SI: G_STORE [[COPY7]](s32), [[PTR_ADD4]](p1) :: (store 2 + 10, addrspace 1) ; SI: [[PTR_ADD5:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) + ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C]](s32) ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[UV5]](s32) ; SI: G_STORE [[COPY8]](s32), [[PTR_ADD5]](p1) :: (store 2 + 12, addrspace 1) - ; SI: [[PTR_ADD6:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD3]], [[C3]](s64) + ; SI: [[PTR_ADD6:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD5]], [[C1]](s64) ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) ; SI: G_STORE [[COPY9]](s32), [[PTR_ADD6]](p1) :: (store 2 + 14, addrspace 1) ; CI-LABEL: name: test_store_global_v2p0_align2 ; CI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; CI: [[COPY1:%[0-9]+]]:_(<2 x p0>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; CI: G_STORE [[COPY1]](<2 x p0>), [[COPY]](p1) :: (store 16, align 2, addrspace 1) + ; CI: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x p0>) + ; CI: G_STORE [[BITCAST]](<4 x s32>), [[COPY]](p1) :: (store 16, align 2, addrspace 1) ; VI-LABEL: name: test_store_global_v2p0_align2 ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; VI: [[COPY1:%[0-9]+]]:_(<2 x p0>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; VI: [[UV:%[0-9]+]]:_(p0), [[UV1:%[0-9]+]]:_(p0) = G_UNMERGE_VALUES 
[[COPY1]](<2 x p0>) - ; VI: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](p0) + ; VI: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x p0>) + ; VI: [[UV:%[0-9]+]]:_(<2 x s32>), [[UV1:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[BITCAST]](<4 x s32>) + ; VI: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](<2 x s32>) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C]](s32) - ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C]](s32) ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) ; VI: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store 2, addrspace 1) ; VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 @@ -3846,32 +3813,33 @@ body: | ; VI: G_STORE [[COPY3]](s32), [[PTR_ADD]](p1) :: (store 2 + 2, addrspace 1) ; VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; VI: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C]](s32) ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) ; VI: G_STORE [[COPY4]](s32), [[PTR_ADD1]](p1) :: (store 2 + 4, addrspace 1) - ; VI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; VI: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C3]](s64) + ; VI: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD1]], [[C1]](s64) ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; VI: G_STORE [[COPY5]](s32), [[PTR_ADD2]](p1) :: (store 2 + 6, addrspace 1) - ; VI: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; VI: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; VI: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](p0) + ; VI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; VI: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C3]](s64) + ; VI: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](<2 x s32>) ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV4]], [[C]](s32) - ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C]](s32) ; VI: 
[[COPY6:%[0-9]+]]:_(s32) = COPY [[UV4]](s32) ; VI: G_STORE [[COPY6]](s32), [[PTR_ADD3]](p1) :: (store 2 + 8, addrspace 1) ; VI: [[PTR_ADD4:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s64) ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; VI: G_STORE [[COPY7]](s32), [[PTR_ADD4]](p1) :: (store 2 + 10, addrspace 1) ; VI: [[PTR_ADD5:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) + ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C]](s32) ; VI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[UV5]](s32) ; VI: G_STORE [[COPY8]](s32), [[PTR_ADD5]](p1) :: (store 2 + 12, addrspace 1) - ; VI: [[PTR_ADD6:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD3]], [[C3]](s64) + ; VI: [[PTR_ADD6:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD5]], [[C1]](s64) ; VI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) ; VI: G_STORE [[COPY9]](s32), [[PTR_ADD6]](p1) :: (store 2 + 14, addrspace 1) ; GFX9-LABEL: name: test_store_global_v2p0_align2 ; GFX9: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x p0>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX9: G_STORE [[COPY1]](<2 x p0>), [[COPY]](p1) :: (store 16, align 2, addrspace 1) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x p0>) + ; GFX9: G_STORE [[BITCAST]](<4 x s32>), [[COPY]](p1) :: (store 16, align 2, addrspace 1) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<2 x p0>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 G_STORE %1, %0 :: (store 16, align 2, addrspace 1) @@ -3886,19 +3854,23 @@ body: | ; SI-LABEL: name: test_store_global_v2p0_align4 ; SI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; SI: [[COPY1:%[0-9]+]]:_(<2 x p0>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; SI: G_STORE [[COPY1]](<2 x p0>), [[COPY]](p1) :: (store 16, align 4, addrspace 1) + ; SI: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x p0>) + ; SI: G_STORE [[BITCAST]](<4 x s32>), [[COPY]](p1) :: (store 16, align 4, addrspace 1) ; CI-LABEL: name: test_store_global_v2p0_align4 ; CI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; CI: [[COPY1:%[0-9]+]]:_(<2 x p0>) = COPY 
$vgpr2_vgpr3_vgpr4_vgpr5 - ; CI: G_STORE [[COPY1]](<2 x p0>), [[COPY]](p1) :: (store 16, align 4, addrspace 1) + ; CI: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x p0>) + ; CI: G_STORE [[BITCAST]](<4 x s32>), [[COPY]](p1) :: (store 16, align 4, addrspace 1) ; VI-LABEL: name: test_store_global_v2p0_align4 ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; VI: [[COPY1:%[0-9]+]]:_(<2 x p0>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; VI: G_STORE [[COPY1]](<2 x p0>), [[COPY]](p1) :: (store 16, align 4, addrspace 1) + ; VI: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x p0>) + ; VI: G_STORE [[BITCAST]](<4 x s32>), [[COPY]](p1) :: (store 16, align 4, addrspace 1) ; GFX9-LABEL: name: test_store_global_v2p0_align4 ; GFX9: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x p0>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX9: G_STORE [[COPY1]](<2 x p0>), [[COPY]](p1) :: (store 16, align 4, addrspace 1) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x p0>) + ; GFX9: G_STORE [[BITCAST]](<4 x s32>), [[COPY]](p1) :: (store 16, align 4, addrspace 1) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<2 x p0>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 G_STORE %1, %0 :: (store 16, align 4, addrspace 1) @@ -3913,19 +3885,23 @@ body: | ; SI-LABEL: name: test_store_global_v2p0_align8 ; SI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; SI: [[COPY1:%[0-9]+]]:_(<2 x p0>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; SI: G_STORE [[COPY1]](<2 x p0>), [[COPY]](p1) :: (store 16, align 8, addrspace 1) + ; SI: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x p0>) + ; SI: G_STORE [[BITCAST]](<4 x s32>), [[COPY]](p1) :: (store 16, align 8, addrspace 1) ; CI-LABEL: name: test_store_global_v2p0_align8 ; CI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; CI: [[COPY1:%[0-9]+]]:_(<2 x p0>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; CI: G_STORE [[COPY1]](<2 x p0>), [[COPY]](p1) :: (store 16, align 8, addrspace 1) + ; CI: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 
x p0>) + ; CI: G_STORE [[BITCAST]](<4 x s32>), [[COPY]](p1) :: (store 16, align 8, addrspace 1) ; VI-LABEL: name: test_store_global_v2p0_align8 ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; VI: [[COPY1:%[0-9]+]]:_(<2 x p0>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; VI: G_STORE [[COPY1]](<2 x p0>), [[COPY]](p1) :: (store 16, align 8, addrspace 1) + ; VI: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x p0>) + ; VI: G_STORE [[BITCAST]](<4 x s32>), [[COPY]](p1) :: (store 16, align 8, addrspace 1) ; GFX9-LABEL: name: test_store_global_v2p0_align8 ; GFX9: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x p0>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX9: G_STORE [[COPY1]](<2 x p0>), [[COPY]](p1) :: (store 16, align 8, addrspace 1) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x p0>) + ; GFX9: G_STORE [[BITCAST]](<4 x s32>), [[COPY]](p1) :: (store 16, align 8, addrspace 1) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<2 x p0>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 G_STORE %1, %0 :: (store 16, align 8, addrspace 1) @@ -3940,19 +3916,23 @@ body: | ; SI-LABEL: name: test_store_global_v2p0_align16 ; SI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; SI: [[COPY1:%[0-9]+]]:_(<2 x p0>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; SI: G_STORE [[COPY1]](<2 x p0>), [[COPY]](p1) :: (store 16, addrspace 1) + ; SI: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x p0>) + ; SI: G_STORE [[BITCAST]](<4 x s32>), [[COPY]](p1) :: (store 16, addrspace 1) ; CI-LABEL: name: test_store_global_v2p0_align16 ; CI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; CI: [[COPY1:%[0-9]+]]:_(<2 x p0>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; CI: G_STORE [[COPY1]](<2 x p0>), [[COPY]](p1) :: (store 16, addrspace 1) + ; CI: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x p0>) + ; CI: G_STORE [[BITCAST]](<4 x s32>), [[COPY]](p1) :: (store 16, addrspace 1) ; VI-LABEL: name: test_store_global_v2p0_align16 ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; VI: 
[[COPY1:%[0-9]+]]:_(<2 x p0>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; VI: G_STORE [[COPY1]](<2 x p0>), [[COPY]](p1) :: (store 16, addrspace 1) + ; VI: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x p0>) + ; VI: G_STORE [[BITCAST]](<4 x s32>), [[COPY]](p1) :: (store 16, addrspace 1) ; GFX9-LABEL: name: test_store_global_v2p0_align16 ; GFX9: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x p0>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX9: G_STORE [[COPY1]](<2 x p0>), [[COPY]](p1) :: (store 16, addrspace 1) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x p0>) + ; GFX9: G_STORE [[BITCAST]](<4 x s32>), [[COPY]](p1) :: (store 16, addrspace 1) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<2 x p0>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 G_STORE %1, %0 :: (store 16, align 16, addrspace 1) @@ -5126,17 +5106,17 @@ body: | ; SI-LABEL: name: test_store_global_v5p3_align1 ; SI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; SI: [[COPY1:%[0-9]+]]:_(<5 x p3>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 - ; SI: [[EXTRACT:%[0-9]+]]:_(<4 x p3>) = G_EXTRACT [[COPY1]](<5 x p3>), 0 - ; SI: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[COPY1]](<5 x p3>), 128 - ; SI: [[UV:%[0-9]+]]:_(p3), [[UV1:%[0-9]+]]:_(p3), [[UV2:%[0-9]+]]:_(p3), [[UV3:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[EXTRACT]](<4 x p3>) - ; SI: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV]](p3) + ; SI: [[BITCAST:%[0-9]+]]:_(<5 x s32>) = G_BITCAST [[COPY1]](<5 x p3>) + ; SI: [[EXTRACT:%[0-9]+]]:_(<4 x s32>) = G_EXTRACT [[BITCAST]](<5 x s32>), 0 + ; SI: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST]](<5 x s32>), 128 + ; SI: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[EXTRACT]](<4 x s32>) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT]], [[C]](s32) + ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32) ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; SI: 
[[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT]], [[C1]](s32) + ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C1]](s32) ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 - ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT]], [[C2]](s32) - ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[PTRTOINT]](s32) + ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C2]](s32) + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV]](s32) ; SI: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store 1, addrspace 1) ; SI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; SI: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C3]](s64) @@ -5152,11 +5132,10 @@ body: | ; SI: G_STORE [[COPY5]](s32), [[PTR_ADD2]](p1) :: (store 1 + 3, addrspace 1) ; SI: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; SI: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; SI: [[PTRTOINT1:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV1]](p3) - ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT1]], [[C]](s32) - ; SI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT1]], [[C1]](s32) - ; SI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT1]], [[C2]](s32) - ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[PTRTOINT1]](s32) + ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) + ; SI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C1]](s32) + ; SI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C2]](s32) + ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) ; SI: G_STORE [[COPY6]](s32), [[PTR_ADD3]](p1) :: (store 1 + 4, addrspace 1) ; SI: [[PTR_ADD4:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD3]], [[C3]](s64) ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) @@ -5169,11 +5148,10 @@ body: | ; SI: G_STORE [[COPY9]](s32), [[PTR_ADD6]](p1) :: (store 1 + 7, addrspace 1) ; SI: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; SI: [[PTR_ADD7:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C7]](s64) - ; SI: [[PTRTOINT2:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV2]](p3) - ; SI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT2]], [[C]](s32) - ; SI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT2]], 
[[C1]](s32) - ; SI: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT2]], [[C2]](s32) - ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[PTRTOINT2]](s32) + ; SI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C]](s32) + ; SI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C1]](s32) + ; SI: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C2]](s32) + ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) ; SI: G_STORE [[COPY10]](s32), [[PTR_ADD7]](p1) :: (store 1 + 8, addrspace 1) ; SI: [[PTR_ADD8:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C3]](s64) ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) @@ -5186,11 +5164,10 @@ body: | ; SI: G_STORE [[COPY13]](s32), [[PTR_ADD10]](p1) :: (store 1 + 11, addrspace 1) ; SI: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 ; SI: [[PTR_ADD11:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C8]](s64) - ; SI: [[PTRTOINT3:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV3]](p3) - ; SI: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT3]], [[C]](s32) - ; SI: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT3]], [[C1]](s32) - ; SI: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT3]], [[C2]](s32) - ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[PTRTOINT3]](s32) + ; SI: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C]](s32) + ; SI: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C1]](s32) + ; SI: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C2]](s32) + ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) ; SI: G_STORE [[COPY14]](s32), [[PTR_ADD11]](p1) :: (store 1 + 12, addrspace 1) ; SI: [[PTR_ADD12:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD11]], [[C3]](s64) ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR9]](s32) @@ -5220,26 +5197,27 @@ body: | ; CI-LABEL: name: test_store_global_v5p3_align1 ; CI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; CI: [[COPY1:%[0-9]+]]:_(<5 x p3>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 - ; CI: [[EXTRACT:%[0-9]+]]:_(<4 x p3>) = G_EXTRACT [[COPY1]](<5 x p3>), 0 - ; CI: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[COPY1]](<5 x p3>), 128 - ; CI: G_STORE [[EXTRACT]](<4 x p3>), [[COPY]](p1) 
:: (store 16, align 1, addrspace 1) + ; CI: [[BITCAST:%[0-9]+]]:_(<5 x s32>) = G_BITCAST [[COPY1]](<5 x p3>) + ; CI: [[EXTRACT:%[0-9]+]]:_(<4 x s32>) = G_EXTRACT [[BITCAST]](<5 x s32>), 0 + ; CI: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST]](<5 x s32>), 128 + ; CI: G_STORE [[EXTRACT]](<4 x s32>), [[COPY]](p1) :: (store 16, align 1, addrspace 1) ; CI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; CI: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) ; CI: G_STORE [[EXTRACT1]](s32), [[PTR_ADD]](p1) :: (store 4 + 16, align 1, addrspace 1) ; VI-LABEL: name: test_store_global_v5p3_align1 ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; VI: [[COPY1:%[0-9]+]]:_(<5 x p3>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 - ; VI: [[EXTRACT:%[0-9]+]]:_(<4 x p3>) = G_EXTRACT [[COPY1]](<5 x p3>), 0 - ; VI: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[COPY1]](<5 x p3>), 128 - ; VI: [[UV:%[0-9]+]]:_(p3), [[UV1:%[0-9]+]]:_(p3), [[UV2:%[0-9]+]]:_(p3), [[UV3:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[EXTRACT]](<4 x p3>) - ; VI: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV]](p3) + ; VI: [[BITCAST:%[0-9]+]]:_(<5 x s32>) = G_BITCAST [[COPY1]](<5 x p3>) + ; VI: [[EXTRACT:%[0-9]+]]:_(<4 x s32>) = G_EXTRACT [[BITCAST]](<5 x s32>), 0 + ; VI: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST]](<5 x s32>), 128 + ; VI: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[EXTRACT]](<4 x s32>) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT]], [[C]](s32) + ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32) ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT]], [[C1]](s32) + ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C1]](s32) ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 - ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT]], [[C2]](s32) - ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[PTRTOINT]](s32) + ; VI: 
[[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C2]](s32) + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV]](s32) ; VI: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store 1, addrspace 1) ; VI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; VI: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C3]](s64) @@ -5255,11 +5233,10 @@ body: | ; VI: G_STORE [[COPY5]](s32), [[PTR_ADD2]](p1) :: (store 1 + 3, addrspace 1) ; VI: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; VI: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; VI: [[PTRTOINT1:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV1]](p3) - ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT1]], [[C]](s32) - ; VI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT1]], [[C1]](s32) - ; VI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT1]], [[C2]](s32) - ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[PTRTOINT1]](s32) + ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) + ; VI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C1]](s32) + ; VI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C2]](s32) + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) ; VI: G_STORE [[COPY6]](s32), [[PTR_ADD3]](p1) :: (store 1 + 4, addrspace 1) ; VI: [[PTR_ADD4:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD3]], [[C3]](s64) ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) @@ -5272,11 +5249,10 @@ body: | ; VI: G_STORE [[COPY9]](s32), [[PTR_ADD6]](p1) :: (store 1 + 7, addrspace 1) ; VI: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; VI: [[PTR_ADD7:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C7]](s64) - ; VI: [[PTRTOINT2:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV2]](p3) - ; VI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT2]], [[C]](s32) - ; VI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT2]], [[C1]](s32) - ; VI: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT2]], [[C2]](s32) - ; VI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[PTRTOINT2]](s32) + ; VI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C]](s32) + ; VI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C1]](s32) + ; VI: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR 
[[UV2]], [[C2]](s32) + ; VI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) ; VI: G_STORE [[COPY10]](s32), [[PTR_ADD7]](p1) :: (store 1 + 8, addrspace 1) ; VI: [[PTR_ADD8:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD7]], [[C3]](s64) ; VI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) @@ -5289,11 +5265,10 @@ body: | ; VI: G_STORE [[COPY13]](s32), [[PTR_ADD10]](p1) :: (store 1 + 11, addrspace 1) ; VI: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 ; VI: [[PTR_ADD11:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C8]](s64) - ; VI: [[PTRTOINT3:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV3]](p3) - ; VI: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT3]], [[C]](s32) - ; VI: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT3]], [[C1]](s32) - ; VI: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT3]], [[C2]](s32) - ; VI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[PTRTOINT3]](s32) + ; VI: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C]](s32) + ; VI: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C1]](s32) + ; VI: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C2]](s32) + ; VI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) ; VI: G_STORE [[COPY14]](s32), [[PTR_ADD11]](p1) :: (store 1 + 12, addrspace 1) ; VI: [[PTR_ADD12:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD11]], [[C3]](s64) ; VI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR9]](s32) @@ -5323,9 +5298,10 @@ body: | ; GFX9-LABEL: name: test_store_global_v5p3_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:_(<5 x p3>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 - ; GFX9: [[EXTRACT:%[0-9]+]]:_(<4 x p3>) = G_EXTRACT [[COPY1]](<5 x p3>), 0 - ; GFX9: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[COPY1]](<5 x p3>), 128 - ; GFX9: G_STORE [[EXTRACT]](<4 x p3>), [[COPY]](p1) :: (store 16, align 1, addrspace 1) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<5 x s32>) = G_BITCAST [[COPY1]](<5 x p3>) + ; GFX9: [[EXTRACT:%[0-9]+]]:_(<4 x s32>) = G_EXTRACT [[BITCAST]](<5 x s32>), 0 + ; GFX9: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST]](<5 x s32>), 128 + ; GFX9: G_STORE 
[[EXTRACT]](<4 x s32>), [[COPY]](p1) :: (store 16, align 1, addrspace 1) ; GFX9: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) ; GFX9: G_STORE [[EXTRACT1]](s32), [[PTR_ADD]](p1) :: (store 4 + 16, align 1, addrspace 1) @@ -5343,14 +5319,14 @@ body: | ; SI-LABEL: name: test_store_global_v5p3_align2 ; SI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; SI: [[COPY1:%[0-9]+]]:_(<5 x p3>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 - ; SI: [[EXTRACT:%[0-9]+]]:_(<4 x p3>) = G_EXTRACT [[COPY1]](<5 x p3>), 0 - ; SI: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[COPY1]](<5 x p3>), 128 - ; SI: [[UV:%[0-9]+]]:_(<2 x p3>), [[UV1:%[0-9]+]]:_(<2 x p3>) = G_UNMERGE_VALUES [[EXTRACT]](<4 x p3>) - ; SI: [[UV2:%[0-9]+]]:_(p3), [[UV3:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[UV]](<2 x p3>) - ; SI: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV2]](p3) + ; SI: [[BITCAST:%[0-9]+]]:_(<5 x s32>) = G_BITCAST [[COPY1]](<5 x p3>) + ; SI: [[EXTRACT:%[0-9]+]]:_(<4 x s32>) = G_EXTRACT [[BITCAST]](<5 x s32>), 0 + ; SI: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST]](<5 x s32>), 128 + ; SI: [[UV:%[0-9]+]]:_(<2 x s32>), [[UV1:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[EXTRACT]](<4 x s32>) + ; SI: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](<2 x s32>) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT]], [[C]](s32) - ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[PTRTOINT]](s32) + ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C]](s32) + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) ; SI: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store 2, addrspace 1) ; SI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; SI: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) @@ -5358,27 +5334,24 @@ body: | ; SI: G_STORE [[COPY3]](s32), [[PTR_ADD]](p1) :: (store 2 + 2, addrspace 1) ; SI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; SI: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], 
[[C2]](s64) - ; SI: [[PTRTOINT1:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV3]](p3) - ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT1]], [[C]](s32) - ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[PTRTOINT1]](s32) + ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C]](s32) + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) ; SI: G_STORE [[COPY4]](s32), [[PTR_ADD1]](p1) :: (store 2 + 4, addrspace 1) ; SI: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD1]], [[C1]](s64) ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; SI: G_STORE [[COPY5]](s32), [[PTR_ADD2]](p1) :: (store 2 + 6, addrspace 1) ; SI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; SI: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; SI: [[UV4:%[0-9]+]]:_(p3), [[UV5:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[UV1]](<2 x p3>) - ; SI: [[PTRTOINT2:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV4]](p3) - ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT2]], [[C]](s32) - ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[PTRTOINT2]](s32) + ; SI: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](<2 x s32>) + ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV4]], [[C]](s32) + ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[UV4]](s32) ; SI: G_STORE [[COPY6]](s32), [[PTR_ADD3]](p1) :: (store 2 + 8, addrspace 1) ; SI: [[PTR_ADD4:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s64) ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; SI: G_STORE [[COPY7]](s32), [[PTR_ADD4]](p1) :: (store 2 + 10, addrspace 1) ; SI: [[PTR_ADD5:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; SI: [[PTRTOINT3:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV5]](p3) - ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT3]], [[C]](s32) - ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[PTRTOINT3]](s32) + ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C]](s32) + ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[UV5]](s32) ; SI: G_STORE [[COPY8]](s32), [[PTR_ADD5]](p1) :: (store 2 + 12, addrspace 1) ; SI: [[PTR_ADD6:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD5]], [[C1]](s64) ; 
SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) @@ -5394,23 +5367,24 @@ body: | ; CI-LABEL: name: test_store_global_v5p3_align2 ; CI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; CI: [[COPY1:%[0-9]+]]:_(<5 x p3>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 - ; CI: [[EXTRACT:%[0-9]+]]:_(<4 x p3>) = G_EXTRACT [[COPY1]](<5 x p3>), 0 - ; CI: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[COPY1]](<5 x p3>), 128 - ; CI: G_STORE [[EXTRACT]](<4 x p3>), [[COPY]](p1) :: (store 16, align 2, addrspace 1) + ; CI: [[BITCAST:%[0-9]+]]:_(<5 x s32>) = G_BITCAST [[COPY1]](<5 x p3>) + ; CI: [[EXTRACT:%[0-9]+]]:_(<4 x s32>) = G_EXTRACT [[BITCAST]](<5 x s32>), 0 + ; CI: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST]](<5 x s32>), 128 + ; CI: G_STORE [[EXTRACT]](<4 x s32>), [[COPY]](p1) :: (store 16, align 2, addrspace 1) ; CI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; CI: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) ; CI: G_STORE [[EXTRACT1]](s32), [[PTR_ADD]](p1) :: (store 4 + 16, align 2, addrspace 1) ; VI-LABEL: name: test_store_global_v5p3_align2 ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; VI: [[COPY1:%[0-9]+]]:_(<5 x p3>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 - ; VI: [[EXTRACT:%[0-9]+]]:_(<4 x p3>) = G_EXTRACT [[COPY1]](<5 x p3>), 0 - ; VI: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[COPY1]](<5 x p3>), 128 - ; VI: [[UV:%[0-9]+]]:_(<2 x p3>), [[UV1:%[0-9]+]]:_(<2 x p3>) = G_UNMERGE_VALUES [[EXTRACT]](<4 x p3>) - ; VI: [[UV2:%[0-9]+]]:_(p3), [[UV3:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[UV]](<2 x p3>) - ; VI: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV2]](p3) + ; VI: [[BITCAST:%[0-9]+]]:_(<5 x s32>) = G_BITCAST [[COPY1]](<5 x p3>) + ; VI: [[EXTRACT:%[0-9]+]]:_(<4 x s32>) = G_EXTRACT [[BITCAST]](<5 x s32>), 0 + ; VI: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST]](<5 x s32>), 128 + ; VI: [[UV:%[0-9]+]]:_(<2 x s32>), [[UV1:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[EXTRACT]](<4 x s32>) + ; VI: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES 
[[UV]](<2 x s32>) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT]], [[C]](s32) - ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[PTRTOINT]](s32) + ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C]](s32) + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) ; VI: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store 2, addrspace 1) ; VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; VI: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) @@ -5418,27 +5392,24 @@ body: | ; VI: G_STORE [[COPY3]](s32), [[PTR_ADD]](p1) :: (store 2 + 2, addrspace 1) ; VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; VI: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; VI: [[PTRTOINT1:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV3]](p3) - ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT1]], [[C]](s32) - ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[PTRTOINT1]](s32) + ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C]](s32) + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) ; VI: G_STORE [[COPY4]](s32), [[PTR_ADD1]](p1) :: (store 2 + 4, addrspace 1) ; VI: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD1]], [[C1]](s64) ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; VI: G_STORE [[COPY5]](s32), [[PTR_ADD2]](p1) :: (store 2 + 6, addrspace 1) ; VI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; VI: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; VI: [[UV4:%[0-9]+]]:_(p3), [[UV5:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[UV1]](<2 x p3>) - ; VI: [[PTRTOINT2:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV4]](p3) - ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT2]], [[C]](s32) - ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[PTRTOINT2]](s32) + ; VI: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](<2 x s32>) + ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV4]], [[C]](s32) + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[UV4]](s32) ; VI: G_STORE [[COPY6]](s32), [[PTR_ADD3]](p1) :: (store 2 + 8, addrspace 1) ; VI: [[PTR_ADD4:%[0-9]+]]:_(p1) 
= G_PTR_ADD [[PTR_ADD3]], [[C1]](s64) ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; VI: G_STORE [[COPY7]](s32), [[PTR_ADD4]](p1) :: (store 2 + 10, addrspace 1) ; VI: [[PTR_ADD5:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; VI: [[PTRTOINT3:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV5]](p3) - ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT3]], [[C]](s32) - ; VI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[PTRTOINT3]](s32) + ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C]](s32) + ; VI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[UV5]](s32) ; VI: G_STORE [[COPY8]](s32), [[PTR_ADD5]](p1) :: (store 2 + 12, addrspace 1) ; VI: [[PTR_ADD6:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD5]], [[C1]](s64) ; VI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) @@ -5454,9 +5425,10 @@ body: | ; GFX9-LABEL: name: test_store_global_v5p3_align2 ; GFX9: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:_(<5 x p3>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 - ; GFX9: [[EXTRACT:%[0-9]+]]:_(<4 x p3>) = G_EXTRACT [[COPY1]](<5 x p3>), 0 - ; GFX9: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[COPY1]](<5 x p3>), 128 - ; GFX9: G_STORE [[EXTRACT]](<4 x p3>), [[COPY]](p1) :: (store 16, align 2, addrspace 1) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<5 x s32>) = G_BITCAST [[COPY1]](<5 x p3>) + ; GFX9: [[EXTRACT:%[0-9]+]]:_(<4 x s32>) = G_EXTRACT [[BITCAST]](<5 x s32>), 0 + ; GFX9: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST]](<5 x s32>), 128 + ; GFX9: G_STORE [[EXTRACT]](<4 x s32>), [[COPY]](p1) :: (store 16, align 2, addrspace 1) ; GFX9: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) ; GFX9: G_STORE [[EXTRACT1]](s32), [[PTR_ADD]](p1) :: (store 4 + 16, align 2, addrspace 1) @@ -5474,36 +5446,40 @@ body: | ; SI-LABEL: name: test_store_global_v5p3_align4 ; SI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; SI: [[COPY1:%[0-9]+]]:_(<5 x p3>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 - ; SI: [[EXTRACT:%[0-9]+]]:_(<4 x p3>) = G_EXTRACT [[COPY1]](<5 x 
p3>), 0 - ; SI: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[COPY1]](<5 x p3>), 128 - ; SI: G_STORE [[EXTRACT]](<4 x p3>), [[COPY]](p1) :: (store 16, align 4, addrspace 1) + ; SI: [[BITCAST:%[0-9]+]]:_(<5 x s32>) = G_BITCAST [[COPY1]](<5 x p3>) + ; SI: [[EXTRACT:%[0-9]+]]:_(<4 x s32>) = G_EXTRACT [[BITCAST]](<5 x s32>), 0 + ; SI: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST]](<5 x s32>), 128 + ; SI: G_STORE [[EXTRACT]](<4 x s32>), [[COPY]](p1) :: (store 16, align 4, addrspace 1) ; SI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; SI: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) ; SI: G_STORE [[EXTRACT1]](s32), [[PTR_ADD]](p1) :: (store 4 + 16, addrspace 1) ; CI-LABEL: name: test_store_global_v5p3_align4 ; CI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; CI: [[COPY1:%[0-9]+]]:_(<5 x p3>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 - ; CI: [[EXTRACT:%[0-9]+]]:_(<4 x p3>) = G_EXTRACT [[COPY1]](<5 x p3>), 0 - ; CI: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[COPY1]](<5 x p3>), 128 - ; CI: G_STORE [[EXTRACT]](<4 x p3>), [[COPY]](p1) :: (store 16, align 4, addrspace 1) + ; CI: [[BITCAST:%[0-9]+]]:_(<5 x s32>) = G_BITCAST [[COPY1]](<5 x p3>) + ; CI: [[EXTRACT:%[0-9]+]]:_(<4 x s32>) = G_EXTRACT [[BITCAST]](<5 x s32>), 0 + ; CI: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST]](<5 x s32>), 128 + ; CI: G_STORE [[EXTRACT]](<4 x s32>), [[COPY]](p1) :: (store 16, align 4, addrspace 1) ; CI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; CI: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) ; CI: G_STORE [[EXTRACT1]](s32), [[PTR_ADD]](p1) :: (store 4 + 16, addrspace 1) ; VI-LABEL: name: test_store_global_v5p3_align4 ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; VI: [[COPY1:%[0-9]+]]:_(<5 x p3>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 - ; VI: [[EXTRACT:%[0-9]+]]:_(<4 x p3>) = G_EXTRACT [[COPY1]](<5 x p3>), 0 - ; VI: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[COPY1]](<5 x p3>), 128 - ; VI: G_STORE [[EXTRACT]](<4 x p3>), [[COPY]](p1) :: (store 16, align 4, addrspace 
1) + ; VI: [[BITCAST:%[0-9]+]]:_(<5 x s32>) = G_BITCAST [[COPY1]](<5 x p3>) + ; VI: [[EXTRACT:%[0-9]+]]:_(<4 x s32>) = G_EXTRACT [[BITCAST]](<5 x s32>), 0 + ; VI: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST]](<5 x s32>), 128 + ; VI: G_STORE [[EXTRACT]](<4 x s32>), [[COPY]](p1) :: (store 16, align 4, addrspace 1) ; VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; VI: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) ; VI: G_STORE [[EXTRACT1]](s32), [[PTR_ADD]](p1) :: (store 4 + 16, addrspace 1) ; GFX9-LABEL: name: test_store_global_v5p3_align4 ; GFX9: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:_(<5 x p3>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 - ; GFX9: [[EXTRACT:%[0-9]+]]:_(<4 x p3>) = G_EXTRACT [[COPY1]](<5 x p3>), 0 - ; GFX9: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[COPY1]](<5 x p3>), 128 - ; GFX9: G_STORE [[EXTRACT]](<4 x p3>), [[COPY]](p1) :: (store 16, align 4, addrspace 1) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<5 x s32>) = G_BITCAST [[COPY1]](<5 x p3>) + ; GFX9: [[EXTRACT:%[0-9]+]]:_(<4 x s32>) = G_EXTRACT [[BITCAST]](<5 x s32>), 0 + ; GFX9: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST]](<5 x s32>), 128 + ; GFX9: G_STORE [[EXTRACT]](<4 x s32>), [[COPY]](p1) :: (store 16, align 4, addrspace 1) ; GFX9: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) ; GFX9: G_STORE [[EXTRACT1]](s32), [[PTR_ADD]](p1) :: (store 4 + 16, addrspace 1) @@ -5521,36 +5497,40 @@ body: | ; SI-LABEL: name: test_store_global_v5p3_align8 ; SI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; SI: [[COPY1:%[0-9]+]]:_(<5 x p3>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 - ; SI: [[EXTRACT:%[0-9]+]]:_(<4 x p3>) = G_EXTRACT [[COPY1]](<5 x p3>), 0 - ; SI: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[COPY1]](<5 x p3>), 128 - ; SI: G_STORE [[EXTRACT]](<4 x p3>), [[COPY]](p1) :: (store 16, align 8, addrspace 1) + ; SI: [[BITCAST:%[0-9]+]]:_(<5 x s32>) = G_BITCAST [[COPY1]](<5 x p3>) + ; SI: [[EXTRACT:%[0-9]+]]:_(<4 
x s32>) = G_EXTRACT [[BITCAST]](<5 x s32>), 0 + ; SI: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST]](<5 x s32>), 128 + ; SI: G_STORE [[EXTRACT]](<4 x s32>), [[COPY]](p1) :: (store 16, align 8, addrspace 1) ; SI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; SI: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) ; SI: G_STORE [[EXTRACT1]](s32), [[PTR_ADD]](p1) :: (store 4 + 16, align 8, addrspace 1) ; CI-LABEL: name: test_store_global_v5p3_align8 ; CI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; CI: [[COPY1:%[0-9]+]]:_(<5 x p3>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 - ; CI: [[EXTRACT:%[0-9]+]]:_(<4 x p3>) = G_EXTRACT [[COPY1]](<5 x p3>), 0 - ; CI: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[COPY1]](<5 x p3>), 128 - ; CI: G_STORE [[EXTRACT]](<4 x p3>), [[COPY]](p1) :: (store 16, align 8, addrspace 1) + ; CI: [[BITCAST:%[0-9]+]]:_(<5 x s32>) = G_BITCAST [[COPY1]](<5 x p3>) + ; CI: [[EXTRACT:%[0-9]+]]:_(<4 x s32>) = G_EXTRACT [[BITCAST]](<5 x s32>), 0 + ; CI: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST]](<5 x s32>), 128 + ; CI: G_STORE [[EXTRACT]](<4 x s32>), [[COPY]](p1) :: (store 16, align 8, addrspace 1) ; CI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; CI: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) ; CI: G_STORE [[EXTRACT1]](s32), [[PTR_ADD]](p1) :: (store 4 + 16, align 8, addrspace 1) ; VI-LABEL: name: test_store_global_v5p3_align8 ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; VI: [[COPY1:%[0-9]+]]:_(<5 x p3>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 - ; VI: [[EXTRACT:%[0-9]+]]:_(<4 x p3>) = G_EXTRACT [[COPY1]](<5 x p3>), 0 - ; VI: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[COPY1]](<5 x p3>), 128 - ; VI: G_STORE [[EXTRACT]](<4 x p3>), [[COPY]](p1) :: (store 16, align 8, addrspace 1) + ; VI: [[BITCAST:%[0-9]+]]:_(<5 x s32>) = G_BITCAST [[COPY1]](<5 x p3>) + ; VI: [[EXTRACT:%[0-9]+]]:_(<4 x s32>) = G_EXTRACT [[BITCAST]](<5 x s32>), 0 + ; VI: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST]](<5 x s32>), 128 + ; VI: G_STORE 
[[EXTRACT]](<4 x s32>), [[COPY]](p1) :: (store 16, align 8, addrspace 1) ; VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; VI: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) ; VI: G_STORE [[EXTRACT1]](s32), [[PTR_ADD]](p1) :: (store 4 + 16, align 8, addrspace 1) ; GFX9-LABEL: name: test_store_global_v5p3_align8 ; GFX9: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:_(<5 x p3>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 - ; GFX9: [[EXTRACT:%[0-9]+]]:_(<4 x p3>) = G_EXTRACT [[COPY1]](<5 x p3>), 0 - ; GFX9: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[COPY1]](<5 x p3>), 128 - ; GFX9: G_STORE [[EXTRACT]](<4 x p3>), [[COPY]](p1) :: (store 16, align 8, addrspace 1) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<5 x s32>) = G_BITCAST [[COPY1]](<5 x p3>) + ; GFX9: [[EXTRACT:%[0-9]+]]:_(<4 x s32>) = G_EXTRACT [[BITCAST]](<5 x s32>), 0 + ; GFX9: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST]](<5 x s32>), 128 + ; GFX9: G_STORE [[EXTRACT]](<4 x s32>), [[COPY]](p1) :: (store 16, align 8, addrspace 1) ; GFX9: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) ; GFX9: G_STORE [[EXTRACT1]](s32), [[PTR_ADD]](p1) :: (store 4 + 16, align 8, addrspace 1) @@ -5568,36 +5548,40 @@ body: | ; SI-LABEL: name: test_store_global_v5p3_align16 ; SI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; SI: [[COPY1:%[0-9]+]]:_(<5 x p3>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 - ; SI: [[EXTRACT:%[0-9]+]]:_(<4 x p3>) = G_EXTRACT [[COPY1]](<5 x p3>), 0 - ; SI: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[COPY1]](<5 x p3>), 128 - ; SI: G_STORE [[EXTRACT]](<4 x p3>), [[COPY]](p1) :: (store 16, addrspace 1) + ; SI: [[BITCAST:%[0-9]+]]:_(<5 x s32>) = G_BITCAST [[COPY1]](<5 x p3>) + ; SI: [[EXTRACT:%[0-9]+]]:_(<4 x s32>) = G_EXTRACT [[BITCAST]](<5 x s32>), 0 + ; SI: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST]](<5 x s32>), 128 + ; SI: G_STORE [[EXTRACT]](<4 x s32>), [[COPY]](p1) :: (store 16, addrspace 1) ; SI: [[C:%[0-9]+]]:_(s64) = 
G_CONSTANT i64 16 ; SI: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) ; SI: G_STORE [[EXTRACT1]](s32), [[PTR_ADD]](p1) :: (store 4 + 16, align 16, addrspace 1) ; CI-LABEL: name: test_store_global_v5p3_align16 ; CI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; CI: [[COPY1:%[0-9]+]]:_(<5 x p3>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 - ; CI: [[EXTRACT:%[0-9]+]]:_(<4 x p3>) = G_EXTRACT [[COPY1]](<5 x p3>), 0 - ; CI: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[COPY1]](<5 x p3>), 128 - ; CI: G_STORE [[EXTRACT]](<4 x p3>), [[COPY]](p1) :: (store 16, addrspace 1) + ; CI: [[BITCAST:%[0-9]+]]:_(<5 x s32>) = G_BITCAST [[COPY1]](<5 x p3>) + ; CI: [[EXTRACT:%[0-9]+]]:_(<4 x s32>) = G_EXTRACT [[BITCAST]](<5 x s32>), 0 + ; CI: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST]](<5 x s32>), 128 + ; CI: G_STORE [[EXTRACT]](<4 x s32>), [[COPY]](p1) :: (store 16, addrspace 1) ; CI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; CI: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) ; CI: G_STORE [[EXTRACT1]](s32), [[PTR_ADD]](p1) :: (store 4 + 16, align 16, addrspace 1) ; VI-LABEL: name: test_store_global_v5p3_align16 ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; VI: [[COPY1:%[0-9]+]]:_(<5 x p3>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 - ; VI: [[EXTRACT:%[0-9]+]]:_(<4 x p3>) = G_EXTRACT [[COPY1]](<5 x p3>), 0 - ; VI: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[COPY1]](<5 x p3>), 128 - ; VI: G_STORE [[EXTRACT]](<4 x p3>), [[COPY]](p1) :: (store 16, addrspace 1) + ; VI: [[BITCAST:%[0-9]+]]:_(<5 x s32>) = G_BITCAST [[COPY1]](<5 x p3>) + ; VI: [[EXTRACT:%[0-9]+]]:_(<4 x s32>) = G_EXTRACT [[BITCAST]](<5 x s32>), 0 + ; VI: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST]](<5 x s32>), 128 + ; VI: G_STORE [[EXTRACT]](<4 x s32>), [[COPY]](p1) :: (store 16, addrspace 1) ; VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; VI: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) ; VI: G_STORE [[EXTRACT1]](s32), [[PTR_ADD]](p1) :: (store 4 + 16, align 16, addrspace 1) ; 
GFX9-LABEL: name: test_store_global_v5p3_align16 ; GFX9: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:_(<5 x p3>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 - ; GFX9: [[EXTRACT:%[0-9]+]]:_(<4 x p3>) = G_EXTRACT [[COPY1]](<5 x p3>), 0 - ; GFX9: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[COPY1]](<5 x p3>), 128 - ; GFX9: G_STORE [[EXTRACT]](<4 x p3>), [[COPY]](p1) :: (store 16, addrspace 1) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<5 x s32>) = G_BITCAST [[COPY1]](<5 x p3>) + ; GFX9: [[EXTRACT:%[0-9]+]]:_(<4 x s32>) = G_EXTRACT [[BITCAST]](<5 x s32>), 0 + ; GFX9: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST]](<5 x s32>), 128 + ; GFX9: G_STORE [[EXTRACT]](<4 x s32>), [[COPY]](p1) :: (store 16, addrspace 1) ; GFX9: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) ; GFX9: G_STORE [[EXTRACT1]](s32), [[PTR_ADD]](p1) :: (store 4 + 16, align 16, addrspace 1) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trunc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trunc.mir index 06aff5e241372..7a2012cf9d27b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trunc.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trunc.mir @@ -211,15 +211,31 @@ body: | name: test_trunc_v4s32_to_v4s1 body: | bb.0: - liveins: $vgpr0_vgpr1_vgpr2_vgpr3 + liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11 ; CHECK-LABEL: name: test_trunc_v4s32_to_v4s1 ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 - ; CHECK: [[TRUNC:%[0-9]+]]:_(<4 x s1>) = G_TRUNC [[COPY]](<4 x s32>) - ; CHECK: S_ENDPGM 0, implicit [[TRUNC]](<4 x s1>) + ; CHECK: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7 + ; CHECK: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $vgpr8_vgpr9_vgpr10_vgpr11 + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s1) = 
G_TRUNC [[UV]](s32) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[UV1]](s32) + ; CHECK: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[UV2]](s32) + ; CHECK: [[TRUNC3:%[0-9]+]]:_(s1) = G_TRUNC [[UV3]](s32) + ; CHECK: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<4 x s32>) + ; CHECK: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<4 x s32>) + ; CHECK: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[TRUNC]](s1), [[UV4]], [[UV8]] + ; CHECK: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[TRUNC1]](s1), [[UV5]], [[UV9]] + ; CHECK: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[TRUNC2]](s1), [[UV6]], [[UV10]] + ; CHECK: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[TRUNC3]](s1), [[UV7]], [[UV11]] + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[SELECT]](s32), [[SELECT1]](s32), [[SELECT2]](s32), [[SELECT3]](s32) + ; CHECK: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<4 x s32>) %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 - %1:_(<4 x s1>) = G_TRUNC %0 - S_ENDPGM 0, implicit %1 + %1:_(<4 x s32>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7 + %2:_(<4 x s32>) = COPY $vgpr8_vgpr9_vgpr10_vgpr11 + %3:_(<4 x s1>) = G_TRUNC %0 + %4:_(<4 x s32>) = G_SELECT %3, %1, %2 + S_ENDPGM 0, implicit %4 ... 
--- @@ -375,12 +391,14 @@ body: | ; CHECK-LABEL: name: test_trunc_v2s96_to_v2s8 ; CHECK: [[COPY:%[0-9]+]]:_(s96) = COPY $vgpr0_vgpr1_vgpr2 ; CHECK: [[COPY1:%[0-9]+]]:_(s96) = COPY $vgpr3_vgpr4_vgpr5 - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s96>) = G_BUILD_VECTOR [[COPY]](s96), [[COPY1]](s96) - ; CHECK: [[TRUNC:%[0-9]+]]:_(<2 x s8>) = G_TRUNC [[BUILD_VECTOR]](<2 x s96>) - ; CHECK: S_ENDPGM 0, implicit [[TRUNC]](<2 x s8>) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s96) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s96) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[TRUNC]](s32), [[TRUNC1]](s32) + ; CHECK: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<2 x s32>) %0:_(s96) = COPY $vgpr0_vgpr1_vgpr2 %1:_(s96) = COPY $vgpr3_vgpr4_vgpr5 %2:_(<2 x s96>) = G_BUILD_VECTOR %0, %1 %3:_(<2 x s8>) = G_TRUNC %2 - S_ENDPGM 0, implicit %3 + %4:_(<2 x s32>) = G_ANYEXT %3 + S_ENDPGM 0, implicit %4 ... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddo.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddo.mir index cfab47dc11a27..fcc49853b8672 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddo.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddo.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck %s --- name: test_uaddo_s32 @@ -169,31 +169,30 @@ body: | name: test_uaddo_v3s16 body: | bb.0: - liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - + liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5 ; CHECK-LABEL: name: test_uaddo_v3s16 - ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 - ; CHECK: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr1_vgpr2 - ; CHECK: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY]](<4 x s16>), 0 - ; CHECK: [[EXTRACT1:%[0-9]+]]:_(<3 x 
s16>) = G_EXTRACT [[COPY]](<4 x s16>), 0 + ; CHECK: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; CHECK: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; CHECK: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; CHECK: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; CHECK: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT1]](<3 x s16>), 0 - ; CHECK: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; CHECK: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; CHECK: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; CHECK: [[BITCAST2:%[0-9]+]]:_(s32) = 
G_BITCAST [[UV6]](<2 x s16>) ; CHECK: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; CHECK: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) - ; CHECK: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; CHECK: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) ; CHECK: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; CHECK: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[TRUNC]], [[TRUNC3]] @@ -211,38 +210,80 @@ body: | ; CHECK: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; CHECK: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[DEF1]](<2 x s16>) - ; CHECK: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 - ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT1]](<3 x s16>), 0 - ; CHECK: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) - ; CHECK: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) + ; CHECK: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 + ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; CHECK: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; CHECK: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) + ; CHECK: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST6]](s32) ; CHECK: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) - ; CHECK: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) + ; CHECK: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32) + ; CHECK: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) + ; CHECK: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST7]](s32) ; CHECK: [[LSHR5:%[0-9]+]]:_(s32) 
= G_LSHR [[BITCAST7]], [[C]](s32) + ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[ADD]](s16), [[TRUNC6]] + ; CHECK: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[ADD1]](s16), [[TRUNC7]] + ; CHECK: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[ADD2]](s16), [[TRUNC8]] + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP]](s1) + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP1]](s1) + ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP2]](s1) + ; CHECK: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; CHECK: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; CHECK: [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>), [[UV12:%[0-9]+]]:_(<3 x s16>), [[UV13:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; CHECK: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; CHECK: [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT3]](<4 x s16>) + ; CHECK: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV14]](<2 x s16>) + ; CHECK: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32) + ; CHECK: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[UV15]](<2 x s16>) + ; CHECK: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST9]], [[C]](s32) + ; CHECK: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV10]](<3 x s16>), 0 + ; CHECK: [[UV16:%[0-9]+]]:_(<2 x s16>), [[UV17:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT4]](<4 x s16>) + ; CHECK: [[BITCAST10:%[0-9]+]]:_(s32) = G_BITCAST [[UV16]](<2 x s16>) + ; CHECK: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST10]], [[C]](s32) + ; CHECK: [[BITCAST11:%[0-9]+]]:_(s32) = G_BITCAST [[UV17]](<2 x s16>) + ; CHECK: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST11]], [[C]](s32) ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST8]](s32) ; 
CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] - ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] - ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL2]] + ; CHECK: [[BITCAST12:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST9]](s32) ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]] - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[AND]](s32), [[AND1]](s32), [[AND2]](s32) - ; CHECK: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT2]](<3 x s16>), 0 - ; CHECK: [[DEF2:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[INSERT4:%[0-9]+]]:_(<4 x s32>) = G_INSERT [[DEF2]], [[BUILD_VECTOR]](<3 x s32>), 0 - ; CHECK: $vgpr0_vgpr1 = COPY [[INSERT3]](<4 x s16>) - ; CHECK: $vgpr2_vgpr3_vgpr4_vgpr5 = COPY [[INSERT4]](<4 x s32>) - %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 - %1:_(<4 x s16>) = COPY $vgpr1_vgpr2 - %2:_(<3 x s16>) = G_EXTRACT %0(<4 x s16>), 0 - %3:_(<3 x s16>) = G_EXTRACT %0(<4 x s16>), 0 - %4:_(<3 x s16>), %5:_(<3 x s1>) = G_UADDO %2, %3 - %6:_(<3 x s32>) = G_ZEXT %3 - %7:_(<4 x s16>) = G_IMPLICIT_DEF - %8:_(<4 x s16>) = G_INSERT %7, %4(<3 x s16>), 0 - %9:_(<4 x s32>) = G_IMPLICIT_DEF - %10:_(<4 x s32>) = G_INSERT %9, %6(<3 x s32>), 0 - $vgpr0_vgpr1 = COPY %8 - $vgpr2_vgpr3_vgpr4_vgpr5 = COPY %10 + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST10]](s32) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C2]] + ; CHECK: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) + ; CHECK: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL3]] + ; CHECK: [[BITCAST13:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR8]](s32) + ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY6]], 
[[C2]] + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST11]](s32) + ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]] + ; CHECK: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C]](s32) + ; CHECK: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL4]] + ; CHECK: [[BITCAST14:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; CHECK: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST12]](<2 x s16>), [[BITCAST13]](<2 x s16>), [[BITCAST14]](<2 x s16>) + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY [[ANYEXT]](s32) + ; CHECK: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY [[ANYEXT1]](s32) + ; CHECK: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; CHECK: [[COPY10:%[0-9]+]]:_(s32) = COPY [[ANYEXT2]](s32) + ; CHECK: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[AND6]](s32), [[AND7]](s32), [[AND8]](s32) + ; CHECK: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) + ; CHECK: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) + %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + %1:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + %2:_(<3 x s16>), %3:_(<3 x s16>) = G_UNMERGE_VALUES %0 + %4:_(<3 x s16>), %5:_(<3 x s16>) = G_UNMERGE_VALUES %1 + %6:_(<3 x s16>), %7:_(<3 x s1>) = G_UADDO %2, %4 + %8:_(<3 x s16>) = G_IMPLICIT_DEF + %9:_(<6 x s16>) = G_CONCAT_VECTORS %6, %8 + %10:_(<3 x s32>) = G_ZEXT %7 + $vgpr0_vgpr1_vgpr2 = COPY %9 + $vgpr0_vgpr1_vgpr2 = COPY %10 ... 
--- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umax.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umax.mir index a7123b7315812..d374792f9b1c1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umax.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umax.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=SI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefix=VI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=SI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_umax_s32 @@ -402,7 +402,17 @@ body: | ; SI: [[DEF3:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; SI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[DEF3]](<2 x s16>) ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<6 x s16>), 0 - ; SI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; SI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; SI: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; SI: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; SI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; SI: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; SI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; SI: 
[[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; SI: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) ; VI-LABEL: name: test_umax_v3s16 ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF @@ -446,7 +456,17 @@ body: | ; VI: [[DEF3:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[DEF3]](<2 x s16>) ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<6 x s16>), 0 - ; VI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; VI: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; VI: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; VI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; VI: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; VI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; VI: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) ; GFX9-LABEL: name: test_umax_v3s16 ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF @@ -464,11 +484,23 @@ body: | ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[UMAX]](<2 x s16>), [[UMAX1]](<2 x s16>) ; GFX9: [[CONCAT_VECTORS3:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS 
[[CONCAT_VECTORS2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) ; GFX9: [[UV12:%[0-9]+]]:_(<3 x s16>), [[UV13:%[0-9]+]]:_(<3 x s16>), [[UV14:%[0-9]+]]:_(<3 x s16>), [[UV15:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<12 x s16>) - ; GFX9: S_NOP 0, implicit [[UV12]](<3 x s16>) + ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV12]](<3 x s16>), 0 + ; GFX9: [[UV16:%[0-9]+]]:_(<2 x s16>), [[UV17:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV16]](<2 x s16>) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV17]](<2 x s16>) + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; GFX9: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = G_IMPLICIT_DEF %2:_(<3 x s16>) = G_UMAX %0, %1 - S_NOP 0, implicit %2 + %3:_(<3 x s32>) = G_ANYEXT %2 + S_NOP 0, implicit %3 ... 
--- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umin.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umin.mir index 84b0975e8e992..111b49d7b0afc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umin.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umin.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=SI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefix=VI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=SI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_umin_s32 @@ -402,7 +402,17 @@ body: | ; SI: [[DEF3:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; SI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[DEF3]](<2 x s16>) ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<6 x s16>), 0 - ; SI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; SI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; SI: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; SI: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; SI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; SI: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; SI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; SI: 
[[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; SI: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) ; VI-LABEL: name: test_umin_v3s16 ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF @@ -446,7 +456,17 @@ body: | ; VI: [[DEF3:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[DEF3]](<2 x s16>) ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<6 x s16>), 0 - ; VI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) + ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; VI: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; VI: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; VI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; VI: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; VI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; VI: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) ; GFX9-LABEL: name: test_umin_v3s16 ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF @@ -464,11 +484,23 @@ body: | ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[UMIN]](<2 x s16>), [[UMIN1]](<2 x s16>) ; GFX9: [[CONCAT_VECTORS3:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS 
[[CONCAT_VECTORS2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) ; GFX9: [[UV12:%[0-9]+]]:_(<3 x s16>), [[UV13:%[0-9]+]]:_(<3 x s16>), [[UV14:%[0-9]+]]:_(<3 x s16>), [[UV15:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<12 x s16>) - ; GFX9: S_NOP 0, implicit [[UV12]](<3 x s16>) + ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV12]](<3 x s16>), 0 + ; GFX9: [[UV16:%[0-9]+]]:_(<2 x s16>), [[UV17:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV16]](<2 x s16>) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV17]](<2 x s16>) + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; GFX9: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = G_IMPLICIT_DEF %2:_(<3 x s16>) = G_UMIN %0, %1 - S_NOP 0, implicit %2 + %3:_(<3 x s32>) = G_ANYEXT %2 + S_NOP 0, implicit %3 ... 
--- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir index c57bb52f1825b..473e0a3cf33cf 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-- -O0 -run-pass=legalizer -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-- -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck %s --- name: test_unmerge_s32_s64 @@ -1151,3 +1151,150 @@ body: | $vgpr9_vgpr10_vgpr11 = COPY %6 ... + +--- +name: test_unmerge_v3s8_v12s8 +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2 + + ; CHECK-LABEL: name: test_unmerge_v3s8_v12s8 + ; CHECK: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C1]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C2]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32) + ; CHECK: [[TRUNC:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[BUILD_VECTOR]](<4 x s32>) + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) + ; CHECK: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C1]](s32) + ; CHECK: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C2]](s32) 
+ ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR5]](s32) + ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[BUILD_VECTOR1]](<4 x s32>) + ; CHECK: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C]](s32) + ; CHECK: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C1]](s32) + ; CHECK: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C2]](s32) + ; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; CHECK: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; CHECK: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR7]](s32) + ; CHECK: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR8]](s32) + ; CHECK: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32) + ; CHECK: [[TRUNC2:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[BUILD_VECTOR2]](<4 x s32>) + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s8>) = G_CONCAT_VECTORS [[TRUNC]](<4 x s8>), [[TRUNC1]](<4 x s8>), [[TRUNC2]](<4 x s8>) + ; CHECK: [[UV3:%[0-9]+]]:_(<3 x s8>), [[UV4:%[0-9]+]]:_(<3 x s8>), [[UV5:%[0-9]+]]:_(<3 x s8>), [[UV6:%[0-9]+]]:_(<3 x s8>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s8>) + ; CHECK: [[UV7:%[0-9]+]]:_(s8), [[UV8:%[0-9]+]]:_(s8), [[UV9:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[UV3]](<3 x s8>) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s8) + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s8) + ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV9]](s8) + ; CHECK: [[BUILD_VECTOR3:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32) + ; CHECK: [[UV10:%[0-9]+]]:_(s8), [[UV11:%[0-9]+]]:_(s8), [[UV12:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[UV4]](<3 x s8>) + ; CHECK: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV10]](s8) + ; 
CHECK: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV11]](s8) + ; CHECK: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV12]](s8) + ; CHECK: [[BUILD_VECTOR4:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT3]](s32), [[ANYEXT4]](s32), [[ANYEXT5]](s32) + ; CHECK: [[UV13:%[0-9]+]]:_(s8), [[UV14:%[0-9]+]]:_(s8), [[UV15:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[UV5]](<3 x s8>) + ; CHECK: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV13]](s8) + ; CHECK: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV14]](s8) + ; CHECK: [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV15]](s8) + ; CHECK: [[BUILD_VECTOR5:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT6]](s32), [[ANYEXT7]](s32), [[ANYEXT8]](s32) + ; CHECK: [[UV16:%[0-9]+]]:_(s8), [[UV17:%[0-9]+]]:_(s8), [[UV18:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[UV6]](<3 x s8>) + ; CHECK: [[ANYEXT9:%[0-9]+]]:_(s32) = G_ANYEXT [[UV16]](s8) + ; CHECK: [[ANYEXT10:%[0-9]+]]:_(s32) = G_ANYEXT [[UV17]](s8) + ; CHECK: [[ANYEXT11:%[0-9]+]]:_(s32) = G_ANYEXT [[UV18]](s8) + ; CHECK: [[BUILD_VECTOR6:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT9]](s32), [[ANYEXT10]](s32), [[ANYEXT11]](s32) + ; CHECK: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR3]](<3 x s32>) + ; CHECK: $vgpr3_vgpr4_vgpr5 = COPY [[BUILD_VECTOR4]](<3 x s32>) + ; CHECK: $vgpr6_vgpr7_vgpr8 = COPY [[BUILD_VECTOR5]](<3 x s32>) + ; CHECK: $vgpr9_vgpr10_vgpr11 = COPY [[BUILD_VECTOR6]](<3 x s32>) + %0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 + %1:_(<12 x s8>) = G_BITCAST %0 + %2:_(<3 x s8>), %3:_(<3 x s8>), %4:_(<3 x s8>), %5:_(<3 x s8>) = G_UNMERGE_VALUES %1 + %6:_(<3 x s32>) = G_ANYEXT %2 + %7:_(<3 x s32>) = G_ANYEXT %3 + %8:_(<3 x s32>) = G_ANYEXT %4 + %9:_(<3 x s32>) = G_ANYEXT %5 + $vgpr0_vgpr1_vgpr2 = COPY %6 + $vgpr3_vgpr4_vgpr5 = COPY %7 + $vgpr6_vgpr7_vgpr8 = COPY %8 + $vgpr9_vgpr10_vgpr11 = COPY %9 + +... 
+ +--- +name: test_unmerge_v3s16_v12s16 +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 + + ; CHECK-LABEL: name: test_unmerge_v3s16_v12s16 + ; CHECK: [[COPY:%[0-9]+]]:_(<12 x s16>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 + ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<12 x s16>) + ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; CHECK: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV1]](<3 x s16>), 0 + ; CHECK: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; CHECK: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; CHECK: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = 
G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32) + ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; CHECK: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; CHECK: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) + ; CHECK: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; CHECK: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) + ; CHECK: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; CHECK: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32) + ; CHECK: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV3]](<3 x s16>), 0 + ; CHECK: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT3]](<4 x s16>) + ; CHECK: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) + ; CHECK: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; CHECK: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) + ; CHECK: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; CHECK: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; CHECK: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; CHECK: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; CHECK: [[BUILD_VECTOR3:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32) + ; CHECK: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) + ; CHECK: $vgpr3_vgpr4_vgpr5 = COPY [[BUILD_VECTOR1]](<3 x s32>) + ; CHECK: $vgpr6_vgpr7_vgpr8 = COPY [[BUILD_VECTOR2]](<3 x s32>) + ; CHECK: $vgpr9_vgpr10_vgpr11 = COPY [[BUILD_VECTOR3]](<3 x s32>) + %0:_(<12 x s16>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 + %1:_(<3 x s16>), %2:_(<3 x s16>), %3:_(<3 x s16>), 
%4:_(<3 x s16>) = G_UNMERGE_VALUES %0 + %5:_(<3 x s32>) = G_ANYEXT %1 + %6:_(<3 x s32>) = G_ANYEXT %2 + %7:_(<3 x s32>) = G_ANYEXT %3 + %8:_(<3 x s32>) = G_ANYEXT %4 + $vgpr0_vgpr1_vgpr2 = COPY %5 + $vgpr3_vgpr4_vgpr5 = COPY %6 + $vgpr6_vgpr7_vgpr8 = COPY %7 + $vgpr9_vgpr10_vgpr11 = COPY %8 + +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubo.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubo.mir index 7d1d7392f6ecb..eb6fc9c64604d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubo.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubo.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck %s --- name: test_usubo_s32 @@ -173,31 +173,30 @@ body: | name: test_usubo_v3s16 body: | bb.0: - liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - + liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5 ; CHECK-LABEL: name: test_usubo_v3s16 - ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 - ; CHECK: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr1_vgpr2 - ; CHECK: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY]](<4 x s16>), 0 - ; CHECK: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[COPY]](<4 x s16>), 0 + ; CHECK: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; CHECK: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; CHECK: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 
x s16>) - ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; CHECK: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; CHECK: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT1]](<3 x s16>), 0 - ; CHECK: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; CHECK: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; CHECK: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; CHECK: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) ; CHECK: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; CHECK: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) - ; CHECK: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; CHECK: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) ; CHECK: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; CHECK: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[TRUNC]], [[TRUNC3]] @@ -215,38 +214,89 @@ body: | ; CHECK: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST 
[[OR1]](s32) ; CHECK: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[DEF1]](<2 x s16>) - ; CHECK: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 - ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT1]](<3 x s16>), 0 - ; CHECK: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) - ; CHECK: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) + ; CHECK: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 + ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; CHECK: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; CHECK: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) + ; CHECK: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST6]](s32) ; CHECK: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) - ; CHECK: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) + ; CHECK: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32) + ; CHECK: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) + ; CHECK: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST7]](s32) ; CHECK: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; CHECK: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 + ; CHECK: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT3]](<4 x s16>) + ; CHECK: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) + ; CHECK: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST8]](s32) + ; CHECK: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32) + ; CHECK: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR6]](s32) + ; CHECK: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) + ; CHECK: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST9]](s32) 
+ ; CHECK: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST9]], [[C]](s32) + ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[TRUNC6]](s16), [[TRUNC9]] + ; CHECK: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[TRUNC7]](s16), [[TRUNC10]] + ; CHECK: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[TRUNC8]](s16), [[TRUNC11]] + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP]](s1) + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP1]](s1) + ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP2]](s1) + ; CHECK: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; CHECK: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; CHECK: [[UV12:%[0-9]+]]:_(<3 x s16>), [[UV13:%[0-9]+]]:_(<3 x s16>), [[UV14:%[0-9]+]]:_(<3 x s16>), [[UV15:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; CHECK: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; CHECK: [[UV16:%[0-9]+]]:_(<2 x s16>), [[UV17:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT4]](<4 x s16>) + ; CHECK: [[BITCAST10:%[0-9]+]]:_(s32) = G_BITCAST [[UV16]](<2 x s16>) + ; CHECK: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST10]], [[C]](s32) + ; CHECK: [[BITCAST11:%[0-9]+]]:_(s32) = G_BITCAST [[UV17]](<2 x s16>) + ; CHECK: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST11]], [[C]](s32) + ; CHECK: [[INSERT5:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV12]](<3 x s16>), 0 + ; CHECK: [[UV18:%[0-9]+]]:_(<2 x s16>), [[UV19:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT5]](<4 x s16>) + ; CHECK: [[BITCAST12:%[0-9]+]]:_(s32) = G_BITCAST [[UV18]](<2 x s16>) + ; CHECK: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST12]], [[C]](s32) + ; CHECK: [[BITCAST13:%[0-9]+]]:_(s32) = G_BITCAST [[UV19]](<2 x s16>) + ; CHECK: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST13]], [[C]](s32) ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; CHECK: 
[[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST10]](s32) ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] - ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR8]](s32) ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] - ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL2]] + ; CHECK: [[BITCAST14:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST11]](s32) ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]] - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[AND]](s32), [[AND1]](s32), [[AND2]](s32) - ; CHECK: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT2]](<3 x s16>), 0 - ; CHECK: [[DEF2:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[INSERT4:%[0-9]+]]:_(<4 x s32>) = G_INSERT [[DEF2]], [[BUILD_VECTOR]](<3 x s32>), 0 - ; CHECK: $vgpr0_vgpr1 = COPY [[INSERT3]](<4 x s16>) - ; CHECK: $vgpr2_vgpr3_vgpr4_vgpr5 = COPY [[INSERT4]](<4 x s32>) - %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 - %1:_(<4 x s16>) = COPY $vgpr1_vgpr2 - %2:_(<3 x s16>) = G_EXTRACT %0(<4 x s16>), 0 - %3:_(<3 x s16>) = G_EXTRACT %0(<4 x s16>), 0 - %4:_(<3 x s16>), %5:_(<3 x s1>) = G_USUBO %2, %3 - %6:_(<3 x s32>) = G_ZEXT %3 - %7:_(<4 x s16>) = G_IMPLICIT_DEF - %8:_(<4 x s16>) = G_INSERT %7, %4(<3 x s16>), 0 - %9:_(<4 x s32>) = G_IMPLICIT_DEF - %10:_(<4 x s32>) = G_INSERT %9, %6(<3 x s32>), 0 - $vgpr0_vgpr1 = COPY %8 - $vgpr2_vgpr3_vgpr4_vgpr5 = COPY %10 + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST12]](s32) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C2]] + ; CHECK: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) + ; CHECK: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL3]] + ; CHECK: [[BITCAST15:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR10]](s32) + ; 
CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C2]] + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST13]](s32) + ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]] + ; CHECK: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C]](s32) + ; CHECK: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL4]] + ; CHECK: [[BITCAST16:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; CHECK: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST14]](<2 x s16>), [[BITCAST15]](<2 x s16>), [[BITCAST16]](<2 x s16>) + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY [[ANYEXT]](s32) + ; CHECK: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY [[ANYEXT1]](s32) + ; CHECK: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; CHECK: [[COPY10:%[0-9]+]]:_(s32) = COPY [[ANYEXT2]](s32) + ; CHECK: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[AND6]](s32), [[AND7]](s32), [[AND8]](s32) + ; CHECK: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) + ; CHECK: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) + %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + %1:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + %2:_(<3 x s16>), %3:_(<3 x s16>) = G_UNMERGE_VALUES %0 + %4:_(<3 x s16>), %5:_(<3 x s16>) = G_UNMERGE_VALUES %1 + %6:_(<3 x s16>), %7:_(<3 x s1>) = G_USUBO %2, %4 + %8:_(<3 x s16>) = G_IMPLICIT_DEF + %9:_(<6 x s16>) = G_CONCAT_VECTORS %6, %8 + %10:_(<3 x s32>) = G_ZEXT %7 + $vgpr0_vgpr1_vgpr2 = COPY %9 + $vgpr0_vgpr1_vgpr2 = COPY %10 ... 
--- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-xor.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-xor.mir index e4be432f3dabf..cff36b3faf6d5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-xor.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-xor.mir @@ -435,28 +435,66 @@ body: | name: test_xor_v3s16 body: | bb.0: - + liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5 ; CHECK-LABEL: name: test_xor_v3s16 + ; CHECK: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 + ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) + ; CHECK: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) - ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) - ; CHECK: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) - ; CHECK: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) - ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 - ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 + ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV2]](<3 x s16>), 0 ; CHECK: [[XOR:%[0-9]+]]:_(<4 x s16>) = G_XOR [[INSERT]], [[INSERT1]] 
- ; CHECK: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[XOR]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) - ; CHECK: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) - ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 - ; CHECK: $vgpr0_vgpr1 = COPY [[INSERT2]](<4 x s16>) - %0:_(<3 x s16>) = G_IMPLICIT_DEF - %1:_(<3 x s16>) = G_IMPLICIT_DEF - %2:_(<3 x s16>) = G_XOR %0, %1 - %4:_(<4 x s16>) = G_IMPLICIT_DEF - %5:_(<4 x s16>) = G_INSERT %4, %2, 0 - $vgpr0_vgpr1 = COPY %5 + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[XOR]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; CHECK: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; CHECK: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; CHECK: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF1]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; CHECK: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV4]](<3 x s16>), 0 + ; CHECK: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; CHECK: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV8]](<3 x s16>), 0 + ; CHECK: [[UV14:%[0-9]+]]:_(<2 x 
s16>), [[UV15:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT3]](<4 x s16>) + ; CHECK: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV14]](<2 x s16>) + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; CHECK: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV15]](<2 x s16>) + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CHECK: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; CHECK: [[BITCAST6:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; CHECK: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[BITCAST6]](<2 x s16>) + ; CHECK: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) + %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 + %1:_(<6 x s16>) = COPY 
$vgpr3_vgpr4_vgpr5 + %2:_(<3 x s16>), %3:_(<3 x s16>) = G_UNMERGE_VALUES %0 + %4:_(<3 x s16>), %5:_(<3 x s16>) = G_UNMERGE_VALUES %1 + %6:_(<3 x s16>) = G_XOR %2, %4 + %7:_(<3 x s16>) = G_IMPLICIT_DEF + %8:_(<6 x s16>) = G_CONCAT_VECTORS %6, %7 + $vgpr0_vgpr1_vgpr2 = COPY %8 ... --- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lit.local.cfg b/llvm/test/CodeGen/AMDGPU/GlobalISel/lit.local.cfg deleted file mode 100644 index e99d1bb8446ce..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lit.local.cfg +++ /dev/null @@ -1,2 +0,0 @@ -if not 'global-isel' in config.root.available_features: - config.unsupported = True diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll index 8817de69bdbab..fc8df81dff845 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll @@ -365,19 +365,18 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace ; CI-LABEL: global_atomic_dec_ret_i32_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc -; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v4 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_add_i32_e32 v3, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: 
v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v3 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; CI-NEXT: v_mov_b32_e32 v4, 42 ; CI-NEXT: flat_atomic_dec v2, v[2:3], v4 glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -387,19 +386,18 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace ; VI-LABEL: global_atomic_dec_ret_i32_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v4 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v3, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v3 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; VI-NEXT: v_mov_b32_e32 v4, 42 ; VI-NEXT: flat_atomic_dec v2, v[2:3], v4 glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -442,13 +440,12 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(i32 addrspa ; CI-LABEL: global_atomic_dec_noret_i32_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: 
v_add_i32_e32 v0, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v2, 42 @@ -458,13 +455,12 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(i32 addrspa ; VI-LABEL: global_atomic_dec_noret_i32_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v2, 42 @@ -672,19 +668,18 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(i32* %out, i32* ; CI-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc -; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v4 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; 
CI-NEXT: v_add_i32_e32 v3, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v3 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; CI-NEXT: v_mov_b32_e32 v4, 42 ; CI-NEXT: flat_atomic_dec v2, v[2:3], v4 glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -694,19 +689,18 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(i32* %out, i32* ; VI-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v4 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v3, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v3 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; VI-NEXT: v_mov_b32_e32 v4, 42 ; VI-NEXT: flat_atomic_dec v2, v[2:3], v4 glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -749,13 +743,12 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(i32* %ptr) #0 ; CI-LABEL: flat_atomic_dec_noret_i32_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 
-; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v2, 42 @@ -765,13 +758,12 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(i32* %ptr) #0 ; VI-LABEL: flat_atomic_dec_noret_i32_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v2, 42 @@ -991,19 +983,18 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64* %out, i64* ; CI-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; CI-NEXT: 
v_addc_u32_e32 v1, vcc, v3, v1, vcc -; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v4 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_add_i32_e32 v3, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v3 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; CI-NEXT: v_mov_b32_e32 v4, 42 ; CI-NEXT: v_mov_b32_e32 v5, 0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc @@ -1014,19 +1005,18 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64* %out, i64* ; VI-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v4 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v3, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v3 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; VI-NEXT: v_mov_b32_e32 v4, 42 ; VI-NEXT: v_mov_b32_e32 v5, 0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc @@ -1071,13 +1061,12 @@ define amdgpu_kernel void 
@flat_atomic_dec_noret_i64_offset_addr64(i64* %ptr) #0 ; CI-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -1088,13 +1077,12 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64* %ptr) #0 ; VI-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -1556,19 +1544,18 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace ; CI-LABEL: global_atomic_dec_ret_i64_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v3, s3 
-; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc -; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v4 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_add_i32_e32 v3, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v3 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; CI-NEXT: v_mov_b32_e32 v4, 42 ; CI-NEXT: v_mov_b32_e32 v5, 0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc @@ -1579,19 +1566,18 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace ; VI-LABEL: global_atomic_dec_ret_i64_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v4 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v3, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v3 +; VI-NEXT: 
v_addc_u32_e32 v3, vcc, 0, v4, vcc ; VI-NEXT: v_mov_b32_e32 v4, 42 ; VI-NEXT: v_mov_b32_e32 v5, 0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc @@ -1636,13 +1622,12 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspa ; CI-LABEL: global_atomic_dec_noret_i64_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -1653,13 +1638,12 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspa ; VI-LABEL: global_atomic_dec_noret_i64_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll index ce898c2a73d4c..af02b77c51d7a 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll @@ -365,19 +365,18 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace ; CI-LABEL: global_atomic_inc_ret_i32_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc -; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v4 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_add_i32_e32 v3, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v3 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; CI-NEXT: v_mov_b32_e32 v4, 42 ; CI-NEXT: flat_atomic_inc v2, v[2:3], v4 glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -387,19 +386,18 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace ; VI-LABEL: global_atomic_inc_ret_i32_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: 
v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v4 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v3, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v3 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; VI-NEXT: v_mov_b32_e32 v4, 42 ; VI-NEXT: flat_atomic_inc v2, v[2:3], v4 glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -409,23 +407,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace ; GFX9-LABEL: global_atomic_inc_ret_i32_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v3, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 20, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc -; GFX9-NEXT: v_mov_b32_e32 v4, 42 -; GFX9-NEXT: global_atomic_inc v2, v[2:3], v4, off glc +; GFX9-NEXT: global_atomic_inc v1, v0, v1, s[2:3] offset:20 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id @@ 
-440,13 +427,12 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspa ; CI-LABEL: global_atomic_inc_noret_i32_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v2, 42 @@ -456,13 +442,12 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspa ; VI-LABEL: global_atomic_inc_noret_i32_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v2, 42 @@ -472,17 +457,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspa ; GFX9-LABEL: global_atomic_inc_noret_i32_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 20, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, 42 -; GFX9-NEXT: global_atomic_inc v0, v[0:1], v2, off glc +; GFX9-NEXT: global_atomic_inc v0, v0, v1, s[0:1] offset:20 glc ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id @@ -925,19 +903,18 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace ; CI-LABEL: global_atomic_inc_ret_i64_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc -; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v4 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_add_i32_e32 v3, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v3 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; CI-NEXT: v_mov_b32_e32 v4, 42 ; CI-NEXT: v_mov_b32_e32 v5, 0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc @@ -948,19 +925,18 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace ; VI-LABEL: 
global_atomic_inc_ret_i64_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v4 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v3, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v3 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; VI-NEXT: v_mov_b32_e32 v4, 42 ; VI-NEXT: v_mov_b32_e32 v5, 0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc @@ -971,24 +947,13 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace ; GFX9-LABEL: global_atomic_inc_ret_i64_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v3, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc -; GFX9-NEXT: 
v_add_co_u32_e32 v2, vcc, 40, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc -; GFX9-NEXT: v_mov_b32_e32 v4, 42 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: global_atomic_inc_x2 v[2:3], v[2:3], v[4:5], off glc +; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:40 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id @@ -1003,13 +968,12 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspa ; CI-LABEL: global_atomic_inc_noret_i64_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -1020,13 +984,12 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspa ; VI-LABEL: global_atomic_inc_noret_i64_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: 
v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -1037,18 +1000,11 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspa ; GFX9-LABEL: global_atomic_inc_noret_i64_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 40, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] offset:40 glc ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id @@ -1133,19 +1089,18 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(i32* %out, i32* ; CI-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc -; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v4 -; CI-NEXT: 
v_addc_u32_e32 v3, vcc, 0, v5, vcc +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_add_i32_e32 v3, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v3 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; CI-NEXT: v_mov_b32_e32 v4, 42 ; CI-NEXT: flat_atomic_inc v2, v[2:3], v4 glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1155,19 +1110,18 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(i32* %out, i32* ; VI-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v4 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v3, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v3 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; VI-NEXT: v_mov_b32_e32 v4, 42 ; VI-NEXT: flat_atomic_inc v2, v[2:3], v4 glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1177,19 +1131,18 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(i32* %out, i32* ; GFX9-LABEL: 
flat_atomic_inc_ret_i32_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v3, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 20, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 20, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_mov_b32_e32 v4, 42 ; GFX9-NEXT: flat_atomic_inc v2, v[2:3], v4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1208,13 +1161,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0 ; CI-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; 
CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v2, 42 @@ -1224,13 +1176,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0 ; VI-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v2, 42 @@ -1240,13 +1191,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0 ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 20, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, 42 @@ -1406,19 +1356,18 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64* %out, i64* ; CI-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 
-; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc -; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v4 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_add_i32_e32 v3, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v3 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; CI-NEXT: v_mov_b32_e32 v4, 42 ; CI-NEXT: v_mov_b32_e32 v5, 0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc @@ -1429,19 +1378,18 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64* %out, i64* ; VI-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v4 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v3, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; 
VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v3 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; VI-NEXT: v_mov_b32_e32 v4, 42 ; VI-NEXT: v_mov_b32_e32 v5, 0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc @@ -1452,19 +1400,18 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64* %out, i64* ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v3, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 40, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 40, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_mov_b32_e32 v4, 42 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc @@ -1484,13 +1431,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64* %ptr) #0 ; CI-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: 
v_lshl_b64 v[0:1], v[0:1], 3 +; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -1501,13 +1447,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64* %ptr) #0 ; VI-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -1518,13 +1463,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64* %ptr) #0 ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 40, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll index 80f86c6b1f50a..a122a1b2e39d2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -840,9 +840,9 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX7-NEXT: s_load_dword s8, s[0:1], 0x15 -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[1:2], v[0:1], 2 ; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b64 s[0:1], s[6:7] @@ -866,13 +866,12 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x54 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[1:2], 2, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s7 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v1 ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 8, v1 @@ -899,31 +898,24 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace ; GFX10_W32-LABEL: 
test_div_fmas_f32_logical_cond_to_vcc: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10_W32-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10_W32-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0x54 +; GFX10_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10_W32-NEXT: ; implicit-def: $vcc_hi -; GFX10_W32-NEXT: v_lshlrev_b64 v[1:2], 2, v[0:1] ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: v_mov_b32_e32 v3, s6 -; GFX10_W32-NEXT: v_mov_b32_e32 v4, s7 +; GFX10_W32-NEXT: s_clause 0x2 +; GFX10_W32-NEXT: global_load_dword v2, v1, s[6:7] +; GFX10_W32-NEXT: global_load_dword v3, v1, s[6:7] offset:4 +; GFX10_W32-NEXT: global_load_dword v1, v1, s[6:7] offset:8 ; GFX10_W32-NEXT: s_add_u32 s0, s4, 8 ; GFX10_W32-NEXT: s_addc_u32 s1, s5, 0 ; GFX10_W32-NEXT: s_cmp_lg_u32 s2, 0 -; GFX10_W32-NEXT: v_add_co_u32_e64 v1, vcc_lo, v3, v1 ; GFX10_W32-NEXT: s_cselect_b32 s2, 1, 0 -; GFX10_W32-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v4, v2, vcc_lo ; GFX10_W32-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W32-NEXT: v_add_co_u32_e64 v3, vcc_lo, v1, 8 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 s2, 0, s2 -; GFX10_W32-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v2, vcc_lo -; GFX10_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10_W32-NEXT: s_clause 0x2 -; GFX10_W32-NEXT: global_load_dword v1, v[1:2], off -; GFX10_W32-NEXT: global_load_dword v2, v[3:4], off offset:-4 -; GFX10_W32-NEXT: global_load_dword v3, v[3:4], off ; GFX10_W32-NEXT: s_and_b32 vcc_lo, vcc_lo, s2 ; GFX10_W32-NEXT: s_waitcnt vmcnt(0) -; GFX10_W32-NEXT: v_div_fmas_f32 v2, v1, v2, v3 +; GFX10_W32-NEXT: v_div_fmas_f32 v2, v2, v3, v1 ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off @@ -932,30 +924,23 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace ; GFX10_W64-LABEL: test_div_fmas_f32_logical_cond_to_vcc: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x24 -; GFX10_W64-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10_W64-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0x54 -; GFX10_W64-NEXT: v_lshlrev_b64 v[1:2], 2, v[0:1] +; GFX10_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: v_mov_b32_e32 v3, s6 -; GFX10_W64-NEXT: v_mov_b32_e32 v4, s7 +; GFX10_W64-NEXT: s_clause 0x2 +; GFX10_W64-NEXT: global_load_dword v2, v1, s[6:7] +; GFX10_W64-NEXT: global_load_dword v3, v1, s[6:7] offset:4 +; GFX10_W64-NEXT: global_load_dword v1, v1, s[6:7] offset:8 ; GFX10_W64-NEXT: s_add_u32 s0, s4, 8 ; GFX10_W64-NEXT: s_addc_u32 s1, s5, 0 ; GFX10_W64-NEXT: s_cmp_lg_u32 s2, 0 -; GFX10_W64-NEXT: v_add_co_u32_e64 v1, vcc, v3, v1 ; GFX10_W64-NEXT: s_cselect_b32 s2, 1, 0 -; GFX10_W64-NEXT: v_add_co_ci_u32_e32 v2, vcc, v4, v2, vcc ; GFX10_W64-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W64-NEXT: v_add_co_u32_e64 v3, vcc, v1, 8 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, s2 -; GFX10_W64-NEXT: v_add_co_ci_u32_e32 v4, vcc, 0, v2, vcc -; GFX10_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10_W64-NEXT: s_clause 0x2 -; GFX10_W64-NEXT: global_load_dword v1, v[1:2], off -; GFX10_W64-NEXT: global_load_dword v2, v[3:4], off offset:-4 -; GFX10_W64-NEXT: global_load_dword v3, v[3:4], off ; GFX10_W64-NEXT: s_and_b64 vcc, vcc, s[2:3] ; GFX10_W64-NEXT: s_waitcnt vmcnt(0) -; GFX10_W64-NEXT: v_div_fmas_f32 v2, v1, v2, v3 +; GFX10_W64-NEXT: v_div_fmas_f32 v2, v2, v3, v1 ; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1 ; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off @@ -984,8 +969,8 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x13 -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[1:2], v[0:1], 2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: 
s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1017,14 +1002,13 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x4c -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[1:2], 2, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GFX8-NEXT: s_mov_b32 s2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s7 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GFX8-NEXT: flat_load_dwordx3 v[1:3], v[1:2] ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc @@ -1053,18 +1037,14 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out ; GFX10_W32-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX10_W32: ; %bb.0: ; %entry ; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; GFX10_W32-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10_W32-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX10_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10_W32-NEXT: s_mov_b32 s4, 0 ; GFX10_W32-NEXT: ; implicit-def: $vcc_hi -; GFX10_W32-NEXT: v_lshlrev_b64 v[1:2], 2, v[0:1] ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: v_mov_b32_e32 v4, s3 -; GFX10_W32-NEXT: v_mov_b32_e32 v3, s2 +; GFX10_W32-NEXT: global_load_dwordx3 v[1:3], v1, s[2:3] +; GFX10_W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10_W32-NEXT: v_add_co_u32_e64 v1, vcc_lo, v3, v1 -; GFX10_W32-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v4, v2, vcc_lo -; GFX10_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10_W32-NEXT: global_load_dwordx3 v[1:3], v[1:2], off ; GFX10_W32-NEXT: 
s_and_saveexec_b32 s5, vcc_lo ; GFX10_W32-NEXT: s_cbranch_execz BB13_2 ; GFX10_W32-NEXT: ; %bb.1: ; %bb @@ -1075,7 +1055,6 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out ; GFX10_W32-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10_W32-NEXT: s_cselect_b32 s4, 1, 0 ; GFX10_W32-NEXT: BB13_2: ; %exit -; GFX10_W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10_W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10_W32-NEXT: s_and_b32 s0, 1, s4 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 @@ -1092,17 +1071,13 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out ; GFX10_W64-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX10_W64: ; %bb.0: ; %entry ; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; GFX10_W64-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10_W64-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX10_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10_W64-NEXT: s_mov_b32 s6, 0 -; GFX10_W64-NEXT: v_lshlrev_b64 v[1:2], 2, v[0:1] ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: v_mov_b32_e32 v4, s3 -; GFX10_W64-NEXT: v_mov_b32_e32 v3, s2 +; GFX10_W64-NEXT: global_load_dwordx3 v[1:3], v1, s[2:3] +; GFX10_W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10_W64-NEXT: v_add_co_u32_e64 v1, vcc, v3, v1 -; GFX10_W64-NEXT: v_add_co_ci_u32_e32 v2, vcc, v4, v2, vcc -; GFX10_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10_W64-NEXT: global_load_dwordx3 v[1:3], v[1:2], off ; GFX10_W64-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX10_W64-NEXT: s_cbranch_execz BB13_2 ; GFX10_W64-NEXT: ; %bb.1: ; %bb @@ -1113,7 +1088,6 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out ; GFX10_W64-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10_W64-NEXT: s_cselect_b32 s6, 1, 0 ; GFX10_W64-NEXT: BB13_2: ; %exit -; GFX10_W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10_W64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX10_W64-NEXT: s_and_b32 s0, 1, s6 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll index de2b58aa6fed4..cd427c7d00d0c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll @@ -8,8 +8,8 @@ define amdgpu_kernel void @test_div_scale_f32_1(float addrspace(1)* %out, float ; GFX7-LABEL: test_div_scale_f32_1: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -26,13 +26,12 @@ define amdgpu_kernel void @test_div_scale_f32_1(float addrspace(1)* %out, float ; GFX8-LABEL: test_div_scale_f32_1: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] @@ -47,21 +46,14 @@ define amdgpu_kernel void @test_div_scale_f32_1(float addrspace(1)* %out, float ; GFX10-LABEL: test_div_scale_f32_1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo -; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: global_load_dword v1, v[2:3], off +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s2, v1, v1, v0 +; GFX10-NEXT: v_div_scale_f32 v2, s2, v0, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dword v[0:1], v2, off @@ -83,8 +75,8 @@ define amdgpu_kernel void @test_div_scale_f32_2(float addrspace(1)* %out, float ; GFX7-LABEL: test_div_scale_f32_2: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -101,13 +93,12 @@ define amdgpu_kernel void @test_div_scale_f32_2(float addrspace(1)* %out, float ; GFX8-LABEL: test_div_scale_f32_2: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; GFX8-NEXT: v_addc_u32_e32 
v3, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] @@ -122,21 +113,14 @@ define amdgpu_kernel void @test_div_scale_f32_2(float addrspace(1)* %out, float ; GFX10-LABEL: test_div_scale_f32_2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo -; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: global_load_dword v1, v[2:3], off +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s2, v0, v1, v0 +; GFX10-NEXT: v_div_scale_f32 v2, s2, v1, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dword v[0:1], v2, off @@ -158,14 +142,13 @@ define amdgpu_kernel void @test_div_scale_f64_1(double addrspace(1)* %out, doubl ; GFX7-LABEL: test_div_scale_f64_1: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; 
GFX7-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] @@ -180,14 +163,13 @@ define amdgpu_kernel void @test_div_scale_f64_1(double addrspace(1)* %out, doubl ; GFX8-LABEL: test_div_scale_f64_1: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 8, v0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] @@ -202,20 +184,13 @@ define amdgpu_kernel void @test_div_scale_f64_1(double addrspace(1)* %out, doubl ; GFX10-LABEL: test_div_scale_f64_1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo -; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 8 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: global_load_dwordx2 v[2:3], v[2:3], off +; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX10-NEXT: 
global_load_dwordx2 v[2:3], v2, s[2:3] offset:8 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[2:3], v[2:3], v[0:1] ; GFX10-NEXT: v_mov_b32_e32 v3, s1 @@ -239,14 +214,13 @@ define amdgpu_kernel void @test_div_scale_f64_2(double addrspace(1)* %out, doubl ; GFX7-LABEL: test_div_scale_f64_2: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] @@ -261,14 +235,13 @@ define amdgpu_kernel void @test_div_scale_f64_2(double addrspace(1)* %out, doubl ; GFX8-LABEL: test_div_scale_f64_2: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 8, v0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] @@ -283,20 +256,13 @@ define amdgpu_kernel void 
@test_div_scale_f64_2(double addrspace(1)* %out, doubl ; GFX10-LABEL: test_div_scale_f64_2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo -; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 8 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: global_load_dwordx2 v[2:3], v[2:3], off +; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v2, s[2:3] offset:8 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[0:1], v[2:3], v[0:1] ; GFX10-NEXT: v_mov_b32_e32 v3, s1 @@ -321,8 +287,8 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(float addrspace(1)* % ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX7-NEXT: s_load_dword s8, s[0:1], 0x15 -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -339,13 +305,12 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(float addrspace(1)* % ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dword s0, s[0:1], 0x54 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: 
v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, s0 @@ -357,16 +322,11 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(float addrspace(1)* % ; GFX10-LABEL: test_div_scale_f32_scalar_num_1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x54 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s7 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo -; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v2, s0, v0, v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 @@ -389,8 +349,8 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(float addrspace(1)* % ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -407,13 +367,12 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(float addrspace(1)* % ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 
31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s0, v0, s0 @@ -425,16 +384,11 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(float addrspace(1)* % ; GFX10-LABEL: test_div_scale_f32_scalar_num_2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s7 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo -; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v2, s0, s0, v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 @@ -457,8 +411,8 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(float addrspace(1)* % ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -475,13 +429,12 @@ define amdgpu_kernel void 
@test_div_scale_f32_scalar_den_1(float addrspace(1)* % ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s0, s0, v0 @@ -493,16 +446,11 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(float addrspace(1)* % ; GFX10-LABEL: test_div_scale_f32_scalar_den_1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s7 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo -; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v2, s0, s0, s0, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 @@ -525,8 +473,8 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(float addrspace(1)* % ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; 
GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -543,13 +491,12 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(float addrspace(1)* % ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, s0, v0 @@ -561,16 +508,11 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(float addrspace(1)* % ; GFX10-LABEL: test_div_scale_f32_scalar_den_2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s7 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo -; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v2, s0, v0, s0, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 @@ -593,13 +535,12 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(double addrspace(1)* ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x9 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x15 -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s7 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 @@ -612,13 +553,12 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(double addrspace(1)* ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 @@ -630,18 +570,13 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(double addrspace(1)* ; GFX10-LABEL: test_div_scale_f64_scalar_num_1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; 
GFX10-NEXT: v_mov_b32_e32 v3, s7 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX10-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], v[0:1], s[0:1] ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off @@ -662,13 +597,12 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(double addrspace(1)* ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x15 -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s7 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 @@ -681,13 +615,12 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(double addrspace(1)* ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, 
v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 @@ -699,18 +632,13 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(double addrspace(1)* ; GFX10-LABEL: test_div_scale_f64_scalar_num_2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s7 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX10-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[0:1], v[0:1], s[0:1] ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off @@ -731,13 +659,12 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(double addrspace(1)* ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x15 -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s7 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 @@ -750,13 +677,12 @@ define amdgpu_kernel void 
@test_div_scale_f64_scalar_den_1(double addrspace(1)* ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 @@ -768,18 +694,13 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(double addrspace(1)* ; GFX10-LABEL: test_div_scale_f64_scalar_den_1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s7 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX10-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[0:1], s[0:1], v[0:1] ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off @@ -800,13 +721,12 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(double addrspace(1)* ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x15 -; GFX7-NEXT: 
v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s7 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 @@ -819,13 +739,12 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(double addrspace(1)* ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 @@ -837,18 +756,13 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(double addrspace(1)* ; GFX10-LABEL: test_div_scale_f64_scalar_den_2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s7 -; GFX10-NEXT: v_add_co_u32_e64 v0, 
vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX10-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], s[0:1], v[0:1] ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off @@ -1056,8 +970,8 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(float addrspace(1)* ; GFX7-LABEL: test_div_scale_f32_inline_imm_num: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1073,13 +987,12 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(float addrspace(1)* ; GFX8-LABEL: test_div_scale_f32_inline_imm_num: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, 1.0 @@ -1091,15 +1004,10 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(float addrspace(1)* ; GFX10-LABEL: test_div_scale_f32_inline_imm_num: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo -; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v2, s2, v0, v0, 1.0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -1120,8 +1028,8 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(float addrspace(1)* ; GFX7-LABEL: test_div_scale_f32_inline_imm_den: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1137,13 +1045,12 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(float addrspace(1)* ; GFX8-LABEL: test_div_scale_f32_inline_imm_den: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], 2.0, 2.0, v0 @@ -1155,15 +1062,10 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(float addrspace(1)* ; 
GFX10-LABEL: test_div_scale_f32_inline_imm_den: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo -; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v2, s2, 2.0, 2.0, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -1184,8 +1086,8 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, ; GFX7-LABEL: test_div_scale_f32_fabs_num: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1204,13 +1106,12 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, ; GFX8-LABEL: test_div_scale_f32_fabs_num: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; 
GFX8-NEXT: flat_load_dword v0, v[0:1] @@ -1227,23 +1128,16 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, ; GFX10-LABEL: test_div_scale_f32_fabs_num: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo -; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: global_load_dword v1, v[2:3], off +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s2, v1, v1, v0 +; GFX10-NEXT: v_div_scale_f32 v2, s2, v0, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dword v[0:1], v2, off @@ -1267,8 +1161,8 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, ; GFX7-LABEL: test_div_scale_f32_fabs_den: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1286,13 +1180,12 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, ; GFX8-LABEL: test_div_scale_f32_fabs_den: ; GFX8: ; 
%bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] @@ -1308,22 +1201,15 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, ; GFX10-LABEL: test_div_scale_f32_fabs_den: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo -; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: global_load_dword v1, v[2:3], off +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; GFX10-NEXT: v_div_scale_f32 v2, s2, v1, v1, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX10-NEXT: v_div_scale_f32 v2, s2, v0, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dword v[0:1], v2, off diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll index 3dedbec196790..88c82b1c3f7cf 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll @@ -8,13 +8,12 @@ define amdgpu_kernel void @is_private_vgpr(i8* addrspace(1)* %ptr.ptr) { ; CI-LABEL: is_private_vgpr: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: s_load_dword s0, s[4:5], 0x11 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -26,14 +25,9 @@ define amdgpu_kernel void @is_private_vgpr(i8* addrspace(1)* %ptr.ptr) { ; GFX9-LABEL: is_private_vgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] ; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll index 
f0eb57cef219c..ec477c9925c9a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll @@ -8,13 +8,12 @@ define amdgpu_kernel void @is_local_vgpr(i8* addrspace(1)* %ptr.ptr) { ; CI-LABEL: is_local_vgpr: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: s_load_dword s0, s[4:5], 0x10 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -26,14 +25,9 @@ define amdgpu_kernel void @is_local_vgpr(i8* addrspace(1)* %ptr.ptr) { ; GFX9-LABEL: is_local_vgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] ; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll index 05717460bade0..8eab3e78b0d5b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -39,13 +39,12 @@ define amdgpu_kernel void @update_dpp64_test(i64 addrspace(1)* %arg, i64 %in1, i ; GFX8-LABEL: update_dpp64_test: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 @@ -59,21 +58,16 @@ define amdgpu_kernel void @update_dpp64_test(i64 addrspace(1)* %arg, i64 %in1, i ; GFX10-LABEL: update_dpp64_test: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: v_mov_b32_e32 v5, s3 -; GFX10-NEXT: v_mov_b32_e32 v4, s2 -; GFX10-NEXT: v_add_co_u32_e64 v6, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v3, v1, vcc_lo -; GFX10-NEXT: global_load_dwordx2 v[2:3], v[6:7], off +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 -; GFX10-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 -; GFX10-NEXT: global_store_dwordx2 v[6:7], v[4:5], off +; GFX10-NEXT: 
v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX10-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %id diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll index ea2631cbcb294..0b9a514d2398c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -46,8 +46,8 @@ define i8 @v_lshr_i8_7(i8 %value) { ; GFX9-LABEL: v_lshr_i8_7: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 7 -; GFX9-NEXT: v_lshrrev_b16_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_mov_b32_e32 v1, 7 +; GFX9-NEXT: v_lshrrev_b16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = lshr i8 %value, 7 ret i8 %result @@ -557,13 +557,11 @@ define amdgpu_ps i32 @s_lshr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou ; ; GFX8-LABEL: s_lshr_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_mov_b32 s3, 0xffff +; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_lshr_b32 s4, s1, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s3 ; GFX8-NEXT: s_and_b32 s1, s1, s3 -; GFX8-NEXT: s_and_b32 s2, s2, s3 -; GFX8-NEXT: s_and_b32 s4, s4, s3 ; GFX8-NEXT: s_lshr_b32 s0, s0, s1 ; GFX8-NEXT: s_lshr_b32 s1, s2, s4 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 @@ -740,21 +738,17 @@ define amdgpu_ps <2 x i32> @s_lshr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg ; ; GFX8-LABEL: s_lshr_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s4, s0, 16 ; GFX8-NEXT: s_mov_b32 s6, 0xffff +; GFX8-NEXT: s_lshr_b32 s4, s0, 16 ; GFX8-NEXT: s_lshr_b32 s7, s2, 16 -; GFX8-NEXT: s_lshr_b32 s5, s1, 16 -; GFX8-NEXT: s_lshr_b32 s8, s3, 
16 ; GFX8-NEXT: s_and_b32 s0, s0, s6 ; GFX8-NEXT: s_and_b32 s2, s2, s6 -; GFX8-NEXT: s_and_b32 s4, s4, s6 -; GFX8-NEXT: s_and_b32 s7, s7, s6 ; GFX8-NEXT: s_lshr_b32 s0, s0, s2 ; GFX8-NEXT: s_lshr_b32 s2, s4, s7 +; GFX8-NEXT: s_lshr_b32 s5, s1, 16 +; GFX8-NEXT: s_lshr_b32 s8, s3, 16 ; GFX8-NEXT: s_and_b32 s1, s1, s6 ; GFX8-NEXT: s_and_b32 s3, s3, s6 -; GFX8-NEXT: s_and_b32 s5, s5, s6 -; GFX8-NEXT: s_and_b32 s8, s8, s6 ; GFX8-NEXT: s_lshr_b32 s1, s1, s3 ; GFX8-NEXT: s_lshr_b32 s3, s5, s8 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 @@ -932,39 +926,31 @@ define amdgpu_ps <4 x i32> @s_lshr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg ; ; GFX8-LABEL: s_lshr_v8i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s8, s0, 16 ; GFX8-NEXT: s_mov_b32 s12, 0xffff +; GFX8-NEXT: s_lshr_b32 s8, s0, 16 ; GFX8-NEXT: s_lshr_b32 s13, s4, 16 -; GFX8-NEXT: s_lshr_b32 s9, s1, 16 -; GFX8-NEXT: s_lshr_b32 s14, s5, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s12 ; GFX8-NEXT: s_and_b32 s4, s4, s12 -; GFX8-NEXT: s_and_b32 s8, s8, s12 -; GFX8-NEXT: s_and_b32 s13, s13, s12 -; GFX8-NEXT: s_lshr_b32 s10, s2, 16 -; GFX8-NEXT: s_lshr_b32 s15, s6, 16 ; GFX8-NEXT: s_lshr_b32 s0, s0, s4 ; GFX8-NEXT: s_lshr_b32 s4, s8, s13 +; GFX8-NEXT: s_lshr_b32 s9, s1, 16 +; GFX8-NEXT: s_lshr_b32 s14, s5, 16 ; GFX8-NEXT: s_and_b32 s1, s1, s12 ; GFX8-NEXT: s_and_b32 s5, s5, s12 -; GFX8-NEXT: s_and_b32 s9, s9, s12 -; GFX8-NEXT: s_and_b32 s14, s14, s12 -; GFX8-NEXT: s_lshr_b32 s11, s3, 16 -; GFX8-NEXT: s_lshr_b32 s16, s7, 16 ; GFX8-NEXT: s_lshr_b32 s1, s1, s5 +; GFX8-NEXT: s_lshr_b32 s10, s2, 16 +; GFX8-NEXT: s_lshr_b32 s15, s6, 16 ; GFX8-NEXT: s_and_b32 s2, s2, s12 ; GFX8-NEXT: s_and_b32 s6, s6, s12 -; GFX8-NEXT: s_and_b32 s10, s10, s12 -; GFX8-NEXT: s_and_b32 s15, s15, s12 ; GFX8-NEXT: s_lshr_b32 s5, s9, s14 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s12 ; GFX8-NEXT: s_lshr_b32 s2, s2, s6 +; GFX8-NEXT: s_lshr_b32 s11, s3, 16 +; GFX8-NEXT: s_lshr_b32 s16, s7, 16 ; GFX8-NEXT: s_or_b32 s0, s4, s0 ; GFX8-NEXT: s_and_b32 
s3, s3, s12 ; GFX8-NEXT: s_and_b32 s7, s7, s12 -; GFX8-NEXT: s_and_b32 s11, s11, s12 -; GFX8-NEXT: s_and_b32 s16, s16, s12 ; GFX8-NEXT: s_lshr_b32 s6, s10, s15 ; GFX8-NEXT: s_lshl_b32 s4, s5, 16 ; GFX8-NEXT: s_and_b32 s1, s1, s12 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll index 2db7d19ac9dd0..b9e302ad95e5d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s +; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s define amdgpu_ps i32 @s_orn2_i32(i32 inreg %src0, i32 inreg %src1) { ; GCN-LABEL: s_orn2_i32: @@ -196,58 +196,31 @@ define amdgpu_ps <2 x i32> @s_orn2_v2i32_commute(<2 x i32> inreg %src0, <2 x i32 } define amdgpu_ps i16 @s_orn2_i16(i16 inreg %src0, i16 inreg %src1) { -; GFX6-LABEL: s_orn2_i16: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_orn2_b32 s0, s2, s3 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_orn2_i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NEXT: s_and_b32 s1, s3, s0 -; GFX9-NEXT: s_xor_b32 s0, s1, s0 -; GFX9-NEXT: s_or_b32 s0, s2, s0 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX9-NEXT: ; return to shader part epilog +; GCN-LABEL: s_orn2_i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_orn2_b32 s0, s2, s3 +; GCN-NEXT: ; return to shader part epilog %not.src1 = xor i16 %src1, -1 %or = or i16 %src0, %not.src1 ret i16 %or } define amdgpu_ps i16 
@s_orn2_i16_commute(i16 inreg %src0, i16 inreg %src1) { -; GFX6-LABEL: s_orn2_i16_commute: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_orn2_b32 s0, s2, s3 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_orn2_i16_commute: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NEXT: s_and_b32 s1, s3, s0 -; GFX9-NEXT: s_xor_b32 s0, s1, s0 -; GFX9-NEXT: s_or_b32 s0, s0, s2 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX9-NEXT: ; return to shader part epilog +; GCN-LABEL: s_orn2_i16_commute: +; GCN: ; %bb.0: +; GCN-NEXT: s_orn2_b32 s0, s2, s3 +; GCN-NEXT: ; return to shader part epilog %not.src1 = xor i16 %src1, -1 %or = or i16 %not.src1, %src0 ret i16 %or } define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_use(i16 inreg %src0, i16 inreg %src1) { -; GFX6-LABEL: s_orn2_i16_multi_use: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_xor_b32 s1, s3, -1 -; GFX6-NEXT: s_orn2_b32 s0, s2, s3 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_orn2_i16_multi_use: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NEXT: s_and_b32 s1, s3, s0 -; GFX9-NEXT: s_xor_b32 s1, s1, s0 -; GFX9-NEXT: s_or_b32 s0, s2, s1 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX9-NEXT: ; return to shader part epilog +; GCN-LABEL: s_orn2_i16_multi_use: +; GCN: ; %bb.0: +; GCN-NEXT: s_xor_b32 s1, s3, -1 +; GCN-NEXT: s_orn2_b32 s0, s2, s3 +; GCN-NEXT: ; return to shader part epilog %not.src1 = xor i16 %src1, -1 %or = or i16 %src0, %not.src1 %insert.0 = insertvalue { i16, i16 } undef, i16 %or, 0 @@ -256,23 +229,11 @@ define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_use(i16 inreg %src0, i16 inreg % } define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_foldable_use(i16 inreg %src0, i16 inreg %src1, i16 inreg %src2) { -; GFX6-LABEL: s_orn2_i16_multi_foldable_use: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_orn2_b32 s0, s2, s4 -; GFX6-NEXT: s_orn2_b32 s1, s3, s4 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_orn2_i16_multi_foldable_use: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s1, 0xffff 
-; GFX9-NEXT: s_and_b32 s0, s4, s1 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s2, s2, s1 -; GFX9-NEXT: s_and_b32 s4, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s3, s1 -; GFX9-NEXT: s_or_b32 s0, s2, s4 -; GFX9-NEXT: s_or_b32 s1, s1, s4 -; GFX9-NEXT: ; return to shader part epilog +; GCN-LABEL: s_orn2_i16_multi_foldable_use: +; GCN: ; %bb.0: +; GCN-NEXT: s_orn2_b32 s0, s2, s4 +; GCN-NEXT: s_orn2_b32 s1, s3, s4 +; GCN-NEXT: ; return to shader part epilog %not.src2 = xor i16 %src2, -1 %or0 = or i16 %src0, %not.src2 %or1 = or i16 %src1, %not.src2 @@ -308,21 +269,12 @@ define amdgpu_ps float @v_orn2_i16_sv(i16 inreg %src0, i16 %src1) { } define amdgpu_ps float @v_orn2_i16_vs(i16 %src0, i16 inreg %src1) { -; GFX6-LABEL: v_orn2_i16_vs: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_xor_b32 s0, s2, -1 -; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: v_orn2_i16_vs: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NEXT: s_and_b32 s1, s2, s0 -; GFX9-NEXT: s_xor_b32 s0, s1, s0 -; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX9-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX9-NEXT: ; return to shader part epilog +; GCN-LABEL: v_orn2_i16_vs: +; GCN: ; %bb.0: +; GCN-NEXT: s_xor_b32 s0, s2, -1 +; GCN-NEXT: v_or_b32_e32 v0, s0, v0 +; GCN-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GCN-NEXT: ; return to shader part epilog %not.src1 = xor i16 %src1, -1 %or = or i16 %src0, %not.src1 %zext = zext i16 %or to i32 @@ -346,8 +298,7 @@ define amdgpu_ps i32 @s_orn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1) ; ; GFX9-LABEL: s_orn2_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_xor_b32 s0, s3, -1 -; GFX9-NEXT: s_or_b32 s0, s2, s0 +; GFX9-NEXT: s_orn2_b32 s0, s2, s3 ; GFX9-NEXT: ; return to shader part epilog %not.src1 = xor <2 x i16> %src1, %or = or <2 x i16> %src0, %not.src1 @@ -371,8 +322,7 @@ define amdgpu_ps i32 @s_orn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inre ; ; GFX9-LABEL: s_orn2_v2i16_commute: ; GFX9: ; 
%bb.0: -; GFX9-NEXT: s_xor_b32 s0, s3, -1 -; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: s_orn2_b32 s0, s2, s3 ; GFX9-NEXT: ; return to shader part epilog %not.src1 = xor <2 x i16> %src1, %or = or <2 x i16> %not.src1, %src0 @@ -397,7 +347,7 @@ define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_use(<2 x i16> inreg %src0, <2 ; GFX9-LABEL: s_orn2_v2i16_multi_use: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_xor_b32 s1, s3, -1 -; GFX9-NEXT: s_or_b32 s0, s2, s1 +; GFX9-NEXT: s_orn2_b32 s0, s2, s3 ; GFX9-NEXT: ; return to shader part epilog %not.src1 = xor <2 x i16> %src1, %or = or <2 x i16> %src0, %not.src1 @@ -429,9 +379,8 @@ define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_foldable_use(<2 x i16> inreg % ; ; GFX9-LABEL: s_orn2_v2i16_multi_foldable_use: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_xor_b32 s1, s4, -1 -; GFX9-NEXT: s_or_b32 s0, s2, s1 -; GFX9-NEXT: s_or_b32 s1, s3, s1 +; GFX9-NEXT: s_orn2_b32 s0, s2, s4 +; GFX9-NEXT: s_orn2_b32 s1, s3, s4 ; GFX9-NEXT: ; return to shader part epilog %not.src2 = xor <2 x i16> %src2, %or0 = or <2 x i16> %src0, %not.src2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir new file mode 100644 index 0000000000000..b7491ab4dafb0 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir @@ -0,0 +1,267 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s + +--- +name: remove_and_255_zextload +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: remove_and_255_zextload + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK: %ptr:_(p1) = COPY $vgpr0_vgpr1 + ; CHECK: %load:_(s32) = G_ZEXTLOAD %ptr(p1) :: (load 1, addrspace 1) + ; CHECK: $vgpr0 = COPY %load(s32) + %ptr:_(p1) = COPY $vgpr0_vgpr1 + %load:_(s32) = G_ZEXTLOAD %ptr :: 
(load 1, addrspace 1, align 1) + %mask:_(s32) = G_CONSTANT i32 255 + %and:_(s32) = G_AND %load, %mask + $vgpr0 = COPY %and + +... + +--- +name: remove_and_255_smin_zextload +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; CHECK-LABEL: name: remove_and_255_smin_zextload + ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; CHECK: %ptr0:_(p1) = COPY $vgpr0_vgpr1 + ; CHECK: %ptr1:_(p1) = COPY $vgpr2_vgpr3 + ; CHECK: %load0:_(s32) = G_ZEXTLOAD %ptr0(p1) :: (load 1, addrspace 1) + ; CHECK: %load1:_(s32) = G_ZEXTLOAD %ptr1(p1) :: (load 1, addrspace 1) + ; CHECK: %smin:_(s32) = G_SMIN %load0, %load1 + ; CHECK: $vgpr0 = COPY %smin(s32) + %ptr0:_(p1) = COPY $vgpr0_vgpr1 + %ptr1:_(p1) = COPY $vgpr2_vgpr3 + %load0:_(s32) = G_ZEXTLOAD %ptr0 :: (load 1, addrspace 1, align 1) + %load1:_(s32) = G_ZEXTLOAD %ptr1 :: (load 1, addrspace 1, align 1) + %smin:_(s32) = G_SMIN %load0, %load1 + %mask:_(s32) = G_CONSTANT i32 255 + %and:_(s32) = G_AND %smin, %mask + $vgpr0 = COPY %and + +... + +--- +name: remove_and_255_smax_zextload +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; CHECK-LABEL: name: remove_and_255_smax_zextload + ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; CHECK: %ptr0:_(p1) = COPY $vgpr0_vgpr1 + ; CHECK: %ptr1:_(p1) = COPY $vgpr2_vgpr3 + ; CHECK: %load0:_(s32) = G_ZEXTLOAD %ptr0(p1) :: (load 1, addrspace 1) + ; CHECK: %load1:_(s32) = G_ZEXTLOAD %ptr1(p1) :: (load 1, addrspace 1) + ; CHECK: %smax:_(s32) = G_SMAX %load0, %load1 + ; CHECK: $vgpr0 = COPY %smax(s32) + %ptr0:_(p1) = COPY $vgpr0_vgpr1 + %ptr1:_(p1) = COPY $vgpr2_vgpr3 + %load0:_(s32) = G_ZEXTLOAD %ptr0 :: (load 1, addrspace 1, align 1) + %load1:_(s32) = G_ZEXTLOAD %ptr1 :: (load 1, addrspace 1, align 1) + %smax:_(s32) = G_SMAX %load0, %load1 + %mask:_(s32) = G_CONSTANT i32 255 + %and:_(s32) = G_AND %smax, %mask + $vgpr0 = COPY %and + +... 
+ +--- +name: remove_and_255_umin_zextload +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; CHECK-LABEL: name: remove_and_255_umin_zextload + ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; CHECK: %ptr0:_(p1) = COPY $vgpr0_vgpr1 + ; CHECK: %ptr1:_(p1) = COPY $vgpr2_vgpr3 + ; CHECK: %load0:_(s32) = G_ZEXTLOAD %ptr0(p1) :: (load 1, addrspace 1) + ; CHECK: %load1:_(s32) = G_ZEXTLOAD %ptr1(p1) :: (load 1, addrspace 1) + ; CHECK: %umin:_(s32) = G_UMIN %load0, %load1 + ; CHECK: $vgpr0 = COPY %umin(s32) + %ptr0:_(p1) = COPY $vgpr0_vgpr1 + %ptr1:_(p1) = COPY $vgpr2_vgpr3 + %load0:_(s32) = G_ZEXTLOAD %ptr0 :: (load 1, addrspace 1, align 1) + %load1:_(s32) = G_ZEXTLOAD %ptr1 :: (load 1, addrspace 1, align 1) + %umin:_(s32) = G_UMIN %load0, %load1 + %mask:_(s32) = G_CONSTANT i32 255 + %and:_(s32) = G_AND %umin, %mask + $vgpr0 = COPY %and + +... + +--- +name: remove_and_255_umax_zextload +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; CHECK-LABEL: name: remove_and_255_umax_zextload + ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; CHECK: %ptr0:_(p1) = COPY $vgpr0_vgpr1 + ; CHECK: %ptr1:_(p1) = COPY $vgpr2_vgpr3 + ; CHECK: %load0:_(s32) = G_ZEXTLOAD %ptr0(p1) :: (load 1, addrspace 1) + ; CHECK: %load1:_(s32) = G_ZEXTLOAD %ptr1(p1) :: (load 1, addrspace 1) + ; CHECK: %umax:_(s32) = G_UMAX %load0, %load1 + ; CHECK: $vgpr0 = COPY %umax(s32) + %ptr0:_(p1) = COPY $vgpr0_vgpr1 + %ptr1:_(p1) = COPY $vgpr2_vgpr3 + %load0:_(s32) = G_ZEXTLOAD %ptr0 :: (load 1, addrspace 1, align 1) + %load1:_(s32) = G_ZEXTLOAD %ptr1 :: (load 1, addrspace 1, align 1) + %umax:_(s32) = G_UMAX %load0, %load1 + %mask:_(s32) = G_CONSTANT i32 255 + %and:_(s32) = G_AND %umax, %mask + $vgpr0 = COPY %and + +... 
+ +# Don't have enough known bits for lhs +--- +name: remove_and_255_smin_fail_lhs +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; CHECK-LABEL: name: remove_and_255_smin_fail_lhs + ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; CHECK: %ptr0:_(p1) = COPY $vgpr0_vgpr1 + ; CHECK: %ptr1:_(p1) = COPY $vgpr2_vgpr3 + ; CHECK: %load0:_(s32) = G_LOAD %ptr0(p1) :: (load 4, addrspace 1) + ; CHECK: %load1:_(s32) = G_ZEXTLOAD %ptr1(p1) :: (load 1, addrspace 1) + ; CHECK: %smin:_(s32) = G_SMIN %load0, %load1 + ; CHECK: %mask:_(s32) = G_CONSTANT i32 255 + ; CHECK: %and:_(s32) = G_AND %smin, %mask + ; CHECK: $vgpr0 = COPY %and(s32) + %ptr0:_(p1) = COPY $vgpr0_vgpr1 + %ptr1:_(p1) = COPY $vgpr2_vgpr3 + %load0:_(s32) = G_LOAD %ptr0 :: (load 4, addrspace 1, align 4) + %load1:_(s32) = G_ZEXTLOAD %ptr1 :: (load 1, addrspace 1, align 1) + %smin:_(s32) = G_SMIN %load0, %load1 + %mask:_(s32) = G_CONSTANT i32 255 + %and:_(s32) = G_AND %smin, %mask + $vgpr0 = COPY %and + +... 
+ +# Don't have enough known bits for rhs +--- +name: remove_and_255_smin_fail_rhs +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; CHECK-LABEL: name: remove_and_255_smin_fail_rhs + ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; CHECK: %ptr0:_(p1) = COPY $vgpr0_vgpr1 + ; CHECK: %ptr1:_(p1) = COPY $vgpr2_vgpr3 + ; CHECK: %load0:_(s32) = G_ZEXTLOAD %ptr0(p1) :: (load 1, addrspace 1) + ; CHECK: %load1:_(s32) = G_LOAD %ptr1(p1) :: (load 4, addrspace 1) + ; CHECK: %smin:_(s32) = G_SMIN %load0, %load1 + ; CHECK: %mask:_(s32) = G_CONSTANT i32 255 + ; CHECK: %and:_(s32) = G_AND %smin, %mask + ; CHECK: $vgpr0 = COPY %and(s32) + %ptr0:_(p1) = COPY $vgpr0_vgpr1 + %ptr1:_(p1) = COPY $vgpr2_vgpr3 + %load0:_(s32) = G_ZEXTLOAD %ptr0 :: (load 1, addrspace 1, align 1) + %load1:_(s32) = G_LOAD %ptr1 :: (load 4, addrspace 1, align 4) + %smin:_(s32) = G_SMIN %load0, %load1 + %mask:_(s32) = G_CONSTANT i32 255 + %and:_(s32) = G_AND %smin, %mask + $vgpr0 = COPY %and + +... + +# Test known bits for groupstaticsize is the maximum LDS size. +--- +name: remove_and_65535_groupstaticsize +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: remove_and_65535_groupstaticsize + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK: %lds_size:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.groupstaticsize) + ; CHECK: %mask:_(s32) = G_CONSTANT i32 65535 + ; CHECK: %and:_(s32) = G_AND %lds_size, %mask + ; CHECK: $vgpr0 = COPY %and(s32) + %ptr:_(p1) = COPY $vgpr0_vgpr1 + %lds_size:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.groupstaticsize) + %mask:_(s32) = G_CONSTANT i32 65535 + %and:_(s32) = G_AND %lds_size, %mask + $vgpr0 = COPY %and + +... 
+ +--- +name: remove_and_131071_groupstaticsize +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: remove_and_131071_groupstaticsize + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK: %lds_size:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.groupstaticsize) + ; CHECK: $vgpr0 = COPY %lds_size(s32) + %ptr:_(p1) = COPY $vgpr0_vgpr1 + %lds_size:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.groupstaticsize) + %mask:_(s32) = G_CONSTANT i32 131071 + %and:_(s32) = G_AND %lds_size, %mask + $vgpr0 = COPY %and + +... + +--- +name: no_remove_and_65536_groupstaticsize +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: no_remove_and_65536_groupstaticsize + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK: %lds_size:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.groupstaticsize) + ; CHECK: %mask:_(s32) = G_CONSTANT i32 65536 + ; CHECK: %and:_(s32) = G_AND %lds_size, %mask + ; CHECK: $vgpr0 = COPY %and(s32) + %ptr:_(p1) = COPY $vgpr0_vgpr1 + %lds_size:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.groupstaticsize) + %mask:_(s32) = G_CONSTANT i32 65536 + %and:_(s32) = G_AND %lds_size, %mask + $vgpr0 = COPY %and + +... + +--- +name: no_remove_and_32767_groupstaticsize +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: no_remove_and_32767_groupstaticsize + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK: %lds_size:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.groupstaticsize) + ; CHECK: %mask:_(s32) = G_CONSTANT i32 32767 + ; CHECK: %and:_(s32) = G_AND %lds_size, %mask + ; CHECK: $vgpr0 = COPY %and(s32) + %ptr:_(p1) = COPY $vgpr0_vgpr1 + %lds_size:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.groupstaticsize) + %mask:_(s32) = G_CONSTANT i32 32767 + %and:_(s32) = G_AND %lds_size, %mask + $vgpr0 = COPY %and + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-select.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-select.mir index 89f58e1e76871..b8109fe6c87cf 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-select.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-select.mir @@ -6,19 +6,23 @@ name: select_from_different_results_of_unmerge_values tracksRegLiveness: true body: | bb.0: + liveins: $vgpr0 ; GCN-LABEL: name: select_from_different_results_of_unmerge_values + ; GCN: liveins: $vgpr0 ; GCN: [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF - ; GCN: [[DEF1:%[0-9]+]]:_(s1) = G_IMPLICIT_DEF + ; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32) ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<2 x s32>) - ; GCN: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[DEF1]](s1), [[UV]], [[UV1]] + ; GCN: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[TRUNC]](s1), [[UV]], [[UV1]] ; GCN: $vgpr0 = COPY [[SELECT]](s32) ; GCN: SI_RETURN_TO_EPILOG $vgpr0 - %2:_(<2 x s32>) = G_IMPLICIT_DEF - %4:_(s1) = G_IMPLICIT_DEF - %0:_(s32), %1:_(s32) = G_UNMERGE_VALUES %2:_(<2 x s32>) - %3:_(s32) = G_SELECT %4:_(s1), %0:_, %1:_ - $vgpr0 = COPY %3 + %0:_(<2 x s32>) = G_IMPLICIT_DEF + %1:_(s32) = COPY $vgpr0 + %2:_(s1) = G_TRUNC %1:_(s32) + %3:_(s32), %4:_(s32) = G_UNMERGE_VALUES %0:_(<2 x s32>) + %5:_(s32) = G_SELECT %2:_(s1), %3:_, %4:_ + $vgpr0 = COPY %5 SI_RETURN_TO_EPILOG $vgpr0 ... 
@@ -28,17 +32,20 @@ name: select_from_same_results_of_unmerge_values tracksRegLiveness: true body: | bb.0: + liveins: $vgpr0 ; GCN-LABEL: name: select_from_same_results_of_unmerge_values + ; GCN: liveins: $vgpr0 ; GCN: [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<2 x s32>) ; GCN: $vgpr0 = COPY [[UV]](s32) ; GCN: SI_RETURN_TO_EPILOG $vgpr0 - %2:_(<2 x s32>) = G_IMPLICIT_DEF - %4:_(s1) = G_IMPLICIT_DEF - %0:_(s32), %1:_(s32) = G_UNMERGE_VALUES %2:_(<2 x s32>) - %3:_(s32) = G_SELECT %4:_(s1), %0:_, %0:_ - $vgpr0 = COPY %3 + %0:_(<2 x s32>) = G_IMPLICIT_DEF + %1:_(s32) = COPY $vgpr0 + %2:_(s1) = G_TRUNC %1:_(s32) + %3:_(s32), %4:_(s32) = G_UNMERGE_VALUES %0:_(<2 x s32>) + %5:_(s32) = G_SELECT %2:_(s1), %3:_, %3:_ + $vgpr0 = COPY %5 SI_RETURN_TO_EPILOG $vgpr0 ... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readlane.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readlane.mir index cabef6bff2af5..172057d2c226f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readlane.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readlane.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s -# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s --- name: readlane_ss @@ -69,3 +69,78 @@ body: | %1:_(s32) = COPY $vgpr0 %2:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane), %0, %1 ... 
+ +--- +name: readlane_aa +legalized: true + +body: | + bb.0: + liveins: $agpr0, $agpr1 + ; CHECK-LABEL: name: readlane_aa + ; CHECK: [[COPY:%[0-9]+]]:agpr(s32) = COPY $agpr0 + ; CHECK: [[COPY1:%[0-9]+]]:agpr(s32) = COPY $agpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:vgpr_32(s32) = COPY [[COPY1]](s32) + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY3]](s32), implicit $exec + ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane), [[COPY2]](s32), [[V_READFIRSTLANE_B32_]](s32) + ; CHECK: S_ENDPGM 0, implicit [[INT]](s32) + %0:_(s32) = COPY $agpr0 + %1:_(s32) = COPY $agpr1 + %2:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane), %0, %1 + S_ENDPGM 0, implicit %2 +... + +--- +name: readlane_as +legalized: true + +body: | + bb.0: + liveins: $agpr0, $sgpr0 + ; CHECK-LABEL: name: readlane_as + ; CHECK: [[COPY:%[0-9]+]]:agpr(s32) = COPY $agpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane), [[COPY2]](s32), [[COPY1]](s32) + %0:_(s32) = COPY $agpr0 + %1:_(s32) = COPY $sgpr0 + %2:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane), %0, %1 +... 
+ +--- +name: readlane_sa +legalized: true + +body: | + bb.0: + liveins: $agpr0, $sgpr0 + ; CHECK-LABEL: name: readlane_sa + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:agpr(s32) = COPY $agpr0 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:vgpr_32(s32) = COPY [[COPY1]](s32) + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY3]](s32), implicit $exec + ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane), [[COPY2]](s32), [[V_READFIRSTLANE_B32_]](s32) + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $agpr0 + %2:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane), %0, %1 +... + +--- +name: readlane_va +legalized: true + +body: | + bb.0: + liveins: $vgpr0, $agpr0 + ; CHECK-LABEL: name: readlane_va + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:agpr(s32) = COPY $agpr0 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY [[COPY1]](s32) + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec + ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane), [[COPY]](s32), [[V_READFIRSTLANE_B32_]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $agpr0 + %2:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane), %0, %1 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-default.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-default.mir index dad757b41f175..47d036846aae6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-default.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-default.mir @@ -28,6 +28,7 @@ body: | bb.0: ; CHECK-LABEL: name: test_fconstant_f16_1 ; CHECK: [[C:%[0-9]+]]:sgpr(s16) = G_FCONSTANT half 0xH3C00 + ; CHECK: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[C]](s16) %0:_(s16) = G_FCONSTANT half 1.0 %1:_(s32) = G_ANYEXT %0 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-agpr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-agpr.mir new file mode 100644 index 0000000000000..a437d6059ddbe --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-agpr.mir @@ -0,0 +1,107 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs -run-pass=regbankselect -o - %s | FileCheck %s + +# Make sure that an arbitrary AGPR is treated as a divergent value +# that needs to be copied to VGPR, and then waterfalled + +# 32-bit case +--- +name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__agpr_soffset +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0, $vgpr1, $agpr0 + + ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__agpr_soffset + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0, $vgpr1, $agpr0 + ; CHECK: %val:vgpr(s32) = COPY $vgpr0 + ; CHECK: %rsrc:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK: %agpr:agpr(s32) = COPY $agpr0 + ; CHECK: %voffset:vgpr(s32) = COPY $vgpr1 + ; CHECK: %zero:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY %zero(s32) + ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY %agpr(s32) + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: .1: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.0, %9, %bb.1 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec + ; CHECK: G_AMDGPU_BUFFER_STORE %val(s32), %rsrc(<4 x 
s32>), [[COPY]](s32), %voffset, [[V_READFIRSTLANE_B32_]], 0, 0, 0 :: (dereferenceable store 4, addrspace 4) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK: .2: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: .3: + ; CHECK: S_ENDPGM 0 + %val:_(s32) = COPY $vgpr0 + %rsrc:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %agpr:_(s32) = COPY $agpr0 + %voffset:_(s32) = COPY $vgpr1 + %zero:_(s32) = G_CONSTANT i32 0 + G_AMDGPU_BUFFER_STORE %val, %rsrc, %zero, %voffset, %agpr, 0, 0, 0 :: (dereferenceable store 4, addrspace 4) + S_ENDPGM 0 + +... + +# Register tuple case +--- +name: load_1d_vgpr_vaddr__agpr_srsrc +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, $vgpr0 + ; CHECK-LABEL: name: load_1d_vgpr_vaddr__agpr_srsrc + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:agpr(<8 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(<8 x s32>) = COPY [[COPY]](<8 x s32>) + ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64), [[UV2:%[0-9]+]]:vreg_64(s64), [[UV3:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[COPY2]](<8 x s32>) + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: .1: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %8, %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x 
s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %2(<4 x s32>), %bb.1 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]].sub0(s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]].sub1(s64), implicit $exec + ; CHECK: [[MV2:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32) + ; CHECK: [[V_CMP_EQ_U64_e64_2:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV2]](s64), [[UV2]](s64), implicit $exec + ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_2]], [[S_AND_B64_]], implicit-def $scc + ; CHECK: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]].sub0(s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]].sub1(s64), implicit $exec + ; CHECK: 
[[MV3:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) + ; CHECK: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV3]](s64), [[UV3]](s64), implicit $exec + ; CHECK: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) + ; CHECK: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY1]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 16) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK: .2: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: .3: + ; CHECK: S_ENDPGM 0, implicit [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + %0:_(<8 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + %1:_(s32) = COPY $vgpr0 + %2:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, %1(s32), %0(<8 x s32>), 0, 0, 0 :: (dereferenceable load 16) + S_ENDPGM 0, implicit %2 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll new file mode 100644 index 0000000000000..b2f3dd8b2bf41 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll @@ -0,0 +1,541 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s + +; Test optimization to reduce shifts to narrower sizes. + +define amdgpu_ps i64 @s_shl_i64_zext_i32(i32 inreg %x) { +; GCN-LABEL: s_shl_i64_zext_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_andn2_b32 s0, s0, -2.0 +; GCN-NEXT: s_lshl_b32 s0, s0, 2 +; GCN-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000 +; GCN-NEXT: ; return to shader part epilog + %and = and i32 %x, 1073741823 + %ext = zext i32 %and to i64 + %shl = shl i64 %ext, 2 + ret i64 %shl +} + +define i64 @v_shl_i64_zext_i32(i32 %x) { +; GCN-LABEL: v_shl_i64_zext_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0x3fffffff, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %and = and i32 %x, 1073741823 + %ext = zext i32 %and to i64 + %shl = shl i64 %ext, 2 + ret i64 %shl +} + +define amdgpu_ps i64 @s_shl_i64_sext_i32(i32 inreg %x) { +; GCN-LABEL: s_shl_i64_sext_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_and_b32 s0, s0, 0x1fffffff +; GCN-NEXT: s_lshl_b32 s0, s0, 2 +; GCN-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000 +; GCN-NEXT: ; return to shader part epilog + %and = and i32 %x, 536870911 + %ext = sext i32 %and to i64 + %shl = shl i64 %ext, 2 + ret i64 %shl +} + +define i64 
@v_shl_i64_sext_i32(i32 %x) { +; GCN-LABEL: v_shl_i64_sext_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0x1fffffff, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %and = and i32 %x, 536870911 + %ext = sext i32 %and to i64 + %shl = shl i64 %ext, 2 + ret i64 %shl +} + +define amdgpu_ps i64 @s_shl_i64_zext_i32_overflow(i32 inreg %x) { +; GCN-LABEL: s_shl_i64_zext_i32_overflow: +; GCN: ; %bb.0: +; GCN-NEXT: s_bitset0_b32 s0, 31 +; GCN-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000 +; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN-NEXT: ; return to shader part epilog + %and = and i32 %x, 2147483647 + %ext = zext i32 %and to i64 + %shl = shl i64 %ext, 2 + ret i64 %shl +} + +define i64 @v_shl_i64_zext_i32_overflow(i32 %x) { +; GFX7-LABEL: v_shl_i64_zext_i32_overflow: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_shl_i64_zext_i32_overflow: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_shl_i64_zext_i32_overflow: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %and = and i32 %x, 2147483647 + %ext = zext i32 %and to i64 + %shl = shl i64 %ext, 2 + ret i64 %shl +} + +define amdgpu_ps i64 @s_shl_i64_sext_i32_overflow(i32 inreg %x) { +; GCN-LABEL: s_shl_i64_sext_i32_overflow: +; GCN: ; %bb.0: +; GCN-NEXT: s_bitset0_b32 s0, 31 +; GCN-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x200000 
+; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN-NEXT: ; return to shader part epilog + %and = and i32 %x, 2147483647 + %ext = sext i32 %and to i64 + %shl = shl i64 %ext, 2 + ret i64 %shl +} + +define i64 @v_shl_i64_sext_i32_overflow(i32 %x) { +; GFX7-LABEL: v_shl_i64_sext_i32_overflow: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_shl_i64_sext_i32_overflow: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_shl_i64_sext_i32_overflow: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %and = and i32 %x, 2147483647 + %ext = sext i32 %and to i64 + %shl = shl i64 %ext, 2 + ret i64 %shl +} + +define amdgpu_kernel void @mulu24_shl64(i32 addrspace(1)* nocapture %arg) { +; GFX7-LABEL: mulu24_shl64: +; GFX7: ; %bb.0: ; %bb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7-NEXT: v_and_b32_e32 v0, 6, v0 +; GFX7-NEXT: v_mul_u32_u24_e32 v0, 7, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: v_lshl_b64 v[2:3], v[0:1], 2 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: buffer_store_dword v1, v[2:3], s[0:3], 0 addr64 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: mulu24_shl64: +; GFX8: ; %bb.0: ; %bb +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: v_and_b32_e32 v0, 6, v0 +; GFX8-NEXT: v_mul_u32_u24_e32 v0, 7, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_lshlrev_b64 v[2:3], 2, 
v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; GFX8-NEXT: flat_store_dword v[2:3], v1 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: mulu24_shl64: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_and_b32_e32 v0, 6, v0 +; GFX9-NEXT: v_mul_u32_u24_e32 v0, 7, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc +; GFX9-NEXT: global_store_dword v[2:3], v1, off +; GFX9-NEXT: s_endpgm +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = and i32 %tmp, 6 + %mulconv = mul nuw nsw i32 %tmp1, 7 + %tmp2 = zext i32 %mulconv to i64 + %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp2 + store i32 0, i32 addrspace(1)* %tmp3, align 4 + ret void +} + +define amdgpu_kernel void @muli24_shl64(i64 addrspace(1)* nocapture %arg, i32 addrspace(1)* nocapture readonly %arg1) { +; GFX7-LABEL: muli24_shl64: +; GFX7: ; %bb.0: ; %bb +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX7-NEXT: buffer_load_dword v1, v[1:2], s[4:7], 0 addr64 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v0 +; GFX7-NEXT: v_mov_b32_e32 v4, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_or_b32_e32 v0, 0xff800000, v1 +; GFX7-NEXT: v_mul_i32_i24_e32 v1, -7, v0 +; GFX7-NEXT: v_lshl_b64 v[0:1], v[1:2], 3 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, v3, v5 +; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX7-NEXT: 
flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: muli24_shl64: +; GFX8: ; %bb.0: ; %bb +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GFX8-NEXT: flat_load_dword v4, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_or_b32_e32 v0, 0xff800000, v4 +; GFX8-NEXT: v_mul_i32_i24_e32 v0, -7, v0 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: muli24_shl64: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v1, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v1, 0xff800000, v1 +; GFX9-NEXT: v_mul_i32_i24_e32 v1, -7, v1 +; GFX9-NEXT: v_lshlrev_b64 v[1:2], 3, v[1:2] +; GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1] +; GFX9-NEXT: s_endpgm +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp2 = sext i32 %tmp to i64 + %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp2 + %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4 + %tmp5 = or i32 %tmp4, -8388608 + %tmp6 = mul nsw i32 %tmp5, -7 + %tmp7 = zext i32 %tmp6 to i64 + %tmp8 = shl nuw nsw i64 %tmp7, 3 + %tmp9 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 %tmp2 + store i64 %tmp8, i64 addrspace(1)* %tmp9, align 8 + ret void +} + +define amdgpu_ps <2 x i64> 
@s_shl_v2i64_zext_v2i32(<2 x i32> inreg %x) { +; GCN-LABEL: s_shl_v2i64_zext_v2i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_brev_b32 s2, -4 +; GCN-NEXT: s_mov_b32 s3, s2 +; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: s_bfe_u64 s[2:3], s[0:1], 0x200000 +; GCN-NEXT: s_mov_b32 s0, s1 +; GCN-NEXT: s_bfe_u64 s[4:5], s[0:1], 0x200000 +; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN-NEXT: s_lshl_b64 s[2:3], s[4:5], 2 +; GCN-NEXT: ; return to shader part epilog + %and = and <2 x i32> %x, + %ext = zext <2 x i32> %and to <2 x i64> + %shl = shl <2 x i64> %ext, + ret <2 x i64> %shl +} + +define <2 x i64> @v_shl_v2i64_zext_v2i32(<2 x i32> %x) { +; GFX7-LABEL: v_shl_v2i64_zext_v2i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_brev_b32 s4, -4 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], 2 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_shl_v2i64_zext_v2i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_brev_b32 s4, -4 +; GFX8-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_shl_v2i64_zext_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_brev_b32 s4, -4 +; GFX9-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %and = and <2 x i32> %x, + %ext = zext <2 x i32> %and to <2 x i64> + %shl = shl <2 x i64> %ext, + ret 
<2 x i64> %shl +} + +define amdgpu_ps <2 x i64> @s_shl_v2i64_sext_v2i32(<2 x i32> inreg %x) { +; GCN-LABEL: s_shl_v2i64_sext_v2i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_brev_b32 s2, -8 +; GCN-NEXT: s_mov_b32 s3, s2 +; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: s_bfe_i64 s[2:3], s[0:1], 0x200000 +; GCN-NEXT: s_mov_b32 s0, s1 +; GCN-NEXT: s_bfe_i64 s[4:5], s[0:1], 0x200000 +; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN-NEXT: s_lshl_b64 s[2:3], s[4:5], 2 +; GCN-NEXT: ; return to shader part epilog + %and = and <2 x i32> %x, + %ext = sext <2 x i32> %and to <2 x i64> + %shl = shl <2 x i64> %ext, + ret <2 x i64> %shl +} + +define <2 x i64> @v_shl_v2i64_sext_v2i32(<2 x i32> %x) { +; GFX7-LABEL: v_shl_v2i64_sext_v2i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_brev_b32 s4, -8 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], 2 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_shl_v2i64_sext_v2i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_brev_b32 s4, -8 +; GFX8-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX8-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_shl_v2i64_sext_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_brev_b32 s4, -8 +; GFX9-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] + 
%and = and <2 x i32> %x, + %ext = sext <2 x i32> %and to <2 x i64> + %shl = shl <2 x i64> %ext, + ret <2 x i64> %shl +} + +define amdgpu_ps i32 @s_shl_i32_zext_i16(i16 inreg %x) { +; GFX7-LABEL: s_shl_i32_zext_i16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_and_b32 s0, s0, 0x3fff +; GFX7-NEXT: s_lshl_b32 s0, s0, 2 +; GFX7-NEXT: s_and_b32 s0, s0, 0xffff +; GFX7-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_shl_i32_zext_i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_and_b32 s0, s0, 0x3fff +; GFX8-NEXT: s_bfe_u32 s1, 2, 0x100000 +; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_shl_i32_zext_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_and_b32 s0, s0, 0x3fff +; GFX9-NEXT: s_bfe_u32 s1, 2, 0x100000 +; GFX9-NEXT: s_lshl_b32 s0, s0, s1 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX9-NEXT: ; return to shader part epilog + %and = and i16 %x, 16383 + %ext = zext i16 %and to i32 + %shl = shl i32 %ext, 2 + ret i32 %shl +} + +define i32 @v_shl_i32_zext_i16(i16 %x) { +; GFX7-LABEL: v_shl_i32_zext_i16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, 0x3fff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_shl_i32_zext_i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v0, 0x3fff, v0 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, 2, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_shl_i32_zext_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, 0x3fff, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 2, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %and = and i16 %x, 16383 + %ext = zext i16 %and to i32 + %shl = shl i32 %ext, 2 + ret i32 %shl +} + +define amdgpu_ps <2 x i32> 
@s_shl_v2i32_zext_v2i16(<2 x i16> inreg %x) { +; GFX7-LABEL: s_shl_v2i32_zext_v2i16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s2, 0xffff +; GFX7-NEXT: s_lshl_b32 s1, s1, 16 +; GFX7-NEXT: s_and_b32 s0, s0, s2 +; GFX7-NEXT: s_or_b32 s0, s1, s0 +; GFX7-NEXT: s_and_b32 s0, s0, 0x3fff3fff +; GFX7-NEXT: s_lshr_b32 s1, s0, 16 +; GFX7-NEXT: s_and_b32 s0, s0, s2 +; GFX7-NEXT: s_lshl_b32 s0, s0, 2 +; GFX7-NEXT: s_lshl_b32 s1, s1, 2 +; GFX7-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_shl_v2i32_zext_v2i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_movk_i32 s2, 0x3fff +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: s_lshr_b32 s1, s0, 16 +; GFX8-NEXT: s_mov_b32 s3, s2 +; GFX8-NEXT: s_and_b32 s0, s0, s4 +; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GFX8-NEXT: s_mov_b32 s5, s4 +; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5] +; GFX8-NEXT: s_lshl_b32 s0, s0, 2 +; GFX8-NEXT: s_lshl_b32 s1, s1, 2 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_shl_v2i32_zext_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s0, s0, 0x3fff3fff +; GFX9-NEXT: s_lshr_b32 s1, s0, 16 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-NEXT: s_lshl_b32 s1, s1, 2 +; GFX9-NEXT: ; return to shader part epilog + %and = and <2 x i16> %x, + %ext = zext <2 x i16> %and to <2 x i32> + %shl = shl <2 x i32> %ext, + ret <2 x i32> %shl +} + +; FIXME: This doesn't do what we want. The pre-legalizer combiner +; fails to handle the vector splat. The post-legalizer sees the zext +; legalized into the and. This is probably not that important, since +; we really do this combine in the machine level for lowered +; getelementptrs. 
+define <2 x i32> @v_shl_v2i32_zext_v2i16(<2 x i16> %x) { +; GFX7-LABEL: v_shl_v2i32_zext_v2i16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0x3fff3fff, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_shl_v2i32_zext_v2i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v1, 0x3fff3fff, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_shl_v2i32_zext_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 2 +; GFX9-NEXT: v_and_b32_e32 v1, 0x3fff3fff, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %and = and <2 x i16> %x, + %ext = zext <2 x i16> %and to <2 x i32> + %shl = shl <2 x i32> %ext, + ret <2 x i32> %shl +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone speculatable willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll index 015f6b5de8b04..f58e26604529e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll @@ -551,13 +551,11 @@ define amdgpu_ps i32 
@s_shl_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amoun ; ; GFX8-LABEL: s_shl_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_mov_b32 s3, 0xffff +; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_lshr_b32 s4, s1, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s3 ; GFX8-NEXT: s_and_b32 s1, s1, s3 -; GFX8-NEXT: s_and_b32 s2, s2, s3 -; GFX8-NEXT: s_and_b32 s4, s4, s3 ; GFX8-NEXT: s_lshl_b32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s2, s4 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 @@ -722,21 +720,17 @@ define amdgpu_ps <2 x i32> @s_shl_v4i16(<4 x i16> inreg %value, <4 x i16> inreg ; ; GFX8-LABEL: s_shl_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s4, s0, 16 ; GFX8-NEXT: s_mov_b32 s6, 0xffff +; GFX8-NEXT: s_lshr_b32 s4, s0, 16 ; GFX8-NEXT: s_lshr_b32 s7, s2, 16 -; GFX8-NEXT: s_lshr_b32 s5, s1, 16 -; GFX8-NEXT: s_lshr_b32 s8, s3, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s6 ; GFX8-NEXT: s_and_b32 s2, s2, s6 -; GFX8-NEXT: s_and_b32 s4, s4, s6 -; GFX8-NEXT: s_and_b32 s7, s7, s6 ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: s_lshl_b32 s2, s4, s7 +; GFX8-NEXT: s_lshr_b32 s5, s1, 16 +; GFX8-NEXT: s_lshr_b32 s8, s3, 16 ; GFX8-NEXT: s_and_b32 s1, s1, s6 ; GFX8-NEXT: s_and_b32 s3, s3, s6 -; GFX8-NEXT: s_and_b32 s5, s5, s6 -; GFX8-NEXT: s_and_b32 s8, s8, s6 ; GFX8-NEXT: s_lshl_b32 s1, s1, s3 ; GFX8-NEXT: s_lshl_b32 s3, s5, s8 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 @@ -898,39 +892,31 @@ define amdgpu_ps <4 x i32> @s_shl_v8i16(<8 x i16> inreg %value, <8 x i16> inreg ; ; GFX8-LABEL: s_shl_v8i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s8, s0, 16 ; GFX8-NEXT: s_mov_b32 s12, 0xffff +; GFX8-NEXT: s_lshr_b32 s8, s0, 16 ; GFX8-NEXT: s_lshr_b32 s13, s4, 16 -; GFX8-NEXT: s_lshr_b32 s9, s1, 16 -; GFX8-NEXT: s_lshr_b32 s14, s5, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s12 ; GFX8-NEXT: s_and_b32 s4, s4, s12 -; GFX8-NEXT: s_and_b32 s8, s8, s12 -; GFX8-NEXT: s_and_b32 s13, s13, s12 -; GFX8-NEXT: s_lshr_b32 s10, s2, 16 -; GFX8-NEXT: s_lshr_b32 s15, s6, 16 ; GFX8-NEXT: s_lshl_b32 s0, s0, s4 ; 
GFX8-NEXT: s_lshl_b32 s4, s8, s13 +; GFX8-NEXT: s_lshr_b32 s9, s1, 16 +; GFX8-NEXT: s_lshr_b32 s14, s5, 16 ; GFX8-NEXT: s_and_b32 s1, s1, s12 ; GFX8-NEXT: s_and_b32 s5, s5, s12 -; GFX8-NEXT: s_and_b32 s9, s9, s12 -; GFX8-NEXT: s_and_b32 s14, s14, s12 -; GFX8-NEXT: s_lshr_b32 s11, s3, 16 -; GFX8-NEXT: s_lshr_b32 s16, s7, 16 ; GFX8-NEXT: s_lshl_b32 s1, s1, s5 +; GFX8-NEXT: s_lshr_b32 s10, s2, 16 +; GFX8-NEXT: s_lshr_b32 s15, s6, 16 ; GFX8-NEXT: s_and_b32 s2, s2, s12 ; GFX8-NEXT: s_and_b32 s6, s6, s12 -; GFX8-NEXT: s_and_b32 s10, s10, s12 -; GFX8-NEXT: s_and_b32 s15, s15, s12 ; GFX8-NEXT: s_lshl_b32 s5, s9, s14 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s12 ; GFX8-NEXT: s_lshl_b32 s2, s2, s6 +; GFX8-NEXT: s_lshr_b32 s11, s3, 16 +; GFX8-NEXT: s_lshr_b32 s16, s7, 16 ; GFX8-NEXT: s_or_b32 s0, s4, s0 ; GFX8-NEXT: s_and_b32 s3, s3, s12 ; GFX8-NEXT: s_and_b32 s7, s7, s12 -; GFX8-NEXT: s_and_b32 s11, s11, s12 -; GFX8-NEXT: s_and_b32 s16, s16, s12 ; GFX8-NEXT: s_lshl_b32 s6, s10, s15 ; GFX8-NEXT: s_lshl_b32 s4, s5, 16 ; GFX8-NEXT: s_and_b32 s1, s1, s12 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll index 6814f5bb18437..8c2d18406f475 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -205,10 +205,7 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX6-NEXT: v_min_u32_e32 v2, v3, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: v_mov_b32_e32 v2, 0xff ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: v_and_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -223,10 +220,9 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX8-NEXT: v_add_u16_e64 v0, v0, v1 clamp ; GFX8-NEXT: v_add_u16_e64 
v1, v3, v2 clamp -; GFX8-NEXT: v_mov_b32_e32 v2, 0xff -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_v2i8: @@ -291,10 +287,7 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX6-NEXT: s_cmp_lt_u32 s3, s2 ; GFX6-NEXT: s_cselect_b32 s2, s3, s2 ; GFX6-NEXT: s_add_i32 s1, s1, s2 -; GFX6-NEXT: s_movk_i32 s2, 0xff ; GFX6-NEXT: s_lshr_b32 s1, s1, 24 -; GFX6-NEXT: s_and_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -311,11 +304,10 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp ; GFX8-NEXT: s_lshl_b32 s0, s2, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, 0xff ; GFX8-NEXT: v_add_u16_e64 v1, s0, v1 clamp -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -399,24 +391,19 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX6-NEXT: v_min_u32_e32 v3, v5, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, 
v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: s_movk_i32 s4, 0xff -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v7 ; GFX6-NEXT: v_xor_b32_e32 v5, -1, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX6-NEXT: v_min_u32_e32 v4, v5, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 24, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -565,17 +552,12 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX6-NEXT: s_cmp_lt_u32 s5, s4 ; GFX6-NEXT: s_cselect_b32 s4, s5, s4 ; GFX6-NEXT: s_add_i32 s3, s3, s4 -; GFX6-NEXT: s_movk_i32 s4, 0xff -; GFX6-NEXT: s_and_b32 s1, s1, s4 -; GFX6-NEXT: s_and_b32 s0, s0, s4 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s4 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_lshl_b32 s1, s2, 16 ; GFX6-NEXT: s_lshr_b32 s3, s3, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s3, s4 -; GFX6-NEXT: s_lshl_b32 s1, s1, 24 +; GFX6-NEXT: s_lshl_b32 s1, s3, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; @@ -1892,10 +1874,7 @@ define amdgpu_ps i32 @s_uaddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; GFX6-NEXT: s_cmp_lt_u32 s3, s2 ; GFX6-NEXT: 
s_cselect_b32 s2, s3, s2 ; GFX6-NEXT: s_add_i32 s1, s1, s2 -; GFX6-NEXT: s_mov_b32 s2, 0xffff ; GFX6-NEXT: s_lshr_b32 s1, s1, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -1946,10 +1925,7 @@ define amdgpu_ps float @uaddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; GFX6-NEXT: v_min_u32_e32 v1, s1, v1 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s0, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s0, 0xffff ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -1994,10 +1970,7 @@ define amdgpu_ps float @uaddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; GFX6-NEXT: v_min_u32_e32 v2, s0, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s0, 0xffff ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -2063,19 +2036,14 @@ define <2 x float> @v_uaddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX6-NEXT: v_xor_b32_e32 v5, -1, v3 ; GFX6-NEXT: v_min_u32_e32 v4, v5, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: 
v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uaddsat_v4i16: @@ -2142,16 +2110,11 @@ define amdgpu_ps <2 x i32> @s_uaddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX6-NEXT: s_cmp_lt_u32 s5, s4 ; GFX6-NEXT: s_cselect_b32 s4, s5, s4 ; GFX6-NEXT: s_add_i32 s3, s3, s4 -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: s_and_b32 s1, s1, s4 -; GFX6-NEXT: s_lshr_b32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s0, s0, s4 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_lshr_b32 s3, s3, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s4 -; GFX6-NEXT: s_and_b32 s2, s3, s4 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_or_b32 s1, s1, s2 +; GFX6-NEXT: s_lshl_b32 s1, s3, 16 +; GFX6-NEXT: s_or_b32 s1, s2, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_uaddsat_v4i16: @@ -2241,29 +2204,22 @@ define <3 x float> @v_uaddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX6-NEXT: v_min_u32_e32 v6, v7, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v11 ; GFX6-NEXT: v_xor_b32_e32 v7, -1, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_min_u32_e32 v6, v7, v6 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; 
GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uaddsat_v6i16: @@ -2351,20 +2307,13 @@ define amdgpu_ps <3 x i32> @s_uaddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX6-NEXT: s_cmp_lt_u32 s7, s6 ; GFX6-NEXT: s_cselect_b32 s6, s7, s6 ; GFX6-NEXT: s_add_i32 s5, s5, s6 -; GFX6-NEXT: s_mov_b32 s6, 0xffff -; GFX6-NEXT: s_and_b32 s1, s1, s6 -; GFX6-NEXT: s_and_b32 s0, s0, s6 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s6 -; GFX6-NEXT: s_and_b32 s2, s3, s6 +; GFX6-NEXT: s_lshl_b32 s1, s3, 16 ; GFX6-NEXT: s_lshr_b32 s5, s5, 16 -; GFX6-NEXT: s_and_b32 s3, s5, s6 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s4, s6 -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_lshl_b32 s2, s5, 16 +; GFX6-NEXT: s_or_b32 s2, s4, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_uaddsat_v6i16: @@ -2466,36 +2415,27 @@ define <4 x float> @v_uaddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX6-NEXT: v_xor_b32_e32 v9, -1, v6 ; GFX6-NEXT: v_min_u32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 
16, v15 ; GFX6-NEXT: v_xor_b32_e32 v9, -1, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_min_u32_e32 v8, v9, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v4 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uaddsat_v8i16: @@ -2603,24 +2543,15 @@ define amdgpu_ps <4 x i32> @s_uaddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX6-NEXT: s_cmp_lt_u32 s9, s8 ; GFX6-NEXT: s_cselect_b32 s8, s9, s8 ; GFX6-NEXT: s_add_i32 s7, s7, s8 -; GFX6-NEXT: s_mov_b32 s8, 0xffff -; GFX6-NEXT: s_and_b32 s1, s1, s8 -; GFX6-NEXT: s_and_b32 s0, s0, s8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s8 -; GFX6-NEXT: s_and_b32 s2, s3, s8 
-; GFX6-NEXT: s_and_b32 s3, s5, s8 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_lshl_b32 s1, s3, 16 ; GFX6-NEXT: s_lshr_b32 s7, s7, 16 -; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s4, s8 -; GFX6-NEXT: s_and_b32 s4, s7, s8 -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_and_b32 s3, s6, s8 -; GFX6-NEXT: s_lshl_b32 s4, s4, 16 -; GFX6-NEXT: s_or_b32 s3, s3, s4 +; GFX6-NEXT: s_lshl_b32 s3, s7, 16 +; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_lshl_b32 s2, s5, 16 +; GFX6-NEXT: s_or_b32 s2, s4, s2 +; GFX6-NEXT: s_or_b32 s3, s6, s3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_uaddsat_v8i16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll index b1a1a313f7e62..6dabfc3c495b2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -199,10 +199,7 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX6-NEXT: v_min_u32_e32 v2, v1, v2 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: v_mov_b32_e32 v2, 0xff ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: v_and_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -217,10 +214,9 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp ; GFX8-NEXT: v_sub_u16_e64 v1, v3, v2 clamp -; GFX8-NEXT: v_mov_b32_e32 v2, 0xff -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: 
v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v2i8: @@ -283,10 +279,7 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX6-NEXT: s_cmp_lt_u32 s1, s2 ; GFX6-NEXT: s_cselect_b32 s2, s1, s2 ; GFX6-NEXT: s_sub_i32 s1, s1, s2 -; GFX6-NEXT: s_movk_i32 s2, 0xff ; GFX6-NEXT: s_lshr_b32 s1, s1, 24 -; GFX6-NEXT: s_and_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -303,11 +296,10 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp ; GFX8-NEXT: s_lshl_b32 s0, s2, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, 0xff ; GFX8-NEXT: v_sub_u16_e64 v1, s0, v1 clamp -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -386,25 +378,20 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v6 ; GFX6-NEXT: v_min_u32_e32 v3, v2, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: s_movk_i32 s4, 0xff ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX6-NEXT: v_min_u32_e32 v4, 
v3, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 24, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -549,17 +536,12 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX6-NEXT: s_cmp_lt_u32 s3, s4 ; GFX6-NEXT: s_cselect_b32 s4, s3, s4 ; GFX6-NEXT: s_sub_i32 s3, s3, s4 -; GFX6-NEXT: s_movk_i32 s4, 0xff -; GFX6-NEXT: s_and_b32 s1, s1, s4 -; GFX6-NEXT: s_and_b32 s0, s0, s4 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s4 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_lshl_b32 s1, s2, 16 ; GFX6-NEXT: s_lshr_b32 s3, s3, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s3, s4 -; GFX6-NEXT: s_lshl_b32 s1, s1, 24 +; GFX6-NEXT: s_lshl_b32 s1, s3, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; @@ -1802,10 +1784,7 @@ define amdgpu_ps i32 @s_usubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; GFX6-NEXT: s_cmp_lt_u32 s1, s2 ; GFX6-NEXT: s_cselect_b32 s2, s1, s2 ; GFX6-NEXT: s_sub_i32 s1, s1, s2 -; GFX6-NEXT: s_mov_b32 s2, 0xffff ; GFX6-NEXT: s_lshr_b32 s1, s1, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -1854,10 +1833,7 @@ define amdgpu_ps float 
@usubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; GFX6-NEXT: v_min_u32_e32 v1, s0, v1 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s0, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s0, 0xffff ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -1900,10 +1876,7 @@ define amdgpu_ps float @usubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; GFX6-NEXT: v_min_u32_e32 v2, s0, v1 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s0, 0xffff ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -1965,19 +1938,14 @@ define <2 x float> @v_usubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX6-NEXT: v_min_u32_e32 v4, v3, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_usubsat_v4i16: @@ -2040,16 
+2008,11 @@ define amdgpu_ps <2 x i32> @s_usubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX6-NEXT: s_cmp_lt_u32 s3, s4 ; GFX6-NEXT: s_cselect_b32 s4, s3, s4 ; GFX6-NEXT: s_sub_i32 s3, s3, s4 -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: s_and_b32 s1, s1, s4 -; GFX6-NEXT: s_lshr_b32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s0, s0, s4 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_lshr_b32 s3, s3, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s4 -; GFX6-NEXT: s_and_b32 s2, s3, s4 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_or_b32 s1, s1, s2 +; GFX6-NEXT: s_lshl_b32 s1, s3, 16 +; GFX6-NEXT: s_or_b32 s1, s2, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_usubsat_v4i16: @@ -2133,29 +2096,22 @@ define <3 x float> @v_usubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v10 ; GFX6-NEXT: v_min_u32_e32 v6, v4, v6 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_min_u32_e32 v6, v5, v6 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: 
v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_usubsat_v6i16: @@ -2237,20 +2193,13 @@ define amdgpu_ps <3 x i32> @s_usubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX6-NEXT: s_cmp_lt_u32 s5, s6 ; GFX6-NEXT: s_cselect_b32 s6, s5, s6 ; GFX6-NEXT: s_sub_i32 s5, s5, s6 -; GFX6-NEXT: s_mov_b32 s6, 0xffff -; GFX6-NEXT: s_and_b32 s1, s1, s6 -; GFX6-NEXT: s_and_b32 s0, s0, s6 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s6 -; GFX6-NEXT: s_and_b32 s2, s3, s6 +; GFX6-NEXT: s_lshl_b32 s1, s3, 16 ; GFX6-NEXT: s_lshr_b32 s5, s5, 16 -; GFX6-NEXT: s_and_b32 s3, s5, s6 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s4, s6 -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_lshl_b32 s2, s5, 16 +; GFX6-NEXT: s_or_b32 s2, s4, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_usubsat_v6i16: @@ -2344,36 +2293,27 @@ define <4 x float> @v_usubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v14 ; GFX6-NEXT: v_min_u32_e32 v8, v6, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v15 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_min_u32_e32 v8, v7, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 
v7, v8 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v4 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_usubsat_v8i16: @@ -2473,24 +2413,15 @@ define amdgpu_ps <4 x i32> @s_usubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX6-NEXT: s_cmp_lt_u32 s7, s8 ; GFX6-NEXT: s_cselect_b32 s8, s7, s8 ; GFX6-NEXT: s_sub_i32 s7, s7, s8 -; GFX6-NEXT: s_mov_b32 s8, 0xffff -; GFX6-NEXT: s_and_b32 s1, s1, s8 -; GFX6-NEXT: s_and_b32 s0, s0, s8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s8 -; GFX6-NEXT: s_and_b32 s2, s3, s8 -; GFX6-NEXT: s_and_b32 s3, s5, s8 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_lshl_b32 s1, s3, 16 ; GFX6-NEXT: s_lshr_b32 s7, s7, 16 -; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s4, s8 -; GFX6-NEXT: s_and_b32 s4, s7, s8 -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: 
s_and_b32 s3, s6, s8 -; GFX6-NEXT: s_lshl_b32 s4, s4, 16 -; GFX6-NEXT: s_or_b32 s3, s3, s4 +; GFX6-NEXT: s_lshl_b32 s3, s7, 16 +; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_lshl_b32 s2, s5, 16 +; GFX6-NEXT: s_or_b32 s2, s4, s2 +; GFX6-NEXT: s_or_b32 s3, s6, s3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_usubsat_v8i16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll index 41dc973c02585..4edc231fc1410 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll @@ -37,7 +37,6 @@ define amdgpu_ps i32 @scalar_xnor_v2i16_one_use(<2 x i16> inreg %a, <2 x i16> in ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: s_mov_b32 s3, s2 ; GFX8-NEXT: s_and_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s1, s1, s2 ; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 @@ -118,21 +117,19 @@ define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> in ; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GFX8-NEXT: s_mov_b32 s4, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s0, 16 -; GFX8-NEXT: s_lshr_b32 s5, s1, 16 ; GFX8-NEXT: s_and_b32 s2, s0, s4 -; GFX8-NEXT: s_and_b32 s0, s1, s4 -; GFX8-NEXT: s_and_b32 s1, s5, s4 ; GFX8-NEXT: s_mov_b32 s5, s4 -; GFX8-NEXT: s_and_b32 s3, s3, s4 -; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], s[4:5] -; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5] +; GFX8-NEXT: s_lshr_b32 s7, s1, 16 +; GFX8-NEXT: s_and_b32 s6, s1, s4 +; GFX8-NEXT: s_and_b64 s[0:1], s[2:3], s[4:5] +; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GFX8-NEXT: s_and_b64 s[2:3], s[6:7], s[4:5] ; GFX8-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] -; GFX8-NEXT: s_xor_b64 s[6:7], s[0:1], s[4:5] -; GFX8-NEXT: s_and_b32 s1, s2, s4 -; GFX8-NEXT: s_lshl_b32 s0, s3, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_lshl_b32 s1, s7, 16 -; GFX8-NEXT: s_and_b32 s2, s6, s4 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: 
s_and_b32 s0, s0, s4 +; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_lshl_b32 s1, s3, 16 +; GFX8-NEXT: s_and_b32 s2, s2, s4 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/barrier-elimination.ll b/llvm/test/CodeGen/AMDGPU/barrier-elimination.ll index c526baaab9cda..84b17f08bc3b1 100644 --- a/llvm/test/CodeGen/AMDGPU/barrier-elimination.ll +++ b/llvm/test/CodeGen/AMDGPU/barrier-elimination.ll @@ -1,4 +1,5 @@ ; RUN: llc -march=amdgcn < %s | FileCheck %s +; RUN: llc -march=amdgcn < %s -global-isel | FileCheck %s ; CHECK-LABEL: {{^}}unknown_wgs: ; CHECK: s_barrier diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll index 2dc47ca94aa90..d23538cadcbd1 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -624,9 +624,9 @@ define void @too_many_args_use_workitem_id_x_byval( ; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7 +; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}} ; FIXEDABI: s_movk_i32 s32, 0x400{{$}} ; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140 -; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}} ; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}} @@ -669,8 +669,8 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1 ; FIXED-ABI-NOT: v31 ; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7{{$}} -; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140{{$}} ; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], s33{{$}} +; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140{{$}} ; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}} ; FIXEDABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index 68fe7ec623420..4aa97c57cbd9c 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ 
b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -1,6 +1,8 @@ -; RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s 2>&1 | FileCheck %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s 2>&1 | FileCheck -check-prefix=SDAG-ERR %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s 2>&1 | FileCheck -check-prefix=GISEL-ERR %s -; CHECK: LLVM ERROR: unsupported libcall legalization +; SDAG-ERR: LLVM ERROR: unsupported libcall legalization +; GISEL-ERR: LLVM ERROR: unable to legalize instruction: %{{[0-9]+}}:_(s128) = G_SDIV %{{[0-9]+}}:_, %{{[0-9]+}}:_ (in function: v_sdiv_i128_vv) define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { %shl = sdiv i128 %lhs, %rhs ret i128 %shl diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll index 2454efaa5e354..47ae95eefea9c 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,CI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global,-unaligned-access-mode < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9,GFX9-ALIGNED %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global,+unaligned-access-mode < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9,GFX9-UNALIGNED %s ; FIXME: We don't get cases where the address was an SGPR because we ; get a copy to the address register for each one. 
@@ -317,7 +318,9 @@ define amdgpu_kernel void @simple_read2_f32_volatile_1(float addrspace(1)* %out) ; CI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN-NOT: ds_read2_b32 +; CI-COUNT-4: ds_read_u8 +; GFX9-ALIGNED-4: ds_read_u8 +; GFX9-UNALIGNED-4: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}} ; GCN: s_endpgm define amdgpu_kernel void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -336,7 +339,9 @@ define amdgpu_kernel void @unaligned_read2_f32(float addrspace(1)* %out, float a ; CI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN-NOT: ds_read2_b32 +; CI-COUNT-2: ds_read_u16 +; GFX9-ALIGNED-2: ds_read_u16 +; GFX9-UNALIGNED-4: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}} ; GCN: s_endpgm define amdgpu_kernel void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -655,6 +660,22 @@ define amdgpu_ps <2 x float> @ds_read_interp_read(i32 inreg %prims, float addrsp ret <2 x float> %r1 } +@v2i32_align1 = internal addrspace(3) global [100 x <2 x i32>] undef, align 1 + +; GCN-LABEL: {{^}}read2_v2i32_align1_odd_offset: +; CI-COUNT-8: ds_read_u8 + +; GFX9-ALIGNED-COUNT-8: ds_read_u8 + +; GFX9-UNALIGNED: v_mov_b32_e32 [[BASE_ADDR:v[0-9]+]], 0x41{{$}} +; GFX9-UNALIGNED: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE_ADDR]] offset1:1{{$}} +define amdgpu_kernel void @read2_v2i32_align1_odd_offset(<2 x i32> addrspace(1)* %out) { +entry: + %load = load <2 x i32>, <2 x i32> addrspace(3)* bitcast (i8 addrspace(3)* getelementptr (i8, i8 addrspace(3)* bitcast ([100 x <2 x i32>] addrspace(3)* @v2i32_align1 to i8 addrspace(3)*), i32 65) to <2 x i32> addrspace(3)*), align 1 + store <2 x i32> %load, <2 x i32> addrspace(1)* %out + ret void +} + declare void @void_func_void() #3 declare i32 @llvm.amdgcn.workgroup.id.x() #1 diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll 
b/llvm/test/CodeGen/AMDGPU/ds_write2.ll index 6b0ce6391ca89..dce2884d77c3d 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,CI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global,-unaligned-access-mode < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9,GFX9-ALIGNED %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global,+unaligned-access-mode < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9,GFX9-UNALIGNED %s @lds = addrspace(3) global [512 x float] undef, align 4 @lds.f64 = addrspace(3) global [512 x double] undef, align 8 @@ -523,6 +524,21 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrs ret void } +@v2i32_align1 = internal addrspace(3) global [100 x <2 x i32>] undef, align 1 + +; GCN-LABEL: {{^}}write2_v2i32_align1_odd_offset: +; CI-COUNT-8: ds_write_b8 + +; GFX9-ALIGNED-COUNT-8: ds_write_b8 + +; GFX9-UNALIGNED: v_mov_b32_e32 [[BASE_ADDR:v[0-9]+]], 0x41{{$}} +; GFX9-UNALIGNED: ds_write2_b32 [[BASE_ADDR]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}} +define amdgpu_kernel void @write2_v2i32_align1_odd_offset() { +entry: + store <2 x i32> , <2 x i32> addrspace(3)* bitcast (i8 addrspace(3)* getelementptr (i8, i8 addrspace(3)* bitcast ([100 x <2 x i32>] addrspace(3)* @v2i32_align1 to i8 addrspace(3)*), i32 65) to <2 x i32> addrspace(3)*), align 1 + ret void +} + declare i32 @llvm.amdgcn.workgroup.id.x() #1 declare i32 @llvm.amdgcn.workgroup.id.y() #1 declare i32 
@llvm.amdgcn.workitem.id.x() #1 diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll index 489a3c7973666..6de55da7f90c8 100644 --- a/llvm/test/CodeGen/AMDGPU/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/fshl.ll @@ -16,13 +16,12 @@ define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 % ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_andn2_b32 s3, 31, s2 -; SI-NEXT: s_lshr_b32 s1, s1, 1 -; SI-NEXT: s_and_b32 s2, s2, 31 -; SI-NEXT: s_lshr_b32 s1, s1, s3 -; SI-NEXT: s_lshl_b32 s0, s0, s2 -; SI-NEXT: s_or_b32 s0, s0, s1 -; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_lshr_b32 s1, s0, 1 +; SI-NEXT: v_alignbit_b32 v0, s0, v0, 1 +; SI-NEXT: s_not_b32 s0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: v_alignbit_b32 v0, s1, v0, v1 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -31,15 +30,14 @@ define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 % ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: s_not_b32 s2, s2 +; VI-NEXT: s_lshr_b32 s1, s0, 1 +; VI-NEXT: v_alignbit_b32 v0, s0, v0, 1 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_alignbit_b32 v2, s1, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_andn2_b32 s3, 31, s2 -; VI-NEXT: s_lshr_b32 s1, s1, 1 -; VI-NEXT: s_and_b32 s2, s2, 31 -; VI-NEXT: s_lshr_b32 s1, s1, s3 -; VI-NEXT: s_lshl_b32 s0, s0, s2 -; VI-NEXT: s_or_b32 s0, s0, s1 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -48,33 +46,28 @@ define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 % ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 
v0, s1 +; GFX9-NEXT: s_not_b32 s2, s2 +; GFX9-NEXT: s_lshr_b32 s1, s0, 1 +; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_alignbit_b32 v2, s1, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: s_andn2_b32 s3, 31, s2 -; GFX9-NEXT: s_lshr_b32 s1, s1, 1 -; GFX9-NEXT: s_and_b32 s2, s2, 31 -; GFX9-NEXT: s_lshr_b32 s1, s1, s3 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_i32: ; R600: ; %bb.0: ; %entry -; R600-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: NOT_INT * T0.W, KC0[3].X, -; R600-NEXT: AND_INT T0.Z, KC0[3].X, literal.x, -; R600-NEXT: AND_INT T0.W, PV.W, literal.x, -; R600-NEXT: LSHR * T1.W, KC0[2].W, 1, -; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; R600-NEXT: LSHR T0.W, PS, PV.W, -; R600-NEXT: LSHL * T1.W, KC0[2].Z, PV.Z, -; R600-NEXT: OR_INT T0.X, PS, PV.W, +; R600-NEXT: LSHR T0.Z, KC0[2].Z, 1, +; R600-NEXT: BIT_ALIGN_INT T0.W, KC0[2].Z, KC0[2].W, 1, +; R600-NEXT: NOT_INT * T1.W, KC0[3].X, +; R600-NEXT: BIT_ALIGN_INT T0.X, PV.Z, PV.W, PS, ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: @@ -147,20 +140,18 @@ define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s9, s9, 1 -; SI-NEXT: s_andn2_b32 s10, 31, s1 -; SI-NEXT: s_and_b32 s1, s1, 31 -; SI-NEXT: s_lshl_b32 s1, s3, s1 -; SI-NEXT: s_andn2_b32 s3, 31, s0 -; SI-NEXT: s_and_b32 s0, s0, 31 -; SI-NEXT: s_lshr_b32 s8, s8, 1 -; SI-NEXT: s_lshr_b32 s9, s9, s10 -; SI-NEXT: s_lshr_b32 s3, s8, s3 -; SI-NEXT: 
s_lshl_b32 s0, s2, s0 -; SI-NEXT: s_or_b32 s1, s1, s9 -; SI-NEXT: s_or_b32 s0, s0, s3 -; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v0, s9 +; SI-NEXT: s_not_b32 s1, s1 +; SI-NEXT: v_alignbit_b32 v0, s3, v0, 1 ; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: s_lshr_b32 s3, s3, 1 +; SI-NEXT: v_alignbit_b32 v1, s3, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: s_not_b32 s0, s0 +; SI-NEXT: v_alignbit_b32 v0, s2, v0, 1 +; SI-NEXT: s_lshr_b32 s1, s2, 1 +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: v_alignbit_b32 v0, s1, v0, v2 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -171,22 +162,20 @@ define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: s_not_b32 s1, s1 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_lshr_b32 s7, s5, 1 +; VI-NEXT: v_alignbit_b32 v0, s5, v0, 1 +; VI-NEXT: v_alignbit_b32 v1, s7, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_not_b32 s0, s0 +; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1 +; VI-NEXT: s_lshr_b32 s1, s4, 1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_alignbit_b32 v0, s1, v0, v2 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: s_lshr_b32 s7, s7, 1 -; VI-NEXT: s_andn2_b32 s8, 31, s1 -; VI-NEXT: s_and_b32 s1, s1, 31 -; VI-NEXT: s_lshl_b32 s1, s5, s1 -; VI-NEXT: s_andn2_b32 s5, 31, s0 -; VI-NEXT: s_and_b32 s0, s0, 31 -; VI-NEXT: s_lshr_b32 s6, s6, 1 -; VI-NEXT: s_lshr_b32 s7, s7, s8 -; VI-NEXT: s_lshr_b32 s5, s6, s5 -; VI-NEXT: s_lshl_b32 s0, s4, s0 -; VI-NEXT: s_or_b32 s1, s1, s7 -; VI-NEXT: s_or_b32 s0, s0, s5 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -197,48 +186,38 @@ define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, ; GFX9-NEXT: s_load_dwordx2 
s[6:7], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: s_not_b32 s1, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_lshr_b32 s7, s5, 1 +; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 1 +; GFX9-NEXT: v_alignbit_b32 v1, s7, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_not_b32 s0, s0 +; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 +; GFX9-NEXT: s_lshr_b32 s1, s4, 1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: s_lshr_b32 s7, s7, 1 -; GFX9-NEXT: s_andn2_b32 s8, 31, s1 -; GFX9-NEXT: s_and_b32 s1, s1, 31 -; GFX9-NEXT: s_lshl_b32 s1, s5, s1 -; GFX9-NEXT: s_andn2_b32 s5, 31, s0 -; GFX9-NEXT: s_and_b32 s0, s0, 31 -; GFX9-NEXT: s_lshr_b32 s6, s6, 1 -; GFX9-NEXT: s_lshr_b32 s7, s7, s8 -; GFX9-NEXT: s_lshr_b32 s5, s6, s5 -; GFX9-NEXT: s_lshl_b32 s0, s4, s0 -; GFX9-NEXT: s_or_b32 s1, s1, s7 -; GFX9-NEXT: s_or_b32 s0, s0, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v2i32: ; R600: ; %bb.0: ; %entry -; R600-NEXT: ALU 17, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[] ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: NOT_INT * T0.W, KC0[4].X, -; R600-NEXT: AND_INT T0.Y, KC0[4].X, literal.x, -; R600-NEXT: AND_INT T0.Z, PV.W, literal.x, -; R600-NEXT: LSHR T0.W, KC0[3].Z, 1, +; R600-NEXT: LSHR T0.Z, KC0[3].X, 1, +; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[3].X, KC0[3].Z, 1, +; R600-NEXT: NOT_INT * T1.W, KC0[4].X, +; R600-NEXT: BIT_ALIGN_INT T0.Y, T0.Z, T0.W, PV.W, +; R600-NEXT: LSHR T0.Z, KC0[2].W, 1, +; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[2].W, KC0[3].Y, 1, ; R600-NEXT: NOT_INT * T1.W, KC0[3].W, -; R600-NEXT: 31(4.344025e-44), 
0(0.000000e+00) -; R600-NEXT: AND_INT T0.X, KC0[3].W, literal.x, -; R600-NEXT: AND_INT T1.Y, PS, literal.x, -; R600-NEXT: LSHR T1.Z, KC0[3].Y, 1, -; R600-NEXT: LSHR T0.W, PV.W, PV.Z, -; R600-NEXT: LSHL * T1.W, KC0[3].X, PV.Y, -; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; R600-NEXT: OR_INT T0.Y, PS, PV.W, -; R600-NEXT: LSHR T0.W, PV.Z, PV.Y, -; R600-NEXT: LSHL * T1.W, KC0[2].W, PV.X, -; R600-NEXT: OR_INT T0.X, PS, PV.W, +; R600-NEXT: BIT_ALIGN_INT T0.X, T0.Z, T0.W, PV.W, ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: @@ -322,34 +301,30 @@ define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s14, s14, 1 -; SI-NEXT: s_andn2_b32 s16, 31, s3 -; SI-NEXT: s_and_b32 s3, s3, 31 -; SI-NEXT: s_lshl_b32 s3, s11, s3 -; SI-NEXT: s_andn2_b32 s11, 31, s2 -; SI-NEXT: s_and_b32 s2, s2, 31 -; SI-NEXT: s_lshl_b32 s2, s10, s2 -; SI-NEXT: s_lshr_b32 s11, s14, s11 -; SI-NEXT: s_or_b32 s2, s2, s11 -; SI-NEXT: s_andn2_b32 s10, 31, s1 -; SI-NEXT: s_and_b32 s1, s1, 31 -; SI-NEXT: s_lshr_b32 s11, s13, 1 -; SI-NEXT: s_lshl_b32 s1, s9, s1 -; SI-NEXT: s_lshr_b32 s10, s11, s10 -; SI-NEXT: s_lshr_b32 s15, s15, 1 -; SI-NEXT: s_or_b32 s1, s1, s10 -; SI-NEXT: s_andn2_b32 s9, 31, s0 -; SI-NEXT: s_and_b32 s0, s0, 31 -; SI-NEXT: s_lshr_b32 s10, s12, 1 -; SI-NEXT: s_lshr_b32 s15, s15, s16 -; SI-NEXT: s_lshr_b32 s9, s10, s9 -; SI-NEXT: s_lshl_b32 s0, s8, s0 -; SI-NEXT: s_or_b32 s3, s3, s15 -; SI-NEXT: s_or_b32 s0, s0, s9 -; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v0, s15 +; SI-NEXT: s_not_b32 s3, s3 +; SI-NEXT: v_alignbit_b32 v0, s11, v0, 1 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: s_lshr_b32 s11, s11, 1 +; SI-NEXT: v_alignbit_b32 v3, s11, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s14 +; SI-NEXT: s_not_b32 s2, s2 +; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: v_alignbit_b32 v0, s10, v0, 1 +; 
SI-NEXT: s_lshr_b32 s3, s10, 1 +; SI-NEXT: v_alignbit_b32 v2, s3, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s13 +; SI-NEXT: s_not_b32 s1, s1 ; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: v_alignbit_b32 v0, s9, v0, 1 +; SI-NEXT: s_lshr_b32 s2, s9, 1 +; SI-NEXT: v_alignbit_b32 v1, s2, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: s_not_b32 s0, s0 +; SI-NEXT: v_alignbit_b32 v0, s8, v0, 1 +; SI-NEXT: s_lshr_b32 s1, s8, 1 +; SI-NEXT: v_mov_b32_e32 v4, s0 +; SI-NEXT: v_alignbit_b32 v0, s1, v0, v4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -360,36 +335,32 @@ define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, ; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s11 +; VI-NEXT: s_not_b32 s3, s3 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_lshr_b32 s11, s7, 1 +; VI-NEXT: v_alignbit_b32 v0, s7, v0, 1 +; VI-NEXT: v_alignbit_b32 v3, s11, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s10 +; VI-NEXT: s_not_b32 s2, s2 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_alignbit_b32 v0, s6, v0, 1 +; VI-NEXT: s_lshr_b32 s3, s6, 1 +; VI-NEXT: v_alignbit_b32 v2, s3, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: s_not_b32 s1, s1 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_alignbit_b32 v0, s5, v0, 1 +; VI-NEXT: s_lshr_b32 s2, s5, 1 +; VI-NEXT: v_alignbit_b32 v1, s2, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_not_b32 s0, s0 +; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1 +; VI-NEXT: s_lshr_b32 s1, s4, 1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_alignbit_b32 v0, s1, v0, v4 ; VI-NEXT: v_mov_b32_e32 v4, s12 ; VI-NEXT: v_mov_b32_e32 v5, s13 -; VI-NEXT: s_lshr_b32 s10, s10, 1 -; VI-NEXT: s_andn2_b32 s14, 31, s3 -; VI-NEXT: s_and_b32 s3, s3, 31 -; VI-NEXT: s_lshl_b32 s3, s7, s3 -; VI-NEXT: s_andn2_b32 s7, 31, s2 -; VI-NEXT: s_and_b32 s2, s2, 31 -; 
VI-NEXT: s_lshl_b32 s2, s6, s2 -; VI-NEXT: s_lshr_b32 s7, s10, s7 -; VI-NEXT: s_or_b32 s2, s2, s7 -; VI-NEXT: s_andn2_b32 s6, 31, s1 -; VI-NEXT: s_and_b32 s1, s1, 31 -; VI-NEXT: s_lshr_b32 s7, s9, 1 -; VI-NEXT: s_lshl_b32 s1, s5, s1 -; VI-NEXT: s_lshr_b32 s6, s7, s6 -; VI-NEXT: s_lshr_b32 s11, s11, 1 -; VI-NEXT: s_or_b32 s1, s1, s6 -; VI-NEXT: s_andn2_b32 s5, 31, s0 -; VI-NEXT: s_and_b32 s0, s0, 31 -; VI-NEXT: s_lshr_b32 s6, s8, 1 -; VI-NEXT: s_lshr_b32 s11, s11, s14 -; VI-NEXT: s_lshr_b32 s5, s6, s5 -; VI-NEXT: s_lshl_b32 s0, s4, s0 -; VI-NEXT: s_or_b32 s3, s3, s11 -; VI-NEXT: s_or_b32 s0, s0, s5 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; @@ -400,81 +371,59 @@ define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s11 +; GFX9-NEXT: s_not_b32 s3, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_lshr_b32 s11, s7, 1 +; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 1 +; GFX9-NEXT: v_alignbit_b32 v3, s11, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-NEXT: s_not_b32 s2, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_alignbit_b32 v0, s6, v0, 1 +; GFX9-NEXT: s_lshr_b32 s3, s6, 1 +; GFX9-NEXT: v_alignbit_b32 v2, s3, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NEXT: s_not_b32 s1, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 1 +; GFX9-NEXT: s_lshr_b32 s2, s5, 1 +; GFX9-NEXT: v_alignbit_b32 v1, s2, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: s_not_b32 s0, s0 +; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 +; GFX9-NEXT: s_lshr_b32 s1, s4, 1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v4, s12 ; GFX9-NEXT: v_mov_b32_e32 
v5, s13 -; GFX9-NEXT: s_lshr_b32 s10, s10, 1 -; GFX9-NEXT: s_andn2_b32 s14, 31, s3 -; GFX9-NEXT: s_and_b32 s3, s3, 31 -; GFX9-NEXT: s_lshl_b32 s3, s7, s3 -; GFX9-NEXT: s_andn2_b32 s7, 31, s2 -; GFX9-NEXT: s_and_b32 s2, s2, 31 -; GFX9-NEXT: s_lshl_b32 s2, s6, s2 -; GFX9-NEXT: s_lshr_b32 s7, s10, s7 -; GFX9-NEXT: s_or_b32 s2, s2, s7 -; GFX9-NEXT: s_andn2_b32 s6, 31, s1 -; GFX9-NEXT: s_and_b32 s1, s1, 31 -; GFX9-NEXT: s_lshr_b32 s7, s9, 1 -; GFX9-NEXT: s_lshl_b32 s1, s5, s1 -; GFX9-NEXT: s_lshr_b32 s6, s7, s6 -; GFX9-NEXT: s_lshr_b32 s11, s11, 1 -; GFX9-NEXT: s_or_b32 s1, s1, s6 -; GFX9-NEXT: s_andn2_b32 s5, 31, s0 -; GFX9-NEXT: s_and_b32 s0, s0, 31 -; GFX9-NEXT: s_lshr_b32 s6, s8, 1 -; GFX9-NEXT: s_lshr_b32 s11, s11, s14 -; GFX9-NEXT: s_lshr_b32 s5, s6, s5 -; GFX9-NEXT: s_lshl_b32 s0, s4, s0 -; GFX9-NEXT: s_or_b32 s3, s3, s11 -; GFX9-NEXT: s_or_b32 s0, s0, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v4i32: ; R600: ; %bb.0: ; %entry -; R600-NEXT: ALU 35, @4, KC0[CB0:0-32], KC1[] -; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1 +; R600-NEXT: ALU 17, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: AND_INT * T0.W, KC0[5].Y, literal.x, -; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; R600-NEXT: LSHR T0.Z, KC0[4].Z, 1, -; R600-NEXT: NOT_INT T1.W, KC0[6].X, -; R600-NEXT: AND_INT * T2.W, KC0[6].X, literal.x, -; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; R600-NEXT: AND_INT T0.X, KC0[5].Z, literal.x, -; R600-NEXT: LSHL T0.Y, KC0[4].X, PS, -; R600-NEXT: NOT_INT T1.Z, KC0[5].W, -; R600-NEXT: AND_INT * T1.W, PV.W, literal.x, -; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; R600-NEXT: LSHR * T2.W, KC0[5].X, 1, -; R600-NEXT: LSHR T1.X, PV.W, T1.W, 
-; R600-NEXT: AND_INT T1.Y, KC0[5].W, literal.x, -; R600-NEXT: AND_INT T1.Z, T1.Z, literal.x, -; R600-NEXT: LSHR T1.W, KC0[4].W, 1, +; R600-NEXT: LSHR T0.Z, KC0[4].X, 1, +; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, 1, +; R600-NEXT: NOT_INT * T1.W, KC0[6].X, +; R600-NEXT: LSHR T0.Y, KC0[3].W, 1, +; R600-NEXT: BIT_ALIGN_INT T1.Z, KC0[3].W, KC0[4].W, 1, +; R600-NEXT: BIT_ALIGN_INT * T0.W, T0.Z, T0.W, PV.W, +; R600-NEXT: NOT_INT * T1.W, KC0[5].W, +; R600-NEXT: LSHR T1.Y, KC0[3].Z, 1, +; R600-NEXT: BIT_ALIGN_INT T0.Z, T0.Y, T1.Z, PV.W, +; R600-NEXT: BIT_ALIGN_INT * T1.W, KC0[3].Z, KC0[4].Z, 1, ; R600-NEXT: NOT_INT * T2.W, KC0[5].Z, -; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; R600-NEXT: AND_INT T2.X, PS, literal.x, -; R600-NEXT: LSHR T2.Y, PV.W, PV.Z, -; R600-NEXT: LSHL T1.Z, KC0[3].W, PV.Y, -; R600-NEXT: NOT_INT T1.W, KC0[5].Y, -; R600-NEXT: OR_INT * T2.W, T0.Y, PV.X, -; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; R600-NEXT: AND_INT T1.X, PV.W, literal.x, -; R600-NEXT: LSHR T0.Y, KC0[4].Y, 1, -; R600-NEXT: OR_INT T2.Z, PV.Z, PV.Y, -; R600-NEXT: LSHR T1.W, T0.Z, PV.X, -; R600-NEXT: LSHL * T3.W, KC0[3].Z, T0.X, -; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; R600-NEXT: OR_INT T2.Y, PS, PV.W, -; R600-NEXT: LSHR T1.W, PV.Y, PV.X, -; R600-NEXT: LSHL * T0.W, KC0[3].Y, T0.W, -; R600-NEXT: OR_INT T2.X, PS, PV.W, -; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, +; R600-NEXT: BIT_ALIGN_INT T0.Y, T1.Y, T1.W, PV.W, +; R600-NEXT: LSHR T1.Z, KC0[3].Y, 1, +; R600-NEXT: BIT_ALIGN_INT * T1.W, KC0[3].Y, KC0[4].Y, 1, +; R600-NEXT: NOT_INT * T2.W, KC0[5].Y, +; R600-NEXT: BIT_ALIGN_INT T0.X, T1.Z, T1.W, PV.W, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %0 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.release.all.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.release.all.ll index 8ba2b0e53698d..9a53b097a6f5f 100644 
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.release.all.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.release.all.ll @@ -1,10 +1,13 @@ -; RUN: not --crash llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - < %s 2>&1 | FileCheck -enable-var-scope -check-prefix=GFX6ERR %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - < %s 2>&1 | FileCheck -enable-var-scope -check-prefix=GFX6ERR-SDAG %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - < %s 2>&1 | FileCheck -enable-var-scope -check-prefix=GFX6ERR-GISEL %s + ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP,GFX8 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP,GFX10 %s -; GFX6ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.ds.gws.sema.release.all +; GFX6ERR-SDAG: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.ds.gws.sema.release.all +; GFX6ERR-GISEL: LLVM ERROR: cannot select: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.ds.gws.sema.release.all), %{{[0-9]+}}:sgpr(s32) :: (store 4 into custom "GWSResource") (in function: gws_sema_release_all_offset0) ; GCN-LABEL: {{^}}gws_sema_release_all_offset0: ; NOLOOP-DAG: s_mov_b32 m0, 0{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll index c64e400e628fe..7c4ce5d6c1fff 100644 --- a/llvm/test/CodeGen/AMDGPU/max.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll @@ -145,14 +145,14 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(<3 x i16> addrspace(1)* %out, < ; 
GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_short_d16 v2, v0, s[6:7] offset:4 ; GFX9-NEXT: global_load_short_d16 v1, v0, s[0:1] offset:4 -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-NEXT: global_load_dword v4, v0, s[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: global_load_dword v3, v0, s[0:1] +; GFX9-NEXT: global_load_short_d16 v2, v0, s[6:7] offset:4 +; GFX9-NEXT: global_load_dword v4, v0, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_pk_max_i16 v1, v2, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_max_i16 v3, v3, v4 +; GFX9-NEXT: v_pk_max_i16 v3, v4, v3 ; GFX9-NEXT: global_store_short v0, v1, s[4:5] offset:4 ; GFX9-NEXT: global_store_dword v0, v3, s[4:5] ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/shrink-instructions-implicit-vcclo.mir b/llvm/test/CodeGen/AMDGPU/shrink-instructions-implicit-vcclo.mir new file mode 100644 index 0000000000000..a04773e206b25 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shrink-instructions-implicit-vcclo.mir @@ -0,0 +1,22 @@ +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=si-shrink-instructions --verify-machineinstrs %s -o - | FileCheck %s + +# Make sure the implicit vcc_lo of V_CNDMASK is preserved and not promoted to vcc. +--- + +name: shrink_cndmask_implicit_vcc_lo +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; CHECK-LABEL: name: shrink_cndmask_implicit_vcc_lo + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: V_CMP_LT_I32_e32 0, [[COPY]], implicit-def $vcc_lo, implicit $exec + ; CHECK: V_CNDMASK_B32_e32 0, [[COPY1]], implicit $vcc_lo, implicit $exec + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr0 + V_CMP_LT_I32_e32 0, %0:vgpr_32, implicit-def $vcc_lo, implicit $exec, implicit-def $vcc + %2:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %1:vgpr_32, $vcc_lo, implicit $exec + S_NOP 0 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll index 74b53802ef5be..e8e3518aed1c2 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -160,16 +160,14 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 { ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s33, 2 ; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s34, 3 -; GCN: s_mov_b32 s34, s32 ; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0xffc0 ; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000 +; GCN: s_mov_b32 s34, s32 +; GCN: v_mov_b32_e32 v32, 0 +; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024 ; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34 ; GCN-NEXT: s_add_u32 s32, s32, 0x30000 -; GCN: v_mov_b32_e32 v33, 0 - -; GCN: buffer_store_dword v33, off, s[0:3], s33 offset:1024 - ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/strict_fadd.f32.ll b/llvm/test/CodeGen/AMDGPU/strict_fadd.f32.ll index e8240d57816c6..a7a6ec3a6fc92 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fadd.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fadd.f32.ll @@ -112,7 +112,7 @@ define float @v_constained_fadd_f32_fpexcept_strict_fneg_fabs_lhs(float %x, floa ; GCN-LABEL: v_constained_fadd_f32_fpexcept_strict_fneg_fabs_lhs: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_add_f32_e64 v0, -|v0|, v1 +; GCN-NEXT: v_sub_f32_e64 v0, v1, |v0| ; GCN-NEXT: s_setpc_b64 s[30:31] %fabs.x = call float @llvm.fabs.f32(float %x) %neg.fabs.x = fneg float %fabs.x diff --git a/llvm/test/CodeGen/AMDGPU/unsupported-image-a16.ll b/llvm/test/CodeGen/AMDGPU/unsupported-image-a16.ll index c028adf5d9385..e8b18be06d81a 100644 --- a/llvm/test/CodeGen/AMDGPU/unsupported-image-a16.ll +++ b/llvm/test/CodeGen/AMDGPU/unsupported-image-a16.ll @@ -1,9 +1,11 @@ -; RUN: not --crash llc 
-march=amdgcn -mcpu=fiji -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck -check-prefix=ERR %s +; RUN: not --crash llc -global-isel=0 -march=amdgcn -mcpu=fiji -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck -check-prefix=SDAG-ERR %s +; RUN: not --crash llc -global-isel=1 -march=amdgcn -mcpu=fiji -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck -check-prefix=GISEL-ERR %s ; Make sure this doesn't assert on targets without the r128-16 ; feature, and instead generates a slection error. -; ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.image.load.1d +; SDAG-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.image.load.1d +; GISEL-ERR: LLVM ERROR: unable to legalize instruction: %{{[0-9]+}}:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, %{{[0-9]+}}:_(s16), %{{[0-9]+}}:_(<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") (in function: load_1d) define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) { main_body: diff --git a/llvm/test/CodeGen/AMDGPU/unsupported-image-g16.ll b/llvm/test/CodeGen/AMDGPU/unsupported-image-g16.ll index 02c8d67c525f9..4971ac7072735 100644 --- a/llvm/test/CodeGen/AMDGPU/unsupported-image-g16.ll +++ b/llvm/test/CodeGen/AMDGPU/unsupported-image-g16.ll @@ -1,10 +1,14 @@ -; RUN: not --crash llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck -check-prefix=ERR %s -; RUN: not --crash llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck -check-prefix=ERR %s +; RUN: not --crash llc -global-isel=0 -march=amdgcn -mcpu=fiji -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck -check-prefix=SDAG-ERR %s +; RUN: not --crash llc -global-isel=0 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck -check-prefix=SDAG-ERR %s + +; RUN: not --crash llc -global-isel=1 -march=amdgcn -mcpu=fiji -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck 
-check-prefix=GISEL-ERR %s +; RUN: not --crash llc -global-isel=1 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck -check-prefix=GISEL-ERR %s ; Make sure this doesn't assert on targets without the g16 feature, and instead ; generates a selection error. -; ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.image.sample.d.1d +; SDAG-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.image.sample.d.1d +; GISEL-ERR: LLVM ERROR: unable to legalize instruction: %{{[0-9]+}}:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, %{{[0-9]+}}:_(s16), %{{[0-9]+}}:_(s16), %{{[0-9]+}}:_(s32), %{{[0-9]+}}:_(<8 x s32>), %{{[0-9]+}}:_(<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") (in function: sample_d_1d) define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { main_body: diff --git a/llvm/test/CodeGen/ARM/2013-05-05-IfConvertBug.ll b/llvm/test/CodeGen/ARM/2013-05-05-IfConvertBug.ll index 162f86306ff4b..8867d662ebc56 100644 --- a/llvm/test/CodeGen/ARM/2013-05-05-IfConvertBug.ll +++ b/llvm/test/CodeGen/ARM/2013-05-05-IfConvertBug.ll @@ -1,7 +1,6 @@ ; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 | FileCheck %s ; RUN: llc < %s -mtriple=thumbv8 | FileCheck -check-prefix=CHECK-V8 %s ; RUN: llc < %s -mtriple=thumbv7 -arm-restrict-it | FileCheck -check-prefix=CHECK-V8 %s -; rdar://13782395 define i32 @t1(i32 %a, i32 %b, i8** %retaddr) { ; CHECK-LABEL: t1: @@ -101,7 +100,6 @@ return: ; preds = %if.end5, %if.then4, ; B can be predicated with A.BrToBPredicate into A iff B.Predicate is less ; "permissive" than A.BrToBPredicate, i.e., iff A.BrToBPredicate subsumes ; B.Predicate. -; ; Hard-coded registers comes from the ABI. 
; CHECK-LABEL: wrapDistance: @@ -109,6 +107,7 @@ return: ; preds = %if.end5, %if.then4, ; CHECK-NEXT: itt le ; CHECK-NEXT: suble r0, r2, #1 ; CHECK-NEXT: bxle lr +; CHECK-NEXT: LBB{{.*}}: ; CHECK-NEXT: subs [[REG:r[0-9]+]], #120 ; CHECK-NEXT: cmp [[REG]], r1 ; CHECK-NOT: it lt diff --git a/llvm/test/CodeGen/ARM/GlobalISel/lit.local.cfg b/llvm/test/CodeGen/ARM/GlobalISel/lit.local.cfg deleted file mode 100644 index e99d1bb8446ce..0000000000000 --- a/llvm/test/CodeGen/ARM/GlobalISel/lit.local.cfg +++ /dev/null @@ -1,2 +0,0 @@ -if not 'global-isel' in config.root.available_features: - config.unsupported = True diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll index f137f715ee420..27cc1d3d6b45d 100644 --- a/llvm/test/CodeGen/ARM/O3-pipeline.ll +++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll @@ -170,6 +170,8 @@ ; CHECK-NEXT: Contiguously Lay Out Funclets ; CHECK-NEXT: StackMap Liveness Analysis ; CHECK-NEXT: Live DEBUG_VALUE analysis +; CHECK-NEXT: Machine Outliner +; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: ARM constant island placement and branch shortening pass ; CHECK-NEXT: MachineDominator Tree Construction diff --git a/llvm/test/CodeGen/ARM/arm-bf16-dotprod-intrinsics.ll b/llvm/test/CodeGen/ARM/arm-bf16-dotprod-intrinsics.ll index da6e1274e50ac..522726c4e8890 100644 --- a/llvm/test/CodeGen/ARM/arm-bf16-dotprod-intrinsics.ll +++ b/llvm/test/CodeGen/ARM/arm-bf16-dotprod-intrinsics.ll @@ -7,10 +7,8 @@ define arm_aapcs_vfpcc <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat> ; CHECK-NEXT: vdot.bf16 d0, d1, d2 ; CHECK-NEXT: bx lr entry: - %0 = bitcast <4 x bfloat> %a to <8 x i8> - %1 = bitcast <4 x bfloat> %b to <8 x i8> - %vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1) - ret <2 x float> %vbfdot1.i + %vbfdot3.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) #3 + 
ret <2 x float> %vbfdot3.i } define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { @@ -19,10 +17,8 @@ define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloa ; CHECK-NEXT: vdot.bf16 q0, q1, q2 ; CHECK-NEXT: bx lr entry: - %0 = bitcast <8 x bfloat> %a to <16 x i8> - %1 = bitcast <8 x bfloat> %b to <16 x i8> - %vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) - ret <4 x float> %vbfdot1.i + %vbfdot3.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) #3 + ret <4 x float> %vbfdot3.i } define <2 x float> @test_vbfdot_lane_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) { @@ -31,12 +27,11 @@ define <2 x float> @test_vbfdot_lane_f32(<2 x float> %r, <4 x bfloat> %a, <4 x b ; CHECK-NEXT: vdot.bf16 d0, d1, d2[0] ; CHECK-NEXT: bx lr entry: - %0 = bitcast <4 x bfloat> %b to <2 x float> - %shuffle = shufflevector <2 x float> %0, <2 x float> undef, <2 x i32> zeroinitializer - %1 = bitcast <4 x bfloat> %a to <8 x i8> - %2 = bitcast <2 x float> %shuffle to <8 x i8> - %vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2) - ret <2 x float> %vbfdot1.i + %.cast = bitcast <4 x bfloat> %b to <2 x float> + %lane = shufflevector <2 x float> %.cast, <2 x float> undef, <2 x i32> zeroinitializer + %.cast1 = bitcast <2 x float> %lane to <4 x bfloat> + %vbfdot3.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %.cast1) #3 + ret <2 x float> %vbfdot3.i } define <4 x float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { @@ -46,12 +41,11 @@ define <4 x float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x ; CHECK-NEXT: vdot.bf16 q0, q1, q8 ; CHECK-NEXT: bx lr entry: - %0 = bitcast <8 x bfloat> %b to <4 x float> - %shuffle = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> 
- %1 = bitcast <8 x bfloat> %a to <16 x i8> - %2 = bitcast <4 x float> %shuffle to <16 x i8> - %vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2) - ret <4 x float> %vbfdot1.i + %.cast = bitcast <8 x bfloat> %b to <4 x float> + %lane = shufflevector <4 x float> %.cast, <4 x float> undef, <4 x i32> + %.cast1 = bitcast <4 x float> %lane to <8 x bfloat> + %vbfdot3.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %.cast1) #3 + ret <4 x float> %vbfdot3.i } define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x bfloat> %b) { @@ -60,12 +54,11 @@ define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x ; CHECK-NEXT: vdot.bf16 d0, d1, d3[1] ; CHECK-NEXT: bx lr entry: - %0 = bitcast <8 x bfloat> %b to <4 x float> - %shuffle = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> - %1 = bitcast <4 x bfloat> %a to <8 x i8> - %2 = bitcast <2 x float> %shuffle to <8 x i8> - %vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2) - ret <2 x float> %vbfdot1.i + %.cast = bitcast <8 x bfloat> %b to <4 x float> + %lane = shufflevector <4 x float> %.cast, <4 x float> undef, <2 x i32> + %.cast1 = bitcast <2 x float> %lane to <4 x bfloat> + %vbfdot3.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %.cast1) #3 + ret <2 x float> %vbfdot3.i } define <4 x float> @test_vbfdotq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) { @@ -75,12 +68,11 @@ define <4 x float> @test_vbfdotq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x ; CHECK-NEXT: vdot.bf16 q0, q1, d4[0] ; CHECK-NEXT: bx lr entry: - %0 = bitcast <4 x bfloat> %b to <2 x float> - %shuffle = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> zeroinitializer - %1 = bitcast <8 x bfloat> %a to <16 x i8> - %2 = bitcast <4 x float> %shuffle to <16 x i8> - %vbfdot1.i = 
call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2) - ret <4 x float> %vbfdot1.i + %.cast = bitcast <4 x bfloat> %b to <2 x float> + %lane = shufflevector <2 x float> %.cast, <2 x float> undef, <4 x i32> zeroinitializer + %.cast1 = bitcast <4 x float> %lane to <8 x bfloat> + %vbfdot3.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %.cast1) #3 + ret <4 x float> %vbfdot3.i } define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { @@ -89,10 +81,8 @@ define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bflo ; CHECK-NEXT: vmmla.bf16 q0, q1, q2 ; CHECK-NEXT: bx lr entry: - %0 = bitcast <8 x bfloat> %a to <16 x i8> - %1 = bitcast <8 x bfloat> %b to <16 x i8> - %vbfmmla1.i = call <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) - ret <4 x float> %vbfmmla1.i + %vbfmmlaq_v3.i = call <4 x float> @llvm.arm.neon.bfmmla(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) + ret <4 x float> %vbfmmlaq_v3.i } define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { @@ -101,10 +91,8 @@ define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfl ; CHECK-NEXT: vfmab.bf16 q0, q1, q2 ; CHECK-NEXT: bx lr entry: - %0 = bitcast <8 x bfloat> %a to <16 x i8> - %1 = bitcast <8 x bfloat> %b to <16 x i8> - %vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) - ret <4 x float> %vbfmlalb1.i + %vbfmlalbq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) + ret <4 x float> %vbfmlalbq_v3.i } define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { @@ -113,10 +101,8 @@ define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfl ; CHECK-NEXT: vfmat.bf16 q0, q1, q2 ; CHECK-NEXT: bx lr entry: - %0 = bitcast 
<8 x bfloat> %a to <16 x i8> - %1 = bitcast <8 x bfloat> %b to <16 x i8> - %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) - ret <4 x float> %vbfmlalt1.i + %vbfmlaltq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) + ret <4 x float> %vbfmlaltq_v3.i } define <4 x float> @test_vbfmlalbq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) { @@ -127,10 +113,8 @@ define <4 x float> @test_vbfmlalbq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 ; CHECK-NEXT: bx lr entry: %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer - %0 = bitcast <8 x bfloat> %a to <16 x i8> - %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> - %vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) - ret <4 x float> %vbfmlalb1.i + %vbfmlalbq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35) + ret <4 x float> %vbfmlalbq_v3.i } define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { @@ -140,10 +124,8 @@ define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 ; CHECK-NEXT: bx lr entry: %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> - %0 = bitcast <8 x bfloat> %a to <16 x i8> - %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> - %vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) - ret <4 x float> %vbfmlalb1.i + %vbfmlalbq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35) + ret <4 x float> %vbfmlalbq_v3.i } define <4 x float> @test_vbfmlaltq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) { @@ -154,10 +136,8 @@ define <4 x float> @test_vbfmlaltq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 ; CHECK-NEXT: bx lr entry: %vecinit35 
= shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer - %0 = bitcast <8 x bfloat> %a to <16 x i8> - %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> - %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) - ret <4 x float> %vbfmlalt1.i + %vbfmlaltq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35) + ret <4 x float> %vbfmlaltq_v3.i } define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { @@ -167,10 +147,8 @@ define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 ; CHECK-NEXT: bx lr entry: %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> - %0 = bitcast <8 x bfloat> %a to <16 x i8> - %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> - %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) - ret <4 x float> %vbfmlalt1.i + %vbfmlaltq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35) + ret <4 x float> %vbfmlaltq_v3.i } define <4 x float> @test_vbfmlaltq_laneq_f32_v2(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { @@ -181,14 +159,12 @@ define <4 x float> @test_vbfmlaltq_laneq_f32_v2(<4 x float> %r, <8 x bfloat> %a, ; CHECK-NEXT: bx lr entry: %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> - %0 = bitcast <8 x bfloat> %a to <16 x i8> - %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> - %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) + %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35) ret <4 x float> %vbfmlalt1.i } -declare <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>) -declare <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x 
i8>) -declare <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) -declare <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) -declare <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) +declare <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float>, <4 x bfloat>, <4 x bfloat>) +declare <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float>, <8 x bfloat>, <8 x bfloat>) +declare <4 x float> @llvm.arm.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>) +declare <4 x float> @llvm.arm.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>) +declare <4 x float> @llvm.arm.neon.bfmlalt(<4 x float>, <8 x bfloat>, <8 x bfloat>) diff --git a/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll b/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll index c8a937edf95c6..a4243276c70a4 100644 --- a/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll +++ b/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll @@ -20,87 +20,88 @@ define fastcc i8* @wrongUseOfPostDominate(i8* readonly %s, i32 %off, i8* readnon ; ENABLE-NEXT: .save {r11, lr} ; ENABLE-NEXT: push {r11, lr} ; ENABLE-NEXT: cmn r1, #1 -; ENABLE-NEXT: ble .LBB0_6 +; ENABLE-NEXT: ble .LBB0_7 ; ENABLE-NEXT: @ %bb.1: @ %while.cond.preheader ; ENABLE-NEXT: cmp r1, #0 -; ENABLE-NEXT: beq .LBB0_5 +; ENABLE-NEXT: beq .LBB0_6 ; ENABLE-NEXT: @ %bb.2: @ %while.cond.preheader ; ENABLE-NEXT: cmp r0, r2 ; ENABLE-NEXT: pophs {r11, pc} +; ENABLE-NEXT: .LBB0_3: @ %while.body.preheader ; ENABLE-NEXT: movw r12, :lower16:skip ; ENABLE-NEXT: sub r1, r1, #1 ; ENABLE-NEXT: movt r12, :upper16:skip -; ENABLE-NEXT: .LBB0_3: @ %while.body +; ENABLE-NEXT: .LBB0_4: @ %while.body ; ENABLE-NEXT: @ =>This Inner Loop Header: Depth=1 ; ENABLE-NEXT: ldrb r3, [r0] ; ENABLE-NEXT: ldrb r3, [r12, r3] ; ENABLE-NEXT: add r0, r0, r3 ; ENABLE-NEXT: sub r3, r1, #1 ; ENABLE-NEXT: cmp r3, r1 -; ENABLE-NEXT: bhs .LBB0_5 -; ENABLE-NEXT: @ %bb.4: @ %while.body -; ENABLE-NEXT: @ in 
Loop: Header=BB0_3 Depth=1 +; ENABLE-NEXT: bhs .LBB0_6 +; ENABLE-NEXT: @ %bb.5: @ %while.body +; ENABLE-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; ENABLE-NEXT: cmp r0, r2 ; ENABLE-NEXT: mov r1, r3 -; ENABLE-NEXT: blo .LBB0_3 -; ENABLE-NEXT: .LBB0_5: @ %if.end29 +; ENABLE-NEXT: blo .LBB0_4 +; ENABLE-NEXT: .LBB0_6: @ %if.end29 ; ENABLE-NEXT: pop {r11, pc} -; ENABLE-NEXT: .LBB0_6: @ %while.cond2.outer +; ENABLE-NEXT: .LBB0_7: @ %while.cond2.outer ; ENABLE-NEXT: @ =>This Loop Header: Depth=1 -; ENABLE-NEXT: @ Child Loop BB0_7 Depth 2 -; ENABLE-NEXT: @ Child Loop BB0_14 Depth 2 +; ENABLE-NEXT: @ Child Loop BB0_8 Depth 2 +; ENABLE-NEXT: @ Child Loop BB0_15 Depth 2 ; ENABLE-NEXT: mov r3, r0 -; ENABLE-NEXT: .LBB0_7: @ %while.cond2 -; ENABLE-NEXT: @ Parent Loop BB0_6 Depth=1 +; ENABLE-NEXT: .LBB0_8: @ %while.cond2 +; ENABLE-NEXT: @ Parent Loop BB0_7 Depth=1 ; ENABLE-NEXT: @ => This Inner Loop Header: Depth=2 ; ENABLE-NEXT: add r1, r1, #1 ; ENABLE-NEXT: cmp r1, #1 -; ENABLE-NEXT: beq .LBB0_17 -; ENABLE-NEXT: @ %bb.8: @ %while.body4 -; ENABLE-NEXT: @ in Loop: Header=BB0_7 Depth=2 +; ENABLE-NEXT: beq .LBB0_18 +; ENABLE-NEXT: @ %bb.9: @ %while.body4 +; ENABLE-NEXT: @ in Loop: Header=BB0_8 Depth=2 ; ENABLE-NEXT: cmp r3, r2 -; ENABLE-NEXT: bls .LBB0_7 -; ENABLE-NEXT: @ %bb.9: @ %if.then7 -; ENABLE-NEXT: @ in Loop: Header=BB0_6 Depth=1 +; ENABLE-NEXT: bls .LBB0_8 +; ENABLE-NEXT: @ %bb.10: @ %if.then7 +; ENABLE-NEXT: @ in Loop: Header=BB0_7 Depth=1 ; ENABLE-NEXT: mov r0, r3 ; ENABLE-NEXT: ldrb r12, [r0, #-1]! 
; ENABLE-NEXT: sxtb lr, r12 ; ENABLE-NEXT: cmn lr, #1 -; ENABLE-NEXT: bgt .LBB0_6 -; ENABLE-NEXT: @ %bb.10: @ %if.then7 -; ENABLE-NEXT: @ in Loop: Header=BB0_6 Depth=1 +; ENABLE-NEXT: bgt .LBB0_7 +; ENABLE-NEXT: @ %bb.11: @ %if.then7 +; ENABLE-NEXT: @ in Loop: Header=BB0_7 Depth=1 ; ENABLE-NEXT: cmp r0, r2 -; ENABLE-NEXT: bls .LBB0_6 -; ENABLE-NEXT: @ %bb.11: @ %land.rhs14.preheader -; ENABLE-NEXT: @ in Loop: Header=BB0_6 Depth=1 -; ENABLE-NEXT: cmn lr, #1 -; ENABLE-NEXT: bgt .LBB0_6 +; ENABLE-NEXT: bls .LBB0_7 ; ENABLE-NEXT: @ %bb.12: @ %land.rhs14.preheader -; ENABLE-NEXT: @ in Loop: Header=BB0_6 Depth=1 +; ENABLE-NEXT: @ in Loop: Header=BB0_7 Depth=1 +; ENABLE-NEXT: cmn lr, #1 +; ENABLE-NEXT: bgt .LBB0_7 +; ENABLE-NEXT: @ %bb.13: @ %land.rhs14.preheader +; ENABLE-NEXT: @ in Loop: Header=BB0_7 Depth=1 ; ENABLE-NEXT: cmp r12, #191 -; ENABLE-NEXT: bhi .LBB0_6 -; ENABLE-NEXT: @ %bb.13: @ %while.body24.preheader -; ENABLE-NEXT: @ in Loop: Header=BB0_6 Depth=1 +; ENABLE-NEXT: bhi .LBB0_7 +; ENABLE-NEXT: @ %bb.14: @ %while.body24.preheader +; ENABLE-NEXT: @ in Loop: Header=BB0_7 Depth=1 ; ENABLE-NEXT: sub r3, r3, #2 -; ENABLE-NEXT: .LBB0_14: @ %while.body24 -; ENABLE-NEXT: @ Parent Loop BB0_6 Depth=1 +; ENABLE-NEXT: .LBB0_15: @ %while.body24 +; ENABLE-NEXT: @ Parent Loop BB0_7 Depth=1 ; ENABLE-NEXT: @ => This Inner Loop Header: Depth=2 ; ENABLE-NEXT: mov r0, r3 ; ENABLE-NEXT: cmp r3, r2 -; ENABLE-NEXT: bls .LBB0_6 -; ENABLE-NEXT: @ %bb.15: @ %while.body24.land.rhs14_crit_edge -; ENABLE-NEXT: @ in Loop: Header=BB0_14 Depth=2 +; ENABLE-NEXT: bls .LBB0_7 +; ENABLE-NEXT: @ %bb.16: @ %while.body24.land.rhs14_crit_edge +; ENABLE-NEXT: @ in Loop: Header=BB0_15 Depth=2 ; ENABLE-NEXT: mov r3, r0 ; ENABLE-NEXT: ldrsb lr, [r3], #-1 ; ENABLE-NEXT: cmn lr, #1 ; ENABLE-NEXT: uxtb r12, lr -; ENABLE-NEXT: bgt .LBB0_6 -; ENABLE-NEXT: @ %bb.16: @ %while.body24.land.rhs14_crit_edge -; ENABLE-NEXT: @ in Loop: Header=BB0_14 Depth=2 +; ENABLE-NEXT: bgt .LBB0_7 +; ENABLE-NEXT: @ %bb.17: @ 
%while.body24.land.rhs14_crit_edge +; ENABLE-NEXT: @ in Loop: Header=BB0_15 Depth=2 ; ENABLE-NEXT: cmp r12, #192 -; ENABLE-NEXT: blo .LBB0_14 -; ENABLE-NEXT: b .LBB0_6 -; ENABLE-NEXT: .LBB0_17: +; ENABLE-NEXT: blo .LBB0_15 +; ENABLE-NEXT: b .LBB0_7 +; ENABLE-NEXT: .LBB0_18: ; ENABLE-NEXT: mov r0, r3 ; ENABLE-NEXT: pop {r11, pc} ; @@ -109,87 +110,88 @@ define fastcc i8* @wrongUseOfPostDominate(i8* readonly %s, i32 %off, i8* readnon ; DISABLE-NEXT: .save {r11, lr} ; DISABLE-NEXT: push {r11, lr} ; DISABLE-NEXT: cmn r1, #1 -; DISABLE-NEXT: ble .LBB0_6 +; DISABLE-NEXT: ble .LBB0_7 ; DISABLE-NEXT: @ %bb.1: @ %while.cond.preheader ; DISABLE-NEXT: cmp r1, #0 -; DISABLE-NEXT: beq .LBB0_5 +; DISABLE-NEXT: beq .LBB0_6 ; DISABLE-NEXT: @ %bb.2: @ %while.cond.preheader ; DISABLE-NEXT: cmp r0, r2 ; DISABLE-NEXT: pophs {r11, pc} +; DISABLE-NEXT: .LBB0_3: @ %while.body.preheader ; DISABLE-NEXT: movw r12, :lower16:skip ; DISABLE-NEXT: sub r1, r1, #1 ; DISABLE-NEXT: movt r12, :upper16:skip -; DISABLE-NEXT: .LBB0_3: @ %while.body +; DISABLE-NEXT: .LBB0_4: @ %while.body ; DISABLE-NEXT: @ =>This Inner Loop Header: Depth=1 ; DISABLE-NEXT: ldrb r3, [r0] ; DISABLE-NEXT: ldrb r3, [r12, r3] ; DISABLE-NEXT: add r0, r0, r3 ; DISABLE-NEXT: sub r3, r1, #1 ; DISABLE-NEXT: cmp r3, r1 -; DISABLE-NEXT: bhs .LBB0_5 -; DISABLE-NEXT: @ %bb.4: @ %while.body -; DISABLE-NEXT: @ in Loop: Header=BB0_3 Depth=1 +; DISABLE-NEXT: bhs .LBB0_6 +; DISABLE-NEXT: @ %bb.5: @ %while.body +; DISABLE-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; DISABLE-NEXT: cmp r0, r2 ; DISABLE-NEXT: mov r1, r3 -; DISABLE-NEXT: blo .LBB0_3 -; DISABLE-NEXT: .LBB0_5: @ %if.end29 +; DISABLE-NEXT: blo .LBB0_4 +; DISABLE-NEXT: .LBB0_6: @ %if.end29 ; DISABLE-NEXT: pop {r11, pc} -; DISABLE-NEXT: .LBB0_6: @ %while.cond2.outer +; DISABLE-NEXT: .LBB0_7: @ %while.cond2.outer ; DISABLE-NEXT: @ =>This Loop Header: Depth=1 -; DISABLE-NEXT: @ Child Loop BB0_7 Depth 2 -; DISABLE-NEXT: @ Child Loop BB0_14 Depth 2 +; DISABLE-NEXT: @ Child Loop BB0_8 Depth 2 
+; DISABLE-NEXT: @ Child Loop BB0_15 Depth 2 ; DISABLE-NEXT: mov r3, r0 -; DISABLE-NEXT: .LBB0_7: @ %while.cond2 -; DISABLE-NEXT: @ Parent Loop BB0_6 Depth=1 +; DISABLE-NEXT: .LBB0_8: @ %while.cond2 +; DISABLE-NEXT: @ Parent Loop BB0_7 Depth=1 ; DISABLE-NEXT: @ => This Inner Loop Header: Depth=2 ; DISABLE-NEXT: add r1, r1, #1 ; DISABLE-NEXT: cmp r1, #1 -; DISABLE-NEXT: beq .LBB0_17 -; DISABLE-NEXT: @ %bb.8: @ %while.body4 -; DISABLE-NEXT: @ in Loop: Header=BB0_7 Depth=2 +; DISABLE-NEXT: beq .LBB0_18 +; DISABLE-NEXT: @ %bb.9: @ %while.body4 +; DISABLE-NEXT: @ in Loop: Header=BB0_8 Depth=2 ; DISABLE-NEXT: cmp r3, r2 -; DISABLE-NEXT: bls .LBB0_7 -; DISABLE-NEXT: @ %bb.9: @ %if.then7 -; DISABLE-NEXT: @ in Loop: Header=BB0_6 Depth=1 +; DISABLE-NEXT: bls .LBB0_8 +; DISABLE-NEXT: @ %bb.10: @ %if.then7 +; DISABLE-NEXT: @ in Loop: Header=BB0_7 Depth=1 ; DISABLE-NEXT: mov r0, r3 ; DISABLE-NEXT: ldrb r12, [r0, #-1]! ; DISABLE-NEXT: sxtb lr, r12 ; DISABLE-NEXT: cmn lr, #1 -; DISABLE-NEXT: bgt .LBB0_6 -; DISABLE-NEXT: @ %bb.10: @ %if.then7 -; DISABLE-NEXT: @ in Loop: Header=BB0_6 Depth=1 +; DISABLE-NEXT: bgt .LBB0_7 +; DISABLE-NEXT: @ %bb.11: @ %if.then7 +; DISABLE-NEXT: @ in Loop: Header=BB0_7 Depth=1 ; DISABLE-NEXT: cmp r0, r2 -; DISABLE-NEXT: bls .LBB0_6 -; DISABLE-NEXT: @ %bb.11: @ %land.rhs14.preheader -; DISABLE-NEXT: @ in Loop: Header=BB0_6 Depth=1 -; DISABLE-NEXT: cmn lr, #1 -; DISABLE-NEXT: bgt .LBB0_6 +; DISABLE-NEXT: bls .LBB0_7 ; DISABLE-NEXT: @ %bb.12: @ %land.rhs14.preheader -; DISABLE-NEXT: @ in Loop: Header=BB0_6 Depth=1 +; DISABLE-NEXT: @ in Loop: Header=BB0_7 Depth=1 +; DISABLE-NEXT: cmn lr, #1 +; DISABLE-NEXT: bgt .LBB0_7 +; DISABLE-NEXT: @ %bb.13: @ %land.rhs14.preheader +; DISABLE-NEXT: @ in Loop: Header=BB0_7 Depth=1 ; DISABLE-NEXT: cmp r12, #191 -; DISABLE-NEXT: bhi .LBB0_6 -; DISABLE-NEXT: @ %bb.13: @ %while.body24.preheader -; DISABLE-NEXT: @ in Loop: Header=BB0_6 Depth=1 +; DISABLE-NEXT: bhi .LBB0_7 +; DISABLE-NEXT: @ %bb.14: @ %while.body24.preheader 
+; DISABLE-NEXT: @ in Loop: Header=BB0_7 Depth=1 ; DISABLE-NEXT: sub r3, r3, #2 -; DISABLE-NEXT: .LBB0_14: @ %while.body24 -; DISABLE-NEXT: @ Parent Loop BB0_6 Depth=1 +; DISABLE-NEXT: .LBB0_15: @ %while.body24 +; DISABLE-NEXT: @ Parent Loop BB0_7 Depth=1 ; DISABLE-NEXT: @ => This Inner Loop Header: Depth=2 ; DISABLE-NEXT: mov r0, r3 ; DISABLE-NEXT: cmp r3, r2 -; DISABLE-NEXT: bls .LBB0_6 -; DISABLE-NEXT: @ %bb.15: @ %while.body24.land.rhs14_crit_edge -; DISABLE-NEXT: @ in Loop: Header=BB0_14 Depth=2 +; DISABLE-NEXT: bls .LBB0_7 +; DISABLE-NEXT: @ %bb.16: @ %while.body24.land.rhs14_crit_edge +; DISABLE-NEXT: @ in Loop: Header=BB0_15 Depth=2 ; DISABLE-NEXT: mov r3, r0 ; DISABLE-NEXT: ldrsb lr, [r3], #-1 ; DISABLE-NEXT: cmn lr, #1 ; DISABLE-NEXT: uxtb r12, lr -; DISABLE-NEXT: bgt .LBB0_6 -; DISABLE-NEXT: @ %bb.16: @ %while.body24.land.rhs14_crit_edge -; DISABLE-NEXT: @ in Loop: Header=BB0_14 Depth=2 +; DISABLE-NEXT: bgt .LBB0_7 +; DISABLE-NEXT: @ %bb.17: @ %while.body24.land.rhs14_crit_edge +; DISABLE-NEXT: @ in Loop: Header=BB0_15 Depth=2 ; DISABLE-NEXT: cmp r12, #192 -; DISABLE-NEXT: blo .LBB0_14 -; DISABLE-NEXT: b .LBB0_6 -; DISABLE-NEXT: .LBB0_17: +; DISABLE-NEXT: blo .LBB0_15 +; DISABLE-NEXT: b .LBB0_7 +; DISABLE-NEXT: .LBB0_18: ; DISABLE-NEXT: mov r0, r3 ; DISABLE-NEXT: pop {r11, pc} entry: diff --git a/llvm/test/CodeGen/ARM/atomic-cmpxchg.ll b/llvm/test/CodeGen/ARM/atomic-cmpxchg.ll index 59fb02654cd3e..0f2c6600e93d5 100644 --- a/llvm/test/CodeGen/ARM/atomic-cmpxchg.ll +++ b/llvm/test/CodeGen/ARM/atomic-cmpxchg.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=arm-linux-gnueabi -asm-verbose=false -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-ARM ; RUN: llc < %s -mtriple=thumb-linux-gnueabi -asm-verbose=false -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-THUMB @@ -8,78 +9,91 @@ ; RUN: llc < %s -mtriple=thumbv7-linux-gnueabi -asm-verbose=false -verify-machineinstrs | 
FileCheck %s -check-prefix=CHECK-THUMBV7 define zeroext i1 @test_cmpxchg_res_i8(i8* %addr, i8 %desired, i8 zeroext %new) { +; CHECK-ARM-LABEL: test_cmpxchg_res_i8: +; CHECK-ARM: .save {r4, lr} +; CHECK-ARM-NEXT: push {r4, lr} +; CHECK-ARM-NEXT: mov r4, r1 +; CHECK-ARM-NEXT: bl __sync_val_compare_and_swap_1 +; CHECK-ARM-NEXT: and r1, r4, #255 +; CHECK-ARM-NEXT: sub r0, r0, r1 +; CHECK-ARM-NEXT: rsbs r1, r0, #0 +; CHECK-ARM-NEXT: adc r0, r0, r1 +; CHECK-ARM-NEXT: pop {r4, lr} +; CHECK-ARM-NEXT: mov pc, lr +; +; CHECK-THUMB-LABEL: test_cmpxchg_res_i8: +; CHECK-THUMB: .save {r4, lr} +; CHECK-THUMB-NEXT: push {r4, lr} +; CHECK-THUMB-NEXT: movs r4, r1 +; CHECK-THUMB-NEXT: bl __sync_val_compare_and_swap_1 +; CHECK-THUMB-NEXT: movs r1, #255 +; CHECK-THUMB-NEXT: ands r1, r4 +; CHECK-THUMB-NEXT: subs r1, r0, r1 +; CHECK-THUMB-NEXT: rsbs r0, r1, #0 +; CHECK-THUMB-NEXT: adcs r0, r1 +; CHECK-THUMB-NEXT: pop {r4} +; CHECK-THUMB-NEXT: pop {r1} +; CHECK-THUMB-NEXT: bx r1 +; +; CHECK-ARMV6-LABEL: test_cmpxchg_res_i8: +; CHECK-ARMV6: uxtb r1, r1 +; CHECK-ARMV6-NEXT: .LBB0_1: +; CHECK-ARMV6-NEXT: ldrexb r3, [r0] +; CHECK-ARMV6-NEXT: cmp r3, r1 +; CHECK-ARMV6-NEXT: movne r0, #0 +; CHECK-ARMV6-NEXT: bxne lr +; CHECK-ARMV6-NEXT: .LBB0_2: +; CHECK-ARMV6-NEXT: strexb r3, r2, [r0] +; CHECK-ARMV6-NEXT: cmp r3, #0 +; CHECK-ARMV6-NEXT: moveq r0, #1 +; CHECK-ARMV6-NEXT: bxeq lr +; CHECK-ARMV6-NEXT: b .LBB0_1 +; +; CHECK-THUMBV6-LABEL: test_cmpxchg_res_i8: +; CHECK-THUMBV6: .save {r4, lr} +; CHECK-THUMBV6-NEXT: push {r4, lr} +; CHECK-THUMBV6-NEXT: mov r4, r1 +; CHECK-THUMBV6-NEXT: bl __sync_val_compare_and_swap_1 +; CHECK-THUMBV6-NEXT: uxtb r1, r4 +; CHECK-THUMBV6-NEXT: subs r1, r0, r1 +; CHECK-THUMBV6-NEXT: rsbs r0, r1, #0 +; CHECK-THUMBV6-NEXT: adcs r0, r1 +; CHECK-THUMBV6-NEXT: pop {r4, pc} +; +; CHECK-ARMV7-LABEL: test_cmpxchg_res_i8: +; CHECK-ARMV7: uxtb r1, r1 +; CHECK-ARMV7-NEXT: .LBB0_1: +; CHECK-ARMV7-NEXT: ldrexb r3, [r0] +; CHECK-ARMV7-NEXT: cmp r3, r1 +; CHECK-ARMV7-NEXT: bne 
.LBB0_3 +; CHECK-ARMV7-NEXT: strexb r3, r2, [r0] +; CHECK-ARMV7-NEXT: cmp r3, #0 +; CHECK-ARMV7-NEXT: moveq r0, #1 +; CHECK-ARMV7-NEXT: bxeq lr +; CHECK-ARMV7-NEXT: b .LBB0_1 +; CHECK-ARMV7-NEXT: .LBB0_3: +; CHECK-ARMV7-NEXT: mov r0, #0 +; CHECK-ARMV7-NEXT: clrex +; CHECK-ARMV7-NEXT: bx lr +; +; CHECK-THUMBV7-LABEL: test_cmpxchg_res_i8: +; CHECK-THUMBV7: uxtb r1, r1 +; CHECK-THUMBV7-NEXT: .LBB0_1: +; CHECK-THUMBV7-NEXT: ldrexb r3, [r0] +; CHECK-THUMBV7-NEXT: cmp r3, r1 +; CHECK-THUMBV7-NEXT: bne .LBB0_3 +; CHECK-THUMBV7-NEXT: strexb r3, r2, [r0] +; CHECK-THUMBV7-NEXT: cmp r3, #0 +; CHECK-THUMBV7-NEXT: itt eq +; CHECK-THUMBV7-NEXT: moveq r0, #1 +; CHECK-THUMBV7-NEXT: bxeq lr +; CHECK-THUMBV7-NEXT: b .LBB0_1 +; CHECK-THUMBV7-NEXT: .LBB0_3: +; CHECK-THUMBV7-NEXT: movs r0, #0 +; CHECK-THUMBV7-NEXT: clrex +; CHECK-THUMBV7-NEXT: bx lr entry: %0 = cmpxchg i8* %addr, i8 %desired, i8 %new monotonic monotonic %1 = extractvalue { i8, i1 } %0, 1 ret i1 %1 } - -; CHECK-ARM-LABEL: test_cmpxchg_res_i8 -; CHECK-ARM: bl __sync_val_compare_and_swap_1 -; CHECK-ARM: sub r0, r0, {{r[0-9]+}} -; CHECK-ARM: rsbs [[REG:r[0-9]+]], r0, #0 -; CHECK-ARM: adc r0, r0, [[REG]] - -; CHECK-THUMB-LABEL: test_cmpxchg_res_i8 -; CHECK-THUMB: bl __sync_val_compare_and_swap_1 -; CHECK-THUMB-NOT: mov [[R1:r[0-7]]], r0 -; CHECK-THUMB: subs [[R1:r[0-7]]], r0, {{r[0-9]+}} -; CHECK-THUMB: rsbs r0, [[R1]], #0 -; CHECK-THUMB: adcs r0, [[R1]] - -; CHECK-ARMV6-LABEL: test_cmpxchg_res_i8: -; CHECK-ARMV6-NEXT: .fnstart -; CHECK-ARMV6-NEXT: uxtb [[DESIRED:r[0-9]+]], r1 -; CHECK-ARMV6-NEXT: [[TRY:.LBB[0-9_]+]]: -; CHECK-ARMV6-NEXT: ldrexb [[LD:r[0-9]+]], [r0] -; CHECK-ARMV6-NEXT: cmp [[LD]], [[DESIRED]] -; CHECK-ARMV6-NEXT: movne [[RES:r[0-9]+]], #0 -; CHECK-ARMV6-NEXT: bxne lr -; CHECK-ARMV6-NEXT: strexb [[SUCCESS:r[0-9]+]], r2, [r0] -; CHECK-ARMV6-NEXT: cmp [[SUCCESS]], #0 -; CHECK-ARMV6-NEXT: moveq [[RES]], #1 -; CHECK-ARMV6-NEXT: bxeq lr -; CHECK-ARMV6-NEXT: b [[TRY]] - -; CHECK-THUMBV6-LABEL: 
test_cmpxchg_res_i8: -; CHECK-THUMBV6: mov [[EXPECTED:r[0-9]+]], r1 -; CHECK-THUMBV6-NEXT: bl __sync_val_compare_and_swap_1 -; CHECK-THUMBV6-NEXT: uxtb r1, r4 -; CHECK-THUMBV6-NEXT: subs [[R1:r[0-7]]], r0, {{r[0-9]+}} -; CHECK-THUMBV6-NEXT: rsbs r0, [[R1]], #0 -; CHECK-THUMBV6-NEXT: adcs r0, [[R1]] - -; CHECK-ARMV7-LABEL: test_cmpxchg_res_i8: -; CHECK-ARMV7-NEXT: .fnstart -; CHECK-ARMV7-NEXT: uxtb [[DESIRED:r[0-9]+]], r1 -; CHECK-ARMV7-NEXT: [[TRY:.LBB[0-9_]+]]: -; CHECK-ARMV7-NEXT: ldrexb [[SUCCESS:r[0-9]+]], [r0] -; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], r1 -; CHECK-ARMV7-NEXT: bne [[EXIT:.LBB[0-9_]+]] -; CHECK-ARMV7-NEXT: strexb [[SUCCESS]], r2, [r0] -; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], #0 -; CHECK-ARMV7-NEXT: moveq r0, #1 -; CHECK-ARMV7-NEXT: bxeq lr -; CHECK-ARMV7-NEXT: b [[TRY]] -; CHECK-ARMV7-NEXT: [[EXIT]]: -; CHECK-ARMV7-NEXT: mov r0, #0 -; CHECK-ARMV7-NEXT: clrex -; CHECK-ARMV7-NEXT: bx lr - -; CHECK-THUMBV7-LABEL: test_cmpxchg_res_i8: -; CHECK-THUMBV7-NEXT: .fnstart -; CHECK-THUMBV7-NEXT: uxtb [[DESIRED:r[0-9]+]], r1 -; CHECK-THUMBV7-NEXT: [[TRYLD:.LBB[0-9_]+]] -; CHECK-THUMBV7-NEXT: ldrexb [[LD:r[0-9]+]], [r0] -; CHECK-THUMBV7-NEXT: cmp [[LD]], [[DESIRED]] -; CHECK-THUMBV7-NEXT: bne [[EXIT:.LBB[0-9_]+]] -; CHECK-THUMBV7-NEXT: strexb [[SUCCESS:r[0-9]+]], r2, [r0] -; CHECK-THUMBV7-NEXT: cmp [[SUCCESS]], #0 -; CHECK-THUMBV7-NEXT: itt eq -; CHECK-THUMBV7-NEXT: moveq r0, #1 -; CHECK-THUMBV7-NEXT: bxeq lr -; CHECK-THUMBV7-NEXT: b [[TRYLD]] -; CHECK-THUMBV7-NEXT: [[EXIT]]: -; CHECK-THUMBV7-NEXT: movs r0, #0 -; CHECK-THUMBV7-NEXT: clrex -; CHECK-THUMBV7-NEXT: bx lr diff --git a/llvm/test/CodeGen/ARM/call-tc.ll b/llvm/test/CodeGen/ARM/call-tc.ll index 4256cc879e747..3ebaa7b025c77 100644 --- a/llvm/test/CodeGen/ARM/call-tc.ll +++ b/llvm/test/CodeGen/ARM/call-tc.ll @@ -85,6 +85,7 @@ entry: ; CHECKT2D-LABEL: t7: ; CHECKT2D: it ne ; CHECKT2D-NEXT: bne.w _foo +; CHECKT2D-NEXT: LBB{{.*}}: ; CHECKT2D-NEXT: push ; CHECKT2D-NEXT: mov r7, sp ; CHECKT2D-NEXT: bl _foo diff 
--git a/llvm/test/CodeGen/ARM/cmp-bool.ll b/llvm/test/CodeGen/ARM/cmp-bool.ll index 18ef348b9edac..9d83ee8c81a44 100644 --- a/llvm/test/CodeGen/ARM/cmp-bool.ll +++ b/llvm/test/CodeGen/ARM/cmp-bool.ll @@ -8,6 +8,7 @@ define void @bool_eq(i1 zeroext %a, i1 zeroext %b, void ()* nocapture %c) nounwi ; ARM: @ %bb.0: @ %entry ; ARM-NEXT: cmp r0, r1 ; ARM-NEXT: bxne lr +; ARM-NEXT: .LBB0_1: @ %if.then ; ARM-NEXT: bx r2 ; ; THUMB-LABEL: bool_eq: @@ -25,6 +26,7 @@ define void @bool_eq(i1 zeroext %a, i1 zeroext %b, void ()* nocapture %c) nounwi ; THUMB2-NEXT: cmp r0, r1 ; THUMB2-NEXT: it ne ; THUMB2-NEXT: bxne lr +; THUMB2-NEXT: .LBB0_1: @ %if.then ; THUMB2-NEXT: bx r2 entry: %0 = xor i1 %a, %b @@ -43,6 +45,7 @@ define void @bool_ne(i1 zeroext %a, i1 zeroext %b, void ()* nocapture %c) nounwi ; ARM: @ %bb.0: @ %entry ; ARM-NEXT: cmp r0, r1 ; ARM-NEXT: bxeq lr +; ARM-NEXT: .LBB1_1: @ %if.then ; ARM-NEXT: bx r2 ; ; THUMB-LABEL: bool_ne: @@ -60,6 +63,7 @@ define void @bool_ne(i1 zeroext %a, i1 zeroext %b, void ()* nocapture %c) nounwi ; THUMB2-NEXT: cmp r0, r1 ; THUMB2-NEXT: it eq ; THUMB2-NEXT: bxeq lr +; THUMB2-NEXT: .LBB1_1: @ %if.then ; THUMB2-NEXT: bx r2 entry: %cmp = xor i1 %a, %b diff --git a/llvm/test/CodeGen/ARM/cmpxchg-weak.ll b/llvm/test/CodeGen/ARM/cmpxchg-weak.ll index 5ee07828526c5..78800fc8bc423 100644 --- a/llvm/test/CodeGen/ARM/cmpxchg-weak.ll +++ b/llvm/test/CodeGen/ARM/cmpxchg-weak.ll @@ -2,9 +2,6 @@ define void @test_cmpxchg_weak(i32 *%addr, i32 %desired, i32 %new) { ; CHECK-LABEL: test_cmpxchg_weak: - - %pair = cmpxchg weak i32* %addr, i32 %desired, i32 %new seq_cst monotonic - %oldval = extractvalue { i32, i1 } %pair, 0 ; CHECK-NEXT: %bb.0: ; CHECK-NEXT: ldrex [[LOADED:r[0-9]+]], [r0] ; CHECK-NEXT: cmp [[LOADED]], r1 @@ -25,18 +22,15 @@ define void @test_cmpxchg_weak(i32 *%addr, i32 %desired, i32 %new) { ; CHECK-NEXT: dmb ish ; CHECK-NEXT: str r3, [r0] ; CHECK-NEXT: bx lr - +; + %pair = cmpxchg weak i32* %addr, i32 %desired, i32 %new seq_cst monotonic + 
%oldval = extractvalue { i32, i1 } %pair, 0 store i32 %oldval, i32* %addr ret void } - define i1 @test_cmpxchg_weak_to_bool(i32, i32 *%addr, i32 %desired, i32 %new) { ; CHECK-LABEL: test_cmpxchg_weak_to_bool: - - %pair = cmpxchg weak i32* %addr, i32 %desired, i32 %new seq_cst monotonic - %success = extractvalue { i32, i1 } %pair, 1 - ; CHECK-NEXT: %bb.0: ; CHECK-NEXT: ldrex [[LOADED:r[0-9]+]], [r1] ; CHECK-NEXT: cmp [[LOADED]], r2 @@ -47,6 +41,7 @@ define i1 @test_cmpxchg_weak_to_bool(i32, i32 *%addr, i32 %desired, i32 %new) { ; CHECK-NEXT: strex [[SUCCESS:r[0-9]+]], r3, [r1] ; CHECK-NEXT: cmp [[SUCCESS]], #0 ; CHECK-NEXT: bxne lr +; CHECK-NEXT: LBB1_2: ; CHECK-NEXT: mov r0, #1 ; CHECK-NEXT: dmb ish ; CHECK-NEXT: bx lr @@ -54,6 +49,8 @@ define i1 @test_cmpxchg_weak_to_bool(i32, i32 *%addr, i32 %desired, i32 %new) { ; CHECK-NEXT: mov r0, #0 ; CHECK-NEXT: clrex ; CHECK-NEXT: bx lr - +; + %pair = cmpxchg weak i32* %addr, i32 %desired, i32 %new seq_cst monotonic + %success = extractvalue { i32, i1 } %pair, 1 ret i1 %success } diff --git a/llvm/test/CodeGen/ARM/code-placement.ll b/llvm/test/CodeGen/ARM/code-placement.ll index e0db88aa6703d..7755ff53512ef 100644 --- a/llvm/test/CodeGen/ARM/code-placement.ll +++ b/llvm/test/CodeGen/ARM/code-placement.ll @@ -1,6 +1,5 @@ ; RUN: llc < %s -mtriple=armv7-apple-darwin | FileCheck %s ; PHI elimination shouldn't break backedge. 
-; rdar://8263994 %struct.list_data_s = type { i16, i16 } %struct.list_head = type { %struct.list_head*, %struct.list_data_s* } @@ -12,6 +11,7 @@ entry: br i1 %0, label %bb2, label %bb bb: +; CHECK: LBB0_1: ; CHECK: LBB0_[[LABEL:[0-9]]]: ; CHECK: bne LBB0_[[LABEL]] ; CHECK-NOT: b LBB0_[[LABEL]] @@ -30,7 +30,6 @@ bb2: } ; Optimize loop entry, eliminate intra loop branches -; rdar://8117827 define i32 @t2(i32 %passes, i32* nocapture %src, i32 %size) nounwind readonly { entry: ; CHECK-LABEL: t2: diff --git a/llvm/test/CodeGen/ARM/codesize-ifcvt.mir b/llvm/test/CodeGen/ARM/codesize-ifcvt.mir index 76b4b1dd65bf7..639d81921062f 100644 --- a/llvm/test/CodeGen/ARM/codesize-ifcvt.mir +++ b/llvm/test/CodeGen/ARM/codesize-ifcvt.mir @@ -158,31 +158,37 @@ machineFunctionInfo: {} body: | ; CHECK-V7-LABEL: name: test_nosize ; CHECK-V7: bb.0 (%ir-block.0): - ; CHECK-V7: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-V7: successors: %bb.1(0x80000000) ; CHECK-V7: liveins: $lr, $r7 ; CHECK-V7: renamable $r0 = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg ; CHECK-V7: t2CMPri killed renamable $r0, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK-V7: tTAILJMPdND @extfunc, 1 /* CC::ne */, killed $cpsr, implicit $sp, implicit $sp + ; CHECK-V7: bb.1.b1: + ; CHECK-V7: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-V7: liveins: $r7, $lr ; CHECK-V7: $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r7, killed $lr ; CHECK-V7: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK-V7: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK-V7: frame-setup CFI_INSTRUCTION offset $r7, -8 ; CHECK-V7: renamable $r0 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg ; CHECK-V7: t2CMPri killed renamable $r0, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr - ; CHECK-V7: t2Bcc %bb.2, 1 /* CC::ne */, killed $cpsr - ; CHECK-V7: bb.1.b2: - ; CHECK-V7: successors: %bb.3(0x80000000) + ; CHECK-V7: t2Bcc %bb.3, 1 /* CC::ne */, killed $cpsr + ; CHECK-V7: bb.2.b2: + ; CHECK-V7: 
successors: %bb.4(0x80000000) ; CHECK-V7: tBL 14 /* CC::al */, $noreg, @extfunc, csr_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $r0 - ; CHECK-V7: t2B %bb.3, 14 /* CC::al */, $noreg - ; CHECK-V7: bb.2.b3: - ; CHECK-V7: successors: %bb.3(0x80000000) + ; CHECK-V7: t2B %bb.4, 14 /* CC::al */, $noreg + ; CHECK-V7: bb.3.b3: + ; CHECK-V7: successors: %bb.4(0x80000000) ; CHECK-V7: renamable $r0 = t2LDRi12 undef renamable $r0, 0, 14 /* CC::al */, $noreg :: (load 4 from `i32* undef`) ; CHECK-V7: renamable $r0 = t2ANDri killed renamable $r0, 256, 14 /* CC::al */, $noreg, $noreg - ; CHECK-V7: bb.3.b5: + ; CHECK-V7: bb.4.b5: + ; CHECK-V7: successors: %bb.5(0x50000000) ; CHECK-V7: liveins: $r0 ; CHECK-V7: t2CMPri killed renamable $r0, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK-V7: $sp = t2LDMIA_UPD $sp, 14 /* CC::al */, $noreg, def $r7, def $lr ; CHECK-V7: tBX_RET 0 /* CC::eq */, killed $cpsr + ; CHECK-V7: bb.5.b7: + ; CHECK-V7: liveins: $lr, $r7 ; CHECK-V7: tTAILJMPdND @extfunc, 14 /* CC::al */, $noreg, implicit $sp, implicit $sp ; CHECK-V8-LABEL: name: test_nosize ; CHECK-V8: bb.0 (%ir-block.0): diff --git a/llvm/test/CodeGen/ARM/constant-islands-split-IT.mir b/llvm/test/CodeGen/ARM/constant-islands-split-IT.mir index bbeae30ffece9..4ebcf77b9e66c 100644 --- a/llvm/test/CodeGen/ARM/constant-islands-split-IT.mir +++ b/llvm/test/CodeGen/ARM/constant-islands-split-IT.mir @@ -99,12 +99,10 @@ body: | ; CHECK: successors: ; CHECK: CONSTPOOL_ENTRY 7, %const.1, 8 ; CHECK: bb.7 (align 2): - ; CHECK: successors: ; CHECK: liveins: $r0, $cpsr, $d0, $s0, $s1, $d1, $s2, $s3, $d2, $s4, $s5 ; CHECK: t2IT 0, 4, implicit-def $itstate ; CHECK: $sp = tMOVr $r0, 0 /* CC::eq */, $cpsr, implicit $itstate ; CHECK: $sp = t2LDMIA_RET $sp, 0 /* CC::eq */, killed $cpsr, def $r4, def $r5, def $r6, def $r7, def $r8, def $r9, def $r10, def $r11, def $pc, implicit killed $d0, implicit killed $d1, implicit killed $d2, implicit $sp, implicit killed $itstate - ; CHECK: 
tBL 14 /* CC::al */, $noreg, &__stack_chk_fail, csr_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp ; CHECK: bb.8 (align 8): ; CHECK: successors: ; CHECK: CONSTPOOL_ENTRY 6, %const.0, 8 @@ -138,7 +136,6 @@ body: | t2IT 0, 4, implicit-def $itstate $sp = tMOVr $r0, 0, $cpsr, implicit $itstate $sp = t2LDMIA_RET $sp, 0, killed $cpsr, def $r4, def $r5, def $r6, def $r7, def $r8, def $r9, def $r10, def $r11, def $pc, implicit killed $d0, implicit killed $d1, implicit killed $d2, implicit $sp, implicit killed $itstate - tBL 14, $noreg, &__stack_chk_fail, csr_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp bb.3: successors: %bb.3(0x80000000) diff --git a/llvm/test/CodeGen/ARM/csr-split.ll b/llvm/test/CodeGen/ARM/csr-split.ll index f9246cb6df280..199e9a8eed715 100644 --- a/llvm/test/CodeGen/ARM/csr-split.ll +++ b/llvm/test/CodeGen/ARM/csr-split.ll @@ -15,12 +15,13 @@ define dso_local signext i32 @test1(i32* %b) local_unnamed_addr { ; CHECK-NEXT: cmp r0, r4 ; CHECK-NEXT: popne {r4, lr} ; CHECK-NEXT: movne pc, lr +; CHECK-NEXT: .LBB0_1: @ %if.then ; CHECK-NEXT: bl callVoid ; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: pop {r4, lr} ; CHECK-NEXT: b callNonVoid ; CHECK-NEXT: .p2align 2 -; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: @ %bb.2: ; CHECK-NEXT: .LCPI0_0: ; CHECK-NEXT: .long a entry: diff --git a/llvm/test/CodeGen/ARM/fp16-args.ll b/llvm/test/CodeGen/ARM/fp16-args.ll index 7ed1e883eef19..18bbcd12c768a 100644 --- a/llvm/test/CodeGen/ARM/fp16-args.ll +++ b/llvm/test/CodeGen/ARM/fp16-args.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=armv7a--none-eabi -float-abi soft -mattr=+fp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=SOFT -; RUN: llc -mtriple=armv7a--none-eabi -float-abi hard -mattr=+fp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=HARD -; RUN: llc -mtriple=armv7a--none-eabi -float-abi soft -mattr=+fullfp16 < %s | FileCheck %s --check-prefix=CHECK 
--check-prefix=FULL-SOFT -; RUN: llc -mtriple=armv7a--none-eabi -float-abi hard -mattr=+fullfp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=FULL-HARD -; RUN: llc -mtriple=armv7aeb--none-eabi -float-abi soft -mattr=+fp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=SOFT -; RUN: llc -mtriple=armv7aeb--none-eabi -float-abi hard -mattr=+fp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=HARD -; RUN: llc -mtriple=armv7aeb--none-eabi -float-abi soft -mattr=+fullfp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=FULL-SOFT -; RUN: llc -mtriple=armv7aeb--none-eabi -float-abi hard -mattr=+fullfp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=FULL-HARD +; RUN: llc -mtriple=armv7a--none-eabi -float-abi soft -mattr=+fp16 < %s | FileCheck %s --check-prefix=SOFT +; RUN: llc -mtriple=armv7a--none-eabi -float-abi hard -mattr=+fp16 < %s | FileCheck %s --check-prefix=HARD +; RUN: llc -mtriple=armv7a--none-eabi -float-abi soft -mattr=+fullfp16 < %s | FileCheck %s --check-prefix=FULL-SOFT --check-prefix=FULL-SOFT-LE +; RUN: llc -mtriple=armv7a--none-eabi -float-abi hard -mattr=+fullfp16 < %s | FileCheck %s --check-prefix=FULL-HARD --check-prefix=FULL-HARD-LE +; RUN: llc -mtriple=armv7aeb--none-eabi -float-abi soft -mattr=+fp16 < %s | FileCheck %s --check-prefix=SOFT +; RUN: llc -mtriple=armv7aeb--none-eabi -float-abi hard -mattr=+fp16 < %s | FileCheck %s --check-prefix=HARD +; RUN: llc -mtriple=armv7aeb--none-eabi -float-abi soft -mattr=+fullfp16 < %s | FileCheck %s --check-prefix=FULL-SOFT --check-prefix=FULL-SOFT-BE +; RUN: llc -mtriple=armv7aeb--none-eabi -float-abi hard -mattr=+fullfp16 < %s | FileCheck %s --check-prefix=FULL-HARD --check-prefix=FULL-HARD-BE define half @foo(half %a, half %b) { ; SOFT-LABEL: foo: @@ -44,3 +44,76 @@ entry: %0 = fadd half %a, %b ret half %0 } + +define <4 x half> @foo_vec(<4 x half> %a) { +; SOFT-LABEL: foo_vec: +; SOFT: @ %bb.0: @ %entry +; SOFT-NEXT: vmov s0, r3 +; SOFT-NEXT: vmov s2, r1 +; 
SOFT-NEXT: vcvtb.f32.f16 s0, s0 +; SOFT-NEXT: vmov s4, r0 +; SOFT-NEXT: vcvtb.f32.f16 s2, s2 +; SOFT-NEXT: vmov s6, r2 +; SOFT-NEXT: vcvtb.f32.f16 s4, s4 +; SOFT-NEXT: vcvtb.f32.f16 s6, s6 +; SOFT-NEXT: vadd.f32 s0, s0, s0 +; SOFT-NEXT: vadd.f32 s2, s2, s2 +; SOFT-NEXT: vcvtb.f16.f32 s0, s0 +; SOFT-NEXT: vadd.f32 s4, s4, s4 +; SOFT-NEXT: vcvtb.f16.f32 s2, s2 +; SOFT-NEXT: vadd.f32 s6, s6, s6 +; SOFT-NEXT: vcvtb.f16.f32 s4, s4 +; SOFT-NEXT: vcvtb.f16.f32 s6, s6 +; SOFT-NEXT: vmov r0, s4 +; SOFT-NEXT: vmov r1, s2 +; SOFT-NEXT: vmov r2, s6 +; SOFT-NEXT: vmov r3, s0 +; SOFT-NEXT: bx lr +; +; HARD-LABEL: foo_vec: +; HARD: @ %bb.0: @ %entry +; HARD-NEXT: vcvtb.f32.f16 s4, s3 +; HARD-NEXT: vcvtb.f32.f16 s2, s2 +; HARD-NEXT: vcvtb.f32.f16 s6, s1 +; HARD-NEXT: vcvtb.f32.f16 s0, s0 +; HARD-NEXT: vadd.f32 s2, s2, s2 +; HARD-NEXT: vadd.f32 s0, s0, s0 +; HARD-NEXT: vcvtb.f16.f32 s2, s2 +; HARD-NEXT: vadd.f32 s4, s4, s4 +; HARD-NEXT: vcvtb.f16.f32 s0, s0 +; HARD-NEXT: vadd.f32 s6, s6, s6 +; HARD-NEXT: vcvtb.f16.f32 s3, s4 +; HARD-NEXT: vcvtb.f16.f32 s1, s6 +; HARD-NEXT: bx lr +; +; FULL-SOFT-LE-LABEL: foo_vec: +; FULL-SOFT-LE: @ %bb.0: @ %entry +; FULL-SOFT-LE-NEXT: vmov d16, r0, r1 +; FULL-SOFT-LE-NEXT: vadd.f16 d16, d16, d16 +; FULL-SOFT-LE-NEXT: vmov r0, r1, d16 +; FULL-SOFT-LE-NEXT: bx lr +; +; FULL-HARD-LE-LABEL: foo_vec: +; FULL-HARD-LE: @ %bb.0: @ %entry +; FULL-HARD-LE-NEXT: vadd.f16 d0, d0, d0 +; FULL-HARD-LE-NEXT: bx lr +; +; FULL-SOFT-BE-LABEL: foo_vec: +; FULL-SOFT-BE: @ %bb.0: @ %entry +; FULL-SOFT-BE-NEXT: vmov d16, r1, r0 +; FULL-SOFT-BE-NEXT: vrev64.16 d16, d16 +; FULL-SOFT-BE-NEXT: vadd.f16 d16, d16, d16 +; FULL-SOFT-BE-NEXT: vrev64.16 d16, d16 +; FULL-SOFT-BE-NEXT: vmov r1, r0, d16 +; FULL-SOFT-BE-NEXT: bx lr +; +; FULL-HARD-BE-LABEL: foo_vec: +; FULL-HARD-BE: @ %bb.0: @ %entry +; FULL-HARD-BE-NEXT: vrev64.16 d16, d0 +; FULL-HARD-BE-NEXT: vadd.f16 d16, d16, d16 +; FULL-HARD-BE-NEXT: vrev64.16 d0, d16 +; FULL-HARD-BE-NEXT: bx lr +entry: + %0 = fadd <4 x half> 
%a, %a + ret <4 x half> %0 +} diff --git a/llvm/test/CodeGen/ARM/fp16-v3.ll b/llvm/test/CodeGen/ARM/fp16-v3.ll index e84fee2c2e1b5..085503e80c7f2 100644 --- a/llvm/test/CodeGen/ARM/fp16-v3.ll +++ b/llvm/test/CodeGen/ARM/fp16-v3.ll @@ -28,9 +28,6 @@ define void @test_vec3(<3 x half>* %arr, i32 %i) #0 { } ; CHECK-LABEL: test_bitcast: -; CHECK: vcvtb.f16.f32 -; CHECK: vcvtb.f16.f32 -; CHECK: vcvtb.f16.f32 ; CHECK: pkhbt ; CHECK: uxth define void @test_bitcast(<3 x half> %inp, <3 x i16>* %arr) #0 { diff --git a/llvm/test/CodeGen/ARM/machine-outliner-tail.ll b/llvm/test/CodeGen/ARM/machine-outliner-tail.ll index baf419c478712..30355988e8563 100644 --- a/llvm/test/CodeGen/ARM/machine-outliner-tail.ll +++ b/llvm/test/CodeGen/ARM/machine-outliner-tail.ll @@ -7,6 +7,8 @@ ; RUN: | FileCheck %s --check-prefix=MACHO ; RUN: llc -enable-machine-outliner -verify-machineinstrs -mtriple=thumbv5-- \ ; RUN: --stop-after=machine-outliner < %s | FileCheck %s --check-prefix=THUMB1 +; RUN: llc -verify-machineinstrs -mtriple=thumbv8m.main \ +; RUN: --stop-after=machine-outliner < %s | FileCheck %s --check-prefix=THUMB ; ARM-LABEL: name: OUTLINED_FUNCTION_0 ; ARM: $r0 = MOVi 1, 14 /* CC::al */, $noreg, $noreg @@ -31,7 +33,7 @@ ; THUMB1-NOT: OUTLINED_FUNCTION_0 -define void @a() { +define void @a() #0 { entry: tail call void @z(i32 1, i32 2, i32 3, i32 4) ret void @@ -39,8 +41,10 @@ entry: declare void @z(i32, i32, i32, i32) -define dso_local void @b(i32* nocapture readnone %p) { +define dso_local void @b(i32* nocapture readnone %p) #0 { entry: tail call void @z(i32 1, i32 2, i32 3, i32 4) ret void } + +attributes #0 = { minsize optsize } diff --git a/llvm/test/CodeGen/ARM/machine-outliner-thunk.ll b/llvm/test/CodeGen/ARM/machine-outliner-thunk.ll index e3f2ffa08f546..807e16202f2c5 100644 --- a/llvm/test/CodeGen/ARM/machine-outliner-thunk.ll +++ b/llvm/test/CodeGen/ARM/machine-outliner-thunk.ll @@ -7,10 +7,12 @@ ; RUN: | FileCheck %s --check-prefix=MACHO ; RUN: llc -enable-machine-outliner 
-verify-machineinstrs -mtriple=thumbv5-- \ ; RUN: --stop-after=machine-outliner < %s | FileCheck %s --check-prefix=THUMB1 +; RUN: llc -verify-machineinstrs -mtriple=thumbv8m.main \ +; RUN: --stop-after=machine-outliner < %s | FileCheck %s --check-prefix=THUMB declare i32 @thunk_called_fn(i32, i32, i32, i32) -define i32 @a() { +define i32 @a() #0 { ; ARM-LABEL: name: a ; ARM: bb.0.entry: ; ARM-NEXT: liveins: $r11, $lr @@ -52,7 +54,7 @@ entry: ret i32 %cx } -define i32 @b() { +define i32 @b() #0 { ; ARM-LABEL: name: b ; ARM: bb.0.entry: ; ARM-NEXT: liveins: $r11, $lr @@ -117,3 +119,5 @@ entry: ; MACHO-NEXT: $r2, dead $cpsr = tMOVi8 3, 14 /* CC::al */, $noreg ; MACHO-NEXT: $r3, dead $cpsr = tMOVi8 4, 14 /* CC::al */, $noreg ; MACHO-NEXT: tTAILJMPd @thunk_called_fn, 14 /* CC::al */, $noreg, implicit $sp + +attributes #0 = { minsize optsize } diff --git a/llvm/test/CodeGen/ARM/machine-sink-multidef.ll b/llvm/test/CodeGen/ARM/machine-sink-multidef.ll index 81be728362410..a287373c695db 100644 --- a/llvm/test/CodeGen/ARM/machine-sink-multidef.ll +++ b/llvm/test/CodeGen/ARM/machine-sink-multidef.ll @@ -21,10 +21,11 @@ define arm_aapcscc void @g() { ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: popne {r11, lr} ; CHECK-NEXT: movne pc, lr +; CHECK-NEXT: .LBB0_1: @ %if.then5 ; CHECK-NEXT: ldr r1, [r1, #4] ; CHECK-NEXT: bl k ; CHECK-NEXT: .p2align 2 -; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: @ %bb.2: ; CHECK-NEXT: .LCPI0_0: ; CHECK-NEXT: .long f ; CHECK-NEXT: .LCPI0_1: diff --git a/llvm/test/CodeGen/ARM/peephole-bitcast.ll b/llvm/test/CodeGen/ARM/peephole-bitcast.ll index dff6b4be5c16b..24ecdad87b515 100644 --- a/llvm/test/CodeGen/ARM/peephole-bitcast.ll +++ b/llvm/test/CodeGen/ARM/peephole-bitcast.ll @@ -13,6 +13,7 @@ define void @t(float %x) nounwind ssp { ; CHECK-NEXT: movt r1, #32639 ; CHECK-NEXT: cmp r0, r1 ; CHECK-NEXT: bxhi lr +; CHECK-NEXT: .LBB0_1: @ %if.then ; CHECK-NEXT: b doSomething entry: %0 = bitcast float %x to i32 diff --git a/llvm/test/CodeGen/ARM/reg_sequence.ll 
b/llvm/test/CodeGen/ARM/reg_sequence.ll index 15896b14e5e95..976dddc694d8f 100644 --- a/llvm/test/CodeGen/ARM/reg_sequence.ll +++ b/llvm/test/CodeGen/ARM/reg_sequence.ll @@ -285,6 +285,7 @@ define arm_aapcs_vfpcc i32 @t10(float %x) nounwind { ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: movne r0, #0 ; CHECK-NEXT: bxne lr +; CHECK-NEXT: LBB9_1: ; CHECK-NEXT: trap entry: %0 = shufflevector <4 x float> zeroinitializer, <4 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1] diff --git a/llvm/test/CodeGen/ARM/sched-it-debug-nodes.mir b/llvm/test/CodeGen/ARM/sched-it-debug-nodes.mir deleted file mode 100644 index f7b247c5ee0de..0000000000000 --- a/llvm/test/CodeGen/ARM/sched-it-debug-nodes.mir +++ /dev/null @@ -1,157 +0,0 @@ -# RUN: llc -mtriple thumbv7 -verify-machineinstrs -start-after if-converter -print-before post-RA-sched -print-after post-RA-sched %s -o /dev/null 2>&1 | FileCheck %s ---- | - ; ModuleID = '/Volumes/Data/llvm/test/CodeGen/ARM/sched-it-debug-nodes.ll' - target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" - target triple = "thumbv7" - - %struct.s = type opaque - - ; Function Attrs: nounwind - define arm_aapcscc i32 @f(%struct.s* %s, i32 %u, i8* %b, i32 %n) #0 !dbg !4 { - entry: - tail call void @llvm.dbg.value(metadata %struct.s* %s, i64 0, metadata !18, metadata !27), !dbg !28 - tail call void @llvm.dbg.value(metadata i32 %u, i64 0, metadata !19, metadata !27), !dbg !28 - tail call void @llvm.dbg.value(metadata i8* %b, i64 0, metadata !20, metadata !27), !dbg !28 - tail call void @llvm.dbg.value(metadata i32 %n, i64 0, metadata !21, metadata !27), !dbg !28 - %cmp = icmp ult i32 %n, 4, !dbg !29 - br i1 %cmp, label %return, label %if.end, !dbg !31 - - if.end: ; preds = %entry - tail call arm_aapcscc void @g(%struct.s* %s, i8* %b, i32 %n) #3, !dbg !32 - br label %return, !dbg !33 - - return: ; preds = %if.end, %entry - %retval.0 = phi i32 [ 0, %if.end ], [ -1, %entry ] - ret i32 %retval.0, !dbg !34 - } - - ; NOTE: This is 
checking that the register in the DEBUG_VALUE node is not - ; accidentally being marked as KILL. The DBG_VALUE node gets introduced in - ; If-Conversion, and gets bundled into the IT block. The Post RA Scheduler - ; attempts to schedule the Machine Instr, and tries to tag the register in the - ; debug value as KILL'ed, resulting in a DEBUG_VALUE node changing codegen! (or - ; hopefully, triggering an assert). - - ; CHECK: BUNDLE implicit-def dead $itstate{{.*}} { - ; CHECK: DBG_VALUE $r1, $noreg, !"u" - ; CHECK-NOT: DBG_VALUE killed $r1, $noreg, !"u" - - declare arm_aapcscc void @g(%struct.s*, i8*, i32) #1 - - ; Function Attrs: nounwind readnone - declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2 - - attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #2 = { nounwind readnone } - attributes #3 = { nounwind } - - !llvm.dbg.cu = !{!0} - !llvm.module.flags = !{!22, !23, !24, !25} - !llvm.ident = !{!26} - - !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.7.0 (llvm/trunk 237059)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2) - !1 = !DIFile(filename: "", directory: "/Users/compnerd/Source/llvm") - !2 = !{} - !4 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 9, type: !5, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !17) - !5 = !DISubroutineType(types: !6) - !6 = !{!7, !8, !11, !12, !16} - !7 = !DIBasicType(name: "int", size: 32, align: 
32, encoding: DW_ATE_signed) - !8 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !9, size: 32, align: 32) - !9 = !DIDerivedType(tag: DW_TAG_typedef, name: "s", file: !1, line: 5, baseType: !10) - !10 = !DICompositeType(tag: DW_TAG_structure_type, name: "s", file: !1, line: 5, flags: DIFlagFwdDecl) - !11 = !DIBasicType(name: "unsigned int", size: 32, align: 32, encoding: DW_ATE_unsigned) - !12 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !13, size: 32, align: 32) - !13 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !14) - !14 = !DIDerivedType(tag: DW_TAG_typedef, name: "uint8_t", file: !1, line: 2, baseType: !15) - !15 = !DIBasicType(name: "unsigned char", size: 8, align: 8, encoding: DW_ATE_unsigned_char) - !16 = !DIDerivedType(tag: DW_TAG_typedef, name: "size_t", file: !1, line: 3, baseType: !11) - !17 = !{!18, !19, !20, !21} - !18 = !DILocalVariable(name: "s", arg: 1, scope: !4, file: !1, line: 9, type: !8) - !19 = !DILocalVariable(name: "u", arg: 2, scope: !4, file: !1, line: 9, type: !11) - !20 = !DILocalVariable(name: "b", arg: 3, scope: !4, file: !1, line: 9, type: !12) - !21 = !DILocalVariable(name: "n", arg: 4, scope: !4, file: !1, line: 9, type: !16) - !22 = !{i32 2, !"Dwarf Version", i32 4} - !23 = !{i32 2, !"Debug Info Version", i32 3} - !24 = !{i32 1, !"wchar_size", i32 4} - !25 = !{i32 1, !"min_enum_size", i32 4} - !26 = !{!"clang version 3.7.0 (llvm/trunk 237059)"} - !27 = !DIExpression() - !28 = !DILocation(line: 9, scope: !4) - !29 = !DILocation(line: 10, scope: !30) - !30 = distinct !DILexicalBlock(scope: !4, file: !1, line: 10) - !31 = !DILocation(line: 10, scope: !4) - !32 = !DILocation(line: 13, scope: !4) - !33 = !DILocation(line: 14, scope: !4) - !34 = !DILocation(line: 15, scope: !4) - -... 
---- -name: f -alignment: 2 -exposesReturnsTwice: false -tracksRegLiveness: true -liveins: - - { reg: '$r0' } - - { reg: '$r1' } - - { reg: '$r2' } - - { reg: '$r3' } -calleeSavedRegisters: [ '$lr', '$d8', '$d9', '$d10', '$d11', '$d12', '$d13', - '$d14', '$d15', '$q4', '$q5', '$q6', '$q7', '$r4', - '$r5', '$r6', '$r7', '$r8', '$r9', '$r10', '$r11', - '$s16', '$s17', '$s18', '$s19', '$s20', '$s21', - '$s22', '$s23', '$s24', '$s25', '$s26', '$s27', - '$s28', '$s29', '$s30', '$s31', '$d8_d10', '$d9_d11', - '$d10_d12', '$d11_d13', '$d12_d14', '$d13_d15', - '$q4_q5', '$q5_q6', '$q6_q7', '$q4_q5_q6_q7', '$r4_r5', - '$r6_r7', '$r8_r9', '$r10_r11', '$d8_d9_d10', '$d9_d10_d11', - '$d10_d11_d12', '$d11_d12_d13', '$d12_d13_d14', - '$d13_d14_d15', '$d8_d10_d12', '$d9_d11_d13', '$d10_d12_d14', - '$d11_d13_d15', '$d8_d10_d12_d14', '$d9_d11_d13_d15', - '$d9_d10', '$d11_d12', '$d13_d14', '$d9_d10_d11_d12', - '$d11_d12_d13_d14' ] -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 8 - offsetAdjustment: 0 - maxAlignment: 4 - adjustsStack: true - hasCalls: true - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false -stack: - - { id: 0, type: spill-slot, offset: -4, size: 4, alignment: 4, callee-saved-register: '$lr', callee-saved-restored: false } - - { id: 1, type: spill-slot, offset: -8, size: 4, alignment: 4, callee-saved-register: '$r7' } -body: | - bb.0.entry: - liveins: $r0, $r1, $r2, $r3, $lr, $r7 - - DBG_VALUE $r0, $noreg, !18, !27, debug-location !28 - DBG_VALUE $r1, $noreg, !19, !27, debug-location !28 - DBG_VALUE $r2, $noreg, !20, !27, debug-location !28 - DBG_VALUE $r3, $noreg, !21, !27, debug-location !28 - t2CMPri $r3, 4, 14, $noreg, implicit-def $cpsr, debug-location !31 - DBG_VALUE $r1, $noreg, !19, !27, debug-location !28 - $r0 = t2MOVi -1, 3, $cpsr, $noreg, implicit undef $r0 - DBG_VALUE $r1, $noreg, !19, !27, debug-location !28 - tBX_RET 3, 
$cpsr, implicit $r0, debug-location !34 - $sp = frame-setup t2STMDB_UPD $sp, 14, $noreg, killed $r7, killed $lr - frame-setup CFI_INSTRUCTION def_cfa_offset 8 - frame-setup CFI_INSTRUCTION offset $lr, -4 - frame-setup CFI_INSTRUCTION offset $r7, -8 - DBG_VALUE $r0, $noreg, !18, !27, debug-location !28 - DBG_VALUE $r1, $noreg, !19, !27, debug-location !28 - DBG_VALUE $r2, $noreg, !20, !27, debug-location !28 - DBG_VALUE $r3, $noreg, !21, !27, debug-location !28 - $r1 = tMOVr killed $r2, 14, $noreg, debug-location !32 - $r2 = tMOVr killed $r3, 14, $noreg, debug-location !32 - tBL 14, $noreg, @g, csr_aapcs, implicit-def dead $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit-def $sp, debug-location !32 - $r0 = t2MOVi 0, 14, $noreg, $noreg - $sp = t2LDMIA_RET $sp, 14, $noreg, def $r7, def $pc, implicit $r0 - -... diff --git a/llvm/test/CodeGen/Hexagon/autohvx/arith.ll b/llvm/test/CodeGen/Hexagon/autohvx/arith.ll index f7b403365c2ad..99e287dce2144 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/arith.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/arith.ll @@ -252,25 +252,43 @@ define <64 x i16> @mpyh_128(<64 x i16> %v0, <64 x i16> %v1) #1 { ret <64 x i16> %p } -; CHECK-LABEL: mpyw_64: +; CHECK-LABEL: mpyw_64_v60: ; CHECK-DAG: r[[T00:[0-9]+]] = #16 ; CHECK-DAG: v[[T01:[0-9]+]].w = vmpyio(v0.w,v1.h) ; CHECK: v[[T02:[0-9]+]].w = vasl(v[[T01]].w,r[[T00]]) ; CHECK: v[[T02]].w += vmpyie(v0.w,v1.uh) -define <16 x i32> @mpyw_64(<16 x i32> %v0, <16 x i32> %v1) #0 { +define <16 x i32> @mpyw_64_v60(<16 x i32> %v0, <16 x i32> %v1) #0 { %p = mul <16 x i32> %v0, %v1 ret <16 x i32> %p } -; CHECK-LABEL: mpyw_128: +; CHECK-LABEL: mpyw_128_v60: ; CHECK-DAG: r[[T10:[0-9]+]] = #16 ; CHECK-DAG: v[[T11:[0-9]+]].w = vmpyio(v0.w,v1.h) ; CHECK: v[[T12:[0-9]+]].w = vasl(v[[T11]].w,r[[T10]]) ; CHECK: v[[T12]].w += vmpyie(v0.w,v1.uh) -define <32 x i32> @mpyw_128(<32 x i32> %v0, <32 x i32> %v1) #1 { +define <32 x i32> @mpyw_128_v60(<32 x i32> %v0, <32 x i32> %v1) #1 { + %p = mul <32 x 
i32> %v0, %v1 + ret <32 x i32> %p +} + +; CHECK-LABEL: mpyw_64_v62: +; CHECK: v[[T00:[0-9]+]]:[[T01:[0-9]+]] = vmpye(v0.w,v1.uh) +; CHECK: v[[T00]]:[[T01]] += vmpyo(v0.w,v1.h) +define <16 x i32> @mpyw_64_v62(<16 x i32> %v0, <16 x i32> %v1) #3 { + %p = mul <16 x i32> %v0, %v1 + ret <16 x i32> %p +} + +; CHECK-LABEL: mpyw_128_v62: +; CHECK: v[[T00:[0-9]+]]:[[T01:[0-9]+]] = vmpye(v0.w,v1.uh) +; CHECK: v[[T00]]:[[T01]] += vmpyo(v0.w,v1.h) +define <32 x i32> @mpyw_128_v62(<32 x i32> %v0, <32 x i32> %v1) #4 { %p = mul <32 x i32> %v0, %v1 ret <32 x i32> %p } attributes #0 = { nounwind "target-cpu"="hexagonv60" "target-features"="+hvx,+hvx-length64b" } attributes #1 = { nounwind "target-cpu"="hexagonv60" "target-features"="+hvx,+hvx-length128b" } +attributes #3 = { nounwind "target-cpu"="hexagonv62" "target-features"="+hvxv62,+hvx-length64b" } +attributes #4 = { nounwind "target-cpu"="hexagonv62" "target-features"="+hvxv62,+hvx-length128b" } diff --git a/llvm/test/CodeGen/Hexagon/autohvx/masked-vmem-basic.ll b/llvm/test/CodeGen/Hexagon/autohvx/masked-vmem-basic.ll new file mode 100644 index 0000000000000..9836d2d5cb5ca --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/autohvx/masked-vmem-basic.ll @@ -0,0 +1,35 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s + +; CHECK-LABEL: f0: +; CHECK: vmemu +; CHECK: vmux +define <128 x i8> @f0(<128 x i8>* %a0, i32 %a1, i32 %a2) #0 { + %q0 = call <128 x i1> @llvm.hexagon.V6.pred.scalar2.128B(i32 %a2) + %v0 = call <32 x i32> @llvm.hexagon.V6.lvsplatb.128B(i32 %a1) + %v1 = bitcast <32 x i32> %v0 to <128 x i8> + %v2 = call <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>* %a0, i32 4, <128 x i1> %q0, <128 x i8> %v1) + ret <128 x i8> %v2 +} + +; CHECK-LABEL: f1: +; CHECK: vlalign +; CHECK: if (q{{.}}) vmem{{.*}} = v +define void @f1(<128 x i8>* %a0, i32 %a1, i32 %a2) #0 { + %q0 = call <128 x i1> @llvm.hexagon.V6.pred.scalar2.128B(i32 %a2) + %v0 = call <32 x i32> @llvm.hexagon.V6.lvsplatb.128B(i32 %a1) + %v1 = bitcast <32 x i32> %v0 to 
<128 x i8> + call void @llvm.masked.store.v128i8.p0v128i8(<128 x i8> %v1, <128 x i8>* %a0, i32 4, <128 x i1> %q0) + ret void +} + +declare <128 x i1> @llvm.hexagon.V6.pred.scalar2.128B(i32) #1 +declare <32 x i32> @llvm.hexagon.V6.lvsplatb.128B(i32) #1 +declare <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>*, i32 immarg, <128 x i1>, <128 x i8>) #2 +declare void @llvm.masked.store.v128i8.p0v128i8(<128 x i8>, <128 x i8>*, i32 immarg, <128 x i1>) #2 + +attributes #0 = { "target-cpu"="hexagonv65" "target-features"="+hvx,+hvx-length128b" } +attributes #1 = { nounwind readnone } +attributes #2 = { argmemonly nounwind readonly willreturn } +attributes #3 = { argmemonly nounwind willreturn } + + diff --git a/llvm/test/CodeGen/Hexagon/autohvx/short-store-widen.ll b/llvm/test/CodeGen/Hexagon/autohvx/short-store-widen.ll new file mode 100644 index 0000000000000..311450502f248 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/autohvx/short-store-widen.ll @@ -0,0 +1,16 @@ +; RUN: llc -march=hexagon -hexagon-hvx-widen=16 < %s | FileCheck %s + +; CHECK-LABEL: f0: +; CHECK: q[[Q0:[0-3]]] = vsetq(r{{[0-9]+}}) +; CHECK: if (q[[Q0]]) vmem({{.*}}) = v +define void @f0(<32 x i8>* %a0) #0 { + %v0 = load <32 x i8>, <32 x i8>* %a0, align 128 + %v1 = insertelement <32 x i8> undef, i8 1, i32 0 + %v2 = shufflevector <32 x i8> %v1, <32 x i8> undef, <32 x i32> zeroinitializer + %v3 = add <32 x i8> %v0, %v2 + store <32 x i8> %v3, <32 x i8>* %a0, align 128 + ret void +} + +attributes #0 = { nounwind "target-cpu"="hexagonv65" "target-features"="+hvx,+hvx-length128b" } + diff --git a/llvm/test/CodeGen/Hexagon/hvx-bitcast-v64i1.ll b/llvm/test/CodeGen/Hexagon/hvx-bitcast-v64i1.ll index c44e7a863840e..cb135f72448fe 100644 --- a/llvm/test/CodeGen/Hexagon/hvx-bitcast-v64i1.ll +++ b/llvm/test/CodeGen/Hexagon/hvx-bitcast-v64i1.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -hexagon-instsimplify=0 < %s | FileCheck %s +; RUN: llc -march=hexagon -hexagon-instsimplify=0 -hexagon-masked-vmem=0 < %s | 
FileCheck %s ; Test that LLVM does not assert and bitcast v64i1 to i64 is lowered ; without crashing. diff --git a/llvm/test/CodeGen/Hexagon/hvx-isel-vselect-v256i16.ll b/llvm/test/CodeGen/Hexagon/hvx-isel-vselect-v256i16.ll new file mode 100644 index 0000000000000..7ba2bb2948c17 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/hvx-isel-vselect-v256i16.ll @@ -0,0 +1,15 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s +; This used to crash with +; "llvm::MVT llvm::EVT::getSimpleVT() const: Assertion `isSimple() && +; Expected a SimpleValueType!' failed." + +; CHECK: vmax +define <256 x i16> @f0(<128 x i16> %v0, <128 x i16> %v1) #0 { + %v01 = shufflevector <128 x i16> %v0, <128 x i16> %v1, <256 x i32> + %v10 = shufflevector <128 x i16> %v1, <128 x i16> %v0, <256 x i32> + %p0 = icmp sgt <256 x i16> %v01, %v10 + %res = select <256 x i1> %p0, <256 x i16> %v01, <256 x i16> %v10 + ret <256 x i16> %res +} + +attributes #0 = { nounwind readnone "target-cpu"="hexagonv65" "target-features"="+hvxv65,+hvx-length128b" } diff --git a/llvm/test/CodeGen/Hexagon/store-vector-pred.ll b/llvm/test/CodeGen/Hexagon/store-vector-pred.ll index a177f87ddfbd5..d9d841cacc5bb 100644 --- a/llvm/test/CodeGen/Hexagon/store-vector-pred.ll +++ b/llvm/test/CodeGen/Hexagon/store-vector-pred.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -hexagon-instsimplify=0 < %s | FileCheck %s +; RUN: llc -march=hexagon -hexagon-instsimplify=0 -hexagon-masked-vmem=0 < %s | FileCheck %s ; This test checks that store a vector predicate of type v128i1 is lowered ; without crashing. 
diff --git a/llvm/test/CodeGen/MIR/AArch64/generic-virtual-registers-error.mir b/llvm/test/CodeGen/MIR/AArch64/generic-virtual-registers-error.mir index 9236b69fb70df..8690e9ca18e7f 100644 --- a/llvm/test/CodeGen/MIR/AArch64/generic-virtual-registers-error.mir +++ b/llvm/test/CodeGen/MIR/AArch64/generic-virtual-registers-error.mir @@ -1,6 +1,5 @@ # RUN: not llc -mtriple=aarch64-apple-ios -run-pass none -o - %s 2>&1 \ # RUN: | FileCheck %s --check-prefix=ERR -# REQUIRES: global-isel # This test ensures that the MIR parser errors out when # generic virtual register definitions are not correct. diff --git a/llvm/test/CodeGen/MIR/AArch64/generic-virtual-registers-with-regbank-error.mir b/llvm/test/CodeGen/MIR/AArch64/generic-virtual-registers-with-regbank-error.mir index 1cf5e3854857f..98ff359eedc73 100644 --- a/llvm/test/CodeGen/MIR/AArch64/generic-virtual-registers-with-regbank-error.mir +++ b/llvm/test/CodeGen/MIR/AArch64/generic-virtual-registers-with-regbank-error.mir @@ -1,6 +1,5 @@ # RUN: not llc -mtriple=aarch64-apple-ios -run-pass none -o - %s 2>&1 \ # RUN: | FileCheck %s --check-prefix=ERR -# REQUIRES: global-isel # This test ensures that the MIR parser errors out when # generic virtual register definitions are not correct. # In that case, it is defined by a register bank. diff --git a/llvm/test/CodeGen/MIR/AArch64/register-operand-bank.mir b/llvm/test/CodeGen/MIR/AArch64/register-operand-bank.mir index 3da69342c9352..5b5102f9e1968 100644 --- a/llvm/test/CodeGen/MIR/AArch64/register-operand-bank.mir +++ b/llvm/test/CodeGen/MIR/AArch64/register-operand-bank.mir @@ -1,5 +1,4 @@ # RUN: llc -o - %s -mtriple=aarch64-- -run-pass=none | FileCheck %s -# REQUIRES: global-isel # Test various aspects of register bank specification on machine operands. 
--- | define void @func() { ret void } diff --git a/llvm/test/CodeGen/MIR/AMDGPU/subreg-def-is-not-ssa.mir b/llvm/test/CodeGen/MIR/AMDGPU/subreg-def-is-not-ssa.mir new file mode 100644 index 0000000000000..10f0d1a94fafb --- /dev/null +++ b/llvm/test/CodeGen/MIR/AMDGPU/subreg-def-is-not-ssa.mir @@ -0,0 +1,15 @@ +# REQUIRES: asserts +# RUN: not --crash llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck %s + +# CHECK: MachineFunctionProperties required by InstructionSelect pass are not met by function subreg_def_is_not_ssa. +# CHECK-NEXT: Required properties: IsSSA +# CHECK-NEXT: Current properties: NoPHIs +# CHECK-NEXT: MachineFunctionProperties check failed + +--- +name: subreg_def_is_not_ssa +body: | + bb.0: + %0.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec + +... diff --git a/llvm/test/CodeGen/MIR/X86/generic-instr-type.mir b/llvm/test/CodeGen/MIR/X86/generic-instr-type.mir index a0948319878b0..710a18ac3aeff 100644 --- a/llvm/test/CodeGen/MIR/X86/generic-instr-type.mir +++ b/llvm/test/CodeGen/MIR/X86/generic-instr-type.mir @@ -1,5 +1,4 @@ # RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s -# REQUIRES: global-isel # Test that the MIR parser parses types on generic instructions correctly. 
--- | diff --git a/llvm/test/CodeGen/Mips/GlobalISel/irtranslator/call.ll b/llvm/test/CodeGen/Mips/GlobalISel/irtranslator/call.ll index 4cc956c2040c7..67265c95b64bc 100644 --- a/llvm/test/CodeGen/Mips/GlobalISel/irtranslator/call.ll +++ b/llvm/test/CodeGen/Mips/GlobalISel/irtranslator/call.ll @@ -153,7 +153,7 @@ define void @call_symbol(i8* nocapture readonly %src, i8* nocapture %dest, i32 s ; MIPS32: [[COPY:%[0-9]+]]:_(p0) = COPY $a0 ; MIPS32: [[COPY1:%[0-9]+]]:_(p0) = COPY $a1 ; MIPS32: [[COPY2:%[0-9]+]]:_(s32) = COPY $a2 - ; MIPS32: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), [[COPY1]](p0), [[COPY]](p0), [[COPY2]](s32), 0 :: (store 1 into %ir.dest), (load 1 from %ir.src) + ; MIPS32: G_MEMCPY [[COPY1]](p0), [[COPY]](p0), [[COPY2]](s32), 0 :: (store 1 into %ir.dest), (load 1 from %ir.src) ; MIPS32: RetRA ; MIPS32_PIC-LABEL: name: call_symbol ; MIPS32_PIC: bb.1.entry: @@ -161,7 +161,7 @@ define void @call_symbol(i8* nocapture readonly %src, i8* nocapture %dest, i32 s ; MIPS32_PIC: [[COPY:%[0-9]+]]:_(p0) = COPY $a0 ; MIPS32_PIC: [[COPY1:%[0-9]+]]:_(p0) = COPY $a1 ; MIPS32_PIC: [[COPY2:%[0-9]+]]:_(s32) = COPY $a2 - ; MIPS32_PIC: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), [[COPY1]](p0), [[COPY]](p0), [[COPY2]](s32), 0 :: (store 1 into %ir.dest), (load 1 from %ir.src) + ; MIPS32_PIC: G_MEMCPY [[COPY1]](p0), [[COPY]](p0), [[COPY2]](s32), 0 :: (store 1 into %ir.dest), (load 1 from %ir.src) ; MIPS32_PIC: RetRA entry: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 %length, i1 false) diff --git a/llvm/test/CodeGen/Mips/GlobalISel/legalizer/dyn_stackalloc.mir b/llvm/test/CodeGen/Mips/GlobalISel/legalizer/dyn_stackalloc.mir index 773933018ed3c..7f9f561c4b411 100644 --- a/llvm/test/CodeGen/Mips/GlobalISel/legalizer/dyn_stackalloc.mir +++ b/llvm/test/CodeGen/Mips/GlobalISel/legalizer/dyn_stackalloc.mir @@ -73,7 +73,7 @@ body: | %8:_(s32) = G_CONSTANT i32 -8 %9:_(s32) = G_AND %7, %8 %10:_(p0) = G_DYN_STACKALLOC %9(s32), 0 - 
G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memset), %10(p0), %0(s8), %1(s32), 0 :: (store 1 into %ir.vla) + G_MEMSET %10(p0), %0(s8), %1(s32), 0 :: (store 1 into %ir.vla) %11:_(p0) = G_PTR_ADD %10, %1(s32) %12:_(p0) = COPY %11(p0) G_STORE %13(s8), %12(p0) :: (store 1 into %ir.arrayidx) diff --git a/llvm/test/CodeGen/PowerPC/aix-xcoff-reloc-large.ll b/llvm/test/CodeGen/PowerPC/aix-xcoff-reloc-large.ll new file mode 100644 index 0000000000000..18aea72f6681f --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/aix-xcoff-reloc-large.ll @@ -0,0 +1,90 @@ +; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mtriple powerpc-ibm-aix-xcoff \ +; RUN: -filetype=obj -code-model=large -o %t.o < %s +; RUN: llvm-readobj --relocs --expand-relocs %t.o | FileCheck --check-prefixes=RELOC %s +; RUN: llvm-objdump -D -r --symbol-description %t.o | FileCheck --check-prefix=DIS %s + +@a = global i32 2, align 4 +@b = global i32 10, align 4 +@c = global i32 11, align 4 + +define i32 @foo() { +entry: + %0 = load i32, i32* @a, align 4 + %1 = load i32, i32* @b, align 4 + %add = add nsw i32 %0, %1 + %2 = load i32, i32* @c, align 4 + %add1 = add nsw i32 %add, %2 + ret i32 %add1 +} + +; RELOC: Section (index: {{[0-9]+}}) .text { +; RELOC-NEXT: Relocation { +; RELOC-NEXT: Virtual Address: 0x2 +; RELOC-NEXT: Symbol: a ([[#INDX:]]) +; RELOC-NEXT: IsSigned: No +; RELOC-NEXT: FixupBitValue: 0 +; RELOC-NEXT: Length: 16 +; RELOC-NEXT: Type: R_TOCU (0x30) +; RELOC-NEXT: } +; RELOC-NEXT: Relocation { +; RELOC-NEXT: Virtual Address: 0x6 +; RELOC-NEXT: Symbol: a ([[#INDX]]) +; RELOC-NEXT: IsSigned: No +; RELOC-NEXT: FixupBitValue: 0 +; RELOC-NEXT: Length: 16 +; RELOC-NEXT: Type: R_TOCL (0x31) +; RELOC-NEXT: } +; RELOC-NEXT: Relocation { +; RELOC-NEXT: Virtual Address: 0xE +; RELOC-NEXT: Symbol: b ([[#INDX+2]]) +; RELOC-NEXT: IsSigned: No +; RELOC-NEXT: FixupBitValue: 0 +; RELOC-NEXT: Length: 16 +; RELOC-NEXT: Type: R_TOCU (0x30) +; RELOC-NEXT: } +; RELOC-NEXT: Relocation { +; RELOC-NEXT: Virtual Address: 0x12 +; 
RELOC-NEXT: Symbol: b ([[#INDX+2]]) +; RELOC-NEXT: IsSigned: No +; RELOC-NEXT: FixupBitValue: 0 +; RELOC-NEXT: Length: 16 +; RELOC-NEXT: Type: R_TOCL (0x31) +; RELOC-NEXT: } +; RELOC-NEXT: Relocation { +; RELOC-NEXT: Virtual Address: 0x1A +; RELOC-NEXT: Symbol: c ([[#INDX+4]]) +; RELOC-NEXT: IsSigned: No +; RELOC-NEXT: FixupBitValue: 0 +; RELOC-NEXT: Length: 16 +; RELOC-NEXT: Type: R_TOCU (0x30) +; RELOC-NEXT: } +; RELOC-NEXT: Relocation { +; RELOC-NEXT: Virtual Address: 0x1E +; RELOC-NEXT: Symbol: c ([[#INDX+4]]) +; RELOC-NEXT: IsSigned: No +; RELOC-NEXT: FixupBitValue: 0 +; RELOC-NEXT: Length: 16 +; RELOC-NEXT: Type: R_TOCL (0x31) +; RELOC-NEXT: } + +; DIS: Disassembly of section .text: +; DIS-EMPTY: +; DIS-NEXT: 00000000 (idx: {{[0-9]+}}) .foo: +; DIS-NEXT: 0: 3c 62 00 00 addis 3, 2, 0 +; DIS-NEXT: 00000002: R_TOCU (idx: [[#INDX:]]) a[TE] +; DIS-NEXT: 4: 80 63 00 00 lwz 3, 0(3) +; DIS-NEXT: 00000006: R_TOCL (idx: [[#INDX]]) a[TE] +; DIS-NEXT: 8: 80 63 00 00 lwz 3, 0(3) +; DIS-NEXT: c: 3c 82 00 00 addis 4, 2, 0 +; DIS-NEXT: 0000000e: R_TOCU (idx: [[#INDX+2]]) b[TE] +; DIS-NEXT: 10: 80 84 00 04 lwz 4, 4(4) +; DIS-NEXT: 00000012: R_TOCL (idx: [[#INDX+2]]) b[TE] +; DIS-NEXT: 14: 80 84 00 00 lwz 4, 0(4) +; DIS-NEXT: 18: 3c a2 00 00 addis 5, 2, 0 +; DIS-NEXT: 0000001a: R_TOCU (idx: [[#INDX+4]]) c[TE] +; DIS-NEXT: 1c: 80 a5 00 08 lwz 5, 8(5) +; DIS-NEXT: 0000001e: R_TOCL (idx: [[#INDX+4]]) c[TE] +; DIS-NEXT: 20: 7c 63 22 14 add 3, 3, 4 +; DIS-NEXT: 24: 80 a5 00 00 lwz 5, 0(5) +; DIS-NEXT: 28: 7c 63 2a 14 add 3, 3, 5 +; DIS-NEXT: 2c: 4e 80 00 20 blr diff --git a/llvm/test/CodeGen/PowerPC/aix-xcoff-visibility.ll b/llvm/test/CodeGen/PowerPC/aix-xcoff-visibility.ll index 42bdf5d22a2a9..04c2e0cfe22b8 100644 --- a/llvm/test/CodeGen/PowerPC/aix-xcoff-visibility.ll +++ b/llvm/test/CodeGen/PowerPC/aix-xcoff-visibility.ll @@ -3,16 +3,6 @@ ; RUN: llc -verify-machineinstrs -mtriple powerpc64-ibm-aix-xcoff -mcpu=pwr4 -mattr=-altivec < %s |\ ; RUN: FileCheck %s -; RUN: llc 
-verify-machineinstrs -mtriple powerpc-ibm-aix-xcoff -mcpu=pwr4 \ -; RUN: -mattr=-altivec -filetype=obj -o %t.o < %s -; RUN: llvm-readobj --symbols %t.o | \ -; RUN: FileCheck --check-prefix=XCOFF32 %s - -; RUN: not --crash llc -verify-machineinstrs -mtriple powerpc64-ibm-aix-xcoff \ -; RUN: -mcpu=pwr4 -mattr=-altivec -filetype=obj -o %t.o 2>&1 < %s | \ -; RUN: FileCheck --check-prefix=XCOFF64 %s -; XCOFF64: LLVM ERROR: 64-bit XCOFF object files are not supported yet. - @b = global i32 0, align 4 @b_h = hidden global i32 0, align 4 @@ -66,446 +56,3 @@ declare hidden i32 @bar_h(i32*) ; CHECK: .weak zoo_weak_extern_h[DS],hidden ; CHECK: .extern .bar_h[PR],hidden ; CHECK: .extern bar_h[DS],hidden - -; XCOFF32: Symbols [ -; XCOFF32-NEXT: Symbol { -; XCOFF32-NEXT: Index: [[#Index:]] -; XCOFF32-NEXT: Name: .bar_h -; XCOFF32-NEXT: Value (RelocatableAddress): 0x0 -; XCOFF32-NEXT: Section: N_UNDEF -; XCOFF32-NEXT: Type: 0x2000 -; XCOFF32-NEXT: StorageClass: C_EXT (0x2) -; XCOFF32-NEXT: NumberOfAuxEntries: 1 -; XCOFF32-NEXT: CSECT Auxiliary Entry { -; XCOFF32-NEXT: Index: [[#Index+1]] -; XCOFF32-NEXT: SectionLen: 0 -; XCOFF32-NEXT: ParameterHashIndex: 0x0 -; XCOFF32-NEXT: TypeChkSectNum: 0x0 -; XCOFF32-NEXT: SymbolAlignmentLog2: 0 -; XCOFF32-NEXT: SymbolType: XTY_ER (0x0) -; XCOFF32-NEXT: StorageMappingClass: XMC_PR (0x0) -; XCOFF32-NEXT: StabInfoIndex: 0x0 -; XCOFF32-NEXT: StabSectNum: 0x0 -; XCOFF32-NEXT: } -; XCOFF32-NEXT: } -; XCOFF32-NEXT: Symbol { -; XCOFF32-NEXT: Index: [[#Index+2]] -; XCOFF32-NEXT: Name: zoo_weak_extern_h -; XCOFF32-NEXT: Value (RelocatableAddress): 0x0 -; XCOFF32-NEXT: Section: N_UNDEF -; XCOFF32-NEXT: Type: 0x2000 -; XCOFF32-NEXT: StorageClass: C_WEAKEXT (0x6F) -; XCOFF32-NEXT: NumberOfAuxEntries: 1 -; XCOFF32-NEXT: CSECT Auxiliary Entry { -; XCOFF32-NEXT: Index: [[#Index+3]] -; XCOFF32-NEXT: SectionLen: 0 -; XCOFF32-NEXT: ParameterHashIndex: 0x0 -; XCOFF32-NEXT: TypeChkSectNum: 0x0 -; XCOFF32-NEXT: SymbolAlignmentLog2: 0 -; XCOFF32-NEXT: 
SymbolType: XTY_ER (0x0) -; XCOFF32-NEXT: StorageMappingClass: XMC_DS (0xA) -; XCOFF32-NEXT: StabInfoIndex: 0x0 -; XCOFF32-NEXT: StabSectNum: 0x0 -; XCOFF32-NEXT: } -; XCOFF32-NEXT: } -; XCOFF32-NEXT: Symbol { -; XCOFF32-NEXT: Index: [[#Index+4]] -; XCOFF32-NEXT: Name: .zoo_weak_extern_h -; XCOFF32-NEXT: Value (RelocatableAddress): 0x0 -; XCOFF32-NEXT: Section: N_UNDEF -; XCOFF32-NEXT: Type: 0x2000 -; XCOFF32-NEXT: StorageClass: C_WEAKEXT (0x6F) -; XCOFF32-NEXT: NumberOfAuxEntries: 1 -; XCOFF32-NEXT: CSECT Auxiliary Entry { -; XCOFF32-NEXT: Index: [[#Index+5]] -; XCOFF32-NEXT: SectionLen: 0 -; XCOFF32-NEXT: ParameterHashIndex: 0x0 -; XCOFF32-NEXT: TypeChkSectNum: 0x0 -; XCOFF32-NEXT: SymbolAlignmentLog2: 0 -; XCOFF32-NEXT: SymbolType: XTY_ER (0x0) -; XCOFF32-NEXT: StorageMappingClass: XMC_PR (0x0) -; XCOFF32-NEXT: StabInfoIndex: 0x0 -; XCOFF32-NEXT: StabSectNum: 0x0 -; XCOFF32-NEXT: } -; XCOFF32-NEXT: } -; XCOFF32-NEXT: Symbol { -; XCOFF32-NEXT: Index: [[#Index+6]] -; XCOFF32-NEXT: Name: bar_h -; XCOFF32-NEXT: Value (RelocatableAddress): 0x0 -; XCOFF32-NEXT: Section: N_UNDEF -; XCOFF32-NEXT: Type: 0x2000 -; XCOFF32-NEXT: StorageClass: C_EXT (0x2) -; XCOFF32-NEXT: NumberOfAuxEntries: 1 -; XCOFF32-NEXT: CSECT Auxiliary Entry { -; XCOFF32-NEXT: Index: [[#Index+7]] -; XCOFF32-NEXT: SectionLen: 0 -; XCOFF32-NEXT: ParameterHashIndex: 0x0 -; XCOFF32-NEXT: TypeChkSectNum: 0x0 -; XCOFF32-NEXT: SymbolAlignmentLog2: 0 -; XCOFF32-NEXT: SymbolType: XTY_ER (0x0) -; XCOFF32-NEXT: StorageMappingClass: XMC_DS (0xA) -; XCOFF32-NEXT: StabInfoIndex: 0x0 -; XCOFF32-NEXT: StabSectNum: 0x0 -; XCOFF32-NEXT: } -; XCOFF32-NEXT: } -; XCOFF32-NEXT: Symbol { -; XCOFF32-NEXT: Index: [[#Index+8]] -; XCOFF32-NEXT: Name: .text -; XCOFF32-NEXT: Value (RelocatableAddress): 0x0 -; XCOFF32-NEXT: Section: .text -; XCOFF32-NEXT: Type: 0x0 -; XCOFF32-NEXT: StorageClass: C_HIDEXT (0x6B) -; XCOFF32-NEXT: NumberOfAuxEntries: 1 -; XCOFF32-NEXT: CSECT Auxiliary Entry { -; XCOFF32-NEXT: Index: [[#Index+9]] -; 
XCOFF32-NEXT: SectionLen: 152 -; XCOFF32-NEXT: ParameterHashIndex: 0x0 -; XCOFF32-NEXT: TypeChkSectNum: 0x0 -; XCOFF32-NEXT: SymbolAlignmentLog2: 4 -; XCOFF32-NEXT: SymbolType: XTY_SD (0x1) -; XCOFF32-NEXT: StorageMappingClass: XMC_PR (0x0) -; XCOFF32-NEXT: StabInfoIndex: 0x0 -; XCOFF32-NEXT: StabSectNum: 0x0 -; XCOFF32-NEXT: } -; XCOFF32-NEXT: } -; XCOFF32-NEXT: Symbol { -; XCOFF32-NEXT: Index: [[#Index+10]] -; XCOFF32-NEXT: Name: .foo -; XCOFF32-NEXT: Value (RelocatableAddress): 0x0 -; XCOFF32-NEXT: Section: .text -; XCOFF32-NEXT: Type: 0x0 -; XCOFF32-NEXT: StorageClass: C_EXT (0x2) -; XCOFF32-NEXT: NumberOfAuxEntries: 1 -; XCOFF32-NEXT: CSECT Auxiliary Entry { -; XCOFF32-NEXT: Index: [[#Index+11]] -; XCOFF32-NEXT: ContainingCsectSymbolIndex: 8 -; XCOFF32-NEXT: ParameterHashIndex: 0x0 -; XCOFF32-NEXT: TypeChkSectNum: 0x0 -; XCOFF32-NEXT: SymbolAlignmentLog2: 0 -; XCOFF32-NEXT: SymbolType: XTY_LD (0x2) -; XCOFF32-NEXT: StorageMappingClass: XMC_PR (0x0) -; XCOFF32-NEXT: StabInfoIndex: 0x0 -; XCOFF32-NEXT: StabSectNum: 0x0 -; XCOFF32-NEXT: } -; XCOFF32-NEXT: } -; XCOFF32-NEXT: Symbol { -; XCOFF32-NEXT: Index: [[#Index+12]] -; XCOFF32-NEXT: Name: .foo_h -; XCOFF32-NEXT: Value (RelocatableAddress): 0x10 -; XCOFF32-NEXT: Section: .text -; XCOFF32-NEXT: Type: 0x2000 -; XCOFF32-NEXT: StorageClass: C_EXT (0x2) -; XCOFF32-NEXT: NumberOfAuxEntries: 1 -; XCOFF32-NEXT: CSECT Auxiliary Entry { -; XCOFF32-NEXT: Index: [[#Index+13]] -; XCOFF32-NEXT: ContainingCsectSymbolIndex: 8 -; XCOFF32-NEXT: ParameterHashIndex: 0x0 -; XCOFF32-NEXT: TypeChkSectNum: 0x0 -; XCOFF32-NEXT: SymbolAlignmentLog2: 0 -; XCOFF32-NEXT: SymbolType: XTY_LD (0x2) -; XCOFF32-NEXT: StorageMappingClass: XMC_PR (0x0) -; XCOFF32-NEXT: StabInfoIndex: 0x0 -; XCOFF32-NEXT: StabSectNum: 0x0 -; XCOFF32-NEXT: } -; XCOFF32-NEXT: } -; XCOFF32-NEXT: Symbol { -; XCOFF32-NEXT: Index: [[#Index+14]] -; XCOFF32-NEXT: Name: .foo_protected -; XCOFF32-NEXT: Value (RelocatableAddress): 0x20 -; XCOFF32-NEXT: Section: .text -; 
XCOFF32-NEXT: Type: 0x3000 -; XCOFF32-NEXT: StorageClass: C_EXT (0x2) -; XCOFF32-NEXT: NumberOfAuxEntries: 1 -; XCOFF32-NEXT: CSECT Auxiliary Entry { -; XCOFF32-NEXT: Index: [[#Index+15]] -; XCOFF32-NEXT: ContainingCsectSymbolIndex: 8 -; XCOFF32-NEXT: ParameterHashIndex: 0x0 -; XCOFF32-NEXT: TypeChkSectNum: 0x0 -; XCOFF32-NEXT: SymbolAlignmentLog2: 0 -; XCOFF32-NEXT: SymbolType: XTY_LD (0x2) -; XCOFF32-NEXT: StorageMappingClass: XMC_PR (0x0) -; XCOFF32-NEXT: StabInfoIndex: 0x0 -; XCOFF32-NEXT: StabSectNum: 0x0 -; XCOFF32-NEXT: } -; XCOFF32-NEXT: } -; XCOFF32-NEXT: Symbol { -; XCOFF32-NEXT: Index: [[#Index+16]] -; XCOFF32-NEXT: Name: .foo_weak_h -; XCOFF32-NEXT: Value (RelocatableAddress): 0x30 -; XCOFF32-NEXT: Section: .text -; XCOFF32-NEXT: Type: 0x2000 -; XCOFF32-NEXT: StorageClass: C_WEAKEXT (0x6F) -; XCOFF32-NEXT: NumberOfAuxEntries: 1 -; XCOFF32-NEXT: CSECT Auxiliary Entry { -; XCOFF32-NEXT: Index: [[#Index+17]] -; XCOFF32-NEXT: ContainingCsectSymbolIndex: 8 -; XCOFF32-NEXT: ParameterHashIndex: 0x0 -; XCOFF32-NEXT: TypeChkSectNum: 0x0 -; XCOFF32-NEXT: SymbolAlignmentLog2: 0 -; XCOFF32-NEXT: SymbolType: XTY_LD (0x2) -; XCOFF32-NEXT: StorageMappingClass: XMC_PR (0x0) -; XCOFF32-NEXT: StabInfoIndex: 0x0 -; XCOFF32-NEXT: StabSectNum: 0x0 -; XCOFF32-NEXT: } -; XCOFF32-NEXT: } -; XCOFF32-NEXT: Symbol { -; XCOFF32-NEXT: Index: [[#Index+18]] -; XCOFF32-NEXT: Name: .main -; XCOFF32-NEXT: Value (RelocatableAddress): 0x40 -; XCOFF32-NEXT: Section: .text -; XCOFF32-NEXT: Type: 0x0 -; XCOFF32-NEXT: StorageClass: C_EXT (0x2) -; XCOFF32-NEXT: NumberOfAuxEntries: 1 -; XCOFF32-NEXT: CSECT Auxiliary Entry { -; XCOFF32-NEXT: Index: [[#Index+19]] -; XCOFF32-NEXT: ContainingCsectSymbolIndex: 8 -; XCOFF32-NEXT: ParameterHashIndex: 0x0 -; XCOFF32-NEXT: TypeChkSectNum: 0x0 -; XCOFF32-NEXT: SymbolAlignmentLog2: 0 -; XCOFF32-NEXT: SymbolType: XTY_LD (0x2) -; XCOFF32-NEXT: StorageMappingClass: XMC_PR (0x0) -; XCOFF32-NEXT: StabInfoIndex: 0x0 -; XCOFF32-NEXT: StabSectNum: 0x0 -; 
XCOFF32-NEXT: } -; XCOFF32-NEXT: } -; XCOFF32-NEXT: Symbol { -; XCOFF32-NEXT: Index: [[#Index+20]] -; XCOFF32-NEXT: Name: .data -; XCOFF32-NEXT: Value (RelocatableAddress): 0x98 -; XCOFF32-NEXT: Section: .data -; XCOFF32-NEXT: Type: 0x0 -; XCOFF32-NEXT: StorageClass: C_HIDEXT (0x6B) -; XCOFF32-NEXT: NumberOfAuxEntries: 1 -; XCOFF32-NEXT: CSECT Auxiliary Entry { -; XCOFF32-NEXT: Index: [[#Index+21]] -; XCOFF32-NEXT: SectionLen: 12 -; XCOFF32-NEXT: ParameterHashIndex: 0x0 -; XCOFF32-NEXT: TypeChkSectNum: 0x0 -; XCOFF32-NEXT: SymbolAlignmentLog2: 2 -; XCOFF32-NEXT: SymbolType: XTY_SD (0x1) -; XCOFF32-NEXT: StorageMappingClass: XMC_RW (0x5) -; XCOFF32-NEXT: StabInfoIndex: 0x0 -; XCOFF32-NEXT: StabSectNum: 0x0 -; XCOFF32-NEXT: } -; XCOFF32-NEXT: } -; XCOFF32-NEXT: Symbol { -; XCOFF32-NEXT: Index: [[#Index+22]] -; XCOFF32-NEXT: Name: b -; XCOFF32-NEXT: Value (RelocatableAddress): 0x98 -; XCOFF32-NEXT: Section: .data -; XCOFF32-NEXT: Type: 0x0 -; XCOFF32-NEXT: StorageClass: C_EXT (0x2) -; XCOFF32-NEXT: NumberOfAuxEntries: 1 -; XCOFF32-NEXT: CSECT Auxiliary Entry { -; XCOFF32-NEXT: Index: [[#Index+23]] -; XCOFF32-NEXT: ContainingCsectSymbolIndex: 20 -; XCOFF32-NEXT: ParameterHashIndex: 0x0 -; XCOFF32-NEXT: TypeChkSectNum: 0x0 -; XCOFF32-NEXT: SymbolAlignmentLog2: 0 -; XCOFF32-NEXT: SymbolType: XTY_LD (0x2) -; XCOFF32-NEXT: StorageMappingClass: XMC_RW (0x5) -; XCOFF32-NEXT: StabInfoIndex: 0x0 -; XCOFF32-NEXT: StabSectNum: 0x0 -; XCOFF32-NEXT: } -; XCOFF32-NEXT: } -; XCOFF32-NEXT: Symbol { -; XCOFF32-NEXT: Index: [[#Index+24]] -; XCOFF32-NEXT: Name: b_h -; XCOFF32-NEXT: Value (RelocatableAddress): 0x9C -; XCOFF32-NEXT: Section: .data -; XCOFF32-NEXT: Type: 0x2000 -; XCOFF32-NEXT: StorageClass: C_EXT (0x2) -; XCOFF32-NEXT: NumberOfAuxEntries: 1 -; XCOFF32-NEXT: CSECT Auxiliary Entry { -; XCOFF32-NEXT: Index: [[#Index+25]] -; XCOFF32-NEXT: ContainingCsectSymbolIndex: 20 -; XCOFF32-NEXT: ParameterHashIndex: 0x0 -; XCOFF32-NEXT: TypeChkSectNum: 0x0 -; XCOFF32-NEXT: 
SymbolAlignmentLog2: 0 -; XCOFF32-NEXT: SymbolType: XTY_LD (0x2) -; XCOFF32-NEXT: StorageMappingClass: XMC_RW (0x5) -; XCOFF32-NEXT: StabInfoIndex: 0x0 -; XCOFF32-NEXT: StabSectNum: 0x0 -; XCOFF32-NEXT: } -; XCOFF32-NEXT: } -; XCOFF32-NEXT: Symbol { -; XCOFF32-NEXT: Index: [[#Index+26]] -; XCOFF32-NEXT: Name: foo_p -; XCOFF32-NEXT: Value (RelocatableAddress): 0xA0 -; XCOFF32-NEXT: Section: .data -; XCOFF32-NEXT: Type: 0x0 -; XCOFF32-NEXT: StorageClass: C_EXT (0x2) -; XCOFF32-NEXT: NumberOfAuxEntries: 1 -; XCOFF32-NEXT: CSECT Auxiliary Entry { -; XCOFF32-NEXT: Index: [[#Index+27]] -; XCOFF32-NEXT: ContainingCsectSymbolIndex: 20 -; XCOFF32-NEXT: ParameterHashIndex: 0x0 -; XCOFF32-NEXT: TypeChkSectNum: 0x0 -; XCOFF32-NEXT: SymbolAlignmentLog2: 0 -; XCOFF32-NEXT: SymbolType: XTY_LD (0x2) -; XCOFF32-NEXT: StorageMappingClass: XMC_RW (0x5) -; XCOFF32-NEXT: StabInfoIndex: 0x0 -; XCOFF32-NEXT: StabSectNum: 0x0 -; XCOFF32-NEXT: } -; XCOFF32-NEXT: } -; XCOFF32-NEXT: Symbol { -; XCOFF32-NEXT: Index: [[#Index+28]] -; XCOFF32-NEXT: Name: foo -; XCOFF32-NEXT: Value (RelocatableAddress): 0xA4 -; XCOFF32-NEXT: Section: .data -; XCOFF32-NEXT: Type: 0x0 -; XCOFF32-NEXT: StorageClass: C_EXT (0x2) -; XCOFF32-NEXT: NumberOfAuxEntries: 1 -; XCOFF32-NEXT: CSECT Auxiliary Entry { -; XCOFF32-NEXT: Index: [[#Index+29]] -; XCOFF32-NEXT: SectionLen: 12 -; XCOFF32-NEXT: ParameterHashIndex: 0x0 -; XCOFF32-NEXT: TypeChkSectNum: 0x0 -; XCOFF32-NEXT: SymbolAlignmentLog2: 2 -; XCOFF32-NEXT: SymbolType: XTY_SD (0x1) -; XCOFF32-NEXT: StorageMappingClass: XMC_DS (0xA) -; XCOFF32-NEXT: StabInfoIndex: 0x0 -; XCOFF32-NEXT: StabSectNum: 0x0 -; XCOFF32-NEXT: } -; XCOFF32-NEXT: } -; XCOFF32-NEXT: Symbol { -; XCOFF32-NEXT: Index: [[#Index+30]] -; XCOFF32-NEXT: Name: foo_h -; XCOFF32-NEXT: Value (RelocatableAddress): 0xB0 -; XCOFF32-NEXT: Section: .data -; XCOFF32-NEXT: Type: 0x2000 -; XCOFF32-NEXT: StorageClass: C_EXT (0x2) -; XCOFF32-NEXT: NumberOfAuxEntries: 1 -; XCOFF32-NEXT: CSECT Auxiliary Entry { -; 
XCOFF32-NEXT: Index: [[#Index+31]] -; XCOFF32-NEXT: SectionLen: 12 -; XCOFF32-NEXT: ParameterHashIndex: 0x0 -; XCOFF32-NEXT: TypeChkSectNum: 0x0 -; XCOFF32-NEXT: SymbolAlignmentLog2: 2 -; XCOFF32-NEXT: SymbolType: XTY_SD (0x1) -; XCOFF32-NEXT: StorageMappingClass: XMC_DS (0xA) -; XCOFF32-NEXT: StabInfoIndex: 0x0 -; XCOFF32-NEXT: StabSectNum: 0x0 -; XCOFF32-NEXT: } -; XCOFF32-NEXT: } -; XCOFF32-NEXT: Symbol { -; XCOFF32-NEXT: Index: [[#Index+32]] -; XCOFF32-NEXT: Name: foo_protected -; XCOFF32-NEXT: Value (RelocatableAddress): 0xBC -; XCOFF32-NEXT: Section: .data -; XCOFF32-NEXT: Type: 0x3000 -; XCOFF32-NEXT: StorageClass: C_EXT (0x2) -; XCOFF32-NEXT: NumberOfAuxEntries: 1 -; XCOFF32-NEXT: CSECT Auxiliary Entry { -; XCOFF32-NEXT: Index: [[#Index+33]] -; XCOFF32-NEXT: SectionLen: 12 -; XCOFF32-NEXT: ParameterHashIndex: 0x0 -; XCOFF32-NEXT: TypeChkSectNum: 0x0 -; XCOFF32-NEXT: SymbolAlignmentLog2: 2 -; XCOFF32-NEXT: SymbolType: XTY_SD (0x1) -; XCOFF32-NEXT: StorageMappingClass: XMC_DS (0xA) -; XCOFF32-NEXT: StabInfoIndex: 0x0 -; XCOFF32-NEXT: StabSectNum: 0x0 -; XCOFF32-NEXT: } -; XCOFF32-NEXT: } -; XCOFF32-NEXT: Symbol { -; XCOFF32-NEXT: Index: [[#Index+34]] -; XCOFF32-NEXT: Name: foo_weak_h -; XCOFF32-NEXT: Value (RelocatableAddress): 0xC8 -; XCOFF32-NEXT: Section: .data -; XCOFF32-NEXT: Type: 0x2000 -; XCOFF32-NEXT: StorageClass: C_WEAKEXT (0x6F) -; XCOFF32-NEXT: NumberOfAuxEntries: 1 -; XCOFF32-NEXT: CSECT Auxiliary Entry { -; XCOFF32-NEXT: Index: [[#Index+35]] -; XCOFF32-NEXT: SectionLen: 12 -; XCOFF32-NEXT: ParameterHashIndex: 0x0 -; XCOFF32-NEXT: TypeChkSectNum: 0x0 -; XCOFF32-NEXT: SymbolAlignmentLog2: 2 -; XCOFF32-NEXT: SymbolType: XTY_SD (0x1) -; XCOFF32-NEXT: StorageMappingClass: XMC_DS (0xA) -; XCOFF32-NEXT: StabInfoIndex: 0x0 -; XCOFF32-NEXT: StabSectNum: 0x0 -; XCOFF32-NEXT: } -; XCOFF32-NEXT: } -; XCOFF32-NEXT: Symbol { -; XCOFF32-NEXT: Index: [[#Index+36]] -; XCOFF32-NEXT: Name: main -; XCOFF32-NEXT: Value (RelocatableAddress): 0xD4 -; XCOFF32-NEXT: 
Section: .data -; XCOFF32-NEXT: Type: 0x0 -; XCOFF32-NEXT: StorageClass: C_EXT (0x2) -; XCOFF32-NEXT: NumberOfAuxEntries: 1 -; XCOFF32-NEXT: CSECT Auxiliary Entry { -; XCOFF32-NEXT: Index: [[#Index+37]] -; XCOFF32-NEXT: SectionLen: 12 -; XCOFF32-NEXT: ParameterHashIndex: 0x0 -; XCOFF32-NEXT: TypeChkSectNum: 0x0 -; XCOFF32-NEXT: SymbolAlignmentLog2: 2 -; XCOFF32-NEXT: SymbolType: XTY_SD (0x1) -; XCOFF32-NEXT: StorageMappingClass: XMC_DS (0xA) -; XCOFF32-NEXT: StabInfoIndex: 0x0 -; XCOFF32-NEXT: StabSectNum: 0x0 -; XCOFF32-NEXT: } -; XCOFF32-NEXT: } -; XCOFF32-NEXT: Symbol { -; XCOFF32-NEXT: Index: [[#Index+38]] -; XCOFF32-NEXT: Name: TOC -; XCOFF32-NEXT: Value (RelocatableAddress): 0xE0 -; XCOFF32-NEXT: Section: .data -; XCOFF32-NEXT: Type: 0x0 -; XCOFF32-NEXT: StorageClass: C_HIDEXT (0x6B) -; XCOFF32-NEXT: NumberOfAuxEntries: 1 -; XCOFF32-NEXT: CSECT Auxiliary Entry { -; XCOFF32-NEXT: Index: [[#Index+39]] -; XCOFF32-NEXT: SectionLen: 0 -; XCOFF32-NEXT: ParameterHashIndex: 0x0 -; XCOFF32-NEXT: TypeChkSectNum: 0x0 -; XCOFF32-NEXT: SymbolAlignmentLog2: 2 -; XCOFF32-NEXT: SymbolType: XTY_SD (0x1) -; XCOFF32-NEXT: StorageMappingClass: XMC_TC0 (0xF) -; XCOFF32-NEXT: StabInfoIndex: 0x0 -; XCOFF32-NEXT: StabSectNum: 0x0 -; XCOFF32-NEXT: } -; XCOFF32-NEXT: } -; XCOFF32-NEXT: Symbol { -; XCOFF32-NEXT: Index: [[#Index+40]] -; XCOFF32-NEXT: Name: b_h -; XCOFF32-NEXT: Value (RelocatableAddress): 0xE0 -; XCOFF32-NEXT: Section: .data -; XCOFF32-NEXT: Type: 0x0 -; XCOFF32-NEXT: StorageClass: C_HIDEXT (0x6B) -; XCOFF32-NEXT: NumberOfAuxEntries: 1 -; XCOFF32-NEXT: CSECT Auxiliary Entry { -; XCOFF32-NEXT: Index: [[#Index+41]] -; XCOFF32-NEXT: SectionLen: 4 -; XCOFF32-NEXT: ParameterHashIndex: 0x0 -; XCOFF32-NEXT: TypeChkSectNum: 0x0 -; XCOFF32-NEXT: SymbolAlignmentLog2: 2 -; XCOFF32-NEXT: SymbolType: XTY_SD (0x1) -; XCOFF32-NEXT: StorageMappingClass: XMC_TC (0x3) -; XCOFF32-NEXT: StabInfoIndex: 0x0 -; XCOFF32-NEXT: StabSectNum: 0x0 -; XCOFF32-NEXT: } -; XCOFF32-NEXT: } -; 
XCOFF32-NEXT: Symbol { -; XCOFF32-NEXT: Index: [[#Index+42]] -; XCOFF32-NEXT: Name: foo_p -; XCOFF32-NEXT: Value (RelocatableAddress): 0xE4 -; XCOFF32-NEXT: Section: .data -; XCOFF32-NEXT: Type: 0x0 -; XCOFF32-NEXT: StorageClass: C_HIDEXT (0x6B) -; XCOFF32-NEXT: NumberOfAuxEntries: 1 -; XCOFF32-NEXT: CSECT Auxiliary Entry { -; XCOFF32-NEXT: Index: [[#Index+43]] -; XCOFF32-NEXT: SectionLen: 4 -; XCOFF32-NEXT: ParameterHashIndex: 0x0 -; XCOFF32-NEXT: TypeChkSectNum: 0x0 -; XCOFF32-NEXT: SymbolAlignmentLog2: 2 -; XCOFF32-NEXT: SymbolType: XTY_SD (0x1) -; XCOFF32-NEXT: StorageMappingClass: XMC_TC (0x3) -; XCOFF32-NEXT: StabInfoIndex: 0x0 -; XCOFF32-NEXT: StabSectNum: 0x0 -; XCOFF32-NEXT: } -; XCOFF32-NEXT: } -; XCOFF32-NEXT: ] diff --git a/llvm/test/CodeGen/PowerPC/fast-isel-pcrel.ll b/llvm/test/CodeGen/PowerPC/fast-isel-pcrel.ll index 13fa8a8119bfd..3758f8db10cef 100644 --- a/llvm/test/CodeGen/PowerPC/fast-isel-pcrel.ll +++ b/llvm/test/CodeGen/PowerPC/fast-isel-pcrel.ll @@ -13,17 +13,21 @@ ; Function Attrs: noinline nounwind optnone define internal void @loadFP(double* %d) #0 { ; CHECK-LABEL: loadFP: -; CHECK: # %bb.0: # %entry +; CHECK: .localentry loadFP, 1 +; CHECK-NEXT: # %bb.0: # %entry ; CHECK-NEXT: mflr r0 ; CHECK-NEXT: std r0, 16(r1) ; CHECK-NEXT: stdu r1, -112(r1) ; CHECK-NEXT: std r3, 104(r1) ; CHECK-NEXT: paddi r3, 0, .L.str@PCREL, 1 ; CHECK-NEXT: bl printf@notoc -; CHECK-NEXT: addis r4, r2, .LCPI0_0@toc@ha -; CHECK-NEXT: lfd f0, .LCPI0_0@toc@l(r4) ; CHECK-NEXT: ld r4, 104(r1) -; CHECK-NEXT: stfd f0, 0(r4) +; CHECK-NEXT: lis r5, 16403 +; CHECK-NEXT: ori r5, r5, 62914 +; CHECK-NEXT: sldi r5, r5, 32 +; CHECK-NEXT: oris r5, r5, 36700 +; CHECK-NEXT: ori r5, r5, 10486 +; CHECK-NEXT: std r5, 0(r4) ; CHECK-NEXT: addi r1, r1, 112 ; CHECK-NEXT: ld r0, 16(r1) ; CHECK-NEXT: mtlr r0 @@ -42,21 +46,19 @@ declare signext i32 @printf(i8*, ...) 
; Function Attrs: noinline nounwind optnone define internal void @loadGV() #0 { ; CHECK-LABEL: loadGV: -; CHECK: # %bb.0: # %entry +; CHECK: .localentry loadGV, 1 +; CHECK-NEXT: # %bb.0: # %entry ; CHECK-NEXT: mflr r0 ; CHECK-NEXT: std r0, 16(r1) ; CHECK-NEXT: stdu r1, -112(r1) ; CHECK-NEXT: paddi r3, 0, .L.str.1@PCREL, 1 ; CHECK-NEXT: bl printf@notoc -; CHECK-NEXT: addis r4, r2, .LC0@toc@ha -; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: pld r4, stdout@got@pcrel(0), 1 ; CHECK-NEXT: ld r4, 0(r4) ; CHECK-NEXT: li r5, 97 -; CHECK-NEXT: extsw r5, r5 ; CHECK-NEXT: std r3, 104(r1) # 8-byte Folded Spill ; CHECK-NEXT: mr r3, r5 -; CHECK-NEXT: bl _IO_putc -; CHECK-NEXT: nop +; CHECK-NEXT: bl _IO_putc@notoc ; CHECK-NEXT: addi r1, r1, 112 ; CHECK-NEXT: ld r0, 16(r1) ; CHECK-NEXT: mtlr r0 diff --git a/llvm/test/CodeGen/PowerPC/future-check-features.ll b/llvm/test/CodeGen/PowerPC/future-check-features.ll index 4d9b6e7555924..ce4305ac44c2b 100644 --- a/llvm/test/CodeGen/PowerPC/future-check-features.ll +++ b/llvm/test/CodeGen/PowerPC/future-check-features.ll @@ -1,7 +1,7 @@ -; RUN: llc -mattr=pcrelative-memops,prefix-instrs,paired-vector-memops \ +; RUN: llc -mattr=pcrelative-memops,prefix-instrs,paired-vector-memops,mma \ ; RUN: -verify-machineinstrs -mtriple=powerpc64le-unknown-unknown \ ; RUN: -ppc-asm-full-reg-names %s -o - 2>&1 | FileCheck %s -; RUN: llc -mattr=pcrelative-memops,prefix-instrs,paired-vector-memops \ +; RUN: llc -mattr=pcrelative-memops,prefix-instrs,paired-vector-memops,mma \ ; RUN: -verify-machineinstrs -mtriple=powerpc64-unknown-unknown \ ; RUN: -ppc-asm-full-reg-names %s -o - 2>&1 | FileCheck %s diff --git a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll index 12c9dfec50555..4d339e2383b30 100644 --- a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll +++ b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py ; RUN: llc -ppc-asm-full-reg-names -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 < %s | FileCheck %s ; test_no_prep: @@ -19,8 +20,21 @@ define i64 @test_no_prep(i8* %0, i32 signext %1) { ; CHECK-LABEL: test_no_prep: -; CHECK: addi r3, r3, 4004 -; CHECK: .LBB0_2: # +; CHECK: # %bb.0: +; CHECK-NEXT: cmplwi r4, 0 +; CHECK-NEXT: beq cr0, .LBB0_4 +; CHECK-NEXT: # %bb.1: # %.preheader +; CHECK-NEXT: cmpldi r4, 1 +; CHECK-NEXT: li r5, 1 +; CHECK-NEXT: addi r3, r3, 4004 +; CHECK-NEXT: li r6, -3 +; CHECK-NEXT: li r7, -2 +; CHECK-NEXT: li r8, -1 +; CHECK-NEXT: iselgt r5, r4, r5 +; CHECK-NEXT: mtctr r5 +; CHECK-NEXT: li r5, 0 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB0_2: ; CHECK-NEXT: ldx r9, r3, r6 ; CHECK-NEXT: ldx r10, r3, r7 ; CHECK-NEXT: ldx r11, r3, r8 @@ -30,6 +44,12 @@ define i64 @test_no_prep(i8* %0, i32 signext %1) { ; CHECK-NEXT: mulld r9, r9, r11 ; CHECK-NEXT: maddld r5, r9, r12, r5 ; CHECK-NEXT: bdnz .LBB0_2 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: add r3, r5, r4 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: addi r3, r4, 0 +; CHECK-NEXT: blr %3 = sext i32 %1 to i64 %4 = icmp eq i32 %1, 0 br i1 %4, label %27, label %5 @@ -83,8 +103,19 @@ define i64 @test_no_prep(i8* %0, i32 signext %1) { define i64 @test_ds_prep(i8* %0, i32 signext %1) { ; CHECK-LABEL: test_ds_prep: -; CHECK: addi r6, r3, 4002 -; CHECK: .LBB1_2: # +; CHECK: # %bb.0: +; CHECK-NEXT: cmplwi r4, 0 +; CHECK-NEXT: beq cr0, .LBB1_4 +; CHECK-NEXT: # %bb.1: # %.preheader +; CHECK-NEXT: cmpldi r4, 1 +; CHECK-NEXT: li r5, 1 +; CHECK-NEXT: addi r6, r3, 4002 +; CHECK-NEXT: li r7, -1 +; CHECK-NEXT: iselgt r3, r4, r5 +; CHECK-NEXT: mtctr r3 +; CHECK-NEXT: li r3, 0 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB1_2: ; CHECK-NEXT: ldx r9, r6, r7 ; CHECK-NEXT: ld r10, 0(r6) ; CHECK-NEXT: ldx r11, r6, r5 @@ -95,6 +126,12 @@ define i64 @test_ds_prep(i8* %0, i32 signext %1) { ; CHECK-NEXT: maddld r3, r9, r6, r3 ; CHECK-NEXT: mr r6, r8 ; CHECK-NEXT: 
bdnz .LBB1_2 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: add r3, r3, r4 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB1_4: +; CHECK-NEXT: addi r3, r4, 0 +; CHECK-NEXT: blr %3 = sext i32 %1 to i64 %4 = icmp eq i32 %1, 0 br i1 %4, label %27, label %5 @@ -158,8 +195,28 @@ define i64 @test_ds_prep(i8* %0, i32 signext %1) { define i64 @test_max_number_reminder(i8* %0, i32 signext %1) { ; CHECK-LABEL: test_max_number_reminder: -; CHECK: addi r9, r3, 4002 -; CHECK: .LBB2_2: # +; CHECK: # %bb.0: +; CHECK-NEXT: cmplwi r4, 0 +; CHECK-NEXT: std r25, -56(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r26, -48(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r27, -40(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r28, -32(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r29, -24(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; CHECK-NEXT: beq cr0, .LBB2_3 +; CHECK-NEXT: # %bb.1: # %.preheader +; CHECK-NEXT: cmpldi r4, 1 +; CHECK-NEXT: li r5, 1 +; CHECK-NEXT: addi r9, r3, 4002 +; CHECK-NEXT: li r6, -1 +; CHECK-NEXT: li r7, 3 +; CHECK-NEXT: li r8, 5 +; CHECK-NEXT: li r10, 9 +; CHECK-NEXT: iselgt r3, r4, r5 +; CHECK-NEXT: mtctr r3 +; CHECK-NEXT: li r3, 0 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB2_2: ; CHECK-NEXT: ldx r12, r9, r6 ; CHECK-NEXT: ld r0, 0(r9) ; CHECK-NEXT: ldx r30, r9, r5 @@ -180,6 +237,18 @@ define i64 @test_max_number_reminder(i8* %0, i32 signext %1) { ; CHECK-NEXT: maddld r3, r12, r9, r3 ; CHECK-NEXT: mr r9, r11 ; CHECK-NEXT: bdnz .LBB2_2 +; CHECK-NEXT: b .LBB2_4 +; CHECK-NEXT: .LBB2_3: +; CHECK-NEXT: li r3, 0 +; CHECK-NEXT: .LBB2_4: +; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r29, -24(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r28, -32(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r27, -40(r1) # 8-byte Folded Reload +; CHECK-NEXT: add r3, r3, r4 +; CHECK-NEXT: ld r26, -48(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r25, -56(r1) # 8-byte Folded Reload +; CHECK-NEXT: blr %3 = sext i32 %1 to i64 %4 = icmp eq i32 %1, 0 br i1 
%4, label %47, label %5 @@ -253,8 +322,19 @@ define i64 @test_max_number_reminder(i8* %0, i32 signext %1) { define dso_local i64 @test_update_ds_prep_interact(i8* %0, i32 signext %1) { ; CHECK-LABEL: test_update_ds_prep_interact: -; CHECK: addi r3, r3, 3998 -; CHECK: .LBB3_2: # +; CHECK: # %bb.0: +; CHECK-NEXT: cmplwi r4, 0 +; CHECK-NEXT: beq cr0, .LBB3_4 +; CHECK-NEXT: # %bb.1: # %.preheader +; CHECK-NEXT: cmpldi r4, 1 +; CHECK-NEXT: li r6, 1 +; CHECK-NEXT: addi r3, r3, 3998 +; CHECK-NEXT: li r7, -1 +; CHECK-NEXT: iselgt r5, r4, r6 +; CHECK-NEXT: mtctr r5 +; CHECK-NEXT: li r5, 0 +; CHECK-NEXT: .p2align 5 +; CHECK-NEXT: .LBB3_2: ; CHECK-NEXT: ldu r8, 4(r3) ; CHECK-NEXT: ldx r9, r3, r7 ; CHECK-NEXT: ldx r10, r3, r6 @@ -263,6 +343,12 @@ define dso_local i64 @test_update_ds_prep_interact(i8* %0, i32 signext %1) { ; CHECK-NEXT: mulld r8, r8, r10 ; CHECK-NEXT: maddld r5, r8, r11, r5 ; CHECK-NEXT: bdnz .LBB3_2 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: add r3, r5, r4 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB3_4: +; CHECK-NEXT: addi r3, r4, 0 +; CHECK-NEXT: blr %3 = sext i32 %1 to i64 %4 = icmp eq i32 %1, 0 br i1 %4, label %28, label %5 @@ -317,9 +403,20 @@ define dso_local i64 @test_update_ds_prep_interact(i8* %0, i32 signext %1) { define i64 @test_update_ds_prep_nointeract(i8* %0, i32 signext %1) { ; CHECK-LABEL: test_update_ds_prep_nointeract: -; CHECK: addi r5, r3, 4000 -; CHECK: addi r3, r3, 4003 -; CHECK: .LBB4_2: # +; CHECK: # %bb.0: +; CHECK-NEXT: cmplwi r4, 0 +; CHECK-NEXT: beq cr0, .LBB4_4 +; CHECK-NEXT: # %bb.1: # %.preheader +; CHECK-NEXT: cmpldi r4, 1 +; CHECK-NEXT: li r6, 1 +; CHECK-NEXT: addi r5, r3, 4000 +; CHECK-NEXT: addi r3, r3, 4003 +; CHECK-NEXT: li r7, -1 +; CHECK-NEXT: iselgt r6, r4, r6 +; CHECK-NEXT: mtctr r6 +; CHECK-NEXT: li r6, 0 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB4_2: ; CHECK-NEXT: lbzu r8, 1(r5) ; CHECK-NEXT: ldx r9, r3, r7 ; CHECK-NEXT: ld r10, 0(r3) @@ -329,6 +426,12 @@ define i64 @test_update_ds_prep_nointeract(i8* %0, i32 signext %1) { 
; CHECK-NEXT: mulld r8, r8, r10 ; CHECK-NEXT: maddld r6, r8, r11, r6 ; CHECK-NEXT: bdnz .LBB4_2 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: add r3, r6, r4 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB4_4: +; CHECK-NEXT: addi r3, r4, 0 +; CHECK-NEXT: blr %3 = sext i32 %1 to i64 %4 = icmp eq i32 %1, 0 br i1 %4, label %27, label %5 @@ -386,9 +489,21 @@ define i64 @test_update_ds_prep_nointeract(i8* %0, i32 signext %1) { define dso_local i64 @test_ds_multiple_chains(i8* %0, i8* %1, i32 signext %2) { ; CHECK-LABEL: test_ds_multiple_chains: -; CHECK: addi r3, r3, 4001 -; CHECK: addi r4, r4, 4001 -; CHECK: .LBB5_2: # +; CHECK: # %bb.0: +; CHECK-NEXT: cmplwi r5, 0 +; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; CHECK-NEXT: beq cr0, .LBB5_3 +; CHECK-NEXT: # %bb.1: # %.preheader +; CHECK-NEXT: cmpldi r5, 1 +; CHECK-NEXT: li r6, 1 +; CHECK-NEXT: addi r3, r3, 4001 +; CHECK-NEXT: addi r4, r4, 4001 +; CHECK-NEXT: li r7, 9 +; CHECK-NEXT: iselgt r6, r5, r6 +; CHECK-NEXT: mtctr r6 +; CHECK-NEXT: li r6, 0 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB5_2: ; CHECK-NEXT: ld r8, 0(r3) ; CHECK-NEXT: ldx r9, r3, r7 ; CHECK-NEXT: ld r10, 4(r3) @@ -407,6 +522,13 @@ define dso_local i64 @test_ds_multiple_chains(i8* %0, i8* %1, i32 signext %2) { ; CHECK-NEXT: mulld r8, r8, r30 ; CHECK-NEXT: maddld r6, r8, r9, r6 ; CHECK-NEXT: bdnz .LBB5_2 +; CHECK-NEXT: b .LBB5_4 +; CHECK-NEXT: .LBB5_3: +; CHECK-NEXT: li r6, 0 +; CHECK-NEXT: .LBB5_4: +; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; CHECK-NEXT: add r3, r6, r5 +; CHECK-NEXT: blr %4 = sext i32 %2 to i64 %5 = icmp eq i32 %2, 0 br i1 %5, label %45, label %6 @@ -493,13 +615,43 @@ define dso_local i64 @test_ds_multiple_chains(i8* %0, i8* %1, i32 signext %2) { define i64 @test_ds_cross_basic_blocks(i8* %0, i32 signext %1) { ; CHECK-LABEL: test_ds_cross_basic_blocks: -; CHECK: addi r6, r3, 4009 -; CHECK: .LBB6_2: # +; CHECK: # %bb.0: +; CHECK-NEXT: cmplwi r4, 0 +; CHECK-NEXT: std r26, -48(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r27, -40(r1) 
# 8-byte Folded Spill +; CHECK-NEXT: std r28, -32(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r29, -24(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; CHECK-NEXT: beq cr0, .LBB6_8 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: cmpldi r4, 1 +; CHECK-NEXT: li r7, 1 +; CHECK-NEXT: addi r6, r3, 4009 +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: ld r5, .LC0@toc@l(r5) +; CHECK-NEXT: iselgt r8, r4, r7 +; CHECK-NEXT: lis r4, -21846 +; CHECK-NEXT: li r3, 0 +; CHECK-NEXT: li r9, -7 +; CHECK-NEXT: li r10, -6 +; CHECK-NEXT: li r11, 1 +; CHECK-NEXT: li r12, 1 +; CHECK-NEXT: li r30, 1 +; CHECK-NEXT: ld r5, 0(r5) +; CHECK-NEXT: mtctr r8 +; CHECK-NEXT: li r8, -9 +; CHECK-NEXT: addi r5, r5, -1 +; CHECK-NEXT: ori r4, r4, 43691 +; CHECK-NEXT: li r29, 1 +; CHECK-NEXT: li r28, 1 +; CHECK-NEXT: b .LBB6_4 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB6_2: ; CHECK-NEXT: ldx r0, r6, r8 ; CHECK-NEXT: add r28, r0, r28 ; CHECK-NEXT: ld r0, -8(r6) ; CHECK-NEXT: add r29, r0, r29 -; CHECK-NEXT: .LBB6_3: # +; CHECK-NEXT: .LBB6_3: ; CHECK-NEXT: addi r6, r6, 1 ; CHECK-NEXT: mulld r0, r29, r28 ; CHECK-NEXT: mulld r0, r0, r30 @@ -507,7 +659,7 @@ define i64 @test_ds_cross_basic_blocks(i8* %0, i32 signext %1) { ; CHECK-NEXT: mulld r0, r0, r11 ; CHECK-NEXT: maddld r3, r0, r7, r3 ; CHECK-NEXT: bdz .LBB6_9 -; CHECK-NEXT: .LBB6_4: # +; CHECK-NEXT: .LBB6_4: ; CHECK-NEXT: lbzu r0, 1(r5) ; CHECK-NEXT: mulhwu r27, r0, r4 ; CHECK-NEXT: rlwinm r26, r27, 0, 0, 30 @@ -516,22 +668,32 @@ define i64 @test_ds_cross_basic_blocks(i8* %0, i32 signext %1) { ; CHECK-NEXT: sub r0, r0, r27 ; CHECK-NEXT: cmplwi r0, 1 ; CHECK-NEXT: beq cr0, .LBB6_2 -; CHECK-NEXT: # %bb.5: # +; CHECK-NEXT: # %bb.5: ; CHECK-NEXT: clrlwi r0, r0, 24 ; CHECK-NEXT: cmplwi r0, 2 ; CHECK-NEXT: bne cr0, .LBB6_7 -; CHECK-NEXT: # %bb.6: # +; CHECK-NEXT: # %bb.6: ; CHECK-NEXT: ldx r0, r6, r9 ; CHECK-NEXT: add r30, r0, r30 ; CHECK-NEXT: ld r0, -4(r6) ; CHECK-NEXT: add r12, r0, r12 ; CHECK-NEXT: b .LBB6_3 ; CHECK-NEXT: 
.p2align 4 -; CHECK-NEXT: .LBB6_7: # +; CHECK-NEXT: .LBB6_7: ; CHECK-NEXT: ldx r0, r6, r10 ; CHECK-NEXT: add r11, r0, r11 ; CHECK-NEXT: ld r0, 0(r6) ; CHECK-NEXT: add r7, r0, r7 +; CHECK-NEXT: b .LBB6_3 +; CHECK-NEXT: .LBB6_8: +; CHECK-NEXT: li r3, 0 +; CHECK-NEXT: .LBB6_9: +; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r29, -24(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r28, -32(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r27, -40(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r26, -48(r1) # 8-byte Folded Reload +; CHECK-NEXT: blr %3 = sext i32 %1 to i64 %4 = icmp eq i32 %1, 0 br i1 %4, label %66, label %5 @@ -636,8 +798,17 @@ define i64 @test_ds_cross_basic_blocks(i8* %0, i32 signext %1) { define float @test_ds_float(i8* %0, i32 signext %1) { ; CHECK-LABEL: test_ds_float: -; CHECK: addi r3, r3, 4002 -; CHECK: .LBB7_2: # +; CHECK: # %bb.0: +; CHECK-NEXT: cmpwi r4, 1 +; CHECK-NEXT: blt cr0, .LBB7_4 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: addi r3, r3, 4002 +; CHECK-NEXT: clrldi r4, r4, 32 +; CHECK-NEXT: xxlxor f1, f1, f1 +; CHECK-NEXT: mtctr r4 +; CHECK-NEXT: li r4, -1 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB7_2: ; CHECK-NEXT: lfsx f0, r3, r4 ; CHECK-NEXT: lfs f2, 0(r3) ; CHECK-NEXT: xsmulsp f0, f0, f2 @@ -648,6 +819,11 @@ define float @test_ds_float(i8* %0, i32 signext %1) { ; CHECK-NEXT: xsmulsp f0, f0, f4 ; CHECK-NEXT: xsaddsp f1, f1, f0 ; CHECK-NEXT: bdnz .LBB7_2 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB7_4: +; CHECK-NEXT: xxlxor f1, f1, f1 +; CHECK-NEXT: blr %3 = icmp sgt i32 %1, 0 br i1 %3, label %4, label %28 @@ -704,8 +880,17 @@ define float @test_ds_float(i8* %0, i32 signext %1) { define float @test_ds_combine_float_int(i8* %0, i32 signext %1) { ; CHECK-LABEL: test_ds_combine_float_int: -; CHECK: addi r3, r3, 4002 -; CHECK: .LBB8_2: # +; CHECK: # %bb.0: +; CHECK-NEXT: cmpwi r4, 1 +; CHECK-NEXT: blt cr0, .LBB8_4 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: addi r3, r3, 4002 +; CHECK-NEXT: clrldi r4, r4, 32 +; 
CHECK-NEXT: xxlxor f1, f1, f1 +; CHECK-NEXT: mtctr r4 +; CHECK-NEXT: li r4, -1 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB8_2: ; CHECK-NEXT: lfd f4, 0(r3) ; CHECK-NEXT: lfsx f0, r3, r4 ; CHECK-NEXT: xscvuxdsp f4, f4 @@ -717,6 +902,11 @@ define float @test_ds_combine_float_int(i8* %0, i32 signext %1) { ; CHECK-NEXT: xsmulsp f0, f3, f0 ; CHECK-NEXT: xsaddsp f1, f1, f0 ; CHECK-NEXT: bdnz .LBB8_2 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB8_4: +; CHECK-NEXT: xxlxor f1, f1, f1 +; CHECK-NEXT: blr %3 = icmp sgt i32 %1, 0 br i1 %3, label %4, label %29 @@ -773,9 +963,16 @@ define float @test_ds_combine_float_int(i8* %0, i32 signext %1) { define i64 @test_ds_lwa_prep(i8* %0, i32 signext %1) { ; CHECK-LABEL: test_ds_lwa_prep: -; CHECK: addi r5, r3, 2 -; CHECK: li r6, -1 -; CHECK: .LBB9_2: # +; CHECK: # %bb.0: +; CHECK-NEXT: cmpwi r4, 1 +; CHECK-NEXT: blt cr0, .LBB9_4 +; CHECK-NEXT: # %bb.1: # %.preheader +; CHECK-NEXT: mtctr r4 +; CHECK-NEXT: addi r5, r3, 2 +; CHECK-NEXT: li r3, 0 +; CHECK-NEXT: li r6, -1 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB9_2: ; CHECK-NEXT: lwax r7, r5, r6 ; CHECK-NEXT: lwa r8, 0(r5) ; CHECK-NEXT: lwa r9, 4(r5) @@ -785,6 +982,12 @@ define i64 @test_ds_lwa_prep(i8* %0, i32 signext %1) { ; CHECK-NEXT: mulld r7, r7, r9 ; CHECK-NEXT: maddld r3, r7, r10, r3 ; CHECK-NEXT: bdnz .LBB9_2 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: add r3, r3, r4 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB9_4: +; CHECK-NEXT: addi r3, r4, 0 +; CHECK-NEXT: blr %3 = sext i32 %1 to i64 %4 = icmp sgt i32 %1, 0 diff --git a/llvm/test/CodeGen/PowerPC/p10-vector-divide.ll b/llvm/test/CodeGen/PowerPC/p10-vector-divide.ll index 4ecc3a17fedbe..dc21b4fb49eef 100644 --- a/llvm/test/CodeGen/PowerPC/p10-vector-divide.ll +++ b/llvm/test/CodeGen/PowerPC/p10-vector-divide.ll @@ -49,3 +49,49 @@ entry: %div = sdiv <4 x i32> %a, %b ret <4 x i32> %div } + +; Test the vector divide extended intrinsics. 
+declare <4 x i32> @llvm.ppc.altivec.vdivesw(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.ppc.altivec.vdiveuw(<4 x i32>, <4 x i32>) +declare <2 x i64> @llvm.ppc.altivec.vdivesd(<2 x i64>, <2 x i64>) +declare <2 x i64> @llvm.ppc.altivec.vdiveud(<2 x i64>, <2 x i64>) + +define <4 x i32> @test_vdivesw(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vdivesw: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vdivesw v2, v2, v3 +; CHECK-NEXT: blr +entry: + %div = tail call <4 x i32> @llvm.ppc.altivec.vdivesw(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %div +} + +define <4 x i32> @test_vdiveuw(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vdiveuw: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vdiveuw v2, v2, v3 +; CHECK-NEXT: blr +entry: + %div = tail call <4 x i32> @llvm.ppc.altivec.vdiveuw(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %div +} + +define <2 x i64> @test_vdivesd(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vdivesd: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vdivesd v2, v2, v3 +; CHECK-NEXT: blr +entry: + %div = tail call <2 x i64> @llvm.ppc.altivec.vdivesd(<2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %div +} + +define <2 x i64> @test_vdiveud(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vdiveud: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vdiveud v2, v2, v3 +; CHECK-NEXT: blr +entry: + %div = tail call <2 x i64> @llvm.ppc.altivec.vdiveud(<2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %div +} diff --git a/llvm/test/CodeGen/PowerPC/p10-vector-multiply.ll b/llvm/test/CodeGen/PowerPC/p10-vector-multiply.ll index 75c6d8c24038e..fd58654d0ae1e 100644 --- a/llvm/test/CodeGen/PowerPC/p10-vector-multiply.ll +++ b/llvm/test/CodeGen/PowerPC/p10-vector-multiply.ll @@ -76,3 +76,49 @@ entry: %tr = trunc <4 x i64> %shr to <4 x i32> ret <4 x i32> %tr } + +; Test the vector multiply high intrinsics. 
+declare <4 x i32> @llvm.ppc.altivec.vmulhsw(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.ppc.altivec.vmulhuw(<4 x i32>, <4 x i32>) +declare <2 x i64> @llvm.ppc.altivec.vmulhsd(<2 x i64>, <2 x i64>) +declare <2 x i64> @llvm.ppc.altivec.vmulhud(<2 x i64>, <2 x i64>) + +define <4 x i32> @test_vmulhsw_intrinsic(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vmulhsw_intrinsic: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmulhsw v2, v2, v3 +; CHECK-NEXT: blr +entry: + %mulh = tail call <4 x i32> @llvm.ppc.altivec.vmulhsw(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %mulh +} + +define <4 x i32> @test_vmulhuw_intrinsic(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vmulhuw_intrinsic: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmulhuw v2, v2, v3 +; CHECK-NEXT: blr +entry: + %mulh = tail call <4 x i32> @llvm.ppc.altivec.vmulhuw(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %mulh +} + +define <2 x i64> @test_vmulhsd_intrinsic(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vmulhsd_intrinsic: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmulhsd v2, v2, v3 +; CHECK-NEXT: blr +entry: + %mulh = tail call <2 x i64> @llvm.ppc.altivec.vmulhsd(<2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %mulh +} + +define <2 x i64> @test_vmulhud_intrinsic(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vmulhud_intrinsic: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmulhud v2, v2, v3 +; CHECK-NEXT: blr +entry: + %mulh = tail call <2 x i64> @llvm.ppc.altivec.vmulhud(<2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %mulh +} diff --git a/llvm/test/CodeGen/PowerPC/pr44183.ll b/llvm/test/CodeGen/PowerPC/pr44183.ll index a2cf40521f556..c639d47cdffb5 100644 --- a/llvm/test/CodeGen/PowerPC/pr44183.ll +++ b/llvm/test/CodeGen/PowerPC/pr44183.ll @@ -8,37 +8,33 @@ define void @_ZN1m1nEv(%struct.m.2.5.8.11* %this) local_unnamed_addr nounwind al ; CHECK-LABEL: _ZN1m1nEv: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: mflr r0 -; CHECK-NEXT: std r29, -24(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r30, -16(r1) # 
8-byte Folded Spill ; CHECK-NEXT: std r0, 16(r1) -; CHECK-NEXT: stdu r1, -64(r1) +; CHECK-NEXT: stdu r1, -48(r1) ; CHECK-NEXT: mr r30, r3 -; CHECK-NEXT: li r3, 4 ; CHECK-NEXT: ld r4, 16(r30) ; CHECK-NEXT: ld r5, 8(r30) -; CHECK-NEXT: subfic r29, r3, 64 -; CHECK-NEXT: rldicl r3, r5, 60, 4 -; CHECK-NEXT: sld r4, r4, r29 -; CHECK-NEXT: lwz r5, 36(r30) -; CHECK-NEXT: or r3, r4, r3 -; CHECK-NEXT: rlwinm r3, r3, 31, 0, 0 -; CHECK-NEXT: clrlwi r4, r5, 31 +; CHECK-NEXT: lwz r6, 36(r30) +; CHECK-NEXT: rldicl r5, r5, 60, 4 +; CHECK-NEXT: sldi r4, r4, 60 +; CHECK-NEXT: or r4, r4, r5 +; CHECK-NEXT: rlwinm r3, r4, 31, 0, 0 +; CHECK-NEXT: clrlwi r4, r6, 31 ; CHECK-NEXT: or r4, r4, r3 ; CHECK-NEXT: bl _ZN1llsE1d ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 16(r30) ; CHECK-NEXT: ld r4, 8(r30) ; CHECK-NEXT: rldicl r4, r4, 60, 4 -; CHECK-NEXT: sld r3, r3, r29 +; CHECK-NEXT: sldi r3, r3, 60 ; CHECK-NEXT: or r3, r3, r4 ; CHECK-NEXT: sldi r3, r3, 31 ; CHECK-NEXT: clrldi r4, r3, 32 ; CHECK-NEXT: bl _ZN1llsE1d ; CHECK-NEXT: nop -; CHECK-NEXT: addi r1, r1, 64 +; CHECK-NEXT: addi r1, r1, 48 ; CHECK-NEXT: ld r0, 16(r1) ; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r29, -24(r1) # 8-byte Folded Reload ; CHECK-NEXT: mtlr r0 ; CHECK-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/pr46923.ll b/llvm/test/CodeGen/PowerPC/pr46923.ll new file mode 100644 index 0000000000000..3e9faa60422af --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/pr46923.ll @@ -0,0 +1,29 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-unknown \ +; RUN: -ppc-asm-full-reg-names < %s | FileCheck %s + +@bar = external constant i64, align 8 + +define i1 @foo() { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li r3, 0 +; CHECK-NEXT: isel r3, 0, r3, 4*cr5+lt +; CHECK-NEXT: blr +entry: + br label %next + +next: + br i1 undef, label %true, label %false + +true: + br label %end + +false: + br label 
%end + +end: + %a = phi i1 [ icmp ugt (i64 0, i64 ptrtoint (i64* @bar to i64)), %true ], + [ icmp ugt (i64 0, i64 2), %false ] + ret i1 %a +} diff --git a/llvm/test/CodeGen/PowerPC/vec-trunc2.ll b/llvm/test/CodeGen/PowerPC/vec-trunc2.ll new file mode 100644 index 0000000000000..5eea2389d710c --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/vec-trunc2.ll @@ -0,0 +1,189 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mattr=+vsx -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \ +; RUN: FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mattr=+vsx -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-BE + +define dso_local <8 x i8> @test8x32(i32 %i1, i32 %i2, i32 %i3, i32 %i4, + i32 %i5, i32 %i6, i32 %i7, i32 %i8) { +; CHECK-LABEL: test8x32: +; CHECK: # %bb.0: +; CHECK-NEXT: rldimi r3, r4, 32, 0 +; CHECK-NEXT: rldimi r5, r6, 32, 0 +; CHECK-NEXT: addis r11, r2, .LCPI0_0@toc@ha +; CHECK-NEXT: rldimi r7, r8, 32, 0 +; CHECK-NEXT: rldimi r9, r10, 32, 0 +; CHECK-NEXT: mtfprd f0, r3 +; CHECK-NEXT: addi r3, r11, .LCPI0_0@toc@l +; CHECK-NEXT: mtfprd f1, r5 +; CHECK-NEXT: lvx v4, 0, r3 +; CHECK-NEXT: mtfprd f2, r7 +; CHECK-NEXT: mtfprd f3, r9 +; CHECK-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-NEXT: xxmrghd v3, vs3, vs2 +; CHECK-NEXT: vperm v2, v3, v2, v4 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: test8x32: +; CHECK-BE: # %bb.0: +; CHECK-BE-NEXT: stw r10, -80(r1) +; CHECK-BE-NEXT: stw r9, -96(r1) +; CHECK-BE-NEXT: stw r8, -112(r1) +; CHECK-BE-NEXT: stw r7, -128(r1) +; CHECK-BE-NEXT: stw r6, -16(r1) +; CHECK-BE-NEXT: stw r5, -32(r1) +; CHECK-BE-NEXT: stw r4, -48(r1) +; CHECK-BE-NEXT: stw r3, -64(r1) +; CHECK-BE-NEXT: addi r3, r1, -80 +; CHECK-BE-NEXT: lxvw4x v2, 0, r3 +; CHECK-BE-NEXT: addi r3, r1, -96 +; CHECK-BE-NEXT: lxvw4x v3, 0, r3 +; CHECK-BE-NEXT: addi r3, r1, -112 +; CHECK-BE-NEXT: 
lxvw4x v4, 0, r3 +; CHECK-BE-NEXT: addi r3, r1, -128 +; CHECK-BE-NEXT: lxvw4x v5, 0, r3 +; CHECK-BE-NEXT: addi r3, r1, -16 +; CHECK-BE-NEXT: lxvw4x v0, 0, r3 +; CHECK-BE-NEXT: addi r3, r1, -32 +; CHECK-BE-NEXT: lxvw4x v1, 0, r3 +; CHECK-BE-NEXT: addi r3, r1, -48 +; CHECK-BE-NEXT: lxvw4x v6, 0, r3 +; CHECK-BE-NEXT: addi r3, r1, -64 +; CHECK-BE-NEXT: lxvw4x v7, 0, r3 +; CHECK-BE-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; CHECK-BE-NEXT: vmrghw v2, v3, v2 +; CHECK-BE-NEXT: vmrghw v3, v5, v4 +; CHECK-BE-NEXT: vmrghw v4, v1, v0 +; CHECK-BE-NEXT: addi r3, r3, .LCPI0_0@toc@l +; CHECK-BE-NEXT: xxmrghd v2, v3, v2 +; CHECK-BE-NEXT: lxvw4x v8, 0, r3 +; CHECK-BE-NEXT: vmrghw v5, v7, v6 +; CHECK-BE-NEXT: xxmrghd v3, v5, v4 +; CHECK-BE-NEXT: vperm v2, v3, v2, v8 +; CHECK-BE-NEXT: blr +%v10 = insertelement <8 x i32> undef, i32 %i1, i32 0 +%v11 = insertelement <8 x i32> %v10, i32 %i2, i32 1 +%v12 = insertelement <8 x i32> %v11, i32 %i3, i32 2 +%v13 = insertelement <8 x i32> %v12, i32 %i4, i32 3 +%v14 = insertelement <8 x i32> %v13, i32 %i5, i32 4 +%v15 = insertelement <8 x i32> %v14, i32 %i6, i32 5 +%v16 = insertelement <8 x i32> %v15, i32 %i7, i32 6 +%v17 = insertelement <8 x i32> %v16, i32 %i8, i32 7 +%v2 = trunc <8 x i32> %v17 to <8 x i8> +ret <8 x i8> %v2 +} + +define dso_local <4 x i16> @test4x64(i64 %i1, i64 %i2, i64 %i3, i64 %i4) { +; CHECK-LABEL: test4x64: +; CHECK: # %bb.0: +; CHECK-NEXT: addis r7, r2, .LCPI1_0@toc@ha +; CHECK-NEXT: mtfprd f0, r5 +; CHECK-NEXT: mtfprd f1, r6 +; CHECK-NEXT: mtfprd f2, r3 +; CHECK-NEXT: addi r3, r7, .LCPI1_0@toc@l +; CHECK-NEXT: mtfprd f3, r4 +; CHECK-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-NEXT: lvx v4, 0, r3 +; CHECK-NEXT: xxmrghd v3, vs3, vs2 +; CHECK-NEXT: vperm v2, v2, v3, v4 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: test4x64: +; CHECK-BE: # %bb.0: +; CHECK-BE-NEXT: std r6, -8(r1) +; CHECK-BE-NEXT: std r5, -16(r1) +; CHECK-BE-NEXT: std r4, -24(r1) +; CHECK-BE-NEXT: std r3, -32(r1) +; CHECK-BE-NEXT: addi r3, r1, -32 +; CHECK-BE-NEXT: addis r4, 
r2, .LCPI1_0@toc@ha +; CHECK-BE-NEXT: addi r7, r1, -16 +; CHECK-BE-NEXT: lxvd2x v3, 0, r3 +; CHECK-BE-NEXT: addi r3, r4, .LCPI1_0@toc@l +; CHECK-BE-NEXT: lxvd2x v2, 0, r7 +; CHECK-BE-NEXT: lxvw4x v4, 0, r3 +; CHECK-BE-NEXT: vperm v2, v3, v2, v4 +; CHECK-BE-NEXT: blr +%v10 = insertelement <4 x i64> undef, i64 %i1, i32 0 +%v11 = insertelement <4 x i64> %v10, i64 %i2, i32 1 +%v12 = insertelement <4 x i64> %v11, i64 %i3, i32 2 +%v13 = insertelement <4 x i64> %v12, i64 %i4, i32 3 +%v2 = trunc <4 x i64> %v13 to <4 x i16> +ret <4 x i16> %v2 +} + +define dso_local <8 x i16> @test8x24(i32 %i1, i32 %i2, i32 %i3, i32 %i4, + i32 %i5, i32 %i6, i32 %i7, i32 %i8) { +; CHECK-LABEL: test8x24: +; CHECK: # %bb.0: +; CHECK-NEXT: mtvsrd v2, r3 +; CHECK-NEXT: mtvsrd v3, r4 +; CHECK-NEXT: mtvsrd v4, r5 +; CHECK-NEXT: mtvsrd v5, r6 +; CHECK-NEXT: mtvsrd v0, r7 +; CHECK-NEXT: mtvsrd v1, r8 +; CHECK-NEXT: vmrghh v2, v3, v2 +; CHECK-NEXT: mtvsrd v3, r9 +; CHECK-NEXT: vmrghh v4, v5, v4 +; CHECK-NEXT: mtvsrd v5, r10 +; CHECK-NEXT: vmrghh v0, v1, v0 +; CHECK-NEXT: vmrghh v3, v5, v3 +; CHECK-NEXT: vmrglw v2, v4, v2 +; CHECK-NEXT: vmrglw v3, v3, v0 +; CHECK-NEXT: xxmrgld v2, v3, v2 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: test8x24: +; CHECK-BE: # %bb.0: +; CHECK-BE-NEXT: sth r10, -16(r1) +; CHECK-BE-NEXT: sth r9, -32(r1) +; CHECK-BE-NEXT: sth r8, -48(r1) +; CHECK-BE-NEXT: sth r7, -64(r1) +; CHECK-BE-NEXT: sth r6, -80(r1) +; CHECK-BE-NEXT: sth r5, -96(r1) +; CHECK-BE-NEXT: sth r4, -112(r1) +; CHECK-BE-NEXT: sth r3, -128(r1) +; CHECK-BE-NEXT: addi r3, r1, -16 +; CHECK-BE-NEXT: lxvw4x v2, 0, r3 +; CHECK-BE-NEXT: addi r3, r1, -32 +; CHECK-BE-NEXT: lxvw4x v3, 0, r3 +; CHECK-BE-NEXT: addi r3, r1, -48 +; CHECK-BE-NEXT: lxvw4x v4, 0, r3 +; CHECK-BE-NEXT: addi r3, r1, -64 +; CHECK-BE-NEXT: lxvw4x v5, 0, r3 +; CHECK-BE-NEXT: addi r3, r1, -80 +; CHECK-BE-NEXT: lxvw4x v0, 0, r3 +; CHECK-BE-NEXT: addi r3, r1, -96 +; CHECK-BE-NEXT: lxvw4x v1, 0, r3 +; CHECK-BE-NEXT: addi r3, r1, -112 +; CHECK-BE-NEXT: lxvw4x 
v6, 0, r3 +; CHECK-BE-NEXT: addi r3, r1, -128 +; CHECK-BE-NEXT: lxvw4x v7, 0, r3 +; CHECK-BE-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-NEXT: vmrghh v3, v5, v4 +; CHECK-BE-NEXT: vmrghh v4, v1, v0 +; CHECK-BE-NEXT: vmrghw v2, v3, v2 +; CHECK-BE-NEXT: vmrghh v5, v7, v6 +; CHECK-BE-NEXT: vmrghw v3, v5, v4 +; CHECK-BE-NEXT: xxmrghd v2, v3, v2 +; CHECK-BE-NEXT: blr +%i11 = trunc i32 %i1 to i24 +%i21 = trunc i32 %i2 to i24 +%i31 = trunc i32 %i3 to i24 +%i41 = trunc i32 %i4 to i24 +%i51 = trunc i32 %i5 to i24 +%i61 = trunc i32 %i6 to i24 +%i71 = trunc i32 %i7 to i24 +%i81 = trunc i32 %i8 to i24 +%v10 = insertelement <8 x i24> undef, i24 %i11, i32 0 +%v11 = insertelement <8 x i24> %v10, i24 %i21, i32 1 +%v12 = insertelement <8 x i24> %v11, i24 %i31, i32 2 +%v13 = insertelement <8 x i24> %v12, i24 %i41, i32 3 +%v14 = insertelement <8 x i24> %v13, i24 %i51, i32 4 +%v15 = insertelement <8 x i24> %v14, i24 %i61, i32 5 +%v16 = insertelement <8 x i24> %v15, i24 %i71, i32 6 +%v17 = insertelement <8 x i24> %v16, i24 %i81, i32 7 +%v2 = trunc <8 x i24> %v17 to <8 x i16> +ret <8 x i16> %v2 +} diff --git a/llvm/test/CodeGen/RISCV/atomic-rmw.ll b/llvm/test/CodeGen/RISCV/atomic-rmw.ll index a84d50b5dca12..4b2e19504c40b 100644 --- a/llvm/test/CodeGen/RISCV/atomic-rmw.ll +++ b/llvm/test/CodeGen/RISCV/atomic-rmw.ll @@ -2076,37 +2076,34 @@ define i8 @atomicrmw_max_i8_monotonic(i8 *%a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lbu a3, 0(a0) ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lbu a1, 0(a0) -; RV32I-NEXT: slli a0, s2, 24 -; RV32I-NEXT: srai s0, a0, 24 -; RV32I-NEXT: addi s3, sp, 11 +; RV32I-NEXT: slli a0, a1, 24 +; RV32I-NEXT: srai s1, a0, 24 ; RV32I-NEXT: j .LBB35_2 ; RV32I-NEXT: .LBB35_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB35_2 Depth=1 -; RV32I-NEXT: sb a1, 11(sp) -; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: mv a1, s3 +; 
RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: addi a1, sp, 15 +; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: mv a3, zero ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lb a1, 11(sp) +; RV32I-NEXT: lb a3, 15(sp) ; RV32I-NEXT: bnez a0, .LBB35_4 ; RV32I-NEXT: .LBB35_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a1, 24 +; RV32I-NEXT: slli a0, a3, 24 ; RV32I-NEXT: srai a0, a0, 24 -; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: blt s0, a0, .LBB35_1 +; RV32I-NEXT: mv a2, a3 +; RV32I-NEXT: blt s1, a0, .LBB35_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB35_2 Depth=1 ; RV32I-NEXT: mv a2, s2 ; RV32I-NEXT: j .LBB35_1 ; RV32I-NEXT: .LBB35_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -2151,37 +2148,34 @@ define i8 @atomicrmw_max_i8_monotonic(i8 *%a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: lbu a3, 0(a0) ; RV64I-NEXT: mv s2, a1 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lbu a1, 0(a0) -; RV64I-NEXT: slli a0, s2, 56 -; RV64I-NEXT: srai s0, a0, 56 -; RV64I-NEXT: addi s3, sp, 7 +; RV64I-NEXT: slli a0, a1, 56 +; RV64I-NEXT: srai s1, a0, 56 ; RV64I-NEXT: j .LBB35_2 ; RV64I-NEXT: .LBB35_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB35_2 Depth=1 -; RV64I-NEXT: sb a1, 7(sp) -; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: mv a1, s3 +; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: addi a1, sp, 15 +; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: mv a3, zero ; RV64I-NEXT: mv a4, zero ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lb a1, 7(sp) +; RV64I-NEXT: lb a3, 15(sp) ; RV64I-NEXT: bnez a0, .LBB35_4 ; RV64I-NEXT: .LBB35_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a1, 56 +; 
RV64I-NEXT: slli a0, a3, 56 ; RV64I-NEXT: srai a0, a0, 56 -; RV64I-NEXT: mv a2, a1 -; RV64I-NEXT: blt s0, a0, .LBB35_1 +; RV64I-NEXT: mv a2, a3 +; RV64I-NEXT: blt s1, a0, .LBB35_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB35_2 Depth=1 ; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB35_1 ; RV64I-NEXT: .LBB35_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -2230,37 +2224,34 @@ define i8 @atomicrmw_max_i8_acquire(i8 *%a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lbu a3, 0(a0) ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lbu a1, 0(a0) -; RV32I-NEXT: slli a0, s2, 24 -; RV32I-NEXT: srai s0, a0, 24 -; RV32I-NEXT: addi s3, sp, 11 +; RV32I-NEXT: slli a0, a1, 24 +; RV32I-NEXT: srai s1, a0, 24 ; RV32I-NEXT: j .LBB36_2 ; RV32I-NEXT: .LBB36_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB36_2 Depth=1 -; RV32I-NEXT: sb a1, 11(sp) +; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: addi a1, sp, 15 ; RV32I-NEXT: addi a3, zero, 2 ; RV32I-NEXT: addi a4, zero, 2 -; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lb a1, 11(sp) +; RV32I-NEXT: lb a3, 15(sp) ; RV32I-NEXT: bnez a0, .LBB36_4 ; RV32I-NEXT: .LBB36_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a1, 24 +; RV32I-NEXT: slli a0, a3, 24 ; RV32I-NEXT: srai a0, a0, 24 -; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: blt s0, a0, .LBB36_1 +; RV32I-NEXT: mv a2, a3 +; RV32I-NEXT: blt s1, a0, .LBB36_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB36_2 Depth=1 ; RV32I-NEXT: mv a2, s2 ; RV32I-NEXT: j .LBB36_1 ; RV32I-NEXT: .LBB36_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a1 -; 
RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -2305,37 +2296,34 @@ define i8 @atomicrmw_max_i8_acquire(i8 *%a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: lbu a3, 0(a0) ; RV64I-NEXT: mv s2, a1 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lbu a1, 0(a0) -; RV64I-NEXT: slli a0, s2, 56 -; RV64I-NEXT: srai s0, a0, 56 -; RV64I-NEXT: addi s3, sp, 7 +; RV64I-NEXT: slli a0, a1, 56 +; RV64I-NEXT: srai s1, a0, 56 ; RV64I-NEXT: j .LBB36_2 ; RV64I-NEXT: .LBB36_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB36_2 Depth=1 -; RV64I-NEXT: sb a1, 7(sp) +; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: addi a1, sp, 15 ; RV64I-NEXT: addi a3, zero, 2 ; RV64I-NEXT: addi a4, zero, 2 -; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: mv a1, s3 +; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lb a1, 7(sp) +; RV64I-NEXT: lb a3, 15(sp) ; RV64I-NEXT: bnez a0, .LBB36_4 ; RV64I-NEXT: .LBB36_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a1, 56 +; RV64I-NEXT: slli a0, a3, 56 ; RV64I-NEXT: srai a0, a0, 56 -; RV64I-NEXT: mv a2, a1 -; RV64I-NEXT: blt s0, a0, .LBB36_1 +; RV64I-NEXT: mv a2, a3 +; RV64I-NEXT: blt s1, a0, .LBB36_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB36_2 Depth=1 ; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB36_1 ; RV64I-NEXT: .LBB36_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -2384,37 +2372,34 @@ define i8 @atomicrmw_max_i8_release(i8 *%a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lbu a3, 0(a0) ; 
RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lbu a1, 0(a0) -; RV32I-NEXT: slli a0, s2, 24 -; RV32I-NEXT: srai s0, a0, 24 -; RV32I-NEXT: addi s3, sp, 11 +; RV32I-NEXT: slli a0, a1, 24 +; RV32I-NEXT: srai s1, a0, 24 ; RV32I-NEXT: j .LBB37_2 ; RV32I-NEXT: .LBB37_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB37_2 Depth=1 -; RV32I-NEXT: sb a1, 11(sp) +; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: addi a1, sp, 15 ; RV32I-NEXT: addi a3, zero, 3 -; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lb a1, 11(sp) +; RV32I-NEXT: lb a3, 15(sp) ; RV32I-NEXT: bnez a0, .LBB37_4 ; RV32I-NEXT: .LBB37_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a1, 24 +; RV32I-NEXT: slli a0, a3, 24 ; RV32I-NEXT: srai a0, a0, 24 -; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: blt s0, a0, .LBB37_1 +; RV32I-NEXT: mv a2, a3 +; RV32I-NEXT: blt s1, a0, .LBB37_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB37_2 Depth=1 ; RV32I-NEXT: mv a2, s2 ; RV32I-NEXT: j .LBB37_1 ; RV32I-NEXT: .LBB37_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -2459,37 +2444,34 @@ define i8 @atomicrmw_max_i8_release(i8 *%a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: lbu a3, 0(a0) ; RV64I-NEXT: mv s2, a1 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lbu a1, 0(a0) -; RV64I-NEXT: slli a0, s2, 56 -; RV64I-NEXT: srai s0, a0, 56 -; RV64I-NEXT: addi s3, sp, 7 +; RV64I-NEXT: slli a0, a1, 56 +; RV64I-NEXT: srai s1, a0, 56 ; RV64I-NEXT: j .LBB37_2 ; RV64I-NEXT: .LBB37_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB37_2 Depth=1 -; RV64I-NEXT: sb a1, 7(sp) +; RV64I-NEXT: sb 
a3, 15(sp) +; RV64I-NEXT: addi a1, sp, 15 ; RV64I-NEXT: addi a3, zero, 3 -; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: mv a1, s3 +; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: mv a4, zero ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lb a1, 7(sp) +; RV64I-NEXT: lb a3, 15(sp) ; RV64I-NEXT: bnez a0, .LBB37_4 ; RV64I-NEXT: .LBB37_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a1, 56 +; RV64I-NEXT: slli a0, a3, 56 ; RV64I-NEXT: srai a0, a0, 56 -; RV64I-NEXT: mv a2, a1 -; RV64I-NEXT: blt s0, a0, .LBB37_1 +; RV64I-NEXT: mv a2, a3 +; RV64I-NEXT: blt s1, a0, .LBB37_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB37_2 Depth=1 ; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB37_1 ; RV64I-NEXT: .LBB37_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -2538,37 +2520,34 @@ define i8 @atomicrmw_max_i8_acq_rel(i8 *%a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lbu a3, 0(a0) ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lbu a1, 0(a0) -; RV32I-NEXT: slli a0, s2, 24 -; RV32I-NEXT: srai s0, a0, 24 -; RV32I-NEXT: addi s3, sp, 11 +; RV32I-NEXT: slli a0, a1, 24 +; RV32I-NEXT: srai s1, a0, 24 ; RV32I-NEXT: j .LBB38_2 ; RV32I-NEXT: .LBB38_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB38_2 Depth=1 -; RV32I-NEXT: sb a1, 11(sp) +; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: addi a1, sp, 15 ; RV32I-NEXT: addi a3, zero, 4 ; RV32I-NEXT: addi a4, zero, 2 -; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lb a1, 11(sp) +; RV32I-NEXT: lb a3, 15(sp) ; RV32I-NEXT: bnez a0, .LBB38_4 ; RV32I-NEXT: .LBB38_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop 
Header: Depth=1 -; RV32I-NEXT: slli a0, a1, 24 +; RV32I-NEXT: slli a0, a3, 24 ; RV32I-NEXT: srai a0, a0, 24 -; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: blt s0, a0, .LBB38_1 +; RV32I-NEXT: mv a2, a3 +; RV32I-NEXT: blt s1, a0, .LBB38_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB38_2 Depth=1 ; RV32I-NEXT: mv a2, s2 ; RV32I-NEXT: j .LBB38_1 ; RV32I-NEXT: .LBB38_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -2613,37 +2592,34 @@ define i8 @atomicrmw_max_i8_acq_rel(i8 *%a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: lbu a3, 0(a0) ; RV64I-NEXT: mv s2, a1 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lbu a1, 0(a0) -; RV64I-NEXT: slli a0, s2, 56 -; RV64I-NEXT: srai s0, a0, 56 -; RV64I-NEXT: addi s3, sp, 7 +; RV64I-NEXT: slli a0, a1, 56 +; RV64I-NEXT: srai s1, a0, 56 ; RV64I-NEXT: j .LBB38_2 ; RV64I-NEXT: .LBB38_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB38_2 Depth=1 -; RV64I-NEXT: sb a1, 7(sp) +; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: addi a1, sp, 15 ; RV64I-NEXT: addi a3, zero, 4 ; RV64I-NEXT: addi a4, zero, 2 -; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: mv a1, s3 +; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lb a1, 7(sp) +; RV64I-NEXT: lb a3, 15(sp) ; RV64I-NEXT: bnez a0, .LBB38_4 ; RV64I-NEXT: .LBB38_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a1, 56 +; RV64I-NEXT: slli a0, a3, 56 ; RV64I-NEXT: srai a0, a0, 56 -; RV64I-NEXT: mv a2, a1 -; RV64I-NEXT: blt s0, a0, .LBB38_1 +; RV64I-NEXT: mv a2, a3 +; RV64I-NEXT: blt s1, a0, .LBB38_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB38_2 Depth=1 ; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB38_1 ; RV64I-NEXT: 
.LBB38_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -2692,37 +2668,34 @@ define i8 @atomicrmw_max_i8_seq_cst(i8 *%a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lbu a3, 0(a0) ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lbu a1, 0(a0) -; RV32I-NEXT: slli a0, s2, 24 -; RV32I-NEXT: srai s0, a0, 24 -; RV32I-NEXT: addi s3, sp, 11 +; RV32I-NEXT: slli a0, a1, 24 +; RV32I-NEXT: srai s1, a0, 24 ; RV32I-NEXT: j .LBB39_2 ; RV32I-NEXT: .LBB39_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB39_2 Depth=1 -; RV32I-NEXT: sb a1, 11(sp) +; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: addi a1, sp, 15 ; RV32I-NEXT: addi a3, zero, 5 ; RV32I-NEXT: addi a4, zero, 5 -; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lb a1, 11(sp) +; RV32I-NEXT: lb a3, 15(sp) ; RV32I-NEXT: bnez a0, .LBB39_4 ; RV32I-NEXT: .LBB39_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a1, 24 +; RV32I-NEXT: slli a0, a3, 24 ; RV32I-NEXT: srai a0, a0, 24 -; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: blt s0, a0, .LBB39_1 +; RV32I-NEXT: mv a2, a3 +; RV32I-NEXT: blt s1, a0, .LBB39_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB39_2 Depth=1 ; RV32I-NEXT: mv a2, s2 ; RV32I-NEXT: j .LBB39_1 ; RV32I-NEXT: .LBB39_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -2767,37 +2740,34 @@ define i8 @atomicrmw_max_i8_seq_cst(i8 *%a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) 
+; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: lbu a3, 0(a0) ; RV64I-NEXT: mv s2, a1 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lbu a1, 0(a0) -; RV64I-NEXT: slli a0, s2, 56 -; RV64I-NEXT: srai s0, a0, 56 -; RV64I-NEXT: addi s3, sp, 7 +; RV64I-NEXT: slli a0, a1, 56 +; RV64I-NEXT: srai s1, a0, 56 ; RV64I-NEXT: j .LBB39_2 ; RV64I-NEXT: .LBB39_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB39_2 Depth=1 -; RV64I-NEXT: sb a1, 7(sp) +; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: addi a1, sp, 15 ; RV64I-NEXT: addi a3, zero, 5 ; RV64I-NEXT: addi a4, zero, 5 -; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: mv a1, s3 +; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lb a1, 7(sp) +; RV64I-NEXT: lb a3, 15(sp) ; RV64I-NEXT: bnez a0, .LBB39_4 ; RV64I-NEXT: .LBB39_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a1, 56 +; RV64I-NEXT: slli a0, a3, 56 ; RV64I-NEXT: srai a0, a0, 56 -; RV64I-NEXT: mv a2, a1 -; RV64I-NEXT: blt s0, a0, .LBB39_1 +; RV64I-NEXT: mv a2, a3 +; RV64I-NEXT: blt s1, a0, .LBB39_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB39_2 Depth=1 ; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB39_1 ; RV64I-NEXT: .LBB39_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -2846,37 +2816,34 @@ define i8 @atomicrmw_min_i8_monotonic(i8 *%a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lbu a3, 0(a0) ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lbu a1, 0(a0) -; RV32I-NEXT: slli a0, s2, 24 -; RV32I-NEXT: srai s0, a0, 24 -; RV32I-NEXT: addi s3, sp, 11 +; RV32I-NEXT: slli a0, a1, 24 +; RV32I-NEXT: srai s1, a0, 24 ; RV32I-NEXT: j .LBB40_2 ; RV32I-NEXT: .LBB40_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: 
Header=BB40_2 Depth=1 -; RV32I-NEXT: sb a1, 11(sp) -; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: addi a1, sp, 15 +; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: mv a3, zero ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lb a1, 11(sp) +; RV32I-NEXT: lb a3, 15(sp) ; RV32I-NEXT: bnez a0, .LBB40_4 ; RV32I-NEXT: .LBB40_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a1, 24 +; RV32I-NEXT: slli a0, a3, 24 ; RV32I-NEXT: srai a0, a0, 24 -; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bge s0, a0, .LBB40_1 +; RV32I-NEXT: mv a2, a3 +; RV32I-NEXT: bge s1, a0, .LBB40_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB40_2 Depth=1 ; RV32I-NEXT: mv a2, s2 ; RV32I-NEXT: j .LBB40_1 ; RV32I-NEXT: .LBB40_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -2921,37 +2888,34 @@ define i8 @atomicrmw_min_i8_monotonic(i8 *%a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: lbu a3, 0(a0) ; RV64I-NEXT: mv s2, a1 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lbu a1, 0(a0) -; RV64I-NEXT: slli a0, s2, 56 -; RV64I-NEXT: srai s0, a0, 56 -; RV64I-NEXT: addi s3, sp, 7 +; RV64I-NEXT: slli a0, a1, 56 +; RV64I-NEXT: srai s1, a0, 56 ; RV64I-NEXT: j .LBB40_2 ; RV64I-NEXT: .LBB40_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB40_2 Depth=1 -; RV64I-NEXT: sb a1, 7(sp) -; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: mv a1, s3 +; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: addi a1, sp, 15 +; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: mv a3, zero ; RV64I-NEXT: mv a4, zero ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lb a1, 7(sp) +; RV64I-NEXT: lb a3, 15(sp) ; RV64I-NEXT: bnez a0, .LBB40_4 ; RV64I-NEXT: .LBB40_2: 
# %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a1, 56 +; RV64I-NEXT: slli a0, a3, 56 ; RV64I-NEXT: srai a0, a0, 56 -; RV64I-NEXT: mv a2, a1 -; RV64I-NEXT: bge s0, a0, .LBB40_1 +; RV64I-NEXT: mv a2, a3 +; RV64I-NEXT: bge s1, a0, .LBB40_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB40_2 Depth=1 ; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB40_1 ; RV64I-NEXT: .LBB40_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -3000,37 +2964,34 @@ define i8 @atomicrmw_min_i8_acquire(i8 *%a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lbu a3, 0(a0) ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lbu a1, 0(a0) -; RV32I-NEXT: slli a0, s2, 24 -; RV32I-NEXT: srai s0, a0, 24 -; RV32I-NEXT: addi s3, sp, 11 +; RV32I-NEXT: slli a0, a1, 24 +; RV32I-NEXT: srai s1, a0, 24 ; RV32I-NEXT: j .LBB41_2 ; RV32I-NEXT: .LBB41_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB41_2 Depth=1 -; RV32I-NEXT: sb a1, 11(sp) +; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: addi a1, sp, 15 ; RV32I-NEXT: addi a3, zero, 2 ; RV32I-NEXT: addi a4, zero, 2 -; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lb a1, 11(sp) +; RV32I-NEXT: lb a3, 15(sp) ; RV32I-NEXT: bnez a0, .LBB41_4 ; RV32I-NEXT: .LBB41_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a1, 24 +; RV32I-NEXT: slli a0, a3, 24 ; RV32I-NEXT: srai a0, a0, 24 -; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bge s0, a0, .LBB41_1 +; RV32I-NEXT: mv a2, a3 +; RV32I-NEXT: bge s1, a0, .LBB41_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB41_2 Depth=1 ; RV32I-NEXT: 
mv a2, s2 ; RV32I-NEXT: j .LBB41_1 ; RV32I-NEXT: .LBB41_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -3075,37 +3036,34 @@ define i8 @atomicrmw_min_i8_acquire(i8 *%a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: lbu a3, 0(a0) ; RV64I-NEXT: mv s2, a1 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lbu a1, 0(a0) -; RV64I-NEXT: slli a0, s2, 56 -; RV64I-NEXT: srai s0, a0, 56 -; RV64I-NEXT: addi s3, sp, 7 +; RV64I-NEXT: slli a0, a1, 56 +; RV64I-NEXT: srai s1, a0, 56 ; RV64I-NEXT: j .LBB41_2 ; RV64I-NEXT: .LBB41_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB41_2 Depth=1 -; RV64I-NEXT: sb a1, 7(sp) +; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: addi a1, sp, 15 ; RV64I-NEXT: addi a3, zero, 2 ; RV64I-NEXT: addi a4, zero, 2 -; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: mv a1, s3 +; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lb a1, 7(sp) +; RV64I-NEXT: lb a3, 15(sp) ; RV64I-NEXT: bnez a0, .LBB41_4 ; RV64I-NEXT: .LBB41_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a1, 56 +; RV64I-NEXT: slli a0, a3, 56 ; RV64I-NEXT: srai a0, a0, 56 -; RV64I-NEXT: mv a2, a1 -; RV64I-NEXT: bge s0, a0, .LBB41_1 +; RV64I-NEXT: mv a2, a3 +; RV64I-NEXT: bge s1, a0, .LBB41_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB41_2 Depth=1 ; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB41_1 ; RV64I-NEXT: .LBB41_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -3154,37 +3112,34 @@ define i8 @atomicrmw_min_i8_release(i8 *%a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; 
RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lbu a3, 0(a0) ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lbu a1, 0(a0) -; RV32I-NEXT: slli a0, s2, 24 -; RV32I-NEXT: srai s0, a0, 24 -; RV32I-NEXT: addi s3, sp, 11 +; RV32I-NEXT: slli a0, a1, 24 +; RV32I-NEXT: srai s1, a0, 24 ; RV32I-NEXT: j .LBB42_2 ; RV32I-NEXT: .LBB42_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB42_2 Depth=1 -; RV32I-NEXT: sb a1, 11(sp) +; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: addi a1, sp, 15 ; RV32I-NEXT: addi a3, zero, 3 -; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lb a1, 11(sp) +; RV32I-NEXT: lb a3, 15(sp) ; RV32I-NEXT: bnez a0, .LBB42_4 ; RV32I-NEXT: .LBB42_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a1, 24 +; RV32I-NEXT: slli a0, a3, 24 ; RV32I-NEXT: srai a0, a0, 24 -; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bge s0, a0, .LBB42_1 +; RV32I-NEXT: mv a2, a3 +; RV32I-NEXT: bge s1, a0, .LBB42_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB42_2 Depth=1 ; RV32I-NEXT: mv a2, s2 ; RV32I-NEXT: j .LBB42_1 ; RV32I-NEXT: .LBB42_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -3229,37 +3184,34 @@ define i8 @atomicrmw_min_i8_release(i8 *%a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: lbu a3, 0(a0) ; RV64I-NEXT: mv s2, a1 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lbu a1, 0(a0) -; RV64I-NEXT: slli a0, s2, 56 -; RV64I-NEXT: srai s0, a0, 56 -; RV64I-NEXT: addi s3, sp, 7 +; RV64I-NEXT: slli a0, a1, 56 +; RV64I-NEXT: srai s1, a0, 56 ; RV64I-NEXT: j .LBB42_2 ; RV64I-NEXT: .LBB42_1: # 
%atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB42_2 Depth=1 -; RV64I-NEXT: sb a1, 7(sp) +; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: addi a1, sp, 15 ; RV64I-NEXT: addi a3, zero, 3 -; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: mv a1, s3 +; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: mv a4, zero ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lb a1, 7(sp) +; RV64I-NEXT: lb a3, 15(sp) ; RV64I-NEXT: bnez a0, .LBB42_4 ; RV64I-NEXT: .LBB42_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a1, 56 +; RV64I-NEXT: slli a0, a3, 56 ; RV64I-NEXT: srai a0, a0, 56 -; RV64I-NEXT: mv a2, a1 -; RV64I-NEXT: bge s0, a0, .LBB42_1 +; RV64I-NEXT: mv a2, a3 +; RV64I-NEXT: bge s1, a0, .LBB42_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB42_2 Depth=1 ; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB42_1 ; RV64I-NEXT: .LBB42_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -3308,37 +3260,34 @@ define i8 @atomicrmw_min_i8_acq_rel(i8 *%a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lbu a3, 0(a0) ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lbu a1, 0(a0) -; RV32I-NEXT: slli a0, s2, 24 -; RV32I-NEXT: srai s0, a0, 24 -; RV32I-NEXT: addi s3, sp, 11 +; RV32I-NEXT: slli a0, a1, 24 +; RV32I-NEXT: srai s1, a0, 24 ; RV32I-NEXT: j .LBB43_2 ; RV32I-NEXT: .LBB43_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB43_2 Depth=1 -; RV32I-NEXT: sb a1, 11(sp) +; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: addi a1, sp, 15 ; RV32I-NEXT: addi a3, zero, 4 ; RV32I-NEXT: addi a4, zero, 2 -; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lb a1, 11(sp) +; RV32I-NEXT: lb a3, 15(sp) 
; RV32I-NEXT: bnez a0, .LBB43_4 ; RV32I-NEXT: .LBB43_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a1, 24 +; RV32I-NEXT: slli a0, a3, 24 ; RV32I-NEXT: srai a0, a0, 24 -; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bge s0, a0, .LBB43_1 +; RV32I-NEXT: mv a2, a3 +; RV32I-NEXT: bge s1, a0, .LBB43_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB43_2 Depth=1 ; RV32I-NEXT: mv a2, s2 ; RV32I-NEXT: j .LBB43_1 ; RV32I-NEXT: .LBB43_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -3383,37 +3332,34 @@ define i8 @atomicrmw_min_i8_acq_rel(i8 *%a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: lbu a3, 0(a0) ; RV64I-NEXT: mv s2, a1 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lbu a1, 0(a0) -; RV64I-NEXT: slli a0, s2, 56 -; RV64I-NEXT: srai s0, a0, 56 -; RV64I-NEXT: addi s3, sp, 7 +; RV64I-NEXT: slli a0, a1, 56 +; RV64I-NEXT: srai s1, a0, 56 ; RV64I-NEXT: j .LBB43_2 ; RV64I-NEXT: .LBB43_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB43_2 Depth=1 -; RV64I-NEXT: sb a1, 7(sp) +; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: addi a1, sp, 15 ; RV64I-NEXT: addi a3, zero, 4 ; RV64I-NEXT: addi a4, zero, 2 -; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: mv a1, s3 +; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lb a1, 7(sp) +; RV64I-NEXT: lb a3, 15(sp) ; RV64I-NEXT: bnez a0, .LBB43_4 ; RV64I-NEXT: .LBB43_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a1, 56 +; RV64I-NEXT: slli a0, a3, 56 ; RV64I-NEXT: srai a0, a0, 56 -; RV64I-NEXT: mv a2, a1 -; RV64I-NEXT: bge s0, a0, .LBB43_1 +; RV64I-NEXT: mv a2, a3 +; RV64I-NEXT: bge s1, a0, .LBB43_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; 
RV64I-NEXT: # in Loop: Header=BB43_2 Depth=1 ; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB43_1 ; RV64I-NEXT: .LBB43_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -3462,37 +3408,34 @@ define i8 @atomicrmw_min_i8_seq_cst(i8 *%a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lbu a3, 0(a0) ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lbu a1, 0(a0) -; RV32I-NEXT: slli a0, s2, 24 -; RV32I-NEXT: srai s0, a0, 24 -; RV32I-NEXT: addi s3, sp, 11 +; RV32I-NEXT: slli a0, a1, 24 +; RV32I-NEXT: srai s1, a0, 24 ; RV32I-NEXT: j .LBB44_2 ; RV32I-NEXT: .LBB44_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB44_2 Depth=1 -; RV32I-NEXT: sb a1, 11(sp) +; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: addi a1, sp, 15 ; RV32I-NEXT: addi a3, zero, 5 ; RV32I-NEXT: addi a4, zero, 5 -; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lb a1, 11(sp) +; RV32I-NEXT: lb a3, 15(sp) ; RV32I-NEXT: bnez a0, .LBB44_4 ; RV32I-NEXT: .LBB44_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a1, 24 +; RV32I-NEXT: slli a0, a3, 24 ; RV32I-NEXT: srai a0, a0, 24 -; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bge s0, a0, .LBB44_1 +; RV32I-NEXT: mv a2, a3 +; RV32I-NEXT: bge s1, a0, .LBB44_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB44_2 Depth=1 ; RV32I-NEXT: mv a2, s2 ; RV32I-NEXT: j .LBB44_1 ; RV32I-NEXT: .LBB44_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -3537,37 +3480,34 @@ define i8 @atomicrmw_min_i8_seq_cst(i8 *%a, i8 %b) nounwind { ; 
RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: lbu a3, 0(a0) ; RV64I-NEXT: mv s2, a1 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lbu a1, 0(a0) -; RV64I-NEXT: slli a0, s2, 56 -; RV64I-NEXT: srai s0, a0, 56 -; RV64I-NEXT: addi s3, sp, 7 +; RV64I-NEXT: slli a0, a1, 56 +; RV64I-NEXT: srai s1, a0, 56 ; RV64I-NEXT: j .LBB44_2 ; RV64I-NEXT: .LBB44_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB44_2 Depth=1 -; RV64I-NEXT: sb a1, 7(sp) +; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: addi a1, sp, 15 ; RV64I-NEXT: addi a3, zero, 5 ; RV64I-NEXT: addi a4, zero, 5 -; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: mv a1, s3 +; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lb a1, 7(sp) +; RV64I-NEXT: lb a3, 15(sp) ; RV64I-NEXT: bnez a0, .LBB44_4 ; RV64I-NEXT: .LBB44_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a1, 56 +; RV64I-NEXT: slli a0, a3, 56 ; RV64I-NEXT: srai a0, a0, 56 -; RV64I-NEXT: mv a2, a1 -; RV64I-NEXT: bge s0, a0, .LBB44_1 +; RV64I-NEXT: mv a2, a3 +; RV64I-NEXT: bge s1, a0, .LBB44_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB44_2 Depth=1 ; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB44_1 ; RV64I-NEXT: .LBB44_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -3616,22 +3556,20 @@ define i8 @atomicrmw_umax_i8_monotonic(i8 *%a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: mv s3, a1 +; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: andi s1, a1, 255 -; RV32I-NEXT: addi s2, sp, 11 ; RV32I-NEXT: j .LBB45_2 ; RV32I-NEXT: .LBB45_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB45_2 
Depth=1 -; RV32I-NEXT: sb a3, 11(sp) +; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: addi a1, sp, 15 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: mv a3, zero ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lb a3, 11(sp) +; RV32I-NEXT: lb a3, 15(sp) ; RV32I-NEXT: bnez a0, .LBB45_4 ; RV32I-NEXT: .LBB45_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -3640,11 +3578,10 @@ define i8 @atomicrmw_umax_i8_monotonic(i8 *%a, i8 %b) nounwind { ; RV32I-NEXT: bltu s1, a0, .LBB45_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB45_2 Depth=1 -; RV32I-NEXT: mv a2, s3 +; RV32I-NEXT: mv a2, s2 ; RV32I-NEXT: j .LBB45_1 ; RV32I-NEXT: .LBB45_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a3 -; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -3684,22 +3621,20 @@ define i8 @atomicrmw_umax_i8_monotonic(i8 *%a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: mv s3, a1 +; RV64I-NEXT: mv s2, a1 ; RV64I-NEXT: andi s1, a1, 255 -; RV64I-NEXT: addi s2, sp, 7 ; RV64I-NEXT: j .LBB45_2 ; RV64I-NEXT: .LBB45_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB45_2 Depth=1 -; RV64I-NEXT: sb a3, 7(sp) +; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: addi a1, sp, 15 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: mv a3, zero ; RV64I-NEXT: mv a4, zero ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lb a3, 7(sp) +; RV64I-NEXT: lb a3, 15(sp) ; RV64I-NEXT: bnez a0, .LBB45_4 ; RV64I-NEXT: .LBB45_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -3708,11 +3643,10 @@ define i8 @atomicrmw_umax_i8_monotonic(i8 *%a, i8 %b) nounwind { ; RV64I-NEXT: bltu s1, a0, .LBB45_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB45_2 
Depth=1 -; RV64I-NEXT: mv a2, s3 +; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB45_1 ; RV64I-NEXT: .LBB45_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -3756,22 +3690,20 @@ define i8 @atomicrmw_umax_i8_acquire(i8 *%a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: mv s3, a1 +; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: andi s1, a1, 255 -; RV32I-NEXT: addi s2, sp, 11 ; RV32I-NEXT: j .LBB46_2 ; RV32I-NEXT: .LBB46_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB46_2 Depth=1 -; RV32I-NEXT: sb a3, 11(sp) +; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: addi a1, sp, 15 ; RV32I-NEXT: addi a3, zero, 2 ; RV32I-NEXT: addi a4, zero, 2 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lb a3, 11(sp) +; RV32I-NEXT: lb a3, 15(sp) ; RV32I-NEXT: bnez a0, .LBB46_4 ; RV32I-NEXT: .LBB46_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -3780,11 +3712,10 @@ define i8 @atomicrmw_umax_i8_acquire(i8 *%a, i8 %b) nounwind { ; RV32I-NEXT: bltu s1, a0, .LBB46_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB46_2 Depth=1 -; RV32I-NEXT: mv a2, s3 +; RV32I-NEXT: mv a2, s2 ; RV32I-NEXT: j .LBB46_1 ; RV32I-NEXT: .LBB46_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a3 -; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -3824,22 +3755,20 @@ define i8 @atomicrmw_umax_i8_acquire(i8 *%a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: mv s3, a1 +; RV64I-NEXT: mv s2, a1 ; RV64I-NEXT: andi s1, a1, 255 -; RV64I-NEXT: addi s2, sp, 7 ; 
RV64I-NEXT: j .LBB46_2 ; RV64I-NEXT: .LBB46_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB46_2 Depth=1 -; RV64I-NEXT: sb a3, 7(sp) +; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: addi a1, sp, 15 ; RV64I-NEXT: addi a3, zero, 2 ; RV64I-NEXT: addi a4, zero, 2 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lb a3, 7(sp) +; RV64I-NEXT: lb a3, 15(sp) ; RV64I-NEXT: bnez a0, .LBB46_4 ; RV64I-NEXT: .LBB46_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -3848,11 +3777,10 @@ define i8 @atomicrmw_umax_i8_acquire(i8 *%a, i8 %b) nounwind { ; RV64I-NEXT: bltu s1, a0, .LBB46_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB46_2 Depth=1 -; RV64I-NEXT: mv a2, s3 +; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB46_1 ; RV64I-NEXT: .LBB46_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -3896,22 +3824,20 @@ define i8 @atomicrmw_umax_i8_release(i8 *%a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: mv s3, a1 +; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: andi s1, a1, 255 -; RV32I-NEXT: addi s2, sp, 11 ; RV32I-NEXT: j .LBB47_2 ; RV32I-NEXT: .LBB47_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB47_2 Depth=1 -; RV32I-NEXT: sb a3, 11(sp) +; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: addi a1, sp, 15 ; RV32I-NEXT: addi a3, zero, 3 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lb a3, 11(sp) +; RV32I-NEXT: lb a3, 15(sp) ; RV32I-NEXT: bnez a0, .LBB47_4 ; RV32I-NEXT: .LBB47_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -3920,11 +3846,10 @@ define i8 @atomicrmw_umax_i8_release(i8 *%a, i8 %b) nounwind { ; 
RV32I-NEXT: bltu s1, a0, .LBB47_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB47_2 Depth=1 -; RV32I-NEXT: mv a2, s3 +; RV32I-NEXT: mv a2, s2 ; RV32I-NEXT: j .LBB47_1 ; RV32I-NEXT: .LBB47_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a3 -; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -3964,22 +3889,20 @@ define i8 @atomicrmw_umax_i8_release(i8 *%a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: mv s3, a1 +; RV64I-NEXT: mv s2, a1 ; RV64I-NEXT: andi s1, a1, 255 -; RV64I-NEXT: addi s2, sp, 7 ; RV64I-NEXT: j .LBB47_2 ; RV64I-NEXT: .LBB47_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB47_2 Depth=1 -; RV64I-NEXT: sb a3, 7(sp) +; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: addi a1, sp, 15 ; RV64I-NEXT: addi a3, zero, 3 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: mv a4, zero ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lb a3, 7(sp) +; RV64I-NEXT: lb a3, 15(sp) ; RV64I-NEXT: bnez a0, .LBB47_4 ; RV64I-NEXT: .LBB47_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -3988,11 +3911,10 @@ define i8 @atomicrmw_umax_i8_release(i8 *%a, i8 %b) nounwind { ; RV64I-NEXT: bltu s1, a0, .LBB47_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB47_2 Depth=1 -; RV64I-NEXT: mv a2, s3 +; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB47_1 ; RV64I-NEXT: .LBB47_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -4036,22 +3958,20 @@ define i8 @atomicrmw_umax_i8_acq_rel(i8 *%a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lbu a3, 0(a0) -; 
RV32I-NEXT: mv s3, a1 +; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: andi s1, a1, 255 -; RV32I-NEXT: addi s2, sp, 11 ; RV32I-NEXT: j .LBB48_2 ; RV32I-NEXT: .LBB48_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB48_2 Depth=1 -; RV32I-NEXT: sb a3, 11(sp) +; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: addi a1, sp, 15 ; RV32I-NEXT: addi a3, zero, 4 ; RV32I-NEXT: addi a4, zero, 2 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lb a3, 11(sp) +; RV32I-NEXT: lb a3, 15(sp) ; RV32I-NEXT: bnez a0, .LBB48_4 ; RV32I-NEXT: .LBB48_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -4060,11 +3980,10 @@ define i8 @atomicrmw_umax_i8_acq_rel(i8 *%a, i8 %b) nounwind { ; RV32I-NEXT: bltu s1, a0, .LBB48_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB48_2 Depth=1 -; RV32I-NEXT: mv a2, s3 +; RV32I-NEXT: mv a2, s2 ; RV32I-NEXT: j .LBB48_1 ; RV32I-NEXT: .LBB48_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a3 -; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -4104,22 +4023,20 @@ define i8 @atomicrmw_umax_i8_acq_rel(i8 *%a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: mv s3, a1 +; RV64I-NEXT: mv s2, a1 ; RV64I-NEXT: andi s1, a1, 255 -; RV64I-NEXT: addi s2, sp, 7 ; RV64I-NEXT: j .LBB48_2 ; RV64I-NEXT: .LBB48_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB48_2 Depth=1 -; RV64I-NEXT: sb a3, 7(sp) +; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: addi a1, sp, 15 ; RV64I-NEXT: addi a3, zero, 4 ; RV64I-NEXT: addi a4, zero, 2 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lb a3, 7(sp) +; RV64I-NEXT: lb a3, 15(sp) ; RV64I-NEXT: bnez a0, .LBB48_4 ; RV64I-NEXT: .LBB48_2: # %atomicrmw.start ; RV64I-NEXT: # =>This 
Inner Loop Header: Depth=1 @@ -4128,11 +4045,10 @@ define i8 @atomicrmw_umax_i8_acq_rel(i8 *%a, i8 %b) nounwind { ; RV64I-NEXT: bltu s1, a0, .LBB48_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB48_2 Depth=1 -; RV64I-NEXT: mv a2, s3 +; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB48_1 ; RV64I-NEXT: .LBB48_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -4176,22 +4092,20 @@ define i8 @atomicrmw_umax_i8_seq_cst(i8 *%a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: mv s3, a1 +; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: andi s1, a1, 255 -; RV32I-NEXT: addi s2, sp, 11 ; RV32I-NEXT: j .LBB49_2 ; RV32I-NEXT: .LBB49_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB49_2 Depth=1 -; RV32I-NEXT: sb a3, 11(sp) +; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: addi a1, sp, 15 ; RV32I-NEXT: addi a3, zero, 5 ; RV32I-NEXT: addi a4, zero, 5 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lb a3, 11(sp) +; RV32I-NEXT: lb a3, 15(sp) ; RV32I-NEXT: bnez a0, .LBB49_4 ; RV32I-NEXT: .LBB49_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -4200,11 +4114,10 @@ define i8 @atomicrmw_umax_i8_seq_cst(i8 *%a, i8 %b) nounwind { ; RV32I-NEXT: bltu s1, a0, .LBB49_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB49_2 Depth=1 -; RV32I-NEXT: mv a2, s3 +; RV32I-NEXT: mv a2, s2 ; RV32I-NEXT: j .LBB49_1 ; RV32I-NEXT: .LBB49_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a3 -; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -4244,22 +4157,20 @@ define i8 @atomicrmw_umax_i8_seq_cst(i8 *%a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd 
s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: mv s3, a1 +; RV64I-NEXT: mv s2, a1 ; RV64I-NEXT: andi s1, a1, 255 -; RV64I-NEXT: addi s2, sp, 7 ; RV64I-NEXT: j .LBB49_2 ; RV64I-NEXT: .LBB49_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB49_2 Depth=1 -; RV64I-NEXT: sb a3, 7(sp) +; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: addi a1, sp, 15 ; RV64I-NEXT: addi a3, zero, 5 ; RV64I-NEXT: addi a4, zero, 5 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lb a3, 7(sp) +; RV64I-NEXT: lb a3, 15(sp) ; RV64I-NEXT: bnez a0, .LBB49_4 ; RV64I-NEXT: .LBB49_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -4268,11 +4179,10 @@ define i8 @atomicrmw_umax_i8_seq_cst(i8 *%a, i8 %b) nounwind { ; RV64I-NEXT: bltu s1, a0, .LBB49_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB49_2 Depth=1 -; RV64I-NEXT: mv a2, s3 +; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB49_1 ; RV64I-NEXT: .LBB49_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -4316,22 +4226,20 @@ define i8 @atomicrmw_umin_i8_monotonic(i8 *%a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: mv s3, a1 +; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: andi s1, a1, 255 -; RV32I-NEXT: addi s2, sp, 11 ; RV32I-NEXT: j .LBB50_2 ; RV32I-NEXT: .LBB50_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB50_2 Depth=1 -; RV32I-NEXT: sb a3, 11(sp) +; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: addi a1, sp, 15 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: mv a3, zero ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lb a3, 11(sp) +; RV32I-NEXT: 
lb a3, 15(sp) ; RV32I-NEXT: bnez a0, .LBB50_4 ; RV32I-NEXT: .LBB50_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -4340,11 +4248,10 @@ define i8 @atomicrmw_umin_i8_monotonic(i8 *%a, i8 %b) nounwind { ; RV32I-NEXT: bgeu s1, a0, .LBB50_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB50_2 Depth=1 -; RV32I-NEXT: mv a2, s3 +; RV32I-NEXT: mv a2, s2 ; RV32I-NEXT: j .LBB50_1 ; RV32I-NEXT: .LBB50_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a3 -; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -4384,22 +4291,20 @@ define i8 @atomicrmw_umin_i8_monotonic(i8 *%a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: mv s3, a1 +; RV64I-NEXT: mv s2, a1 ; RV64I-NEXT: andi s1, a1, 255 -; RV64I-NEXT: addi s2, sp, 7 ; RV64I-NEXT: j .LBB50_2 ; RV64I-NEXT: .LBB50_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB50_2 Depth=1 -; RV64I-NEXT: sb a3, 7(sp) +; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: addi a1, sp, 15 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: mv a3, zero ; RV64I-NEXT: mv a4, zero ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lb a3, 7(sp) +; RV64I-NEXT: lb a3, 15(sp) ; RV64I-NEXT: bnez a0, .LBB50_4 ; RV64I-NEXT: .LBB50_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -4408,11 +4313,10 @@ define i8 @atomicrmw_umin_i8_monotonic(i8 *%a, i8 %b) nounwind { ; RV64I-NEXT: bgeu s1, a0, .LBB50_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB50_2 Depth=1 -; RV64I-NEXT: mv a2, s3 +; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB50_1 ; RV64I-NEXT: .LBB50_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -4456,22 +4360,20 @@ define 
i8 @atomicrmw_umin_i8_acquire(i8 *%a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: mv s3, a1 +; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: andi s1, a1, 255 -; RV32I-NEXT: addi s2, sp, 11 ; RV32I-NEXT: j .LBB51_2 ; RV32I-NEXT: .LBB51_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB51_2 Depth=1 -; RV32I-NEXT: sb a3, 11(sp) +; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: addi a1, sp, 15 ; RV32I-NEXT: addi a3, zero, 2 ; RV32I-NEXT: addi a4, zero, 2 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lb a3, 11(sp) +; RV32I-NEXT: lb a3, 15(sp) ; RV32I-NEXT: bnez a0, .LBB51_4 ; RV32I-NEXT: .LBB51_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -4480,11 +4382,10 @@ define i8 @atomicrmw_umin_i8_acquire(i8 *%a, i8 %b) nounwind { ; RV32I-NEXT: bgeu s1, a0, .LBB51_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB51_2 Depth=1 -; RV32I-NEXT: mv a2, s3 +; RV32I-NEXT: mv a2, s2 ; RV32I-NEXT: j .LBB51_1 ; RV32I-NEXT: .LBB51_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a3 -; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -4524,22 +4425,20 @@ define i8 @atomicrmw_umin_i8_acquire(i8 *%a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: mv s3, a1 +; RV64I-NEXT: mv s2, a1 ; RV64I-NEXT: andi s1, a1, 255 -; RV64I-NEXT: addi s2, sp, 7 ; RV64I-NEXT: j .LBB51_2 ; RV64I-NEXT: .LBB51_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB51_2 Depth=1 -; RV64I-NEXT: sb a3, 7(sp) +; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: addi a1, sp, 15 ; RV64I-NEXT: addi a3, zero, 2 ; RV64I-NEXT: addi a4, zero, 2 ; RV64I-NEXT: mv a0, s0 -; 
RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lb a3, 7(sp) +; RV64I-NEXT: lb a3, 15(sp) ; RV64I-NEXT: bnez a0, .LBB51_4 ; RV64I-NEXT: .LBB51_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -4548,11 +4447,10 @@ define i8 @atomicrmw_umin_i8_acquire(i8 *%a, i8 %b) nounwind { ; RV64I-NEXT: bgeu s1, a0, .LBB51_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB51_2 Depth=1 -; RV64I-NEXT: mv a2, s3 +; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB51_1 ; RV64I-NEXT: .LBB51_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -4596,22 +4494,20 @@ define i8 @atomicrmw_umin_i8_release(i8 *%a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: mv s3, a1 +; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: andi s1, a1, 255 -; RV32I-NEXT: addi s2, sp, 11 ; RV32I-NEXT: j .LBB52_2 ; RV32I-NEXT: .LBB52_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB52_2 Depth=1 -; RV32I-NEXT: sb a3, 11(sp) +; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: addi a1, sp, 15 ; RV32I-NEXT: addi a3, zero, 3 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lb a3, 11(sp) +; RV32I-NEXT: lb a3, 15(sp) ; RV32I-NEXT: bnez a0, .LBB52_4 ; RV32I-NEXT: .LBB52_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -4620,11 +4516,10 @@ define i8 @atomicrmw_umin_i8_release(i8 *%a, i8 %b) nounwind { ; RV32I-NEXT: bgeu s1, a0, .LBB52_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB52_2 Depth=1 -; RV32I-NEXT: mv a2, s3 +; RV32I-NEXT: mv a2, s2 ; RV32I-NEXT: j .LBB52_1 ; RV32I-NEXT: .LBB52_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a3 -; RV32I-NEXT: lw s3, 12(sp) 
; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -4664,22 +4559,20 @@ define i8 @atomicrmw_umin_i8_release(i8 *%a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: mv s3, a1 +; RV64I-NEXT: mv s2, a1 ; RV64I-NEXT: andi s1, a1, 255 -; RV64I-NEXT: addi s2, sp, 7 ; RV64I-NEXT: j .LBB52_2 ; RV64I-NEXT: .LBB52_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB52_2 Depth=1 -; RV64I-NEXT: sb a3, 7(sp) +; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: addi a1, sp, 15 ; RV64I-NEXT: addi a3, zero, 3 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: mv a4, zero ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lb a3, 7(sp) +; RV64I-NEXT: lb a3, 15(sp) ; RV64I-NEXT: bnez a0, .LBB52_4 ; RV64I-NEXT: .LBB52_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -4688,11 +4581,10 @@ define i8 @atomicrmw_umin_i8_release(i8 *%a, i8 %b) nounwind { ; RV64I-NEXT: bgeu s1, a0, .LBB52_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB52_2 Depth=1 -; RV64I-NEXT: mv a2, s3 +; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB52_1 ; RV64I-NEXT: .LBB52_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -4736,22 +4628,20 @@ define i8 @atomicrmw_umin_i8_acq_rel(i8 *%a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: mv s3, a1 +; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: andi s1, a1, 255 -; RV32I-NEXT: addi s2, sp, 11 ; RV32I-NEXT: j .LBB53_2 ; RV32I-NEXT: .LBB53_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB53_2 Depth=1 -; RV32I-NEXT: sb a3, 11(sp) +; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: 
addi a1, sp, 15 ; RV32I-NEXT: addi a3, zero, 4 ; RV32I-NEXT: addi a4, zero, 2 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lb a3, 11(sp) +; RV32I-NEXT: lb a3, 15(sp) ; RV32I-NEXT: bnez a0, .LBB53_4 ; RV32I-NEXT: .LBB53_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -4760,11 +4650,10 @@ define i8 @atomicrmw_umin_i8_acq_rel(i8 *%a, i8 %b) nounwind { ; RV32I-NEXT: bgeu s1, a0, .LBB53_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB53_2 Depth=1 -; RV32I-NEXT: mv a2, s3 +; RV32I-NEXT: mv a2, s2 ; RV32I-NEXT: j .LBB53_1 ; RV32I-NEXT: .LBB53_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a3 -; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -4804,22 +4693,20 @@ define i8 @atomicrmw_umin_i8_acq_rel(i8 *%a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: mv s3, a1 +; RV64I-NEXT: mv s2, a1 ; RV64I-NEXT: andi s1, a1, 255 -; RV64I-NEXT: addi s2, sp, 7 ; RV64I-NEXT: j .LBB53_2 ; RV64I-NEXT: .LBB53_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB53_2 Depth=1 -; RV64I-NEXT: sb a3, 7(sp) +; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: addi a1, sp, 15 ; RV64I-NEXT: addi a3, zero, 4 ; RV64I-NEXT: addi a4, zero, 2 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lb a3, 7(sp) +; RV64I-NEXT: lb a3, 15(sp) ; RV64I-NEXT: bnez a0, .LBB53_4 ; RV64I-NEXT: .LBB53_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -4828,11 +4715,10 @@ define i8 @atomicrmw_umin_i8_acq_rel(i8 *%a, i8 %b) nounwind { ; RV64I-NEXT: bgeu s1, a0, .LBB53_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB53_2 Depth=1 -; RV64I-NEXT: mv a2, s3 +; RV64I-NEXT: mv a2, s2 ; 
RV64I-NEXT: j .LBB53_1 ; RV64I-NEXT: .LBB53_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -4876,22 +4762,20 @@ define i8 @atomicrmw_umin_i8_seq_cst(i8 *%a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: mv s3, a1 +; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: andi s1, a1, 255 -; RV32I-NEXT: addi s2, sp, 11 ; RV32I-NEXT: j .LBB54_2 ; RV32I-NEXT: .LBB54_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB54_2 Depth=1 -; RV32I-NEXT: sb a3, 11(sp) +; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: addi a1, sp, 15 ; RV32I-NEXT: addi a3, zero, 5 ; RV32I-NEXT: addi a4, zero, 5 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lb a3, 11(sp) +; RV32I-NEXT: lb a3, 15(sp) ; RV32I-NEXT: bnez a0, .LBB54_4 ; RV32I-NEXT: .LBB54_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -4900,11 +4784,10 @@ define i8 @atomicrmw_umin_i8_seq_cst(i8 *%a, i8 %b) nounwind { ; RV32I-NEXT: bgeu s1, a0, .LBB54_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB54_2 Depth=1 -; RV32I-NEXT: mv a2, s3 +; RV32I-NEXT: mv a2, s2 ; RV32I-NEXT: j .LBB54_1 ; RV32I-NEXT: .LBB54_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a3 -; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -4944,22 +4827,20 @@ define i8 @atomicrmw_umin_i8_seq_cst(i8 *%a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: mv s3, a1 +; RV64I-NEXT: mv s2, a1 ; RV64I-NEXT: andi s1, a1, 255 -; RV64I-NEXT: addi s2, sp, 7 ; RV64I-NEXT: j .LBB54_2 ; RV64I-NEXT: .LBB54_1: # 
%atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB54_2 Depth=1 -; RV64I-NEXT: sb a3, 7(sp) +; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: addi a1, sp, 15 ; RV64I-NEXT: addi a3, zero, 5 ; RV64I-NEXT: addi a4, zero, 5 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lb a3, 7(sp) +; RV64I-NEXT: lb a3, 15(sp) ; RV64I-NEXT: bnez a0, .LBB54_4 ; RV64I-NEXT: .LBB54_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -4968,11 +4849,10 @@ define i8 @atomicrmw_umin_i8_seq_cst(i8 *%a, i8 %b) nounwind { ; RV64I-NEXT: bgeu s1, a0, .LBB54_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB54_2 Depth=1 -; RV64I-NEXT: mv a2, s3 +; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB54_1 ; RV64I-NEXT: .LBB54_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -7166,37 +7046,34 @@ define i16 @atomicrmw_max_i16_monotonic(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lhu a3, 0(a0) ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lhu a1, 0(a0) -; RV32I-NEXT: slli a0, s2, 16 -; RV32I-NEXT: srai s0, a0, 16 -; RV32I-NEXT: addi s3, sp, 10 +; RV32I-NEXT: slli a0, a1, 16 +; RV32I-NEXT: srai s1, a0, 16 ; RV32I-NEXT: j .LBB90_2 ; RV32I-NEXT: .LBB90_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB90_2 Depth=1 -; RV32I-NEXT: sh a1, 10(sp) -; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: sh a3, 14(sp) +; RV32I-NEXT: addi a1, sp, 14 +; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: mv a3, zero ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a1, 10(sp) +; RV32I-NEXT: lh a3, 14(sp) ; RV32I-NEXT: bnez a0, .LBB90_4 ; RV32I-NEXT: .LBB90_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop 
Header: Depth=1 -; RV32I-NEXT: slli a0, a1, 16 +; RV32I-NEXT: slli a0, a3, 16 ; RV32I-NEXT: srai a0, a0, 16 -; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: blt s0, a0, .LBB90_1 +; RV32I-NEXT: mv a2, a3 +; RV32I-NEXT: blt s1, a0, .LBB90_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB90_2 Depth=1 ; RV32I-NEXT: mv a2, s2 ; RV32I-NEXT: j .LBB90_1 ; RV32I-NEXT: .LBB90_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -7242,37 +7119,34 @@ define i16 @atomicrmw_max_i16_monotonic(i16 *%a, i16 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: lhu a3, 0(a0) ; RV64I-NEXT: mv s2, a1 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lhu a1, 0(a0) -; RV64I-NEXT: slli a0, s2, 48 -; RV64I-NEXT: srai s0, a0, 48 -; RV64I-NEXT: addi s3, sp, 6 +; RV64I-NEXT: slli a0, a1, 48 +; RV64I-NEXT: srai s1, a0, 48 ; RV64I-NEXT: j .LBB90_2 ; RV64I-NEXT: .LBB90_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB90_2 Depth=1 -; RV64I-NEXT: sh a1, 6(sp) -; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: mv a1, s3 +; RV64I-NEXT: sh a3, 14(sp) +; RV64I-NEXT: addi a1, sp, 14 +; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: mv a3, zero ; RV64I-NEXT: mv a4, zero ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a1, 6(sp) +; RV64I-NEXT: lh a3, 14(sp) ; RV64I-NEXT: bnez a0, .LBB90_4 ; RV64I-NEXT: .LBB90_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a1, 48 +; RV64I-NEXT: slli a0, a3, 48 ; RV64I-NEXT: srai a0, a0, 48 -; RV64I-NEXT: mv a2, a1 -; RV64I-NEXT: blt s0, a0, .LBB90_1 +; RV64I-NEXT: mv a2, a3 +; RV64I-NEXT: blt s1, a0, .LBB90_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB90_2 Depth=1 ; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB90_1 ; RV64I-NEXT: .LBB90_4: # 
%atomicrmw.end -; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -7322,37 +7196,34 @@ define i16 @atomicrmw_max_i16_acquire(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lhu a3, 0(a0) ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lhu a1, 0(a0) -; RV32I-NEXT: slli a0, s2, 16 -; RV32I-NEXT: srai s0, a0, 16 -; RV32I-NEXT: addi s3, sp, 10 +; RV32I-NEXT: slli a0, a1, 16 +; RV32I-NEXT: srai s1, a0, 16 ; RV32I-NEXT: j .LBB91_2 ; RV32I-NEXT: .LBB91_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB91_2 Depth=1 -; RV32I-NEXT: sh a1, 10(sp) +; RV32I-NEXT: sh a3, 14(sp) +; RV32I-NEXT: addi a1, sp, 14 ; RV32I-NEXT: addi a3, zero, 2 ; RV32I-NEXT: addi a4, zero, 2 -; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a1, 10(sp) +; RV32I-NEXT: lh a3, 14(sp) ; RV32I-NEXT: bnez a0, .LBB91_4 ; RV32I-NEXT: .LBB91_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a1, 16 +; RV32I-NEXT: slli a0, a3, 16 ; RV32I-NEXT: srai a0, a0, 16 -; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: blt s0, a0, .LBB91_1 +; RV32I-NEXT: mv a2, a3 +; RV32I-NEXT: blt s1, a0, .LBB91_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB91_2 Depth=1 ; RV32I-NEXT: mv a2, s2 ; RV32I-NEXT: j .LBB91_1 ; RV32I-NEXT: .LBB91_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -7398,37 +7269,34 @@ define i16 @atomicrmw_max_i16_acquire(i16 *%a, i16 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) +; 
RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: lhu a3, 0(a0) ; RV64I-NEXT: mv s2, a1 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lhu a1, 0(a0) -; RV64I-NEXT: slli a0, s2, 48 -; RV64I-NEXT: srai s0, a0, 48 -; RV64I-NEXT: addi s3, sp, 6 +; RV64I-NEXT: slli a0, a1, 48 +; RV64I-NEXT: srai s1, a0, 48 ; RV64I-NEXT: j .LBB91_2 ; RV64I-NEXT: .LBB91_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB91_2 Depth=1 -; RV64I-NEXT: sh a1, 6(sp) +; RV64I-NEXT: sh a3, 14(sp) +; RV64I-NEXT: addi a1, sp, 14 ; RV64I-NEXT: addi a3, zero, 2 ; RV64I-NEXT: addi a4, zero, 2 -; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: mv a1, s3 +; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a1, 6(sp) +; RV64I-NEXT: lh a3, 14(sp) ; RV64I-NEXT: bnez a0, .LBB91_4 ; RV64I-NEXT: .LBB91_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a1, 48 +; RV64I-NEXT: slli a0, a3, 48 ; RV64I-NEXT: srai a0, a0, 48 -; RV64I-NEXT: mv a2, a1 -; RV64I-NEXT: blt s0, a0, .LBB91_1 +; RV64I-NEXT: mv a2, a3 +; RV64I-NEXT: blt s1, a0, .LBB91_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB91_2 Depth=1 ; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB91_1 ; RV64I-NEXT: .LBB91_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -7478,37 +7346,34 @@ define i16 @atomicrmw_max_i16_release(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lhu a3, 0(a0) ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lhu a1, 0(a0) -; RV32I-NEXT: slli a0, s2, 16 -; RV32I-NEXT: srai s0, a0, 16 -; RV32I-NEXT: addi s3, sp, 10 +; RV32I-NEXT: slli a0, a1, 16 +; RV32I-NEXT: srai s1, a0, 16 ; RV32I-NEXT: j .LBB92_2 ; RV32I-NEXT: .LBB92_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: 
Header=BB92_2 Depth=1 -; RV32I-NEXT: sh a1, 10(sp) +; RV32I-NEXT: sh a3, 14(sp) +; RV32I-NEXT: addi a1, sp, 14 ; RV32I-NEXT: addi a3, zero, 3 -; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a1, 10(sp) +; RV32I-NEXT: lh a3, 14(sp) ; RV32I-NEXT: bnez a0, .LBB92_4 ; RV32I-NEXT: .LBB92_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a1, 16 +; RV32I-NEXT: slli a0, a3, 16 ; RV32I-NEXT: srai a0, a0, 16 -; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: blt s0, a0, .LBB92_1 +; RV32I-NEXT: mv a2, a3 +; RV32I-NEXT: blt s1, a0, .LBB92_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB92_2 Depth=1 ; RV32I-NEXT: mv a2, s2 ; RV32I-NEXT: j .LBB92_1 ; RV32I-NEXT: .LBB92_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -7554,37 +7419,34 @@ define i16 @atomicrmw_max_i16_release(i16 *%a, i16 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: lhu a3, 0(a0) ; RV64I-NEXT: mv s2, a1 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lhu a1, 0(a0) -; RV64I-NEXT: slli a0, s2, 48 -; RV64I-NEXT: srai s0, a0, 48 -; RV64I-NEXT: addi s3, sp, 6 +; RV64I-NEXT: slli a0, a1, 48 +; RV64I-NEXT: srai s1, a0, 48 ; RV64I-NEXT: j .LBB92_2 ; RV64I-NEXT: .LBB92_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB92_2 Depth=1 -; RV64I-NEXT: sh a1, 6(sp) +; RV64I-NEXT: sh a3, 14(sp) +; RV64I-NEXT: addi a1, sp, 14 ; RV64I-NEXT: addi a3, zero, 3 -; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: mv a1, s3 +; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: mv a4, zero ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a1, 6(sp) +; RV64I-NEXT: lh a3, 14(sp) ; RV64I-NEXT: bnez a0, .LBB92_4 ; 
RV64I-NEXT: .LBB92_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a1, 48 +; RV64I-NEXT: slli a0, a3, 48 ; RV64I-NEXT: srai a0, a0, 48 -; RV64I-NEXT: mv a2, a1 -; RV64I-NEXT: blt s0, a0, .LBB92_1 +; RV64I-NEXT: mv a2, a3 +; RV64I-NEXT: blt s1, a0, .LBB92_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB92_2 Depth=1 ; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB92_1 ; RV64I-NEXT: .LBB92_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -7634,37 +7496,34 @@ define i16 @atomicrmw_max_i16_acq_rel(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lhu a3, 0(a0) ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lhu a1, 0(a0) -; RV32I-NEXT: slli a0, s2, 16 -; RV32I-NEXT: srai s0, a0, 16 -; RV32I-NEXT: addi s3, sp, 10 +; RV32I-NEXT: slli a0, a1, 16 +; RV32I-NEXT: srai s1, a0, 16 ; RV32I-NEXT: j .LBB93_2 ; RV32I-NEXT: .LBB93_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB93_2 Depth=1 -; RV32I-NEXT: sh a1, 10(sp) +; RV32I-NEXT: sh a3, 14(sp) +; RV32I-NEXT: addi a1, sp, 14 ; RV32I-NEXT: addi a3, zero, 4 ; RV32I-NEXT: addi a4, zero, 2 -; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a1, 10(sp) +; RV32I-NEXT: lh a3, 14(sp) ; RV32I-NEXT: bnez a0, .LBB93_4 ; RV32I-NEXT: .LBB93_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a1, 16 +; RV32I-NEXT: slli a0, a3, 16 ; RV32I-NEXT: srai a0, a0, 16 -; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: blt s0, a0, .LBB93_1 +; RV32I-NEXT: mv a2, a3 +; RV32I-NEXT: blt s1, a0, .LBB93_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: 
Header=BB93_2 Depth=1 ; RV32I-NEXT: mv a2, s2 ; RV32I-NEXT: j .LBB93_1 ; RV32I-NEXT: .LBB93_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -7710,37 +7569,34 @@ define i16 @atomicrmw_max_i16_acq_rel(i16 *%a, i16 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: lhu a3, 0(a0) ; RV64I-NEXT: mv s2, a1 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lhu a1, 0(a0) -; RV64I-NEXT: slli a0, s2, 48 -; RV64I-NEXT: srai s0, a0, 48 -; RV64I-NEXT: addi s3, sp, 6 +; RV64I-NEXT: slli a0, a1, 48 +; RV64I-NEXT: srai s1, a0, 48 ; RV64I-NEXT: j .LBB93_2 ; RV64I-NEXT: .LBB93_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB93_2 Depth=1 -; RV64I-NEXT: sh a1, 6(sp) +; RV64I-NEXT: sh a3, 14(sp) +; RV64I-NEXT: addi a1, sp, 14 ; RV64I-NEXT: addi a3, zero, 4 ; RV64I-NEXT: addi a4, zero, 2 -; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: mv a1, s3 +; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a1, 6(sp) +; RV64I-NEXT: lh a3, 14(sp) ; RV64I-NEXT: bnez a0, .LBB93_4 ; RV64I-NEXT: .LBB93_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a1, 48 +; RV64I-NEXT: slli a0, a3, 48 ; RV64I-NEXT: srai a0, a0, 48 -; RV64I-NEXT: mv a2, a1 -; RV64I-NEXT: blt s0, a0, .LBB93_1 +; RV64I-NEXT: mv a2, a3 +; RV64I-NEXT: blt s1, a0, .LBB93_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB93_2 Depth=1 ; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB93_1 ; RV64I-NEXT: .LBB93_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -7790,37 +7646,34 @@ define i16 @atomicrmw_max_i16_seq_cst(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s0, 
24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lhu a3, 0(a0) ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lhu a1, 0(a0) -; RV32I-NEXT: slli a0, s2, 16 -; RV32I-NEXT: srai s0, a0, 16 -; RV32I-NEXT: addi s3, sp, 10 +; RV32I-NEXT: slli a0, a1, 16 +; RV32I-NEXT: srai s1, a0, 16 ; RV32I-NEXT: j .LBB94_2 ; RV32I-NEXT: .LBB94_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB94_2 Depth=1 -; RV32I-NEXT: sh a1, 10(sp) +; RV32I-NEXT: sh a3, 14(sp) +; RV32I-NEXT: addi a1, sp, 14 ; RV32I-NEXT: addi a3, zero, 5 ; RV32I-NEXT: addi a4, zero, 5 -; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a1, 10(sp) +; RV32I-NEXT: lh a3, 14(sp) ; RV32I-NEXT: bnez a0, .LBB94_4 ; RV32I-NEXT: .LBB94_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a1, 16 +; RV32I-NEXT: slli a0, a3, 16 ; RV32I-NEXT: srai a0, a0, 16 -; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: blt s0, a0, .LBB94_1 +; RV32I-NEXT: mv a2, a3 +; RV32I-NEXT: blt s1, a0, .LBB94_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB94_2 Depth=1 ; RV32I-NEXT: mv a2, s2 ; RV32I-NEXT: j .LBB94_1 ; RV32I-NEXT: .LBB94_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -7866,37 +7719,34 @@ define i16 @atomicrmw_max_i16_seq_cst(i16 *%a, i16 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: lhu a3, 0(a0) ; RV64I-NEXT: mv s2, a1 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lhu a1, 0(a0) -; RV64I-NEXT: slli a0, s2, 48 -; RV64I-NEXT: srai s0, a0, 48 -; RV64I-NEXT: addi s3, sp, 6 +; RV64I-NEXT: slli a0, a1, 48 +; RV64I-NEXT: srai s1, a0, 48 ; 
RV64I-NEXT: j .LBB94_2 ; RV64I-NEXT: .LBB94_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB94_2 Depth=1 -; RV64I-NEXT: sh a1, 6(sp) +; RV64I-NEXT: sh a3, 14(sp) +; RV64I-NEXT: addi a1, sp, 14 ; RV64I-NEXT: addi a3, zero, 5 ; RV64I-NEXT: addi a4, zero, 5 -; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: mv a1, s3 +; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a1, 6(sp) +; RV64I-NEXT: lh a3, 14(sp) ; RV64I-NEXT: bnez a0, .LBB94_4 ; RV64I-NEXT: .LBB94_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a1, 48 +; RV64I-NEXT: slli a0, a3, 48 ; RV64I-NEXT: srai a0, a0, 48 -; RV64I-NEXT: mv a2, a1 -; RV64I-NEXT: blt s0, a0, .LBB94_1 +; RV64I-NEXT: mv a2, a3 +; RV64I-NEXT: blt s1, a0, .LBB94_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB94_2 Depth=1 ; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB94_1 ; RV64I-NEXT: .LBB94_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -7946,37 +7796,34 @@ define i16 @atomicrmw_min_i16_monotonic(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lhu a3, 0(a0) ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lhu a1, 0(a0) -; RV32I-NEXT: slli a0, s2, 16 -; RV32I-NEXT: srai s0, a0, 16 -; RV32I-NEXT: addi s3, sp, 10 +; RV32I-NEXT: slli a0, a1, 16 +; RV32I-NEXT: srai s1, a0, 16 ; RV32I-NEXT: j .LBB95_2 ; RV32I-NEXT: .LBB95_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB95_2 Depth=1 -; RV32I-NEXT: sh a1, 10(sp) -; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: sh a3, 14(sp) +; RV32I-NEXT: addi a1, sp, 14 +; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: mv a3, zero ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: call __atomic_compare_exchange_2 -; 
RV32I-NEXT: lh a1, 10(sp) +; RV32I-NEXT: lh a3, 14(sp) ; RV32I-NEXT: bnez a0, .LBB95_4 ; RV32I-NEXT: .LBB95_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a1, 16 +; RV32I-NEXT: slli a0, a3, 16 ; RV32I-NEXT: srai a0, a0, 16 -; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bge s0, a0, .LBB95_1 +; RV32I-NEXT: mv a2, a3 +; RV32I-NEXT: bge s1, a0, .LBB95_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB95_2 Depth=1 ; RV32I-NEXT: mv a2, s2 ; RV32I-NEXT: j .LBB95_1 ; RV32I-NEXT: .LBB95_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -8022,37 +7869,34 @@ define i16 @atomicrmw_min_i16_monotonic(i16 *%a, i16 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: lhu a3, 0(a0) ; RV64I-NEXT: mv s2, a1 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lhu a1, 0(a0) -; RV64I-NEXT: slli a0, s2, 48 -; RV64I-NEXT: srai s0, a0, 48 -; RV64I-NEXT: addi s3, sp, 6 +; RV64I-NEXT: slli a0, a1, 48 +; RV64I-NEXT: srai s1, a0, 48 ; RV64I-NEXT: j .LBB95_2 ; RV64I-NEXT: .LBB95_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB95_2 Depth=1 -; RV64I-NEXT: sh a1, 6(sp) -; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: mv a1, s3 +; RV64I-NEXT: sh a3, 14(sp) +; RV64I-NEXT: addi a1, sp, 14 +; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: mv a3, zero ; RV64I-NEXT: mv a4, zero ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a1, 6(sp) +; RV64I-NEXT: lh a3, 14(sp) ; RV64I-NEXT: bnez a0, .LBB95_4 ; RV64I-NEXT: .LBB95_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a1, 48 +; RV64I-NEXT: slli a0, a3, 48 ; RV64I-NEXT: srai a0, a0, 48 -; RV64I-NEXT: mv a2, a1 -; RV64I-NEXT: bge s0, a0, .LBB95_1 +; RV64I-NEXT: mv a2, a3 +; RV64I-NEXT: bge s1, a0, .LBB95_1 
; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB95_2 Depth=1 ; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB95_1 ; RV64I-NEXT: .LBB95_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -8102,37 +7946,34 @@ define i16 @atomicrmw_min_i16_acquire(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lhu a3, 0(a0) ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lhu a1, 0(a0) -; RV32I-NEXT: slli a0, s2, 16 -; RV32I-NEXT: srai s0, a0, 16 -; RV32I-NEXT: addi s3, sp, 10 +; RV32I-NEXT: slli a0, a1, 16 +; RV32I-NEXT: srai s1, a0, 16 ; RV32I-NEXT: j .LBB96_2 ; RV32I-NEXT: .LBB96_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB96_2 Depth=1 -; RV32I-NEXT: sh a1, 10(sp) +; RV32I-NEXT: sh a3, 14(sp) +; RV32I-NEXT: addi a1, sp, 14 ; RV32I-NEXT: addi a3, zero, 2 ; RV32I-NEXT: addi a4, zero, 2 -; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a1, 10(sp) +; RV32I-NEXT: lh a3, 14(sp) ; RV32I-NEXT: bnez a0, .LBB96_4 ; RV32I-NEXT: .LBB96_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a1, 16 +; RV32I-NEXT: slli a0, a3, 16 ; RV32I-NEXT: srai a0, a0, 16 -; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bge s0, a0, .LBB96_1 +; RV32I-NEXT: mv a2, a3 +; RV32I-NEXT: bge s1, a0, .LBB96_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB96_2 Depth=1 ; RV32I-NEXT: mv a2, s2 ; RV32I-NEXT: j .LBB96_1 ; RV32I-NEXT: .LBB96_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -8178,37 +8019,34 @@ define i16 
@atomicrmw_min_i16_acquire(i16 *%a, i16 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: lhu a3, 0(a0) ; RV64I-NEXT: mv s2, a1 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lhu a1, 0(a0) -; RV64I-NEXT: slli a0, s2, 48 -; RV64I-NEXT: srai s0, a0, 48 -; RV64I-NEXT: addi s3, sp, 6 +; RV64I-NEXT: slli a0, a1, 48 +; RV64I-NEXT: srai s1, a0, 48 ; RV64I-NEXT: j .LBB96_2 ; RV64I-NEXT: .LBB96_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB96_2 Depth=1 -; RV64I-NEXT: sh a1, 6(sp) +; RV64I-NEXT: sh a3, 14(sp) +; RV64I-NEXT: addi a1, sp, 14 ; RV64I-NEXT: addi a3, zero, 2 ; RV64I-NEXT: addi a4, zero, 2 -; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: mv a1, s3 +; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a1, 6(sp) +; RV64I-NEXT: lh a3, 14(sp) ; RV64I-NEXT: bnez a0, .LBB96_4 ; RV64I-NEXT: .LBB96_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a1, 48 +; RV64I-NEXT: slli a0, a3, 48 ; RV64I-NEXT: srai a0, a0, 48 -; RV64I-NEXT: mv a2, a1 -; RV64I-NEXT: bge s0, a0, .LBB96_1 +; RV64I-NEXT: mv a2, a3 +; RV64I-NEXT: bge s1, a0, .LBB96_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB96_2 Depth=1 ; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB96_1 ; RV64I-NEXT: .LBB96_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -8258,37 +8096,34 @@ define i16 @atomicrmw_min_i16_release(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lhu a3, 0(a0) ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lhu a1, 0(a0) -; RV32I-NEXT: slli a0, s2, 16 -; RV32I-NEXT: srai s0, a0, 16 -; RV32I-NEXT: addi s3, 
sp, 10 +; RV32I-NEXT: slli a0, a1, 16 +; RV32I-NEXT: srai s1, a0, 16 ; RV32I-NEXT: j .LBB97_2 ; RV32I-NEXT: .LBB97_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB97_2 Depth=1 -; RV32I-NEXT: sh a1, 10(sp) +; RV32I-NEXT: sh a3, 14(sp) +; RV32I-NEXT: addi a1, sp, 14 ; RV32I-NEXT: addi a3, zero, 3 -; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a1, 10(sp) +; RV32I-NEXT: lh a3, 14(sp) ; RV32I-NEXT: bnez a0, .LBB97_4 ; RV32I-NEXT: .LBB97_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a1, 16 +; RV32I-NEXT: slli a0, a3, 16 ; RV32I-NEXT: srai a0, a0, 16 -; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bge s0, a0, .LBB97_1 +; RV32I-NEXT: mv a2, a3 +; RV32I-NEXT: bge s1, a0, .LBB97_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB97_2 Depth=1 ; RV32I-NEXT: mv a2, s2 ; RV32I-NEXT: j .LBB97_1 ; RV32I-NEXT: .LBB97_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -8334,37 +8169,34 @@ define i16 @atomicrmw_min_i16_release(i16 *%a, i16 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: lhu a3, 0(a0) ; RV64I-NEXT: mv s2, a1 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lhu a1, 0(a0) -; RV64I-NEXT: slli a0, s2, 48 -; RV64I-NEXT: srai s0, a0, 48 -; RV64I-NEXT: addi s3, sp, 6 +; RV64I-NEXT: slli a0, a1, 48 +; RV64I-NEXT: srai s1, a0, 48 ; RV64I-NEXT: j .LBB97_2 ; RV64I-NEXT: .LBB97_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB97_2 Depth=1 -; RV64I-NEXT: sh a1, 6(sp) +; RV64I-NEXT: sh a3, 14(sp) +; RV64I-NEXT: addi a1, sp, 14 ; RV64I-NEXT: addi a3, zero, 3 -; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: mv a1, s3 +; RV64I-NEXT: mv a0, s0 ; 
RV64I-NEXT: mv a4, zero ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a1, 6(sp) +; RV64I-NEXT: lh a3, 14(sp) ; RV64I-NEXT: bnez a0, .LBB97_4 ; RV64I-NEXT: .LBB97_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a1, 48 +; RV64I-NEXT: slli a0, a3, 48 ; RV64I-NEXT: srai a0, a0, 48 -; RV64I-NEXT: mv a2, a1 -; RV64I-NEXT: bge s0, a0, .LBB97_1 +; RV64I-NEXT: mv a2, a3 +; RV64I-NEXT: bge s1, a0, .LBB97_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB97_2 Depth=1 ; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB97_1 ; RV64I-NEXT: .LBB97_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -8414,37 +8246,34 @@ define i16 @atomicrmw_min_i16_acq_rel(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lhu a3, 0(a0) ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lhu a1, 0(a0) -; RV32I-NEXT: slli a0, s2, 16 -; RV32I-NEXT: srai s0, a0, 16 -; RV32I-NEXT: addi s3, sp, 10 +; RV32I-NEXT: slli a0, a1, 16 +; RV32I-NEXT: srai s1, a0, 16 ; RV32I-NEXT: j .LBB98_2 ; RV32I-NEXT: .LBB98_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB98_2 Depth=1 -; RV32I-NEXT: sh a1, 10(sp) +; RV32I-NEXT: sh a3, 14(sp) +; RV32I-NEXT: addi a1, sp, 14 ; RV32I-NEXT: addi a3, zero, 4 ; RV32I-NEXT: addi a4, zero, 2 -; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a1, 10(sp) +; RV32I-NEXT: lh a3, 14(sp) ; RV32I-NEXT: bnez a0, .LBB98_4 ; RV32I-NEXT: .LBB98_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a1, 16 +; RV32I-NEXT: slli a0, a3, 16 ; RV32I-NEXT: srai a0, a0, 16 -; RV32I-NEXT: mv a2, a1 -; 
RV32I-NEXT: bge s0, a0, .LBB98_1 +; RV32I-NEXT: mv a2, a3 +; RV32I-NEXT: bge s1, a0, .LBB98_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB98_2 Depth=1 ; RV32I-NEXT: mv a2, s2 ; RV32I-NEXT: j .LBB98_1 ; RV32I-NEXT: .LBB98_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -8490,37 +8319,34 @@ define i16 @atomicrmw_min_i16_acq_rel(i16 *%a, i16 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: lhu a3, 0(a0) ; RV64I-NEXT: mv s2, a1 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lhu a1, 0(a0) -; RV64I-NEXT: slli a0, s2, 48 -; RV64I-NEXT: srai s0, a0, 48 -; RV64I-NEXT: addi s3, sp, 6 +; RV64I-NEXT: slli a0, a1, 48 +; RV64I-NEXT: srai s1, a0, 48 ; RV64I-NEXT: j .LBB98_2 ; RV64I-NEXT: .LBB98_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB98_2 Depth=1 -; RV64I-NEXT: sh a1, 6(sp) +; RV64I-NEXT: sh a3, 14(sp) +; RV64I-NEXT: addi a1, sp, 14 ; RV64I-NEXT: addi a3, zero, 4 ; RV64I-NEXT: addi a4, zero, 2 -; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: mv a1, s3 +; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a1, 6(sp) +; RV64I-NEXT: lh a3, 14(sp) ; RV64I-NEXT: bnez a0, .LBB98_4 ; RV64I-NEXT: .LBB98_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a1, 48 +; RV64I-NEXT: slli a0, a3, 48 ; RV64I-NEXT: srai a0, a0, 48 -; RV64I-NEXT: mv a2, a1 -; RV64I-NEXT: bge s0, a0, .LBB98_1 +; RV64I-NEXT: mv a2, a3 +; RV64I-NEXT: bge s1, a0, .LBB98_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB98_2 Depth=1 ; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB98_1 ; RV64I-NEXT: .LBB98_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld s2, 16(sp) ; 
RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -8570,37 +8396,34 @@ define i16 @atomicrmw_min_i16_seq_cst(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lhu a3, 0(a0) ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lhu a1, 0(a0) -; RV32I-NEXT: slli a0, s2, 16 -; RV32I-NEXT: srai s0, a0, 16 -; RV32I-NEXT: addi s3, sp, 10 +; RV32I-NEXT: slli a0, a1, 16 +; RV32I-NEXT: srai s1, a0, 16 ; RV32I-NEXT: j .LBB99_2 ; RV32I-NEXT: .LBB99_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB99_2 Depth=1 -; RV32I-NEXT: sh a1, 10(sp) +; RV32I-NEXT: sh a3, 14(sp) +; RV32I-NEXT: addi a1, sp, 14 ; RV32I-NEXT: addi a3, zero, 5 ; RV32I-NEXT: addi a4, zero, 5 -; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a1, 10(sp) +; RV32I-NEXT: lh a3, 14(sp) ; RV32I-NEXT: bnez a0, .LBB99_4 ; RV32I-NEXT: .LBB99_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a1, 16 +; RV32I-NEXT: slli a0, a3, 16 ; RV32I-NEXT: srai a0, a0, 16 -; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bge s0, a0, .LBB99_1 +; RV32I-NEXT: mv a2, a3 +; RV32I-NEXT: bge s1, a0, .LBB99_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB99_2 Depth=1 ; RV32I-NEXT: mv a2, s2 ; RV32I-NEXT: j .LBB99_1 ; RV32I-NEXT: .LBB99_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -8646,37 +8469,34 @@ define i16 @atomicrmw_min_i16_seq_cst(i16 *%a, i16 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: lhu a3, 0(a0) ; RV64I-NEXT: mv s2, a1 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lhu a1, 
0(a0) -; RV64I-NEXT: slli a0, s2, 48 -; RV64I-NEXT: srai s0, a0, 48 -; RV64I-NEXT: addi s3, sp, 6 +; RV64I-NEXT: slli a0, a1, 48 +; RV64I-NEXT: srai s1, a0, 48 ; RV64I-NEXT: j .LBB99_2 ; RV64I-NEXT: .LBB99_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB99_2 Depth=1 -; RV64I-NEXT: sh a1, 6(sp) +; RV64I-NEXT: sh a3, 14(sp) +; RV64I-NEXT: addi a1, sp, 14 ; RV64I-NEXT: addi a3, zero, 5 ; RV64I-NEXT: addi a4, zero, 5 -; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: mv a1, s3 +; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a1, 6(sp) +; RV64I-NEXT: lh a3, 14(sp) ; RV64I-NEXT: bnez a0, .LBB99_4 ; RV64I-NEXT: .LBB99_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a1, 48 +; RV64I-NEXT: slli a0, a3, 48 ; RV64I-NEXT: srai a0, a0, 48 -; RV64I-NEXT: mv a2, a1 -; RV64I-NEXT: bge s0, a0, .LBB99_1 +; RV64I-NEXT: mv a2, a3 +; RV64I-NEXT: bge s1, a0, .LBB99_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB99_2 Depth=1 ; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB99_1 ; RV64I-NEXT: .LBB99_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -8727,24 +8547,22 @@ define i16 @atomicrmw_umax_i16_monotonic(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) ; RV32I-NEXT: sw s3, 12(sp) -; RV32I-NEXT: sw s4, 8(sp) ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: mv s3, a0 ; RV32I-NEXT: lhu a1, 0(a0) ; RV32I-NEXT: lui a0, 16 ; RV32I-NEXT: addi s0, a0, -1 ; RV32I-NEXT: and s1, s2, s0 -; RV32I-NEXT: addi s3, sp, 6 ; RV32I-NEXT: j .LBB100_2 ; RV32I-NEXT: .LBB100_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB100_2 Depth=1 -; RV32I-NEXT: sh a1, 6(sp) -; RV32I-NEXT: mv a0, s4 -; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: sh a1, 10(sp) +; RV32I-NEXT: addi a1, sp, 10 +; RV32I-NEXT: mv a0, s3 ; RV32I-NEXT: 
mv a3, zero ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a1, 6(sp) +; RV32I-NEXT: lh a1, 10(sp) ; RV32I-NEXT: bnez a0, .LBB100_4 ; RV32I-NEXT: .LBB100_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -8757,7 +8575,6 @@ define i16 @atomicrmw_umax_i16_monotonic(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: j .LBB100_1 ; RV32I-NEXT: .LBB100_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: lw s4, 8(sp) ; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) @@ -8794,30 +8611,28 @@ define i16 @atomicrmw_umax_i16_monotonic(i16 *%a, i16 %b) nounwind { ; ; RV64I-LABEL: atomicrmw_umax_i16_monotonic: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -64 -; RV64I-NEXT: sd ra, 56(sp) -; RV64I-NEXT: sd s0, 48(sp) -; RV64I-NEXT: sd s1, 40(sp) -; RV64I-NEXT: sd s2, 32(sp) -; RV64I-NEXT: sd s3, 24(sp) -; RV64I-NEXT: sd s4, 16(sp) +; RV64I-NEXT: addi sp, sp, -48 +; RV64I-NEXT: sd ra, 40(sp) +; RV64I-NEXT: sd s0, 32(sp) +; RV64I-NEXT: sd s1, 24(sp) +; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s2, a1 -; RV64I-NEXT: mv s4, a0 +; RV64I-NEXT: mv s3, a0 ; RV64I-NEXT: lhu a1, 0(a0) ; RV64I-NEXT: lui a0, 16 ; RV64I-NEXT: addiw s0, a0, -1 ; RV64I-NEXT: and s1, s2, s0 -; RV64I-NEXT: addi s3, sp, 14 ; RV64I-NEXT: j .LBB100_2 ; RV64I-NEXT: .LBB100_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB100_2 Depth=1 -; RV64I-NEXT: sh a1, 14(sp) -; RV64I-NEXT: mv a0, s4 -; RV64I-NEXT: mv a1, s3 +; RV64I-NEXT: sh a1, 6(sp) +; RV64I-NEXT: addi a1, sp, 6 +; RV64I-NEXT: mv a0, s3 ; RV64I-NEXT: mv a3, zero ; RV64I-NEXT: mv a4, zero ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a1, 14(sp) +; RV64I-NEXT: lh a1, 6(sp) ; RV64I-NEXT: bnez a0, .LBB100_4 ; RV64I-NEXT: .LBB100_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -8830,13 +8645,12 @@ define i16 @atomicrmw_umax_i16_monotonic(i16 *%a, i16 %b) nounwind { ; RV64I-NEXT: j 
.LBB100_1 ; RV64I-NEXT: .LBB100_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: ld s4, 16(sp) -; RV64I-NEXT: ld s3, 24(sp) -; RV64I-NEXT: ld s2, 32(sp) -; RV64I-NEXT: ld s1, 40(sp) -; RV64I-NEXT: ld s0, 48(sp) -; RV64I-NEXT: ld ra, 56(sp) -; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: ld s2, 16(sp) +; RV64I-NEXT: ld s1, 24(sp) +; RV64I-NEXT: ld s0, 32(sp) +; RV64I-NEXT: ld ra, 40(sp) +; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; ; RV64IA-LABEL: atomicrmw_umax_i16_monotonic: @@ -8877,24 +8691,22 @@ define i16 @atomicrmw_umax_i16_acquire(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) ; RV32I-NEXT: sw s3, 12(sp) -; RV32I-NEXT: sw s4, 8(sp) ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: mv s3, a0 ; RV32I-NEXT: lhu a1, 0(a0) ; RV32I-NEXT: lui a0, 16 ; RV32I-NEXT: addi s0, a0, -1 ; RV32I-NEXT: and s1, s2, s0 -; RV32I-NEXT: addi s3, sp, 6 ; RV32I-NEXT: j .LBB101_2 ; RV32I-NEXT: .LBB101_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB101_2 Depth=1 -; RV32I-NEXT: sh a1, 6(sp) +; RV32I-NEXT: sh a1, 10(sp) +; RV32I-NEXT: addi a1, sp, 10 ; RV32I-NEXT: addi a3, zero, 2 ; RV32I-NEXT: addi a4, zero, 2 -; RV32I-NEXT: mv a0, s4 -; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: mv a0, s3 ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a1, 6(sp) +; RV32I-NEXT: lh a1, 10(sp) ; RV32I-NEXT: bnez a0, .LBB101_4 ; RV32I-NEXT: .LBB101_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -8907,7 +8719,6 @@ define i16 @atomicrmw_umax_i16_acquire(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: j .LBB101_1 ; RV32I-NEXT: .LBB101_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: lw s4, 8(sp) ; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) @@ -8944,30 +8755,28 @@ define i16 @atomicrmw_umax_i16_acquire(i16 *%a, i16 %b) nounwind { ; ; RV64I-LABEL: atomicrmw_umax_i16_acquire: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -64 -; 
RV64I-NEXT: sd ra, 56(sp) -; RV64I-NEXT: sd s0, 48(sp) -; RV64I-NEXT: sd s1, 40(sp) -; RV64I-NEXT: sd s2, 32(sp) -; RV64I-NEXT: sd s3, 24(sp) -; RV64I-NEXT: sd s4, 16(sp) +; RV64I-NEXT: addi sp, sp, -48 +; RV64I-NEXT: sd ra, 40(sp) +; RV64I-NEXT: sd s0, 32(sp) +; RV64I-NEXT: sd s1, 24(sp) +; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s2, a1 -; RV64I-NEXT: mv s4, a0 +; RV64I-NEXT: mv s3, a0 ; RV64I-NEXT: lhu a1, 0(a0) ; RV64I-NEXT: lui a0, 16 ; RV64I-NEXT: addiw s0, a0, -1 ; RV64I-NEXT: and s1, s2, s0 -; RV64I-NEXT: addi s3, sp, 14 ; RV64I-NEXT: j .LBB101_2 ; RV64I-NEXT: .LBB101_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB101_2 Depth=1 -; RV64I-NEXT: sh a1, 14(sp) +; RV64I-NEXT: sh a1, 6(sp) +; RV64I-NEXT: addi a1, sp, 6 ; RV64I-NEXT: addi a3, zero, 2 ; RV64I-NEXT: addi a4, zero, 2 -; RV64I-NEXT: mv a0, s4 -; RV64I-NEXT: mv a1, s3 +; RV64I-NEXT: mv a0, s3 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a1, 14(sp) +; RV64I-NEXT: lh a1, 6(sp) ; RV64I-NEXT: bnez a0, .LBB101_4 ; RV64I-NEXT: .LBB101_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -8980,13 +8789,12 @@ define i16 @atomicrmw_umax_i16_acquire(i16 *%a, i16 %b) nounwind { ; RV64I-NEXT: j .LBB101_1 ; RV64I-NEXT: .LBB101_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: ld s4, 16(sp) -; RV64I-NEXT: ld s3, 24(sp) -; RV64I-NEXT: ld s2, 32(sp) -; RV64I-NEXT: ld s1, 40(sp) -; RV64I-NEXT: ld s0, 48(sp) -; RV64I-NEXT: ld ra, 56(sp) -; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: ld s2, 16(sp) +; RV64I-NEXT: ld s1, 24(sp) +; RV64I-NEXT: ld s0, 32(sp) +; RV64I-NEXT: ld ra, 40(sp) +; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; ; RV64IA-LABEL: atomicrmw_umax_i16_acquire: @@ -9027,24 +8835,22 @@ define i16 @atomicrmw_umax_i16_release(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) ; RV32I-NEXT: sw s3, 12(sp) -; RV32I-NEXT: sw s4, 8(sp) ; RV32I-NEXT: mv s2, a1 -; 
RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: mv s3, a0 ; RV32I-NEXT: lhu a1, 0(a0) ; RV32I-NEXT: lui a0, 16 ; RV32I-NEXT: addi s0, a0, -1 ; RV32I-NEXT: and s1, s2, s0 -; RV32I-NEXT: addi s3, sp, 6 ; RV32I-NEXT: j .LBB102_2 ; RV32I-NEXT: .LBB102_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB102_2 Depth=1 -; RV32I-NEXT: sh a1, 6(sp) +; RV32I-NEXT: sh a1, 10(sp) +; RV32I-NEXT: addi a1, sp, 10 ; RV32I-NEXT: addi a3, zero, 3 -; RV32I-NEXT: mv a0, s4 -; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: mv a0, s3 ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a1, 6(sp) +; RV32I-NEXT: lh a1, 10(sp) ; RV32I-NEXT: bnez a0, .LBB102_4 ; RV32I-NEXT: .LBB102_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -9057,7 +8863,6 @@ define i16 @atomicrmw_umax_i16_release(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: j .LBB102_1 ; RV32I-NEXT: .LBB102_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: lw s4, 8(sp) ; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) @@ -9094,30 +8899,28 @@ define i16 @atomicrmw_umax_i16_release(i16 *%a, i16 %b) nounwind { ; ; RV64I-LABEL: atomicrmw_umax_i16_release: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -64 -; RV64I-NEXT: sd ra, 56(sp) -; RV64I-NEXT: sd s0, 48(sp) -; RV64I-NEXT: sd s1, 40(sp) -; RV64I-NEXT: sd s2, 32(sp) -; RV64I-NEXT: sd s3, 24(sp) -; RV64I-NEXT: sd s4, 16(sp) +; RV64I-NEXT: addi sp, sp, -48 +; RV64I-NEXT: sd ra, 40(sp) +; RV64I-NEXT: sd s0, 32(sp) +; RV64I-NEXT: sd s1, 24(sp) +; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s2, a1 -; RV64I-NEXT: mv s4, a0 +; RV64I-NEXT: mv s3, a0 ; RV64I-NEXT: lhu a1, 0(a0) ; RV64I-NEXT: lui a0, 16 ; RV64I-NEXT: addiw s0, a0, -1 ; RV64I-NEXT: and s1, s2, s0 -; RV64I-NEXT: addi s3, sp, 14 ; RV64I-NEXT: j .LBB102_2 ; RV64I-NEXT: .LBB102_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB102_2 Depth=1 -; RV64I-NEXT: sh a1, 14(sp) +; RV64I-NEXT: sh a1, 6(sp) +; RV64I-NEXT: addi 
a1, sp, 6 ; RV64I-NEXT: addi a3, zero, 3 -; RV64I-NEXT: mv a0, s4 -; RV64I-NEXT: mv a1, s3 +; RV64I-NEXT: mv a0, s3 ; RV64I-NEXT: mv a4, zero ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a1, 14(sp) +; RV64I-NEXT: lh a1, 6(sp) ; RV64I-NEXT: bnez a0, .LBB102_4 ; RV64I-NEXT: .LBB102_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -9130,13 +8933,12 @@ define i16 @atomicrmw_umax_i16_release(i16 *%a, i16 %b) nounwind { ; RV64I-NEXT: j .LBB102_1 ; RV64I-NEXT: .LBB102_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: ld s4, 16(sp) -; RV64I-NEXT: ld s3, 24(sp) -; RV64I-NEXT: ld s2, 32(sp) -; RV64I-NEXT: ld s1, 40(sp) -; RV64I-NEXT: ld s0, 48(sp) -; RV64I-NEXT: ld ra, 56(sp) -; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: ld s2, 16(sp) +; RV64I-NEXT: ld s1, 24(sp) +; RV64I-NEXT: ld s0, 32(sp) +; RV64I-NEXT: ld ra, 40(sp) +; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; ; RV64IA-LABEL: atomicrmw_umax_i16_release: @@ -9177,24 +8979,22 @@ define i16 @atomicrmw_umax_i16_acq_rel(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) ; RV32I-NEXT: sw s3, 12(sp) -; RV32I-NEXT: sw s4, 8(sp) ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: mv s3, a0 ; RV32I-NEXT: lhu a1, 0(a0) ; RV32I-NEXT: lui a0, 16 ; RV32I-NEXT: addi s0, a0, -1 ; RV32I-NEXT: and s1, s2, s0 -; RV32I-NEXT: addi s3, sp, 6 ; RV32I-NEXT: j .LBB103_2 ; RV32I-NEXT: .LBB103_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB103_2 Depth=1 -; RV32I-NEXT: sh a1, 6(sp) +; RV32I-NEXT: sh a1, 10(sp) +; RV32I-NEXT: addi a1, sp, 10 ; RV32I-NEXT: addi a3, zero, 4 ; RV32I-NEXT: addi a4, zero, 2 -; RV32I-NEXT: mv a0, s4 -; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: mv a0, s3 ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a1, 6(sp) +; RV32I-NEXT: lh a1, 10(sp) ; RV32I-NEXT: bnez a0, .LBB103_4 ; RV32I-NEXT: .LBB103_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ 
-9207,7 +9007,6 @@ define i16 @atomicrmw_umax_i16_acq_rel(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: j .LBB103_1 ; RV32I-NEXT: .LBB103_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: lw s4, 8(sp) ; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) @@ -9244,30 +9043,28 @@ define i16 @atomicrmw_umax_i16_acq_rel(i16 *%a, i16 %b) nounwind { ; ; RV64I-LABEL: atomicrmw_umax_i16_acq_rel: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -64 -; RV64I-NEXT: sd ra, 56(sp) -; RV64I-NEXT: sd s0, 48(sp) -; RV64I-NEXT: sd s1, 40(sp) -; RV64I-NEXT: sd s2, 32(sp) -; RV64I-NEXT: sd s3, 24(sp) -; RV64I-NEXT: sd s4, 16(sp) +; RV64I-NEXT: addi sp, sp, -48 +; RV64I-NEXT: sd ra, 40(sp) +; RV64I-NEXT: sd s0, 32(sp) +; RV64I-NEXT: sd s1, 24(sp) +; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s2, a1 -; RV64I-NEXT: mv s4, a0 +; RV64I-NEXT: mv s3, a0 ; RV64I-NEXT: lhu a1, 0(a0) ; RV64I-NEXT: lui a0, 16 ; RV64I-NEXT: addiw s0, a0, -1 ; RV64I-NEXT: and s1, s2, s0 -; RV64I-NEXT: addi s3, sp, 14 ; RV64I-NEXT: j .LBB103_2 ; RV64I-NEXT: .LBB103_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB103_2 Depth=1 -; RV64I-NEXT: sh a1, 14(sp) +; RV64I-NEXT: sh a1, 6(sp) +; RV64I-NEXT: addi a1, sp, 6 ; RV64I-NEXT: addi a3, zero, 4 ; RV64I-NEXT: addi a4, zero, 2 -; RV64I-NEXT: mv a0, s4 -; RV64I-NEXT: mv a1, s3 +; RV64I-NEXT: mv a0, s3 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a1, 14(sp) +; RV64I-NEXT: lh a1, 6(sp) ; RV64I-NEXT: bnez a0, .LBB103_4 ; RV64I-NEXT: .LBB103_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -9280,13 +9077,12 @@ define i16 @atomicrmw_umax_i16_acq_rel(i16 *%a, i16 %b) nounwind { ; RV64I-NEXT: j .LBB103_1 ; RV64I-NEXT: .LBB103_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: ld s4, 16(sp) -; RV64I-NEXT: ld s3, 24(sp) -; RV64I-NEXT: ld s2, 32(sp) -; RV64I-NEXT: ld s1, 40(sp) -; RV64I-NEXT: ld s0, 48(sp) -; RV64I-NEXT: ld ra, 56(sp) -; RV64I-NEXT: addi sp, 
sp, 64 +; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: ld s2, 16(sp) +; RV64I-NEXT: ld s1, 24(sp) +; RV64I-NEXT: ld s0, 32(sp) +; RV64I-NEXT: ld ra, 40(sp) +; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; ; RV64IA-LABEL: atomicrmw_umax_i16_acq_rel: @@ -9327,24 +9123,22 @@ define i16 @atomicrmw_umax_i16_seq_cst(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) ; RV32I-NEXT: sw s3, 12(sp) -; RV32I-NEXT: sw s4, 8(sp) ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: mv s3, a0 ; RV32I-NEXT: lhu a1, 0(a0) ; RV32I-NEXT: lui a0, 16 ; RV32I-NEXT: addi s0, a0, -1 ; RV32I-NEXT: and s1, s2, s0 -; RV32I-NEXT: addi s3, sp, 6 ; RV32I-NEXT: j .LBB104_2 ; RV32I-NEXT: .LBB104_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB104_2 Depth=1 -; RV32I-NEXT: sh a1, 6(sp) +; RV32I-NEXT: sh a1, 10(sp) +; RV32I-NEXT: addi a1, sp, 10 ; RV32I-NEXT: addi a3, zero, 5 ; RV32I-NEXT: addi a4, zero, 5 -; RV32I-NEXT: mv a0, s4 -; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: mv a0, s3 ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a1, 6(sp) +; RV32I-NEXT: lh a1, 10(sp) ; RV32I-NEXT: bnez a0, .LBB104_4 ; RV32I-NEXT: .LBB104_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -9357,7 +9151,6 @@ define i16 @atomicrmw_umax_i16_seq_cst(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: j .LBB104_1 ; RV32I-NEXT: .LBB104_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: lw s4, 8(sp) ; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) @@ -9394,30 +9187,28 @@ define i16 @atomicrmw_umax_i16_seq_cst(i16 *%a, i16 %b) nounwind { ; ; RV64I-LABEL: atomicrmw_umax_i16_seq_cst: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -64 -; RV64I-NEXT: sd ra, 56(sp) -; RV64I-NEXT: sd s0, 48(sp) -; RV64I-NEXT: sd s1, 40(sp) -; RV64I-NEXT: sd s2, 32(sp) -; RV64I-NEXT: sd s3, 24(sp) -; RV64I-NEXT: sd s4, 16(sp) +; RV64I-NEXT: addi sp, sp, -48 +; RV64I-NEXT: sd ra, 40(sp) +; RV64I-NEXT: sd s0, 32(sp) +; RV64I-NEXT: sd 
s1, 24(sp) +; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s2, a1 -; RV64I-NEXT: mv s4, a0 +; RV64I-NEXT: mv s3, a0 ; RV64I-NEXT: lhu a1, 0(a0) ; RV64I-NEXT: lui a0, 16 ; RV64I-NEXT: addiw s0, a0, -1 ; RV64I-NEXT: and s1, s2, s0 -; RV64I-NEXT: addi s3, sp, 14 ; RV64I-NEXT: j .LBB104_2 ; RV64I-NEXT: .LBB104_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB104_2 Depth=1 -; RV64I-NEXT: sh a1, 14(sp) +; RV64I-NEXT: sh a1, 6(sp) +; RV64I-NEXT: addi a1, sp, 6 ; RV64I-NEXT: addi a3, zero, 5 ; RV64I-NEXT: addi a4, zero, 5 -; RV64I-NEXT: mv a0, s4 -; RV64I-NEXT: mv a1, s3 +; RV64I-NEXT: mv a0, s3 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a1, 14(sp) +; RV64I-NEXT: lh a1, 6(sp) ; RV64I-NEXT: bnez a0, .LBB104_4 ; RV64I-NEXT: .LBB104_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -9430,13 +9221,12 @@ define i16 @atomicrmw_umax_i16_seq_cst(i16 *%a, i16 %b) nounwind { ; RV64I-NEXT: j .LBB104_1 ; RV64I-NEXT: .LBB104_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: ld s4, 16(sp) -; RV64I-NEXT: ld s3, 24(sp) -; RV64I-NEXT: ld s2, 32(sp) -; RV64I-NEXT: ld s1, 40(sp) -; RV64I-NEXT: ld s0, 48(sp) -; RV64I-NEXT: ld ra, 56(sp) -; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: ld s2, 16(sp) +; RV64I-NEXT: ld s1, 24(sp) +; RV64I-NEXT: ld s0, 32(sp) +; RV64I-NEXT: ld ra, 40(sp) +; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; ; RV64IA-LABEL: atomicrmw_umax_i16_seq_cst: @@ -9477,24 +9267,22 @@ define i16 @atomicrmw_umin_i16_monotonic(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) ; RV32I-NEXT: sw s3, 12(sp) -; RV32I-NEXT: sw s4, 8(sp) ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: mv s3, a0 ; RV32I-NEXT: lhu a1, 0(a0) ; RV32I-NEXT: lui a0, 16 ; RV32I-NEXT: addi s0, a0, -1 ; RV32I-NEXT: and s1, s2, s0 -; RV32I-NEXT: addi s3, sp, 6 ; RV32I-NEXT: j .LBB105_2 ; RV32I-NEXT: .LBB105_1: # %atomicrmw.start ; RV32I-NEXT: # in 
Loop: Header=BB105_2 Depth=1 -; RV32I-NEXT: sh a1, 6(sp) -; RV32I-NEXT: mv a0, s4 -; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: sh a1, 10(sp) +; RV32I-NEXT: addi a1, sp, 10 +; RV32I-NEXT: mv a0, s3 ; RV32I-NEXT: mv a3, zero ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a1, 6(sp) +; RV32I-NEXT: lh a1, 10(sp) ; RV32I-NEXT: bnez a0, .LBB105_4 ; RV32I-NEXT: .LBB105_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -9507,7 +9295,6 @@ define i16 @atomicrmw_umin_i16_monotonic(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: j .LBB105_1 ; RV32I-NEXT: .LBB105_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: lw s4, 8(sp) ; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) @@ -9544,30 +9331,28 @@ define i16 @atomicrmw_umin_i16_monotonic(i16 *%a, i16 %b) nounwind { ; ; RV64I-LABEL: atomicrmw_umin_i16_monotonic: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -64 -; RV64I-NEXT: sd ra, 56(sp) -; RV64I-NEXT: sd s0, 48(sp) -; RV64I-NEXT: sd s1, 40(sp) -; RV64I-NEXT: sd s2, 32(sp) -; RV64I-NEXT: sd s3, 24(sp) -; RV64I-NEXT: sd s4, 16(sp) +; RV64I-NEXT: addi sp, sp, -48 +; RV64I-NEXT: sd ra, 40(sp) +; RV64I-NEXT: sd s0, 32(sp) +; RV64I-NEXT: sd s1, 24(sp) +; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s2, a1 -; RV64I-NEXT: mv s4, a0 +; RV64I-NEXT: mv s3, a0 ; RV64I-NEXT: lhu a1, 0(a0) ; RV64I-NEXT: lui a0, 16 ; RV64I-NEXT: addiw s0, a0, -1 ; RV64I-NEXT: and s1, s2, s0 -; RV64I-NEXT: addi s3, sp, 14 ; RV64I-NEXT: j .LBB105_2 ; RV64I-NEXT: .LBB105_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB105_2 Depth=1 -; RV64I-NEXT: sh a1, 14(sp) -; RV64I-NEXT: mv a0, s4 -; RV64I-NEXT: mv a1, s3 +; RV64I-NEXT: sh a1, 6(sp) +; RV64I-NEXT: addi a1, sp, 6 +; RV64I-NEXT: mv a0, s3 ; RV64I-NEXT: mv a3, zero ; RV64I-NEXT: mv a4, zero ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a1, 14(sp) +; RV64I-NEXT: lh a1, 6(sp) ; RV64I-NEXT: bnez a0, .LBB105_4 ; 
RV64I-NEXT: .LBB105_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -9580,13 +9365,12 @@ define i16 @atomicrmw_umin_i16_monotonic(i16 *%a, i16 %b) nounwind { ; RV64I-NEXT: j .LBB105_1 ; RV64I-NEXT: .LBB105_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: ld s4, 16(sp) -; RV64I-NEXT: ld s3, 24(sp) -; RV64I-NEXT: ld s2, 32(sp) -; RV64I-NEXT: ld s1, 40(sp) -; RV64I-NEXT: ld s0, 48(sp) -; RV64I-NEXT: ld ra, 56(sp) -; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: ld s2, 16(sp) +; RV64I-NEXT: ld s1, 24(sp) +; RV64I-NEXT: ld s0, 32(sp) +; RV64I-NEXT: ld ra, 40(sp) +; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; ; RV64IA-LABEL: atomicrmw_umin_i16_monotonic: @@ -9627,24 +9411,22 @@ define i16 @atomicrmw_umin_i16_acquire(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) ; RV32I-NEXT: sw s3, 12(sp) -; RV32I-NEXT: sw s4, 8(sp) ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: mv s3, a0 ; RV32I-NEXT: lhu a1, 0(a0) ; RV32I-NEXT: lui a0, 16 ; RV32I-NEXT: addi s0, a0, -1 ; RV32I-NEXT: and s1, s2, s0 -; RV32I-NEXT: addi s3, sp, 6 ; RV32I-NEXT: j .LBB106_2 ; RV32I-NEXT: .LBB106_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB106_2 Depth=1 -; RV32I-NEXT: sh a1, 6(sp) +; RV32I-NEXT: sh a1, 10(sp) +; RV32I-NEXT: addi a1, sp, 10 ; RV32I-NEXT: addi a3, zero, 2 ; RV32I-NEXT: addi a4, zero, 2 -; RV32I-NEXT: mv a0, s4 -; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: mv a0, s3 ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a1, 6(sp) +; RV32I-NEXT: lh a1, 10(sp) ; RV32I-NEXT: bnez a0, .LBB106_4 ; RV32I-NEXT: .LBB106_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -9657,7 +9439,6 @@ define i16 @atomicrmw_umin_i16_acquire(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: j .LBB106_1 ; RV32I-NEXT: .LBB106_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: lw s4, 8(sp) ; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw 
s1, 20(sp) @@ -9694,30 +9475,28 @@ define i16 @atomicrmw_umin_i16_acquire(i16 *%a, i16 %b) nounwind { ; ; RV64I-LABEL: atomicrmw_umin_i16_acquire: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -64 -; RV64I-NEXT: sd ra, 56(sp) -; RV64I-NEXT: sd s0, 48(sp) -; RV64I-NEXT: sd s1, 40(sp) -; RV64I-NEXT: sd s2, 32(sp) -; RV64I-NEXT: sd s3, 24(sp) -; RV64I-NEXT: sd s4, 16(sp) +; RV64I-NEXT: addi sp, sp, -48 +; RV64I-NEXT: sd ra, 40(sp) +; RV64I-NEXT: sd s0, 32(sp) +; RV64I-NEXT: sd s1, 24(sp) +; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s2, a1 -; RV64I-NEXT: mv s4, a0 +; RV64I-NEXT: mv s3, a0 ; RV64I-NEXT: lhu a1, 0(a0) ; RV64I-NEXT: lui a0, 16 ; RV64I-NEXT: addiw s0, a0, -1 ; RV64I-NEXT: and s1, s2, s0 -; RV64I-NEXT: addi s3, sp, 14 ; RV64I-NEXT: j .LBB106_2 ; RV64I-NEXT: .LBB106_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB106_2 Depth=1 -; RV64I-NEXT: sh a1, 14(sp) +; RV64I-NEXT: sh a1, 6(sp) +; RV64I-NEXT: addi a1, sp, 6 ; RV64I-NEXT: addi a3, zero, 2 ; RV64I-NEXT: addi a4, zero, 2 -; RV64I-NEXT: mv a0, s4 -; RV64I-NEXT: mv a1, s3 +; RV64I-NEXT: mv a0, s3 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a1, 14(sp) +; RV64I-NEXT: lh a1, 6(sp) ; RV64I-NEXT: bnez a0, .LBB106_4 ; RV64I-NEXT: .LBB106_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -9730,13 +9509,12 @@ define i16 @atomicrmw_umin_i16_acquire(i16 *%a, i16 %b) nounwind { ; RV64I-NEXT: j .LBB106_1 ; RV64I-NEXT: .LBB106_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: ld s4, 16(sp) -; RV64I-NEXT: ld s3, 24(sp) -; RV64I-NEXT: ld s2, 32(sp) -; RV64I-NEXT: ld s1, 40(sp) -; RV64I-NEXT: ld s0, 48(sp) -; RV64I-NEXT: ld ra, 56(sp) -; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: ld s2, 16(sp) +; RV64I-NEXT: ld s1, 24(sp) +; RV64I-NEXT: ld s0, 32(sp) +; RV64I-NEXT: ld ra, 40(sp) +; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; ; RV64IA-LABEL: atomicrmw_umin_i16_acquire: @@ -9777,24 +9555,22 @@ define i16 
@atomicrmw_umin_i16_release(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) ; RV32I-NEXT: sw s3, 12(sp) -; RV32I-NEXT: sw s4, 8(sp) ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: mv s3, a0 ; RV32I-NEXT: lhu a1, 0(a0) ; RV32I-NEXT: lui a0, 16 ; RV32I-NEXT: addi s0, a0, -1 ; RV32I-NEXT: and s1, s2, s0 -; RV32I-NEXT: addi s3, sp, 6 ; RV32I-NEXT: j .LBB107_2 ; RV32I-NEXT: .LBB107_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB107_2 Depth=1 -; RV32I-NEXT: sh a1, 6(sp) +; RV32I-NEXT: sh a1, 10(sp) +; RV32I-NEXT: addi a1, sp, 10 ; RV32I-NEXT: addi a3, zero, 3 -; RV32I-NEXT: mv a0, s4 -; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: mv a0, s3 ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a1, 6(sp) +; RV32I-NEXT: lh a1, 10(sp) ; RV32I-NEXT: bnez a0, .LBB107_4 ; RV32I-NEXT: .LBB107_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -9807,7 +9583,6 @@ define i16 @atomicrmw_umin_i16_release(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: j .LBB107_1 ; RV32I-NEXT: .LBB107_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: lw s4, 8(sp) ; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) @@ -9844,30 +9619,28 @@ define i16 @atomicrmw_umin_i16_release(i16 *%a, i16 %b) nounwind { ; ; RV64I-LABEL: atomicrmw_umin_i16_release: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -64 -; RV64I-NEXT: sd ra, 56(sp) -; RV64I-NEXT: sd s0, 48(sp) -; RV64I-NEXT: sd s1, 40(sp) -; RV64I-NEXT: sd s2, 32(sp) -; RV64I-NEXT: sd s3, 24(sp) -; RV64I-NEXT: sd s4, 16(sp) +; RV64I-NEXT: addi sp, sp, -48 +; RV64I-NEXT: sd ra, 40(sp) +; RV64I-NEXT: sd s0, 32(sp) +; RV64I-NEXT: sd s1, 24(sp) +; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s2, a1 -; RV64I-NEXT: mv s4, a0 +; RV64I-NEXT: mv s3, a0 ; RV64I-NEXT: lhu a1, 0(a0) ; RV64I-NEXT: lui a0, 16 ; RV64I-NEXT: addiw s0, a0, -1 ; RV64I-NEXT: and s1, s2, s0 -; RV64I-NEXT: addi s3, sp, 14 
; RV64I-NEXT: j .LBB107_2 ; RV64I-NEXT: .LBB107_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB107_2 Depth=1 -; RV64I-NEXT: sh a1, 14(sp) +; RV64I-NEXT: sh a1, 6(sp) +; RV64I-NEXT: addi a1, sp, 6 ; RV64I-NEXT: addi a3, zero, 3 -; RV64I-NEXT: mv a0, s4 -; RV64I-NEXT: mv a1, s3 +; RV64I-NEXT: mv a0, s3 ; RV64I-NEXT: mv a4, zero ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a1, 14(sp) +; RV64I-NEXT: lh a1, 6(sp) ; RV64I-NEXT: bnez a0, .LBB107_4 ; RV64I-NEXT: .LBB107_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -9880,13 +9653,12 @@ define i16 @atomicrmw_umin_i16_release(i16 *%a, i16 %b) nounwind { ; RV64I-NEXT: j .LBB107_1 ; RV64I-NEXT: .LBB107_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: ld s4, 16(sp) -; RV64I-NEXT: ld s3, 24(sp) -; RV64I-NEXT: ld s2, 32(sp) -; RV64I-NEXT: ld s1, 40(sp) -; RV64I-NEXT: ld s0, 48(sp) -; RV64I-NEXT: ld ra, 56(sp) -; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: ld s2, 16(sp) +; RV64I-NEXT: ld s1, 24(sp) +; RV64I-NEXT: ld s0, 32(sp) +; RV64I-NEXT: ld ra, 40(sp) +; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; ; RV64IA-LABEL: atomicrmw_umin_i16_release: @@ -9927,24 +9699,22 @@ define i16 @atomicrmw_umin_i16_acq_rel(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) ; RV32I-NEXT: sw s3, 12(sp) -; RV32I-NEXT: sw s4, 8(sp) ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: mv s3, a0 ; RV32I-NEXT: lhu a1, 0(a0) ; RV32I-NEXT: lui a0, 16 ; RV32I-NEXT: addi s0, a0, -1 ; RV32I-NEXT: and s1, s2, s0 -; RV32I-NEXT: addi s3, sp, 6 ; RV32I-NEXT: j .LBB108_2 ; RV32I-NEXT: .LBB108_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB108_2 Depth=1 -; RV32I-NEXT: sh a1, 6(sp) +; RV32I-NEXT: sh a1, 10(sp) +; RV32I-NEXT: addi a1, sp, 10 ; RV32I-NEXT: addi a3, zero, 4 ; RV32I-NEXT: addi a4, zero, 2 -; RV32I-NEXT: mv a0, s4 -; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: mv a0, s3 ; RV32I-NEXT: call 
__atomic_compare_exchange_2 -; RV32I-NEXT: lh a1, 6(sp) +; RV32I-NEXT: lh a1, 10(sp) ; RV32I-NEXT: bnez a0, .LBB108_4 ; RV32I-NEXT: .LBB108_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -9957,7 +9727,6 @@ define i16 @atomicrmw_umin_i16_acq_rel(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: j .LBB108_1 ; RV32I-NEXT: .LBB108_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: lw s4, 8(sp) ; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) @@ -9994,30 +9763,28 @@ define i16 @atomicrmw_umin_i16_acq_rel(i16 *%a, i16 %b) nounwind { ; ; RV64I-LABEL: atomicrmw_umin_i16_acq_rel: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -64 -; RV64I-NEXT: sd ra, 56(sp) -; RV64I-NEXT: sd s0, 48(sp) -; RV64I-NEXT: sd s1, 40(sp) -; RV64I-NEXT: sd s2, 32(sp) -; RV64I-NEXT: sd s3, 24(sp) -; RV64I-NEXT: sd s4, 16(sp) +; RV64I-NEXT: addi sp, sp, -48 +; RV64I-NEXT: sd ra, 40(sp) +; RV64I-NEXT: sd s0, 32(sp) +; RV64I-NEXT: sd s1, 24(sp) +; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s2, a1 -; RV64I-NEXT: mv s4, a0 +; RV64I-NEXT: mv s3, a0 ; RV64I-NEXT: lhu a1, 0(a0) ; RV64I-NEXT: lui a0, 16 ; RV64I-NEXT: addiw s0, a0, -1 ; RV64I-NEXT: and s1, s2, s0 -; RV64I-NEXT: addi s3, sp, 14 ; RV64I-NEXT: j .LBB108_2 ; RV64I-NEXT: .LBB108_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB108_2 Depth=1 -; RV64I-NEXT: sh a1, 14(sp) +; RV64I-NEXT: sh a1, 6(sp) +; RV64I-NEXT: addi a1, sp, 6 ; RV64I-NEXT: addi a3, zero, 4 ; RV64I-NEXT: addi a4, zero, 2 -; RV64I-NEXT: mv a0, s4 -; RV64I-NEXT: mv a1, s3 +; RV64I-NEXT: mv a0, s3 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a1, 14(sp) +; RV64I-NEXT: lh a1, 6(sp) ; RV64I-NEXT: bnez a0, .LBB108_4 ; RV64I-NEXT: .LBB108_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -10030,13 +9797,12 @@ define i16 @atomicrmw_umin_i16_acq_rel(i16 *%a, i16 %b) nounwind { ; RV64I-NEXT: j .LBB108_1 ; RV64I-NEXT: .LBB108_4: # %atomicrmw.end ; 
RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: ld s4, 16(sp) -; RV64I-NEXT: ld s3, 24(sp) -; RV64I-NEXT: ld s2, 32(sp) -; RV64I-NEXT: ld s1, 40(sp) -; RV64I-NEXT: ld s0, 48(sp) -; RV64I-NEXT: ld ra, 56(sp) -; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: ld s2, 16(sp) +; RV64I-NEXT: ld s1, 24(sp) +; RV64I-NEXT: ld s0, 32(sp) +; RV64I-NEXT: ld ra, 40(sp) +; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; ; RV64IA-LABEL: atomicrmw_umin_i16_acq_rel: @@ -10077,24 +9843,22 @@ define i16 @atomicrmw_umin_i16_seq_cst(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) ; RV32I-NEXT: sw s3, 12(sp) -; RV32I-NEXT: sw s4, 8(sp) ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: mv s3, a0 ; RV32I-NEXT: lhu a1, 0(a0) ; RV32I-NEXT: lui a0, 16 ; RV32I-NEXT: addi s0, a0, -1 ; RV32I-NEXT: and s1, s2, s0 -; RV32I-NEXT: addi s3, sp, 6 ; RV32I-NEXT: j .LBB109_2 ; RV32I-NEXT: .LBB109_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB109_2 Depth=1 -; RV32I-NEXT: sh a1, 6(sp) +; RV32I-NEXT: sh a1, 10(sp) +; RV32I-NEXT: addi a1, sp, 10 ; RV32I-NEXT: addi a3, zero, 5 ; RV32I-NEXT: addi a4, zero, 5 -; RV32I-NEXT: mv a0, s4 -; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: mv a0, s3 ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a1, 6(sp) +; RV32I-NEXT: lh a1, 10(sp) ; RV32I-NEXT: bnez a0, .LBB109_4 ; RV32I-NEXT: .LBB109_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -10107,7 +9871,6 @@ define i16 @atomicrmw_umin_i16_seq_cst(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: j .LBB109_1 ; RV32I-NEXT: .LBB109_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: lw s4, 8(sp) ; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) @@ -10144,30 +9907,28 @@ define i16 @atomicrmw_umin_i16_seq_cst(i16 *%a, i16 %b) nounwind { ; ; RV64I-LABEL: atomicrmw_umin_i16_seq_cst: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -64 -; RV64I-NEXT: sd ra, 56(sp) -; RV64I-NEXT: sd s0, 48(sp) 
-; RV64I-NEXT: sd s1, 40(sp) -; RV64I-NEXT: sd s2, 32(sp) -; RV64I-NEXT: sd s3, 24(sp) -; RV64I-NEXT: sd s4, 16(sp) +; RV64I-NEXT: addi sp, sp, -48 +; RV64I-NEXT: sd ra, 40(sp) +; RV64I-NEXT: sd s0, 32(sp) +; RV64I-NEXT: sd s1, 24(sp) +; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s2, a1 -; RV64I-NEXT: mv s4, a0 +; RV64I-NEXT: mv s3, a0 ; RV64I-NEXT: lhu a1, 0(a0) ; RV64I-NEXT: lui a0, 16 ; RV64I-NEXT: addiw s0, a0, -1 ; RV64I-NEXT: and s1, s2, s0 -; RV64I-NEXT: addi s3, sp, 14 ; RV64I-NEXT: j .LBB109_2 ; RV64I-NEXT: .LBB109_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB109_2 Depth=1 -; RV64I-NEXT: sh a1, 14(sp) +; RV64I-NEXT: sh a1, 6(sp) +; RV64I-NEXT: addi a1, sp, 6 ; RV64I-NEXT: addi a3, zero, 5 ; RV64I-NEXT: addi a4, zero, 5 -; RV64I-NEXT: mv a0, s4 -; RV64I-NEXT: mv a1, s3 +; RV64I-NEXT: mv a0, s3 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a1, 14(sp) +; RV64I-NEXT: lh a1, 6(sp) ; RV64I-NEXT: bnez a0, .LBB109_4 ; RV64I-NEXT: .LBB109_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -10180,13 +9941,12 @@ define i16 @atomicrmw_umin_i16_seq_cst(i16 *%a, i16 %b) nounwind { ; RV64I-NEXT: j .LBB109_1 ; RV64I-NEXT: .LBB109_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: ld s4, 16(sp) -; RV64I-NEXT: ld s3, 24(sp) -; RV64I-NEXT: ld s2, 32(sp) -; RV64I-NEXT: ld s1, 40(sp) -; RV64I-NEXT: ld s0, 48(sp) -; RV64I-NEXT: ld ra, 56(sp) -; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: ld s2, 16(sp) +; RV64I-NEXT: ld s1, 24(sp) +; RV64I-NEXT: ld s0, 32(sp) +; RV64I-NEXT: ld ra, 40(sp) +; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; ; RV64IA-LABEL: atomicrmw_umin_i16_seq_cst: @@ -11491,25 +11251,23 @@ define i32 @atomicrmw_xor_i32_seq_cst(i32 *%a, i32 %b) nounwind { define i32 @atomicrmw_max_i32_monotonic(i32 *%a, i32 %b) nounwind { ; RV32I-LABEL: atomicrmw_max_i32_monotonic: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: sw ra, 28(sp) -; 
RV32I-NEXT: sw s0, 24(sp) -; RV32I-NEXT: sw s1, 20(sp) -; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: sw s1, 4(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a3, 0(a0) ; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: addi s2, sp, 12 ; RV32I-NEXT: j .LBB145_2 ; RV32I-NEXT: .LBB145_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB145_2 Depth=1 -; RV32I-NEXT: sw a3, 12(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: mv a3, zero ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 12(sp) +; RV32I-NEXT: lw a3, 0(sp) ; RV32I-NEXT: bnez a0, .LBB145_4 ; RV32I-NEXT: .LBB145_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -11521,11 +11279,10 @@ define i32 @atomicrmw_max_i32_monotonic(i32 *%a, i32 %b) nounwind { ; RV32I-NEXT: j .LBB145_1 ; RV32I-NEXT: .LBB145_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a3 -; RV32I-NEXT: lw s2, 16(sp) -; RV32I-NEXT: lw s1, 20(sp) -; RV32I-NEXT: lw s0, 24(sp) -; RV32I-NEXT: lw ra, 28(sp) -; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: lw s1, 4(sp) +; RV32I-NEXT: lw s0, 8(sp) +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV32IA-LABEL: atomicrmw_max_i32_monotonic: @@ -11540,22 +11297,20 @@ define i32 @atomicrmw_max_i32_monotonic(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lwu a3, 0(a0) -; RV64I-NEXT: mv s3, a1 +; RV64I-NEXT: mv s2, a1 ; RV64I-NEXT: sext.w s1, a1 -; RV64I-NEXT: addi s2, sp, 4 ; RV64I-NEXT: j .LBB145_2 ; RV64I-NEXT: .LBB145_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB145_2 Depth=1 -; RV64I-NEXT: sw a3, 4(sp) +; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: mv a3, zero 
; RV64I-NEXT: mv a4, zero ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 4(sp) +; RV64I-NEXT: lw a3, 12(sp) ; RV64I-NEXT: bnez a0, .LBB145_4 ; RV64I-NEXT: .LBB145_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -11564,11 +11319,10 @@ define i32 @atomicrmw_max_i32_monotonic(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: blt s1, a0, .LBB145_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB145_2 Depth=1 -; RV64I-NEXT: mv a2, s3 +; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB145_1 ; RV64I-NEXT: .LBB145_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -11587,25 +11341,23 @@ define i32 @atomicrmw_max_i32_monotonic(i32 *%a, i32 %b) nounwind { define i32 @atomicrmw_max_i32_acquire(i32 *%a, i32 %b) nounwind { ; RV32I-LABEL: atomicrmw_max_i32_acquire: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: sw ra, 28(sp) -; RV32I-NEXT: sw s0, 24(sp) -; RV32I-NEXT: sw s1, 20(sp) -; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: sw s1, 4(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a3, 0(a0) ; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: addi s2, sp, 12 ; RV32I-NEXT: j .LBB146_2 ; RV32I-NEXT: .LBB146_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB146_2 Depth=1 -; RV32I-NEXT: sw a3, 12(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: addi a3, zero, 2 ; RV32I-NEXT: addi a4, zero, 2 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 12(sp) +; RV32I-NEXT: lw a3, 0(sp) ; RV32I-NEXT: bnez a0, .LBB146_4 ; RV32I-NEXT: .LBB146_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -11617,11 +11369,10 @@ define i32 @atomicrmw_max_i32_acquire(i32 *%a, i32 %b) nounwind { ; RV32I-NEXT: j .LBB146_1 ; RV32I-NEXT: 
.LBB146_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a3 -; RV32I-NEXT: lw s2, 16(sp) -; RV32I-NEXT: lw s1, 20(sp) -; RV32I-NEXT: lw s0, 24(sp) -; RV32I-NEXT: lw ra, 28(sp) -; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: lw s1, 4(sp) +; RV32I-NEXT: lw s0, 8(sp) +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV32IA-LABEL: atomicrmw_max_i32_acquire: @@ -11636,22 +11387,20 @@ define i32 @atomicrmw_max_i32_acquire(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lwu a3, 0(a0) -; RV64I-NEXT: mv s3, a1 +; RV64I-NEXT: mv s2, a1 ; RV64I-NEXT: sext.w s1, a1 -; RV64I-NEXT: addi s2, sp, 4 ; RV64I-NEXT: j .LBB146_2 ; RV64I-NEXT: .LBB146_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB146_2 Depth=1 -; RV64I-NEXT: sw a3, 4(sp) +; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: addi a3, zero, 2 ; RV64I-NEXT: addi a4, zero, 2 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 4(sp) +; RV64I-NEXT: lw a3, 12(sp) ; RV64I-NEXT: bnez a0, .LBB146_4 ; RV64I-NEXT: .LBB146_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -11660,11 +11409,10 @@ define i32 @atomicrmw_max_i32_acquire(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: blt s1, a0, .LBB146_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB146_2 Depth=1 -; RV64I-NEXT: mv a2, s3 +; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB146_1 ; RV64I-NEXT: .LBB146_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -11683,25 +11431,23 @@ define i32 @atomicrmw_max_i32_acquire(i32 *%a, i32 %b) nounwind { define i32 @atomicrmw_max_i32_release(i32 *%a, i32 %b) nounwind { ; RV32I-LABEL: atomicrmw_max_i32_release: ; RV32I: # %bb.0: -; RV32I-NEXT: addi 
sp, sp, -32 -; RV32I-NEXT: sw ra, 28(sp) -; RV32I-NEXT: sw s0, 24(sp) -; RV32I-NEXT: sw s1, 20(sp) -; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: sw s1, 4(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a3, 0(a0) ; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: addi s2, sp, 12 ; RV32I-NEXT: j .LBB147_2 ; RV32I-NEXT: .LBB147_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB147_2 Depth=1 -; RV32I-NEXT: sw a3, 12(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: addi a3, zero, 3 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 12(sp) +; RV32I-NEXT: lw a3, 0(sp) ; RV32I-NEXT: bnez a0, .LBB147_4 ; RV32I-NEXT: .LBB147_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -11713,11 +11459,10 @@ define i32 @atomicrmw_max_i32_release(i32 *%a, i32 %b) nounwind { ; RV32I-NEXT: j .LBB147_1 ; RV32I-NEXT: .LBB147_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a3 -; RV32I-NEXT: lw s2, 16(sp) -; RV32I-NEXT: lw s1, 20(sp) -; RV32I-NEXT: lw s0, 24(sp) -; RV32I-NEXT: lw ra, 28(sp) -; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: lw s1, 4(sp) +; RV32I-NEXT: lw s0, 8(sp) +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV32IA-LABEL: atomicrmw_max_i32_release: @@ -11732,22 +11477,20 @@ define i32 @atomicrmw_max_i32_release(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lwu a3, 0(a0) -; RV64I-NEXT: mv s3, a1 +; RV64I-NEXT: mv s2, a1 ; RV64I-NEXT: sext.w s1, a1 -; RV64I-NEXT: addi s2, sp, 4 ; RV64I-NEXT: j .LBB147_2 ; RV64I-NEXT: .LBB147_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB147_2 Depth=1 -; RV64I-NEXT: sw a3, 4(sp) +; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: addi a3, zero, 3 
; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: mv a4, zero ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 4(sp) +; RV64I-NEXT: lw a3, 12(sp) ; RV64I-NEXT: bnez a0, .LBB147_4 ; RV64I-NEXT: .LBB147_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -11756,11 +11499,10 @@ define i32 @atomicrmw_max_i32_release(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: blt s1, a0, .LBB147_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB147_2 Depth=1 -; RV64I-NEXT: mv a2, s3 +; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB147_1 ; RV64I-NEXT: .LBB147_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -11779,25 +11521,23 @@ define i32 @atomicrmw_max_i32_release(i32 *%a, i32 %b) nounwind { define i32 @atomicrmw_max_i32_acq_rel(i32 *%a, i32 %b) nounwind { ; RV32I-LABEL: atomicrmw_max_i32_acq_rel: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: sw ra, 28(sp) -; RV32I-NEXT: sw s0, 24(sp) -; RV32I-NEXT: sw s1, 20(sp) -; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: sw s1, 4(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a3, 0(a0) ; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: addi s2, sp, 12 ; RV32I-NEXT: j .LBB148_2 ; RV32I-NEXT: .LBB148_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB148_2 Depth=1 -; RV32I-NEXT: sw a3, 12(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: addi a3, zero, 4 ; RV32I-NEXT: addi a4, zero, 2 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 12(sp) +; RV32I-NEXT: lw a3, 0(sp) ; RV32I-NEXT: bnez a0, .LBB148_4 ; RV32I-NEXT: .LBB148_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -11809,11 +11549,10 @@ define i32 @atomicrmw_max_i32_acq_rel(i32 *%a, i32 %b) nounwind 
{ ; RV32I-NEXT: j .LBB148_1 ; RV32I-NEXT: .LBB148_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a3 -; RV32I-NEXT: lw s2, 16(sp) -; RV32I-NEXT: lw s1, 20(sp) -; RV32I-NEXT: lw s0, 24(sp) -; RV32I-NEXT: lw ra, 28(sp) -; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: lw s1, 4(sp) +; RV32I-NEXT: lw s0, 8(sp) +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV32IA-LABEL: atomicrmw_max_i32_acq_rel: @@ -11828,22 +11567,20 @@ define i32 @atomicrmw_max_i32_acq_rel(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lwu a3, 0(a0) -; RV64I-NEXT: mv s3, a1 +; RV64I-NEXT: mv s2, a1 ; RV64I-NEXT: sext.w s1, a1 -; RV64I-NEXT: addi s2, sp, 4 ; RV64I-NEXT: j .LBB148_2 ; RV64I-NEXT: .LBB148_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB148_2 Depth=1 -; RV64I-NEXT: sw a3, 4(sp) +; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: addi a3, zero, 4 ; RV64I-NEXT: addi a4, zero, 2 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 4(sp) +; RV64I-NEXT: lw a3, 12(sp) ; RV64I-NEXT: bnez a0, .LBB148_4 ; RV64I-NEXT: .LBB148_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -11852,11 +11589,10 @@ define i32 @atomicrmw_max_i32_acq_rel(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: blt s1, a0, .LBB148_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB148_2 Depth=1 -; RV64I-NEXT: mv a2, s3 +; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB148_1 ; RV64I-NEXT: .LBB148_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -11875,25 +11611,23 @@ define i32 @atomicrmw_max_i32_acq_rel(i32 *%a, i32 %b) nounwind { define i32 @atomicrmw_max_i32_seq_cst(i32 *%a, i32 %b) nounwind { ; RV32I-LABEL: 
atomicrmw_max_i32_seq_cst: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: sw ra, 28(sp) -; RV32I-NEXT: sw s0, 24(sp) -; RV32I-NEXT: sw s1, 20(sp) -; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: sw s1, 4(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a3, 0(a0) ; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: addi s2, sp, 12 ; RV32I-NEXT: j .LBB149_2 ; RV32I-NEXT: .LBB149_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB149_2 Depth=1 -; RV32I-NEXT: sw a3, 12(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: addi a3, zero, 5 ; RV32I-NEXT: addi a4, zero, 5 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 12(sp) +; RV32I-NEXT: lw a3, 0(sp) ; RV32I-NEXT: bnez a0, .LBB149_4 ; RV32I-NEXT: .LBB149_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -11905,11 +11639,10 @@ define i32 @atomicrmw_max_i32_seq_cst(i32 *%a, i32 %b) nounwind { ; RV32I-NEXT: j .LBB149_1 ; RV32I-NEXT: .LBB149_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a3 -; RV32I-NEXT: lw s2, 16(sp) -; RV32I-NEXT: lw s1, 20(sp) -; RV32I-NEXT: lw s0, 24(sp) -; RV32I-NEXT: lw ra, 28(sp) -; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: lw s1, 4(sp) +; RV32I-NEXT: lw s0, 8(sp) +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV32IA-LABEL: atomicrmw_max_i32_seq_cst: @@ -11924,22 +11657,20 @@ define i32 @atomicrmw_max_i32_seq_cst(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lwu a3, 0(a0) -; RV64I-NEXT: mv s3, a1 +; RV64I-NEXT: mv s2, a1 ; RV64I-NEXT: sext.w s1, a1 -; RV64I-NEXT: addi s2, sp, 4 ; RV64I-NEXT: j .LBB149_2 ; RV64I-NEXT: .LBB149_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB149_2 Depth=1 -; RV64I-NEXT: sw a3, 4(sp) +; RV64I-NEXT: sw a3, 
12(sp) +; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: addi a3, zero, 5 ; RV64I-NEXT: addi a4, zero, 5 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 4(sp) +; RV64I-NEXT: lw a3, 12(sp) ; RV64I-NEXT: bnez a0, .LBB149_4 ; RV64I-NEXT: .LBB149_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -11948,11 +11679,10 @@ define i32 @atomicrmw_max_i32_seq_cst(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: blt s1, a0, .LBB149_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB149_2 Depth=1 -; RV64I-NEXT: mv a2, s3 +; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB149_1 ; RV64I-NEXT: .LBB149_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -11971,25 +11701,23 @@ define i32 @atomicrmw_max_i32_seq_cst(i32 *%a, i32 %b) nounwind { define i32 @atomicrmw_min_i32_monotonic(i32 *%a, i32 %b) nounwind { ; RV32I-LABEL: atomicrmw_min_i32_monotonic: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: sw ra, 28(sp) -; RV32I-NEXT: sw s0, 24(sp) -; RV32I-NEXT: sw s1, 20(sp) -; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: sw s1, 4(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a3, 0(a0) ; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: addi s2, sp, 12 ; RV32I-NEXT: j .LBB150_2 ; RV32I-NEXT: .LBB150_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB150_2 Depth=1 -; RV32I-NEXT: sw a3, 12(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: mv a3, zero ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 12(sp) +; RV32I-NEXT: lw a3, 0(sp) ; RV32I-NEXT: bnez a0, .LBB150_4 ; RV32I-NEXT: .LBB150_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -12001,11 +11729,10 
@@ define i32 @atomicrmw_min_i32_monotonic(i32 *%a, i32 %b) nounwind { ; RV32I-NEXT: j .LBB150_1 ; RV32I-NEXT: .LBB150_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a3 -; RV32I-NEXT: lw s2, 16(sp) -; RV32I-NEXT: lw s1, 20(sp) -; RV32I-NEXT: lw s0, 24(sp) -; RV32I-NEXT: lw ra, 28(sp) -; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: lw s1, 4(sp) +; RV32I-NEXT: lw s0, 8(sp) +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV32IA-LABEL: atomicrmw_min_i32_monotonic: @@ -12020,22 +11747,20 @@ define i32 @atomicrmw_min_i32_monotonic(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lwu a3, 0(a0) -; RV64I-NEXT: mv s3, a1 +; RV64I-NEXT: mv s2, a1 ; RV64I-NEXT: sext.w s1, a1 -; RV64I-NEXT: addi s2, sp, 4 ; RV64I-NEXT: j .LBB150_2 ; RV64I-NEXT: .LBB150_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB150_2 Depth=1 -; RV64I-NEXT: sw a3, 4(sp) +; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: mv a3, zero ; RV64I-NEXT: mv a4, zero ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 4(sp) +; RV64I-NEXT: lw a3, 12(sp) ; RV64I-NEXT: bnez a0, .LBB150_4 ; RV64I-NEXT: .LBB150_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -12044,11 +11769,10 @@ define i32 @atomicrmw_min_i32_monotonic(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: bge s1, a0, .LBB150_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB150_2 Depth=1 -; RV64I-NEXT: mv a2, s3 +; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB150_1 ; RV64I-NEXT: .LBB150_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -12067,25 +11791,23 @@ define i32 @atomicrmw_min_i32_monotonic(i32 *%a, i32 %b) nounwind { define i32 
@atomicrmw_min_i32_acquire(i32 *%a, i32 %b) nounwind { ; RV32I-LABEL: atomicrmw_min_i32_acquire: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: sw ra, 28(sp) -; RV32I-NEXT: sw s0, 24(sp) -; RV32I-NEXT: sw s1, 20(sp) -; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: sw s1, 4(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a3, 0(a0) ; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: addi s2, sp, 12 ; RV32I-NEXT: j .LBB151_2 ; RV32I-NEXT: .LBB151_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB151_2 Depth=1 -; RV32I-NEXT: sw a3, 12(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: addi a3, zero, 2 ; RV32I-NEXT: addi a4, zero, 2 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 12(sp) +; RV32I-NEXT: lw a3, 0(sp) ; RV32I-NEXT: bnez a0, .LBB151_4 ; RV32I-NEXT: .LBB151_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -12097,11 +11819,10 @@ define i32 @atomicrmw_min_i32_acquire(i32 *%a, i32 %b) nounwind { ; RV32I-NEXT: j .LBB151_1 ; RV32I-NEXT: .LBB151_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a3 -; RV32I-NEXT: lw s2, 16(sp) -; RV32I-NEXT: lw s1, 20(sp) -; RV32I-NEXT: lw s0, 24(sp) -; RV32I-NEXT: lw ra, 28(sp) -; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: lw s1, 4(sp) +; RV32I-NEXT: lw s0, 8(sp) +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV32IA-LABEL: atomicrmw_min_i32_acquire: @@ -12116,22 +11837,20 @@ define i32 @atomicrmw_min_i32_acquire(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lwu a3, 0(a0) -; RV64I-NEXT: mv s3, a1 +; RV64I-NEXT: mv s2, a1 ; RV64I-NEXT: sext.w s1, a1 -; RV64I-NEXT: addi s2, sp, 4 ; RV64I-NEXT: j .LBB151_2 ; RV64I-NEXT: .LBB151_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: 
Header=BB151_2 Depth=1 -; RV64I-NEXT: sw a3, 4(sp) +; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: addi a3, zero, 2 ; RV64I-NEXT: addi a4, zero, 2 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 4(sp) +; RV64I-NEXT: lw a3, 12(sp) ; RV64I-NEXT: bnez a0, .LBB151_4 ; RV64I-NEXT: .LBB151_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -12140,11 +11859,10 @@ define i32 @atomicrmw_min_i32_acquire(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: bge s1, a0, .LBB151_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB151_2 Depth=1 -; RV64I-NEXT: mv a2, s3 +; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB151_1 ; RV64I-NEXT: .LBB151_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -12163,25 +11881,23 @@ define i32 @atomicrmw_min_i32_acquire(i32 *%a, i32 %b) nounwind { define i32 @atomicrmw_min_i32_release(i32 *%a, i32 %b) nounwind { ; RV32I-LABEL: atomicrmw_min_i32_release: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: sw ra, 28(sp) -; RV32I-NEXT: sw s0, 24(sp) -; RV32I-NEXT: sw s1, 20(sp) -; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: sw s1, 4(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a3, 0(a0) ; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: addi s2, sp, 12 ; RV32I-NEXT: j .LBB152_2 ; RV32I-NEXT: .LBB152_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB152_2 Depth=1 -; RV32I-NEXT: sw a3, 12(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: addi a3, zero, 3 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 12(sp) +; RV32I-NEXT: lw a3, 0(sp) ; RV32I-NEXT: bnez a0, .LBB152_4 ; RV32I-NEXT: .LBB152_2: # 
%atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -12193,11 +11909,10 @@ define i32 @atomicrmw_min_i32_release(i32 *%a, i32 %b) nounwind { ; RV32I-NEXT: j .LBB152_1 ; RV32I-NEXT: .LBB152_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a3 -; RV32I-NEXT: lw s2, 16(sp) -; RV32I-NEXT: lw s1, 20(sp) -; RV32I-NEXT: lw s0, 24(sp) -; RV32I-NEXT: lw ra, 28(sp) -; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: lw s1, 4(sp) +; RV32I-NEXT: lw s0, 8(sp) +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV32IA-LABEL: atomicrmw_min_i32_release: @@ -12212,22 +11927,20 @@ define i32 @atomicrmw_min_i32_release(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lwu a3, 0(a0) -; RV64I-NEXT: mv s3, a1 +; RV64I-NEXT: mv s2, a1 ; RV64I-NEXT: sext.w s1, a1 -; RV64I-NEXT: addi s2, sp, 4 ; RV64I-NEXT: j .LBB152_2 ; RV64I-NEXT: .LBB152_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB152_2 Depth=1 -; RV64I-NEXT: sw a3, 4(sp) +; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: addi a3, zero, 3 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: mv a4, zero ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 4(sp) +; RV64I-NEXT: lw a3, 12(sp) ; RV64I-NEXT: bnez a0, .LBB152_4 ; RV64I-NEXT: .LBB152_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -12236,11 +11949,10 @@ define i32 @atomicrmw_min_i32_release(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: bge s1, a0, .LBB152_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB152_2 Depth=1 -; RV64I-NEXT: mv a2, s3 +; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB152_1 ; RV64I-NEXT: .LBB152_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -12259,25 +11971,23 @@ define i32 
@atomicrmw_min_i32_release(i32 *%a, i32 %b) nounwind { define i32 @atomicrmw_min_i32_acq_rel(i32 *%a, i32 %b) nounwind { ; RV32I-LABEL: atomicrmw_min_i32_acq_rel: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: sw ra, 28(sp) -; RV32I-NEXT: sw s0, 24(sp) -; RV32I-NEXT: sw s1, 20(sp) -; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: sw s1, 4(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a3, 0(a0) ; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: addi s2, sp, 12 ; RV32I-NEXT: j .LBB153_2 ; RV32I-NEXT: .LBB153_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB153_2 Depth=1 -; RV32I-NEXT: sw a3, 12(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: addi a3, zero, 4 ; RV32I-NEXT: addi a4, zero, 2 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 12(sp) +; RV32I-NEXT: lw a3, 0(sp) ; RV32I-NEXT: bnez a0, .LBB153_4 ; RV32I-NEXT: .LBB153_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -12289,11 +11999,10 @@ define i32 @atomicrmw_min_i32_acq_rel(i32 *%a, i32 %b) nounwind { ; RV32I-NEXT: j .LBB153_1 ; RV32I-NEXT: .LBB153_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a3 -; RV32I-NEXT: lw s2, 16(sp) -; RV32I-NEXT: lw s1, 20(sp) -; RV32I-NEXT: lw s0, 24(sp) -; RV32I-NEXT: lw ra, 28(sp) -; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: lw s1, 4(sp) +; RV32I-NEXT: lw s0, 8(sp) +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV32IA-LABEL: atomicrmw_min_i32_acq_rel: @@ -12308,22 +12017,20 @@ define i32 @atomicrmw_min_i32_acq_rel(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lwu a3, 0(a0) -; RV64I-NEXT: mv s3, a1 +; RV64I-NEXT: mv s2, a1 ; RV64I-NEXT: sext.w s1, a1 -; RV64I-NEXT: addi s2, sp, 4 ; RV64I-NEXT: j .LBB153_2 ; 
RV64I-NEXT: .LBB153_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB153_2 Depth=1 -; RV64I-NEXT: sw a3, 4(sp) +; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: addi a3, zero, 4 ; RV64I-NEXT: addi a4, zero, 2 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 4(sp) +; RV64I-NEXT: lw a3, 12(sp) ; RV64I-NEXT: bnez a0, .LBB153_4 ; RV64I-NEXT: .LBB153_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -12332,11 +12039,10 @@ define i32 @atomicrmw_min_i32_acq_rel(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: bge s1, a0, .LBB153_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB153_2 Depth=1 -; RV64I-NEXT: mv a2, s3 +; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB153_1 ; RV64I-NEXT: .LBB153_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -12355,25 +12061,23 @@ define i32 @atomicrmw_min_i32_acq_rel(i32 *%a, i32 %b) nounwind { define i32 @atomicrmw_min_i32_seq_cst(i32 *%a, i32 %b) nounwind { ; RV32I-LABEL: atomicrmw_min_i32_seq_cst: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: sw ra, 28(sp) -; RV32I-NEXT: sw s0, 24(sp) -; RV32I-NEXT: sw s1, 20(sp) -; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: sw s1, 4(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a3, 0(a0) ; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: addi s2, sp, 12 ; RV32I-NEXT: j .LBB154_2 ; RV32I-NEXT: .LBB154_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB154_2 Depth=1 -; RV32I-NEXT: sw a3, 12(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: addi a3, zero, 5 ; RV32I-NEXT: addi a4, zero, 5 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 12(sp) +; RV32I-NEXT: lw a3, 0(sp) ; 
RV32I-NEXT: bnez a0, .LBB154_4 ; RV32I-NEXT: .LBB154_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -12385,11 +12089,10 @@ define i32 @atomicrmw_min_i32_seq_cst(i32 *%a, i32 %b) nounwind { ; RV32I-NEXT: j .LBB154_1 ; RV32I-NEXT: .LBB154_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a3 -; RV32I-NEXT: lw s2, 16(sp) -; RV32I-NEXT: lw s1, 20(sp) -; RV32I-NEXT: lw s0, 24(sp) -; RV32I-NEXT: lw ra, 28(sp) -; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: lw s1, 4(sp) +; RV32I-NEXT: lw s0, 8(sp) +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV32IA-LABEL: atomicrmw_min_i32_seq_cst: @@ -12404,22 +12107,20 @@ define i32 @atomicrmw_min_i32_seq_cst(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lwu a3, 0(a0) -; RV64I-NEXT: mv s3, a1 +; RV64I-NEXT: mv s2, a1 ; RV64I-NEXT: sext.w s1, a1 -; RV64I-NEXT: addi s2, sp, 4 ; RV64I-NEXT: j .LBB154_2 ; RV64I-NEXT: .LBB154_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB154_2 Depth=1 -; RV64I-NEXT: sw a3, 4(sp) +; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: addi a3, zero, 5 ; RV64I-NEXT: addi a4, zero, 5 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 4(sp) +; RV64I-NEXT: lw a3, 12(sp) ; RV64I-NEXT: bnez a0, .LBB154_4 ; RV64I-NEXT: .LBB154_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -12428,11 +12129,10 @@ define i32 @atomicrmw_min_i32_seq_cst(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: bge s1, a0, .LBB154_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB154_2 Depth=1 -; RV64I-NEXT: mv a2, s3 +; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB154_1 ; RV64I-NEXT: .LBB154_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; 
RV64I-NEXT: ld s0, 32(sp) @@ -12451,25 +12151,23 @@ define i32 @atomicrmw_min_i32_seq_cst(i32 *%a, i32 %b) nounwind { define i32 @atomicrmw_umax_i32_monotonic(i32 *%a, i32 %b) nounwind { ; RV32I-LABEL: atomicrmw_umax_i32_monotonic: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: sw ra, 28(sp) -; RV32I-NEXT: sw s0, 24(sp) -; RV32I-NEXT: sw s1, 20(sp) -; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: sw s1, 4(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a3, 0(a0) ; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: addi s2, sp, 12 ; RV32I-NEXT: j .LBB155_2 ; RV32I-NEXT: .LBB155_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB155_2 Depth=1 -; RV32I-NEXT: sw a3, 12(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: mv a3, zero ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 12(sp) +; RV32I-NEXT: lw a3, 0(sp) ; RV32I-NEXT: bnez a0, .LBB155_4 ; RV32I-NEXT: .LBB155_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -12481,11 +12179,10 @@ define i32 @atomicrmw_umax_i32_monotonic(i32 *%a, i32 %b) nounwind { ; RV32I-NEXT: j .LBB155_1 ; RV32I-NEXT: .LBB155_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a3 -; RV32I-NEXT: lw s2, 16(sp) -; RV32I-NEXT: lw s1, 20(sp) -; RV32I-NEXT: lw s0, 24(sp) -; RV32I-NEXT: lw ra, 28(sp) -; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: lw s1, 4(sp) +; RV32I-NEXT: lw s0, 8(sp) +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV32IA-LABEL: atomicrmw_umax_i32_monotonic: @@ -12500,22 +12197,20 @@ define i32 @atomicrmw_umax_i32_monotonic(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lwu a3, 0(a0) -; RV64I-NEXT: mv s3, a1 +; RV64I-NEXT: mv s2, a1 ; RV64I-NEXT: sext.w 
s1, a1 -; RV64I-NEXT: addi s2, sp, 4 ; RV64I-NEXT: j .LBB155_2 ; RV64I-NEXT: .LBB155_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB155_2 Depth=1 -; RV64I-NEXT: sw a3, 4(sp) +; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: mv a3, zero ; RV64I-NEXT: mv a4, zero ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 4(sp) +; RV64I-NEXT: lw a3, 12(sp) ; RV64I-NEXT: bnez a0, .LBB155_4 ; RV64I-NEXT: .LBB155_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -12524,11 +12219,10 @@ define i32 @atomicrmw_umax_i32_monotonic(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: bltu s1, a0, .LBB155_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB155_2 Depth=1 -; RV64I-NEXT: mv a2, s3 +; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB155_1 ; RV64I-NEXT: .LBB155_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -12547,25 +12241,23 @@ define i32 @atomicrmw_umax_i32_monotonic(i32 *%a, i32 %b) nounwind { define i32 @atomicrmw_umax_i32_acquire(i32 *%a, i32 %b) nounwind { ; RV32I-LABEL: atomicrmw_umax_i32_acquire: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: sw ra, 28(sp) -; RV32I-NEXT: sw s0, 24(sp) -; RV32I-NEXT: sw s1, 20(sp) -; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: sw s1, 4(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a3, 0(a0) ; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: addi s2, sp, 12 ; RV32I-NEXT: j .LBB156_2 ; RV32I-NEXT: .LBB156_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB156_2 Depth=1 -; RV32I-NEXT: sw a3, 12(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: addi a3, zero, 2 ; RV32I-NEXT: addi a4, zero, 2 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: call 
__atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 12(sp) +; RV32I-NEXT: lw a3, 0(sp) ; RV32I-NEXT: bnez a0, .LBB156_4 ; RV32I-NEXT: .LBB156_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -12577,11 +12269,10 @@ define i32 @atomicrmw_umax_i32_acquire(i32 *%a, i32 %b) nounwind { ; RV32I-NEXT: j .LBB156_1 ; RV32I-NEXT: .LBB156_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a3 -; RV32I-NEXT: lw s2, 16(sp) -; RV32I-NEXT: lw s1, 20(sp) -; RV32I-NEXT: lw s0, 24(sp) -; RV32I-NEXT: lw ra, 28(sp) -; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: lw s1, 4(sp) +; RV32I-NEXT: lw s0, 8(sp) +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV32IA-LABEL: atomicrmw_umax_i32_acquire: @@ -12596,22 +12287,20 @@ define i32 @atomicrmw_umax_i32_acquire(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lwu a3, 0(a0) -; RV64I-NEXT: mv s3, a1 +; RV64I-NEXT: mv s2, a1 ; RV64I-NEXT: sext.w s1, a1 -; RV64I-NEXT: addi s2, sp, 4 ; RV64I-NEXT: j .LBB156_2 ; RV64I-NEXT: .LBB156_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB156_2 Depth=1 -; RV64I-NEXT: sw a3, 4(sp) +; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: addi a3, zero, 2 ; RV64I-NEXT: addi a4, zero, 2 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 4(sp) +; RV64I-NEXT: lw a3, 12(sp) ; RV64I-NEXT: bnez a0, .LBB156_4 ; RV64I-NEXT: .LBB156_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -12620,11 +12309,10 @@ define i32 @atomicrmw_umax_i32_acquire(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: bltu s1, a0, .LBB156_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB156_2 Depth=1 -; RV64I-NEXT: mv a2, s3 +; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB156_1 ; RV64I-NEXT: .LBB156_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 
-; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -12643,25 +12331,23 @@ define i32 @atomicrmw_umax_i32_acquire(i32 *%a, i32 %b) nounwind { define i32 @atomicrmw_umax_i32_release(i32 *%a, i32 %b) nounwind { ; RV32I-LABEL: atomicrmw_umax_i32_release: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: sw ra, 28(sp) -; RV32I-NEXT: sw s0, 24(sp) -; RV32I-NEXT: sw s1, 20(sp) -; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: sw s1, 4(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a3, 0(a0) ; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: addi s2, sp, 12 ; RV32I-NEXT: j .LBB157_2 ; RV32I-NEXT: .LBB157_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB157_2 Depth=1 -; RV32I-NEXT: sw a3, 12(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: addi a3, zero, 3 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 12(sp) +; RV32I-NEXT: lw a3, 0(sp) ; RV32I-NEXT: bnez a0, .LBB157_4 ; RV32I-NEXT: .LBB157_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -12673,11 +12359,10 @@ define i32 @atomicrmw_umax_i32_release(i32 *%a, i32 %b) nounwind { ; RV32I-NEXT: j .LBB157_1 ; RV32I-NEXT: .LBB157_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a3 -; RV32I-NEXT: lw s2, 16(sp) -; RV32I-NEXT: lw s1, 20(sp) -; RV32I-NEXT: lw s0, 24(sp) -; RV32I-NEXT: lw ra, 28(sp) -; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: lw s1, 4(sp) +; RV32I-NEXT: lw s0, 8(sp) +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV32IA-LABEL: atomicrmw_umax_i32_release: @@ -12692,22 +12377,20 @@ define i32 @atomicrmw_umax_i32_release(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lwu 
a3, 0(a0) -; RV64I-NEXT: mv s3, a1 +; RV64I-NEXT: mv s2, a1 ; RV64I-NEXT: sext.w s1, a1 -; RV64I-NEXT: addi s2, sp, 4 ; RV64I-NEXT: j .LBB157_2 ; RV64I-NEXT: .LBB157_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB157_2 Depth=1 -; RV64I-NEXT: sw a3, 4(sp) +; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: addi a3, zero, 3 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: mv a4, zero ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 4(sp) +; RV64I-NEXT: lw a3, 12(sp) ; RV64I-NEXT: bnez a0, .LBB157_4 ; RV64I-NEXT: .LBB157_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -12716,11 +12399,10 @@ define i32 @atomicrmw_umax_i32_release(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: bltu s1, a0, .LBB157_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB157_2 Depth=1 -; RV64I-NEXT: mv a2, s3 +; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB157_1 ; RV64I-NEXT: .LBB157_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -12739,25 +12421,23 @@ define i32 @atomicrmw_umax_i32_release(i32 *%a, i32 %b) nounwind { define i32 @atomicrmw_umax_i32_acq_rel(i32 *%a, i32 %b) nounwind { ; RV32I-LABEL: atomicrmw_umax_i32_acq_rel: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: sw ra, 28(sp) -; RV32I-NEXT: sw s0, 24(sp) -; RV32I-NEXT: sw s1, 20(sp) -; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: sw s1, 4(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a3, 0(a0) ; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: addi s2, sp, 12 ; RV32I-NEXT: j .LBB158_2 ; RV32I-NEXT: .LBB158_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB158_2 Depth=1 -; RV32I-NEXT: sw a3, 12(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: addi a3, zero, 4 ; RV32I-NEXT: addi a4, zero, 2 ; 
RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 12(sp) +; RV32I-NEXT: lw a3, 0(sp) ; RV32I-NEXT: bnez a0, .LBB158_4 ; RV32I-NEXT: .LBB158_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -12769,11 +12449,10 @@ define i32 @atomicrmw_umax_i32_acq_rel(i32 *%a, i32 %b) nounwind { ; RV32I-NEXT: j .LBB158_1 ; RV32I-NEXT: .LBB158_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a3 -; RV32I-NEXT: lw s2, 16(sp) -; RV32I-NEXT: lw s1, 20(sp) -; RV32I-NEXT: lw s0, 24(sp) -; RV32I-NEXT: lw ra, 28(sp) -; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: lw s1, 4(sp) +; RV32I-NEXT: lw s0, 8(sp) +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV32IA-LABEL: atomicrmw_umax_i32_acq_rel: @@ -12788,22 +12467,20 @@ define i32 @atomicrmw_umax_i32_acq_rel(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lwu a3, 0(a0) -; RV64I-NEXT: mv s3, a1 +; RV64I-NEXT: mv s2, a1 ; RV64I-NEXT: sext.w s1, a1 -; RV64I-NEXT: addi s2, sp, 4 ; RV64I-NEXT: j .LBB158_2 ; RV64I-NEXT: .LBB158_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB158_2 Depth=1 -; RV64I-NEXT: sw a3, 4(sp) +; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: addi a3, zero, 4 ; RV64I-NEXT: addi a4, zero, 2 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 4(sp) +; RV64I-NEXT: lw a3, 12(sp) ; RV64I-NEXT: bnez a0, .LBB158_4 ; RV64I-NEXT: .LBB158_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -12812,11 +12489,10 @@ define i32 @atomicrmw_umax_i32_acq_rel(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: bltu s1, a0, .LBB158_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB158_2 Depth=1 -; RV64I-NEXT: mv a2, s3 +; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB158_1 
; RV64I-NEXT: .LBB158_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -12835,25 +12511,23 @@ define i32 @atomicrmw_umax_i32_acq_rel(i32 *%a, i32 %b) nounwind { define i32 @atomicrmw_umax_i32_seq_cst(i32 *%a, i32 %b) nounwind { ; RV32I-LABEL: atomicrmw_umax_i32_seq_cst: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: sw ra, 28(sp) -; RV32I-NEXT: sw s0, 24(sp) -; RV32I-NEXT: sw s1, 20(sp) -; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: sw s1, 4(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a3, 0(a0) ; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: addi s2, sp, 12 ; RV32I-NEXT: j .LBB159_2 ; RV32I-NEXT: .LBB159_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB159_2 Depth=1 -; RV32I-NEXT: sw a3, 12(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: addi a3, zero, 5 ; RV32I-NEXT: addi a4, zero, 5 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 12(sp) +; RV32I-NEXT: lw a3, 0(sp) ; RV32I-NEXT: bnez a0, .LBB159_4 ; RV32I-NEXT: .LBB159_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -12865,11 +12539,10 @@ define i32 @atomicrmw_umax_i32_seq_cst(i32 *%a, i32 %b) nounwind { ; RV32I-NEXT: j .LBB159_1 ; RV32I-NEXT: .LBB159_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a3 -; RV32I-NEXT: lw s2, 16(sp) -; RV32I-NEXT: lw s1, 20(sp) -; RV32I-NEXT: lw s0, 24(sp) -; RV32I-NEXT: lw ra, 28(sp) -; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: lw s1, 4(sp) +; RV32I-NEXT: lw s0, 8(sp) +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV32IA-LABEL: atomicrmw_umax_i32_seq_cst: @@ -12884,22 +12557,20 @@ define i32 @atomicrmw_umax_i32_seq_cst(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 
16(sp) -; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lwu a3, 0(a0) -; RV64I-NEXT: mv s3, a1 +; RV64I-NEXT: mv s2, a1 ; RV64I-NEXT: sext.w s1, a1 -; RV64I-NEXT: addi s2, sp, 4 ; RV64I-NEXT: j .LBB159_2 ; RV64I-NEXT: .LBB159_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB159_2 Depth=1 -; RV64I-NEXT: sw a3, 4(sp) +; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: addi a3, zero, 5 ; RV64I-NEXT: addi a4, zero, 5 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 4(sp) +; RV64I-NEXT: lw a3, 12(sp) ; RV64I-NEXT: bnez a0, .LBB159_4 ; RV64I-NEXT: .LBB159_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -12908,11 +12579,10 @@ define i32 @atomicrmw_umax_i32_seq_cst(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: bltu s1, a0, .LBB159_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB159_2 Depth=1 -; RV64I-NEXT: mv a2, s3 +; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB159_1 ; RV64I-NEXT: .LBB159_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -12931,25 +12601,23 @@ define i32 @atomicrmw_umax_i32_seq_cst(i32 *%a, i32 %b) nounwind { define i32 @atomicrmw_umin_i32_monotonic(i32 *%a, i32 %b) nounwind { ; RV32I-LABEL: atomicrmw_umin_i32_monotonic: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: sw ra, 28(sp) -; RV32I-NEXT: sw s0, 24(sp) -; RV32I-NEXT: sw s1, 20(sp) -; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: sw s1, 4(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a3, 0(a0) ; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: addi s2, sp, 12 ; RV32I-NEXT: j .LBB160_2 ; RV32I-NEXT: .LBB160_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB160_2 Depth=1 -; RV32I-NEXT: sw a3, 12(sp) +; RV32I-NEXT: sw a3, 0(sp) +; 
RV32I-NEXT: mv a1, sp ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: mv a3, zero ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 12(sp) +; RV32I-NEXT: lw a3, 0(sp) ; RV32I-NEXT: bnez a0, .LBB160_4 ; RV32I-NEXT: .LBB160_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -12961,11 +12629,10 @@ define i32 @atomicrmw_umin_i32_monotonic(i32 *%a, i32 %b) nounwind { ; RV32I-NEXT: j .LBB160_1 ; RV32I-NEXT: .LBB160_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a3 -; RV32I-NEXT: lw s2, 16(sp) -; RV32I-NEXT: lw s1, 20(sp) -; RV32I-NEXT: lw s0, 24(sp) -; RV32I-NEXT: lw ra, 28(sp) -; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: lw s1, 4(sp) +; RV32I-NEXT: lw s0, 8(sp) +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV32IA-LABEL: atomicrmw_umin_i32_monotonic: @@ -12980,22 +12647,20 @@ define i32 @atomicrmw_umin_i32_monotonic(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lwu a3, 0(a0) -; RV64I-NEXT: mv s3, a1 +; RV64I-NEXT: mv s2, a1 ; RV64I-NEXT: sext.w s1, a1 -; RV64I-NEXT: addi s2, sp, 4 ; RV64I-NEXT: j .LBB160_2 ; RV64I-NEXT: .LBB160_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB160_2 Depth=1 -; RV64I-NEXT: sw a3, 4(sp) +; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: mv a3, zero ; RV64I-NEXT: mv a4, zero ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 4(sp) +; RV64I-NEXT: lw a3, 12(sp) ; RV64I-NEXT: bnez a0, .LBB160_4 ; RV64I-NEXT: .LBB160_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -13004,11 +12669,10 @@ define i32 @atomicrmw_umin_i32_monotonic(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: bgeu s1, a0, .LBB160_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB160_2 Depth=1 -; 
RV64I-NEXT: mv a2, s3 +; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB160_1 ; RV64I-NEXT: .LBB160_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -13027,25 +12691,23 @@ define i32 @atomicrmw_umin_i32_monotonic(i32 *%a, i32 %b) nounwind { define i32 @atomicrmw_umin_i32_acquire(i32 *%a, i32 %b) nounwind { ; RV32I-LABEL: atomicrmw_umin_i32_acquire: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: sw ra, 28(sp) -; RV32I-NEXT: sw s0, 24(sp) -; RV32I-NEXT: sw s1, 20(sp) -; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: sw s1, 4(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a3, 0(a0) ; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: addi s2, sp, 12 ; RV32I-NEXT: j .LBB161_2 ; RV32I-NEXT: .LBB161_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB161_2 Depth=1 -; RV32I-NEXT: sw a3, 12(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: addi a3, zero, 2 ; RV32I-NEXT: addi a4, zero, 2 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 12(sp) +; RV32I-NEXT: lw a3, 0(sp) ; RV32I-NEXT: bnez a0, .LBB161_4 ; RV32I-NEXT: .LBB161_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -13057,11 +12719,10 @@ define i32 @atomicrmw_umin_i32_acquire(i32 *%a, i32 %b) nounwind { ; RV32I-NEXT: j .LBB161_1 ; RV32I-NEXT: .LBB161_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a3 -; RV32I-NEXT: lw s2, 16(sp) -; RV32I-NEXT: lw s1, 20(sp) -; RV32I-NEXT: lw s0, 24(sp) -; RV32I-NEXT: lw ra, 28(sp) -; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: lw s1, 4(sp) +; RV32I-NEXT: lw s0, 8(sp) +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV32IA-LABEL: atomicrmw_umin_i32_acquire: @@ -13076,22 +12737,20 @@ define i32 @atomicrmw_umin_i32_acquire(i32 *%a, i32 %b) nounwind { ; 
RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lwu a3, 0(a0) -; RV64I-NEXT: mv s3, a1 +; RV64I-NEXT: mv s2, a1 ; RV64I-NEXT: sext.w s1, a1 -; RV64I-NEXT: addi s2, sp, 4 ; RV64I-NEXT: j .LBB161_2 ; RV64I-NEXT: .LBB161_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB161_2 Depth=1 -; RV64I-NEXT: sw a3, 4(sp) +; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: addi a3, zero, 2 ; RV64I-NEXT: addi a4, zero, 2 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 4(sp) +; RV64I-NEXT: lw a3, 12(sp) ; RV64I-NEXT: bnez a0, .LBB161_4 ; RV64I-NEXT: .LBB161_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -13100,11 +12759,10 @@ define i32 @atomicrmw_umin_i32_acquire(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: bgeu s1, a0, .LBB161_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB161_2 Depth=1 -; RV64I-NEXT: mv a2, s3 +; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB161_1 ; RV64I-NEXT: .LBB161_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -13123,27 +12781,25 @@ define i32 @atomicrmw_umin_i32_acquire(i32 *%a, i32 %b) nounwind { define i32 @atomicrmw_umin_i32_release(i32 *%a, i32 %b) nounwind { ; RV32I-LABEL: atomicrmw_umin_i32_release: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: sw ra, 28(sp) -; RV32I-NEXT: sw s0, 24(sp) -; RV32I-NEXT: sw s1, 20(sp) -; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: sw s1, 4(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a3, 0(a0) ; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: addi s2, sp, 12 ; RV32I-NEXT: j .LBB162_2 ; RV32I-NEXT: .LBB162_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB162_2 
Depth=1 -; RV32I-NEXT: sw a3, 12(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: addi a3, zero, 3 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 12(sp) +; RV32I-NEXT: lw a3, 0(sp) ; RV32I-NEXT: bnez a0, .LBB162_4 -; RV32I-NEXT: LBB162_2: # %atomicrmw.start +; RV32I-NEXT: .LBB162_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 ; RV32I-NEXT: mv a2, a3 ; RV32I-NEXT: bgeu s1, a3, .LBB162_1 @@ -13153,11 +12809,10 @@ define i32 @atomicrmw_umin_i32_release(i32 *%a, i32 %b) nounwind { ; RV32I-NEXT: j .LBB162_1 ; RV32I-NEXT: .LBB162_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a3 -; RV32I-NEXT: lw s2, 16(sp) -; RV32I-NEXT: lw s1, 20(sp) -; RV32I-NEXT: lw s0, 24(sp) -; RV32I-NEXT: lw ra, 28(sp) -; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: lw s1, 4(sp) +; RV32I-NEXT: lw s0, 8(sp) +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV32IA-LABEL: atomicrmw_umin_i32_release: @@ -13172,22 +12827,20 @@ define i32 @atomicrmw_umin_i32_release(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lwu a3, 0(a0) -; RV64I-NEXT: mv s3, a1 +; RV64I-NEXT: mv s2, a1 ; RV64I-NEXT: sext.w s1, a1 -; RV64I-NEXT: addi s2, sp, 4 ; RV64I-NEXT: j .LBB162_2 ; RV64I-NEXT: .LBB162_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB162_2 Depth=1 -; RV64I-NEXT: sw a3, 4(sp) +; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: addi a3, zero, 3 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: mv a4, zero ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 4(sp) +; RV64I-NEXT: lw a3, 12(sp) ; RV64I-NEXT: bnez a0, .LBB162_4 ; RV64I-NEXT: .LBB162_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -13196,11 +12849,10 @@ define i32 
@atomicrmw_umin_i32_release(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: bgeu s1, a0, .LBB162_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB162_2 Depth=1 -; RV64I-NEXT: mv a2, s3 +; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB162_1 ; RV64I-NEXT: .LBB162_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -13219,25 +12871,23 @@ define i32 @atomicrmw_umin_i32_release(i32 *%a, i32 %b) nounwind { define i32 @atomicrmw_umin_i32_acq_rel(i32 *%a, i32 %b) nounwind { ; RV32I-LABEL: atomicrmw_umin_i32_acq_rel: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: sw ra, 28(sp) -; RV32I-NEXT: sw s0, 24(sp) -; RV32I-NEXT: sw s1, 20(sp) -; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: sw s1, 4(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a3, 0(a0) ; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: addi s2, sp, 12 ; RV32I-NEXT: j .LBB163_2 ; RV32I-NEXT: .LBB163_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB163_2 Depth=1 -; RV32I-NEXT: sw a3, 12(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: addi a3, zero, 4 ; RV32I-NEXT: addi a4, zero, 2 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 12(sp) +; RV32I-NEXT: lw a3, 0(sp) ; RV32I-NEXT: bnez a0, .LBB163_4 ; RV32I-NEXT: .LBB163_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -13249,11 +12899,10 @@ define i32 @atomicrmw_umin_i32_acq_rel(i32 *%a, i32 %b) nounwind { ; RV32I-NEXT: j .LBB163_1 ; RV32I-NEXT: .LBB163_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a3 -; RV32I-NEXT: lw s2, 16(sp) -; RV32I-NEXT: lw s1, 20(sp) -; RV32I-NEXT: lw s0, 24(sp) -; RV32I-NEXT: lw ra, 28(sp) -; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: lw s1, 4(sp) +; RV32I-NEXT: lw s0, 8(sp) +; RV32I-NEXT: lw ra, 12(sp) +; 
RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV32IA-LABEL: atomicrmw_umin_i32_acq_rel: @@ -13268,22 +12917,20 @@ define i32 @atomicrmw_umin_i32_acq_rel(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lwu a3, 0(a0) -; RV64I-NEXT: mv s3, a1 +; RV64I-NEXT: mv s2, a1 ; RV64I-NEXT: sext.w s1, a1 -; RV64I-NEXT: addi s2, sp, 4 ; RV64I-NEXT: j .LBB163_2 ; RV64I-NEXT: .LBB163_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB163_2 Depth=1 -; RV64I-NEXT: sw a3, 4(sp) +; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: addi a3, zero, 4 ; RV64I-NEXT: addi a4, zero, 2 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 4(sp) +; RV64I-NEXT: lw a3, 12(sp) ; RV64I-NEXT: bnez a0, .LBB163_4 ; RV64I-NEXT: .LBB163_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -13292,11 +12939,10 @@ define i32 @atomicrmw_umin_i32_acq_rel(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: bgeu s1, a0, .LBB163_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB163_2 Depth=1 -; RV64I-NEXT: mv a2, s3 +; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB163_1 ; RV64I-NEXT: .LBB163_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -13312,28 +12958,26 @@ define i32 @atomicrmw_umin_i32_acq_rel(i32 *%a, i32 %b) nounwind { ret i32 %1 } -define i32 @atomicrmw_umin_i32_seq_cst(i32 *%a, i32 %b) nounwind { -; RV32I-LABEL: atomicrmw_umin_i32_seq_cst: -; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: sw ra, 28(sp) -; RV32I-NEXT: sw s0, 24(sp) -; RV32I-NEXT: sw s1, 20(sp) -; RV32I-NEXT: sw s2, 16(sp) +define i32 @atomicrmw_umin_i32_seq_cst(i32 *%a, i32 %b) nounwind { +; RV32I-LABEL: atomicrmw_umin_i32_seq_cst: +; RV32I: # %bb.0: +; 
RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: sw s1, 4(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a3, 0(a0) ; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: addi s2, sp, 12 ; RV32I-NEXT: j .LBB164_2 ; RV32I-NEXT: .LBB164_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB164_2 Depth=1 -; RV32I-NEXT: sw a3, 12(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: addi a3, zero, 5 ; RV32I-NEXT: addi a4, zero, 5 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 12(sp) +; RV32I-NEXT: lw a3, 0(sp) ; RV32I-NEXT: bnez a0, .LBB164_4 ; RV32I-NEXT: .LBB164_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -13345,11 +12989,10 @@ define i32 @atomicrmw_umin_i32_seq_cst(i32 *%a, i32 %b) nounwind { ; RV32I-NEXT: j .LBB164_1 ; RV32I-NEXT: .LBB164_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a3 -; RV32I-NEXT: lw s2, 16(sp) -; RV32I-NEXT: lw s1, 20(sp) -; RV32I-NEXT: lw s0, 24(sp) -; RV32I-NEXT: lw ra, 28(sp) -; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: lw s1, 4(sp) +; RV32I-NEXT: lw s0, 8(sp) +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV32IA-LABEL: atomicrmw_umin_i32_seq_cst: @@ -13364,22 +13007,20 @@ define i32 @atomicrmw_umin_i32_seq_cst(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lwu a3, 0(a0) -; RV64I-NEXT: mv s3, a1 +; RV64I-NEXT: mv s2, a1 ; RV64I-NEXT: sext.w s1, a1 -; RV64I-NEXT: addi s2, sp, 4 ; RV64I-NEXT: j .LBB164_2 ; RV64I-NEXT: .LBB164_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB164_2 Depth=1 -; RV64I-NEXT: sw a3, 4(sp) +; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: addi a3, zero, 5 ; RV64I-NEXT: addi a4, zero, 5 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call 
__atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 4(sp) +; RV64I-NEXT: lw a3, 12(sp) ; RV64I-NEXT: bnez a0, .LBB164_4 ; RV64I-NEXT: .LBB164_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -13388,11 +13029,10 @@ define i32 @atomicrmw_umin_i32_seq_cst(i32 *%a, i32 %b) nounwind { ; RV64I-NEXT: bgeu s1, a0, .LBB164_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB164_2 Depth=1 -; RV64I-NEXT: mv a2, s3 +; RV64I-NEXT: mv a2, s2 ; RV64I-NEXT: j .LBB164_1 ; RV64I-NEXT: .LBB164_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) ; RV64I-NEXT: ld s0, 32(sp) @@ -14821,25 +14461,23 @@ define i64 @atomicrmw_max_i64_monotonic(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: lw a4, 0(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s3, sp ; RV32I-NEXT: j .LBB200_2 ; RV32I-NEXT: .LBB200_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB200_2 Depth=1 -; RV32I-NEXT: sw a4, 0(sp) -; RV32I-NEXT: sw a5, 4(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: mv a5, zero ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 4(sp) -; RV32I-NEXT: lw a4, 0(sp) +; RV32I-NEXT: lw a5, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) ; RV32I-NEXT: bnez a0, .LBB200_7 ; RV32I-NEXT: .LBB200_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -14848,7 +14486,7 @@ define i64 @atomicrmw_max_i64_monotonic(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: # in Loop: Header=BB200_2 Depth=1 ; RV32I-NEXT: slt a0, s1, a5 ; RV32I-NEXT: j .LBB200_5 -; RV32I-NEXT: .LBB200_4: +; RV32I-NEXT: .LBB200_4: # in Loop: Header=BB200_2 Depth=1 ; RV32I-NEXT: sltu a0, 
s2, a4 ; RV32I-NEXT: .LBB200_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB200_2 Depth=1 @@ -14863,7 +14501,6 @@ define i64 @atomicrmw_max_i64_monotonic(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: .LBB200_7: # %atomicrmw.end ; RV32I-NEXT: mv a0, a4 ; RV32I-NEXT: mv a1, a5 -; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -14878,25 +14515,23 @@ define i64 @atomicrmw_max_i64_monotonic(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) ; RV32IA-NEXT: sw s1, 20(sp) ; RV32IA-NEXT: sw s2, 16(sp) -; RV32IA-NEXT: sw s3, 12(sp) ; RV32IA-NEXT: mv s0, a0 ; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: lw a4, 0(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 -; RV32IA-NEXT: mv s3, sp ; RV32IA-NEXT: j .LBB200_2 ; RV32IA-NEXT: .LBB200_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB200_2 Depth=1 -; RV32IA-NEXT: sw a4, 0(sp) -; RV32IA-NEXT: sw a5, 4(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: mv a0, s0 -; RV32IA-NEXT: mv a1, s3 ; RV32IA-NEXT: mv a4, zero ; RV32IA-NEXT: mv a5, zero ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 4(sp) -; RV32IA-NEXT: lw a4, 0(sp) +; RV32IA-NEXT: lw a5, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) ; RV32IA-NEXT: bnez a0, .LBB200_7 ; RV32IA-NEXT: .LBB200_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 @@ -14905,7 +14540,7 @@ define i64 @atomicrmw_max_i64_monotonic(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: # in Loop: Header=BB200_2 Depth=1 ; RV32IA-NEXT: slt a0, s1, a5 ; RV32IA-NEXT: j .LBB200_5 -; RV32IA-NEXT: .LBB200_4: +; RV32IA-NEXT: .LBB200_4: # in Loop: Header=BB200_2 Depth=1 ; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB200_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB200_2 Depth=1 @@ -14920,7 +14555,6 @@ define i64 @atomicrmw_max_i64_monotonic(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: .LBB200_7: # %atomicrmw.end ; RV32IA-NEXT: mv a0, 
a4 ; RV32IA-NEXT: mv a1, a5 -; RV32IA-NEXT: lw s3, 12(sp) ; RV32IA-NEXT: lw s2, 16(sp) ; RV32IA-NEXT: lw s1, 20(sp) ; RV32IA-NEXT: lw s0, 24(sp) @@ -14930,25 +14564,23 @@ define i64 @atomicrmw_max_i64_monotonic(i64 *%a, i64 %b) nounwind { ; ; RV64I-LABEL: atomicrmw_max_i64_monotonic: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: sd ra, 40(sp) -; RV64I-NEXT: sd s0, 32(sp) -; RV64I-NEXT: sd s1, 24(sp) -; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: sd ra, 24(sp) +; RV64I-NEXT: sd s0, 16(sp) +; RV64I-NEXT: sd s1, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: ld a3, 0(a0) ; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: addi s2, sp, 8 ; RV64I-NEXT: j .LBB200_2 ; RV64I-NEXT: .LBB200_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB200_2 Depth=1 -; RV64I-NEXT: sd a3, 8(sp) +; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: mv a3, zero ; RV64I-NEXT: mv a4, zero ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 8(sp) +; RV64I-NEXT: ld a3, 0(sp) ; RV64I-NEXT: bnez a0, .LBB200_4 ; RV64I-NEXT: .LBB200_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -14960,11 +14592,10 @@ define i64 @atomicrmw_max_i64_monotonic(i64 *%a, i64 %b) nounwind { ; RV64I-NEXT: j .LBB200_1 ; RV64I-NEXT: .LBB200_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s2, 16(sp) -; RV64I-NEXT: ld s1, 24(sp) -; RV64I-NEXT: ld s0, 32(sp) -; RV64I-NEXT: ld ra, 40(sp) -; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: ld s1, 8(sp) +; RV64I-NEXT: ld s0, 16(sp) +; RV64I-NEXT: ld ra, 24(sp) +; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret ; ; RV64IA-LABEL: atomicrmw_max_i64_monotonic: @@ -14983,25 +14614,23 @@ define i64 @atomicrmw_max_i64_acquire(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: lw a4, 
0(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s3, sp ; RV32I-NEXT: j .LBB201_2 ; RV32I-NEXT: .LBB201_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB201_2 Depth=1 -; RV32I-NEXT: sw a4, 0(sp) -; RV32I-NEXT: sw a5, 4(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: addi a4, zero, 2 ; RV32I-NEXT: addi a5, zero, 2 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 4(sp) -; RV32I-NEXT: lw a4, 0(sp) +; RV32I-NEXT: lw a5, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) ; RV32I-NEXT: bnez a0, .LBB201_7 ; RV32I-NEXT: .LBB201_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -15010,7 +14639,7 @@ define i64 @atomicrmw_max_i64_acquire(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: # in Loop: Header=BB201_2 Depth=1 ; RV32I-NEXT: slt a0, s1, a5 ; RV32I-NEXT: j .LBB201_5 -; RV32I-NEXT: .LBB201_4: +; RV32I-NEXT: .LBB201_4: # in Loop: Header=BB201_2 Depth=1 ; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB201_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB201_2 Depth=1 @@ -15025,7 +14654,6 @@ define i64 @atomicrmw_max_i64_acquire(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: .LBB201_7: # %atomicrmw.end ; RV32I-NEXT: mv a0, a4 ; RV32I-NEXT: mv a1, a5 -; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -15040,25 +14668,23 @@ define i64 @atomicrmw_max_i64_acquire(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) ; RV32IA-NEXT: sw s1, 20(sp) ; RV32IA-NEXT: sw s2, 16(sp) -; RV32IA-NEXT: sw s3, 12(sp) ; RV32IA-NEXT: mv s0, a0 ; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: lw a4, 0(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 -; RV32IA-NEXT: mv s3, sp ; RV32IA-NEXT: j .LBB201_2 ; RV32IA-NEXT: .LBB201_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB201_2 Depth=1 -; RV32IA-NEXT: sw a4, 0(sp) -; RV32IA-NEXT: sw a5, 4(sp) +; RV32IA-NEXT: sw a4, 
8(sp) +; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: addi a4, zero, 2 ; RV32IA-NEXT: addi a5, zero, 2 ; RV32IA-NEXT: mv a0, s0 -; RV32IA-NEXT: mv a1, s3 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 4(sp) -; RV32IA-NEXT: lw a4, 0(sp) +; RV32IA-NEXT: lw a5, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) ; RV32IA-NEXT: bnez a0, .LBB201_7 ; RV32IA-NEXT: .LBB201_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 @@ -15067,7 +14693,7 @@ define i64 @atomicrmw_max_i64_acquire(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: # in Loop: Header=BB201_2 Depth=1 ; RV32IA-NEXT: slt a0, s1, a5 ; RV32IA-NEXT: j .LBB201_5 -; RV32IA-NEXT: .LBB201_4: +; RV32IA-NEXT: .LBB201_4: # in Loop: Header=BB201_2 Depth=1 ; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB201_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB201_2 Depth=1 @@ -15082,7 +14708,6 @@ define i64 @atomicrmw_max_i64_acquire(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: .LBB201_7: # %atomicrmw.end ; RV32IA-NEXT: mv a0, a4 ; RV32IA-NEXT: mv a1, a5 -; RV32IA-NEXT: lw s3, 12(sp) ; RV32IA-NEXT: lw s2, 16(sp) ; RV32IA-NEXT: lw s1, 20(sp) ; RV32IA-NEXT: lw s0, 24(sp) @@ -15092,25 +14717,23 @@ define i64 @atomicrmw_max_i64_acquire(i64 *%a, i64 %b) nounwind { ; ; RV64I-LABEL: atomicrmw_max_i64_acquire: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: sd ra, 40(sp) -; RV64I-NEXT: sd s0, 32(sp) -; RV64I-NEXT: sd s1, 24(sp) -; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: sd ra, 24(sp) +; RV64I-NEXT: sd s0, 16(sp) +; RV64I-NEXT: sd s1, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: ld a3, 0(a0) ; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: addi s2, sp, 8 ; RV64I-NEXT: j .LBB201_2 ; RV64I-NEXT: .LBB201_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB201_2 Depth=1 -; RV64I-NEXT: sd a3, 8(sp) +; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: addi a3, zero, 2 ; RV64I-NEXT: addi a4, zero, 2 ; RV64I-NEXT: mv a0, s0 -; 
RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 8(sp) +; RV64I-NEXT: ld a3, 0(sp) ; RV64I-NEXT: bnez a0, .LBB201_4 ; RV64I-NEXT: .LBB201_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -15122,11 +14745,10 @@ define i64 @atomicrmw_max_i64_acquire(i64 *%a, i64 %b) nounwind { ; RV64I-NEXT: j .LBB201_1 ; RV64I-NEXT: .LBB201_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s2, 16(sp) -; RV64I-NEXT: ld s1, 24(sp) -; RV64I-NEXT: ld s0, 32(sp) -; RV64I-NEXT: ld ra, 40(sp) -; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: ld s1, 8(sp) +; RV64I-NEXT: ld s0, 16(sp) +; RV64I-NEXT: ld ra, 24(sp) +; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret ; ; RV64IA-LABEL: atomicrmw_max_i64_acquire: @@ -15145,25 +14767,23 @@ define i64 @atomicrmw_max_i64_release(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: lw a4, 0(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s3, sp ; RV32I-NEXT: j .LBB202_2 ; RV32I-NEXT: .LBB202_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB202_2 Depth=1 -; RV32I-NEXT: sw a4, 0(sp) -; RV32I-NEXT: sw a5, 4(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: addi a4, zero, 3 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: mv a5, zero ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 4(sp) -; RV32I-NEXT: lw a4, 0(sp) +; RV32I-NEXT: lw a5, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) ; RV32I-NEXT: bnez a0, .LBB202_7 ; RV32I-NEXT: .LBB202_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -15172,7 +14792,7 @@ define i64 @atomicrmw_max_i64_release(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: # in Loop: Header=BB202_2 Depth=1 ; RV32I-NEXT: slt a0, s1, a5 ; RV32I-NEXT: j .LBB202_5 -; RV32I-NEXT: .LBB202_4: +; 
RV32I-NEXT: .LBB202_4: # in Loop: Header=BB202_2 Depth=1 ; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB202_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB202_2 Depth=1 @@ -15187,7 +14807,6 @@ define i64 @atomicrmw_max_i64_release(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: .LBB202_7: # %atomicrmw.end ; RV32I-NEXT: mv a0, a4 ; RV32I-NEXT: mv a1, a5 -; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -15202,25 +14821,23 @@ define i64 @atomicrmw_max_i64_release(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) ; RV32IA-NEXT: sw s1, 20(sp) ; RV32IA-NEXT: sw s2, 16(sp) -; RV32IA-NEXT: sw s3, 12(sp) ; RV32IA-NEXT: mv s0, a0 ; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: lw a4, 0(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 -; RV32IA-NEXT: mv s3, sp ; RV32IA-NEXT: j .LBB202_2 ; RV32IA-NEXT: .LBB202_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB202_2 Depth=1 -; RV32IA-NEXT: sw a4, 0(sp) -; RV32IA-NEXT: sw a5, 4(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: addi a4, zero, 3 ; RV32IA-NEXT: mv a0, s0 -; RV32IA-NEXT: mv a1, s3 ; RV32IA-NEXT: mv a5, zero ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 4(sp) -; RV32IA-NEXT: lw a4, 0(sp) +; RV32IA-NEXT: lw a5, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) ; RV32IA-NEXT: bnez a0, .LBB202_7 ; RV32IA-NEXT: .LBB202_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 @@ -15229,7 +14846,7 @@ define i64 @atomicrmw_max_i64_release(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: # in Loop: Header=BB202_2 Depth=1 ; RV32IA-NEXT: slt a0, s1, a5 ; RV32IA-NEXT: j .LBB202_5 -; RV32IA-NEXT: .LBB202_4: +; RV32IA-NEXT: .LBB202_4: # in Loop: Header=BB202_2 Depth=1 ; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB202_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB202_2 Depth=1 @@ -15244,7 +14861,6 @@ define i64 @atomicrmw_max_i64_release(i64 *%a, i64 %b) 
nounwind { ; RV32IA-NEXT: .LBB202_7: # %atomicrmw.end ; RV32IA-NEXT: mv a0, a4 ; RV32IA-NEXT: mv a1, a5 -; RV32IA-NEXT: lw s3, 12(sp) ; RV32IA-NEXT: lw s2, 16(sp) ; RV32IA-NEXT: lw s1, 20(sp) ; RV32IA-NEXT: lw s0, 24(sp) @@ -15254,25 +14870,23 @@ define i64 @atomicrmw_max_i64_release(i64 *%a, i64 %b) nounwind { ; ; RV64I-LABEL: atomicrmw_max_i64_release: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: sd ra, 40(sp) -; RV64I-NEXT: sd s0, 32(sp) -; RV64I-NEXT: sd s1, 24(sp) -; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: sd ra, 24(sp) +; RV64I-NEXT: sd s0, 16(sp) +; RV64I-NEXT: sd s1, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: ld a3, 0(a0) ; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: addi s2, sp, 8 ; RV64I-NEXT: j .LBB202_2 ; RV64I-NEXT: .LBB202_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB202_2 Depth=1 -; RV64I-NEXT: sd a3, 8(sp) +; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: addi a3, zero, 3 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: mv a4, zero ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 8(sp) +; RV64I-NEXT: ld a3, 0(sp) ; RV64I-NEXT: bnez a0, .LBB202_4 ; RV64I-NEXT: .LBB202_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -15284,11 +14898,10 @@ define i64 @atomicrmw_max_i64_release(i64 *%a, i64 %b) nounwind { ; RV64I-NEXT: j .LBB202_1 ; RV64I-NEXT: .LBB202_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s2, 16(sp) -; RV64I-NEXT: ld s1, 24(sp) -; RV64I-NEXT: ld s0, 32(sp) -; RV64I-NEXT: ld ra, 40(sp) -; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: ld s1, 8(sp) +; RV64I-NEXT: ld s0, 16(sp) +; RV64I-NEXT: ld ra, 24(sp) +; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret ; ; RV64IA-LABEL: atomicrmw_max_i64_release: @@ -15307,25 +14920,23 @@ define i64 @atomicrmw_max_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) ; 
RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: lw a4, 0(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s3, sp ; RV32I-NEXT: j .LBB203_2 ; RV32I-NEXT: .LBB203_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB203_2 Depth=1 -; RV32I-NEXT: sw a4, 0(sp) -; RV32I-NEXT: sw a5, 4(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: addi a4, zero, 4 ; RV32I-NEXT: addi a5, zero, 2 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 4(sp) -; RV32I-NEXT: lw a4, 0(sp) +; RV32I-NEXT: lw a5, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) ; RV32I-NEXT: bnez a0, .LBB203_7 ; RV32I-NEXT: .LBB203_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -15334,7 +14945,7 @@ define i64 @atomicrmw_max_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: # in Loop: Header=BB203_2 Depth=1 ; RV32I-NEXT: slt a0, s1, a5 ; RV32I-NEXT: j .LBB203_5 -; RV32I-NEXT: .LBB203_4: +; RV32I-NEXT: .LBB203_4: # in Loop: Header=BB203_2 Depth=1 ; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB203_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB203_2 Depth=1 @@ -15349,7 +14960,6 @@ define i64 @atomicrmw_max_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: .LBB203_7: # %atomicrmw.end ; RV32I-NEXT: mv a0, a4 ; RV32I-NEXT: mv a1, a5 -; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -15364,25 +14974,23 @@ define i64 @atomicrmw_max_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) ; RV32IA-NEXT: sw s1, 20(sp) ; RV32IA-NEXT: sw s2, 16(sp) -; RV32IA-NEXT: sw s3, 12(sp) ; RV32IA-NEXT: mv s0, a0 ; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: lw a4, 0(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 -; RV32IA-NEXT: mv s3, sp ; RV32IA-NEXT: j .LBB203_2 ; RV32IA-NEXT: .LBB203_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB203_2 Depth=1 -; 
RV32IA-NEXT: sw a4, 0(sp) -; RV32IA-NEXT: sw a5, 4(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: addi a4, zero, 4 ; RV32IA-NEXT: addi a5, zero, 2 ; RV32IA-NEXT: mv a0, s0 -; RV32IA-NEXT: mv a1, s3 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 4(sp) -; RV32IA-NEXT: lw a4, 0(sp) +; RV32IA-NEXT: lw a5, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) ; RV32IA-NEXT: bnez a0, .LBB203_7 ; RV32IA-NEXT: .LBB203_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 @@ -15391,7 +14999,7 @@ define i64 @atomicrmw_max_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: # in Loop: Header=BB203_2 Depth=1 ; RV32IA-NEXT: slt a0, s1, a5 ; RV32IA-NEXT: j .LBB203_5 -; RV32IA-NEXT: .LBB203_4: +; RV32IA-NEXT: .LBB203_4: # in Loop: Header=BB203_2 Depth=1 ; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB203_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB203_2 Depth=1 @@ -15406,7 +15014,6 @@ define i64 @atomicrmw_max_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: .LBB203_7: # %atomicrmw.end ; RV32IA-NEXT: mv a0, a4 ; RV32IA-NEXT: mv a1, a5 -; RV32IA-NEXT: lw s3, 12(sp) ; RV32IA-NEXT: lw s2, 16(sp) ; RV32IA-NEXT: lw s1, 20(sp) ; RV32IA-NEXT: lw s0, 24(sp) @@ -15416,25 +15023,23 @@ define i64 @atomicrmw_max_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; ; RV64I-LABEL: atomicrmw_max_i64_acq_rel: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: sd ra, 40(sp) -; RV64I-NEXT: sd s0, 32(sp) -; RV64I-NEXT: sd s1, 24(sp) -; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: sd ra, 24(sp) +; RV64I-NEXT: sd s0, 16(sp) +; RV64I-NEXT: sd s1, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: ld a3, 0(a0) ; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: addi s2, sp, 8 ; RV64I-NEXT: j .LBB203_2 ; RV64I-NEXT: .LBB203_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB203_2 Depth=1 -; RV64I-NEXT: sd a3, 8(sp) +; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: mv a1, sp ; 
RV64I-NEXT: addi a3, zero, 4 ; RV64I-NEXT: addi a4, zero, 2 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 8(sp) +; RV64I-NEXT: ld a3, 0(sp) ; RV64I-NEXT: bnez a0, .LBB203_4 ; RV64I-NEXT: .LBB203_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -15446,11 +15051,10 @@ define i64 @atomicrmw_max_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; RV64I-NEXT: j .LBB203_1 ; RV64I-NEXT: .LBB203_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s2, 16(sp) -; RV64I-NEXT: ld s1, 24(sp) -; RV64I-NEXT: ld s0, 32(sp) -; RV64I-NEXT: ld ra, 40(sp) -; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: ld s1, 8(sp) +; RV64I-NEXT: ld s0, 16(sp) +; RV64I-NEXT: ld ra, 24(sp) +; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret ; ; RV64IA-LABEL: atomicrmw_max_i64_acq_rel: @@ -15469,25 +15073,23 @@ define i64 @atomicrmw_max_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: lw a4, 0(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s3, sp ; RV32I-NEXT: j .LBB204_2 ; RV32I-NEXT: .LBB204_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB204_2 Depth=1 -; RV32I-NEXT: sw a4, 0(sp) -; RV32I-NEXT: sw a5, 4(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: addi a4, zero, 5 ; RV32I-NEXT: addi a5, zero, 5 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 4(sp) -; RV32I-NEXT: lw a4, 0(sp) +; RV32I-NEXT: lw a5, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) ; RV32I-NEXT: bnez a0, .LBB204_7 ; RV32I-NEXT: .LBB204_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -15496,7 +15098,7 @@ define i64 @atomicrmw_max_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: # in Loop: Header=BB204_2 
Depth=1 ; RV32I-NEXT: slt a0, s1, a5 ; RV32I-NEXT: j .LBB204_5 -; RV32I-NEXT: .LBB204_4: +; RV32I-NEXT: .LBB204_4: # in Loop: Header=BB204_2 Depth=1 ; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB204_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB204_2 Depth=1 @@ -15511,7 +15113,6 @@ define i64 @atomicrmw_max_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: .LBB204_7: # %atomicrmw.end ; RV32I-NEXT: mv a0, a4 ; RV32I-NEXT: mv a1, a5 -; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -15526,25 +15127,23 @@ define i64 @atomicrmw_max_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) ; RV32IA-NEXT: sw s1, 20(sp) ; RV32IA-NEXT: sw s2, 16(sp) -; RV32IA-NEXT: sw s3, 12(sp) ; RV32IA-NEXT: mv s0, a0 ; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: lw a4, 0(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 -; RV32IA-NEXT: mv s3, sp ; RV32IA-NEXT: j .LBB204_2 ; RV32IA-NEXT: .LBB204_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB204_2 Depth=1 -; RV32IA-NEXT: sw a4, 0(sp) -; RV32IA-NEXT: sw a5, 4(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: addi a4, zero, 5 ; RV32IA-NEXT: addi a5, zero, 5 ; RV32IA-NEXT: mv a0, s0 -; RV32IA-NEXT: mv a1, s3 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 4(sp) -; RV32IA-NEXT: lw a4, 0(sp) +; RV32IA-NEXT: lw a5, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) ; RV32IA-NEXT: bnez a0, .LBB204_7 ; RV32IA-NEXT: .LBB204_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 @@ -15553,7 +15152,7 @@ define i64 @atomicrmw_max_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: # in Loop: Header=BB204_2 Depth=1 ; RV32IA-NEXT: slt a0, s1, a5 ; RV32IA-NEXT: j .LBB204_5 -; RV32IA-NEXT: .LBB204_4: +; RV32IA-NEXT: .LBB204_4: # in Loop: Header=BB204_2 Depth=1 ; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB204_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: 
Header=BB204_2 Depth=1 @@ -15568,7 +15167,6 @@ define i64 @atomicrmw_max_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: .LBB204_7: # %atomicrmw.end ; RV32IA-NEXT: mv a0, a4 ; RV32IA-NEXT: mv a1, a5 -; RV32IA-NEXT: lw s3, 12(sp) ; RV32IA-NEXT: lw s2, 16(sp) ; RV32IA-NEXT: lw s1, 20(sp) ; RV32IA-NEXT: lw s0, 24(sp) @@ -15578,25 +15176,23 @@ define i64 @atomicrmw_max_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; ; RV64I-LABEL: atomicrmw_max_i64_seq_cst: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: sd ra, 40(sp) -; RV64I-NEXT: sd s0, 32(sp) -; RV64I-NEXT: sd s1, 24(sp) -; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: sd ra, 24(sp) +; RV64I-NEXT: sd s0, 16(sp) +; RV64I-NEXT: sd s1, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: ld a3, 0(a0) ; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: addi s2, sp, 8 ; RV64I-NEXT: j .LBB204_2 ; RV64I-NEXT: .LBB204_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB204_2 Depth=1 -; RV64I-NEXT: sd a3, 8(sp) +; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: addi a3, zero, 5 ; RV64I-NEXT: addi a4, zero, 5 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 8(sp) +; RV64I-NEXT: ld a3, 0(sp) ; RV64I-NEXT: bnez a0, .LBB204_4 ; RV64I-NEXT: .LBB204_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -15608,11 +15204,10 @@ define i64 @atomicrmw_max_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; RV64I-NEXT: j .LBB204_1 ; RV64I-NEXT: .LBB204_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s2, 16(sp) -; RV64I-NEXT: ld s1, 24(sp) -; RV64I-NEXT: ld s0, 32(sp) -; RV64I-NEXT: ld ra, 40(sp) -; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: ld s1, 8(sp) +; RV64I-NEXT: ld s0, 16(sp) +; RV64I-NEXT: ld ra, 24(sp) +; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret ; ; RV64IA-LABEL: atomicrmw_max_i64_seq_cst: @@ -15631,25 +15226,23 @@ define i64 @atomicrmw_min_i64_monotonic(i64 *%a, i64 %b) nounwind { ; 
RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: lw a4, 0(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s3, sp ; RV32I-NEXT: j .LBB205_2 ; RV32I-NEXT: .LBB205_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB205_2 Depth=1 -; RV32I-NEXT: sw a4, 0(sp) -; RV32I-NEXT: sw a5, 4(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: mv a5, zero ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 4(sp) -; RV32I-NEXT: lw a4, 0(sp) +; RV32I-NEXT: lw a5, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) ; RV32I-NEXT: bnez a0, .LBB205_7 ; RV32I-NEXT: .LBB205_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -15658,7 +15251,7 @@ define i64 @atomicrmw_min_i64_monotonic(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: # in Loop: Header=BB205_2 Depth=1 ; RV32I-NEXT: slt a0, s1, a5 ; RV32I-NEXT: j .LBB205_5 -; RV32I-NEXT: .LBB205_4: +; RV32I-NEXT: .LBB205_4: # in Loop: Header=BB205_2 Depth=1 ; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB205_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB205_2 Depth=1 @@ -15674,7 +15267,6 @@ define i64 @atomicrmw_min_i64_monotonic(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: .LBB205_7: # %atomicrmw.end ; RV32I-NEXT: mv a0, a4 ; RV32I-NEXT: mv a1, a5 -; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -15689,25 +15281,23 @@ define i64 @atomicrmw_min_i64_monotonic(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) ; RV32IA-NEXT: sw s1, 20(sp) ; RV32IA-NEXT: sw s2, 16(sp) -; RV32IA-NEXT: sw s3, 12(sp) ; RV32IA-NEXT: mv s0, a0 ; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: lw a4, 0(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 -; RV32IA-NEXT: mv s3, sp ; RV32IA-NEXT: j 
.LBB205_2 ; RV32IA-NEXT: .LBB205_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB205_2 Depth=1 -; RV32IA-NEXT: sw a4, 0(sp) -; RV32IA-NEXT: sw a5, 4(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: mv a0, s0 -; RV32IA-NEXT: mv a1, s3 ; RV32IA-NEXT: mv a4, zero ; RV32IA-NEXT: mv a5, zero ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 4(sp) -; RV32IA-NEXT: lw a4, 0(sp) +; RV32IA-NEXT: lw a5, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) ; RV32IA-NEXT: bnez a0, .LBB205_7 ; RV32IA-NEXT: .LBB205_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 @@ -15716,7 +15306,7 @@ define i64 @atomicrmw_min_i64_monotonic(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: # in Loop: Header=BB205_2 Depth=1 ; RV32IA-NEXT: slt a0, s1, a5 ; RV32IA-NEXT: j .LBB205_5 -; RV32IA-NEXT: .LBB205_4: +; RV32IA-NEXT: .LBB205_4: # in Loop: Header=BB205_2 Depth=1 ; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB205_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB205_2 Depth=1 @@ -15732,7 +15322,6 @@ define i64 @atomicrmw_min_i64_monotonic(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: .LBB205_7: # %atomicrmw.end ; RV32IA-NEXT: mv a0, a4 ; RV32IA-NEXT: mv a1, a5 -; RV32IA-NEXT: lw s3, 12(sp) ; RV32IA-NEXT: lw s2, 16(sp) ; RV32IA-NEXT: lw s1, 20(sp) ; RV32IA-NEXT: lw s0, 24(sp) @@ -15742,25 +15331,23 @@ define i64 @atomicrmw_min_i64_monotonic(i64 *%a, i64 %b) nounwind { ; ; RV64I-LABEL: atomicrmw_min_i64_monotonic: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: sd ra, 40(sp) -; RV64I-NEXT: sd s0, 32(sp) -; RV64I-NEXT: sd s1, 24(sp) -; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: sd ra, 24(sp) +; RV64I-NEXT: sd s0, 16(sp) +; RV64I-NEXT: sd s1, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: ld a3, 0(a0) ; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: addi s2, sp, 8 ; RV64I-NEXT: j .LBB205_2 ; RV64I-NEXT: .LBB205_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: 
Header=BB205_2 Depth=1 -; RV64I-NEXT: sd a3, 8(sp) +; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: mv a3, zero ; RV64I-NEXT: mv a4, zero ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 8(sp) +; RV64I-NEXT: ld a3, 0(sp) ; RV64I-NEXT: bnez a0, .LBB205_4 ; RV64I-NEXT: .LBB205_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -15772,11 +15359,10 @@ define i64 @atomicrmw_min_i64_monotonic(i64 *%a, i64 %b) nounwind { ; RV64I-NEXT: j .LBB205_1 ; RV64I-NEXT: .LBB205_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s2, 16(sp) -; RV64I-NEXT: ld s1, 24(sp) -; RV64I-NEXT: ld s0, 32(sp) -; RV64I-NEXT: ld ra, 40(sp) -; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: ld s1, 8(sp) +; RV64I-NEXT: ld s0, 16(sp) +; RV64I-NEXT: ld ra, 24(sp) +; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret ; ; RV64IA-LABEL: atomicrmw_min_i64_monotonic: @@ -15795,25 +15381,23 @@ define i64 @atomicrmw_min_i64_acquire(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: lw a4, 0(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s3, sp ; RV32I-NEXT: j .LBB206_2 ; RV32I-NEXT: .LBB206_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB206_2 Depth=1 -; RV32I-NEXT: sw a4, 0(sp) -; RV32I-NEXT: sw a5, 4(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: addi a4, zero, 2 ; RV32I-NEXT: addi a5, zero, 2 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 4(sp) -; RV32I-NEXT: lw a4, 0(sp) +; RV32I-NEXT: lw a5, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) ; RV32I-NEXT: bnez a0, .LBB206_7 ; RV32I-NEXT: .LBB206_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -15822,7 +15406,7 @@ define i64 
@atomicrmw_min_i64_acquire(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: # in Loop: Header=BB206_2 Depth=1 ; RV32I-NEXT: slt a0, s1, a5 ; RV32I-NEXT: j .LBB206_5 -; RV32I-NEXT: .LBB206_4: +; RV32I-NEXT: .LBB206_4: # in Loop: Header=BB206_2 Depth=1 ; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB206_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB206_2 Depth=1 @@ -15838,7 +15422,6 @@ define i64 @atomicrmw_min_i64_acquire(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: .LBB206_7: # %atomicrmw.end ; RV32I-NEXT: mv a0, a4 ; RV32I-NEXT: mv a1, a5 -; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -15853,25 +15436,23 @@ define i64 @atomicrmw_min_i64_acquire(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) ; RV32IA-NEXT: sw s1, 20(sp) ; RV32IA-NEXT: sw s2, 16(sp) -; RV32IA-NEXT: sw s3, 12(sp) ; RV32IA-NEXT: mv s0, a0 ; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: lw a4, 0(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 -; RV32IA-NEXT: mv s3, sp ; RV32IA-NEXT: j .LBB206_2 ; RV32IA-NEXT: .LBB206_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB206_2 Depth=1 -; RV32IA-NEXT: sw a4, 0(sp) -; RV32IA-NEXT: sw a5, 4(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: addi a4, zero, 2 ; RV32IA-NEXT: addi a5, zero, 2 ; RV32IA-NEXT: mv a0, s0 -; RV32IA-NEXT: mv a1, s3 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 4(sp) -; RV32IA-NEXT: lw a4, 0(sp) +; RV32IA-NEXT: lw a5, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) ; RV32IA-NEXT: bnez a0, .LBB206_7 ; RV32IA-NEXT: .LBB206_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 @@ -15880,7 +15461,7 @@ define i64 @atomicrmw_min_i64_acquire(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: # in Loop: Header=BB206_2 Depth=1 ; RV32IA-NEXT: slt a0, s1, a5 ; RV32IA-NEXT: j .LBB206_5 -; RV32IA-NEXT: .LBB206_4: +; RV32IA-NEXT: .LBB206_4: # in Loop: Header=BB206_2 Depth=1 ; 
RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB206_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB206_2 Depth=1 @@ -15896,7 +15477,6 @@ define i64 @atomicrmw_min_i64_acquire(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: .LBB206_7: # %atomicrmw.end ; RV32IA-NEXT: mv a0, a4 ; RV32IA-NEXT: mv a1, a5 -; RV32IA-NEXT: lw s3, 12(sp) ; RV32IA-NEXT: lw s2, 16(sp) ; RV32IA-NEXT: lw s1, 20(sp) ; RV32IA-NEXT: lw s0, 24(sp) @@ -15906,25 +15486,23 @@ define i64 @atomicrmw_min_i64_acquire(i64 *%a, i64 %b) nounwind { ; ; RV64I-LABEL: atomicrmw_min_i64_acquire: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: sd ra, 40(sp) -; RV64I-NEXT: sd s0, 32(sp) -; RV64I-NEXT: sd s1, 24(sp) -; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: sd ra, 24(sp) +; RV64I-NEXT: sd s0, 16(sp) +; RV64I-NEXT: sd s1, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: ld a3, 0(a0) ; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: addi s2, sp, 8 ; RV64I-NEXT: j .LBB206_2 ; RV64I-NEXT: .LBB206_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB206_2 Depth=1 -; RV64I-NEXT: sd a3, 8(sp) +; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: addi a3, zero, 2 ; RV64I-NEXT: addi a4, zero, 2 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 8(sp) +; RV64I-NEXT: ld a3, 0(sp) ; RV64I-NEXT: bnez a0, .LBB206_4 ; RV64I-NEXT: .LBB206_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -15936,11 +15514,10 @@ define i64 @atomicrmw_min_i64_acquire(i64 *%a, i64 %b) nounwind { ; RV64I-NEXT: j .LBB206_1 ; RV64I-NEXT: .LBB206_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s2, 16(sp) -; RV64I-NEXT: ld s1, 24(sp) -; RV64I-NEXT: ld s0, 32(sp) -; RV64I-NEXT: ld ra, 40(sp) -; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: ld s1, 8(sp) +; RV64I-NEXT: ld s0, 16(sp) +; RV64I-NEXT: ld ra, 24(sp) +; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret ; ; RV64IA-LABEL: atomicrmw_min_i64_acquire: 
@@ -15959,25 +15536,23 @@ define i64 @atomicrmw_min_i64_release(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: lw a4, 0(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s3, sp ; RV32I-NEXT: j .LBB207_2 ; RV32I-NEXT: .LBB207_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB207_2 Depth=1 -; RV32I-NEXT: sw a4, 0(sp) -; RV32I-NEXT: sw a5, 4(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: addi a4, zero, 3 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: mv a5, zero ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 4(sp) -; RV32I-NEXT: lw a4, 0(sp) +; RV32I-NEXT: lw a5, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) ; RV32I-NEXT: bnez a0, .LBB207_7 ; RV32I-NEXT: .LBB207_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -15986,7 +15561,7 @@ define i64 @atomicrmw_min_i64_release(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: # in Loop: Header=BB207_2 Depth=1 ; RV32I-NEXT: slt a0, s1, a5 ; RV32I-NEXT: j .LBB207_5 -; RV32I-NEXT: .LBB207_4: +; RV32I-NEXT: .LBB207_4: # in Loop: Header=BB207_2 Depth=1 ; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB207_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB207_2 Depth=1 @@ -16002,7 +15577,6 @@ define i64 @atomicrmw_min_i64_release(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: .LBB207_7: # %atomicrmw.end ; RV32I-NEXT: mv a0, a4 ; RV32I-NEXT: mv a1, a5 -; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -16017,25 +15591,23 @@ define i64 @atomicrmw_min_i64_release(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) ; RV32IA-NEXT: sw s1, 20(sp) ; RV32IA-NEXT: sw s2, 16(sp) -; RV32IA-NEXT: sw s3, 12(sp) ; RV32IA-NEXT: mv s0, a0 ; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: lw a4, 0(a0) ; 
RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 -; RV32IA-NEXT: mv s3, sp ; RV32IA-NEXT: j .LBB207_2 ; RV32IA-NEXT: .LBB207_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB207_2 Depth=1 -; RV32IA-NEXT: sw a4, 0(sp) -; RV32IA-NEXT: sw a5, 4(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: addi a4, zero, 3 ; RV32IA-NEXT: mv a0, s0 -; RV32IA-NEXT: mv a1, s3 ; RV32IA-NEXT: mv a5, zero ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 4(sp) -; RV32IA-NEXT: lw a4, 0(sp) +; RV32IA-NEXT: lw a5, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) ; RV32IA-NEXT: bnez a0, .LBB207_7 ; RV32IA-NEXT: .LBB207_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 @@ -16044,7 +15616,7 @@ define i64 @atomicrmw_min_i64_release(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: # in Loop: Header=BB207_2 Depth=1 ; RV32IA-NEXT: slt a0, s1, a5 ; RV32IA-NEXT: j .LBB207_5 -; RV32IA-NEXT: .LBB207_4: +; RV32IA-NEXT: .LBB207_4: # in Loop: Header=BB207_2 Depth=1 ; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB207_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB207_2 Depth=1 @@ -16060,7 +15632,6 @@ define i64 @atomicrmw_min_i64_release(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: .LBB207_7: # %atomicrmw.end ; RV32IA-NEXT: mv a0, a4 ; RV32IA-NEXT: mv a1, a5 -; RV32IA-NEXT: lw s3, 12(sp) ; RV32IA-NEXT: lw s2, 16(sp) ; RV32IA-NEXT: lw s1, 20(sp) ; RV32IA-NEXT: lw s0, 24(sp) @@ -16070,25 +15641,23 @@ define i64 @atomicrmw_min_i64_release(i64 *%a, i64 %b) nounwind { ; ; RV64I-LABEL: atomicrmw_min_i64_release: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: sd ra, 40(sp) -; RV64I-NEXT: sd s0, 32(sp) -; RV64I-NEXT: sd s1, 24(sp) -; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: sd ra, 24(sp) +; RV64I-NEXT: sd s0, 16(sp) +; RV64I-NEXT: sd s1, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: ld a3, 0(a0) ; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: addi s2, sp, 8 ; RV64I-NEXT: j 
.LBB207_2 ; RV64I-NEXT: .LBB207_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB207_2 Depth=1 -; RV64I-NEXT: sd a3, 8(sp) +; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: addi a3, zero, 3 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: mv a4, zero ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 8(sp) +; RV64I-NEXT: ld a3, 0(sp) ; RV64I-NEXT: bnez a0, .LBB207_4 ; RV64I-NEXT: .LBB207_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -16100,11 +15669,10 @@ define i64 @atomicrmw_min_i64_release(i64 *%a, i64 %b) nounwind { ; RV64I-NEXT: j .LBB207_1 ; RV64I-NEXT: .LBB207_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s2, 16(sp) -; RV64I-NEXT: ld s1, 24(sp) -; RV64I-NEXT: ld s0, 32(sp) -; RV64I-NEXT: ld ra, 40(sp) -; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: ld s1, 8(sp) +; RV64I-NEXT: ld s0, 16(sp) +; RV64I-NEXT: ld ra, 24(sp) +; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret ; ; RV64IA-LABEL: atomicrmw_min_i64_release: @@ -16123,25 +15691,23 @@ define i64 @atomicrmw_min_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: lw a4, 0(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s3, sp ; RV32I-NEXT: j .LBB208_2 ; RV32I-NEXT: .LBB208_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB208_2 Depth=1 -; RV32I-NEXT: sw a4, 0(sp) -; RV32I-NEXT: sw a5, 4(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: addi a4, zero, 4 ; RV32I-NEXT: addi a5, zero, 2 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 4(sp) -; RV32I-NEXT: lw a4, 0(sp) +; RV32I-NEXT: lw a5, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) ; RV32I-NEXT: bnez a0, .LBB208_7 ; RV32I-NEXT: .LBB208_2: # %atomicrmw.start ; 
RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -16150,7 +15716,7 @@ define i64 @atomicrmw_min_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: # in Loop: Header=BB208_2 Depth=1 ; RV32I-NEXT: slt a0, s1, a5 ; RV32I-NEXT: j .LBB208_5 -; RV32I-NEXT: .LBB208_4: +; RV32I-NEXT: .LBB208_4: # in Loop: Header=BB208_2 Depth=1 ; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB208_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB208_2 Depth=1 @@ -16166,7 +15732,6 @@ define i64 @atomicrmw_min_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: .LBB208_7: # %atomicrmw.end ; RV32I-NEXT: mv a0, a4 ; RV32I-NEXT: mv a1, a5 -; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -16181,25 +15746,23 @@ define i64 @atomicrmw_min_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) ; RV32IA-NEXT: sw s1, 20(sp) ; RV32IA-NEXT: sw s2, 16(sp) -; RV32IA-NEXT: sw s3, 12(sp) ; RV32IA-NEXT: mv s0, a0 ; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: lw a4, 0(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 -; RV32IA-NEXT: mv s3, sp ; RV32IA-NEXT: j .LBB208_2 ; RV32IA-NEXT: .LBB208_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB208_2 Depth=1 -; RV32IA-NEXT: sw a4, 0(sp) -; RV32IA-NEXT: sw a5, 4(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: addi a4, zero, 4 ; RV32IA-NEXT: addi a5, zero, 2 ; RV32IA-NEXT: mv a0, s0 -; RV32IA-NEXT: mv a1, s3 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 4(sp) -; RV32IA-NEXT: lw a4, 0(sp) +; RV32IA-NEXT: lw a5, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) ; RV32IA-NEXT: bnez a0, .LBB208_7 ; RV32IA-NEXT: .LBB208_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 @@ -16208,7 +15771,7 @@ define i64 @atomicrmw_min_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: # in Loop: Header=BB208_2 Depth=1 ; RV32IA-NEXT: slt a0, s1, a5 ; RV32IA-NEXT: j .LBB208_5 -; RV32IA-NEXT: 
.LBB208_4: +; RV32IA-NEXT: .LBB208_4: # in Loop: Header=BB208_2 Depth=1 ; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB208_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB208_2 Depth=1 @@ -16224,7 +15787,6 @@ define i64 @atomicrmw_min_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: .LBB208_7: # %atomicrmw.end ; RV32IA-NEXT: mv a0, a4 ; RV32IA-NEXT: mv a1, a5 -; RV32IA-NEXT: lw s3, 12(sp) ; RV32IA-NEXT: lw s2, 16(sp) ; RV32IA-NEXT: lw s1, 20(sp) ; RV32IA-NEXT: lw s0, 24(sp) @@ -16234,25 +15796,23 @@ define i64 @atomicrmw_min_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; ; RV64I-LABEL: atomicrmw_min_i64_acq_rel: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: sd ra, 40(sp) -; RV64I-NEXT: sd s0, 32(sp) -; RV64I-NEXT: sd s1, 24(sp) -; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: sd ra, 24(sp) +; RV64I-NEXT: sd s0, 16(sp) +; RV64I-NEXT: sd s1, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: ld a3, 0(a0) ; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: addi s2, sp, 8 ; RV64I-NEXT: j .LBB208_2 ; RV64I-NEXT: .LBB208_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB208_2 Depth=1 -; RV64I-NEXT: sd a3, 8(sp) +; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: addi a3, zero, 4 ; RV64I-NEXT: addi a4, zero, 2 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 8(sp) +; RV64I-NEXT: ld a3, 0(sp) ; RV64I-NEXT: bnez a0, .LBB208_4 ; RV64I-NEXT: .LBB208_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -16264,11 +15824,10 @@ define i64 @atomicrmw_min_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; RV64I-NEXT: j .LBB208_1 ; RV64I-NEXT: .LBB208_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s2, 16(sp) -; RV64I-NEXT: ld s1, 24(sp) -; RV64I-NEXT: ld s0, 32(sp) -; RV64I-NEXT: ld ra, 40(sp) -; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: ld s1, 8(sp) +; RV64I-NEXT: ld s0, 16(sp) +; RV64I-NEXT: ld ra, 24(sp) +; RV64I-NEXT: addi 
sp, sp, 32 ; RV64I-NEXT: ret ; ; RV64IA-LABEL: atomicrmw_min_i64_acq_rel: @@ -16287,25 +15846,23 @@ define i64 @atomicrmw_min_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: lw a4, 0(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s3, sp ; RV32I-NEXT: j .LBB209_2 ; RV32I-NEXT: .LBB209_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB209_2 Depth=1 -; RV32I-NEXT: sw a4, 0(sp) -; RV32I-NEXT: sw a5, 4(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: addi a4, zero, 5 ; RV32I-NEXT: addi a5, zero, 5 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 4(sp) -; RV32I-NEXT: lw a4, 0(sp) +; RV32I-NEXT: lw a5, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) ; RV32I-NEXT: bnez a0, .LBB209_7 ; RV32I-NEXT: .LBB209_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -16314,7 +15871,7 @@ define i64 @atomicrmw_min_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: # in Loop: Header=BB209_2 Depth=1 ; RV32I-NEXT: slt a0, s1, a5 ; RV32I-NEXT: j .LBB209_5 -; RV32I-NEXT: .LBB209_4: +; RV32I-NEXT: .LBB209_4: # in Loop: Header=BB209_2 Depth=1 ; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB209_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB209_2 Depth=1 @@ -16330,7 +15887,6 @@ define i64 @atomicrmw_min_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: .LBB209_7: # %atomicrmw.end ; RV32I-NEXT: mv a0, a4 ; RV32I-NEXT: mv a1, a5 -; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -16345,25 +15901,23 @@ define i64 @atomicrmw_min_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) ; RV32IA-NEXT: sw s1, 20(sp) ; RV32IA-NEXT: sw s2, 16(sp) -; RV32IA-NEXT: sw s3, 12(sp) ; 
RV32IA-NEXT: mv s0, a0 ; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: lw a4, 0(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 -; RV32IA-NEXT: mv s3, sp ; RV32IA-NEXT: j .LBB209_2 ; RV32IA-NEXT: .LBB209_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB209_2 Depth=1 -; RV32IA-NEXT: sw a4, 0(sp) -; RV32IA-NEXT: sw a5, 4(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: addi a4, zero, 5 ; RV32IA-NEXT: addi a5, zero, 5 ; RV32IA-NEXT: mv a0, s0 -; RV32IA-NEXT: mv a1, s3 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 4(sp) -; RV32IA-NEXT: lw a4, 0(sp) +; RV32IA-NEXT: lw a5, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) ; RV32IA-NEXT: bnez a0, .LBB209_7 ; RV32IA-NEXT: .LBB209_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 @@ -16372,7 +15926,7 @@ define i64 @atomicrmw_min_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: # in Loop: Header=BB209_2 Depth=1 ; RV32IA-NEXT: slt a0, s1, a5 ; RV32IA-NEXT: j .LBB209_5 -; RV32IA-NEXT: .LBB209_4: +; RV32IA-NEXT: .LBB209_4: # in Loop: Header=BB209_2 Depth=1 ; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB209_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB209_2 Depth=1 @@ -16388,7 +15942,6 @@ define i64 @atomicrmw_min_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: .LBB209_7: # %atomicrmw.end ; RV32IA-NEXT: mv a0, a4 ; RV32IA-NEXT: mv a1, a5 -; RV32IA-NEXT: lw s3, 12(sp) ; RV32IA-NEXT: lw s2, 16(sp) ; RV32IA-NEXT: lw s1, 20(sp) ; RV32IA-NEXT: lw s0, 24(sp) @@ -16398,25 +15951,23 @@ define i64 @atomicrmw_min_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; ; RV64I-LABEL: atomicrmw_min_i64_seq_cst: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: sd ra, 40(sp) -; RV64I-NEXT: sd s0, 32(sp) -; RV64I-NEXT: sd s1, 24(sp) -; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: sd ra, 24(sp) +; RV64I-NEXT: sd s0, 16(sp) +; RV64I-NEXT: sd s1, 8(sp) ; RV64I-NEXT: mv s0, a0 ; 
RV64I-NEXT: ld a3, 0(a0) ; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: addi s2, sp, 8 ; RV64I-NEXT: j .LBB209_2 ; RV64I-NEXT: .LBB209_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB209_2 Depth=1 -; RV64I-NEXT: sd a3, 8(sp) +; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: addi a3, zero, 5 ; RV64I-NEXT: addi a4, zero, 5 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 8(sp) +; RV64I-NEXT: ld a3, 0(sp) ; RV64I-NEXT: bnez a0, .LBB209_4 ; RV64I-NEXT: .LBB209_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -16428,11 +15979,10 @@ define i64 @atomicrmw_min_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; RV64I-NEXT: j .LBB209_1 ; RV64I-NEXT: .LBB209_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s2, 16(sp) -; RV64I-NEXT: ld s1, 24(sp) -; RV64I-NEXT: ld s0, 32(sp) -; RV64I-NEXT: ld ra, 40(sp) -; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: ld s1, 8(sp) +; RV64I-NEXT: ld s0, 16(sp) +; RV64I-NEXT: ld ra, 24(sp) +; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret ; ; RV64IA-LABEL: atomicrmw_min_i64_seq_cst: @@ -16451,25 +16001,23 @@ define i64 @atomicrmw_umax_i64_monotonic(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: lw a4, 0(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s3, sp ; RV32I-NEXT: j .LBB210_2 ; RV32I-NEXT: .LBB210_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB210_2 Depth=1 -; RV32I-NEXT: sw a4, 0(sp) -; RV32I-NEXT: sw a5, 4(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: mv a5, zero ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 4(sp) -; RV32I-NEXT: lw a4, 0(sp) +; RV32I-NEXT: lw a5, 12(sp) +; RV32I-NEXT: 
lw a4, 8(sp) ; RV32I-NEXT: bnez a0, .LBB210_7 ; RV32I-NEXT: .LBB210_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -16478,7 +16026,7 @@ define i64 @atomicrmw_umax_i64_monotonic(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: # in Loop: Header=BB210_2 Depth=1 ; RV32I-NEXT: sltu a0, s1, a5 ; RV32I-NEXT: j .LBB210_5 -; RV32I-NEXT: .LBB210_4: +; RV32I-NEXT: .LBB210_4: # in Loop: Header=BB210_2 Depth=1 ; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB210_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB210_2 Depth=1 @@ -16493,7 +16041,6 @@ define i64 @atomicrmw_umax_i64_monotonic(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: .LBB210_7: # %atomicrmw.end ; RV32I-NEXT: mv a0, a4 ; RV32I-NEXT: mv a1, a5 -; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -16508,25 +16055,23 @@ define i64 @atomicrmw_umax_i64_monotonic(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) ; RV32IA-NEXT: sw s1, 20(sp) ; RV32IA-NEXT: sw s2, 16(sp) -; RV32IA-NEXT: sw s3, 12(sp) ; RV32IA-NEXT: mv s0, a0 ; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: lw a4, 0(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 -; RV32IA-NEXT: mv s3, sp ; RV32IA-NEXT: j .LBB210_2 ; RV32IA-NEXT: .LBB210_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB210_2 Depth=1 -; RV32IA-NEXT: sw a4, 0(sp) -; RV32IA-NEXT: sw a5, 4(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: mv a0, s0 -; RV32IA-NEXT: mv a1, s3 ; RV32IA-NEXT: mv a4, zero ; RV32IA-NEXT: mv a5, zero ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 4(sp) -; RV32IA-NEXT: lw a4, 0(sp) +; RV32IA-NEXT: lw a5, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) ; RV32IA-NEXT: bnez a0, .LBB210_7 ; RV32IA-NEXT: .LBB210_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 @@ -16535,7 +16080,7 @@ define i64 @atomicrmw_umax_i64_monotonic(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: # in Loop: 
Header=BB210_2 Depth=1 ; RV32IA-NEXT: sltu a0, s1, a5 ; RV32IA-NEXT: j .LBB210_5 -; RV32IA-NEXT: .LBB210_4: +; RV32IA-NEXT: .LBB210_4: # in Loop: Header=BB210_2 Depth=1 ; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB210_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB210_2 Depth=1 @@ -16550,7 +16095,6 @@ define i64 @atomicrmw_umax_i64_monotonic(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: .LBB210_7: # %atomicrmw.end ; RV32IA-NEXT: mv a0, a4 ; RV32IA-NEXT: mv a1, a5 -; RV32IA-NEXT: lw s3, 12(sp) ; RV32IA-NEXT: lw s2, 16(sp) ; RV32IA-NEXT: lw s1, 20(sp) ; RV32IA-NEXT: lw s0, 24(sp) @@ -16560,25 +16104,23 @@ define i64 @atomicrmw_umax_i64_monotonic(i64 *%a, i64 %b) nounwind { ; ; RV64I-LABEL: atomicrmw_umax_i64_monotonic: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: sd ra, 40(sp) -; RV64I-NEXT: sd s0, 32(sp) -; RV64I-NEXT: sd s1, 24(sp) -; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: sd ra, 24(sp) +; RV64I-NEXT: sd s0, 16(sp) +; RV64I-NEXT: sd s1, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: ld a3, 0(a0) ; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: addi s2, sp, 8 ; RV64I-NEXT: j .LBB210_2 ; RV64I-NEXT: .LBB210_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB210_2 Depth=1 -; RV64I-NEXT: sd a3, 8(sp) +; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: mv a3, zero ; RV64I-NEXT: mv a4, zero ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 8(sp) +; RV64I-NEXT: ld a3, 0(sp) ; RV64I-NEXT: bnez a0, .LBB210_4 ; RV64I-NEXT: .LBB210_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -16590,11 +16132,10 @@ define i64 @atomicrmw_umax_i64_monotonic(i64 *%a, i64 %b) nounwind { ; RV64I-NEXT: j .LBB210_1 ; RV64I-NEXT: .LBB210_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s2, 16(sp) -; RV64I-NEXT: ld s1, 24(sp) -; RV64I-NEXT: ld s0, 32(sp) -; RV64I-NEXT: ld ra, 40(sp) -; RV64I-NEXT: addi sp, sp, 48 +; 
RV64I-NEXT: ld s1, 8(sp) +; RV64I-NEXT: ld s0, 16(sp) +; RV64I-NEXT: ld ra, 24(sp) +; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret ; ; RV64IA-LABEL: atomicrmw_umax_i64_monotonic: @@ -16613,25 +16154,23 @@ define i64 @atomicrmw_umax_i64_acquire(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: lw a4, 0(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s3, sp ; RV32I-NEXT: j .LBB211_2 ; RV32I-NEXT: .LBB211_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB211_2 Depth=1 -; RV32I-NEXT: sw a4, 0(sp) -; RV32I-NEXT: sw a5, 4(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: addi a4, zero, 2 ; RV32I-NEXT: addi a5, zero, 2 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 4(sp) -; RV32I-NEXT: lw a4, 0(sp) +; RV32I-NEXT: lw a5, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) ; RV32I-NEXT: bnez a0, .LBB211_7 ; RV32I-NEXT: .LBB211_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -16640,7 +16179,7 @@ define i64 @atomicrmw_umax_i64_acquire(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: # in Loop: Header=BB211_2 Depth=1 ; RV32I-NEXT: sltu a0, s1, a5 ; RV32I-NEXT: j .LBB211_5 -; RV32I-NEXT: .LBB211_4: +; RV32I-NEXT: .LBB211_4: # in Loop: Header=BB211_2 Depth=1 ; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB211_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB211_2 Depth=1 @@ -16655,7 +16194,6 @@ define i64 @atomicrmw_umax_i64_acquire(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: .LBB211_7: # %atomicrmw.end ; RV32I-NEXT: mv a0, a4 ; RV32I-NEXT: mv a1, a5 -; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -16670,25 +16208,23 @@ define i64 @atomicrmw_umax_i64_acquire(i64 *%a, i64 %b) nounwind { ; 
RV32IA-NEXT: sw s0, 24(sp) ; RV32IA-NEXT: sw s1, 20(sp) ; RV32IA-NEXT: sw s2, 16(sp) -; RV32IA-NEXT: sw s3, 12(sp) ; RV32IA-NEXT: mv s0, a0 ; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: lw a4, 0(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 -; RV32IA-NEXT: mv s3, sp ; RV32IA-NEXT: j .LBB211_2 ; RV32IA-NEXT: .LBB211_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB211_2 Depth=1 -; RV32IA-NEXT: sw a4, 0(sp) -; RV32IA-NEXT: sw a5, 4(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: addi a4, zero, 2 ; RV32IA-NEXT: addi a5, zero, 2 ; RV32IA-NEXT: mv a0, s0 -; RV32IA-NEXT: mv a1, s3 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 4(sp) -; RV32IA-NEXT: lw a4, 0(sp) +; RV32IA-NEXT: lw a5, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) ; RV32IA-NEXT: bnez a0, .LBB211_7 ; RV32IA-NEXT: .LBB211_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 @@ -16697,7 +16233,7 @@ define i64 @atomicrmw_umax_i64_acquire(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: # in Loop: Header=BB211_2 Depth=1 ; RV32IA-NEXT: sltu a0, s1, a5 ; RV32IA-NEXT: j .LBB211_5 -; RV32IA-NEXT: .LBB211_4: +; RV32IA-NEXT: .LBB211_4: # in Loop: Header=BB211_2 Depth=1 ; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB211_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB211_2 Depth=1 @@ -16712,7 +16248,6 @@ define i64 @atomicrmw_umax_i64_acquire(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: .LBB211_7: # %atomicrmw.end ; RV32IA-NEXT: mv a0, a4 ; RV32IA-NEXT: mv a1, a5 -; RV32IA-NEXT: lw s3, 12(sp) ; RV32IA-NEXT: lw s2, 16(sp) ; RV32IA-NEXT: lw s1, 20(sp) ; RV32IA-NEXT: lw s0, 24(sp) @@ -16722,25 +16257,23 @@ define i64 @atomicrmw_umax_i64_acquire(i64 *%a, i64 %b) nounwind { ; ; RV64I-LABEL: atomicrmw_umax_i64_acquire: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: sd ra, 40(sp) -; RV64I-NEXT: sd s0, 32(sp) -; RV64I-NEXT: sd s1, 24(sp) -; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: addi sp, sp, -32 
+; RV64I-NEXT: sd ra, 24(sp) +; RV64I-NEXT: sd s0, 16(sp) +; RV64I-NEXT: sd s1, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: ld a3, 0(a0) ; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: addi s2, sp, 8 ; RV64I-NEXT: j .LBB211_2 ; RV64I-NEXT: .LBB211_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB211_2 Depth=1 -; RV64I-NEXT: sd a3, 8(sp) +; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: addi a3, zero, 2 ; RV64I-NEXT: addi a4, zero, 2 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 8(sp) +; RV64I-NEXT: ld a3, 0(sp) ; RV64I-NEXT: bnez a0, .LBB211_4 ; RV64I-NEXT: .LBB211_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -16752,11 +16285,10 @@ define i64 @atomicrmw_umax_i64_acquire(i64 *%a, i64 %b) nounwind { ; RV64I-NEXT: j .LBB211_1 ; RV64I-NEXT: .LBB211_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s2, 16(sp) -; RV64I-NEXT: ld s1, 24(sp) -; RV64I-NEXT: ld s0, 32(sp) -; RV64I-NEXT: ld ra, 40(sp) -; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: ld s1, 8(sp) +; RV64I-NEXT: ld s0, 16(sp) +; RV64I-NEXT: ld ra, 24(sp) +; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret ; ; RV64IA-LABEL: atomicrmw_umax_i64_acquire: @@ -16775,25 +16307,23 @@ define i64 @atomicrmw_umax_i64_release(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: lw a4, 0(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s3, sp ; RV32I-NEXT: j .LBB212_2 ; RV32I-NEXT: .LBB212_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB212_2 Depth=1 -; RV32I-NEXT: sw a4, 0(sp) -; RV32I-NEXT: sw a5, 4(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: addi a4, zero, 3 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: mv a5, zero ; RV32I-NEXT: call 
__atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 4(sp) -; RV32I-NEXT: lw a4, 0(sp) +; RV32I-NEXT: lw a5, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) ; RV32I-NEXT: bnez a0, .LBB212_7 ; RV32I-NEXT: .LBB212_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -16802,7 +16332,7 @@ define i64 @atomicrmw_umax_i64_release(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: # in Loop: Header=BB212_2 Depth=1 ; RV32I-NEXT: sltu a0, s1, a5 ; RV32I-NEXT: j .LBB212_5 -; RV32I-NEXT: .LBB212_4: +; RV32I-NEXT: .LBB212_4: # in Loop: Header=BB212_2 Depth=1 ; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB212_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB212_2 Depth=1 @@ -16817,7 +16347,6 @@ define i64 @atomicrmw_umax_i64_release(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: .LBB212_7: # %atomicrmw.end ; RV32I-NEXT: mv a0, a4 ; RV32I-NEXT: mv a1, a5 -; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -16832,25 +16361,23 @@ define i64 @atomicrmw_umax_i64_release(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) ; RV32IA-NEXT: sw s1, 20(sp) ; RV32IA-NEXT: sw s2, 16(sp) -; RV32IA-NEXT: sw s3, 12(sp) ; RV32IA-NEXT: mv s0, a0 ; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: lw a4, 0(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 -; RV32IA-NEXT: mv s3, sp ; RV32IA-NEXT: j .LBB212_2 ; RV32IA-NEXT: .LBB212_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB212_2 Depth=1 -; RV32IA-NEXT: sw a4, 0(sp) -; RV32IA-NEXT: sw a5, 4(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: addi a4, zero, 3 ; RV32IA-NEXT: mv a0, s0 -; RV32IA-NEXT: mv a1, s3 ; RV32IA-NEXT: mv a5, zero ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 4(sp) -; RV32IA-NEXT: lw a4, 0(sp) +; RV32IA-NEXT: lw a5, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) ; RV32IA-NEXT: bnez a0, .LBB212_7 ; RV32IA-NEXT: .LBB212_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: 
Depth=1 @@ -16859,7 +16386,7 @@ define i64 @atomicrmw_umax_i64_release(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: # in Loop: Header=BB212_2 Depth=1 ; RV32IA-NEXT: sltu a0, s1, a5 ; RV32IA-NEXT: j .LBB212_5 -; RV32IA-NEXT: .LBB212_4: +; RV32IA-NEXT: .LBB212_4: # in Loop: Header=BB212_2 Depth=1 ; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB212_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB212_2 Depth=1 @@ -16874,7 +16401,6 @@ define i64 @atomicrmw_umax_i64_release(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: .LBB212_7: # %atomicrmw.end ; RV32IA-NEXT: mv a0, a4 ; RV32IA-NEXT: mv a1, a5 -; RV32IA-NEXT: lw s3, 12(sp) ; RV32IA-NEXT: lw s2, 16(sp) ; RV32IA-NEXT: lw s1, 20(sp) ; RV32IA-NEXT: lw s0, 24(sp) @@ -16884,25 +16410,23 @@ define i64 @atomicrmw_umax_i64_release(i64 *%a, i64 %b) nounwind { ; ; RV64I-LABEL: atomicrmw_umax_i64_release: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: sd ra, 40(sp) -; RV64I-NEXT: sd s0, 32(sp) -; RV64I-NEXT: sd s1, 24(sp) -; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: sd ra, 24(sp) +; RV64I-NEXT: sd s0, 16(sp) +; RV64I-NEXT: sd s1, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: ld a3, 0(a0) ; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: addi s2, sp, 8 ; RV64I-NEXT: j .LBB212_2 ; RV64I-NEXT: .LBB212_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB212_2 Depth=1 -; RV64I-NEXT: sd a3, 8(sp) +; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: addi a3, zero, 3 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: mv a4, zero ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 8(sp) +; RV64I-NEXT: ld a3, 0(sp) ; RV64I-NEXT: bnez a0, .LBB212_4 ; RV64I-NEXT: .LBB212_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -16914,11 +16438,10 @@ define i64 @atomicrmw_umax_i64_release(i64 *%a, i64 %b) nounwind { ; RV64I-NEXT: j .LBB212_1 ; RV64I-NEXT: .LBB212_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s2, 16(sp) -; 
RV64I-NEXT: ld s1, 24(sp) -; RV64I-NEXT: ld s0, 32(sp) -; RV64I-NEXT: ld ra, 40(sp) -; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: ld s1, 8(sp) +; RV64I-NEXT: ld s0, 16(sp) +; RV64I-NEXT: ld ra, 24(sp) +; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret ; ; RV64IA-LABEL: atomicrmw_umax_i64_release: @@ -16937,25 +16460,23 @@ define i64 @atomicrmw_umax_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: lw a4, 0(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s3, sp ; RV32I-NEXT: j .LBB213_2 ; RV32I-NEXT: .LBB213_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB213_2 Depth=1 -; RV32I-NEXT: sw a4, 0(sp) -; RV32I-NEXT: sw a5, 4(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: addi a4, zero, 4 ; RV32I-NEXT: addi a5, zero, 2 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 4(sp) -; RV32I-NEXT: lw a4, 0(sp) +; RV32I-NEXT: lw a5, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) ; RV32I-NEXT: bnez a0, .LBB213_7 ; RV32I-NEXT: .LBB213_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -16964,7 +16485,7 @@ define i64 @atomicrmw_umax_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: # in Loop: Header=BB213_2 Depth=1 ; RV32I-NEXT: sltu a0, s1, a5 ; RV32I-NEXT: j .LBB213_5 -; RV32I-NEXT: .LBB213_4: +; RV32I-NEXT: .LBB213_4: # in Loop: Header=BB213_2 Depth=1 ; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB213_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB213_2 Depth=1 @@ -16979,7 +16500,6 @@ define i64 @atomicrmw_umax_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: .LBB213_7: # %atomicrmw.end ; RV32I-NEXT: mv a0, a4 ; RV32I-NEXT: mv a1, a5 -; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw 
s0, 24(sp) @@ -16994,25 +16514,23 @@ define i64 @atomicrmw_umax_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) ; RV32IA-NEXT: sw s1, 20(sp) ; RV32IA-NEXT: sw s2, 16(sp) -; RV32IA-NEXT: sw s3, 12(sp) ; RV32IA-NEXT: mv s0, a0 ; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: lw a4, 0(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 -; RV32IA-NEXT: mv s3, sp ; RV32IA-NEXT: j .LBB213_2 ; RV32IA-NEXT: .LBB213_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB213_2 Depth=1 -; RV32IA-NEXT: sw a4, 0(sp) -; RV32IA-NEXT: sw a5, 4(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: addi a4, zero, 4 ; RV32IA-NEXT: addi a5, zero, 2 ; RV32IA-NEXT: mv a0, s0 -; RV32IA-NEXT: mv a1, s3 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 4(sp) -; RV32IA-NEXT: lw a4, 0(sp) +; RV32IA-NEXT: lw a5, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) ; RV32IA-NEXT: bnez a0, .LBB213_7 ; RV32IA-NEXT: .LBB213_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 @@ -17021,7 +16539,7 @@ define i64 @atomicrmw_umax_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: # in Loop: Header=BB213_2 Depth=1 ; RV32IA-NEXT: sltu a0, s1, a5 ; RV32IA-NEXT: j .LBB213_5 -; RV32IA-NEXT: .LBB213_4: +; RV32IA-NEXT: .LBB213_4: # in Loop: Header=BB213_2 Depth=1 ; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB213_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB213_2 Depth=1 @@ -17036,7 +16554,6 @@ define i64 @atomicrmw_umax_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: .LBB213_7: # %atomicrmw.end ; RV32IA-NEXT: mv a0, a4 ; RV32IA-NEXT: mv a1, a5 -; RV32IA-NEXT: lw s3, 12(sp) ; RV32IA-NEXT: lw s2, 16(sp) ; RV32IA-NEXT: lw s1, 20(sp) ; RV32IA-NEXT: lw s0, 24(sp) @@ -17046,25 +16563,23 @@ define i64 @atomicrmw_umax_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; ; RV64I-LABEL: atomicrmw_umax_i64_acq_rel: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: sd ra, 40(sp) -; 
RV64I-NEXT: sd s0, 32(sp) -; RV64I-NEXT: sd s1, 24(sp) -; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: sd ra, 24(sp) +; RV64I-NEXT: sd s0, 16(sp) +; RV64I-NEXT: sd s1, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: ld a3, 0(a0) ; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: addi s2, sp, 8 ; RV64I-NEXT: j .LBB213_2 ; RV64I-NEXT: .LBB213_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB213_2 Depth=1 -; RV64I-NEXT: sd a3, 8(sp) +; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: addi a3, zero, 4 ; RV64I-NEXT: addi a4, zero, 2 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 8(sp) +; RV64I-NEXT: ld a3, 0(sp) ; RV64I-NEXT: bnez a0, .LBB213_4 ; RV64I-NEXT: .LBB213_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -17076,11 +16591,10 @@ define i64 @atomicrmw_umax_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; RV64I-NEXT: j .LBB213_1 ; RV64I-NEXT: .LBB213_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s2, 16(sp) -; RV64I-NEXT: ld s1, 24(sp) -; RV64I-NEXT: ld s0, 32(sp) -; RV64I-NEXT: ld ra, 40(sp) -; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: ld s1, 8(sp) +; RV64I-NEXT: ld s0, 16(sp) +; RV64I-NEXT: ld ra, 24(sp) +; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret ; ; RV64IA-LABEL: atomicrmw_umax_i64_acq_rel: @@ -17099,25 +16613,23 @@ define i64 @atomicrmw_umax_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: lw a4, 0(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s3, sp ; RV32I-NEXT: j .LBB214_2 ; RV32I-NEXT: .LBB214_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB214_2 Depth=1 -; RV32I-NEXT: sw a4, 0(sp) -; RV32I-NEXT: sw a5, 4(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: addi 
a4, zero, 5 ; RV32I-NEXT: addi a5, zero, 5 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 4(sp) -; RV32I-NEXT: lw a4, 0(sp) +; RV32I-NEXT: lw a5, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) ; RV32I-NEXT: bnez a0, .LBB214_7 ; RV32I-NEXT: .LBB214_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -17126,7 +16638,7 @@ define i64 @atomicrmw_umax_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: # in Loop: Header=BB214_2 Depth=1 ; RV32I-NEXT: sltu a0, s1, a5 ; RV32I-NEXT: j .LBB214_5 -; RV32I-NEXT: .LBB214_4: +; RV32I-NEXT: .LBB214_4: # in Loop: Header=BB214_2 Depth=1 ; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB214_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB214_2 Depth=1 @@ -17141,7 +16653,6 @@ define i64 @atomicrmw_umax_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: .LBB214_7: # %atomicrmw.end ; RV32I-NEXT: mv a0, a4 ; RV32I-NEXT: mv a1, a5 -; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -17156,25 +16667,23 @@ define i64 @atomicrmw_umax_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) ; RV32IA-NEXT: sw s1, 20(sp) ; RV32IA-NEXT: sw s2, 16(sp) -; RV32IA-NEXT: sw s3, 12(sp) ; RV32IA-NEXT: mv s0, a0 ; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: lw a4, 0(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 -; RV32IA-NEXT: mv s3, sp ; RV32IA-NEXT: j .LBB214_2 ; RV32IA-NEXT: .LBB214_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB214_2 Depth=1 -; RV32IA-NEXT: sw a4, 0(sp) -; RV32IA-NEXT: sw a5, 4(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: addi a4, zero, 5 ; RV32IA-NEXT: addi a5, zero, 5 ; RV32IA-NEXT: mv a0, s0 -; RV32IA-NEXT: mv a1, s3 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 4(sp) -; RV32IA-NEXT: lw a4, 0(sp) +; RV32IA-NEXT: lw a5, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) ; 
RV32IA-NEXT: bnez a0, .LBB214_7 ; RV32IA-NEXT: .LBB214_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 @@ -17183,7 +16692,7 @@ define i64 @atomicrmw_umax_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: # in Loop: Header=BB214_2 Depth=1 ; RV32IA-NEXT: sltu a0, s1, a5 ; RV32IA-NEXT: j .LBB214_5 -; RV32IA-NEXT: .LBB214_4: +; RV32IA-NEXT: .LBB214_4: # in Loop: Header=BB214_2 Depth=1 ; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB214_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB214_2 Depth=1 @@ -17198,7 +16707,6 @@ define i64 @atomicrmw_umax_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: .LBB214_7: # %atomicrmw.end ; RV32IA-NEXT: mv a0, a4 ; RV32IA-NEXT: mv a1, a5 -; RV32IA-NEXT: lw s3, 12(sp) ; RV32IA-NEXT: lw s2, 16(sp) ; RV32IA-NEXT: lw s1, 20(sp) ; RV32IA-NEXT: lw s0, 24(sp) @@ -17208,25 +16716,23 @@ define i64 @atomicrmw_umax_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; ; RV64I-LABEL: atomicrmw_umax_i64_seq_cst: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: sd ra, 40(sp) -; RV64I-NEXT: sd s0, 32(sp) -; RV64I-NEXT: sd s1, 24(sp) -; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: sd ra, 24(sp) +; RV64I-NEXT: sd s0, 16(sp) +; RV64I-NEXT: sd s1, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: ld a3, 0(a0) ; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: addi s2, sp, 8 ; RV64I-NEXT: j .LBB214_2 ; RV64I-NEXT: .LBB214_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB214_2 Depth=1 -; RV64I-NEXT: sd a3, 8(sp) +; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: addi a3, zero, 5 ; RV64I-NEXT: addi a4, zero, 5 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 8(sp) +; RV64I-NEXT: ld a3, 0(sp) ; RV64I-NEXT: bnez a0, .LBB214_4 ; RV64I-NEXT: .LBB214_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -17238,11 +16744,10 @@ define i64 @atomicrmw_umax_i64_seq_cst(i64 *%a, i64 %b) nounwind { 
; RV64I-NEXT: j .LBB214_1 ; RV64I-NEXT: .LBB214_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s2, 16(sp) -; RV64I-NEXT: ld s1, 24(sp) -; RV64I-NEXT: ld s0, 32(sp) -; RV64I-NEXT: ld ra, 40(sp) -; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: ld s1, 8(sp) +; RV64I-NEXT: ld s0, 16(sp) +; RV64I-NEXT: ld ra, 24(sp) +; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret ; ; RV64IA-LABEL: atomicrmw_umax_i64_seq_cst: @@ -17261,25 +16766,23 @@ define i64 @atomicrmw_umin_i64_monotonic(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: lw a4, 0(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s3, sp ; RV32I-NEXT: j .LBB215_2 ; RV32I-NEXT: .LBB215_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB215_2 Depth=1 -; RV32I-NEXT: sw a4, 0(sp) -; RV32I-NEXT: sw a5, 4(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: mv a5, zero ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 4(sp) -; RV32I-NEXT: lw a4, 0(sp) +; RV32I-NEXT: lw a5, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) ; RV32I-NEXT: bnez a0, .LBB215_7 ; RV32I-NEXT: .LBB215_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -17288,7 +16791,7 @@ define i64 @atomicrmw_umin_i64_monotonic(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: # in Loop: Header=BB215_2 Depth=1 ; RV32I-NEXT: sltu a0, s1, a5 ; RV32I-NEXT: j .LBB215_5 -; RV32I-NEXT: .LBB215_4: +; RV32I-NEXT: .LBB215_4: # in Loop: Header=BB215_2 Depth=1 ; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB215_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB215_2 Depth=1 @@ -17304,7 +16807,6 @@ define i64 @atomicrmw_umin_i64_monotonic(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: .LBB215_7: # %atomicrmw.end ; RV32I-NEXT: mv a0, a4 ; 
RV32I-NEXT: mv a1, a5 -; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -17319,25 +16821,23 @@ define i64 @atomicrmw_umin_i64_monotonic(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) ; RV32IA-NEXT: sw s1, 20(sp) ; RV32IA-NEXT: sw s2, 16(sp) -; RV32IA-NEXT: sw s3, 12(sp) ; RV32IA-NEXT: mv s0, a0 ; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: lw a4, 0(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 -; RV32IA-NEXT: mv s3, sp ; RV32IA-NEXT: j .LBB215_2 ; RV32IA-NEXT: .LBB215_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB215_2 Depth=1 -; RV32IA-NEXT: sw a4, 0(sp) -; RV32IA-NEXT: sw a5, 4(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: mv a0, s0 -; RV32IA-NEXT: mv a1, s3 ; RV32IA-NEXT: mv a4, zero ; RV32IA-NEXT: mv a5, zero ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 4(sp) -; RV32IA-NEXT: lw a4, 0(sp) +; RV32IA-NEXT: lw a5, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) ; RV32IA-NEXT: bnez a0, .LBB215_7 ; RV32IA-NEXT: .LBB215_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 @@ -17346,7 +16846,7 @@ define i64 @atomicrmw_umin_i64_monotonic(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: # in Loop: Header=BB215_2 Depth=1 ; RV32IA-NEXT: sltu a0, s1, a5 ; RV32IA-NEXT: j .LBB215_5 -; RV32IA-NEXT: .LBB215_4: +; RV32IA-NEXT: .LBB215_4: # in Loop: Header=BB215_2 Depth=1 ; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB215_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB215_2 Depth=1 @@ -17362,7 +16862,6 @@ define i64 @atomicrmw_umin_i64_monotonic(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: .LBB215_7: # %atomicrmw.end ; RV32IA-NEXT: mv a0, a4 ; RV32IA-NEXT: mv a1, a5 -; RV32IA-NEXT: lw s3, 12(sp) ; RV32IA-NEXT: lw s2, 16(sp) ; RV32IA-NEXT: lw s1, 20(sp) ; RV32IA-NEXT: lw s0, 24(sp) @@ -17372,25 +16871,23 @@ define i64 @atomicrmw_umin_i64_monotonic(i64 *%a, i64 %b) nounwind { ; ; 
RV64I-LABEL: atomicrmw_umin_i64_monotonic: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: sd ra, 40(sp) -; RV64I-NEXT: sd s0, 32(sp) -; RV64I-NEXT: sd s1, 24(sp) -; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: sd ra, 24(sp) +; RV64I-NEXT: sd s0, 16(sp) +; RV64I-NEXT: sd s1, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: ld a3, 0(a0) ; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: addi s2, sp, 8 ; RV64I-NEXT: j .LBB215_2 ; RV64I-NEXT: .LBB215_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB215_2 Depth=1 -; RV64I-NEXT: sd a3, 8(sp) +; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: mv a3, zero ; RV64I-NEXT: mv a4, zero ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 8(sp) +; RV64I-NEXT: ld a3, 0(sp) ; RV64I-NEXT: bnez a0, .LBB215_4 ; RV64I-NEXT: .LBB215_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -17402,11 +16899,10 @@ define i64 @atomicrmw_umin_i64_monotonic(i64 *%a, i64 %b) nounwind { ; RV64I-NEXT: j .LBB215_1 ; RV64I-NEXT: .LBB215_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s2, 16(sp) -; RV64I-NEXT: ld s1, 24(sp) -; RV64I-NEXT: ld s0, 32(sp) -; RV64I-NEXT: ld ra, 40(sp) -; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: ld s1, 8(sp) +; RV64I-NEXT: ld s0, 16(sp) +; RV64I-NEXT: ld ra, 24(sp) +; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret ; ; RV64IA-LABEL: atomicrmw_umin_i64_monotonic: @@ -17425,25 +16921,23 @@ define i64 @atomicrmw_umin_i64_acquire(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: lw a4, 0(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s3, sp ; RV32I-NEXT: j .LBB216_2 ; RV32I-NEXT: .LBB216_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB216_2 Depth=1 -; RV32I-NEXT: sw a4, 0(sp) -; RV32I-NEXT: sw 
a5, 4(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: addi a4, zero, 2 ; RV32I-NEXT: addi a5, zero, 2 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 4(sp) -; RV32I-NEXT: lw a4, 0(sp) +; RV32I-NEXT: lw a5, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) ; RV32I-NEXT: bnez a0, .LBB216_7 ; RV32I-NEXT: .LBB216_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -17452,7 +16946,7 @@ define i64 @atomicrmw_umin_i64_acquire(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: # in Loop: Header=BB216_2 Depth=1 ; RV32I-NEXT: sltu a0, s1, a5 ; RV32I-NEXT: j .LBB216_5 -; RV32I-NEXT: .LBB216_4: +; RV32I-NEXT: .LBB216_4: # in Loop: Header=BB216_2 Depth=1 ; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB216_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB216_2 Depth=1 @@ -17468,7 +16962,6 @@ define i64 @atomicrmw_umin_i64_acquire(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: .LBB216_7: # %atomicrmw.end ; RV32I-NEXT: mv a0, a4 ; RV32I-NEXT: mv a1, a5 -; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -17483,25 +16976,23 @@ define i64 @atomicrmw_umin_i64_acquire(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) ; RV32IA-NEXT: sw s1, 20(sp) ; RV32IA-NEXT: sw s2, 16(sp) -; RV32IA-NEXT: sw s3, 12(sp) ; RV32IA-NEXT: mv s0, a0 ; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: lw a4, 0(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 -; RV32IA-NEXT: mv s3, sp ; RV32IA-NEXT: j .LBB216_2 ; RV32IA-NEXT: .LBB216_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB216_2 Depth=1 -; RV32IA-NEXT: sw a4, 0(sp) -; RV32IA-NEXT: sw a5, 4(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: addi a4, zero, 2 ; RV32IA-NEXT: addi a5, zero, 2 ; RV32IA-NEXT: mv a0, s0 -; RV32IA-NEXT: mv a1, s3 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; 
RV32IA-NEXT: lw a5, 4(sp) -; RV32IA-NEXT: lw a4, 0(sp) +; RV32IA-NEXT: lw a5, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) ; RV32IA-NEXT: bnez a0, .LBB216_7 ; RV32IA-NEXT: .LBB216_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 @@ -17510,7 +17001,7 @@ define i64 @atomicrmw_umin_i64_acquire(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: # in Loop: Header=BB216_2 Depth=1 ; RV32IA-NEXT: sltu a0, s1, a5 ; RV32IA-NEXT: j .LBB216_5 -; RV32IA-NEXT: .LBB216_4: +; RV32IA-NEXT: .LBB216_4: # in Loop: Header=BB216_2 Depth=1 ; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB216_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB216_2 Depth=1 @@ -17526,7 +17017,6 @@ define i64 @atomicrmw_umin_i64_acquire(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: .LBB216_7: # %atomicrmw.end ; RV32IA-NEXT: mv a0, a4 ; RV32IA-NEXT: mv a1, a5 -; RV32IA-NEXT: lw s3, 12(sp) ; RV32IA-NEXT: lw s2, 16(sp) ; RV32IA-NEXT: lw s1, 20(sp) ; RV32IA-NEXT: lw s0, 24(sp) @@ -17536,25 +17026,23 @@ define i64 @atomicrmw_umin_i64_acquire(i64 *%a, i64 %b) nounwind { ; ; RV64I-LABEL: atomicrmw_umin_i64_acquire: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: sd ra, 40(sp) -; RV64I-NEXT: sd s0, 32(sp) -; RV64I-NEXT: sd s1, 24(sp) -; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: sd ra, 24(sp) +; RV64I-NEXT: sd s0, 16(sp) +; RV64I-NEXT: sd s1, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: ld a3, 0(a0) ; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: addi s2, sp, 8 ; RV64I-NEXT: j .LBB216_2 ; RV64I-NEXT: .LBB216_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB216_2 Depth=1 -; RV64I-NEXT: sd a3, 8(sp) +; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: addi a3, zero, 2 ; RV64I-NEXT: addi a4, zero, 2 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 8(sp) +; RV64I-NEXT: ld a3, 0(sp) ; RV64I-NEXT: bnez a0, .LBB216_4 ; RV64I-NEXT: .LBB216_2: # %atomicrmw.start ; RV64I-NEXT: # =>This 
Inner Loop Header: Depth=1 @@ -17566,11 +17054,10 @@ define i64 @atomicrmw_umin_i64_acquire(i64 *%a, i64 %b) nounwind { ; RV64I-NEXT: j .LBB216_1 ; RV64I-NEXT: .LBB216_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s2, 16(sp) -; RV64I-NEXT: ld s1, 24(sp) -; RV64I-NEXT: ld s0, 32(sp) -; RV64I-NEXT: ld ra, 40(sp) -; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: ld s1, 8(sp) +; RV64I-NEXT: ld s0, 16(sp) +; RV64I-NEXT: ld ra, 24(sp) +; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret ; ; RV64IA-LABEL: atomicrmw_umin_i64_acquire: @@ -17589,25 +17076,23 @@ define i64 @atomicrmw_umin_i64_release(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: lw a4, 0(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s3, sp ; RV32I-NEXT: j .LBB217_2 ; RV32I-NEXT: .LBB217_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB217_2 Depth=1 -; RV32I-NEXT: sw a4, 0(sp) -; RV32I-NEXT: sw a5, 4(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: addi a4, zero, 3 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: mv a5, zero ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 4(sp) -; RV32I-NEXT: lw a4, 0(sp) +; RV32I-NEXT: lw a5, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) ; RV32I-NEXT: bnez a0, .LBB217_7 ; RV32I-NEXT: .LBB217_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -17616,7 +17101,7 @@ define i64 @atomicrmw_umin_i64_release(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: # in Loop: Header=BB217_2 Depth=1 ; RV32I-NEXT: sltu a0, s1, a5 ; RV32I-NEXT: j .LBB217_5 -; RV32I-NEXT: .LBB217_4: +; RV32I-NEXT: .LBB217_4: # in Loop: Header=BB217_2 Depth=1 ; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB217_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB217_2 Depth=1 @@ -17632,7 +17117,6 @@ define i64 
@atomicrmw_umin_i64_release(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: .LBB217_7: # %atomicrmw.end ; RV32I-NEXT: mv a0, a4 ; RV32I-NEXT: mv a1, a5 -; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -17647,25 +17131,23 @@ define i64 @atomicrmw_umin_i64_release(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) ; RV32IA-NEXT: sw s1, 20(sp) ; RV32IA-NEXT: sw s2, 16(sp) -; RV32IA-NEXT: sw s3, 12(sp) ; RV32IA-NEXT: mv s0, a0 ; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: lw a4, 0(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 -; RV32IA-NEXT: mv s3, sp ; RV32IA-NEXT: j .LBB217_2 ; RV32IA-NEXT: .LBB217_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB217_2 Depth=1 -; RV32IA-NEXT: sw a4, 0(sp) -; RV32IA-NEXT: sw a5, 4(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: addi a4, zero, 3 ; RV32IA-NEXT: mv a0, s0 -; RV32IA-NEXT: mv a1, s3 ; RV32IA-NEXT: mv a5, zero ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 4(sp) -; RV32IA-NEXT: lw a4, 0(sp) +; RV32IA-NEXT: lw a5, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) ; RV32IA-NEXT: bnez a0, .LBB217_7 ; RV32IA-NEXT: .LBB217_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 @@ -17674,7 +17156,7 @@ define i64 @atomicrmw_umin_i64_release(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: # in Loop: Header=BB217_2 Depth=1 ; RV32IA-NEXT: sltu a0, s1, a5 ; RV32IA-NEXT: j .LBB217_5 -; RV32IA-NEXT: .LBB217_4: +; RV32IA-NEXT: .LBB217_4: # in Loop: Header=BB217_2 Depth=1 ; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB217_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB217_2 Depth=1 @@ -17690,7 +17172,6 @@ define i64 @atomicrmw_umin_i64_release(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: .LBB217_7: # %atomicrmw.end ; RV32IA-NEXT: mv a0, a4 ; RV32IA-NEXT: mv a1, a5 -; RV32IA-NEXT: lw s3, 12(sp) ; RV32IA-NEXT: lw s2, 16(sp) ; RV32IA-NEXT: lw s1, 20(sp) ; RV32IA-NEXT: 
lw s0, 24(sp) @@ -17700,25 +17181,23 @@ define i64 @atomicrmw_umin_i64_release(i64 *%a, i64 %b) nounwind { ; ; RV64I-LABEL: atomicrmw_umin_i64_release: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: sd ra, 40(sp) -; RV64I-NEXT: sd s0, 32(sp) -; RV64I-NEXT: sd s1, 24(sp) -; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: sd ra, 24(sp) +; RV64I-NEXT: sd s0, 16(sp) +; RV64I-NEXT: sd s1, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: ld a3, 0(a0) ; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: addi s2, sp, 8 ; RV64I-NEXT: j .LBB217_2 ; RV64I-NEXT: .LBB217_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB217_2 Depth=1 -; RV64I-NEXT: sd a3, 8(sp) +; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: addi a3, zero, 3 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: mv a4, zero ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 8(sp) +; RV64I-NEXT: ld a3, 0(sp) ; RV64I-NEXT: bnez a0, .LBB217_4 ; RV64I-NEXT: .LBB217_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -17730,11 +17209,10 @@ define i64 @atomicrmw_umin_i64_release(i64 *%a, i64 %b) nounwind { ; RV64I-NEXT: j .LBB217_1 ; RV64I-NEXT: .LBB217_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s2, 16(sp) -; RV64I-NEXT: ld s1, 24(sp) -; RV64I-NEXT: ld s0, 32(sp) -; RV64I-NEXT: ld ra, 40(sp) -; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: ld s1, 8(sp) +; RV64I-NEXT: ld s0, 16(sp) +; RV64I-NEXT: ld ra, 24(sp) +; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret ; ; RV64IA-LABEL: atomicrmw_umin_i64_release: @@ -17753,25 +17231,23 @@ define i64 @atomicrmw_umin_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: lw a4, 0(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s3, sp ; RV32I-NEXT: j .LBB218_2 ; RV32I-NEXT: .LBB218_1: # 
%atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB218_2 Depth=1 -; RV32I-NEXT: sw a4, 0(sp) -; RV32I-NEXT: sw a5, 4(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: addi a4, zero, 4 ; RV32I-NEXT: addi a5, zero, 2 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 4(sp) -; RV32I-NEXT: lw a4, 0(sp) +; RV32I-NEXT: lw a5, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) ; RV32I-NEXT: bnez a0, .LBB218_7 ; RV32I-NEXT: .LBB218_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -17780,7 +17256,7 @@ define i64 @atomicrmw_umin_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: # in Loop: Header=BB218_2 Depth=1 ; RV32I-NEXT: sltu a0, s1, a5 ; RV32I-NEXT: j .LBB218_5 -; RV32I-NEXT: .LBB218_4: +; RV32I-NEXT: .LBB218_4: # in Loop: Header=BB218_2 Depth=1 ; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB218_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB218_2 Depth=1 @@ -17796,7 +17272,6 @@ define i64 @atomicrmw_umin_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: .LBB218_7: # %atomicrmw.end ; RV32I-NEXT: mv a0, a4 ; RV32I-NEXT: mv a1, a5 -; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -17811,25 +17286,23 @@ define i64 @atomicrmw_umin_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) ; RV32IA-NEXT: sw s1, 20(sp) ; RV32IA-NEXT: sw s2, 16(sp) -; RV32IA-NEXT: sw s3, 12(sp) ; RV32IA-NEXT: mv s0, a0 ; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: lw a4, 0(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 -; RV32IA-NEXT: mv s3, sp ; RV32IA-NEXT: j .LBB218_2 ; RV32IA-NEXT: .LBB218_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB218_2 Depth=1 -; RV32IA-NEXT: sw a4, 0(sp) -; RV32IA-NEXT: sw a5, 4(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: addi a4, zero, 4 ; RV32IA-NEXT: addi a5, 
zero, 2 ; RV32IA-NEXT: mv a0, s0 -; RV32IA-NEXT: mv a1, s3 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 4(sp) -; RV32IA-NEXT: lw a4, 0(sp) +; RV32IA-NEXT: lw a5, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) ; RV32IA-NEXT: bnez a0, .LBB218_7 ; RV32IA-NEXT: .LBB218_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 @@ -17838,7 +17311,7 @@ define i64 @atomicrmw_umin_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: # in Loop: Header=BB218_2 Depth=1 ; RV32IA-NEXT: sltu a0, s1, a5 ; RV32IA-NEXT: j .LBB218_5 -; RV32IA-NEXT: .LBB218_4: +; RV32IA-NEXT: .LBB218_4: # in Loop: Header=BB218_2 Depth=1 ; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB218_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB218_2 Depth=1 @@ -17854,7 +17327,6 @@ define i64 @atomicrmw_umin_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: .LBB218_7: # %atomicrmw.end ; RV32IA-NEXT: mv a0, a4 ; RV32IA-NEXT: mv a1, a5 -; RV32IA-NEXT: lw s3, 12(sp) ; RV32IA-NEXT: lw s2, 16(sp) ; RV32IA-NEXT: lw s1, 20(sp) ; RV32IA-NEXT: lw s0, 24(sp) @@ -17864,25 +17336,23 @@ define i64 @atomicrmw_umin_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; ; RV64I-LABEL: atomicrmw_umin_i64_acq_rel: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: sd ra, 40(sp) -; RV64I-NEXT: sd s0, 32(sp) -; RV64I-NEXT: sd s1, 24(sp) -; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: sd ra, 24(sp) +; RV64I-NEXT: sd s0, 16(sp) +; RV64I-NEXT: sd s1, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: ld a3, 0(a0) ; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: addi s2, sp, 8 ; RV64I-NEXT: j .LBB218_2 ; RV64I-NEXT: .LBB218_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB218_2 Depth=1 -; RV64I-NEXT: sd a3, 8(sp) +; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: addi a3, zero, 4 ; RV64I-NEXT: addi a4, zero, 2 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 8(sp) +; RV64I-NEXT: ld a3, 
0(sp) ; RV64I-NEXT: bnez a0, .LBB218_4 ; RV64I-NEXT: .LBB218_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -17894,11 +17364,10 @@ define i64 @atomicrmw_umin_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; RV64I-NEXT: j .LBB218_1 ; RV64I-NEXT: .LBB218_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s2, 16(sp) -; RV64I-NEXT: ld s1, 24(sp) -; RV64I-NEXT: ld s0, 32(sp) -; RV64I-NEXT: ld ra, 40(sp) -; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: ld s1, 8(sp) +; RV64I-NEXT: ld s0, 16(sp) +; RV64I-NEXT: ld ra, 24(sp) +; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret ; ; RV64IA-LABEL: atomicrmw_umin_i64_acq_rel: @@ -17917,25 +17386,23 @@ define i64 @atomicrmw_umin_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) -; RV32I-NEXT: sw s3, 12(sp) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: lw a4, 0(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s3, sp ; RV32I-NEXT: j .LBB219_2 ; RV32I-NEXT: .LBB219_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB219_2 Depth=1 -; RV32I-NEXT: sw a4, 0(sp) -; RV32I-NEXT: sw a5, 4(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: addi a4, zero, 5 ; RV32I-NEXT: addi a5, zero, 5 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 4(sp) -; RV32I-NEXT: lw a4, 0(sp) +; RV32I-NEXT: lw a5, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) ; RV32I-NEXT: bnez a0, .LBB219_7 ; RV32I-NEXT: .LBB219_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -17944,7 +17411,7 @@ define i64 @atomicrmw_umin_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: # in Loop: Header=BB219_2 Depth=1 ; RV32I-NEXT: sltu a0, s1, a5 ; RV32I-NEXT: j .LBB219_5 -; RV32I-NEXT: .LBB219_4: +; RV32I-NEXT: .LBB219_4: # in Loop: Header=BB219_2 Depth=1 ; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB219_5: # 
%atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB219_2 Depth=1 @@ -17960,7 +17427,6 @@ define i64 @atomicrmw_umin_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; RV32I-NEXT: .LBB219_7: # %atomicrmw.end ; RV32I-NEXT: mv a0, a4 ; RV32I-NEXT: mv a1, a5 -; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) ; RV32I-NEXT: lw s0, 24(sp) @@ -17975,25 +17441,23 @@ define i64 @atomicrmw_umin_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) ; RV32IA-NEXT: sw s1, 20(sp) ; RV32IA-NEXT: sw s2, 16(sp) -; RV32IA-NEXT: sw s3, 12(sp) ; RV32IA-NEXT: mv s0, a0 ; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: lw a4, 0(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 -; RV32IA-NEXT: mv s3, sp ; RV32IA-NEXT: j .LBB219_2 ; RV32IA-NEXT: .LBB219_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB219_2 Depth=1 -; RV32IA-NEXT: sw a4, 0(sp) -; RV32IA-NEXT: sw a5, 4(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: addi a4, zero, 5 ; RV32IA-NEXT: addi a5, zero, 5 ; RV32IA-NEXT: mv a0, s0 -; RV32IA-NEXT: mv a1, s3 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 4(sp) -; RV32IA-NEXT: lw a4, 0(sp) +; RV32IA-NEXT: lw a5, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) ; RV32IA-NEXT: bnez a0, .LBB219_7 ; RV32IA-NEXT: .LBB219_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 @@ -18002,7 +17466,7 @@ define i64 @atomicrmw_umin_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: # in Loop: Header=BB219_2 Depth=1 ; RV32IA-NEXT: sltu a0, s1, a5 ; RV32IA-NEXT: j .LBB219_5 -; RV32IA-NEXT: .LBB219_4: +; RV32IA-NEXT: .LBB219_4: # in Loop: Header=BB219_2 Depth=1 ; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB219_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB219_2 Depth=1 @@ -18018,7 +17482,6 @@ define i64 @atomicrmw_umin_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; RV32IA-NEXT: .LBB219_7: # %atomicrmw.end ; RV32IA-NEXT: mv a0, a4 ; RV32IA-NEXT: mv a1, a5 
-; RV32IA-NEXT: lw s3, 12(sp) ; RV32IA-NEXT: lw s2, 16(sp) ; RV32IA-NEXT: lw s1, 20(sp) ; RV32IA-NEXT: lw s0, 24(sp) @@ -18028,25 +17491,23 @@ define i64 @atomicrmw_umin_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; ; RV64I-LABEL: atomicrmw_umin_i64_seq_cst: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: sd ra, 40(sp) -; RV64I-NEXT: sd s0, 32(sp) -; RV64I-NEXT: sd s1, 24(sp) -; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: sd ra, 24(sp) +; RV64I-NEXT: sd s0, 16(sp) +; RV64I-NEXT: sd s1, 8(sp) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: ld a3, 0(a0) ; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: addi s2, sp, 8 ; RV64I-NEXT: j .LBB219_2 ; RV64I-NEXT: .LBB219_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB219_2 Depth=1 -; RV64I-NEXT: sd a3, 8(sp) +; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: addi a3, zero, 5 ; RV64I-NEXT: addi a4, zero, 5 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 8(sp) +; RV64I-NEXT: ld a3, 0(sp) ; RV64I-NEXT: bnez a0, .LBB219_4 ; RV64I-NEXT: .LBB219_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -18058,11 +17519,10 @@ define i64 @atomicrmw_umin_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; RV64I-NEXT: j .LBB219_1 ; RV64I-NEXT: .LBB219_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 -; RV64I-NEXT: ld s2, 16(sp) -; RV64I-NEXT: ld s1, 24(sp) -; RV64I-NEXT: ld s0, 32(sp) -; RV64I-NEXT: ld ra, 40(sp) -; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: ld s1, 8(sp) +; RV64I-NEXT: ld s0, 16(sp) +; RV64I-NEXT: ld ra, 24(sp) +; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret ; ; RV64IA-LABEL: atomicrmw_umin_i64_seq_cst: diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll index 24788e110b461..284476fffe051 100644 --- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll +++ 
b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll @@ -172,9 +172,9 @@ define void @caller_aligned_stack() nounwind { ; RV32I-FPELIM-NEXT: addi t0, a0, 1311 ; RV32I-FPELIM-NEXT: lui a0, 688509 ; RV32I-FPELIM-NEXT: addi a5, a0, -2048 -; RV32I-FPELIM-NEXT: addi a2, sp, 32 ; RV32I-FPELIM-NEXT: addi a0, zero, 1 ; RV32I-FPELIM-NEXT: addi a1, zero, 11 +; RV32I-FPELIM-NEXT: addi a2, sp, 32 ; RV32I-FPELIM-NEXT: addi a3, zero, 12 ; RV32I-FPELIM-NEXT: addi a4, zero, 13 ; RV32I-FPELIM-NEXT: addi a6, zero, 4 @@ -218,9 +218,9 @@ define void @caller_aligned_stack() nounwind { ; RV32I-WITHFP-NEXT: addi t0, a0, 1311 ; RV32I-WITHFP-NEXT: lui a0, 688509 ; RV32I-WITHFP-NEXT: addi a5, a0, -2048 -; RV32I-WITHFP-NEXT: addi a2, s0, -32 ; RV32I-WITHFP-NEXT: addi a0, zero, 1 ; RV32I-WITHFP-NEXT: addi a1, zero, 11 +; RV32I-WITHFP-NEXT: addi a2, s0, -32 ; RV32I-WITHFP-NEXT: addi a3, zero, 12 ; RV32I-WITHFP-NEXT: addi a4, zero, 13 ; RV32I-WITHFP-NEXT: addi a6, zero, 4 diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll index e86a8c7406f7c..3e7945887f4d1 100644 --- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll +++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll @@ -368,7 +368,6 @@ define i32 @caller_large_scalars_exhausted_regs() nounwind { ; RV32I-FPELIM-NEXT: sw zero, 48(sp) ; RV32I-FPELIM-NEXT: sw zero, 44(sp) ; RV32I-FPELIM-NEXT: addi t0, zero, 8 -; RV32I-FPELIM-NEXT: addi a7, sp, 40 ; RV32I-FPELIM-NEXT: addi a0, zero, 1 ; RV32I-FPELIM-NEXT: addi a1, zero, 2 ; RV32I-FPELIM-NEXT: addi a2, zero, 3 @@ -376,6 +375,7 @@ define i32 @caller_large_scalars_exhausted_regs() nounwind { ; RV32I-FPELIM-NEXT: addi a4, zero, 5 ; RV32I-FPELIM-NEXT: addi a5, zero, 6 ; RV32I-FPELIM-NEXT: addi a6, zero, 7 +; RV32I-FPELIM-NEXT: addi a7, sp, 40 ; RV32I-FPELIM-NEXT: sw t0, 40(sp) ; RV32I-FPELIM-NEXT: call callee_large_scalars_exhausted_regs ; RV32I-FPELIM-NEXT: 
lw ra, 60(sp) @@ -401,7 +401,6 @@ define i32 @caller_large_scalars_exhausted_regs() nounwind { ; RV32I-WITHFP-NEXT: sw zero, -16(s0) ; RV32I-WITHFP-NEXT: sw zero, -20(s0) ; RV32I-WITHFP-NEXT: addi t0, zero, 8 -; RV32I-WITHFP-NEXT: addi a7, s0, -24 ; RV32I-WITHFP-NEXT: addi a0, zero, 1 ; RV32I-WITHFP-NEXT: addi a1, zero, 2 ; RV32I-WITHFP-NEXT: addi a2, zero, 3 @@ -409,6 +408,7 @@ define i32 @caller_large_scalars_exhausted_regs() nounwind { ; RV32I-WITHFP-NEXT: addi a4, zero, 5 ; RV32I-WITHFP-NEXT: addi a5, zero, 6 ; RV32I-WITHFP-NEXT: addi a6, zero, 7 +; RV32I-WITHFP-NEXT: addi a7, s0, -24 ; RV32I-WITHFP-NEXT: sw t0, -24(s0) ; RV32I-WITHFP-NEXT: call callee_large_scalars_exhausted_regs ; RV32I-WITHFP-NEXT: lw s0, 56(sp) @@ -693,9 +693,9 @@ define void @caller_aligned_stack() nounwind { ; RV32I-FPELIM-NEXT: addi t0, a0, 1311 ; RV32I-FPELIM-NEXT: lui a0, 688509 ; RV32I-FPELIM-NEXT: addi a5, a0, -2048 -; RV32I-FPELIM-NEXT: addi a2, sp, 32 ; RV32I-FPELIM-NEXT: addi a0, zero, 1 ; RV32I-FPELIM-NEXT: addi a1, zero, 11 +; RV32I-FPELIM-NEXT: addi a2, sp, 32 ; RV32I-FPELIM-NEXT: addi a3, zero, 12 ; RV32I-FPELIM-NEXT: addi a4, zero, 13 ; RV32I-FPELIM-NEXT: addi a6, zero, 4 @@ -736,9 +736,9 @@ define void @caller_aligned_stack() nounwind { ; RV32I-WITHFP-NEXT: addi t0, a0, 1311 ; RV32I-WITHFP-NEXT: lui a0, 688509 ; RV32I-WITHFP-NEXT: addi a5, a0, -2048 -; RV32I-WITHFP-NEXT: addi a2, s0, -32 ; RV32I-WITHFP-NEXT: addi a0, zero, 1 ; RV32I-WITHFP-NEXT: addi a1, zero, 11 +; RV32I-WITHFP-NEXT: addi a2, s0, -32 ; RV32I-WITHFP-NEXT: addi a3, zero, 12 ; RV32I-WITHFP-NEXT: addi a4, zero, 13 ; RV32I-WITHFP-NEXT: addi a6, zero, 4 diff --git a/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll index 95cf39e79d3cc..fac227f4978c4 100644 --- a/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll +++ b/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll @@ -202,7 +202,6 @@ define i64 
@caller_large_scalars_exhausted_regs() nounwind { ; RV64I-NEXT: sd zero, 64(sp) ; RV64I-NEXT: sd zero, 56(sp) ; RV64I-NEXT: addi t0, zero, 8 -; RV64I-NEXT: addi a7, sp, 48 ; RV64I-NEXT: addi a0, zero, 1 ; RV64I-NEXT: addi a1, zero, 2 ; RV64I-NEXT: addi a2, zero, 3 @@ -210,6 +209,7 @@ define i64 @caller_large_scalars_exhausted_regs() nounwind { ; RV64I-NEXT: addi a4, zero, 5 ; RV64I-NEXT: addi a5, zero, 6 ; RV64I-NEXT: addi a6, zero, 7 +; RV64I-NEXT: addi a7, sp, 48 ; RV64I-NEXT: sd t0, 48(sp) ; RV64I-NEXT: call callee_large_scalars_exhausted_regs ; RV64I-NEXT: ld ra, 88(sp) diff --git a/llvm/test/CodeGen/RISCV/vararg.ll b/llvm/test/CodeGen/RISCV/vararg.ll index bb7d933bd84d8..9f65576d97868 100644 --- a/llvm/test/CodeGen/RISCV/vararg.ll +++ b/llvm/test/CodeGen/RISCV/vararg.ll @@ -1441,9 +1441,9 @@ define void @va5_aligned_stack_caller() nounwind { ; ILP32-ILP32F-FPELIM-NEXT: addi a5, a0, 1311 ; ILP32-ILP32F-FPELIM-NEXT: lui a0, 688509 ; ILP32-ILP32F-FPELIM-NEXT: addi a6, a0, -2048 -; ILP32-ILP32F-FPELIM-NEXT: addi a2, sp, 32 ; ILP32-ILP32F-FPELIM-NEXT: addi a0, zero, 1 ; ILP32-ILP32F-FPELIM-NEXT: addi a1, zero, 11 +; ILP32-ILP32F-FPELIM-NEXT: addi a2, sp, 32 ; ILP32-ILP32F-FPELIM-NEXT: addi a3, zero, 12 ; ILP32-ILP32F-FPELIM-NEXT: addi a4, zero, 13 ; ILP32-ILP32F-FPELIM-NEXT: addi a7, zero, 4 @@ -1486,9 +1486,9 @@ define void @va5_aligned_stack_caller() nounwind { ; ILP32-ILP32F-WITHFP-NEXT: addi a5, a0, 1311 ; ILP32-ILP32F-WITHFP-NEXT: lui a0, 688509 ; ILP32-ILP32F-WITHFP-NEXT: addi a6, a0, -2048 -; ILP32-ILP32F-WITHFP-NEXT: addi a2, s0, -32 ; ILP32-ILP32F-WITHFP-NEXT: addi a0, zero, 1 ; ILP32-ILP32F-WITHFP-NEXT: addi a1, zero, 11 +; ILP32-ILP32F-WITHFP-NEXT: addi a2, s0, -32 ; ILP32-ILP32F-WITHFP-NEXT: addi a3, zero, 12 ; ILP32-ILP32F-WITHFP-NEXT: addi a4, zero, 13 ; ILP32-ILP32F-WITHFP-NEXT: addi a7, zero, 4 @@ -1530,9 +1530,9 @@ define void @va5_aligned_stack_caller() nounwind { ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a5, a0, 1311 ; 
RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lui a0, 688509 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a6, a0, -2048 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a2, sp, 32 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a0, zero, 1 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a1, zero, 11 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a2, sp, 32 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a3, zero, 12 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a4, zero, 13 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a7, zero, 4 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll index 5fced6ad29e2a..fb974048b1ef4 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve,+lob %s -S -o - | FileCheck %s ; CHECK-LABEL: mul_v16i8 @@ -37,7 +38,7 @@ vector.body: ; preds = %vector.body, %vecto %tmp = getelementptr inbounds i8, i8* %a, i32 %index ; %tmp1 = icmp ule <16 x i32> %induction, %broadcast.splat11 - %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N) %tmp2 = bitcast i8* %tmp to <16 x i8>* %wide.masked.load = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp2, i32 4, <16 x i1> %active.lane.mask, <16 x i8> undef) @@ -94,7 +95,7 @@ vector.body: ; preds = %vector.body, %vecto %tmp = getelementptr inbounds i16, i16* %a, i32 %index ; %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11 - %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <8 x i1> 
@llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N) %tmp2 = bitcast i16* %tmp to <8 x i16>* %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %active.lane.mask, <8 x i16> undef) @@ -150,7 +151,7 @@ vector.body: ; preds = %vector.body, %vecto %tmp = getelementptr inbounds i32, i32* %a, i32 %index ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 %tmp2 = bitcast i32* %tmp to <4 x i32>* - %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index %tmp4 = bitcast i32* %tmp3 to <4 x i32>* @@ -204,7 +205,7 @@ vector.body: ; preds = %vector.body, %vecto %induction = add <4 x i32> %broadcast.splat, %tmp = getelementptr inbounds i32, i32* %a, i32 %index ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 - %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) %tmp2 = bitcast i32* %tmp to <4 x i32>* %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) %extract.1.low = shufflevector <4 x i32> %wide.masked.load, <4 x i32> undef, < 2 x i32> < i32 0, i32 2> @@ -264,7 +265,7 @@ vector.body: ; preds = %vector.body, %vecto %tmp = getelementptr inbounds i32, i32* %a, i32 %index ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 - %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) %wrong = 
icmp ult <4 x i32> %induction, %broadcast.splat11 %tmp2 = bitcast i32* %tmp to <4 x i32>* @@ -323,7 +324,7 @@ vector.body: ; preds = %vector.body, %vecto %tmp = getelementptr inbounds i32, i32* %a, i32 %index ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 - %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) %wrong = icmp ult <4 x i32> %induction, %broadcast.splat11 %tmp2 = bitcast i32* %tmp to <4 x i32>* @@ -352,10 +353,10 @@ for.cond.cleanup: ; preds = %vector.body, %entry ; ; CHECK-LABEL: interleave4 ; CHECK: vector.body: -; CHECK: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) -; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %trip.count.minus.1) -; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %trip.count.minus.1) -; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %trip.count.minus.1) +; CHECK: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) +; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %N) +; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %N) +; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %N) ; define dso_local void @interleave4(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { entry: @@ -386,13 +387,13 @@ vector.body: %lsr.iv3840 = bitcast i32* %lsr.iv38 to <4 x i32>* %lsr.iv3133 = bitcast i32* %lsr.iv31 to <4 x i32>* %lsr.iv26 = bitcast i32* %lsr.iv to <4 x i32>* - %active.lane.mask = call <4 x i1> 
@llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) %v7 = add i32 %index, 4 - %active.lane.mask15 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %trip.count.minus.1) + %active.lane.mask15 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %N) %v8 = add i32 %v7, 4 - %active.lane.mask16 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %trip.count.minus.1) + %active.lane.mask16 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %N) %v9 = add i32 %v8, 4 - %active.lane.mask17 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %trip.count.minus.1) + %active.lane.mask17 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %N) %scevgep42 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 -2 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %scevgep42, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) %scevgep43 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 -1 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/branch-targets.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/branch-targets.ll index a0f13f3af65e2..38a7e1dbba193 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/branch-targets.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/branch-targets.ll @@ -23,13 +23,13 @@ define void @check_loop_dec_brcond_combine(i32* nocapture %a, i32* nocapture rea entry: call void @llvm.set.loop.iterations.i32(i32 %N) br label %for.body.preheader - + for.body.preheader: %scevgep = getelementptr i32, i32* %a, i32 -1 %scevgep4 = getelementptr i32, i32* %c, i32 -1 %scevgep8 = getelementptr i32, i32* %b, i32 -1 br label %for.header - + for.body: %scevgep11 = getelementptr i32, i32* %lsr.iv9, i32 1 %ld1 = load i32, i32* %scevgep11, align 4 @@ -66,13 +66,13 @@ define void @check_loop_dec_ugt_brcond_combine(i32* nocapture %a, i32* nocapture 
entry: call void @llvm.set.loop.iterations.i32(i32 %N) br label %for.body.preheader - + for.body.preheader: %scevgep = getelementptr i32, i32* %a, i32 -1 %scevgep4 = getelementptr i32, i32* %c, i32 -1 %scevgep8 = getelementptr i32, i32* %b, i32 -1 br label %for.header - + for.body: %scevgep11 = getelementptr i32, i32* %lsr.iv9, i32 1 %ld1 = load i32, i32* %scevgep11, align 4 @@ -109,13 +109,13 @@ define void @check_loop_dec_ult_brcond_combine(i32* nocapture %a, i32* nocapture entry: call void @llvm.set.loop.iterations.i32(i32 %N) br label %for.body.preheader - + for.body.preheader: %scevgep = getelementptr i32, i32* %a, i32 -1 %scevgep4 = getelementptr i32, i32* %c, i32 -1 %scevgep8 = getelementptr i32, i32* %b, i32 -1 br label %for.header - + for.body: %scevgep11 = getelementptr i32, i32* %lsr.iv9, i32 1 %ld1 = load i32, i32* %scevgep11, align 4 @@ -152,13 +152,13 @@ define void @check_loop_dec_ult_xor_brcond_combine(i32* nocapture %a, i32* nocap entry: call void @llvm.set.loop.iterations.i32(i32 %N) br label %for.body.preheader - + for.body.preheader: %scevgep = getelementptr i32, i32* %a, i32 -1 %scevgep4 = getelementptr i32, i32* %c, i32 -1 %scevgep8 = getelementptr i32, i32* %b, i32 -1 br label %for.header - + for.body: %scevgep11 = getelementptr i32, i32* %lsr.iv9, i32 1 %ld1 = load i32, i32* %scevgep11, align 4 @@ -196,13 +196,13 @@ define void @check_loop_dec_sgt_brcond_combine(i32* nocapture %a, i32* nocapture entry: call void @llvm.set.loop.iterations.i32(i32 %N) br label %for.body.preheader - + for.body.preheader: %scevgep = getelementptr i32, i32* %a, i32 -1 %scevgep4 = getelementptr i32, i32* %c, i32 -1 %scevgep8 = getelementptr i32, i32* %b, i32 -1 br label %for.header - + for.body: %scevgep11 = getelementptr i32, i32* %lsr.iv9, i32 1 %ld1 = load i32, i32* %scevgep11, align 4 @@ -239,13 +239,13 @@ define void @check_loop_dec_sge_brcond_combine(i32* nocapture %a, i32* nocapture entry: call void @llvm.set.loop.iterations.i32(i32 %N) br label 
%for.body.preheader - + for.body.preheader: %scevgep = getelementptr i32, i32* %a, i32 -1 %scevgep4 = getelementptr i32, i32* %c, i32 -1 %scevgep8 = getelementptr i32, i32* %b, i32 -1 br label %for.header - + for.body: %scevgep11 = getelementptr i32, i32* %lsr.iv9, i32 1 %ld1 = load i32, i32* %scevgep11, align 4 @@ -282,13 +282,13 @@ define void @check_loop_dec_sge_xor_brcond_combine(i32* nocapture %a, i32* nocap entry: call void @llvm.set.loop.iterations.i32(i32 %N) br label %for.body.preheader - + for.body.preheader: %scevgep = getelementptr i32, i32* %a, i32 -1 %scevgep4 = getelementptr i32, i32* %c, i32 -1 %scevgep8 = getelementptr i32, i32* %b, i32 -1 br label %for.header - + for.body: %scevgep11 = getelementptr i32, i32* %lsr.iv9, i32 1 %ld1 = load i32, i32* %scevgep11, align 4 @@ -326,13 +326,13 @@ define void @check_loop_dec_uge_brcond_combine(i32* nocapture %a, i32* nocapture entry: call void @llvm.set.loop.iterations.i32(i32 %N) br label %for.body.preheader - + for.body.preheader: %scevgep = getelementptr i32, i32* %a, i32 -1 %scevgep4 = getelementptr i32, i32* %c, i32 -1 %scevgep8 = getelementptr i32, i32* %b, i32 -1 br label %for.header - + for.body: %scevgep11 = getelementptr i32, i32* %lsr.iv9, i32 1 %ld1 = load i32, i32* %scevgep11, align 4 @@ -369,13 +369,13 @@ define void @check_loop_dec_uge_xor_brcond_combine(i32* nocapture %a, i32* nocap entry: call void @llvm.set.loop.iterations.i32(i32 %N) br label %for.body.preheader - + for.body.preheader: %scevgep = getelementptr i32, i32* %a, i32 -1 %scevgep4 = getelementptr i32, i32* %c, i32 -1 %scevgep8 = getelementptr i32, i32* %b, i32 -1 br label %for.header - + for.body: %scevgep11 = getelementptr i32, i32* %lsr.iv9, i32 1 %ld1 = load i32, i32* %scevgep11, align 4 @@ -416,10 +416,10 @@ entry: %wls = call i1 @llvm.test.set.loop.iterations.i32(i32 %N) %xor = xor i1 %wls, 1 br i1 %xor, label %while.end, label %while.body.preheader - + while.body.preheader: br label %while.body - + while.body: %a.addr.06 = 
phi i16* [ %incdec.ptr1, %while.body ], [ %a, %while.body.preheader ] %b.addr.05 = phi i16* [ %incdec.ptr, %while.body ], [ %b, %while.body.preheader ] @@ -431,7 +431,7 @@ while.body: %count.next = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %count, i32 1) %cmp = icmp ne i32 %count.next, 0 br i1 %cmp, label %while.body, label %while.end - + while.end: ret void } @@ -449,10 +449,10 @@ entry: %wls = call i1 @llvm.test.set.loop.iterations.i32(i32 %N) %cmp = icmp ne i1 %wls, 1 br i1 %cmp, label %while.end, label %while.body.preheader - + while.body.preheader: br label %while.body - + while.body: %a.addr.06 = phi i16* [ %incdec.ptr1, %while.body ], [ %a, %while.body.preheader ] %b.addr.05 = phi i16* [ %incdec.ptr, %while.body ], [ %b, %while.body.preheader ] @@ -464,7 +464,7 @@ while.body: %count.next = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %count, i32 1) %cmp.1 = icmp ne i32 %count.next, 0 br i1 %cmp.1, label %while.body, label %while.end - + while.end: ret void } @@ -482,10 +482,10 @@ while.end: define void @check_negated_reordered_wls(i16* nocapture %a, i16* nocapture readonly %b, i32 %N) { entry: br label %while - + while.body.preheader: br label %while.body - + while.body: %a.addr.06 = phi i16* [ %incdec.ptr1, %while.body ], [ %a, %while.body.preheader ] %b.addr.05 = phi i16* [ %incdec.ptr, %while.body ], [ %b, %while.body.preheader ] diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/clear-maskedinsts.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/clear-maskedinsts.ll index 56343a6d65cb5..db7ca7a55a5a5 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/clear-maskedinsts.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/clear-maskedinsts.ll @@ -23,13 +23,12 @@ define hidden i32 @_Z4loopPiPjiS0_i(i32* noalias nocapture readonly %s1, i32* no ; CHECK-NEXT: [[BROADCAST_SPLATINSERT71:%.*]] = insertelement <4 x i32> undef, i32 [[X]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT72:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT71]], <4 x i32> 
undef, <4 x i32> zeroinitializer ; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP3]]) -; CHECK-NEXT: [[NUM_ELEMENTS:%.*]] = add i32 [[TRIP_COUNT_MINUS_183]], 1 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[LSR_IV9:%.*]] = phi i32* [ [[SCEVGEP10:%.*]], [[VECTOR_BODY]] ], [ [[D:%.*]], [[VECTOR_PH]] ] ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[TMP3]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[NUM_ELEMENTS]], [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[N]], [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[LSR_IV911:%.*]] = bitcast i32* [[LSR_IV9]] to <4 x i32>* ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer @@ -108,7 +107,7 @@ vector.body: ; preds = %vector.body, %vecto %induction = add <4 x i32> %broadcast.splat, %5 = insertelement <4 x i32> undef, i32 %trip.count.minus.183, i32 0 %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <4 x i32> zeroinitializer - %7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.183) + %7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %broadcast.splat72, <4 x i32>* %lsr.iv911, i32 4, <4 x i1> %7) %index.next = add i32 %index, 4 %scevgep10 = getelementptr i32, i32* %lsr.iv9, i32 4 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll index d1151f29a9b20..2fa8a4d8ed7ef 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll +++ 
b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll @@ -8,6 +8,7 @@ define dso_local i32 @vpsel_mul_reduce_add(i32* noalias nocapture readonly %a, i ; CHECK-NEXT: itt eq ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr +; CHECK-NEXT: .LBB0_1: @ %vector.ph ; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: add.w r12, r3, #3 @@ -18,7 +19,7 @@ define dso_local i32 @vpsel_mul_reduce_add(i32* noalias nocapture readonly %a, i ; CHECK-NEXT: add.w lr, lr, r12, lsr #2 ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: and r4, r12, #15 @@ -37,8 +38,8 @@ define dso_local i32 @vpsel_mul_reduce_add(i32* noalias nocapture readonly %a, i ; CHECK-NEXT: vmul.i32 q1, q1, q2 ; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vadd.i32 q1, q1, q0 -; CHECK-NEXT: le lr, .LBB0_1 -; CHECK-NEXT: @ %bb.2: @ %middle.block +; CHECK-NEXT: le lr, .LBB0_2 +; CHECK-NEXT: @ %bb.3: @ %middle.block ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: add sp, #4 @@ -64,7 +65,7 @@ vector.body: ; preds = %vector.body, %vecto %tmp = getelementptr inbounds i32, i32* %a, i32 %index ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 - %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) %tmp2 = bitcast i32* %tmp to <4 x i32>* %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef) @@ -166,7 +167,7 @@ vector.body: ; preds = %vector.body, %vecto %tmp = getelementptr inbounds i32, i32* %a, i32 %index ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 - %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %tmp1 = call <4 x i1> 
@llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) %tmp2 = bitcast i32* %tmp to <4 x i32>* %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef) @@ -268,7 +269,7 @@ vector.body: ; preds = %vector.body, %vecto %tmp = getelementptr inbounds i32, i32* %a, i32 %index ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 - %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) %tmp2 = bitcast i32* %tmp to <4 x i32>* %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef) @@ -367,7 +368,7 @@ vector.body: ; preds = %vector.body, %vecto %tmp = getelementptr inbounds i32, i32* %a, i32 %index ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 - %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) %tmp2 = bitcast i32* %tmp to <4 x i32>* %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef) @@ -406,9 +407,10 @@ define dso_local void @continue_on_zero(i32* noalias nocapture %arg, i32* noalia ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: .LBB4_1: @ %bb3 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dlstp.32 lr, r2 -; CHECK-NEXT: .LBB4_1: @ %bb9 +; CHECK-NEXT: .LBB4_2: @ %bb9 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 @@ -418,8 +420,8 @@ define dso_local void @continue_on_zero(i32* noalias nocapture %arg, i32* noalia ; CHECK-NEXT: vmul.i32 q0, q1, q0 ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrwt.32 q0, [r0], #16 -; CHECK-NEXT: letp lr, .LBB4_1 -; CHECK-NEXT: @ %bb.2: @ %bb27 +; CHECK-NEXT: letp lr, .LBB4_2 +; 
CHECK-NEXT: @ %bb.3: @ %bb27 ; CHECK-NEXT: pop {r7, pc} bb: %tmp = icmp eq i32 %arg2, 0 @@ -441,7 +443,7 @@ bb9: ; preds = %bb9, %bb3 %tmp14 = getelementptr inbounds i32, i32* %arg1, i32 %tmp10 ; %tmp15 = icmp ule <4 x i32> %tmp13, %tmp8 - %tmp15 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %tmp10, i32 %tmp6) + %tmp15 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %tmp10, i32 %arg2) %tmp16 = bitcast i32* %tmp14 to <4 x i32>* %tmp17 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp16, i32 4, <4 x i1> %tmp15, <4 x i32> undef) @@ -468,9 +470,10 @@ define dso_local arm_aapcs_vfpcc void @range_test(i32* noalias nocapture %arg, i ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: .LBB5_1: @ %bb4 ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 -; CHECK-NEXT: .LBB5_1: @ %bb12 +; CHECK-NEXT: .LBB5_2: @ %bb12 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vptt.i32 ne, q0, zr @@ -480,8 +483,8 @@ define dso_local arm_aapcs_vfpcc void @range_test(i32* noalias nocapture %arg, i ; CHECK-NEXT: vmul.i32 q0, q1, q0 ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrwt.32 q0, [r0], #16 -; CHECK-NEXT: letp lr, .LBB5_1 -; CHECK-NEXT: @ %bb.2: @ %bb32 +; CHECK-NEXT: letp lr, .LBB5_2 +; CHECK-NEXT: @ %bb.3: @ %bb32 ; CHECK-NEXT: pop {r7, pc} bb: %tmp = icmp eq i32 %arg3, 0 @@ -505,7 +508,7 @@ bb12: ; preds = %bb12, %bb4 %tmp17 = getelementptr inbounds i32, i32* %arg, i32 %tmp13 ; %tmp18 = icmp ule <4 x i32> %tmp16, %tmp9 - %tmp18= call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %tmp13, i32 %tmp7) + %tmp18= call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %tmp13, i32 %arg3) %tmp19 = bitcast i32* %tmp17 to <4 x i32>* %tmp20 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp19, i32 4, <4 x i1> %tmp18, <4 x i32> undef) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/ctlz-non-zeros.mir 
b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/ctlz-non-zeros.mir index 38d7567505025..f3c86e3ac6465 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/ctlz-non-zeros.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/ctlz-non-zeros.mir @@ -1,3 +1,4 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s # CHECK-NOT: LETP @@ -7,11 +8,11 @@ entry: %cmp = icmp slt i32 %elts, 1 br i1 %cmp, label %exit, label %loop.ph - + loop.ph: ; preds = %entry call void @llvm.set.loop.iterations.i32(i32 %iters) br label %loop.body - + loop.body: ; preds = %loop.body, %loop.ph %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %iters, %loop.ph ] %count = phi i32 [ %elts, %loop.ph ], [ %elts.rem, %loop.body ] @@ -34,20 +35,20 @@ %end = icmp ne i32 %loop.dec, 0 %lsr.iv.next = add i32 %lsr.iv, -1 br i1 %end, label %loop.body, label %exit - + exit: ; preds = %loop.body, %entry ret void } - + define arm_aapcs_vfpcc void @test_ctlz_i16(<4 x i32>* %a, <4 x i32>* %b, <4 x i32>* %c, i32 %elts, i32 %iters) #0 { entry: %cmp = icmp slt i32 %elts, 1 br i1 %cmp, label %exit, label %loop.ph - + loop.ph: ; preds = %entry call void @llvm.set.loop.iterations.i32(i32 %iters) br label %loop.body - + loop.body: ; preds = %loop.body, %loop.ph %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %iters, %loop.ph ] %count = phi i32 [ %elts, %loop.ph ], [ %elts.rem, %loop.body ] @@ -70,20 +71,20 @@ %end = icmp ne i32 %loop.dec, 0 %lsr.iv.next = add i32 %lsr.iv, -1 br i1 %end, label %loop.body, label %exit - + exit: ; preds = %loop.body, %entry ret void } - + define arm_aapcs_vfpcc void @test_ctlz_i32(<4 x i32>* %a, <4 x i32>* %b, <4 x i32>* %c, i32 %elts, i32 %iters) #0 { entry: %cmp = icmp slt i32 %elts, 1 br i1 %cmp, label %exit, label %loop.ph - + loop.ph: ; preds = %entry call void @llvm.set.loop.iterations.i32(i32 %iters) br label %loop.body - + loop.body: ; preds = 
%loop.body, %loop.ph %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %iters, %loop.ph ] %count = phi i32 [ %elts, %loop.ph ], [ %elts.rem, %loop.body ] @@ -106,11 +107,11 @@ %end = icmp ne i32 %loop.dec, 0 %lsr.iv.next = add i32 %lsr.iv, -1 br i1 %end, label %loop.body, label %exit - + exit: ; preds = %loop.body, %entry ret void } - + declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1 immarg) declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1 immarg) declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1 immarg) @@ -141,24 +142,59 @@ frameInfo: offsetAdjustment: 0 maxAlignment: 4 fixedStack: - - { id: 0, type: default, offset: 0, size: 4, alignment: 8, stack-id: default, - isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, + - { id: 0, type: default, offset: 0, size: 4, alignment: 8, stack-id: default, + isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } stack: - - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true, + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } callSites: [] constants: [] machineFunctionInfo: {} body: | + ; CHECK-LABEL: name: test_ctlz_i8 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, 
$r1, $r2, $r3, $r4 + ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -8 + ; CHECK: tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2IT 11, 8, implicit-def $itstate + ; CHECK: tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate + ; CHECK: bb.1.loop.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3 + ; CHECK: renamable $r12 = t2LDRi12 $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) + ; CHECK: dead $lr = t2DLS renamable $r12 + ; CHECK: $r4 = tMOVr killed $r12, 14 /* CC::al */, $noreg + ; CHECK: bb.2.loop.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3, $r4 + ; CHECK: renamable $vpr = MVE_VCTP16 renamable $r3, 0, $noreg + ; CHECK: MVE_VPST 4, implicit $vpr + ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRHU16_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.addr.b, align 2) + ; CHECK: renamable $q1 = MVE_VLDRHU16 killed renamable $r0, 0, 1, renamable $vpr :: (load 16 from %ir.addr.a, align 2) + ; CHECK: $lr = tMOVr $r4, 14 /* CC::al */, $noreg + ; CHECK: renamable $r4, dead $cpsr = tSUBi8 killed $r4, 1, 14 /* CC::al */, $noreg + ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 8, 14 /* CC::al */, $noreg + ; CHECK: renamable $q1 = MVE_VCLZs8 killed renamable $q1, 0, $noreg, undef renamable $q1 + ; CHECK: $r0 = tMOVr $r1, 14 /* CC::al */, $noreg + ; CHECK: renamable $q1 = MVE_VQSHRUNs16th killed renamable $q1, killed renamable $q0, 1, 0, $noreg + ; CHECK: MVE_VPST 8, implicit $vpr + ; CHECK: renamable $r2 = MVE_VSTRHU16_post killed renamable $q1, killed renamable $r2, 16, 1, killed renamable $vpr :: (store 16 into %ir.addr.c, align 2) + ; 
CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: bb.3.exit: + ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc bb.0.entry: successors: %bb.1(0x80000000) liveins: $r0, $r1, $r2, $r3, $r4, $lr - + frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -166,14 +202,19 @@ body: | tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr t2IT 11, 8, implicit-def $itstate tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate + + bb.1.loop.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r3, $r4, $lr + renamable $r12 = t2LDRi12 $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) t2DoLoopStart renamable $r12 $r4 = tMOVr killed $r12, 14 /* CC::al */, $noreg - - bb.1.loop.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + + bb.2.loop.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $r0, $r1, $r2, $r3, $r4 - + renamable $vpr = MVE_VCTP16 renamable $r3, 0, $noreg MVE_VPST 4, implicit $vpr renamable $r1, renamable $q0 = MVE_VLDRHU16_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.addr.b, align 2) @@ -187,10 +228,10 @@ body: | renamable $q1 = MVE_VQSHRUNs16th killed renamable $q1, killed renamable $q0, 1, 0, $noreg MVE_VPST 8, implicit $vpr renamable $r2 = MVE_VSTRHU16_post killed renamable $q1, killed renamable $r2, 16, 1, killed renamable $vpr :: (store 16 into %ir.addr.c, align 2) - t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14 /* CC::al */, $noreg - - bb.2.exit: + t2LoopEnd killed renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg + + bb.3.exit: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc ... 
@@ -209,24 +250,58 @@ frameInfo: offsetAdjustment: 0 maxAlignment: 4 fixedStack: - - { id: 0, type: default, offset: 0, size: 4, alignment: 8, stack-id: default, - isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, + - { id: 0, type: default, offset: 0, size: 4, alignment: 8, stack-id: default, + isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } stack: - - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true, + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } callSites: [] constants: [] machineFunctionInfo: {} body: | + ; CHECK-LABEL: name: test_ctlz_i16 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r4 + ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -8 + ; CHECK: tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2IT 11, 8, implicit-def $itstate + ; CHECK: tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed 
$itstate + ; CHECK: bb.1.loop.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3 + ; CHECK: renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) + ; CHECK: dead $lr = t2DLS renamable $r4 + ; CHECK: $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg + ; CHECK: bb.2.loop.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3, $r12 + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg + ; CHECK: $lr = tMOVr $r12, 14 /* CC::al */, $noreg + ; CHECK: MVE_VPST 4, implicit $vpr + ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.addr.b, align 4) + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.addr.a, align 4) + ; CHECK: renamable $r12 = t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg + ; CHECK: renamable $q1 = MVE_VCLZs16 killed renamable $q1, 0, $noreg, undef renamable $q1 + ; CHECK: renamable $q1 = MVE_VQSHRUNs32th killed renamable $q1, killed renamable $q0, 3, 0, $noreg + ; CHECK: MVE_VPST 8, implicit $vpr + ; CHECK: renamable $r2 = MVE_VSTRWU32_post killed renamable $q1, killed renamable $r2, 16, 1, killed renamable $vpr :: (store 16 into %ir.addr.c, align 4) + ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: bb.3.exit: + ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc bb.0.entry: successors: %bb.1(0x80000000) liveins: $r0, $r1, $r2, $r3, $r4, $lr - + frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -234,14 +309,19 @@ body: | tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr t2IT 11, 8, implicit-def 
$itstate tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate + + bb.1.loop.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r3, $r4, $lr + renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) t2DoLoopStart renamable $r4 $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg - - bb.1.loop.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + + bb.2.loop.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $r0, $r1, $r2, $r3, $r12 - + renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg $lr = tMOVr $r12, 14 /* CC::al */, $noreg MVE_VPST 4, implicit $vpr @@ -254,10 +334,10 @@ body: | renamable $q1 = MVE_VQSHRUNs32th killed renamable $q1, killed renamable $q0, 3, 0, $noreg MVE_VPST 8, implicit $vpr renamable $r2 = MVE_VSTRWU32_post killed renamable $q1, killed renamable $r2, 16, 1, killed renamable $vpr :: (store 16 into %ir.addr.c, align 4) - t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14 /* CC::al */, $noreg - - bb.2.exit: + t2LoopEnd killed renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg + + bb.3.exit: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc ... 
@@ -276,24 +356,58 @@ frameInfo: offsetAdjustment: 0 maxAlignment: 4 fixedStack: - - { id: 0, type: default, offset: 0, size: 4, alignment: 8, stack-id: default, - isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, + - { id: 0, type: default, offset: 0, size: 4, alignment: 8, stack-id: default, + isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } stack: - - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true, + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } callSites: [] constants: [] machineFunctionInfo: {} body: | + ; CHECK-LABEL: name: test_ctlz_i32 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r4 + ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -8 + ; CHECK: tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2IT 11, 8, implicit-def $itstate + ; CHECK: tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed 
$itstate + ; CHECK: bb.1.loop.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3 + ; CHECK: renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) + ; CHECK: dead $lr = t2DLS renamable $r4 + ; CHECK: $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg + ; CHECK: bb.2.loop.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3, $r12 + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg + ; CHECK: $lr = tMOVr $r12, 14 /* CC::al */, $noreg + ; CHECK: MVE_VPST 4, implicit $vpr + ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.addr.a, align 4) + ; CHECK: renamable $r1, renamable $q1 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.addr.b, align 4) + ; CHECK: renamable $r12 = t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg + ; CHECK: renamable $q1 = MVE_VCLZs32 killed renamable $q1, 0, $noreg, undef renamable $q1 + ; CHECK: renamable $q0 = MVE_VQSHRUNs32th killed renamable $q0, killed renamable $q1, 3, 0, $noreg + ; CHECK: MVE_VPST 8, implicit $vpr + ; CHECK: renamable $r2 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r2, 16, 1, killed renamable $vpr :: (store 16 into %ir.addr.c, align 4) + ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: bb.3.exit: + ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc bb.0.entry: successors: %bb.1(0x80000000) liveins: $r0, $r1, $r2, $r3, $r4, $lr - + frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -301,14 +415,19 @@ body: | tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr t2IT 11, 8, implicit-def 
$itstate tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate + + bb.1.loop.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r3, $r4, $lr + renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) t2DoLoopStart renamable $r4 $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg - - bb.1.loop.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + + bb.2.loop.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $r0, $r1, $r2, $r3, $r12 - + renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg $lr = tMOVr $r12, 14 /* CC::al */, $noreg MVE_VPST 4, implicit $vpr @@ -321,10 +440,10 @@ body: | renamable $q0 = MVE_VQSHRUNs32th killed renamable $q0, killed renamable $q1, 3, 0, $noreg MVE_VPST 8, implicit $vpr renamable $r2 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r2, 16, 1, killed renamable $vpr :: (store 16 into %ir.addr.c, align 4) - t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14 /* CC::al */, $noreg - - bb.2.exit: + t2LoopEnd killed renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg + + bb.3.exit: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc ... 
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir index 5f007aaef6c77..de607549411e2 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir @@ -115,14 +115,17 @@ body: | ; CHECK: tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 11, 8, implicit-def $itstate ; CHECK: tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3 ; CHECK: renamable $r12 = t2ADDri renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $lr = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r12 = t2BICri killed renamable $r12, 3, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: $lr = t2DLS killed renamable $lr - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg ; CHECK: MVE_VPST 4, implicit $vpr @@ -133,8 +136,8 @@ body: | ; CHECK: MVE_VPST 8, implicit $vpr ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4) ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg - ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 - ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: 
bb.3.for.cond.cleanup: ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc bb.0.entry: successors: %bb.1(0x80000000) @@ -149,6 +152,11 @@ body: | tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr t2IT 11, 8, implicit-def $itstate tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r3, $r7, $lr + renamable $r12 = t2ADDri renamable $r3, 3, 14, $noreg, $noreg renamable $lr = t2MOVi 1, 14, $noreg, $noreg renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg @@ -156,8 +164,8 @@ body: | renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg t2DoLoopStart renamable $lr - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.vector.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $lr, $r0, $r1, $r2, $r3 renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg @@ -170,10 +178,10 @@ body: | renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4) renamable $lr = t2LoopDec killed renamable $lr, 1 renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg - t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14, $noreg + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14, $noreg - bb.2.for.cond.cleanup: + bb.3.for.cond.cleanup: tPOP_RET 14, $noreg, def $r7, def $pc ... 
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update2.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update2.mir deleted file mode 100644 index f642c792586b5..0000000000000 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update2.mir +++ /dev/null @@ -1,169 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s - ---- | - define dso_local void @CPSR_not_dead(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { - entry: - %cmp8 = icmp sgt i32 %N, 0 - %0 = add i32 %N, 3 - %1 = lshr i32 %0, 2 - %2 = shl nuw i32 %1, 2 - %3 = add i32 %2, -4 - %4 = lshr i32 %3, 2 - %5 = add nuw nsw i32 %4, 1 - br i1 %cmp8, label %vector.ph, label %for.cond.cleanup - - vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %5) - br label %vector.body - - vector.body: ; preds = %vector.body, %vector.ph - %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %5, %vector.ph ] - %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ] - %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ] - %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ] - %6 = phi i32 [ %N, %vector.ph ], [ %8, %vector.body ] - %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>* - %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* - %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>* - %7 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %6) - %8 = sub i32 %6, 4 - %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %7, <4 x i32> undef) - %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %7, <4 x i32> undef) - %9 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load - call void 
@llvm.masked.store.v4i32.p0v4i32(<4 x i32> %9, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %7) - %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 - %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4 - %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4 - %10 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1) - %11 = icmp ne i32 %10, 0 - %lsr.iv.next = add nsw i32 %lsr.iv1, -1 - br i1 %11, label %vector.body, label %for.cond.cleanup - - for.cond.cleanup: ; preds = %vector.body, %entry - ret void - } - declare void @llvm.set.loop.iterations.i32(i32) - declare <4 x i1> @llvm.arm.mve.vctp32(i32) - declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) - declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) - declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) - -... ---- -name: CPSR_not_dead -alignment: 2 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -failedISel: false -tracksRegLiveness: true -hasWinCFI: false -registers: [] -liveins: - - { reg: '$r0', virtual-reg: '' } - - { reg: '$r1', virtual-reg: '' } - - { reg: '$r2', virtual-reg: '' } - - { reg: '$r3', virtual-reg: '' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 8 - offsetAdjustment: 0 - maxAlignment: 4 - adjustsStack: false - hasCalls: false - stackProtector: '' - maxCallFrameSize: 0 - cvBytesOfCalleeSavedRegisters: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false - localFrameSize: 0 - savePoint: '' - restorePoint: '' -fixedStack: [] -stack: - - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, 
- stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } -callSites: [] -constants: [] -machineFunctionInfo: {} -body: | - ; CHECK-LABEL: name: CPSR_not_dead - ; CHECK: bb.0.entry: - ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r4 - ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp - ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 - ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 - ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -8 - ; CHECK: tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr - ; CHECK: t2IT 11, 8, implicit-def $itstate - ; CHECK: tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate - ; CHECK: $lr = MVE_DLSTP_32 renamable $r3 - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) - ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 - ; CHECK: renamable $r3, $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg - ; CHECK: t2IT 11, 8, implicit-def $itstate - ; CHECK: tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate - ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 0, $noreg :: (load 16 from %ir.lsr.iv13, align 4) - ; CHECK: renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 0, $noreg :: (load 16 from %ir.lsr.iv1416, align 4) - ; CHECK: renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 - ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 0, killed $noreg :: (store 16 into %ir.lsr.iv1719, align 4) - ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 - ; CHECK: bb.2.for.cond.cleanup: - ; CHECK: t2IT 11, 8, implicit-def dead $itstate - ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, 
def $r4, def $pc - bb.0.entry: - successors: %bb.1(0x80000000) - liveins: $r0, $r1, $r2, $r3, $r4, $lr - - frame-setup tPUSH 14, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp - frame-setup CFI_INSTRUCTION def_cfa_offset 8 - frame-setup CFI_INSTRUCTION offset $lr, -4 - frame-setup CFI_INSTRUCTION offset $r4, -8 - tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr - t2IT 11, 8, implicit-def $itstate - tPOP_RET 11, killed $cpsr, def $r4, def $pc, implicit killed $itstate - renamable $r12 = t2ADDri renamable $r3, 3, 14, $noreg, $noreg - renamable $lr = t2MOVi 1, 14, $noreg, $noreg - renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg - renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg - renamable $r4 = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg - t2DoLoopStart renamable $r4 - $r12 = tMOVr killed $r4, 14, $noreg - - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) - liveins: $r0, $r1, $r2, $r3, $r12 - - renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg - $lr = tMOVr $r12, 14, $noreg - renamable $r12 = nsw t2SUBri killed $r12, 1, 14, $noreg, $noreg - renamable $r3, $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg - t2IT 11, 8, implicit-def $itstate - tPOP_RET 11, killed $cpsr, def $r4, def $pc, implicit killed $itstate - MVE_VPST 4, implicit $vpr - renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4) - renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4) - renamable $lr = t2LoopDec killed renamable $lr, 1 - renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 - MVE_VPST 8, implicit $vpr - renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4) 
- t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14, $noreg - - bb.2.for.cond.cleanup: - t2IT 11, 8, implicit-def $itstate - tPOP_RET 14, $noreg, def $r4, def $pc - -... diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll index 162ccf55d068c..3e398eefb0924 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll @@ -5,13 +5,24 @@ define void @foo(%struct.SpeexPreprocessState_* nocapture readonly %st, i16* %x) { ; CHECK-LABEL: foo: ; CHECK: @ %bb.0: @ %entry -; CHECK: dlstp.16 lr, r4 +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: ldrd r12, r4, [r0] +; CHECK-NEXT: ldrd r3, r2, [r0, #8] +; CHECK-NEXT: rsb r12, r12, r4, lsl #1 +; CHECK-NEXT: mov r4, r12 +; CHECK-NEXT: dlstp.16 lr, r4 ; CHECK-NEXT: .LBB0_1: @ %do.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q0, [r2], #16 ; CHECK-NEXT: vstrh.16 q0, [r3], #16 ; CHECK-NEXT: letp lr, .LBB0_1 -; CHECK: dlstp.16 lr, r3 +; CHECK-NEXT: @ %bb.2: @ %do.end +; CHECK-NEXT: ldr r3, [r0] +; CHECK-NEXT: ldr r0, [r0, #8] +; CHECK-NEXT: vmov.i16 q0, #0x1800 +; CHECK-NEXT: add.w r0, r0, r12, lsl #1 +; CHECK-NEXT: dlstp.16 lr, r3 ; CHECK-NEXT: .LBB0_3: @ %do.body6 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q1, [r1], #16 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll index 1fda5c08a0375..2627965913ebc 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll @@ -8,17 +8,18 @@ define dso_local arm_aapcs_vfpcc void @sext_i8(i16* noalias nocapture %a, i8* no ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: .LBB0_1: @ %vector.ph ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: 
dlstp.16 lr, r2 -; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r3, #8 ; CHECK-NEXT: vldrb.s16 q0, [r1], #8 ; CHECK-NEXT: vldrh.u16 q1, [r0] ; CHECK-NEXT: vadd.i16 q0, q1, q0 ; CHECK-NEXT: vstrh.16 q0, [r0], #16 -; CHECK-NEXT: letp lr, .LBB0_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: letp lr, .LBB0_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %cmp8 = icmp eq i32 %N, 0 @@ -40,7 +41,7 @@ vector.body: ; preds = %vector.body, %vecto %0 = getelementptr inbounds i8, i8* %b, i32 %index ; %1 = icmp ule <8 x i32> %induction, %broadcast.splat11 - %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N) %2 = bitcast i8* %0 to <8 x i8>* %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %2, i32 1, <8 x i1> %1, <8 x i8> undef) @@ -67,17 +68,18 @@ define dso_local arm_aapcs_vfpcc void @zext_i8(i16* noalias nocapture %a, i8* no ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: .LBB1_1: @ %vector.ph ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dlstp.16 lr, r2 -; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r3, #8 ; CHECK-NEXT: vldrb.u16 q0, [r1], #8 ; CHECK-NEXT: vldrh.u16 q1, [r0] ; CHECK-NEXT: vadd.i16 q0, q1, q0 ; CHECK-NEXT: vstrh.16 q0, [r0], #16 -; CHECK-NEXT: letp lr, .LBB1_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: letp lr, .LBB1_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %cmp8 = icmp eq i32 %N, 0 @@ -99,7 +101,7 @@ vector.body: ; preds = %vector.body, %vecto %0 = getelementptr inbounds i8, i8* %b, i32 %index ; %1 = icmp ule <8 x i32> %induction, %broadcast.splat11 - %1 = call <8 x i1> 
@llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N) %2 = bitcast i8* %0 to <8 x i8>* %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %2, i32 1, <8 x i1> %1, <8 x i8> undef) @@ -126,17 +128,18 @@ define dso_local arm_aapcs_vfpcc void @sext_i16(i32* noalias nocapture %a, i16* ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: .LBB2_1: @ %vector.ph ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dlstp.32 lr, r2 -; CHECK-NEXT: .LBB2_1: @ %vector.body +; CHECK-NEXT: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: vldrh.s32 q0, [r1], #8 ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vadd.i32 q0, q1, q0 ; CHECK-NEXT: vstrw.32 q0, [r0], #16 -; CHECK-NEXT: letp lr, .LBB2_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: letp lr, .LBB2_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %cmp6 = icmp eq i32 %N, 0 @@ -158,7 +161,7 @@ vector.body: ; preds = %vector.body, %vecto %0 = getelementptr inbounds i16, i16* %b, i32 %index ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 - %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) %2 = bitcast i16* %0 to <4 x i16>* %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef) @@ -185,17 +188,18 @@ define dso_local arm_aapcs_vfpcc void @zext_i16(i32* noalias nocapture %a, i16* ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: .LBB3_1: @ %vector.ph ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dlstp.32 lr, r2 -; CHECK-NEXT: .LBB3_1: @ %vector.body +; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r3, #4 ; 
CHECK-NEXT: vldrh.u32 q0, [r1], #8 ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vadd.i32 q0, q1, q0 ; CHECK-NEXT: vstrw.32 q0, [r0], #16 -; CHECK-NEXT: letp lr, .LBB3_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: letp lr, .LBB3_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %cmp6 = icmp eq i32 %N, 0 @@ -217,7 +221,7 @@ vector.body: ; preds = %vector.body, %vecto %0 = getelementptr inbounds i16, i16* %b, i32 %index ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 - %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) %2 = bitcast i16* %0 to <4 x i16>* %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extract-element.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extract-element.mir index 3a27e94f02fa3..96652c5d76e93 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extract-element.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extract-element.mir @@ -111,6 +111,9 @@ body: | ; CHECK: t2IT 0, 4, implicit-def $itstate ; CHECK: renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate ; CHECK: tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r7 ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -123,8 +126,8 @@ body: | ; CHECK: renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: dead $lr = t2DLS renamable $r12 ; CHECK: $r3 = tMOVr killed $r12, 14 /* CC::al */, 
$noreg - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $q0, $r0, $r1, $r2, $r3 ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg ; CHECK: MVE_VPST 4, implicit $vpr @@ -135,8 +138,8 @@ body: | ; CHECK: renamable $r3, dead $cpsr = nsw tSUBi8 killed $r3, 1, 14 /* CC::al */, $noreg ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 - ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.1 - ; CHECK: bb.2.middle.block: + ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: bb.3.middle.block: ; CHECK: liveins: $q0 ; CHECK: $r0 = VMOVRS killed $s3, 14 /* CC::al */, $noreg, implicit killed $q0 ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $r0 @@ -148,6 +151,11 @@ body: | t2IT 0, 4, implicit-def $itstate renamable $r0 = tMOVi8 $noreg, 0, 0, $cpsr, implicit killed $r0, implicit $itstate tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $lr, $r7 + frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -161,8 +169,8 @@ body: | t2DoLoopStart renamable $r12 $r3 = tMOVr killed $r12, 14, $noreg - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.vector.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $q0, $r0, $r1, $r2, $r3 renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg @@ -175,10 +183,10 @@ body: | renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg renamable $q0 = MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable 
$q0 renamable $lr = t2LoopDec killed renamable $lr, 1 - t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14, $noreg + t2LoopEnd killed renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14, $noreg - bb.2.middle.block: + bb.3.middle.block: liveins: $q0 $r0 = VMOVRS killed $s3, 14, $noreg, implicit $q0 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll index d8d6af3b9a8dc..7101cebbb9793 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll @@ -139,7 +139,7 @@ vector.body: ; preds = %vector.body, %vecto %2 = getelementptr inbounds float, float* %b, i32 %index ; %3 = icmp ule <4 x i32> %induction, %broadcast.splat22 - %3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) %4 = bitcast float* %2 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %3, <4 x float> undef) @@ -280,7 +280,7 @@ vector.body: ; preds = %vector.body, %vecto %0 = getelementptr inbounds float, float* %b, i32 %index ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat12 - %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-16.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-16.mir index 479d7f20232c2..352fad145dc4a 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-16.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-16.mir @@ -105,14 
+105,17 @@ body: | ; CHECK: tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 11, 8, implicit-def $itstate ; CHECK: tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3 ; CHECK: renamable $r12 = t2ADDri renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $lr = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r12 = t2BICri killed renamable $r12, 3, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: $lr = t2DLS killed renamable $lr - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 ; CHECK: renamable $vpr = MVE_VCTP16 renamable $r3, 0, $noreg ; CHECK: MVE_VPST 4, implicit $vpr @@ -122,8 +125,8 @@ body: | ; CHECK: renamable $q0 = nsw MVE_VADDi16 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 ; CHECK: MVE_VPST 8, implicit $vpr ; CHECK: renamable $r0 = MVE_VSTRHU16_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4) - ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 - ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: bb.3.for.cond.cleanup: ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc bb.0.entry: successors: %bb.1(0x80000000) @@ -136,6 +139,11 @@ body: | tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr t2IT 11, 8, implicit-def $itstate tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate + + bb.1.vector.ph: + successors: 
%bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r3, $r7, $lr + renamable $r12 = t2ADDri renamable $r3, 3, 14, $noreg, $noreg renamable $lr = t2MOVi 1, 14, $noreg, $noreg renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg @@ -143,8 +151,8 @@ body: | renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg t2DoLoopStart renamable $lr - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.vector.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $lr, $r0, $r1, $r2, $r3 renamable $vpr = MVE_VCTP16 renamable $r3, 0, $noreg @@ -156,10 +164,10 @@ body: | MVE_VPST 8, implicit $vpr renamable $r0 = MVE_VSTRHU16_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4) renamable $lr = t2LoopDec killed renamable $lr, 1 - t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14, $noreg + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14, $noreg - bb.2.for.cond.cleanup: + bb.3.for.cond.cleanup: tPOP_RET 14, $noreg, def $r7, def $pc ... 
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-32.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-32.mir index 750397ea65f42..993291931ffad 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-32.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-32.mir @@ -113,14 +113,17 @@ body: | ; CHECK: tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 11, 8, implicit-def $itstate ; CHECK: tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3 ; CHECK: renamable $r12 = t2ADDri renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $lr = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r12 = t2BICri killed renamable $r12, 3, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: $lr = t2DLS killed renamable $lr - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg ; CHECK: MVE_VPST 4, implicit $vpr @@ -130,8 +133,8 @@ body: | ; CHECK: renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 ; CHECK: MVE_VPST 8, implicit $vpr ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4) - ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 - ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: 
bb.3.for.cond.cleanup: ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc bb.0.entry: successors: %bb.1(0x80000000) @@ -144,6 +147,11 @@ body: | tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr t2IT 11, 8, implicit-def $itstate tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r3, $r7, $lr + renamable $r12 = t2ADDri renamable $r3, 3, 14, $noreg, $noreg renamable $lr = t2MOVi 1, 14, $noreg, $noreg renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg @@ -151,8 +159,8 @@ body: | renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg t2DoLoopStart renamable $lr - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.vector.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $lr, $r0, $r1, $r2, $r3 renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg @@ -164,10 +172,10 @@ body: | MVE_VPST 8, implicit $vpr renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4) renamable $lr = t2LoopDec killed renamable $lr, 1 - t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14, $noreg + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14, $noreg - bb.2.for.cond.cleanup: + bb.3.for.cond.cleanup: tPOP_RET 14, $noreg, def $r7, def $pc ... 
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-8.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-8.mir index 602c2c53eefb8..ec9d831795c65 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-8.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-8.mir @@ -106,14 +106,17 @@ body: | ; CHECK: tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 11, 8, implicit-def $itstate ; CHECK: tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3 ; CHECK: renamable $r12 = t2ADDri renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $lr = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r12 = t2BICri killed renamable $r12, 3, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: $lr = t2DLS killed renamable $lr - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 ; CHECK: renamable $vpr = MVE_VCTP8 renamable $r3, 0, $noreg ; CHECK: MVE_VPST 4, implicit $vpr @@ -123,8 +126,8 @@ body: | ; CHECK: renamable $q0 = nsw MVE_VADDi8 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 ; CHECK: MVE_VPST 8, implicit $vpr ; CHECK: renamable $r0 = MVE_VSTRBU8_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4) - ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 - ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: bb.3.for.cond.cleanup: ; 
CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc bb.0.entry: successors: %bb.1(0x80000000) @@ -137,6 +140,11 @@ body: | tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr t2IT 11, 8, implicit-def $itstate tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r3, $r7, $lr + renamable $r12 = t2ADDri renamable $r3, 3, 14, $noreg, $noreg renamable $lr = t2MOVi 1, 14, $noreg, $noreg renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg @@ -144,8 +152,8 @@ body: | renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg t2DoLoopStart renamable $lr - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.vector.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $lr, $r0, $r1, $r2, $r3 renamable $vpr = MVE_VCTP8 renamable $r3, 0, $noreg @@ -157,10 +165,10 @@ body: | MVE_VPST 8, implicit $vpr renamable $r0 = MVE_VSTRBU8_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4) renamable $lr = t2LoopDec killed renamable $lr, 1 - t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14, $noreg + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14, $noreg - bb.2.for.cond.cleanup: + bb.3.for.cond.cleanup: tPOP_RET 14, $noreg, def $r7, def $pc ... 
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-vcmp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-vcmp.mir index f51ccfc0c32c2..5ec6079e6cbfd 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-vcmp.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-vcmp.mir @@ -98,24 +98,27 @@ body: | ; CHECK: tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 0, 8, implicit-def $itstate ; CHECK: frame-destroy tPOP_RET 0 /* CC::eq */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2 ; CHECK: renamable $q2 = MVE_VMOVimmi32 4, 0, $noreg, undef renamable $q2 ; CHECK: renamable $r3 = tLEApcrel %const.0, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from constant-pool) ; CHECK: renamable $r3, dead $cpsr = tLSRri renamable $r2, 1, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = MVE_VDUP32 killed renamable $r3, 0, $noreg, undef renamable $q1 ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2 - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $lr, $q0, $q1, $q2, $r0, $r1 ; CHECK: renamable $vpr = MVE_VCMPu32 renamable $q1, renamable $q0, 8, 0, killed $noreg ; CHECK: MVE_VPST 4, implicit $vpr ; CHECK: renamable $r1, renamable $q3 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv35, align 4) ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q3, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv12, align 4) ; CHECK: renamable $q0 = MVE_VADDi32 killed renamable $q0, renamable $q2, 0, $noreg, undef renamable $q0 - ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 - ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: $lr = MVE_LETP killed renamable $lr, 
%bb.2 + ; CHECK: bb.3.for.cond.cleanup: ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc - ; CHECK: bb.3 (align 16): + ; CHECK: bb.4 (align 16): ; CHECK: CONSTPOOL_ENTRY 0, %const.0, 16 bb.0.entry: successors: %bb.1(0x80000000) @@ -130,6 +133,11 @@ body: | tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr t2IT 0, 8, implicit-def $itstate frame-destroy tPOP_RET 0 /* CC::eq */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $lr + renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg renamable $q2 = MVE_VMOVimmi32 4, 0, $noreg, undef renamable $q2 renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg @@ -142,8 +150,8 @@ body: | renamable $q1 = MVE_VDUP32 killed renamable $r3, 0, $noreg, undef renamable $q1 t2DoLoopStart renamable $lr - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.vector.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $lr, $q0, $q1, $q2, $r0, $r1, $r2 renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg @@ -154,13 +162,13 @@ body: | renamable $r0 = MVE_VSTRWU32_post killed renamable $q3, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv12, align 4) renamable $q0 = MVE_VADDi32 killed renamable $q0, renamable $q2, 0, $noreg, undef renamable $q0 renamable $lr = t2LoopDec killed renamable $lr, 1 - t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14 /* CC::al */, $noreg + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg - bb.2.for.cond.cleanup: + bb.3.for.cond.cleanup: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc - bb.3 (align 16): + bb.4 (align 16): CONSTPOOL_ENTRY 0, %const.0, 16 ... 
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-guards.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-guards.ll index f23b64013c355..62da471339454 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-guards.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-guards.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=thumbv8.1m.main -disable-arm-loloops=false -mattr=+lob -stop-after=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s ; RUN: llc -mtriple=thumbv8.1m.main -disable-arm-loloops=false -mattr=+lob -stop-after=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK-GLOBAL @@ -16,11 +17,11 @@ ; CHECK: body: ; CHECK: bb.0.entry: ; CHECK: t2CMPri renamable $lr, 0 -; CHECK: tBcc %bb.3 -; CHECK: bb.1.while.body.preheader: +; CHECK: tBcc %bb.4 +; CHECK: bb.2.while.body.preheader: ; CHECK: $lr = t2DLS killed renamable $lr -; CHECK: bb.2.while.body: -; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 +; CHECK: bb.3.while.body: +; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.3 define void @ne_and_guard(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { entry: %brmerge.demorgan = and i1 %t1, %t2 @@ -49,11 +50,11 @@ if.end: ; preds = %while.body, %entry ; CHECK: body: ; CHECK: bb.0.entry: ; CHECK: t2CMPri renamable $lr, 0 -; CHECK: tBcc %bb.3 -; CHECK: bb.1.while.body.preheader: +; CHECK: tBcc %bb.4 +; CHECK: bb.2.while.body.preheader: ; CHECK: $lr = t2DLS killed renamable $lr -; CHECK: bb.2.while.body: -; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 +; CHECK: bb.3.while.body: +; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.3 define void @ne_preheader(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { entry: %brmerge.demorgan = and i1 %t1, %t2 @@ -84,11 +85,11 @@ if.end: ; preds = %while.body, %while. 
; CHECK: body: ; CHECK: bb.0.entry: ; CHECK: t2CMPri renamable $lr, 0 -; CHECK: tBcc %bb.3 -; CHECK: bb.1.while.body.preheader: +; CHECK: tBcc %bb.4 +; CHECK: bb.2.while.body.preheader: ; CHECK: $lr = t2DLS killed renamable $lr -; CHECK: bb.2.while.body: -; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 +; CHECK: bb.3.while.body: +; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.3 define void @eq_preheader(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { entry: %brmerge.demorgan = and i1 %t1, %t2 @@ -119,11 +120,11 @@ if.end: ; preds = %while.body, %while. ; CHECK: body: ; CHECK: bb.0.entry: ; CHECK: t2CMPri renamable $lr, 0 -; CHECK: tBcc %bb.3 -; CHECK: bb.1.while.body.preheader: +; CHECK: tBcc %bb.4 +; CHECK: bb.2.while.body.preheader: ; CHECK: $lr = t2DLS killed renamable $lr -; CHECK: bb.2.while.body: -; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 +; CHECK: bb.3.while.body: +; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.3 define void @ne_prepreheader(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { entry: %cmp = icmp ne i32 %N, 0 @@ -153,8 +154,8 @@ if.end: ; preds = %while.body, %while. 
; CHECK: body: ; CHECK: bb.0.entry: ; CHECK: $lr = t2DLS killed renamable $lr -; CHECK: bb.1.do.body: -; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 +; CHECK: bb.2.do.body: +; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 define void @be_ne(i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { entry: %cmp = icmp ne i32 %N, 0 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/massive.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/massive.mir index 74fb334d34705..d7a6d331b5358 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/massive.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/massive.mir @@ -1,29 +1,27 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=armv8.1m.main -mattr=+lob -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s -# CHECK: for.body: -# CHECK-NOT: t2DLS -# CHECK-NOT: t2LEUpdate --- | ; ModuleID = 'massive.ll' source_filename = "massive.ll" target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "thumbv8.1m.main" - + define dso_local arm_aapcscc void @massive(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) { entry: %cmp8 = icmp eq i32 %N, 0 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader - + for.body.preheader: ; preds = %entry %scevgep = getelementptr i32, i32* %a, i32 -1 %scevgep4 = getelementptr i32, i32* %c, i32 -1 %scevgep8 = getelementptr i32, i32* %b, i32 -1 call void @llvm.set.loop.iterations.i32(i32 %N) br label %for.body - + for.cond.cleanup: ; preds = %for.body, %entry ret void - + for.body: ; preds = %for.body, %for.body.preheader %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ] %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ] @@ -44,19 +42,19 @@ %4 = icmp ne i32 %3, 0 br i1 %4, label %for.body, label %for.cond.cleanup } - + ; Function Attrs: nounwind declare i32 @llvm.arm.space(i32 
immarg, i32) #0 - + ; Function Attrs: noduplicate nounwind declare void @llvm.set.loop.iterations.i32(i32) #1 - + ; Function Attrs: noduplicate nounwind declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1 - + ; Function Attrs: nounwind declare void @llvm.stackprotector(i8*, i8**) #0 - + attributes #0 = { nounwind } attributes #1 = { noduplicate nounwind } @@ -98,20 +96,51 @@ frameInfo: restorePoint: '' fixedStack: [] stack: - - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } callSites: [] constants: [] machineFunctionInfo: {} body: | + ; CHECK-LABEL: name: massive + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r7 + ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: tCMPi8 $r3, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2IT 0, 8, implicit-def $itstate + ; CHECK: tPOP_RET 0 /* CC::eq */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + ; CHECK: bb.1.for.body.preheader: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: 
liveins: $r0, $r1, $r2, $r3 + ; CHECK: renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg + ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg + ; CHECK: renamable $r0, dead $cpsr = tSUBi8 killed renamable $r0, 4, 14 /* CC::al */, $noreg + ; CHECK: $lr = tMOVr killed $r3, 14 /* CC::al */, $noreg + ; CHECK: bb.2.for.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2 + ; CHECK: dead renamable $r3 = SPACE 4096, undef renamable $r0 + ; CHECK: renamable $r12, renamable $r1 = t2LDR_PRE killed renamable $r1, 4, 14 /* CC::al */, $noreg :: (load 4 from %ir.scevgep3) + ; CHECK: renamable $r3, renamable $r2 = t2LDR_PRE killed renamable $r2, 4, 14 /* CC::al */, $noreg :: (load 4 from %ir.scevgep7) + ; CHECK: renamable $r3 = nsw t2MUL killed renamable $r3, killed renamable $r12, 14 /* CC::al */, $noreg + ; CHECK: early-clobber renamable $r0 = t2STR_PRE killed renamable $r3, killed renamable $r0, 4, 14 /* CC::al */, $noreg :: (store 4 into %ir.scevgep11) + ; CHECK: $lr = t2SUBri killed renamable $lr, 1, 14 /* CC::al */, $noreg, def $cpsr + ; CHECK: t2Bcc %bb.2, 1 /* CC::ne */, killed $cpsr + ; CHECK: tB %bb.3, 14 /* CC::al */, $noreg + ; CHECK: bb.3.for.cond.cleanup: + ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc bb.0.entry: successors: %bb.1(0x80000000) liveins: $r0, $r1, $r2, $r3, $r7, $lr - + frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -119,26 +148,31 @@ body: | tCMPi8 $r3, 0, 14, $noreg, implicit-def $cpsr t2IT 0, 8, implicit-def $itstate tPOP_RET 0, killed $cpsr, def $r7, def $pc, implicit killed $itstate + + bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r3, $r7, $lr + renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14, $noreg renamable $r2, dead $cpsr = 
tSUBi8 killed renamable $r2, 4, 14, $noreg renamable $r0, dead $cpsr = tSUBi8 killed renamable $r0, 4, 14, $noreg $lr = tMOVr $r3, 14, $noreg t2DoLoopStart killed $r3 - - bb.1.for.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + + bb.2.for.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $lr, $r0, $r1, $r2 - + dead renamable $r3 = SPACE 4096, undef renamable $r0 renamable $r12, renamable $r1 = t2LDR_PRE killed renamable $r1, 4, 14, $noreg :: (load 4 from %ir.scevgep3) renamable $r3, renamable $r2 = t2LDR_PRE killed renamable $r2, 4, 14, $noreg :: (load 4 from %ir.scevgep7) renamable $r3 = nsw t2MUL killed renamable $r3, killed renamable $r12, 14, $noreg early-clobber renamable $r0 = t2STR_PRE killed renamable $r3, killed renamable $r0, 4, 14, $noreg :: (store 4 into %ir.scevgep11) renamable $lr = t2LoopDec killed renamable $lr, 1 - t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14, $noreg - - bb.2.for.cond.cleanup: + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14, $noreg + + bb.3.for.cond.cleanup: tPOP_RET 14, $noreg, def $r7, def $pc ... 
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-lr-terminator.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-lr-terminator.mir index d53caa2c56e2b..51c7f34262838 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-lr-terminator.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-lr-terminator.mir @@ -109,11 +109,14 @@ body: | ; CHECK: t2CMPrs killed renamable $r12, renamable $r3, 11, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 0, 8, implicit-def $itstate ; CHECK: tPOP_RET 0 /* CC::eq */, killed $cpsr, def $r4, def $pc, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3 ; CHECK: renamable $r12 = t2LSRri killed renamable $r3, 1, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r3, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r12 - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 ; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r1, renamable $r3, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep45, align 1) @@ -122,8 +125,8 @@ body: | ; CHECK: renamable $q1 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep23, align 1) ; CHECK: renamable $q0 = nuw nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 0, killed $noreg :: (store 16 into %ir.lsr.iv1, align 4) - ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 - ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: bb.3.for.cond.cleanup: ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc bb.0.entry: 
successors: %bb.1(0x80000000) @@ -137,6 +140,11 @@ body: | t2CMPrs killed renamable $r12, renamable $r3, 11, 14, $noreg, implicit-def $cpsr t2IT 0, 8, implicit-def $itstate tPOP_RET 0, killed $cpsr, def $r4, def $pc, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r3, $r4, $lr + renamable $r12 = t2MOVi 3, 14, $noreg, $noreg renamable $lr = t2MOVi 1, 14, $noreg, $noreg renamable $r12 = nuw t2ADDrs killed renamable $r12, renamable $r3, 11, 14, $noreg, $noreg @@ -148,8 +156,8 @@ body: | t2DoLoopStart renamable $r5 $lr = tMOVr killed $r5, 14, $noreg - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.vector.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $lr, $r0, $r1, $r2, $r3, $r12 renamable $r4, dead $cpsr = tADDrr renamable $r1, renamable $r3, 14, $noreg @@ -165,10 +173,10 @@ body: | MVE_VPST 8, implicit $vpr renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4) renamable $lr = t2LoopDec killed renamable $lr, 1 - t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14, $noreg + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14, $noreg - bb.2.for.cond.cleanup: + bb.3.for.cond.cleanup: tPOP_RET 14, $noreg, def $r4, def $pc ... 
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll index 07a3d5aa76803..5a370e5f96e76 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll @@ -2,8 +2,22 @@ ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs -tail-predication=enabled -o - %s | FileCheck %s define arm_aapcs_vfpcc void @arm_var_f32_mve(float* %pSrc, i32 %blockSize, float* nocapture %pResult) { -; CHECK-LABEL: .LBB0_1: @ %do.body.i -; CHECK: dlstp.32 lr, r1 +; CHECK-LABEL: arm_var_f32_mve: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: mov r3, r1 +; CHECK-NEXT: mov r12, r0 +; CHECK-NEXT: dlstp.32 lr, r3 +; CHECK-NEXT: .LBB0_1: @ %do.body.i +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r12], #16 +; CHECK-NEXT: vadd.f32 q0, q0, q1 +; CHECK-NEXT: letp lr, .LBB0_1 +; CHECK-NEXT: @ %bb.2: @ %arm_mean_f32_mve.exit +; CHECK-NEXT: vmov s4, r1 +; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: vadd.f32 s0, s3, s3 ; CHECK-NEXT: vcvt.f32.u32 s4, s4 ; CHECK-NEXT: vdiv.f32 s0, s0, s4 @@ -18,6 +32,14 @@ define arm_aapcs_vfpcc void @arm_var_f32_mve(float* %pSrc, i32 %blockSize, float ; CHECK-NEXT: vsub.f32 q2, q2, q1 ; CHECK-NEXT: vfma.f32 q0, q2, q2 ; CHECK-NEXT: letp lr, .LBB0_3 +; CHECK-NEXT: @ %bb.4: @ %do.end +; CHECK-NEXT: subs r0, r1, #1 +; CHECK-NEXT: vadd.f32 s0, s3, s3 +; CHECK-NEXT: vmov s2, r0 +; CHECK-NEXT: vcvt.f32.u32 s2, s2 +; CHECK-NEXT: vdiv.f32 s0, s0, s2 +; CHECK-NEXT: vstr s0, [r2] +; CHECK-NEXT: pop {r4, pc} entry: br label %do.body.i diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir index f546226de4980..734bcc106785e 100644 --- 
a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir @@ -114,12 +114,15 @@ body: | ; CHECK: t2CMPrs killed renamable $r12, renamable $r3, 11, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 0, 8, implicit-def $itstate ; CHECK: tPOP_RET 0 /* CC::eq */, killed $cpsr, def $r4, def $pc, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3 ; CHECK: $r12 = t2MOVr killed $r3, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r12 = t2LSRri killed renamable $r12, 1, 14 /* CC::al */, $noreg, $noreg ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r12 ; CHECK: renamable $r3, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 ; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r1, renamable $r3, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep45, align 1) @@ -128,8 +131,8 @@ body: | ; CHECK: renamable $q1 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep23, align 1) ; CHECK: renamable $q0 = nuw nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 0, killed $noreg :: (store 16 into %ir.lsr.iv1, align 4) - ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 - ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: bb.3.for.cond.cleanup: ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc bb.0.entry: successors: %bb.1(0x80000000) @@ -143,6 +146,11 @@ body: | t2CMPrs killed renamable $r12, renamable $r3, 11, 14, 
$noreg, implicit-def $cpsr t2IT 0, 8, implicit-def $itstate tPOP_RET 0, killed $cpsr, def $r4, def $pc, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r3, $r4, $lr + renamable $r12 = t2MOVi 3, 14, $noreg, $noreg renamable $lr = t2MOVi 1, 14, $noreg, $noreg renamable $r12 = nuw t2ADDrs killed renamable $r12, renamable $r3, 11, 14, $noreg, $noreg @@ -154,8 +162,8 @@ body: | renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg renamable $r12 = t2LSRri killed renamable $r12, 1, 14, $noreg, $noreg - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.vector.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $lr, $r0, $r1, $r2, $r3, $r12 renamable $r4, dead $cpsr = tADDrr renamable $r1, renamable $r3, 14, $noreg @@ -171,10 +179,10 @@ body: | MVE_VPST 8, implicit $vpr renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4) renamable $lr = t2LoopDec killed renamable $lr, 1 - t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14, $noreg + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14, $noreg - bb.2.for.cond.cleanup: + bb.3.for.cond.cleanup: tPOP_RET 14, $noreg, def $r4, def $pc ... 
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir index 80def3fd67451..a8f084474b0c7 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir @@ -114,6 +114,9 @@ body: | ; CHECK: t2CMPrs killed renamable $r12, renamable $r3, 11, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 0, 8, implicit-def $itstate ; CHECK: tPOP_RET 0 /* CC::eq */, killed $cpsr, def $r4, def $pc, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3 ; CHECK: renamable $r12 = t2MOVi 3, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r12 = nuw t2ADDrs killed renamable $r12, renamable $r3, 11, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r12 = t2BICri killed renamable $r12, 3, 14 /* CC::al */, $noreg, $noreg @@ -122,8 +125,8 @@ body: | ; CHECK: renamable $r3, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg ; CHECK: renamable $r12 = t2LSRri killed renamable $r12, 1, 14 /* CC::al */, $noreg, $noreg ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r12 - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 ; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r1, renamable $r3, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep45, align 1) @@ -132,8 +135,8 @@ body: | ; CHECK: renamable $q1 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep23, align 1) ; CHECK: renamable $q0 = nuw nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 
16, 0, killed $noreg :: (store 16 into %ir.lsr.iv1, align 4) - ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 - ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: bb.3.for.cond.cleanup: ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc bb.0.entry: successors: %bb.1(0x80000000) @@ -147,6 +150,11 @@ body: | t2CMPrs killed renamable $r12, renamable $r3, 11, 14, $noreg, implicit-def $cpsr t2IT 0, 8, implicit-def $itstate tPOP_RET 0, killed $cpsr, def $r4, def $pc, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r3, $r4, $lr + renamable $r12 = t2MOVi 3, 14, $noreg, $noreg renamable $lr = t2MOVi 1, 14, $noreg, $noreg renamable $r12 = nuw t2ADDrs killed renamable $r12, renamable $r3, 11, 14, $noreg, $noreg @@ -158,8 +166,8 @@ body: | renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg renamable $r12 = t2LSRri killed renamable $r12, 1, 14, $noreg, $noreg - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.vector.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $lr, $r0, $r1, $r2, $r3, $r12 renamable $r4, dead $cpsr = tADDrr renamable $r1, renamable $r3, 14, $noreg @@ -175,10 +183,10 @@ body: | MVE_VPST 8, implicit $vpr renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4) renamable $lr = t2LoopDec killed renamable $lr, 1 - t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14, $noreg + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14, $noreg - bb.2.for.cond.cleanup: + bb.3.for.cond.cleanup: tPOP_RET 14, $noreg, def $r4, def $pc ... 
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multi-cond-iter-count.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multi-cond-iter-count.mir index 7732793b8db38..588b62a22db8d 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multi-cond-iter-count.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multi-cond-iter-count.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s +# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - --verify-machineinstrs | FileCheck %s --- | define dso_local arm_aapcs_vfpcc void @multi_cond_iter_count(i32* noalias nocapture %0, i32* nocapture readonly %1, i32 %2, i32 %3) { @@ -96,18 +96,21 @@ body: | ; CHECK: renamable $r2 = t2LSLrr killed renamable $r2, killed renamable $r12, 14 /* CC::al */, $noreg, def $cpsr ; CHECK: t2IT 0, 8, implicit-def $itstate ; CHECK: tPOP_RET 0 /* CC::eq */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + ; CHECK: bb.1 (%ir-block.17): + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2 ; CHECK: $r3 = tMOVr $r0, 14 /* CC::al */, $noreg ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2 - ; CHECK: bb.1 (%ir-block.18): - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2 (%ir-block.18): + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $lr, $r0, $r1, $r3 ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 0, $noreg ; CHECK: renamable $r3, renamable $q1 = MVE_VLDRWU32_post killed renamable $r3, 16, 0, $noreg ; CHECK: renamable $q0 = nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 ; CHECK: MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 0, killed $noreg ; CHECK: $r0 = tMOVr $r3, 14 /* CC::al */, $noreg - ; CHECK: $lr = MVE_LETP killed renamable $lr, 
%bb.1 - ; CHECK: bb.2 (%ir-block.34): + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: bb.3 (%ir-block.34): ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc bb.0 (%ir-block.4): successors: %bb.1(0x80000000) @@ -129,6 +132,11 @@ body: | renamable $r2 = t2LSLrr killed renamable $r2, killed renamable $r12, 14, $noreg, def $cpsr t2IT 0, 8, implicit-def $itstate tPOP_RET 0, killed $cpsr, def $r7, def $pc, implicit killed $itstate + + bb.1 (%ir-block.17): + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r3, $lr + renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14, $noreg renamable $r3 = t2BICri killed renamable $r3, 3, 14, $noreg, $noreg renamable $r12 = t2SUBri killed renamable $r3, 4, 14, $noreg, $noreg @@ -137,8 +145,8 @@ body: | $r3 = tMOVr $r0, 14, $noreg t2DoLoopStart renamable $lr - bb.1 (%ir-block.18): - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2 (%ir-block.18): + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $lr, $r0, $r1, $r2, $r3 renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg @@ -151,10 +159,10 @@ body: | MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 1, killed renamable $vpr renamable $lr = t2LoopDec killed renamable $lr, 1 $r0 = tMOVr $r3, 14, $noreg - t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14, $noreg + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14, $noreg - bb.2 (%ir-block.34): + bb.3 (%ir-block.34): tPOP_RET 14, $noreg, def $r7, def $pc ... 
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multiblock-massive.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multiblock-massive.mir index 2e3aa0d97baf4..e23f0bc6f4b82 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multiblock-massive.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multiblock-massive.mir @@ -1,26 +1,19 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=armv8.1m.main -mattr=+lob -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s -# CHECK: for.body: -# CHECK-NOT: t2DLS -# CHECK-NOT: t2LEUpdate --- | - ; ModuleID = 'multiblock-massive.ll' - source_filename = "multiblock-massive.ll" - target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" - target triple = "thumbv8.1m.main" - define void @size_limit(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) { entry: %cmp8 = icmp eq i32 %N, 0 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader - + for.body.preheader: ; preds = %entry call void @llvm.set.loop.iterations.i32(i32 %N) br label %for.body - + for.cond.cleanup: ; preds = %for.end, %entry ret void - + for.body: ; preds = %for.end, %for.body.preheader %lsr.iv4 = phi i32* [ %b, %for.body.preheader ], [ %scevgep5, %for.end ] %lsr.iv2 = phi i32* [ %c, %for.body.preheader ], [ %scevgep3, %for.end ] @@ -33,13 +26,13 @@ store i32 %mul, i32* %lsr.iv1, align 4 %cmp = icmp ne i32 %0, 0 br i1 %cmp, label %middle.block, label %for.end - + middle.block: ; preds = %for.body %div = udiv i32 %1, %0 store i32 %div, i32* %lsr.iv1, align 4 %size.1 = call i32 @llvm.arm.space(i32 1024, i32 undef) br label %for.end - + for.end: ; preds = %middle.block, %for.body %scevgep = getelementptr i32, i32* %lsr.iv1, i32 1 %scevgep3 = getelementptr i32, i32* %lsr.iv2, i32 1 @@ -48,19 +41,19 @@ %exitcond = icmp eq i32 %lsr.iv.next, 0 br i1 %exitcond, label %for.cond.cleanup, label %for.body } - + ; Function Attrs: nounwind 
declare i32 @llvm.arm.space(i32 immarg, i32) #0 - + ; Function Attrs: noduplicate nounwind declare void @llvm.set.loop.iterations.i32(i32) #1 - + ; Function Attrs: noduplicate nounwind declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1 - + ; Function Attrs: nounwind declare void @llvm.stackprotector(i8*, i8**) #0 - + attributes #0 = { nounwind } attributes #1 = { noduplicate nounwind } @@ -102,20 +95,64 @@ frameInfo: restorePoint: '' fixedStack: [] stack: - - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true, + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } callSites: [] constants: [] machineFunctionInfo: {} body: | + ; CHECK-LABEL: name: size_limit + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r4 + ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -8 + ; CHECK: tCMPi8 $r3, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2IT 0, 8, implicit-def $itstate + ; CHECK: tPOP_RET 0 /* CC::eq */, killed $cpsr, def $r4, def $pc, implicit killed $itstate + ; CHECK: bb.1.for.body.preheader: + ; CHECK: successors: 
%bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3 + ; CHECK: $lr = tMOVr killed $r3, 14 /* CC::al */, $noreg + ; CHECK: tB %bb.2, 14 /* CC::al */, $noreg + ; CHECK: bb.2.for.end: + ; CHECK: successors: %bb.5(0x04000000), %bb.3(0x7c000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2 + ; CHECK: renamable $r1, dead $cpsr = tADDi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg + ; CHECK: renamable $r2, dead $cpsr = tADDi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg + ; CHECK: renamable $r0, dead $cpsr = tADDi8 killed renamable $r0, 4, 14 /* CC::al */, $noreg + ; CHECK: $lr = t2SUBri killed renamable $lr, 1, 14 /* CC::al */, $noreg, def $cpsr + ; CHECK: tBcc %bb.3, 1 /* CC::ne */, killed $cpsr + ; CHECK: t2B %bb.5, 14 /* CC::al */, $noreg + ; CHECK: bb.3.for.body: + ; CHECK: successors: %bb.4(0x50000000), %bb.2(0x30000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2 + ; CHECK: dead renamable $r3 = SPACE 3072, undef renamable $r0 + ; CHECK: renamable $r3 = tLDRi renamable $r1, 0, 14 /* CC::al */, $noreg :: (load 4 from %ir.lsr.iv4) + ; CHECK: renamable $r12 = t2LDRi12 renamable $r2, 0, 14 /* CC::al */, $noreg :: (load 4 from %ir.lsr.iv2) + ; CHECK: tCMPi8 renamable $r3, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: renamable $r4 = nsw t2MUL renamable $r12, renamable $r3, 14 /* CC::al */, $noreg + ; CHECK: tSTRi killed renamable $r4, renamable $r0, 0, 14 /* CC::al */, $noreg :: (store 4 into %ir.lsr.iv1) + ; CHECK: t2Bcc %bb.2, 0 /* CC::eq */, killed $cpsr + ; CHECK: bb.4.middle.block: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r12 + ; CHECK: renamable $r3 = t2UDIV killed renamable $r12, killed renamable $r3, 14 /* CC::al */, $noreg + ; CHECK: tSTRi killed renamable $r3, renamable $r0, 0, 14 /* CC::al */, $noreg :: (store 4 into %ir.lsr.iv1) + ; CHECK: dead renamable $r3 = SPACE 1024, undef renamable $r0 + ; CHECK: t2B %bb.2, 14 /* CC::al */, $noreg + ; CHECK: bb.5.for.cond.cleanup: + ; CHECK: tPOP_RET 14 /* 
CC::al */, $noreg, def $r4, def $pc bb.0.entry: - successors: %bb.2(0x80000000) + successors: %bb.1(0x80000000) liveins: $r0, $r1, $r2, $r3, $r4, $lr - + frame-setup tPUSH 14, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -123,43 +160,48 @@ body: | tCMPi8 $r3, 0, 14, $noreg, implicit-def $cpsr t2IT 0, 8, implicit-def $itstate tPOP_RET 0, killed $cpsr, def $r4, def $pc, implicit killed $itstate + + bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r3, $r4, $lr + $lr = tMOVr $r3, 14, $noreg t2DoLoopStart killed $r3 tB %bb.2, 14, $noreg - - bb.1.for.end: - successors: %bb.4(0x04000000), %bb.2(0x7c000000) + + bb.2.for.end: + successors: %bb.5(0x04000000), %bb.3(0x7c000000) liveins: $lr, $r0, $r1, $r2 - + renamable $r1, dead $cpsr = tADDi8 killed renamable $r1, 4, 14, $noreg renamable $r2, dead $cpsr = tADDi8 killed renamable $r2, 4, 14, $noreg renamable $r0, dead $cpsr = tADDi8 killed renamable $r0, 4, 14, $noreg renamable $lr = t2LoopDec killed renamable $lr, 1 - t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr - t2B %bb.4, 14, $noreg - - bb.2.for.body: - successors: %bb.3(0x50000000), %bb.1(0x30000000) + t2LoopEnd renamable $lr, %bb.3, implicit-def dead $cpsr + t2B %bb.5, 14, $noreg + + bb.3.for.body: + successors: %bb.4(0x50000000), %bb.2(0x30000000) liveins: $lr, $r0, $r1, $r2 - + dead renamable $r3 = SPACE 3072, undef renamable $r0 renamable $r3 = tLDRi renamable $r1, 0, 14, $noreg :: (load 4 from %ir.lsr.iv4) renamable $r12 = t2LDRi12 renamable $r2, 0, 14, $noreg :: (load 4 from %ir.lsr.iv2) tCMPi8 renamable $r3, 0, 14, $noreg, implicit-def $cpsr renamable $r4 = nsw t2MUL renamable $r12, renamable $r3, 14, $noreg tSTRi killed renamable $r4, renamable $r0, 0, 14, $noreg :: (store 4 into %ir.lsr.iv1) - t2Bcc %bb.1, 0, killed $cpsr - - bb.3.middle.block: - successors: %bb.1(0x80000000) + t2Bcc %bb.2, 0, killed $cpsr + + 
bb.4.middle.block: + successors: %bb.2(0x80000000) liveins: $lr, $r0, $r1, $r2, $r3, $r12 - + renamable $r3 = t2UDIV killed renamable $r12, killed renamable $r3, 14, $noreg tSTRi killed renamable $r3, renamable $r0, 0, 14, $noreg :: (store 4 into %ir.lsr.iv1) dead renamable $r3 = SPACE 1024, undef renamable $r0 - t2B %bb.1, 14, $noreg - - bb.4.for.cond.cleanup: + t2B %bb.2, 14, $noreg + + bb.5.for.cond.cleanup: tPOP_RET 14, $noreg, def $r4, def $pc ... diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll index 69039f9a4eaa2..2b90065ea0e80 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll @@ -8,6 +8,7 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_char(i8 zeroext %a, i8* nocapture re ; CHECK-NEXT: itt eq ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr +; CHECK-NEXT: .LBB0_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: adds r3, r2, #3 ; CHECK-NEXT: vmov.i32 q0, #0x0 @@ -17,7 +18,7 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_char(i8 zeroext %a, i8* nocapture re ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 ; CHECK-NEXT: adds r3, #4 @@ -26,8 +27,8 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_char(i8 zeroext %a, i8* nocapture re ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrbt.u32 q2, [r1], #4 ; CHECK-NEXT: vmla.u32 q0, q2, r0 -; CHECK-NEXT: le lr, .LBB0_1 -; CHECK-NEXT: @ %bb.2: @ %middle.block +; CHECK-NEXT: le lr, .LBB0_2 +; CHECK-NEXT: @ %bb.3: @ %middle.block ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} @@ -55,7 +56,7 @@ vector.body: ; preds = %vector.body, %vecto %0 = getelementptr inbounds i8, i8* %b, i32 %index ; %1 = icmp ule <4 
x i32> %induction, %broadcast.splat11 - %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) %2 = bitcast i8* %0 to <4 x i8>* %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %2, i32 1, <4 x i1> %1, <4 x i8> undef) @@ -83,6 +84,7 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_short(i16 signext %a, i16* nocapture ; CHECK-NEXT: itt eq ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr +; CHECK-NEXT: .LBB1_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: adds r3, r2, #3 ; CHECK-NEXT: vmov.i32 q0, #0x0 @@ -92,7 +94,7 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_short(i16 signext %a, i16* nocapture ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 ; CHECK-NEXT: adds r3, #4 @@ -101,8 +103,8 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_short(i16 signext %a, i16* nocapture ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrht.s32 q2, [r1], #8 ; CHECK-NEXT: vmla.u32 q0, q2, r0 -; CHECK-NEXT: le lr, .LBB1_1 -; CHECK-NEXT: @ %bb.2: @ %middle.block +; CHECK-NEXT: le lr, .LBB1_2 +; CHECK-NEXT: @ %bb.3: @ %middle.block ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} @@ -130,7 +132,7 @@ vector.body: ; preds = %vector.body, %vecto %0 = getelementptr inbounds i16, i16* %b, i32 %index ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 - %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) %2 = bitcast i16* %0 to <4 x i16>* %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef) @@ -158,6 +160,7 @@ define arm_aapcs_vfpcc i32 
@test_acc_scalar_uchar(i8 zeroext %a, i8* nocapture r ; CHECK-NEXT: itt eq ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr +; CHECK-NEXT: .LBB2_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: adds r3, r2, #3 ; CHECK-NEXT: vmov.i32 q0, #0x0 @@ -167,7 +170,7 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_uchar(i8 zeroext %a, i8* nocapture r ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB2_1: @ %vector.body +; CHECK-NEXT: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 ; CHECK-NEXT: adds r3, #4 @@ -176,8 +179,8 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_uchar(i8 zeroext %a, i8* nocapture r ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrbt.u32 q2, [r1], #4 ; CHECK-NEXT: vmla.u32 q0, q2, r0 -; CHECK-NEXT: le lr, .LBB2_1 -; CHECK-NEXT: @ %bb.2: @ %middle.block +; CHECK-NEXT: le lr, .LBB2_2 +; CHECK-NEXT: @ %bb.3: @ %middle.block ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} @@ -205,7 +208,7 @@ vector.body: ; preds = %vector.body, %vecto %0 = getelementptr inbounds i8, i8* %b, i32 %index ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 - %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) %2 = bitcast i8* %0 to <4 x i8>* %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %2, i32 1, <4 x i1> %1, <4 x i8> undef) @@ -233,6 +236,7 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_ushort(i16 signext %a, i16* nocaptur ; CHECK-NEXT: itt eq ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr +; CHECK-NEXT: .LBB3_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: adds r3, r2, #3 ; CHECK-NEXT: vmov.i32 q0, #0x0 @@ -242,7 +246,7 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_ushort(i16 signext %a, i16* nocaptur ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 ; CHECK-NEXT: movs r3, #0 ; 
CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB3_1: @ %vector.body +; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 ; CHECK-NEXT: adds r3, #4 @@ -251,8 +255,8 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_ushort(i16 signext %a, i16* nocaptur ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrht.u32 q2, [r1], #8 ; CHECK-NEXT: vmla.u32 q0, q2, r0 -; CHECK-NEXT: le lr, .LBB3_1 -; CHECK-NEXT: @ %bb.2: @ %middle.block +; CHECK-NEXT: le lr, .LBB3_2 +; CHECK-NEXT: @ %bb.3: @ %middle.block ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} @@ -280,7 +284,7 @@ vector.body: ; preds = %vector.body, %vecto %0 = getelementptr inbounds i16, i16* %b, i32 %index ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 - %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) %2 = bitcast i16* %0 to <4 x i16>* %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef) @@ -308,6 +312,7 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_int(i32 %a, i32* nocapture readonly ; CHECK-NEXT: itt eq ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr +; CHECK-NEXT: .LBB4_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: adds r3, r2, #3 ; CHECK-NEXT: vmov.i32 q0, #0x0 @@ -317,7 +322,7 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_int(i32 %a, i32* nocapture readonly ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB4_1: @ %vector.body +; CHECK-NEXT: .LBB4_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 ; CHECK-NEXT: adds r3, #4 @@ -326,8 +331,8 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_int(i32 %a, i32* nocapture readonly ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrwt.u32 q2, [r1], #16 ; CHECK-NEXT: vmla.u32 q0, q2, r0 -; CHECK-NEXT: le 
lr, .LBB4_1 -; CHECK-NEXT: @ %bb.2: @ %middle.block +; CHECK-NEXT: le lr, .LBB4_2 +; CHECK-NEXT: @ %bb.3: @ %middle.block ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} @@ -354,7 +359,7 @@ vector.body: ; preds = %vector.body, %vecto %0 = getelementptr inbounds i32, i32* %b, i32 %index ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat10 - %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) %2 = bitcast i32* %0 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef) @@ -514,7 +519,7 @@ vector.body: ; preds = %vector.body, %vecto %2 = getelementptr inbounds i8, i8* %a, i32 %index ; %3 = icmp ule <4 x i32> %induction, %broadcast.splat20 - %3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) %4 = bitcast i8* %2 to <4 x i8>* %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %4, i32 1, <4 x i1> %3, <4 x i8> undef) @@ -618,17 +623,18 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_short(i16* nocapture readon ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, pc} +; CHECK-NEXT: .LBB6_1: @ %vector.ph ; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r12 -; CHECK-NEXT: .LBB6_1: @ %vector.body +; CHECK-NEXT: .LBB6_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrh.s32 q0, [r0], #8 ; CHECK-NEXT: vldrh.s32 q1, [r1], #8 ; CHECK-NEXT: vmlas.u32 q1, q0, r2 ; CHECK-NEXT: vstrw.32 q1, [r3], #16 -; CHECK-NEXT: letp lr, .LBB6_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: letp lr, .LBB6_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, pc} entry: %cmp10 = icmp eq i32 %N, 0 @@ -653,7 
+659,7 @@ vector.body: ; preds = %vector.body, %vecto %0 = getelementptr inbounds i16, i16* %a, i32 %index ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat13 - %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) %2 = bitcast i16* %0 to <4 x i16>* %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef) @@ -815,7 +821,7 @@ vector.body: ; preds = %vector.body, %vecto %2 = getelementptr inbounds i8, i8* %a, i32 %index ; %3 = icmp ule <4 x i32> %induction, %broadcast.splat20 - %3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) %4 = bitcast i8* %2 to <4 x i8>* %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %4, i32 1, <4 x i1> %3, <4 x i8> undef) @@ -919,17 +925,18 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_ushort(i16* nocapture reado ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, pc} +; CHECK-NEXT: .LBB8_1: @ %vector.ph ; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r12 -; CHECK-NEXT: .LBB8_1: @ %vector.body +; CHECK-NEXT: .LBB8_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrh.u32 q0, [r0], #8 ; CHECK-NEXT: vldrh.u32 q1, [r1], #8 ; CHECK-NEXT: vmlas.u32 q1, q0, r2 ; CHECK-NEXT: vstrw.32 q1, [r3], #16 -; CHECK-NEXT: letp lr, .LBB8_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: letp lr, .LBB8_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, pc} entry: %cmp10 = icmp eq i32 %N, 0 @@ -954,7 +961,7 @@ vector.body: ; preds = %vector.body, %vecto %0 = getelementptr inbounds i16, i16* %a, i32 %index ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat13 - %1 = call <4 x i1> 
@llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) %2 = bitcast i16* %0 to <4 x i16>* %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef) @@ -1115,7 +1122,7 @@ vector.body: ; preds = %vector.body, %vecto %2 = getelementptr inbounds i32, i32* %a, i32 %index ; %3 = icmp ule <4 x i32> %induction, %broadcast.splat22 - %3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) %4 = bitcast i32* %2 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %3, <4 x i32> undef) @@ -1206,17 +1213,18 @@ define dso_local arm_aapcs_vfpcc void @test_v8i8_to_v8i16(i16* noalias nocapture ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: .LBB10_1: @ %vector.ph ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.16 lr, r3 -; CHECK-NEXT: .LBB10_1: @ %vector.body +; CHECK-NEXT: .LBB10_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: add.w r12, r12, #8 ; CHECK-NEXT: vldrb.u16 q0, [r1], #8 ; CHECK-NEXT: vldrb.u16 q1, [r2], #8 ; CHECK-NEXT: vmul.i16 q0, q1, q0 ; CHECK-NEXT: vstrh.16 q0, [r0], #16 -; CHECK-NEXT: letp lr, .LBB10_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: letp lr, .LBB10_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %cmp10 = icmp eq i32 %N, 0 @@ -1238,7 +1246,7 @@ vector.body: ; preds = %vector.body, %vecto %0 = getelementptr inbounds i8, i8* %b, i32 %index ; %1 = icmp ule <8 x i32> %induction, %broadcast.splat13 - %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N) %2 = bitcast i8* %0 to <8 x i8>* %wide.masked.load 
= call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %2, i32 1, <8 x i1> %1, <8 x i8> undef) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll index 548ba396bed42..e5bcf2e6077f7 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll @@ -24,13 +24,12 @@ define void @mat_vec_sext_i16(i16** nocapture readonly %A, i16* nocapture readon ; CHECK-NEXT: [[ARRAYIDX8_PROMOTED_US:%.*]] = load i32, i32* [[ARRAYIDX8_US]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> , i32 [[ARRAYIDX8_PROMOTED_US]], i32 0 ; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP2]]) -; CHECK-NEXT: [[NUM_ELEMENTS:%.*]] = add i32 [[TRIP_COUNT_MINUS_1]], 1 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[TMP2]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[NUM_ELEMENTS]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], @@ -95,7 +94,7 @@ vector.body: ; preds = %vector.body, %for.c %tmp6 = getelementptr inbounds i16, i16* %tmp3, i32 %index ; %tmp7 = icmp ule <4 x i32> %induction, %broadcast.splat29 - %tmp7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 
%trip.count.minus.1) + %tmp7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) %tmp8 = bitcast i16* %tmp6 to <4 x i16>* %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %tmp8, i32 2, <4 x i1> %tmp7, <4 x i16> undef) @@ -146,13 +145,12 @@ define void @mat_vec_i32(i32** nocapture readonly %A, i32* nocapture readonly %B ; CHECK-NEXT: [[ARRAYIDX7_PROMOTED_US:%.*]] = load i32, i32* [[ARRAYIDX7_US]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> , i32 [[ARRAYIDX7_PROMOTED_US]], i32 0 ; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP2]]) -; CHECK-NEXT: [[NUM_ELEMENTS:%.*]] = add i32 [[TRIP_COUNT_MINUS_1]], 1 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[TMP2]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[NUM_ELEMENTS]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], @@ -215,7 +213,7 @@ vector.body: ; preds = %vector.body, %for.c %tmp6 = getelementptr inbounds i32, i32* %tmp3, i32 %index ; %tmp7 = icmp ule <4 x i32> %induction, %broadcast.splat28 - %tmp7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %tmp7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) 
%tmp8 = bitcast i32* %tmp6 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %tmp7, <4 x i32> undef) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-dec-cbnz.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-dec-cbnz.mir index f8507e1f7bdb1..7e0f3bd4fc151 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-dec-cbnz.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-dec-cbnz.mir @@ -3,14 +3,10 @@ # RUN: llc -mtriple=thumbv8.1m.main -mattr=-lob %s -run-pass=arm-cp-islands --verify-machineinstrs -o - | FileCheck %s --check-prefix=CHECK-NOLOB --- | - target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" - target triple = "thumbv8.1m.main" - %struct.head_s = type { %struct.head_s*, %struct.data_s* } %struct.data_s = type { i16, i16 } - ; Function Attrs: norecurse nounwind readonly - define dso_local arm_aapcscc %struct.head_s* @search(%struct.head_s* readonly %list, %struct.data_s* nocapture readonly %info) local_unnamed_addr #0 { + define dso_local arm_aapcscc %struct.head_s* @search(%struct.head_s* readonly %list, %struct.data_s* nocapture readonly %info) local_unnamed_addr { entry: %idx = getelementptr inbounds %struct.data_s, %struct.data_s* %info, i32 0, i32 1 %0 = load i16, i16* %idx, align 2 @@ -67,16 +63,14 @@ br i1 %tobool10, label %return, label %land.rhs11 return: ; preds = %while.body19, %land.rhs11, %while.body, %land.rhs, %while.cond.preheader, %while.cond9.preheader - %retval.0 = phi %struct.head_s* [ null, %while.cond.preheader ], [ null, %while.cond9.preheader ], [ %list.addr.033, %land.rhs ], [ null, %while.body ], [ %list.addr.136, %land.rhs11 ], [ null, %while.body19 ] + %retval.0 = phi %struct.head_s* [ null, %while.cond.preheader ], [ null, %while.cond9.preheader ], [ null, %while.body ], [ %list.addr.033, %land.rhs ], [ null, %while.body19 ], [ %list.addr.136, %land.rhs11 ] ret %struct.head_s* %retval.0 } - attributes #0 = { norecurse nounwind 
readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+lob,+ras,+soft-float,+strict-align,+thumb-mode,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8d16,-fp-armv8d16sp,-fp-armv8sp,-fp16,-fp16fml,-fp64,-fpregs,-fullfp16,-neon,-vfp2,-vfp2d16,-vfp2d16sp,-vfp2sp,-vfp3,-vfp3d16,-vfp3d16sp,-vfp3sp,-vfp4,-vfp4d16,-vfp4d16sp,-vfp4sp" "unsafe-fp-math"="false" "use-soft-float"="true" } - ... --- name: search -alignment: 1 +alignment: 2 exposesReturnsTwice: false legalized: false regBankSelected: false @@ -115,184 +109,195 @@ machineFunctionInfo: {} body: | ; CHECK-LOB-LABEL: name: search ; CHECK-LOB: bb.0.entry: - ; CHECK-LOB: successors: %bb.1(0x50000000), %bb.4(0x30000000) + ; CHECK-LOB: successors: %bb.1(0x50000000), %bb.5(0x30000000) ; CHECK-LOB: liveins: $r0, $r1 ; CHECK-LOB: renamable $r2 = t2LDRSHi12 renamable $r1, 2, 14 /* CC::al */, $noreg :: (load 2 from %ir.idx) ; CHECK-LOB: t2CMPri renamable $r2, -1, 14 /* CC::al */, $noreg, implicit-def $cpsr - ; CHECK-LOB: tBcc %bb.4, 13 /* CC::le */, killed $cpsr + ; CHECK-LOB: tBcc %bb.5, 13 /* CC::le */, killed $cpsr ; CHECK-LOB: bb.1.while.cond.preheader: - ; CHECK-LOB: successors: %bb.8(0x30000000), %bb.2(0x50000000) + ; CHECK-LOB: successors: %bb.9(0x30000000), %bb.2(0x50000000) ; CHECK-LOB: liveins: $r0, $r2 - ; CHECK-LOB: tCBZ $r0, %bb.8 + ; CHECK-LOB: tCBZ $r0, %bb.9 ; CHECK-LOB: bb.2.land.rhs.preheader: ; CHECK-LOB: successors: %bb.3(0x80000000) ; CHECK-LOB: liveins: $r0, $r2 ; CHECK-LOB: renamable $r1 = tUXTH killed renamable $r2, 14 /* CC::al */, $noreg ; CHECK-LOB: bb.3.land.rhs: - ; CHECK-LOB: successors: %bb.8(0x04000000), %bb.3(0x7c000000) + ; CHECK-LOB: successors: 
%bb.4(0x80000000) ; CHECK-LOB: liveins: $r0, $r1 ; CHECK-LOB: renamable $r2 = tLDRi renamable $r0, 1, 14 /* CC::al */, $noreg :: (load 4 from %ir.info2) ; CHECK-LOB: renamable $r2 = tLDRHi killed renamable $r2, 1, 14 /* CC::al */, $noreg :: (load 2 from %ir.idx3) ; CHECK-LOB: tCMPr killed renamable $r2, renamable $r1, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK-LOB: t2IT 0, 8, implicit-def $itstate ; CHECK-LOB: tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK-LOB: bb.4.while.body: + ; CHECK-LOB: successors: %bb.9(0x04000000), %bb.3(0x7c000000) + ; CHECK-LOB: liveins: $r0, $r1 ; CHECK-LOB: renamable $r0 = tLDRi killed renamable $r0, 0, 14 /* CC::al */, $noreg :: (load 4 from %ir.next4) - ; CHECK-LOB: tCBNZ $r0, %bb.8 + ; CHECK-LOB: tCBNZ $r0, %bb.9 ; CHECK-LOB: t2LE %bb.3 - ; CHECK-LOB: bb.4.while.cond9.preheader: - ; CHECK-LOB: successors: %bb.8(0x30000000), %bb.5(0x50000000) + ; CHECK-LOB: bb.5.while.cond9.preheader: + ; CHECK-LOB: successors: %bb.9(0x30000000), %bb.6(0x50000000) ; CHECK-LOB: liveins: $r0, $r1 - ; CHECK-LOB: tCBZ $r0, %bb.8 - ; CHECK-LOB: bb.5.land.rhs11.lr.ph: - ; CHECK-LOB: successors: %bb.6(0x80000000) + ; CHECK-LOB: tCBZ $r0, %bb.9 + ; CHECK-LOB: bb.6.land.rhs11.lr.ph: + ; CHECK-LOB: successors: %bb.7(0x80000000) ; CHECK-LOB: liveins: $r0, $r1 ; CHECK-LOB: renamable $r1 = t2LDRSHi12 killed renamable $r1, 0, 14 /* CC::al */, $noreg :: (load 2 from %ir.data16143) - ; CHECK-LOB: bb.6.land.rhs11: - ; CHECK-LOB: successors: %bb.9(0x04000000), %bb.7(0x7c000000) + ; CHECK-LOB: bb.7.land.rhs11: + ; CHECK-LOB: successors: %bb.10(0x04000000), %bb.8(0x7c000000) ; CHECK-LOB: liveins: $r0, $r1 ; CHECK-LOB: renamable $r2 = tLDRi renamable $r0, 1, 14 /* CC::al */, $noreg :: (load 4 from %ir.info12) ; CHECK-LOB: renamable $r2 = tLDRBi killed renamable $r2, 0, 14 /* CC::al */, $noreg :: (load 1 from %ir.data165, align 2) ; CHECK-LOB: tCMPr killed renamable $r2, renamable $r1, 14 /* CC::al */, $noreg, implicit-def 
$cpsr - ; CHECK-LOB: tBcc %bb.9, 0 /* CC::eq */, killed $cpsr - ; CHECK-LOB: bb.7.while.body19: - ; CHECK-LOB: successors: %bb.8(0x04000000), %bb.6(0x7c000000) + ; CHECK-LOB: tBcc %bb.10, 0 /* CC::eq */, killed $cpsr + ; CHECK-LOB: bb.8.while.body19: + ; CHECK-LOB: successors: %bb.9(0x04000000), %bb.7(0x7c000000) ; CHECK-LOB: liveins: $r0, $r1 ; CHECK-LOB: renamable $r0 = tLDRi killed renamable $r0, 0, 14 /* CC::al */, $noreg :: (load 4 from %ir.next206) - ; CHECK-LOB: tCBZ $r0, %bb.8 - ; CHECK-LOB: t2LE %bb.6 - ; CHECK-LOB: bb.8: - ; CHECK-LOB: successors: %bb.9(0x80000000) + ; CHECK-LOB: tCBZ $r0, %bb.9 + ; CHECK-LOB: t2LE %bb.7 + ; CHECK-LOB: bb.9: + ; CHECK-LOB: successors: %bb.10(0x80000000) ; CHECK-LOB: renamable $r0, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg - ; CHECK-LOB: bb.9.return: + ; CHECK-LOB: bb.10.return: ; CHECK-LOB: liveins: $r0 ; CHECK-LOB: tBX_RET 14 /* CC::al */, $noreg, implicit killed $r0 ; CHECK-NOLOB-LABEL: name: search ; CHECK-NOLOB: bb.0.entry: - ; CHECK-NOLOB: successors: %bb.1(0x50000000), %bb.4(0x30000000) + ; CHECK-NOLOB: successors: %bb.1(0x50000000), %bb.5(0x30000000) ; CHECK-NOLOB: liveins: $r0, $r1 ; CHECK-NOLOB: renamable $r2 = t2LDRSHi12 renamable $r1, 2, 14 /* CC::al */, $noreg :: (load 2 from %ir.idx) ; CHECK-NOLOB: t2CMPri renamable $r2, -1, 14 /* CC::al */, $noreg, implicit-def $cpsr - ; CHECK-NOLOB: tBcc %bb.4, 13 /* CC::le */, killed $cpsr + ; CHECK-NOLOB: tBcc %bb.5, 13 /* CC::le */, killed $cpsr ; CHECK-NOLOB: bb.1.while.cond.preheader: - ; CHECK-NOLOB: successors: %bb.8(0x30000000), %bb.2(0x50000000) + ; CHECK-NOLOB: successors: %bb.9(0x30000000), %bb.2(0x50000000) ; CHECK-NOLOB: liveins: $r0, $r2 - ; CHECK-NOLOB: tCBZ $r0, %bb.8 + ; CHECK-NOLOB: tCBZ $r0, %bb.9 ; CHECK-NOLOB: bb.2.land.rhs.preheader: ; CHECK-NOLOB: successors: %bb.3(0x80000000) ; CHECK-NOLOB: liveins: $r0, $r2 ; CHECK-NOLOB: renamable $r1 = tUXTH killed renamable $r2, 14 /* CC::al */, $noreg ; CHECK-NOLOB: bb.3.land.rhs: - ; CHECK-NOLOB: 
successors: %bb.8(0x04000000), %bb.3(0x7c000000) + ; CHECK-NOLOB: successors: %bb.4(0x80000000) ; CHECK-NOLOB: liveins: $r0, $r1 ; CHECK-NOLOB: renamable $r2 = tLDRi renamable $r0, 1, 14 /* CC::al */, $noreg :: (load 4 from %ir.info2) ; CHECK-NOLOB: renamable $r2 = tLDRHi killed renamable $r2, 1, 14 /* CC::al */, $noreg :: (load 2 from %ir.idx3) ; CHECK-NOLOB: tCMPr killed renamable $r2, renamable $r1, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK-NOLOB: t2IT 0, 8, implicit-def $itstate ; CHECK-NOLOB: tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK-NOLOB: bb.4.while.body: + ; CHECK-NOLOB: successors: %bb.9(0x04000000), %bb.3(0x7c000000) + ; CHECK-NOLOB: liveins: $r0, $r1 ; CHECK-NOLOB: renamable $r0 = tLDRi killed renamable $r0, 0, 14 /* CC::al */, $noreg :: (load 4 from %ir.next4) ; CHECK-NOLOB: tCMPi8 renamable $r0, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK-NOLOB: tBcc %bb.3, 0 /* CC::eq */, killed $cpsr - ; CHECK-NOLOB: tB %bb.8, 14 /* CC::al */, $noreg - ; CHECK-NOLOB: bb.4.while.cond9.preheader: - ; CHECK-NOLOB: successors: %bb.8(0x30000000), %bb.5(0x50000000) + ; CHECK-NOLOB: tB %bb.9, 14 /* CC::al */, $noreg + ; CHECK-NOLOB: bb.5.while.cond9.preheader: + ; CHECK-NOLOB: successors: %bb.9(0x30000000), %bb.6(0x50000000) ; CHECK-NOLOB: liveins: $r0, $r1 - ; CHECK-NOLOB: tCBZ $r0, %bb.8 - ; CHECK-NOLOB: bb.5.land.rhs11.lr.ph: - ; CHECK-NOLOB: successors: %bb.6(0x80000000) + ; CHECK-NOLOB: tCBZ $r0, %bb.9 + ; CHECK-NOLOB: bb.6.land.rhs11.lr.ph: + ; CHECK-NOLOB: successors: %bb.7(0x80000000) ; CHECK-NOLOB: liveins: $r0, $r1 ; CHECK-NOLOB: renamable $r1 = t2LDRSHi12 killed renamable $r1, 0, 14 /* CC::al */, $noreg :: (load 2 from %ir.data16143) - ; CHECK-NOLOB: bb.6.land.rhs11: - ; CHECK-NOLOB: successors: %bb.9(0x04000000), %bb.7(0x7c000000) + ; CHECK-NOLOB: bb.7.land.rhs11: + ; CHECK-NOLOB: successors: %bb.10(0x04000000), %bb.8(0x7c000000) ; CHECK-NOLOB: liveins: $r0, $r1 ; CHECK-NOLOB: renamable $r2 = 
tLDRi renamable $r0, 1, 14 /* CC::al */, $noreg :: (load 4 from %ir.info12) ; CHECK-NOLOB: renamable $r2 = tLDRBi killed renamable $r2, 0, 14 /* CC::al */, $noreg :: (load 1 from %ir.data165, align 2) ; CHECK-NOLOB: tCMPr killed renamable $r2, renamable $r1, 14 /* CC::al */, $noreg, implicit-def $cpsr - ; CHECK-NOLOB: tBcc %bb.9, 0 /* CC::eq */, killed $cpsr - ; CHECK-NOLOB: bb.7.while.body19: - ; CHECK-NOLOB: successors: %bb.8(0x04000000), %bb.6(0x7c000000) + ; CHECK-NOLOB: tBcc %bb.10, 0 /* CC::eq */, killed $cpsr + ; CHECK-NOLOB: bb.8.while.body19: + ; CHECK-NOLOB: successors: %bb.9(0x04000000), %bb.7(0x7c000000) ; CHECK-NOLOB: liveins: $r0, $r1 ; CHECK-NOLOB: renamable $r0 = tLDRi killed renamable $r0, 0, 14 /* CC::al */, $noreg :: (load 4 from %ir.next206) ; CHECK-NOLOB: tCMPi8 renamable $r0, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr - ; CHECK-NOLOB: tBcc %bb.6, 1 /* CC::ne */, killed $cpsr - ; CHECK-NOLOB: bb.8: - ; CHECK-NOLOB: successors: %bb.9(0x80000000) + ; CHECK-NOLOB: tBcc %bb.7, 1 /* CC::ne */, killed $cpsr + ; CHECK-NOLOB: bb.9: + ; CHECK-NOLOB: successors: %bb.10(0x80000000) ; CHECK-NOLOB: renamable $r0, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg - ; CHECK-NOLOB: bb.9.return: + ; CHECK-NOLOB: bb.10.return: ; CHECK-NOLOB: liveins: $r0 ; CHECK-NOLOB: tBX_RET 14 /* CC::al */, $noreg, implicit killed $r0 bb.0.entry: successors: %bb.5(0x50000000), %bb.1(0x30000000) liveins: $r0, $r1 - renamable $r2 = t2LDRSHi12 renamable $r1, 2, 14, $noreg :: (load 2 from %ir.idx) - t2CMPri renamable $r2, -1, 14, $noreg, implicit-def $cpsr - t2Bcc %bb.1, 13, killed $cpsr + renamable $r2 = t2LDRSHi12 renamable $r1, 2, 14 /* CC::al */, $noreg :: (load 2 from %ir.idx) + t2CMPri renamable $r2, -1, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2Bcc %bb.1, 13 /* CC::le */, killed $cpsr bb.5.while.cond.preheader: - successors: %bb.8(0x30000000), %bb.6(0x50000000) + successors: %bb.9(0x30000000), %bb.6(0x50000000) liveins: $r0, $r2 - tCMPi8 renamable $r0, 0, 14, 
$noreg, implicit-def $cpsr - t2Bcc %bb.8, 0, killed $cpsr + tCMPi8 renamable $r0, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2Bcc %bb.9, 0 /* CC::eq */, killed $cpsr bb.6.land.rhs.preheader: successors: %bb.7(0x80000000) liveins: $r0, $r2 - renamable $r1 = tUXTH killed renamable $r2, 14, $noreg + renamable $r1 = tUXTH killed renamable $r2, 14 /* CC::al */, $noreg bb.7.land.rhs: - successors: %bb.8(0x04000000), %bb.7(0x7c000000) + successors: %bb.8(0x80000000) liveins: $r0, $r1 - renamable $r2 = tLDRi renamable $r0, 1, 14, $noreg :: (load 4 from %ir.info2) - renamable $r2 = tLDRHi killed renamable $r2, 1, 14, $noreg :: (load 2 from %ir.idx3) - tCMPr killed renamable $r2, renamable $r1, 14, $noreg, implicit-def $cpsr + renamable $r2 = tLDRi renamable $r0, 1, 14 /* CC::al */, $noreg :: (load 4 from %ir.info2) + renamable $r2 = tLDRHi killed renamable $r2, 1, 14 /* CC::al */, $noreg :: (load 2 from %ir.idx3) + tCMPr killed renamable $r2, renamable $r1, 14 /* CC::al */, $noreg, implicit-def $cpsr t2IT 0, 8, implicit-def $itstate - tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate - renamable $r0 = tLDRi killed renamable $r0, 0, 14, $noreg :: (load 4 from %ir.next4) - tCMPi8 renamable $r0, 0, 14, $noreg, implicit-def $cpsr - t2Bcc %bb.7, 0, killed $cpsr - t2B %bb.8, 14, $noreg + tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + + bb.8.while.body: + successors: %bb.9(0x04000000), %bb.7(0x7c000000) + liveins: $r0, $r1 + + renamable $r0 = tLDRi killed renamable $r0, 0, 14 /* CC::al */, $noreg :: (load 4 from %ir.next4) + tCMPi8 renamable $r0, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2Bcc %bb.7, 0 /* CC::eq */, killed $cpsr + t2B %bb.9, 14 /* CC::al */, $noreg bb.1.while.cond9.preheader: - successors: %bb.8(0x30000000), %bb.2(0x50000000) + successors: %bb.9(0x30000000), %bb.2(0x50000000) liveins: $r0, $r1 - tCMPi8 renamable $r0, 0, 14, $noreg, implicit-def $cpsr - t2Bcc %bb.8, 0, killed $cpsr + tCMPi8 renamable $r0, 
0, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2Bcc %bb.9, 0 /* CC::eq */, killed $cpsr bb.2.land.rhs11.lr.ph: successors: %bb.3(0x80000000) liveins: $r0, $r1 - renamable $r1 = t2LDRSHi12 killed renamable $r1, 0, 14, $noreg :: (load 2 from %ir.data16143) + renamable $r1 = t2LDRSHi12 killed renamable $r1, 0, 14 /* CC::al */, $noreg :: (load 2 from %ir.data16143) bb.3.land.rhs11: - successors: %bb.9(0x04000000), %bb.4(0x7c000000) + successors: %bb.10(0x04000000), %bb.4(0x7c000000) liveins: $r0, $r1 - renamable $r2 = tLDRi renamable $r0, 1, 14, $noreg :: (load 4 from %ir.info12) - renamable $r2 = tLDRBi killed renamable $r2, 0, 14, $noreg :: (load 1 from %ir.data165, align 2) - tCMPr killed renamable $r2, renamable $r1, 14, $noreg, implicit-def $cpsr - t2Bcc %bb.9, 0, killed $cpsr + renamable $r2 = tLDRi renamable $r0, 1, 14 /* CC::al */, $noreg :: (load 4 from %ir.info12) + renamable $r2 = tLDRBi killed renamable $r2, 0, 14 /* CC::al */, $noreg :: (load 1 from %ir.data165, align 2) + tCMPr killed renamable $r2, renamable $r1, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2Bcc %bb.10, 0 /* CC::eq */, killed $cpsr bb.4.while.body19: - successors: %bb.8(0x04000000), %bb.3(0x7c000000) + successors: %bb.9(0x04000000), %bb.3(0x7c000000) liveins: $r0, $r1 - renamable $r0 = tLDRi killed renamable $r0, 0, 14, $noreg :: (load 4 from %ir.next206) - tCMPi8 renamable $r0, 0, 14, $noreg, implicit-def $cpsr - t2Bcc %bb.3, 1, killed $cpsr + renamable $r0 = tLDRi killed renamable $r0, 0, 14 /* CC::al */, $noreg :: (load 4 from %ir.next206) + tCMPi8 renamable $r0, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2Bcc %bb.3, 1 /* CC::ne */, killed $cpsr - bb.8: - successors: %bb.9(0x80000000) + bb.9: + successors: %bb.10(0x80000000) - renamable $r0, dead $cpsr = tMOVi8 0, 14, $noreg + renamable $r0, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg - bb.9.return: + bb.10.return: liveins: $r0 - tBX_RET 14, $noreg, implicit killed $r0 + tBX_RET 14 /* CC::al */, $noreg, implicit killed 
$r0 ... diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-dec-reorder.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-dec-reorder.mir index ffabd94c93447..e2f61b73f4b78 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-dec-reorder.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-dec-reorder.mir @@ -1,44 +1,40 @@ -# RUN: llc -mtriple=thumbv8.1m.main %s -run-pass=arm-cp-islands --verify-machineinstrs -o - | FileCheck %s -# CHECK-NOT: t2LE +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=thumbv8.1m.main -run-pass=arm-cp-islands %s -o - --verify-machineinstrs | FileCheck %s --- | - target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" - target triple = "thumbv8.1m.main-unknown-unknown" - %struct.head_s = type { %struct.head_s*, %struct.data_s* } %struct.data_s = type { i16, i16 } - - ; Function Attrs: norecurse nounwind readonly - define dso_local arm_aapcscc %struct.head_s* @search(%struct.head_s* readonly %list, %struct.data_s* nocapture readonly %info) local_unnamed_addr #0 { + + define dso_local arm_aapcscc %struct.head_s* @search(%struct.head_s* readonly %list, %struct.data_s* nocapture readonly %info) local_unnamed_addr { entry: %idx = getelementptr inbounds %struct.data_s, %struct.data_s* %info, i32 0, i32 1 %tmp = load i16, i16* %idx, align 2 %cmp = icmp sgt i16 %tmp, -1 br i1 %cmp, label %while.cond.preheader, label %while.cond9.preheader - + while.cond9.preheader: ; preds = %entry %0 = icmp eq %struct.head_s* %list, null br i1 %0, label %return, label %land.rhs11.lr.ph - + land.rhs11.lr.ph: ; preds = %while.cond9.preheader %data16143 = bitcast %struct.data_s* %info to i16* %tmp1 = load i16, i16* %data16143, align 2 %conv15 = sext i16 %tmp1 to i32 br label %land.rhs11 - + while.cond.preheader: ; preds = %entry %1 = icmp eq %struct.head_s* %list, null br i1 %1, label %return, label %land.rhs.preheader - + land.rhs.preheader: ; preds = %while.cond.preheader br label 
%land.rhs - + while.body: ; preds = %land.rhs %next4 = bitcast %struct.head_s* %list.addr.033 to %struct.head_s** %tmp4 = load %struct.head_s*, %struct.head_s** %next4, align 4 %tobool = icmp eq %struct.head_s* %tmp4, null br i1 %tobool, label %return, label %land.rhs - + land.rhs: ; preds = %land.rhs.preheader, %while.body %list.addr.033 = phi %struct.head_s* [ %tmp4, %while.body ], [ %list, %land.rhs.preheader ] %info2 = getelementptr inbounds %struct.head_s, %struct.head_s* %list.addr.033, i32 0, i32 1 @@ -47,13 +43,13 @@ %tmp3 = load i16, i16* %idx3, align 2 %cmp7 = icmp eq i16 %tmp3, %tmp br i1 %cmp7, label %return, label %while.body - + while.body19: ; preds = %land.rhs11 %next205 = bitcast %struct.head_s* %list.addr.136 to %struct.head_s** %tmp8 = load %struct.head_s*, %struct.head_s** %next205, align 4 %tobool10 = icmp eq %struct.head_s* %tmp8, null br i1 %tobool10, label %return, label %land.rhs11 - + land.rhs11: ; preds = %while.body19, %land.rhs11.lr.ph %list.addr.136 = phi %struct.head_s* [ %list, %land.rhs11.lr.ph ], [ %tmp8, %while.body19 ] %info12 = getelementptr inbounds %struct.head_s, %struct.head_s* %list.addr.136, i32 0, i32 1 @@ -64,18 +60,16 @@ %and = zext i16 %2 to i32 %cmp16 = icmp eq i32 %and, %conv15 br i1 %cmp16, label %return, label %while.body19 - + return: ; preds = %land.rhs11, %while.body19, %land.rhs, %while.body, %while.cond.preheader, %while.cond9.preheader - %retval.0 = phi %struct.head_s* [ null, %while.cond.preheader ], [ null, %while.cond9.preheader ], [ %list.addr.033, %land.rhs ], [ null, %while.body ], [ %list.addr.136, %land.rhs11 ], [ null, %while.body19 ] + %retval.0 = phi %struct.head_s* [ null, %while.cond.preheader ], [ null, %while.cond9.preheader ], [ null, %while.body ], [ %list.addr.033, %land.rhs ], [ null, %while.body19 ], [ %list.addr.136, %land.rhs11 ] ret %struct.head_s* %retval.0 } - - attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" 
"frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+lob,+ras,+soft-float,+strict-align,+thumb-mode,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8d16,-fp-armv8d16sp,-fp-armv8sp,-fp16,-fp16fml,-fp64,-fpregs,-fullfp16,-neon,-vfp2,-vfp2d16,-vfp2d16sp,-vfp2sp,-vfp3,-vfp3d16,-vfp3d16sp,-vfp3sp,-vfp4,-vfp4d16,-vfp4d16sp,-vfp4sp" "unsafe-fp-math"="false" "use-soft-float"="true" } - + ... --- name: search -alignment: 1 +alignment: 2 exposesReturnsTwice: false legalized: false regBankSelected: false @@ -112,73 +106,159 @@ callSites: [] constants: [] machineFunctionInfo: {} body: | + ; CHECK-LABEL: name: search + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x50000000), %bb.6(0x30000000) + ; CHECK: liveins: $r0, $r1 + ; CHECK: renamable $r2 = t2LDRSHi12 renamable $r1, 2, 14 /* CC::al */, $noreg :: (load 2 from %ir.idx) + ; CHECK: t2CMPri renamable $r2, -1, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: tBcc %bb.6, 13 /* CC::le */, killed $cpsr + ; CHECK: bb.1.while.cond.preheader: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r2 + ; CHECK: tCMPi8 renamable $r0, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2IT 0, 4, implicit-def $itstate + ; CHECK: renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate + ; CHECK: tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK: tB %bb.2, 14 /* CC::al */, $noreg + ; CHECK: bb.2: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: liveins: $r0, $r2 + ; CHECK: renamable $r1 = tUXTH killed renamable $r2, 14 /* CC::al */, $noreg + ; CHECK: bb.3.land.rhs: + ; CHECK: successors: %bb.5(0x04000000), %bb.4(0x7c000000) + ; CHECK: liveins: $r0, $r1 + ; CHECK: renamable $r2 
= tLDRi renamable $r0, 1, 14 /* CC::al */, $noreg :: (load 4 from %ir.info2) + ; CHECK: renamable $r2 = tLDRHi killed renamable $r2, 1, 14 /* CC::al */, $noreg :: (load 2 from %ir.idx3) + ; CHECK: tCMPr killed renamable $r2, renamable $r1, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: tBcc %bb.5, 0 /* CC::eq */, killed $cpsr + ; CHECK: bb.4.while.body: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: liveins: $r0, $r1 + ; CHECK: renamable $r0 = tLDRi killed renamable $r0, 0, 14 /* CC::al */, $noreg :: (load 4 from %ir.next4) + ; CHECK: tCMPi8 renamable $r0, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2IT 0, 4, implicit-def $itstate + ; CHECK: renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate + ; CHECK: tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK: tB %bb.3, 14 /* CC::al */, $noreg + ; CHECK: bb.5.return: + ; CHECK: liveins: $r0 + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit killed $r0 + ; CHECK: bb.6.while.cond9.preheader: + ; CHECK: successors: %bb.7(0x80000000) + ; CHECK: liveins: $r0, $r1 + ; CHECK: tCMPi8 renamable $r0, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2IT 0, 4, implicit-def $itstate + ; CHECK: renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate + ; CHECK: tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK: tB %bb.7, 14 /* CC::al */, $noreg + ; CHECK: bb.7.land.rhs11.lr.ph: + ; CHECK: successors: %bb.8(0x80000000) + ; CHECK: liveins: $r0, $r1 + ; CHECK: renamable $r1 = t2LDRSHi12 killed renamable $r1, 0, 14 /* CC::al */, $noreg :: (load 2 from %ir.data16143) + ; CHECK: bb.8.land.rhs11: + ; CHECK: successors: %bb.9(0x80000000) + ; CHECK: liveins: $r0, $r1 + ; CHECK: renamable $r2 = tLDRi renamable $r0, 1, 14 /* CC::al */, $noreg :: (load 4 from %ir.info12) + ; CHECK: renamable $r2 = tLDRBi killed renamable $r2, 0, 14 /* CC::al */, $noreg 
:: (load 1 from %ir.data166, align 2) + ; CHECK: tCMPr killed renamable $r2, renamable $r1, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2IT 0, 8, implicit-def $itstate + ; CHECK: tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK: tB %bb.9, 14 /* CC::al */, $noreg + ; CHECK: bb.9.while.body19: + ; CHECK: successors: %bb.8(0x80000000) + ; CHECK: liveins: $r0, $r1 + ; CHECK: renamable $r0 = tLDRi killed renamable $r0, 0, 14 /* CC::al */, $noreg :: (load 4 from %ir.next205) + ; CHECK: tCMPi8 renamable $r0, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2IT 0, 4, implicit-def $itstate + ; CHECK: renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate + ; CHECK: tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK: tB %bb.8, 14 /* CC::al */, $noreg bb.0.entry: - successors: %bb.3(0x50000000), %bb.1(0x30000000) + successors: %bb.2(0x50000000), %bb.1(0x30000000) liveins: $r0, $r1 - - renamable $r2 = t2LDRSHi12 renamable $r1, 2, 14, $noreg :: (load 2 from %ir.idx) - t2CMPri renamable $r2, -1, 14, $noreg, implicit-def $cpsr - t2Bcc %bb.1, 13, killed $cpsr - - bb.3.while.cond.preheader: - successors: %bb.4(0x80000000) + + renamable $r2 = t2LDRSHi12 renamable $r1, 2, 14 /* CC::al */, $noreg :: (load 2 from %ir.idx) + t2CMPri renamable $r2, -1, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2Bcc %bb.1, 13 /* CC::le */, killed $cpsr + + bb.2.while.cond.preheader: + successors: %bb.3(0x50000000) liveins: $r0, $r2 - - tCMPi8 renamable $r0, 0, 14, $noreg, implicit-def $cpsr + + tCMPi8 renamable $r0, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr t2IT 0, 4, implicit-def $itstate - renamable $r0 = tMOVi8 $noreg, 0, 0, $cpsr, implicit killed $r0, implicit $itstate - tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate - renamable $r1 = tUXTH killed renamable $r2, 14, $noreg - + renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, 
implicit killed $r0, implicit $itstate + tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + t2B %bb.3, 14 /* CC::al */, $noreg + + bb.3: + successors: %bb.4(0x80000000) + liveins: $r0, $r2 + + renamable $r1 = tUXTH killed renamable $r2, 14 /* CC::al */, $noreg + bb.4.land.rhs: - successors: %bb.6(0x04000000), %bb.5(0x7c000000) + successors: %bb.9(0x04000000), %bb.5(0x7c000000) liveins: $r0, $r1 - - renamable $r2 = tLDRi renamable $r0, 1, 14, $noreg :: (load 4 from %ir.info2) - renamable $r2 = tLDRHi killed renamable $r2, 1, 14, $noreg :: (load 2 from %ir.idx3) - tCMPr killed renamable $r2, renamable $r1, 14, $noreg, implicit-def $cpsr - t2Bcc %bb.6, 0, killed $cpsr - + + renamable $r2 = tLDRi renamable $r0, 1, 14 /* CC::al */, $noreg :: (load 4 from %ir.info2) + renamable $r2 = tLDRHi killed renamable $r2, 1, 14 /* CC::al */, $noreg :: (load 2 from %ir.idx3) + tCMPr killed renamable $r2, renamable $r1, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2Bcc %bb.9, 0 /* CC::eq */, killed $cpsr + bb.5.while.body: successors: %bb.4(0x7c000000) liveins: $r0, $r1 - - renamable $r0 = tLDRi killed renamable $r0, 0, 14, $noreg :: (load 4 from %ir.next4) - tCMPi8 renamable $r0, 0, 14, $noreg, implicit-def $cpsr + + renamable $r0 = tLDRi killed renamable $r0, 0, 14 /* CC::al */, $noreg :: (load 4 from %ir.next4) + tCMPi8 renamable $r0, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr t2IT 0, 4, implicit-def $itstate - renamable $r0 = tMOVi8 $noreg, 0, 0, $cpsr, implicit killed $r0, implicit $itstate - tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate - t2B %bb.4, 14, $noreg - - bb.6.return: + renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate + tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + t2B %bb.4, 14 /* CC::al */, $noreg + + bb.9.return: liveins: $r0 - - tBX_RET 14, $noreg, implicit $r0 - + + tBX_RET 14 /* CC::al */, $noreg, implicit killed $r0 + 
bb.1.while.cond9.preheader: - successors: %bb.2(0x80000000) + successors: %bb.7(0x50000000) liveins: $r0, $r1 - - tCMPi8 renamable $r0, 0, 14, $noreg, implicit-def $cpsr + + tCMPi8 renamable $r0, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr t2IT 0, 4, implicit-def $itstate - renamable $r0 = tMOVi8 $noreg, 0, 0, $cpsr, implicit killed $r0, implicit $itstate - tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate - renamable $r1 = t2LDRSHi12 killed renamable $r1, 0, 14, $noreg :: (load 2 from %ir.data16143) - - bb.2.land.rhs11: - successors: %bb.2(0x7c000000) + renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate + tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + t2B %bb.7, 14 /* CC::al */, $noreg + + bb.7.land.rhs11.lr.ph: + successors: %bb.8(0x80000000) + liveins: $r0, $r1 + + renamable $r1 = t2LDRSHi12 killed renamable $r1, 0, 14 /* CC::al */, $noreg :: (load 2 from %ir.data16143) + + bb.8.land.rhs11: + successors: %bb.6(0x80000000) liveins: $r0, $r1 - - renamable $r2 = tLDRi renamable $r0, 1, 14, $noreg :: (load 4 from %ir.info12) - renamable $r2 = tLDRBi killed renamable $r2, 0, 14, $noreg :: (load 1 from %ir.data166, align 2) - tCMPr killed renamable $r2, renamable $r1, 14, $noreg, implicit-def $cpsr + + renamable $r2 = tLDRi renamable $r0, 1, 14 /* CC::al */, $noreg :: (load 4 from %ir.info12) + renamable $r2 = tLDRBi killed renamable $r2, 0, 14 /* CC::al */, $noreg :: (load 1 from %ir.data166, align 2) + tCMPr killed renamable $r2, renamable $r1, 14 /* CC::al */, $noreg, implicit-def $cpsr t2IT 0, 8, implicit-def $itstate - tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate - renamable $r0 = tLDRi killed renamable $r0, 0, 14, $noreg :: (load 4 from %ir.next205) - tCMPi8 renamable $r0, 0, 14, $noreg, implicit-def $cpsr + tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + t2B %bb.6, 14 /* CC::al */, $noreg + + bb.6.while.body19: + 
successors: %bb.8(0x7c000000) + liveins: $r0, $r1 + + renamable $r0 = tLDRi killed renamable $r0, 0, 14 /* CC::al */, $noreg :: (load 4 from %ir.next205) + tCMPi8 renamable $r0, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr t2IT 0, 4, implicit-def $itstate - renamable $r0 = tMOVi8 $noreg, 0, 0, $cpsr, implicit killed $r0, implicit $itstate - tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate - t2B %bb.2, 14, $noreg + renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate + tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + t2B %bb.8, 14 /* CC::al */, $noreg ... diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-dec.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-dec.mir index 6d27494341368..e2e44d5e42282 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-dec.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-dec.mir @@ -1,49 +1,39 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=thumbv8.1m.main %s -run-pass=arm-cp-islands --verify-machineinstrs -o - | FileCheck %s --check-prefix=CHECK-LOB # RUN: llc -mtriple=thumbv8.1m.main -mattr=-lob %s -run-pass=arm-cp-islands --verify-machineinstrs -o - | FileCheck %s --check-prefix=CHECK-NOLOB -# CHECK-NOLOB-NOT: t2LE - -# CHECK-LOB: bb.3.land.rhs: -# CHECK-LOB: tCBZ $r0, %bb.8 -# CHECK-LOB: t2LE %bb.3 -# CHECK-LOB: bb.6.land.rhs11: -# CHECK-LOB: bb.7.while.body19: -# CHECK-LOB: tCBZ $r0, %bb.8 -# CHECK-LOB: t2LE %bb.6 -# CHECK-LOB: bb.8: - --- | target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "thumbv8.1m.main-unknown-unknown" - + %struct.head_s = type { %struct.head_s*, %struct.data_s* } %struct.data_s = type { i16, i16 } - + ; Function Attrs: norecurse nounwind readonly - define dso_local arm_aapcscc %struct.head_s* @search(%struct.head_s* readonly %list, %struct.data_s* nocapture readonly %info) local_unnamed_addr #0 { + define 
dso_local arm_aapcscc %struct.head_s* @search(%struct.head_s* readonly %list, %struct.data_s* nocapture readonly %info) local_unnamed_addr { entry: %idx = getelementptr inbounds %struct.data_s, %struct.data_s* %info, i32 0, i32 1 %0 = load i16, i16* %idx, align 2 %cmp = icmp sgt i16 %0, -1 br i1 %cmp, label %while.cond.preheader, label %while.cond9.preheader - + while.cond9.preheader: ; preds = %entry %1 = icmp eq %struct.head_s* %list, null br i1 %1, label %return, label %land.rhs11.lr.ph - + land.rhs11.lr.ph: ; preds = %while.cond9.preheader %data16143 = bitcast %struct.data_s* %info to i16* %2 = load i16, i16* %data16143, align 2 %conv15 = sext i16 %2 to i32 br label %land.rhs11 - + while.cond.preheader: ; preds = %entry %3 = icmp eq %struct.head_s* %list, null br i1 %3, label %return, label %land.rhs.preheader - + land.rhs.preheader: ; preds = %while.cond.preheader br label %land.rhs - + land.rhs: ; preds = %land.rhs.preheader, %while.body %list.addr.033 = phi %struct.head_s* [ %6, %while.body ], [ %list, %land.rhs.preheader ] %info2 = getelementptr inbounds %struct.head_s, %struct.head_s* %list.addr.033, i32 0, i32 1 @@ -52,13 +42,13 @@ %5 = load i16, i16* %idx3, align 2 %cmp7 = icmp eq i16 %5, %0 br i1 %cmp7, label %return, label %while.body - + while.body: ; preds = %land.rhs %next4 = bitcast %struct.head_s* %list.addr.033 to %struct.head_s** %6 = load %struct.head_s*, %struct.head_s** %next4, align 4 %tobool = icmp eq %struct.head_s* %6, null br i1 %tobool, label %return, label %land.rhs - + land.rhs11: ; preds = %while.body19, %land.rhs11.lr.ph %list.addr.136 = phi %struct.head_s* [ %list, %land.rhs11.lr.ph ], [ %10, %while.body19 ] %info12 = getelementptr inbounds %struct.head_s, %struct.head_s* %list.addr.136, i32 0, i32 1 @@ -69,20 +59,18 @@ %and = zext i16 %9 to i32 %cmp16 = icmp eq i32 %and, %conv15 br i1 %cmp16, label %return, label %while.body19 - + while.body19: ; preds = %land.rhs11 %next206 = bitcast %struct.head_s* %list.addr.136 to 
%struct.head_s** %10 = load %struct.head_s*, %struct.head_s** %next206, align 4 %tobool10 = icmp eq %struct.head_s* %10, null br i1 %tobool10, label %return, label %land.rhs11 - + return: ; preds = %while.body19, %land.rhs11, %while.body, %land.rhs, %while.cond.preheader, %while.cond9.preheader %retval.0 = phi %struct.head_s* [ null, %while.cond.preheader ], [ null, %while.cond9.preheader ], [ %list.addr.033, %land.rhs ], [ null, %while.body ], [ %list.addr.136, %land.rhs11 ], [ null, %while.body19 ] ret %struct.head_s* %retval.0 } - - attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+lob,+ras,+soft-float,+strict-align,+thumb-mode,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8d16,-fp-armv8d16sp,-fp-armv8sp,-fp16,-fp16fml,-fp64,-fpregs,-fullfp16,-neon,-vfp2,-vfp2d16,-vfp2d16sp,-vfp2sp,-vfp3,-vfp3d16,-vfp3d16sp,-vfp3sp,-vfp4,-vfp4d16,-vfp4d16sp,-vfp4sp" "unsafe-fp-math"="false" "use-soft-float"="true" } - + ... 
--- name: search @@ -123,79 +111,197 @@ callSites: [] constants: [] machineFunctionInfo: {} body: | + ; CHECK-LOB-LABEL: name: search + ; CHECK-LOB: bb.0.entry: + ; CHECK-LOB: successors: %bb.1(0x50000000), %bb.5(0x30000000) + ; CHECK-LOB: liveins: $r0, $r1 + ; CHECK-LOB: renamable $r2 = t2LDRSHi12 renamable $r1, 2, 14 /* CC::al */, $noreg :: (load 2 from %ir.idx) + ; CHECK-LOB: t2CMPri renamable $r2, -1, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK-LOB: tBcc %bb.5, 13 /* CC::le */, killed $cpsr + ; CHECK-LOB: bb.1.while.cond.preheader: + ; CHECK-LOB: successors: %bb.9(0x30000000), %bb.2(0x50000000) + ; CHECK-LOB: liveins: $r0, $r2 + ; CHECK-LOB: tCBZ $r0, %bb.9 + ; CHECK-LOB: bb.2.land.rhs.preheader: + ; CHECK-LOB: successors: %bb.3(0x80000000) + ; CHECK-LOB: liveins: $r0, $r2 + ; CHECK-LOB: renamable $r1 = tUXTH killed renamable $r2, 14 /* CC::al */, $noreg + ; CHECK-LOB: bb.3.land.rhs: + ; CHECK-LOB: successors: %bb.4(0x80000000) + ; CHECK-LOB: liveins: $r0, $r1 + ; CHECK-LOB: renamable $r2 = tLDRi renamable $r0, 1, 14 /* CC::al */, $noreg :: (load 4 from %ir.info2) + ; CHECK-LOB: renamable $r2 = tLDRHi killed renamable $r2, 1, 14 /* CC::al */, $noreg :: (load 2 from %ir.idx3) + ; CHECK-LOB: tCMPr killed renamable $r2, renamable $r1, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK-LOB: t2IT 0, 8, implicit-def $itstate + ; CHECK-LOB: tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK-LOB: bb.4.while.body: + ; CHECK-LOB: successors: %bb.9(0x04000000), %bb.3(0x7c000000) + ; CHECK-LOB: liveins: $r0, $r1 + ; CHECK-LOB: renamable $r0 = tLDRi killed renamable $r0, 0, 14 /* CC::al */, $noreg :: (load 4 from %ir.next4) + ; CHECK-LOB: tCBZ $r0, %bb.9 + ; CHECK-LOB: t2LE %bb.3 + ; CHECK-LOB: bb.5.while.cond9.preheader: + ; CHECK-LOB: successors: %bb.9(0x30000000), %bb.6(0x50000000) + ; CHECK-LOB: liveins: $r0, $r1 + ; CHECK-LOB: tCBZ $r0, %bb.9 + ; CHECK-LOB: bb.6.land.rhs11.lr.ph: + ; CHECK-LOB: successors: 
%bb.7(0x80000000) + ; CHECK-LOB: liveins: $r0, $r1 + ; CHECK-LOB: renamable $r1 = t2LDRSHi12 killed renamable $r1, 0, 14 /* CC::al */, $noreg :: (load 2 from %ir.data16143) + ; CHECK-LOB: bb.7.land.rhs11: + ; CHECK-LOB: successors: %bb.10(0x04000000), %bb.8(0x7c000000) + ; CHECK-LOB: liveins: $r0, $r1 + ; CHECK-LOB: renamable $r2 = tLDRi renamable $r0, 1, 14 /* CC::al */, $noreg :: (load 4 from %ir.info12) + ; CHECK-LOB: renamable $r2 = tLDRBi killed renamable $r2, 0, 14 /* CC::al */, $noreg :: (load 1 from %ir.data165, align 2) + ; CHECK-LOB: tCMPr killed renamable $r2, renamable $r1, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK-LOB: tBcc %bb.10, 0 /* CC::eq */, killed $cpsr + ; CHECK-LOB: bb.8.while.body19: + ; CHECK-LOB: successors: %bb.9(0x04000000), %bb.7(0x7c000000) + ; CHECK-LOB: liveins: $r0, $r1 + ; CHECK-LOB: renamable $r0 = tLDRi killed renamable $r0, 0, 14 /* CC::al */, $noreg :: (load 4 from %ir.next206) + ; CHECK-LOB: tCBZ $r0, %bb.9 + ; CHECK-LOB: t2LE %bb.7 + ; CHECK-LOB: bb.9: + ; CHECK-LOB: successors: %bb.10(0x80000000) + ; CHECK-LOB: renamable $r0, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg + ; CHECK-LOB: bb.10.return: + ; CHECK-LOB: liveins: $r0 + ; CHECK-LOB: tBX_RET 14 /* CC::al */, $noreg, implicit killed $r0 + ; CHECK-NOLOB-LABEL: name: search + ; CHECK-NOLOB: bb.0.entry: + ; CHECK-NOLOB: successors: %bb.1(0x50000000), %bb.5(0x30000000) + ; CHECK-NOLOB: liveins: $r0, $r1 + ; CHECK-NOLOB: renamable $r2 = t2LDRSHi12 renamable $r1, 2, 14 /* CC::al */, $noreg :: (load 2 from %ir.idx) + ; CHECK-NOLOB: t2CMPri renamable $r2, -1, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK-NOLOB: tBcc %bb.5, 13 /* CC::le */, killed $cpsr + ; CHECK-NOLOB: bb.1.while.cond.preheader: + ; CHECK-NOLOB: successors: %bb.9(0x30000000), %bb.2(0x50000000) + ; CHECK-NOLOB: liveins: $r0, $r2 + ; CHECK-NOLOB: tCBZ $r0, %bb.9 + ; CHECK-NOLOB: bb.2.land.rhs.preheader: + ; CHECK-NOLOB: successors: %bb.3(0x80000000) + ; CHECK-NOLOB: liveins: $r0, $r2 + ; 
CHECK-NOLOB: renamable $r1 = tUXTH killed renamable $r2, 14 /* CC::al */, $noreg + ; CHECK-NOLOB: bb.3.land.rhs: + ; CHECK-NOLOB: successors: %bb.4(0x80000000) + ; CHECK-NOLOB: liveins: $r0, $r1 + ; CHECK-NOLOB: renamable $r2 = tLDRi renamable $r0, 1, 14 /* CC::al */, $noreg :: (load 4 from %ir.info2) + ; CHECK-NOLOB: renamable $r2 = tLDRHi killed renamable $r2, 1, 14 /* CC::al */, $noreg :: (load 2 from %ir.idx3) + ; CHECK-NOLOB: tCMPr killed renamable $r2, renamable $r1, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK-NOLOB: t2IT 0, 8, implicit-def $itstate + ; CHECK-NOLOB: tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK-NOLOB: bb.4.while.body: + ; CHECK-NOLOB: successors: %bb.9(0x04000000), %bb.3(0x7c000000) + ; CHECK-NOLOB: liveins: $r0, $r1 + ; CHECK-NOLOB: renamable $r0 = tLDRi killed renamable $r0, 0, 14 /* CC::al */, $noreg :: (load 4 from %ir.next4) + ; CHECK-NOLOB: tCMPi8 renamable $r0, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK-NOLOB: tBcc %bb.3, 1 /* CC::ne */, killed $cpsr + ; CHECK-NOLOB: tB %bb.9, 14 /* CC::al */, $noreg + ; CHECK-NOLOB: bb.5.while.cond9.preheader: + ; CHECK-NOLOB: successors: %bb.9(0x30000000), %bb.6(0x50000000) + ; CHECK-NOLOB: liveins: $r0, $r1 + ; CHECK-NOLOB: tCBZ $r0, %bb.9 + ; CHECK-NOLOB: bb.6.land.rhs11.lr.ph: + ; CHECK-NOLOB: successors: %bb.7(0x80000000) + ; CHECK-NOLOB: liveins: $r0, $r1 + ; CHECK-NOLOB: renamable $r1 = t2LDRSHi12 killed renamable $r1, 0, 14 /* CC::al */, $noreg :: (load 2 from %ir.data16143) + ; CHECK-NOLOB: bb.7.land.rhs11: + ; CHECK-NOLOB: successors: %bb.10(0x04000000), %bb.8(0x7c000000) + ; CHECK-NOLOB: liveins: $r0, $r1 + ; CHECK-NOLOB: renamable $r2 = tLDRi renamable $r0, 1, 14 /* CC::al */, $noreg :: (load 4 from %ir.info12) + ; CHECK-NOLOB: renamable $r2 = tLDRBi killed renamable $r2, 0, 14 /* CC::al */, $noreg :: (load 1 from %ir.data165, align 2) + ; CHECK-NOLOB: tCMPr killed renamable $r2, renamable $r1, 14 /* CC::al */, $noreg, 
implicit-def $cpsr + ; CHECK-NOLOB: tBcc %bb.10, 0 /* CC::eq */, killed $cpsr + ; CHECK-NOLOB: bb.8.while.body19: + ; CHECK-NOLOB: successors: %bb.9(0x04000000), %bb.7(0x7c000000) + ; CHECK-NOLOB: liveins: $r0, $r1 + ; CHECK-NOLOB: renamable $r0 = tLDRi killed renamable $r0, 0, 14 /* CC::al */, $noreg :: (load 4 from %ir.next206) + ; CHECK-NOLOB: tCMPi8 renamable $r0, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK-NOLOB: tBcc %bb.7, 1 /* CC::ne */, killed $cpsr + ; CHECK-NOLOB: bb.9: + ; CHECK-NOLOB: successors: %bb.10(0x80000000) + ; CHECK-NOLOB: renamable $r0, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg + ; CHECK-NOLOB: bb.10.return: + ; CHECK-NOLOB: liveins: $r0 + ; CHECK-NOLOB: tBX_RET 14 /* CC::al */, $noreg, implicit killed $r0 bb.0.entry: successors: %bb.5(0x50000000), %bb.1(0x30000000) liveins: $r0, $r1 - + renamable $r2 = t2LDRSHi12 renamable $r1, 2, 14, $noreg :: (load 2 from %ir.idx) t2CMPri renamable $r2, -1, 14, $noreg, implicit-def $cpsr t2Bcc %bb.1, 13, killed $cpsr - + bb.5.while.cond.preheader: successors: %bb.8(0x30000000), %bb.6(0x50000000) liveins: $r0, $r2 - + tCMPi8 renamable $r0, 0, 14, $noreg, implicit-def $cpsr t2Bcc %bb.8, 0, killed $cpsr - + bb.6.land.rhs.preheader: successors: %bb.7(0x80000000) liveins: $r0, $r2 - + renamable $r1 = tUXTH killed renamable $r2, 14, $noreg - + bb.7.land.rhs: - successors: %bb.8(0x04000000), %bb.7(0x7c000000) + successors: %bb.10(0x80000000) liveins: $r0, $r1 - + renamable $r2 = tLDRi renamable $r0, 1, 14, $noreg :: (load 4 from %ir.info2) renamable $r2 = tLDRHi killed renamable $r2, 1, 14, $noreg :: (load 2 from %ir.idx3) tCMPr killed renamable $r2, renamable $r1, 14, $noreg, implicit-def $cpsr t2IT 0, 8, implicit-def $itstate tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate + + bb.10.while.body: + successors: %bb.8(0x04000000), %bb.7(0x7c000000) + liveins: $r0, $r1 + renamable $r0 = tLDRi killed renamable $r0, 0, 14, $noreg :: (load 4 from %ir.next4) tCMPi8 renamable $r0, 0, 14, 
$noreg, implicit-def $cpsr t2Bcc %bb.7, 1, killed $cpsr t2B %bb.8, 14, $noreg - + bb.1.while.cond9.preheader: successors: %bb.8(0x30000000), %bb.2(0x50000000) liveins: $r0, $r1 - + tCMPi8 renamable $r0, 0, 14, $noreg, implicit-def $cpsr t2Bcc %bb.8, 0, killed $cpsr - + bb.2.land.rhs11.lr.ph: successors: %bb.3(0x80000000) liveins: $r0, $r1 - + renamable $r1 = t2LDRSHi12 killed renamable $r1, 0, 14, $noreg :: (load 2 from %ir.data16143) - + bb.3.land.rhs11: successors: %bb.9(0x04000000), %bb.4(0x7c000000) liveins: $r0, $r1 - + renamable $r2 = tLDRi renamable $r0, 1, 14, $noreg :: (load 4 from %ir.info12) renamable $r2 = tLDRBi killed renamable $r2, 0, 14, $noreg :: (load 1 from %ir.data165, align 2) tCMPr killed renamable $r2, renamable $r1, 14, $noreg, implicit-def $cpsr t2Bcc %bb.9, 0, killed $cpsr - + bb.4.while.body19: successors: %bb.8(0x04000000), %bb.3(0x7c000000) liveins: $r0, $r1 - + renamable $r0 = tLDRi killed renamable $r0, 0, 14, $noreg :: (load 4 from %ir.next206) tCMPi8 renamable $r0, 0, 14, $noreg, implicit-def $cpsr t2Bcc %bb.3, 1, killed $cpsr - + bb.8: successors: %bb.9(0x80000000) - + renamable $r0, dead $cpsr = tMOVi8 0, 14, $noreg - + bb.9.return: liveins: $r0 - + tBX_RET 14, $noreg, implicit killed $r0 ... 
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir index d0bb6e4160c71..15166cece6acb 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir @@ -108,6 +108,9 @@ body: | ; CHECK: t2IT 0, 4, implicit-def $itstate ; CHECK: renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate ; CHECK: tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r7 ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -120,8 +123,8 @@ body: | ; CHECK: renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: dead $lr = t2DLS renamable $r12 ; CHECK: $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $q0, $r0, $r1, $r2, $r3 ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg ; CHECK: MVE_VPST 4, implicit $vpr @@ -132,8 +135,8 @@ body: | ; CHECK: renamable $r3, dead $cpsr = nsw tSUBi8 killed $r3, 1, 14 /* CC::al */, $noreg ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 - ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.1 - ; CHECK: bb.2.middle.block: + ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: bb.3.middle.block: ; CHECK: liveins: $q0 ; 
CHECK: renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $r0 @@ -145,6 +148,11 @@ body: | t2IT 0, 4, implicit-def $itstate renamable $r0 = tMOVi8 $noreg, 0, 0, $cpsr, implicit killed $r0, implicit $itstate tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $lr, $r7 + frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -158,8 +166,8 @@ body: | t2DoLoopStart renamable $r12 $r3 = tMOVr killed $r12, 14, $noreg - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.vector.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $q0, $r0, $r1, $r2, $r3 renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg @@ -172,10 +180,10 @@ body: | renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg renamable $q0 = MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 renamable $lr = t2LoopDec killed renamable $lr, 1 - t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14, $noreg + t2LoopEnd killed renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14, $noreg - bb.2.middle.block: + bb.3.middle.block: liveins: $q0 renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-load.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-load.mir index c006add3bf4cc..65f9cc3176b1e 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-load.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-load.mir @@ -108,12 +108,15 @@ body: | ; CHECK-LABEL: name: non_masked_load ; CHECK: bb.0.entry: ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: liveins: $lr, $r0, $r1, $r2 + ; CHECK: 
liveins: $lr, $r0, $r1, $r2, $r7 ; CHECK: tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 0, 2, implicit-def $itstate ; CHECK: renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate ; CHECK: renamable $r0 = tUXTB killed renamable $r0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate ; CHECK: tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2 ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $lr, implicit-def $sp, implicit $sp ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -129,8 +132,8 @@ body: | ; CHECK: renamable $r3 = t2LSRri killed renamable $r12, 4, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r3 = t2SUBrs renamable $r2, killed renamable $r3, 34, 14 /* CC::al */, $noreg, $noreg ; CHECK: $lr = t2DLS killed renamable $lr - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r3 ; CHECK: renamable $vpr = MVE_VCTP8 renamable $r2, 0, $noreg ; CHECK: $q1 = MVE_VORR killed $q0, killed $q0, 0, $noreg, undef $q1 @@ -140,8 +143,8 @@ body: | ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 16, 14 /* CC::al */, $noreg ; CHECK: renamable $q2 = MVE_VADDi8 killed renamable $q2, renamable $q1, 0, $noreg, undef renamable $q2 ; CHECK: renamable $q0 = MVE_VADDi8 killed renamable $q2, killed renamable $q0, 0, $noreg, undef renamable $q0 - ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 - ; CHECK: bb.2.middle.block: + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: bb.3.middle.block: ; CHECK: liveins: $q0, $q1, $r3 ; CHECK: renamable $vpr = MVE_VCTP8 killed renamable $r3, 0, $noreg ; 
CHECK: renamable $q0 = MVE_VPSEL killed renamable $q0, killed renamable $q1, 0, killed renamable $vpr @@ -158,6 +161,11 @@ body: | renamable $r0 = tMOVi8 $noreg, 0, 0, $cpsr, implicit killed $r0, implicit $itstate renamable $r0 = tUXTB killed renamable $r0, 0, $cpsr, implicit killed $r0, implicit $itstate tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $lr + frame-setup tPUSH 14, $noreg, killed $lr, implicit-def $sp, implicit $sp frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -174,8 +182,8 @@ body: | renamable $r3 = t2SUBrs renamable $r2, killed renamable $r3, 34, 14, $noreg, $noreg t2DoLoopStart renamable $lr - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.vector.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $lr, $q0, $r0, $r1, $r2, $r3 renamable $vpr = MVE_VCTP8 renamable $r2, 0, $noreg @@ -187,10 +195,10 @@ body: | renamable $q2 = MVE_VADDi8 killed renamable $q2, renamable $q1, 0, $noreg, undef renamable $q2 renamable $lr = t2LoopDec killed renamable $lr, 1 renamable $q0 = MVE_VADDi8 killed renamable $q2, killed renamable $q0, 0, $noreg, undef renamable $q0 - t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14, $noreg + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14, $noreg - bb.2.middle.block: + bb.3.middle.block: liveins: $q0, $q1, $r3 renamable $vpr = MVE_VCTP8 killed renamable $r3, 0, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-store.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-store.mir index 45a0b2e977377..57d8014712a5b 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-store.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-store.mir @@ -110,14 +110,17 @@ body: | ; CHECK: tCMPi8 renamable $r3, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 0, 8, 
implicit-def $itstate ; CHECK: tPOP_RET 0 /* CC::eq */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3 ; CHECK: renamable $r12 = t2ADDri renamable $r3, 15, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $lr = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r12 = t2BICri killed renamable $r12, 15, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 16, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 35, 14 /* CC::al */, $noreg, $noreg ; CHECK: $lr = t2DLS killed renamable $lr - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 ; CHECK: renamable $vpr = MVE_VCTP8 renamable $r3, 0, $noreg ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 16, 14 /* CC::al */, $noreg @@ -126,8 +129,8 @@ body: | ; CHECK: renamable $r2, renamable $q1 = MVE_VLDRBU8_post killed renamable $r2, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv1618, align 1) ; CHECK: renamable $q0 = MVE_VADDi8 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 ; CHECK: renamable $r0 = MVE_VSTRBU8_post killed renamable $q0, killed renamable $r0, 16, 1, $noreg :: (store 16 into %ir.lsr.iv1921, align 1) - ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 - ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: bb.3.for.cond.cleanup: ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc bb.0.entry: successors: %bb.1(0x80000000) @@ -142,6 +145,11 @@ body: | tCMPi8 renamable $r3, 0, 14, $noreg, implicit-def $cpsr t2IT 0, 8, implicit-def $itstate tPOP_RET 0, killed $cpsr, def $r7, def $pc, implicit killed $itstate + 
+ bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r3, $lr + renamable $r12 = t2ADDri renamable $r3, 15, 14, $noreg, $noreg renamable $lr = t2MOVi 1, 14, $noreg, $noreg renamable $r12 = t2BICri killed renamable $r12, 15, 14, $noreg, $noreg @@ -149,8 +157,8 @@ body: | renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 35, 14, $noreg, $noreg t2DoLoopStart renamable $lr - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.vector.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $lr, $r0, $r1, $r2, $r3 renamable $vpr = MVE_VCTP8 renamable $r3, 0, $noreg @@ -161,10 +169,10 @@ body: | renamable $lr = t2LoopDec killed renamable $lr, 1 renamable $q0 = MVE_VADDi8 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 renamable $r0 = MVE_VSTRBU8_post killed renamable $q0, killed renamable $r0, 16, 1, $noreg :: (store 16 into %ir.lsr.iv1921, align 1) - t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14, $noreg + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14, $noreg - bb.2.for.cond.cleanup: + bb.3.for.cond.cleanup: tPOP_RET 14, $noreg, def $r7, def $pc ... 
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll index 66601dd66cb29..a4961f51f32b8 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll @@ -9,6 +9,7 @@ define dso_local arm_aapcs_vfpcc zeroext i8 @one_loop_add_add_v16i8(i8* nocaptur ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: uxtbeq r0, r0 ; CHECK-NEXT: bxeq lr +; CHECK-NEXT: .LBB0_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: add.w r3, r2, #15 ; CHECK-NEXT: vmov.i32 q1, #0x0 @@ -17,7 +18,7 @@ define dso_local arm_aapcs_vfpcc zeroext i8 @one_loop_add_add_v16i8(i8* nocaptur ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #4 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.8 r2 ; CHECK-NEXT: vmov q0, q1 @@ -28,8 +29,8 @@ define dso_local arm_aapcs_vfpcc zeroext i8 @one_loop_add_add_v16i8(i8* nocaptur ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrbt.u8 q2, [r1], #16 ; CHECK-NEXT: vadd.i8 q1, q1, q2 -; CHECK-NEXT: le lr, .LBB0_1 -; CHECK-NEXT: @ %bb.2: @ %middle.block +; CHECK-NEXT: le lr, .LBB0_2 +; CHECK-NEXT: @ %bb.3: @ %middle.block ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u8 r0, q0 ; CHECK-NEXT: pop.w {r7, lr} @@ -49,7 +50,7 @@ vector.body: ; preds = %vector.body, %vecto %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi <16 x i8> [ zeroinitializer, %vector.ph ], [ %i5, %vector.body ] %i = getelementptr inbounds i8, i8* %a, i32 %index - %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N) %i1 = bitcast i8* %i to <16 x i8>* %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %i1, i32 1, <16 x i1> 
%active.lane.mask, <16 x i8> undef) %i2 = getelementptr inbounds i8, i8* %b, i32 %index @@ -79,6 +80,7 @@ define dso_local arm_aapcs_vfpcc signext i16 @one_loop_add_add_v8i16(i8* nocaptu ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: sxtheq r0, r0 ; CHECK-NEXT: bxeq lr +; CHECK-NEXT: .LBB1_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: adds r3, r2, #7 ; CHECK-NEXT: vmov.i32 q1, #0x0 @@ -87,7 +89,7 @@ define dso_local arm_aapcs_vfpcc signext i16 @one_loop_add_add_v8i16(i8* nocaptu ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #3 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.16 r2 ; CHECK-NEXT: vmov q0, q1 @@ -98,8 +100,8 @@ define dso_local arm_aapcs_vfpcc signext i16 @one_loop_add_add_v8i16(i8* nocaptu ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrbt.u16 q2, [r1], #8 ; CHECK-NEXT: vadd.i16 q1, q1, q2 -; CHECK-NEXT: le lr, .LBB1_1 -; CHECK-NEXT: @ %bb.2: @ %middle.block +; CHECK-NEXT: le lr, .LBB1_2 +; CHECK-NEXT: @ %bb.3: @ %middle.block ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u16 r0, q0 ; CHECK-NEXT: pop.w {r7, lr} @@ -119,7 +121,7 @@ vector.body: ; preds = %vector.body, %vecto %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %i7, %vector.body ] %i = getelementptr inbounds i8, i8* %a, i32 %index - %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N) %i1 = bitcast i8* %i to <8 x i8>* %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %i1, i32 1, <8 x i1> %active.lane.mask, <8 x i8> undef) %i2 = zext <8 x i8> %wide.masked.load to <8 x i16> @@ -151,17 +153,18 @@ define dso_local arm_aapcs_vfpcc zeroext i8 @one_loop_sub_add_v16i8(i8* nocaptur ; CHECK-NEXT: moveq r0, #0 ; 
CHECK-NEXT: uxtbeq r0, r0 ; CHECK-NEXT: bxeq lr +; CHECK-NEXT: .LBB2_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: dlstp.8 lr, r2 -; CHECK-NEXT: .LBB2_1: @ %vector.body +; CHECK-NEXT: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrb.u8 q1, [r1], #16 ; CHECK-NEXT: vldrb.u8 q2, [r0], #16 ; CHECK-NEXT: vsub.i8 q1, q2, q1 ; CHECK-NEXT: vadd.i8 q0, q1, q0 -; CHECK-NEXT: letp lr, .LBB2_1 -; CHECK-NEXT: @ %bb.2: @ %middle.block +; CHECK-NEXT: letp lr, .LBB2_2 +; CHECK-NEXT: @ %bb.3: @ %middle.block ; CHECK-NEXT: vaddv.u8 r0, q0 ; CHECK-NEXT: pop.w {r7, lr} ; CHECK-NEXT: uxtb r0, r0 @@ -180,7 +183,7 @@ vector.body: ; preds = %vector.body, %vecto %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi <16 x i8> [ zeroinitializer, %vector.ph ], [ %i5, %vector.body ] %i = getelementptr inbounds i8, i8* %a, i32 %index - %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N) %i1 = bitcast i8* %i to <16 x i8>* %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %i1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef) %i2 = getelementptr inbounds i8, i8* %b, i32 %index @@ -210,17 +213,18 @@ define dso_local arm_aapcs_vfpcc signext i16 @one_loop_sub_add_v8i16(i8* nocaptu ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: sxtheq r0, r0 ; CHECK-NEXT: bxeq lr +; CHECK-NEXT: .LBB3_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: dlstp.16 lr, r2 -; CHECK-NEXT: .LBB3_1: @ %vector.body +; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrb.u16 q1, [r0], #8 ; CHECK-NEXT: vldrb.u16 q2, [r1], #8 ; CHECK-NEXT: vsub.i16 q1, q2, q1 ; CHECK-NEXT: vadd.i16 q0, q1, q0 -; CHECK-NEXT: letp lr, .LBB3_1 -; CHECK-NEXT: @ %bb.2: @ 
%middle.block +; CHECK-NEXT: letp lr, .LBB3_2 +; CHECK-NEXT: @ %bb.3: @ %middle.block ; CHECK-NEXT: vaddv.u16 r0, q0 ; CHECK-NEXT: pop.w {r7, lr} ; CHECK-NEXT: sxth r0, r0 @@ -239,7 +243,7 @@ vector.body: ; preds = %vector.body, %vecto %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %i7, %vector.body ] %i = getelementptr inbounds i8, i8* %a, i32 %index - %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N) %i1 = bitcast i8* %i to <8 x i8>* %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %i1, i32 1, <8 x i1> %active.lane.mask, <8 x i8> undef) %i2 = zext <8 x i8> %wide.masked.load to <8 x i16> @@ -271,17 +275,18 @@ define dso_local arm_aapcs_vfpcc zeroext i8 @one_loop_mul_add_v16i8(i8* nocaptur ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: uxtbeq r0, r0 ; CHECK-NEXT: bxeq lr +; CHECK-NEXT: .LBB4_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: dlstp.8 lr, r2 -; CHECK-NEXT: .LBB4_1: @ %vector.body +; CHECK-NEXT: .LBB4_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrb.u8 q1, [r0], #16 ; CHECK-NEXT: vldrb.u8 q2, [r1], #16 ; CHECK-NEXT: vmul.i8 q1, q2, q1 ; CHECK-NEXT: vadd.i8 q0, q1, q0 -; CHECK-NEXT: letp lr, .LBB4_1 -; CHECK-NEXT: @ %bb.2: @ %middle.block +; CHECK-NEXT: letp lr, .LBB4_2 +; CHECK-NEXT: @ %bb.3: @ %middle.block ; CHECK-NEXT: vaddv.u8 r0, q0 ; CHECK-NEXT: pop.w {r7, lr} ; CHECK-NEXT: uxtb r0, r0 @@ -300,7 +305,7 @@ vector.body: ; preds = %vector.body, %vecto %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi <16 x i8> [ zeroinitializer, %vector.ph ], [ %i5, %vector.body ] %i = getelementptr inbounds i8, i8* %a, i32 %index - %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, 
i32 %trip.count.minus.1) + %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N) %i1 = bitcast i8* %i to <16 x i8>* %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %i1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef) %i2 = getelementptr inbounds i8, i8* %b, i32 %index @@ -330,17 +335,18 @@ define dso_local arm_aapcs_vfpcc signext i16 @one_loop_mul_add_v8i16(i8* nocaptu ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: sxtheq r0, r0 ; CHECK-NEXT: bxeq lr +; CHECK-NEXT: .LBB5_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: dlstp.16 lr, r2 -; CHECK-NEXT: .LBB5_1: @ %vector.body +; CHECK-NEXT: .LBB5_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrb.u16 q1, [r0], #8 ; CHECK-NEXT: vldrb.u16 q2, [r1], #8 ; CHECK-NEXT: vmul.i16 q1, q2, q1 ; CHECK-NEXT: vadd.i16 q0, q1, q0 -; CHECK-NEXT: letp lr, .LBB5_1 -; CHECK-NEXT: @ %bb.2: @ %middle.block +; CHECK-NEXT: letp lr, .LBB5_2 +; CHECK-NEXT: @ %bb.3: @ %middle.block ; CHECK-NEXT: vaddv.u16 r0, q0 ; CHECK-NEXT: pop.w {r7, lr} ; CHECK-NEXT: sxth r0, r0 @@ -359,7 +365,7 @@ vector.body: ; preds = %vector.body, %vecto %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %i7, %vector.body ] %i = getelementptr inbounds i8, i8* %a, i32 %index - %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N) %i1 = bitcast i8* %i to <8 x i8>* %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %i1, i32 1, <8 x i1> %active.lane.mask, <8 x i8> undef) %i2 = zext <8 x i8> %wide.masked.load to <8 x i16> @@ -450,7 +456,7 @@ vector.body: ; preds = %vector.body, %vecto %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi <4 x i32> [ zeroinitializer, 
%vector.ph ], [ %i7, %vector.body ] %i = getelementptr inbounds i8, i8* %a, i32 %index - %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) %i1 = bitcast i8* %i to <4 x i8>* %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %i1, i32 1, <4 x i1> %active.lane.mask, <4 x i8> undef) %i2 = zext <4 x i8> %wide.masked.load to <4 x i32> @@ -480,7 +486,7 @@ vector.body46: ; preds = %vector.body46, %vec %index51 = phi i32 [ 0, %vector.ph47 ], [ %index.next52, %vector.body46 ] %vec.phi60 = phi <4 x i32> [ %i11, %vector.ph47 ], [ %i19, %vector.body46 ] %i12 = getelementptr inbounds i8, i8* %a, i32 %index51 - %active.lane.mask61 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index51, i32 %trip.count.minus.154) + %active.lane.mask61 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index51, i32 %N) %i13 = bitcast i8* %i12 to <4 x i8>* %wide.masked.load62 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %i13, i32 1, <4 x i1> %active.lane.mask61, <4 x i8> undef) %i14 = zext <4 x i8> %wide.masked.load62 to <4 x i32> @@ -564,7 +570,7 @@ vector.body: ; preds = %vector.body, %vecto %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %i8, %vector.body ] %vec.phi.1 = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %i9, %vector.body ] %i = getelementptr inbounds i8, i8* %a, i32 %index - %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N) %i1 = bitcast i8* %i to <8 x i8>* %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %i1, i32 1, <8 x i1> %active.lane.mask, <8 x i8> undef) %i2 = zext <8 x i8> %wide.masked.load to <8 x i16> @@ -597,6 +603,119 @@ for.cond.cleanup: ; preds = %middle.block, %entr ret void } +%struct.date 
= type { i32, i32, i32, i32 } +@days = internal unnamed_addr constant [2 x [13 x i32]] [[13 x i32] [i32 0, i32 31, i32 28, i32 31, i32 30, i32 31, i32 30, i32 31, i32 31, i32 30, i32 31, i32 30, i32 31], [13 x i32] [i32 0, i32 31, i32 29, i32 31, i32 30, i32 31, i32 30, i32 31, i32 31, i32 30, i32 31, i32 30, i32 31]], align 4 +define i32 @wrongop(%struct.date* nocapture readonly %pd) { +; CHECK-LABEL: wrongop: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: mov r12, r0 +; CHECK-NEXT: ldr r0, [r0] +; CHECK-NEXT: ldr.w r2, [r12, #8] +; CHECK-NEXT: lsls r3, r2, #30 +; CHECK-NEXT: bne .LBB8_3 +; CHECK-NEXT: @ %bb.1: @ %entry +; CHECK-NEXT: movw r3, #34079 +; CHECK-NEXT: movt r3, #20971 +; CHECK-NEXT: smmul r3, r2, r3 +; CHECK-NEXT: asrs r1, r3, #5 +; CHECK-NEXT: add.w r1, r1, r3, lsr #31 +; CHECK-NEXT: movs r3, #100 +; CHECK-NEXT: mls r1, r1, r3, r2 +; CHECK-NEXT: cbz r1, .LBB8_3 +; CHECK-NEXT: @ %bb.2: +; CHECK-NEXT: movs r4, #1 +; CHECK-NEXT: b .LBB8_4 +; CHECK-NEXT: .LBB8_3: @ %lor.rhs +; CHECK-NEXT: movw r1, #47184 +; CHECK-NEXT: movw r3, #23593 +; CHECK-NEXT: movt r1, #1310 +; CHECK-NEXT: movt r3, #49807 +; CHECK-NEXT: mla r1, r2, r3, r1 +; CHECK-NEXT: movw r2, #55051 +; CHECK-NEXT: movt r2, #163 +; CHECK-NEXT: ror.w r1, r1, #4 +; CHECK-NEXT: cmp r1, r2 +; CHECK-NEXT: cset r4, lo +; CHECK-NEXT: .LBB8_4: @ %lor.end +; CHECK-NEXT: ldr.w r3, [r12, #4] +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: poplt {r4, pc} +; CHECK-NEXT: .LBB8_5: @ %vector.ph +; CHECK-NEXT: movw r1, :lower16:days +; CHECK-NEXT: movt r1, :upper16:days +; CHECK-NEXT: movs r2, #52 +; CHECK-NEXT: mla r1, r4, r2, r1 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: vdup.32 q0, r2 +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: subs r0, r3, #1 +; CHECK-NEXT: dlstp.32 lr, r0 +; CHECK-NEXT: .LBB8_6: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r1], #16 +; CHECK-NEXT: vadd.i32 q1, q0, q1 +; CHECK-NEXT: letp lr, .LBB8_6 +; 
CHECK-NEXT: @ %bb.7: @ %middle.block +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vaddv.u32 r0, q0 +; CHECK-NEXT: pop {r4, pc} +entry: + %day1 = getelementptr inbounds %struct.date, %struct.date* %pd, i32 0, i32 0 + %0 = load i32, i32* %day1, align 4 + %year = getelementptr inbounds %struct.date, %struct.date* %pd, i32 0, i32 2 + %1 = load i32, i32* %year, align 4 + %2 = and i32 %1, 3 + %cmp = icmp ne i32 %2, 0 + %rem3 = srem i32 %1, 100 + %cmp4.not = icmp eq i32 %rem3, 0 + %or.cond = or i1 %cmp, %cmp4.not + br i1 %or.cond, label %lor.rhs, label %lor.end + +lor.rhs: ; preds = %entry + %rem6 = srem i32 %1, 400 + %cmp7 = icmp eq i32 %rem6, 0 + %phi.cast = zext i1 %cmp7 to i32 + br label %lor.end + +lor.end: ; preds = %entry, %lor.rhs + %3 = phi i32 [ %phi.cast, %lor.rhs ], [ 1, %entry ] + %month = getelementptr inbounds %struct.date, %struct.date* %pd, i32 0, i32 1 + %4 = load i32, i32* %month, align 4 + %cmp820 = icmp sgt i32 %4, 0 + br i1 %cmp820, label %vector.ph, label %for.end + +vector.ph: ; preds = %lor.end + %n.rnd.up = add i32 %4, 3 + %n.vec = and i32 %n.rnd.up, -4 + %trip.count.minus.1 = add i32 %4, -1 + %5 = insertelement <4 x i32> , i32 %0, i32 0 + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.phi = phi <4 x i32> [ %5, %vector.ph ], [ %8, %vector.body ] + %6 = getelementptr inbounds [2 x [13 x i32]], [2 x [13 x i32]]* @days, i32 0, i32 %3, i32 %index + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %7 = bitcast i32* %6 to <4 x i32>* + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %7, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) + %8 = add <4 x i32> %wide.masked.load, %vec.phi + %index.next = add i32 %index, 4 + %9 = icmp eq i32 %index.next, %n.vec + br i1 %9, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %10 = 
select <4 x i1> %active.lane.mask, <4 x i32> %8, <4 x i32> %vec.phi + %11 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %10) + br label %for.end + +for.end: ; preds = %middle.block, %lor.end + %day.0.lcssa = phi i32 [ %0, %lor.end ], [ %11, %middle.block ] + ret i32 %day.0.lcssa +} + +declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32) declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/safe-retaining.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/safe-retaining.mir index ee07d0c1f871d..a5a83982fcc54 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/safe-retaining.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/safe-retaining.mir @@ -125,11 +125,14 @@ body: | ; CHECK: tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 11, 8, implicit-def $itstate ; CHECK: tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate + ; CHECK: bb.1.loop.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3 ; CHECK: renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) ; CHECK: dead $lr = MVE_DLSTP_32 killed renamable $r3 ; CHECK: $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg - ; CHECK: bb.1.loop.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.loop.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $r0, $r1, $r2, $r12 ; CHECK: $lr = tMOVr $r12, 14 /* CC::al */, $noreg ; CHECK: renamable $r12 = t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg @@ -137,8 +140,8 @@ body: | ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg :: (load 16 from %ir.addr.a, 
align 4) ; CHECK: renamable $q1 = MVE_VQSHRUNs32th killed renamable $q1, killed renamable $q0, 3, 0, $noreg ; CHECK: renamable $r2 = MVE_VSTRWU32_post killed renamable $q1, killed renamable $r2, 16, 0, killed $noreg :: (store 16 into %ir.addr.c, align 4) - ; CHECK: dead $lr = MVE_LETP killed renamable $lr, %bb.1 - ; CHECK: bb.2.exit: + ; CHECK: dead $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: bb.3.exit: ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc bb.0.entry: successors: %bb.1(0x80000000) @@ -151,12 +154,17 @@ body: | tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr t2IT 11, 8, implicit-def $itstate tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate + + bb.1.loop.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r3, $r4, $lr + renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) t2DoLoopStart renamable $r4 $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg - bb.1.loop.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.loop.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $r0, $r1, $r2, $r3, $r12 renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg @@ -170,10 +178,10 @@ body: | renamable $q1 = MVE_VQSHRUNs32th killed renamable $q1, killed renamable $q0, 3, 0, $noreg MVE_VPST 8, implicit $vpr renamable $r2 = MVE_VSTRWU32_post killed renamable $q1, killed renamable $r2, 16, 1, killed renamable $vpr :: (store 16 into %ir.addr.c, align 4) - t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14 /* CC::al */, $noreg + t2LoopEnd killed renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg - bb.2.exit: + bb.3.exit: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc ... 
@@ -217,11 +225,14 @@ body: | ; CHECK: tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 11, 8, implicit-def $itstate ; CHECK: tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate + ; CHECK: bb.1.loop.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3 ; CHECK: renamable $r12 = t2LDRi12 $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) ; CHECK: dead $lr = MVE_DLSTP_16 killed renamable $r3 ; CHECK: $r4 = tMOVr killed $r12, 14 /* CC::al */, $noreg - ; CHECK: bb.1.loop.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.loop.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $r0, $r1, $r2, $r4 ; CHECK: $lr = tMOVr $r4, 14 /* CC::al */, $noreg ; CHECK: renamable $r4, dead $cpsr = tSUBi8 killed $r4, 1, 14 /* CC::al */, $noreg @@ -230,8 +241,8 @@ body: | ; CHECK: $r0 = tMOVr $r1, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = MVE_VQSHRUNs16th killed renamable $q1, killed renamable $q0, 1, 0, $noreg ; CHECK: renamable $r2 = MVE_VSTRHU16_post killed renamable $q1, killed renamable $r2, 16, 0, killed $noreg :: (store 16 into %ir.addr.c, align 2) - ; CHECK: dead $lr = MVE_LETP killed renamable $lr, %bb.1 - ; CHECK: bb.2.exit: + ; CHECK: dead $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: bb.3.exit: ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc bb.0.entry: successors: %bb.1(0x80000000) @@ -244,12 +255,17 @@ body: | tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr t2IT 11, 8, implicit-def $itstate tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate + + bb.1.loop.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r3, $r4, $lr + renamable $r12 = t2LDRi12 $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) t2DoLoopStart renamable $r12 $r4 = tMOVr killed $r12, 14 /* CC::al */, $noreg - 
bb.1.loop.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.loop.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $r0, $r1, $r2, $r3, $r4 renamable $vpr = MVE_VCTP16 renamable $r3, 0, $noreg @@ -264,10 +280,10 @@ body: | renamable $q1 = MVE_VQSHRUNs16th killed renamable $q1, killed renamable $q0, 1, 0, $noreg MVE_VPST 8, implicit $vpr renamable $r2 = MVE_VSTRHU16_post killed renamable $q1, killed renamable $r2, 16, 1, killed renamable $vpr :: (store 16 into %ir.addr.c, align 2) - t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14 /* CC::al */, $noreg + t2LoopEnd killed renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg - bb.2.exit: + bb.3.exit: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc ... diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/size-limit.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/size-limit.mir index e6460efadc2c1..5b74bd7352803 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/size-limit.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/size-limit.mir @@ -117,20 +117,23 @@ body: | ; CHECK: tCMPi8 $r3, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 0, 8, implicit-def $itstate ; CHECK: tPOP_RET 0 /* CC::eq */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + ; CHECK: bb.1.for.body.preheader: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3 ; CHECK: renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg ; CHECK: renamable $r0, dead $cpsr = tSUBi8 killed renamable $r0, 4, 14 /* CC::al */, $noreg ; CHECK: $lr = t2DLS killed $r3 - ; CHECK: bb.1.for.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.for.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $lr, $r0, $r1, $r2 ; CHECK: dead renamable $r3 = SPACE 
4070, undef renamable $r0 ; CHECK: renamable $r12, renamable $r1 = t2LDR_PRE killed renamable $r1, 4, 14 /* CC::al */, $noreg :: (load 4 from %ir.scevgep3) ; CHECK: renamable $r3, renamable $r2 = t2LDR_PRE killed renamable $r2, 4, 14 /* CC::al */, $noreg :: (load 4 from %ir.scevgep7) ; CHECK: renamable $r3 = nsw t2MUL killed renamable $r3, killed renamable $r12, 14 /* CC::al */, $noreg ; CHECK: early-clobber renamable $r0 = t2STR_PRE killed renamable $r3, killed renamable $r0, 4, 14 /* CC::al */, $noreg :: (store 4 into %ir.scevgep11) - ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 - ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: bb.3.for.cond.cleanup: ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc bb.0.entry: successors: %bb.1(0x80000000) @@ -143,14 +146,19 @@ body: | tCMPi8 $r3, 0, 14, $noreg, implicit-def $cpsr t2IT 0, 8, implicit-def $itstate tPOP_RET 0, killed $cpsr, def $r7, def $pc, implicit killed $itstate + + bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r3, $r7, $lr + renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14, $noreg renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg renamable $r0, dead $cpsr = tSUBi8 killed renamable $r0, 4, 14, $noreg $lr = tMOVr $r3, 14, $noreg t2DoLoopStart killed $r3 - bb.1.for.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.for.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $lr, $r0, $r1, $r2 dead renamable $r3 = SPACE 4070, undef renamable $r0 @@ -159,10 +167,10 @@ body: | renamable $r3 = nsw t2MUL killed renamable $r3, killed renamable $r12, 14, $noreg early-clobber renamable $r0 = t2STR_PRE killed renamable $r3, killed renamable $r0, 4, 14, $noreg :: (store 4 into %ir.scevgep11) renamable $lr = t2LoopDec killed renamable $lr, 1 - t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14, $noreg + t2LoopEnd renamable $lr, %bb.2, 
implicit-def dead $cpsr + tB %bb.3, 14, $noreg - bb.2.for.cond.cleanup: + bb.3.for.cond.cleanup: tPOP_RET 14, $noreg, def $r7, def $pc ... diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll index 065e534dd55bd..4cd0c54c666c8 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll @@ -53,7 +53,7 @@ vector.body: %induction = add <4 x i32> %broadcast.splat, ; %1 = icmp ult <4 x i32> %induction, - %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32002) + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003) %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef) %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef) @@ -388,7 +388,7 @@ vector.body: %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, - %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32002) + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003) %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef) %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef) @@ -432,7 +432,7 @@ vector.body: %induction = add <4 x i32> %broadcast.splat, ; The induction variable %D is not an IV: - %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %N, i32 32002) + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %N, i32 32003) %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef) 
%wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef) @@ -474,7 +474,7 @@ vector.body: %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, - %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32002) + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003) %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef) %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef) @@ -519,7 +519,7 @@ vector.body: %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, - %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32002) + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003) %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef) %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef) %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-add-sat.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-add-sat.ll index e9facfda61335..c0b2a036f3711 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-add-sat.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-add-sat.ll @@ -9,15 +9,16 @@ define arm_aapcs_vfpcc void @uadd_sat(i16* noalias nocapture readonly %pSrcA, i1 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, 
pc} +; CHECK-NEXT: .LBB0_1: @ %vector.ph ; CHECK-NEXT: dlstp.16 lr, r3 -; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q0, [r1], #16 ; CHECK-NEXT: vldrh.u16 q1, [r0], #16 ; CHECK-NEXT: vqadd.u16 q0, q1, q0 ; CHECK-NEXT: vstrh.16 q0, [r2], #16 -; CHECK-NEXT: letp lr, .LBB0_1 -; CHECK-NEXT: @ %bb.2: @ %while.end +; CHECK-NEXT: letp lr, .LBB0_2 +; CHECK-NEXT: @ %bb.3: @ %while.end ; CHECK-NEXT: pop {r7, pc} entry: %cmp7 = icmp eq i32 %blockSize, 0 @@ -34,7 +35,7 @@ vector.body: ; preds = %vector.body, %vecto %next.gep = getelementptr i16, i16* %pSrcA, i32 %index %next.gep20 = getelementptr i16, i16* %pDst, i32 %index %next.gep21 = getelementptr i16, i16* %pSrcB, i32 %index - %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %blockSize) %0 = bitcast i16* %next.gep to <8 x i16>* %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef) %1 = bitcast i16* %next.gep21 to <8 x i16>* @@ -58,15 +59,16 @@ define arm_aapcs_vfpcc void @sadd_sat(i16* noalias nocapture readonly %pSrcA, i1 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: .LBB1_1: @ %vector.ph ; CHECK-NEXT: dlstp.16 lr, r3 -; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q0, [r1], #16 ; CHECK-NEXT: vldrh.u16 q1, [r0], #16 ; CHECK-NEXT: vqadd.s16 q0, q1, q0 ; CHECK-NEXT: vstrh.16 q0, [r2], #16 -; CHECK-NEXT: letp lr, .LBB1_1 -; CHECK-NEXT: @ %bb.2: @ %while.end +; CHECK-NEXT: letp lr, .LBB1_2 +; CHECK-NEXT: @ %bb.3: @ %while.end ; CHECK-NEXT: pop {r7, pc} entry: %cmp7 = icmp eq i32 %blockSize, 0 @@ -83,7 +85,7 @@ vector.body: ; preds = %vector.body, %vecto 
%next.gep = getelementptr i16, i16* %pSrcA, i32 %index %next.gep20 = getelementptr i16, i16* %pDst, i32 %index %next.gep21 = getelementptr i16, i16* %pSrcB, i32 %index - %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %blockSize) %0 = bitcast i16* %next.gep to <8 x i16>* %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef) %1 = bitcast i16* %next.gep21 to <8 x i16>* diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-fabs.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-fabs.ll index 87f23adf7ffa5..5ad6d91123084 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-fabs.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-fabs.ll @@ -9,14 +9,15 @@ define arm_aapcs_vfpcc void @fabs(float* noalias nocapture readonly %pSrcA, floa ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: .LBB0_1: @ %vector.ph ; CHECK-NEXT: dlstp.32 lr, r2 -; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vabs.f32 q0, q0 ; CHECK-NEXT: vstrw.32 q0, [r1], #16 -; CHECK-NEXT: letp lr, .LBB0_1 -; CHECK-NEXT: @ %bb.2: @ %while.end +; CHECK-NEXT: letp lr, .LBB0_2 +; CHECK-NEXT: @ %bb.3: @ %while.end ; CHECK-NEXT: pop {r7, pc} entry: %cmp3 = icmp eq i32 %blockSize, 0 @@ -32,7 +33,7 @@ vector.body: ; preds = %vector.body, %vecto %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %next.gep = getelementptr float, float* %pDst, i32 %index %next.gep13 = getelementptr float, float* %pSrcA, i32 %index - %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + 
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %blockSize) %0 = bitcast float* %next.gep13 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) %1 = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %wide.masked.load) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll index e72e81da7e7c1..3c6dd9c9f7d1b 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll @@ -9,14 +9,15 @@ define arm_aapcs_vfpcc void @round(float* noalias nocapture readonly %pSrcA, flo ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: .LBB0_1: @ %vector.ph ; CHECK-NEXT: dlstp.32 lr, r2 -; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vrinta.f32 q0, q0 ; CHECK-NEXT: vstrw.32 q0, [r1], #16 -; CHECK-NEXT: letp lr, .LBB0_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: letp lr, .LBB0_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %cmp5 = icmp eq i32 %n, 0 @@ -32,7 +33,7 @@ vector.body: ; preds = %vector.body, %vecto %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %next.gep = getelementptr float, float* %pSrcA, i32 %index %next.gep14 = getelementptr float, float* %pDst, i32 %index - %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) %0 = bitcast float* %next.gep to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> 
%active.lane.mask, <4 x float> undef) %1 = call fast <4 x float> @llvm.round.v4f32(<4 x float> %wide.masked.load) @@ -54,14 +55,15 @@ define arm_aapcs_vfpcc void @rint(float* noalias nocapture readonly %pSrcA, floa ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: .LBB1_1: @ %vector.ph ; CHECK-NEXT: dlstp.32 lr, r2 -; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vrintx.f32 q0, q0 ; CHECK-NEXT: vstrw.32 q0, [r1], #16 -; CHECK-NEXT: letp lr, .LBB1_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: letp lr, .LBB1_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %cmp5 = icmp eq i32 %n, 0 @@ -77,7 +79,7 @@ vector.body: ; preds = %vector.body, %vecto %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %next.gep = getelementptr float, float* %pSrcA, i32 %index %next.gep14 = getelementptr float, float* %pDst, i32 %index - %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) %0 = bitcast float* %next.gep to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) %1 = call fast <4 x float> @llvm.rint.v4f32(<4 x float> %wide.masked.load) @@ -99,14 +101,15 @@ define arm_aapcs_vfpcc void @trunc(float* noalias nocapture readonly %pSrcA, flo ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: .LBB2_1: @ %vector.ph ; CHECK-NEXT: dlstp.32 lr, r2 -; CHECK-NEXT: .LBB2_1: @ %vector.body +; CHECK-NEXT: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vrintz.f32 q0, q0 ; CHECK-NEXT: vstrw.32 q0, [r1], #16 -; 
CHECK-NEXT: letp lr, .LBB2_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: letp lr, .LBB2_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %cmp5 = icmp eq i32 %n, 0 @@ -122,7 +125,7 @@ vector.body: ; preds = %vector.body, %vecto %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %next.gep = getelementptr float, float* %pSrcA, i32 %index %next.gep14 = getelementptr float, float* %pDst, i32 %index - %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) %0 = bitcast float* %next.gep to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) %1 = call fast <4 x float> @llvm.trunc.v4f32(<4 x float> %wide.masked.load) @@ -144,14 +147,15 @@ define arm_aapcs_vfpcc void @ceil(float* noalias nocapture readonly %pSrcA, floa ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: .LBB3_1: @ %vector.ph ; CHECK-NEXT: dlstp.32 lr, r2 -; CHECK-NEXT: .LBB3_1: @ %vector.body +; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vrintp.f32 q0, q0 ; CHECK-NEXT: vstrw.32 q0, [r1], #16 -; CHECK-NEXT: letp lr, .LBB3_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: letp lr, .LBB3_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %cmp5 = icmp eq i32 %n, 0 @@ -167,7 +171,7 @@ vector.body: ; preds = %vector.body, %vecto %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %next.gep = getelementptr float, float* %pSrcA, i32 %index %next.gep14 = getelementptr float, float* %pDst, i32 %index - %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <4 
x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) %0 = bitcast float* %next.gep to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) %1 = call fast <4 x float> @llvm.ceil.v4f32(<4 x float> %wide.masked.load) @@ -189,14 +193,15 @@ define arm_aapcs_vfpcc void @floor(float* noalias nocapture readonly %pSrcA, flo ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: .LBB4_1: @ %vector.ph ; CHECK-NEXT: dlstp.32 lr, r2 -; CHECK-NEXT: .LBB4_1: @ %vector.body +; CHECK-NEXT: .LBB4_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vrintm.f32 q0, q0 ; CHECK-NEXT: vstrw.32 q0, [r1], #16 -; CHECK-NEXT: letp lr, .LBB4_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: letp lr, .LBB4_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %cmp5 = icmp eq i32 %n, 0 @@ -212,7 +217,7 @@ vector.body: ; preds = %vector.body, %vecto %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %next.gep = getelementptr float, float* %pSrcA, i32 %index %next.gep14 = getelementptr float, float* %pDst, i32 %index - %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) %0 = bitcast float* %next.gep to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) %1 = call fast <4 x float> @llvm.floor.v4f32(<4 x float> %wide.masked.load) @@ -235,26 +240,26 @@ define arm_aapcs_vfpcc void @nearbyint(float* noalias nocapture readonly %pSrcA, ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: .LBB5_1: @ %vector.ph ; CHECK-NEXT: adds r3, r2, #3 +; CHECK-NEXT: vdup.32 q1, r2 
; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 ; CHECK-NEXT: adr r3, .LCPI5_0 -; CHECK-NEXT: sub.w r12, r2, #1 ; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: vdup.32 q1, r12 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB5_1: @ %vector.body +; CHECK-NEXT: .LBB5_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vadd.i32 q2, q0, r2 -; CHECK-NEXT: vdup.32 q3, r2 +; CHECK-NEXT: vadd.i32 q2, q0, r12 +; CHECK-NEXT: vdup.32 q3, r12 ; CHECK-NEXT: vcmp.u32 hi, q3, q2 -; CHECK-NEXT: adds r2, #4 +; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vpnot ; CHECK-NEXT: vpstt -; CHECK-NEXT: vcmpt.u32 cs, q1, q2 +; CHECK-NEXT: vcmpt.u32 hi, q1, q2 ; CHECK-NEXT: vldrwt.u32 q2, [r0], #16 ; CHECK-NEXT: vrintr.f32 s15, s11 ; CHECK-NEXT: vrintr.f32 s14, s10 @@ -262,11 +267,11 @@ define arm_aapcs_vfpcc void @nearbyint(float* noalias nocapture readonly %pSrcA, ; CHECK-NEXT: vrintr.f32 s12, s8 ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrwt.32 q3, [r1], #16 -; CHECK-NEXT: le lr, .LBB5_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: le lr, .LBB5_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: @ %bb.4: ; CHECK-NEXT: .LCPI5_0: ; CHECK-NEXT: .long 0 @ 0x0 ; CHECK-NEXT: .long 1 @ 0x1 @@ -286,7 +291,7 @@ vector.body: ; preds = %vector.body, %vecto %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %next.gep = getelementptr float, float* %pSrcA, i32 %index %next.gep14 = getelementptr float, float* %pDst, i32 %index - %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) %0 = bitcast float* %next.gep to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 
x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) %1 = call fast <4 x float> @llvm.nearbyint.v4f32(<4 x float> %wide.masked.load) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-sub-sat.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-sub-sat.ll index 3c7ae4dc734ad..5b2f3a7c98e8a 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-sub-sat.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-sub-sat.ll @@ -9,15 +9,17 @@ define arm_aapcs_vfpcc void @usub_sat(i16* noalias nocapture readonly %pSrcA, i1 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: .LBB0_1: @ %vector.ph +; CHECK-NEXT: subs r3, #1 ; CHECK-NEXT: dlstp.16 lr, r3 -; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q0, [r1], #16 ; CHECK-NEXT: vldrh.u16 q1, [r0], #16 ; CHECK-NEXT: vqsub.u16 q0, q1, q0 ; CHECK-NEXT: vstrh.16 q0, [r2], #16 -; CHECK-NEXT: letp lr, .LBB0_1 -; CHECK-NEXT: @ %bb.2: @ %while.end +; CHECK-NEXT: letp lr, .LBB0_2 +; CHECK-NEXT: @ %bb.3: @ %while.end ; CHECK-NEXT: pop {r7, pc} entry: %cmp7 = icmp eq i32 %blockSize, 0 @@ -58,15 +60,17 @@ define arm_aapcs_vfpcc void @ssub_sat(i16* noalias nocapture readonly %pSrcA, i1 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: .LBB1_1: @ %vector.ph +; CHECK-NEXT: subs r3, #1 ; CHECK-NEXT: dlstp.16 lr, r3 -; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q0, [r1], #16 ; CHECK-NEXT: vldrh.u16 q1, [r0], #16 ; CHECK-NEXT: vqsub.s16 q0, q1, q0 ; CHECK-NEXT: vstrh.16 q0, [r2], #16 -; CHECK-NEXT: letp lr, .LBB1_1 -; CHECK-NEXT: @ %bb.2: @ %while.end +; CHECK-NEXT: letp lr, .LBB1_2 +; CHECK-NEXT: @ %bb.3: @ %while.end ; CHECK-NEXT: pop {r7, pc} entry: %cmp7 = icmp eq i32 
%blockSize, 0 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-pattern-fail.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-pattern-fail.ll index 8e46e3385385e..45c7b8f3e6239 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-pattern-fail.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-pattern-fail.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve,+lob %s -S -o - | FileCheck %s ; The following functions should all fail to become tail-predicated. @@ -453,7 +454,7 @@ for.cond.cleanup: ; preds = %vector.body, %entry ret void } -; adding 5, instead of 4, to index. +; adding 5, instead of 4, to index. define void @wrong_index_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) { entry: %cmp8 = icmp eq i32 %N, 0 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll index b40b36ced4af2..5d81c4c07eeaf 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll @@ -29,7 +29,7 @@ vector.body: ; preds = %vector.body, %vecto %tmp = getelementptr inbounds i16, i16* %a, i32 %index ; %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11 - %tmp1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %tmp1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N) %tmp2 = bitcast i16* %tmp to <8 x i16>* %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %tmp1, <8 x i16> undef) @@ -89,7 +89,7 @@ vector.body: ; preds = %vector.body, %vecto %tmp = getelementptr inbounds i16, i16* %a, i32 %index ; %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11 - %tmp1 = call 
<8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %tmp1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N) %tmp2 = bitcast i16* %tmp to <8 x i16>* %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %tmp1, <8 x i16> undef) @@ -151,7 +151,7 @@ vector.body: ; preds = %vector.body, %vecto %tmp = getelementptr inbounds i32, i32* %a, i32 %index ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 - %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) %tmp2 = bitcast i32* %tmp to <4 x i32>* %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll index 3563f139cdeb5..0c85e89133374 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll @@ -36,7 +36,7 @@ vector.body: ; preds = %vector.body, %vecto %tmp2 = getelementptr inbounds i16, i16* %A, i32 %index ; %tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2 - %tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %tmp) + %tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N) %tmp4 = bitcast i16* %tmp2 to <8 x i16>* %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp3, <8 x i16> undef) @@ -107,7 +107,7 @@ vector.body: ; preds = %vector.body, %vecto %tmp2 = getelementptr inbounds i16, i16* %A, i32 %index ; %tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2 - %tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %tmp) + %tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 
%N) %tmp4 = bitcast i16* %tmp2 to <8 x i16>* %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp3, <8 x i16> undef) @@ -170,7 +170,7 @@ vector.body: ; preds = %vector.body, %vecto %tmp2 = getelementptr inbounds i16, i16* %A, i32 %index ; %tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2 - %tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %tmp) + %tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N) %tmp4 = bitcast i16* %tmp2 to <8 x i16>* %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp3, <8 x i16> undef) @@ -238,7 +238,7 @@ vector.body: ; preds = %vector.body, %vecto %9 = phi i32 [ %7, %vector.ph ], [ %17, %vector.body ] %lsr.iv4850 = bitcast i16* %lsr.iv48 to <4 x i16>* %lsr.iv45 = bitcast i16* %lsr.iv to <4 x i16>* - %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %8) %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv45, i32 2, <4 x i1> %active.lane.mask, <4 x i16> undef) %10 = sext <4 x i16> %wide.masked.load to <4 x i32> %wide.masked.load42 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv4850, i32 2, <4 x i1> %active.lane.mask, <4 x i16> undef) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredicated-max.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredicated-max.mir index 1a4e16fa43e78..1f212c9e3aa3f 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredicated-max.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredicated-max.mir @@ -82,6 +82,9 @@ body: | ; CHECK: tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 0, 8, implicit-def $itstate ; CHECK: tPOP_RET 0 /* CC::eq */, killed $cpsr, def $r5, def $pc, implicit killed $itstate + ; 
CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2 ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg @@ -91,8 +94,8 @@ body: | ; CHECK: $r12 = t2MOVTi16 killed $r12, 65535, 14 /* CC::al */, $noreg ; CHECK: dead $lr = t2DLS renamable $r3 ; CHECK: $r5 = tMOVr killed $r3, 14 /* CC::al */, $noreg - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $r0, $r1, $r2, $r5, $r12 ; CHECK: $r3 = tMOVr $r12, 14 /* CC::al */, $noreg ; CHECK: renamable $vpr = MVE_VCTP16 renamable $r2, 0, $noreg @@ -103,8 +106,8 @@ body: | ; CHECK: early-clobber renamable $r1 = t2STRH_POST killed renamable $r3, killed renamable $r1, 2, 14 /* CC::al */, $noreg :: (store 2 into %ir.lsr.iv.2) ; CHECK: renamable $r5, dead $cpsr = nsw tSUBi8 killed $r5, 1, 14 /* CC::al */, $noreg ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 8, 14 /* CC::al */, $noreg - ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.1 - ; CHECK: bb.2.exit: + ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: bb.3.exit: ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r5, def $pc bb.0.entry: successors: %bb.1(0x80000000) @@ -117,6 +120,11 @@ body: | tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr t2IT 0, 8, implicit-def $itstate tPOP_RET 0 /* CC::eq */, killed $cpsr, def $r5, def $pc, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r5, $lr + renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg renamable $r12 = 
t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg @@ -127,8 +135,8 @@ body: | t2DoLoopStart renamable $r3 $r5 = tMOVr killed $r3, 14 /* CC::al */, $noreg - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.vector.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $r0, $r1, $r2, $r5, $r12 $r3 = tMOVr $r12, 14 /* CC::al */, $noreg @@ -141,10 +149,10 @@ body: | renamable $r5, dead $cpsr = nsw tSUBi8 killed $r5, 1, 14 /* CC::al */, $noreg renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 8, 14 /* CC::al */, $noreg renamable $lr = t2LoopDec killed renamable $lr, 1 - t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14 /* CC::al */, $noreg + t2LoopEnd killed renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg - bb.2.exit: + bb.3.exit: tPOP_RET 14 /* CC::al */, $noreg, def $r5, def $pc ... diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-retaining.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-retaining.mir index b175a7ca7e392..666ad4dd742a8 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-retaining.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-retaining.mir @@ -123,11 +123,14 @@ body: | ; CHECK: tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 11, 8, implicit-def $itstate ; CHECK: frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate + ; CHECK: bb.1.loop.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3 ; CHECK: renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) ; CHECK: dead $lr = t2DLS renamable $r4 ; CHECK: $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg - ; CHECK: bb.1.loop.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.loop.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $r0, 
$r1, $r2, $r3, $r12 ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg ; CHECK: $lr = tMOVr $r12, 14 /* CC::al */, $noreg @@ -140,8 +143,8 @@ body: | ; CHECK: renamable $q0 = MVE_VQSHRNbhs32 killed renamable $q0, killed renamable $q1, 15, 0, $noreg ; CHECK: MVE_VPST 8, implicit $vpr ; CHECK: renamable $r2 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r2, 16, 1, killed renamable $vpr :: (store 16 into %ir.addr.c, align 4) - ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.1 - ; CHECK: bb.2.exit: + ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: bb.3.exit: ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc bb.0.entry: successors: %bb.1(0x80000000) @@ -154,12 +157,17 @@ body: | tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr t2IT 11, 8, implicit-def $itstate frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate + + bb.1.loop.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r3, $r4, $lr + renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) t2DoLoopStart renamable $r4 $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg - bb.1.loop.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.loop.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $r0, $r1, $r2, $r3, $r12 renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg @@ -174,10 +182,10 @@ body: | renamable $q0 = MVE_VQSHRNbhs32 killed renamable $q0, killed renamable $q1, 15, 0, $noreg MVE_VPST 8, implicit $vpr renamable $r2 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r2, 16, 1, killed renamable $vpr :: (store 16 into %ir.addr.c, align 4) - t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14 /* CC::al */, $noreg + t2LoopEnd killed renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg - bb.2.exit: + bb.3.exit: frame-destroy tPOP_RET 14 
/* CC::al */, $noreg, def $r4, def $pc ... @@ -221,11 +229,14 @@ body: | ; CHECK: tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 11, 8, implicit-def $itstate ; CHECK: frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate + ; CHECK: bb.1.loop.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3 ; CHECK: renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) ; CHECK: dead $lr = t2DLS renamable $r4 ; CHECK: $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg - ; CHECK: bb.1.loop.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.loop.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $r0, $r1, $r2, $r3, $r12 ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg ; CHECK: MVE_VPST 4, implicit $vpr @@ -238,8 +249,8 @@ body: | ; CHECK: renamable $q1 = MVE_VQSHRUNs32th killed renamable $q1, killed renamable $q0, 3, 0, $noreg ; CHECK: MVE_VPST 8, implicit $vpr ; CHECK: renamable $r2 = MVE_VSTRWU32_post killed renamable $q1, killed renamable $r2, 16, 1, killed renamable $vpr :: (store 16 into %ir.addr.c, align 4) - ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.1 - ; CHECK: bb.2.exit: + ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: bb.3.exit: ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc bb.0.entry: successors: %bb.1(0x80000000) @@ -252,12 +263,17 @@ body: | tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr t2IT 11, 8, implicit-def $itstate frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate + + bb.1.loop.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r3, $r4, $lr + renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) t2DoLoopStart renamable $r4 $r12 = tMOVr killed $r4, 14 /* 
CC::al */, $noreg - bb.1.loop.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.loop.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $r0, $r1, $r2, $r3, $r12 renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg @@ -272,10 +288,10 @@ body: | renamable $q1 = MVE_VQSHRUNs32th killed renamable $q1, killed renamable $q0, 3, 0, $noreg MVE_VPST 8, implicit $vpr renamable $r2 = MVE_VSTRWU32_post killed renamable $q1, killed renamable $r2, 16, 1, killed renamable $vpr :: (store 16 into %ir.addr.c, align 4) - t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14 /* CC::al */, $noreg + t2LoopEnd killed renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg - bb.2.exit: + bb.3.exit: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc ... diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vaddv.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vaddv.mir index ae1c783c0606d..cd8310c1004b7 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vaddv.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vaddv.mir @@ -853,15 +853,18 @@ body: | ; CHECK: tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 0, 8, implicit-def $itstate ; CHECK: tPOP_RET 0 /* CC::eq */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2 ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2 - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $lr, $r0, $r1 ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRHS32_post killed renamable $r0, 8, 0, killed $noreg :: (load 8 from %ir.lsr.iv17, align 2) ; CHECK: renamable $r12 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg ; CHECK: early-clobber renamable $r1 = t2STR_POST killed renamable $r12, 
killed renamable $r1, 4, 14 /* CC::al */, $noreg :: (store 4 into %ir.store.addr) - ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 - ; CHECK: bb.2.exit: + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: bb.3.exit: ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc bb.0.entry: successors: %bb.1(0x80000000) @@ -874,6 +877,11 @@ body: | tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr t2IT 0, 8, implicit-def $itstate tPOP_RET 0 /* CC::eq */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r7, $lr + renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg @@ -882,8 +890,8 @@ body: | t2DoLoopStart renamable $r12 $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.vector.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $r0, $r1, $r2, $r3 renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg @@ -895,10 +903,10 @@ body: | renamable $r3, dead $cpsr = nsw tSUBi8 killed $r3, 1, 14 /* CC::al */, $noreg renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg renamable $lr = t2LoopDec killed renamable $lr, 1 - t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14 /* CC::al */, $noreg + t2LoopEnd killed renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg - bb.2.exit: + bb.3.exit: tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc ... 
@@ -944,15 +952,18 @@ body: | ; CHECK: tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 0, 8, implicit-def $itstate ; CHECK: tPOP_RET 0 /* CC::eq */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2 ; CHECK: $lr = MVE_DLSTP_16 killed renamable $r2 - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $lr, $r0, $r1 ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRHU16_post killed renamable $r0, 16, 0, killed $noreg :: (load 16 from %ir.lsr.iv17, align 2) ; CHECK: renamable $r12 = MVE_VADDVs16no_acc killed renamable $q0, 0, $noreg ; CHECK: early-clobber renamable $r1 = t2STR_POST killed renamable $r12, killed renamable $r1, 4, 14 /* CC::al */, $noreg :: (store 4 into %ir.store.addr) - ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 - ; CHECK: bb.2.exit: + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: bb.3.exit: ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc bb.0.entry: successors: %bb.1(0x80000000) @@ -965,6 +976,11 @@ body: | tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr t2IT 0, 8, implicit-def $itstate tPOP_RET 0 /* CC::eq */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r7, $lr + renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg @@ -973,8 +989,8 @@ body: | t2DoLoopStart renamable $r12 $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.vector.body: + successors: 
%bb.2(0x7c000000), %bb.3(0x04000000) liveins: $r0, $r1, $r2, $r3 renamable $vpr = MVE_VCTP16 renamable $r2, 0, $noreg @@ -986,10 +1002,10 @@ body: | renamable $r3, dead $cpsr = nsw tSUBi8 killed $r3, 1, 14 /* CC::al */, $noreg renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 8, 14 /* CC::al */, $noreg renamable $lr = t2LoopDec killed renamable $lr, 1 - t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14 /* CC::al */, $noreg + t2LoopEnd killed renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg - bb.2.exit: + bb.3.exit: tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc ... @@ -1035,15 +1051,18 @@ body: | ; CHECK: tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 0, 8, implicit-def $itstate ; CHECK: tPOP_RET 0 /* CC::eq */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2 ; CHECK: $lr = MVE_DLSTP_8 killed renamable $r2 - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $lr, $r0, $r1 ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRBU8_post killed renamable $r0, 16, 0, killed $noreg :: (load 16 from %ir.lsr.iv17, align 1) ; CHECK: renamable $r12 = MVE_VADDVs8no_acc killed renamable $q0, 0, $noreg ; CHECK: early-clobber renamable $r1 = t2STR_POST killed renamable $r12, killed renamable $r1, 4, 14 /* CC::al */, $noreg :: (store 4 into %ir.store.addr) - ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 - ; CHECK: bb.2.exit: + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: bb.3.exit: ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc bb.0.entry: successors: %bb.1(0x80000000) @@ -1056,6 +1075,11 @@ body: | tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr t2IT 0, 8, implicit-def 
$itstate tPOP_RET 0 /* CC::eq */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r7, $lr + renamable $r3, dead $cpsr = tADDi3 renamable $r2, 7, 14 /* CC::al */, $noreg renamable $r3 = t2BICri killed renamable $r3, 7, 14 /* CC::al */, $noreg, $noreg renamable $r12 = t2SUBri killed renamable $r3, 7, 14 /* CC::al */, $noreg, $noreg @@ -1064,8 +1088,8 @@ body: | t2DoLoopStart renamable $r12 $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.vector.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $r0, $r1, $r2, $r3 renamable $vpr = MVE_VCTP8 renamable $r2, 0, $noreg @@ -1077,10 +1101,10 @@ body: | renamable $r3, dead $cpsr = nsw tSUBi8 killed $r3, 1, 14 /* CC::al */, $noreg renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 16, 14 /* CC::al */, $noreg renamable $lr = t2LoopDec killed renamable $lr, 1 - t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14 /* CC::al */, $noreg + t2LoopEnd killed renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg - bb.2.exit: + bb.3.exit: tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc ... 
@@ -1234,6 +1258,9 @@ body: | ; CHECK: tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 0, 8, implicit-def $itstate ; CHECK: tPOP_RET 0 /* CC::eq */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2 ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg @@ -1241,8 +1268,8 @@ body: | ; CHECK: renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: dead $lr = t2DLS renamable $r12 ; CHECK: $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $r0, $r1, $r2, $r3 ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg ; CHECK: MVE_VPST 8, implicit $vpr @@ -1253,8 +1280,8 @@ body: | ; CHECK: renamable $r3, dead $cpsr = nsw tSUBi8 killed $r3, 1, 14 /* CC::al */, $noreg ; CHECK: early-clobber renamable $r1 = t2STR_POST killed renamable $r12, killed renamable $r1, 4, 14 /* CC::al */, $noreg :: (store 4 into %ir.store.addr) ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg - ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.1 - ; CHECK: bb.2.exit: + ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: bb.3.exit: ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc bb.0.entry: successors: %bb.1(0x80000000) @@ -1267,6 +1294,11 @@ body: | tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr t2IT 0, 8, implicit-def $itstate tPOP_RET 0 /* CC::eq */, killed $cpsr, def $r7, def $pc, 
implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r7, $lr + renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg @@ -1275,8 +1307,8 @@ body: | t2DoLoopStart renamable $r12 $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.vector.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $r0, $r1, $r2, $r3 renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg @@ -1289,10 +1321,10 @@ body: | early-clobber renamable $r1 = t2STR_POST killed renamable $r12, killed renamable $r1, 4, 14 /* CC::al */, $noreg :: (store 4 into %ir.store.addr) renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg renamable $lr = t2LoopDec killed renamable $lr, 1 - t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14 /* CC::al */, $noreg + t2LoopEnd killed renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg - bb.2.exit: + bb.3.exit: tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc ... 
@@ -1459,6 +1491,9 @@ body: | ; CHECK: tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 0, 8, implicit-def $itstate ; CHECK: tPOP_RET 0 /* CC::eq */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2 ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg @@ -1466,8 +1501,8 @@ body: | ; CHECK: renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: dead $lr = t2DLS renamable $r12 ; CHECK: $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $r0, $r1, $r2, $r3 ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg ; CHECK: MVE_VPST 8, implicit $vpr @@ -1478,8 +1513,8 @@ body: | ; CHECK: renamable $r3, dead $cpsr = nsw tSUBi8 killed $r3, 1, 14 /* CC::al */, $noreg ; CHECK: early-clobber renamable $r1 = t2STR_POST killed renamable $r12, killed renamable $r1, 4, 14 /* CC::al */, $noreg :: (store 4 into %ir.store.addr) ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg - ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.1 - ; CHECK: bb.2.exit: + ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: bb.3.exit: ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc bb.0.entry: successors: %bb.1(0x80000000) @@ -1492,6 +1527,11 @@ body: | tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr t2IT 0, 8, implicit-def $itstate tPOP_RET 0 /* CC::eq */, killed $cpsr, def $r7, def $pc, 
implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r7, $lr + renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg @@ -1500,8 +1540,8 @@ body: | t2DoLoopStart renamable $r12 $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.vector.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $r0, $r1, $r2, $r3 renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg @@ -1514,10 +1554,10 @@ body: | early-clobber renamable $r1 = t2STR_POST killed renamable $r12, killed renamable $r1, 4, 14 /* CC::al */, $noreg :: (store 4 into %ir.store.addr) renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg renamable $lr = t2LoopDec killed renamable $lr, 1 - t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14 /* CC::al */, $noreg + t2LoopEnd killed renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg - bb.2.exit: + bb.3.exit: tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc ... 
@@ -1687,6 +1727,9 @@ body: | ; CHECK: tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 0, 8, implicit-def $itstate ; CHECK: tPOP_RET 0 /* CC::eq */, killed $cpsr, def $r4, def $pc, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2 ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg @@ -1696,8 +1739,8 @@ body: | ; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from %fixed-stack.0, align 8) ; CHECK: dead $lr = t2DLS renamable $r12 ; CHECK: $r4 = tMOVr killed $r12, 14 /* CC::al */, $noreg - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $q0, $r0, $r1, $r2, $r4 ; CHECK: renamable $vpr = MVE_VCTP16 renamable $r2, 0, $noreg ; CHECK: MVE_VPST 8, implicit $vpr @@ -1709,9 +1752,10 @@ body: | ; CHECK: renamable $r3 = t2SXTH killed renamable $r12, 0, 14 /* CC::al */, $noreg ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 8, 14 /* CC::al */, $noreg ; CHECK: early-clobber renamable $r1 = t2STR_POST killed renamable $r3, killed renamable $r1, 4, 14 /* CC::al */, $noreg :: (store 4 into %ir.store.addr) - ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.1 - ; CHECK: bb.2.exit: + ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: bb.3.exit: ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc + bb.0.entry: successors: %bb.1(0x80000000) liveins: $r0, $r1, $r2, $r4, $lr @@ -1723,6 +1767,11 @@ body: | tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr t2IT 0, 8, implicit-def $itstate tPOP_RET 0 /* CC::eq */, killed 
$cpsr, def $r4, def $pc, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r4, $lr + renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg @@ -1733,8 +1782,8 @@ body: | t2DoLoopStart renamable $r12 $r4 = tMOVr killed $r12, 14 /* CC::al */, $noreg - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.vector.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $q0, $r0, $r1, $r2, $r4 renamable $vpr = MVE_VCTP16 renamable $r2, 0, $noreg @@ -1748,10 +1797,10 @@ body: | renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 8, 14 /* CC::al */, $noreg early-clobber renamable $r1 = t2STR_POST killed renamable $r3, killed renamable $r1, 4, 14 /* CC::al */, $noreg :: (store 4 into %ir.store.addr) renamable $lr = t2LoopDec killed renamable $lr, 1 - t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14 /* CC::al */, $noreg + t2LoopEnd killed renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg - bb.2.exit: + bb.3.exit: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc ... 
@@ -1932,6 +1981,9 @@ body: | ; CHECK: tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 0, 8, implicit-def $itstate ; CHECK: tPOP_RET 0 /* CC::eq */, killed $cpsr, def $r4, def $pc, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2 ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg @@ -1941,8 +1993,8 @@ body: | ; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from %fixed-stack.0, align 8) ; CHECK: dead $lr = t2DLS renamable $r12 ; CHECK: $r4 = tMOVr killed $r12, 14 /* CC::al */, $noreg - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $q0, $r0, $r1, $r2, $r4 ; CHECK: renamable $vpr = MVE_VCTP16 renamable $r2, 0, $noreg ; CHECK: MVE_VPST 8, implicit $vpr @@ -1954,8 +2006,8 @@ body: | ; CHECK: renamable $r3 = t2UXTH killed renamable $r12, 0, 14 /* CC::al */, $noreg ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 8, 14 /* CC::al */, $noreg ; CHECK: early-clobber renamable $r1 = t2STR_POST killed renamable $r3, killed renamable $r1, 4, 14 /* CC::al */, $noreg :: (store 4 into %ir.store.addr) - ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.1 - ; CHECK: bb.2.exit: + ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: bb.3.exit: ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc bb.0.entry: successors: %bb.1(0x80000000) @@ -1968,6 +2020,11 @@ body: | tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr t2IT 0, 8, implicit-def $itstate tPOP_RET 0 /* CC::eq */, killed $cpsr, def $r4, def $pc, implicit 
killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r4, $lr + renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg @@ -1978,8 +2035,8 @@ body: | t2DoLoopStart renamable $r12 $r4 = tMOVr killed $r12, 14 /* CC::al */, $noreg - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.vector.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $q0, $r0, $r1, $r2, $r4 renamable $vpr = MVE_VCTP16 renamable $r2, 0, $noreg @@ -1993,10 +2050,10 @@ body: | renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 8, 14 /* CC::al */, $noreg early-clobber renamable $r1 = t2STR_POST killed renamable $r3, killed renamable $r1, 4, 14 /* CC::al */, $noreg :: (store 4 into %ir.store.addr) renamable $lr = t2LoopDec killed renamable $lr, 1 - t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14 /* CC::al */, $noreg + t2LoopEnd killed renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg - bb.2.exit: + bb.3.exit: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc ... 
@@ -2177,6 +2234,9 @@ body: | ; CHECK: tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 0, 8, implicit-def $itstate ; CHECK: tPOP_RET 0 /* CC::eq */, killed $cpsr, def $r4, def $pc, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2 ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 7, 14 /* CC::al */, $noreg ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 7, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 7, 14 /* CC::al */, $noreg, $noreg @@ -2186,8 +2246,8 @@ body: | ; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from %fixed-stack.0, align 8) ; CHECK: dead $lr = t2DLS renamable $r12 ; CHECK: $r4 = tMOVr killed $r12, 14 /* CC::al */, $noreg - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $q0, $r0, $r1, $r2, $r4 ; CHECK: renamable $vpr = MVE_VCTP8 renamable $r2, 0, $noreg ; CHECK: MVE_VPST 8, implicit $vpr @@ -2199,8 +2259,8 @@ body: | ; CHECK: renamable $r3 = t2SXTB killed renamable $r12, 0, 14 /* CC::al */, $noreg ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 16, 14 /* CC::al */, $noreg ; CHECK: early-clobber renamable $r1 = t2STR_POST killed renamable $r3, killed renamable $r1, 4, 14 /* CC::al */, $noreg :: (store 4 into %ir.store.addr) - ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.1 - ; CHECK: bb.2.exit: + ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: bb.3.exit: ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc bb.0.entry: successors: %bb.1(0x80000000) @@ -2213,6 +2273,11 @@ body: | tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr t2IT 0, 8, implicit-def $itstate tPOP_RET 0 /* CC::eq */, killed $cpsr, def $r4, def $pc, implicit 
killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r4, $lr + renamable $r3, dead $cpsr = tADDi3 renamable $r2, 7, 14 /* CC::al */, $noreg renamable $r3 = t2BICri killed renamable $r3, 7, 14 /* CC::al */, $noreg, $noreg renamable $r12 = t2SUBri killed renamable $r3, 7, 14 /* CC::al */, $noreg, $noreg @@ -2223,8 +2288,8 @@ body: | t2DoLoopStart renamable $r12 $r4 = tMOVr killed $r12, 14 /* CC::al */, $noreg - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.vector.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $q0, $r0, $r1, $r2, $r4 renamable $vpr = MVE_VCTP8 renamable $r2, 0, $noreg @@ -2238,10 +2303,10 @@ body: | renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 16, 14 /* CC::al */, $noreg early-clobber renamable $r1 = t2STR_POST killed renamable $r3, killed renamable $r1, 4, 14 /* CC::al */, $noreg :: (store 4 into %ir.store.addr) renamable $lr = t2LoopDec killed renamable $lr, 1 - t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14 /* CC::al */, $noreg + t2LoopEnd killed renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg - bb.2.exit: + bb.3.exit: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc ... 
@@ -2422,6 +2487,9 @@ body: | ; CHECK: tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 0, 8, implicit-def $itstate ; CHECK: tPOP_RET 0 /* CC::eq */, killed $cpsr, def $r4, def $pc, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2 ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 7, 14 /* CC::al */, $noreg ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 7, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 7, 14 /* CC::al */, $noreg, $noreg @@ -2431,8 +2499,8 @@ body: | ; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from %fixed-stack.0, align 8) ; CHECK: dead $lr = t2DLS renamable $r12 ; CHECK: $r4 = tMOVr killed $r12, 14 /* CC::al */, $noreg - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $q0, $r0, $r1, $r2, $r4 ; CHECK: renamable $vpr = MVE_VCTP8 renamable $r2, 0, $noreg ; CHECK: MVE_VPST 8, implicit $vpr @@ -2444,8 +2512,8 @@ body: | ; CHECK: renamable $r3 = t2UXTB killed renamable $r12, 0, 14 /* CC::al */, $noreg ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 16, 14 /* CC::al */, $noreg ; CHECK: early-clobber renamable $r1 = t2STR_POST killed renamable $r3, killed renamable $r1, 4, 14 /* CC::al */, $noreg :: (store 4 into %ir.store.addr) - ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.1 - ; CHECK: bb.2.exit: + ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: bb.3.exit: ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc bb.0.entry: successors: %bb.1(0x80000000) @@ -2458,6 +2526,11 @@ body: | tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr t2IT 0, 8, implicit-def $itstate tPOP_RET 0 /* CC::eq */, killed $cpsr, def $r4, def $pc, implicit 
killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r4, $lr + renamable $r3, dead $cpsr = tADDi3 renamable $r2, 7, 14 /* CC::al */, $noreg renamable $r3 = t2BICri killed renamable $r3, 7, 14 /* CC::al */, $noreg, $noreg renamable $r12 = t2SUBri killed renamable $r3, 7, 14 /* CC::al */, $noreg, $noreg @@ -2468,8 +2541,8 @@ body: | t2DoLoopStart renamable $r12 $r4 = tMOVr killed $r12, 14 /* CC::al */, $noreg - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.vector.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $q0, $r0, $r1, $r2, $r4 renamable $vpr = MVE_VCTP8 renamable $r2, 0, $noreg @@ -2483,10 +2556,10 @@ body: | renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 16, 14 /* CC::al */, $noreg early-clobber renamable $r1 = t2STR_POST killed renamable $r3, killed renamable $r1, 4, 14 /* CC::al */, $noreg :: (store 4 into %ir.store.addr) renamable $lr = t2LoopDec killed renamable $lr, 1 - t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14 /* CC::al */, $noreg + t2LoopEnd killed renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg - bb.2.exit: + bb.3.exit: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc ... 
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll index 32a1c17dbbff3..34701aba6324a 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll @@ -138,7 +138,6 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input ; NOREDUCTIONS-NEXT: .LBB0_8: @ %for.end17 ; NOREDUCTIONS-NEXT: add sp, #4 ; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} -; entry: %conv = sext i16 %N to i32 %cmp36 = icmp sgt i16 %N, 0 @@ -178,7 +177,7 @@ vector.body: ; preds = %vector.body, %vecto %i9 = phi i32 [ %i7, %vector.ph ], [ %i17, %vector.body ] %lsr.iv4850 = bitcast i16* %lsr.iv48 to <4 x i16>* %lsr.iv45 = bitcast i16* %lsr.iv to <4 x i16>* - %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %i8) %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv45, i32 2, <4 x i1> %active.lane.mask, <4 x i16> undef) %i10 = sext <4 x i16> %wide.masked.load to <4 x i32> %wide.masked.load42 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv4850, i32 2, <4 x i1> %active.lane.mask, <4 x i16> undef) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir index 87cb8ea04f968..4f80869de3ccb 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir @@ -115,6 +115,9 @@ body: | ; CHECK: t2IT 0, 4, implicit-def $itstate ; CHECK: renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate ; CHECK: tBX_RET 0 /* CC::eq */, killed $cpsr, 
implicit $r0, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r7 ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -127,8 +130,8 @@ body: | ; CHECK: renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: dead $lr = t2DLS renamable $r12 ; CHECK: $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $q1, $r0, $r1, $r2, $r3 ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg ; CHECK: $q0 = MVE_VORR killed $q1, killed $q1, 0, $noreg, undef $q0 @@ -140,8 +143,8 @@ body: | ; CHECK: renamable $r3, dead $cpsr = nsw tSUBi8 killed $r3, 1, 14 /* CC::al */, $noreg ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 - ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.1 - ; CHECK: bb.2.middle.block: + ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: bb.3.middle.block: ; CHECK: liveins: $q0, $q1, $r2 ; CHECK: renamable $r0, dead $cpsr = tADDi3 killed renamable $r2, 4, 14 /* CC::al */, $noreg ; CHECK: renamable $vpr = MVE_VCTP32 killed renamable $r0, 0, $noreg @@ -156,6 +159,11 @@ body: | t2IT 0, 4, implicit-def $itstate renamable $r0 = tMOVi8 $noreg, 0, 0, $cpsr, implicit killed $r0, implicit $itstate tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $lr, $r7 + frame-setup tPUSH 14, $noreg, killed $r7, 
killed $lr, implicit-def $sp, implicit $sp frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -169,8 +177,8 @@ body: | t2DoLoopStart renamable $r12 $r3 = tMOVr killed $r12, 14, $noreg - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.vector.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $q1, $r0, $r1, $r2, $r3 renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg @@ -184,10 +192,10 @@ body: | renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg renamable $q1 = MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 renamable $lr = t2LoopDec killed renamable $lr, 1 - t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14, $noreg + t2LoopEnd killed renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14, $noreg - bb.2.middle.block: + bb.3.middle.block: liveins: $q0, $q1, $r2 renamable $r0, dead $cpsr = tADDi3 killed renamable $r2, 4, 14, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subi3.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subi3.mir index b339d54db46d3..b487fa044e86d 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subi3.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subi3.mir @@ -108,16 +108,19 @@ body: | ; CHECK: tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 11, 8, implicit-def $itstate ; CHECK: tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3 ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r3 - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $lr, $r0, $r1, $r2 ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable 
$r1, 16, 0, $noreg :: (load 16 from %ir.lsr.iv13, align 4) ; CHECK: renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 0, $noreg :: (load 16 from %ir.lsr.iv1416, align 4) ; CHECK: renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 0, killed $noreg :: (store 16 into %ir.lsr.iv1719, align 4) - ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 - ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: bb.3.for.cond.cleanup: ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc bb.0.entry: successors: %bb.1(0x80000000) @@ -130,6 +133,11 @@ body: | tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr t2IT 11, 8, implicit-def $itstate tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r3, $r7, $lr + renamable $r12 = t2ADDri renamable $r3, 3, 14, $noreg, $noreg renamable $lr = t2MOVi 1, 14, $noreg, $noreg renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg @@ -137,8 +145,8 @@ body: | renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg t2DoLoopStart renamable $lr - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.vector.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $lr, $r0, $r1, $r2, $r3 renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg @@ -150,10 +158,10 @@ body: | MVE_VPST 8, implicit $vpr renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4) renamable $lr = t2LoopDec killed renamable $lr, 1 - t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14, $noreg + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14, 
$noreg - bb.2.for.cond.cleanup: + bb.3.for.cond.cleanup: tPOP_RET 14, $noreg, def $r7, def $pc ... diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subri.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subri.mir index 027bbb4e719f4..9790c04d204ff 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subri.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subri.mir @@ -107,16 +107,19 @@ body: | ; CHECK: tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 11, 8, implicit-def $itstate ; CHECK: tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3 ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r3 - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $lr, $r0, $r1, $r2 ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 0, $noreg :: (load 16 from %ir.lsr.iv13, align 4) ; CHECK: renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 0, $noreg :: (load 16 from %ir.lsr.iv1416, align 4) ; CHECK: renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 0, killed $noreg :: (store 16 into %ir.lsr.iv1719, align 4) - ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 - ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: bb.3.for.cond.cleanup: ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc bb.0.entry: successors: %bb.1(0x80000000) @@ -129,6 +132,11 @@ body: | tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr t2IT 11, 8, implicit-def $itstate tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit 
killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r3, $r7, $lr + renamable $r12 = t2ADDri renamable $r3, 3, 14, $noreg, $noreg renamable $lr = t2MOVi 1, 14, $noreg, $noreg renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg @@ -136,8 +144,8 @@ body: | renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg t2DoLoopStart renamable $lr - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.vector.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $lr, $r0, $r1, $r2, $r3 renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg @@ -149,10 +157,10 @@ body: | MVE_VPST 8, implicit $vpr renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4) renamable $lr = t2LoopDec killed renamable $lr, 1 - t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14, $noreg + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14, $noreg - bb.2.for.cond.cleanup: + bb.3.for.cond.cleanup: tPOP_RET 14, $noreg, def $r7, def $pc ... 
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subri12.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subri12.mir index edb1bd3be4fac..65ebff2f696bf 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subri12.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subri12.mir @@ -107,16 +107,19 @@ body: | ; CHECK: tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 11, 8, implicit-def $itstate ; CHECK: tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3 ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r3 - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $lr, $r0, $r1, $r2 ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 0, $noreg :: (load 16 from %ir.lsr.iv13, align 4) ; CHECK: renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 0, $noreg :: (load 16 from %ir.lsr.iv1416, align 4) ; CHECK: renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 0, killed $noreg :: (store 16 into %ir.lsr.iv1719, align 4) - ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 - ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: bb.3.for.cond.cleanup: ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc bb.0.entry: successors: %bb.1(0x80000000) @@ -129,6 +132,11 @@ body: | tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr t2IT 11, 8, implicit-def $itstate tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, 
$r2, $r3, $r7, $lr + renamable $r12 = t2ADDri renamable $r3, 3, 14, $noreg, $noreg renamable $lr = t2MOVi 1, 14, $noreg, $noreg renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg @@ -136,8 +144,8 @@ body: | renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg t2DoLoopStart renamable $lr - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.vector.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $lr, $r0, $r1, $r2, $r3 renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg @@ -149,10 +157,10 @@ body: | MVE_VPST 8, implicit $vpr renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4) renamable $lr = t2LoopDec killed renamable $lr, 1 - t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14, $noreg + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14, $noreg - bb.2.for.cond.cleanup: + bb.3.for.cond.cleanup: tPOP_RET 14, $noreg, def $r7, def $pc ... 
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp16-reduce.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp16-reduce.mir index 3de9bcc6a7aea..a42c33e24f26b 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp16-reduce.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp16-reduce.mir @@ -113,12 +113,15 @@ body: | ; CHECK-LABEL: name: wrong_liveout_shift ; CHECK: bb.0.entry: ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: liveins: $lr, $r0, $r1, $r2 + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r7 ; CHECK: tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 0, 2, implicit-def $itstate ; CHECK: renamable $r0 = t2MOVi16 32767, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate ; CHECK: renamable $r0 = tSXTH killed renamable $r0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate ; CHECK: tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2 ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $lr, implicit-def $sp, implicit $sp ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -135,8 +138,8 @@ body: | ; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from constant-pool) ; CHECK: renamable $r3 = t2SUBrs renamable $r2, killed renamable $r12, 26, 14 /* CC::al */, $noreg, $noreg ; CHECK: $lr = t2DLS killed renamable $lr - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r3 ; CHECK: renamable $vpr = MVE_VCTP16 renamable $r2, 0, $noreg ; CHECK: $q1 = MVE_VORR killed $q0, killed $q0, 0, $noreg, undef $q1 @@ -146,8 +149,8 @@ body: | ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 8, 14 /* 
CC::al */, $noreg ; CHECK: renamable $q0 = nuw MVE_VMULi16 killed renamable $q2, killed renamable $q0, 0, $noreg, undef renamable $q0 ; CHECK: renamable $q0 = MVE_VSUBi16 renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 - ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 - ; CHECK: bb.2.middle.block: + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: bb.3.middle.block: ; CHECK: liveins: $q0, $q1, $r3 ; CHECK: renamable $vpr = MVE_VCTP16 killed renamable $r3, 0, $noreg ; CHECK: renamable $q0 = MVE_VPSEL killed renamable $q0, killed renamable $q1, 0, killed renamable $vpr @@ -155,7 +158,7 @@ body: | ; CHECK: $sp = t2LDMIA_UPD $sp, 14 /* CC::al */, $noreg, def $r7, def $lr ; CHECK: renamable $r0 = tSXTH killed renamable $r0, 14 /* CC::al */, $noreg ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit killed $r0 - ; CHECK: bb.3 (align 16): + ; CHECK: bb.4 (align 16): ; CHECK: CONSTPOOL_ENTRY 0, %const.0, 16 bb.0.entry: successors: %bb.1(0x80000000) @@ -166,6 +169,11 @@ body: | renamable $r0 = t2MOVi16 32767, 0, $cpsr, implicit killed $r0, implicit $itstate renamable $r0 = tSXTH killed renamable $r0, 0, $cpsr, implicit killed $r0, implicit $itstate tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $lr + frame-setup tPUSH 14, $noreg, killed $lr, implicit-def $sp, implicit $sp frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -183,8 +191,8 @@ body: | renamable $r3 = t2SUBrs renamable $r2, killed renamable $r12, 26, 14, $noreg, $noreg t2DoLoopStart renamable $lr - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.vector.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $lr, $q0, $r0, $r1, $r2, $r3 renamable $vpr = MVE_VCTP16 renamable $r2, 0, $noreg @@ -196,10 +204,10 @@ body: | renamable $q0 = nuw MVE_VMULi16 killed renamable $q2, killed renamable $q0, 0, $noreg, 
undef renamable $q0 renamable $lr = t2LoopDec killed renamable $lr, 1 renamable $q0 = MVE_VSUBi16 renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 - t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14, $noreg + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14, $noreg - bb.2.middle.block: + bb.3.middle.block: liveins: $q0, $q1, $r3 renamable $vpr = MVE_VCTP16 killed renamable $r3, 0, $noreg @@ -209,7 +217,7 @@ body: | renamable $r0 = tSXTH killed renamable $r0, 14, $noreg tBX_RET 14, $noreg, implicit killed $r0 - bb.3 (align 16): + bb.4 (align 16): CONSTPOOL_ENTRY 0, %const.0, 16 ... diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll index 615334300c283..c6f64a8464bdc 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll @@ -8,19 +8,20 @@ define dso_local i32 @mul_reduce_add(i32* noalias nocapture readonly %a, i32* no ; CHECK-NEXT: itt eq ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr +; CHECK-NEXT: .LBB0_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dlstp.32 lr, r2 -; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q2, [r1], #16 ; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: vmul.i32 q0, q2, q0 ; CHECK-NEXT: vadd.i32 q1, q0, q1 -; CHECK-NEXT: letp lr, .LBB0_1 -; CHECK-NEXT: @ %bb.2: @ %middle.block +; CHECK-NEXT: letp lr, .LBB0_2 +; CHECK-NEXT: @ %bb.3: @ %middle.block ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} @@ -45,7 +46,7 @@ vector.body: ; preds = %vector.body, %vecto %0 = getelementptr inbounds i32, i32* %a, i32 %index ; %1 = icmp ule <4 x i32> %induction, 
%broadcast.splat12 - %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) %2 = bitcast i32* %0 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef) @@ -75,17 +76,18 @@ define dso_local i32 @mul_reduce_add_const(i32* noalias nocapture readonly %a, i ; CHECK-NEXT: itt eq ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr +; CHECK-NEXT: .LBB1_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: dlstp.32 lr, r2 -; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: adds r1, #4 ; CHECK-NEXT: vadd.i32 q1, q0, q1 -; CHECK-NEXT: letp lr, .LBB1_1 -; CHECK-NEXT: @ %bb.2: @ %middle.block +; CHECK-NEXT: letp lr, .LBB1_2 +; CHECK-NEXT: @ %bb.3: @ %middle.block ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} @@ -110,7 +112,7 @@ vector.body: ; preds = %vector.body, %vecto %0 = getelementptr inbounds i32, i32* %a, i32 %index ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat10 - %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) %2 = bitcast i32* %0 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef) @@ -136,17 +138,18 @@ define dso_local i32 @add_reduce_add_const(i32* noalias nocapture readonly %a, i ; CHECK-NEXT: itt eq ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr +; CHECK-NEXT: .LBB2_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: dlstp.32 lr, r2 -; CHECK-NEXT: .LBB2_1: @ %vector.body +; 
CHECK-NEXT: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: adds r1, #4 ; CHECK-NEXT: vadd.i32 q1, q0, q1 -; CHECK-NEXT: letp lr, .LBB2_1 -; CHECK-NEXT: @ %bb.2: @ %middle.block +; CHECK-NEXT: letp lr, .LBB2_2 +; CHECK-NEXT: @ %bb.3: @ %middle.block ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} @@ -171,7 +174,7 @@ vector.body: ; preds = %vector.body, %vecto %0 = getelementptr inbounds i32, i32* %a, i32 %index ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat10 - %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) %2 = bitcast i32* %0 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef) @@ -197,16 +200,17 @@ define dso_local void @vector_mul_const(i32* noalias nocapture %a, i32* noalias ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: .LBB3_1: @ %vector.ph ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 -; CHECK-NEXT: .LBB3_1: @ %vector.body +; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vmul.i32 q0, q0, r2 ; CHECK-NEXT: vstrw.32 q0, [r0], #16 -; CHECK-NEXT: letp lr, .LBB3_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: letp lr, .LBB3_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %cmp6 = icmp eq i32 %N, 0 @@ -230,7 +234,7 @@ vector.body: ; preds = %vector.body, %vecto %0 = getelementptr inbounds i32, i32* %b, i32 %index ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 - %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, 
i32 %N) %2 = bitcast i32* %0 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef) @@ -253,16 +257,17 @@ define dso_local void @vector_add_const(i32* noalias nocapture %a, i32* noalias ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: .LBB4_1: @ %vector.ph ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 -; CHECK-NEXT: .LBB4_1: @ %vector.body +; CHECK-NEXT: .LBB4_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vadd.i32 q0, q0, r2 ; CHECK-NEXT: vstrw.32 q0, [r0], #16 -; CHECK-NEXT: letp lr, .LBB4_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: letp lr, .LBB4_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %cmp6 = icmp eq i32 %N, 0 @@ -286,7 +291,7 @@ vector.body: ; preds = %vector.body, %vecto %0 = getelementptr inbounds i32, i32* %b, i32 %index ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 - %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) %2 = bitcast i32* %0 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef) @@ -309,17 +314,18 @@ define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i8(i8* noalias nocaptur ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: .LBB5_1: @ %vector.ph ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.8 lr, r3 -; CHECK-NEXT: .LBB5_1: @ %vector.body +; CHECK-NEXT: .LBB5_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: add.w r12, r12, #16 ; CHECK-NEXT: vldrb.u8 q0, [r1], #16 ; CHECK-NEXT: vldrb.u8 q1, [r2], #16 ; CHECK-NEXT: vmul.i8 q0, q1, q0 ; CHECK-NEXT: vstrb.8 q0, [r0], #16 -; 
CHECK-NEXT: letp lr, .LBB5_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: letp lr, .LBB5_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %cmp10 = icmp eq i32 %N, 0 @@ -341,7 +347,7 @@ vector.body: ; preds = %vector.body, %vecto %0 = getelementptr inbounds i8, i8* %b, i32 %index ; %1 = icmp ule <16 x i32> %induction, %broadcast.splat13 - %1 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %trip.count.minus.1) + %1 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N) %2 = bitcast i8* %0 to <16 x i8>* %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %2, i32 1, <16 x i1> %1, <16 x i8> undef) @@ -368,17 +374,18 @@ define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i16(i16* noalias nocapt ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: .LBB6_1: @ %vector.ph ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.16 lr, r3 -; CHECK-NEXT: .LBB6_1: @ %vector.body +; CHECK-NEXT: .LBB6_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: add.w r12, r12, #8 ; CHECK-NEXT: vldrh.u16 q0, [r1], #16 ; CHECK-NEXT: vldrh.u16 q1, [r2], #16 ; CHECK-NEXT: vmul.i16 q0, q1, q0 ; CHECK-NEXT: vstrh.16 q0, [r0], #16 -; CHECK-NEXT: letp lr, .LBB6_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: letp lr, .LBB6_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %cmp10 = icmp eq i32 %N, 0 @@ -400,7 +407,7 @@ vector.body: ; preds = %vector.body, %vecto %0 = getelementptr inbounds i16, i16* %b, i32 %index ; %1 = icmp ule <8 x i32> %induction, %broadcast.splat13 - %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N) %2 = bitcast i16* %0 to <8 x i16>* %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %2, i32 2, <8 x i1> %1, <8 x i16> 
undef) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll index e10cc3153b9c9..64e7552b92b36 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll @@ -49,7 +49,7 @@ vector.body: ; preds = %vector.body, %vecto %induction = add <4 x i32> %broadcast.splat, ; %7 = icmp ule <4 x i32> %induction, %broadcast.splat12 - %7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv24, i32 4, <4 x i1> %7, <4 x i32> undef) %wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1, i32 4, <4 x i1> %7, <4 x i32> undef) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir index f3c1cc639b0f9..60a578d81594f 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir @@ -51,6 +51,8 @@ ; Intentionally left blank - see MIR sequence below. entry: unreachable + vector.ph: + unreachable vector.body: unreachable for.cond.cleanup: @@ -61,6 +63,8 @@ ; Intentionally left blank - see MIR sequence below. entry: unreachable + vector.ph: + unreachable vector.body: unreachable for.cond.cleanup: @@ -113,6 +117,8 @@ ; Intentionally left blank - see MIR sequence below. entry: unreachable + vector.ph: + unreachable vector.body: unreachable for.cond.cleanup: @@ -123,6 +129,8 @@ ; Intentionally left blank - see MIR sequence below. entry: unreachable + vector.ph: + unreachable vector.body: unreachable for.cond.cleanup: @@ -133,6 +141,8 @@ ; Intentionally left blank - see MIR sequence below. 
entry: unreachable + vector.ph: + unreachable vector.body: unreachable for.cond.cleanup: @@ -202,18 +212,21 @@ body: | ; CHECK: tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 11, 8, implicit-def $itstate ; CHECK: frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2 ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 ; CHECK: renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r1 - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $lr, $q0, $r0, $r2, $r3 ; CHECK: renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 0, killed $noreg ; CHECK: MVE_VPTv4s32r 4, renamable $q1, renamable $r2, 11, implicit-def $vpr ; CHECK: renamable $vpr = MVE_VCMPs32r killed renamable $q1, renamable $r3, 12, 1, killed renamable $vpr ; CHECK: renamable $r0 = MVE_VSTRWU32_post renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr - ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 - ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: bb.3.for.cond.cleanup: ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc bb.0.entry: successors: %bb.1(0x80000000) @@ -226,6 +239,11 @@ body: | tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr t2IT 11, 8, implicit-def $itstate frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r7, $lr + renamable $r3, dead $cpsr = tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 
renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg @@ -235,8 +253,8 @@ body: | renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg t2DoLoopStart renamable $lr - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.vector.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $lr, $q0, $r0, $r1, $r2, $r3 renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg @@ -248,10 +266,10 @@ body: | renamable $r0 = MVE_VSTRWU32_post renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg renamable $lr = t2LoopDec killed renamable $lr, 1 - t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14 /* CC::al */, $noreg + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg - bb.2.for.cond.cleanup: + bb.3.for.cond.cleanup: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc ... 
--- @@ -311,6 +329,9 @@ body: | ; CHECK: tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 11, 8, implicit-def $itstate ; CHECK: frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2 ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg @@ -319,8 +340,8 @@ body: | ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg ; CHECK: $lr = t2DLS killed renamable $lr - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r3 ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg ; CHECK: MVE_VPST 8, implicit $vpr @@ -331,8 +352,8 @@ body: | ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r1, 1, killed renamable $vpr ; CHECK: renamable $r0 = MVE_VSTRWU32_post renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr ; CHECK: renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg - ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 - ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: bb.3.for.cond.cleanup: ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc ; ; Tests that secondary VCTPs are refused when their operand's reaching definition is not the same as the main @@ -349,6 +370,11 @@ body: | tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr t2IT 11, 8, 
implicit-def $itstate frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r7, $lr + renamable $r3, dead $cpsr = tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg @@ -358,8 +384,8 @@ body: | renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg t2DoLoopStart renamable $lr - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.vector.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $lr, $q0, $r0, $r1, $r2, $r3 renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg @@ -372,10 +398,10 @@ body: | renamable $r0 = MVE_VSTRWU32_post renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg renamable $lr = t2LoopDec killed renamable $lr, 1 - t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14 /* CC::al */, $noreg + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg - bb.2.for.cond.cleanup: + bb.3.for.cond.cleanup: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc ... 
--- @@ -435,6 +461,9 @@ body: | ; CHECK: tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 11, 8, implicit-def $itstate ; CHECK: frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2 ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg @@ -443,8 +472,8 @@ body: | ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg ; CHECK: $lr = t2DLS killed renamable $lr - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r3 ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg ; CHECK: MVE_VPST 8, implicit $vpr @@ -454,8 +483,8 @@ body: | ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 1, killed renamable $vpr ; CHECK: renamable $r0 = MVE_VSTRWU32_post renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr ; CHECK: renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg - ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 - ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: bb.3.for.cond.cleanup: ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc ; ; Tests that secondary VCTPs are refused when their operand is not the same register as the main VCTP's. 
@@ -471,6 +500,11 @@ body: | tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr t2IT 11, 8, implicit-def $itstate frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r7, $lr + renamable $r3, dead $cpsr = tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg @@ -480,8 +514,8 @@ body: | renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg t2DoLoopStart renamable $lr - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.vector.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $lr, $q0, $r0, $r1, $r2, $r3 renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg @@ -493,10 +527,10 @@ body: | renamable $r0 = MVE_VSTRWU32_post renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg renamable $lr = t2LoopDec killed renamable $lr, 1 - t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14 /* CC::al */, $noreg + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg - bb.2.for.cond.cleanup: + bb.3.for.cond.cleanup: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc ... 
--- @@ -556,18 +590,21 @@ body: | ; CHECK: tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 11, 8, implicit-def $itstate ; CHECK: frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2 ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 ; CHECK: renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r1 - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $lr, $q0, $r0, $r2, $r3 ; CHECK: renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 0, killed $noreg ; CHECK: MVE_VPTv4s32r 12, renamable $q1, renamable $r2, 10, implicit-def $vpr ; CHECK: renamable $vpr = MVE_VCMPs32r killed renamable $q1, renamable $r3, 13, 1, killed renamable $vpr ; CHECK: renamable $r0 = MVE_VSTRWU32_post renamable $q0, killed renamable $r0, 16, 2, killed renamable $vpr - ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 - ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: bb.3.for.cond.cleanup: ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc ; ; Test including a else-predicated VCTP. 
@@ -583,6 +620,11 @@ body: | tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr t2IT 11, 8, implicit-def $itstate frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r7, $lr + renamable $r3, dead $cpsr = tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg @@ -592,8 +634,8 @@ body: | renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg t2DoLoopStart renamable $lr - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.vector.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $lr, $q0, $r0, $r1, $r2, $r3 renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg @@ -605,10 +647,10 @@ body: | renamable $r0 = MVE_VSTRWU32_post renamable $q0, killed renamable $r0, 16, 2, killed renamable $vpr renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg renamable $lr = t2LoopDec killed renamable $lr, 1 - t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14 /* CC::al */, $noreg + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg - bb.2.for.cond.cleanup: + bb.3.for.cond.cleanup: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc ... 
--- @@ -668,18 +710,21 @@ body: | ; CHECK: tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 11, 8, implicit-def $itstate ; CHECK: frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2 ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 ; CHECK: renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r1 - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $lr, $q0, $r0, $r2, $r3 ; CHECK: renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 0, killed $noreg ; CHECK: MVE_VPTv4s32r 4, renamable $q0, renamable $r2, 11, implicit-def $vpr ; CHECK: renamable $vpr = MVE_VCMPs32r killed renamable $q1, renamable $r3, 12, 1, killed renamable $vpr ; CHECK: renamable $r0 = MVE_VSTRWU32_post renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr - ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 - ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: bb.3.for.cond.cleanup: ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc bb.0.entry: successors: %bb.1(0x80000000) @@ -691,6 +736,11 @@ body: | tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr t2IT 11, 8, implicit-def $itstate frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r7, $lr + renamable $r3, dead $cpsr = tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg @@ -700,8 
+750,8 @@ body: | renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg t2DoLoopStart renamable $lr - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.vector.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $lr, $q0, $r0, $r1, $r2, $r3 renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg @@ -713,10 +763,10 @@ body: | renamable $r0 = MVE_VSTRWU32_post renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg renamable $lr = t2LoopDec killed renamable $lr, 1 - t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14 /* CC::al */, $noreg + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg - bb.2.for.cond.cleanup: + bb.3.for.cond.cleanup: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc ... --- @@ -776,16 +826,19 @@ body: | ; CHECK: tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 11, 8, implicit-def $itstate ; CHECK: frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r1, $r2 ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 ; CHECK: renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r1 - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $lr, $q0, $r2, $r3 ; CHECK: MVE_VPTv4s32r 2, renamable $q0, renamable $r2, 8, implicit-def $vpr ; CHECK: dead renamable $vpr = MVE_VCMPs32r renamable $q0, renamable $r3, 12, 1, killed renamable $vpr - ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 - ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: $lr = 
MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: bb.3.for.cond.cleanup: ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc bb.0.entry: successors: %bb.1(0x80000000) @@ -797,6 +850,11 @@ body: | tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr t2IT 11, 8, implicit-def $itstate frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r7, $lr + renamable $r3, dead $cpsr = tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg @@ -806,8 +864,8 @@ body: | renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg t2DoLoopStart renamable $lr - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.vector.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $lr, $q0, $r0, $r1, $r2, $r3 MVE_VPTv4s32r 2, renamable $q0, renamable $r2, 8, implicit-def $vpr @@ -815,10 +873,10 @@ body: | renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg renamable $lr = t2LoopDec killed renamable $lr, 1 - t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14 /* CC::al */, $noreg + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg - bb.2.for.cond.cleanup: + bb.3.for.cond.cleanup: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc ... 
--- @@ -878,6 +936,9 @@ body: | ; CHECK: tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 11, 8, implicit-def $itstate ; CHECK: frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2 ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg @@ -886,16 +947,16 @@ body: | ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: dead renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg ; CHECK: $lr = t2DLS killed renamable $lr - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2 ; CHECK: MVE_VPTv4s32r 2, killed renamable $q0, renamable $r2, 2, implicit-def $vpr ; CHECK: renamable $q0 = MVE_VLDRWU32 renamable $r0, 0, 1, $vpr ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r1, 1, killed $vpr ; CHECK: MVE_VSTRWU32 renamable $q0, renamable $r0, 0, 1, killed $vpr ; CHECK: renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg - ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 - ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: bb.3.for.cond.cleanup: ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc ; ; This shouldn't be tail-predicated because the VLDR isn't predicated on the VCTP. 
@@ -910,6 +971,11 @@ body: | tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr t2IT 11, 8, implicit-def $itstate frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r7, $lr + renamable $r3, dead $cpsr = tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg @@ -919,8 +985,8 @@ body: | renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg t2DoLoopStart renamable $lr - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.vector.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $lr, $q0, $r0, $r1, $r2, $r3 MVE_VPTv4s32r 2, renamable $q0, renamable $r2, 2, implicit-def $vpr @@ -929,9 +995,9 @@ body: | MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 1, $vpr renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg renamable $lr = t2LoopDec killed renamable $lr, 1 - t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14 /* CC::al */, $noreg + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg - bb.2.for.cond.cleanup: + bb.3.for.cond.cleanup: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc ... 
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-liveout-lsr-shift.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-liveout-lsr-shift.mir index 109547e935e6b..fc0aa2020df96 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-liveout-lsr-shift.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-liveout-lsr-shift.mir @@ -113,12 +113,15 @@ body: | ; CHECK-LABEL: name: wrong_liveout_shift ; CHECK: bb.0.entry: ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: liveins: $lr, $r0, $r1, $r2 + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r7 ; CHECK: tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 0, 2, implicit-def $itstate ; CHECK: renamable $r0 = t2MOVi16 32767, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate ; CHECK: renamable $r0 = tSXTH killed renamable $r0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate ; CHECK: tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2 ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $lr, implicit-def $sp, implicit $sp ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -135,8 +138,8 @@ body: | ; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from constant-pool) ; CHECK: renamable $r3 = t2SUBrs renamable $r2, killed renamable $r12, 26, 14 /* CC::al */, $noreg, $noreg ; CHECK: $lr = t2DLS killed renamable $lr - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r3 ; CHECK: renamable $vpr = MVE_VCTP16 renamable $r2, 0, $noreg ; CHECK: $q1 = MVE_VORR killed $q0, killed $q0, 0, $noreg, undef $q1 @@ -146,8 +149,8 @@ body: | ; CHECK: renamable $r2, dead $cpsr = 
tSUBi8 killed renamable $r2, 8, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = nuw MVE_VMULi16 killed renamable $q2, killed renamable $q0, 0, $noreg, undef renamable $q0 ; CHECK: renamable $q0 = MVE_VSUBi16 renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 - ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 - ; CHECK: bb.2.middle.block: + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: bb.3.middle.block: ; CHECK: liveins: $q0, $q1, $r3 ; CHECK: renamable $vpr = MVE_VCTP16 killed renamable $r3, 0, $noreg ; CHECK: renamable $q0 = MVE_VPSEL killed renamable $q0, killed renamable $q1, 0, killed renamable $vpr @@ -155,7 +158,7 @@ body: | ; CHECK: $sp = t2LDMIA_UPD $sp, 14 /* CC::al */, $noreg, def $r7, def $lr ; CHECK: renamable $r0 = tSXTH killed renamable $r0, 14 /* CC::al */, $noreg ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit killed $r0 - ; CHECK: bb.3 (align 16): + ; CHECK: bb.4 (align 16): ; CHECK: CONSTPOOL_ENTRY 0, %const.0, 16 bb.0.entry: successors: %bb.1(0x80000000) @@ -166,6 +169,11 @@ body: | renamable $r0 = t2MOVi16 32767, 0, $cpsr, implicit killed $r0, implicit $itstate renamable $r0 = tSXTH killed renamable $r0, 0, $cpsr, implicit killed $r0, implicit $itstate tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $lr + frame-setup tPUSH 14, $noreg, killed $lr, implicit-def $sp, implicit $sp frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -183,8 +191,8 @@ body: | renamable $r3 = t2SUBrs renamable $r2, killed renamable $r12, 26, 14, $noreg, $noreg t2DoLoopStart renamable $lr - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.vector.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $lr, $q0, $r0, $r1, $r2, $r3 renamable $vpr = MVE_VCTP16 renamable $r2, 0, $noreg @@ -196,10 +204,10 @@ body: | renamable $q0 = nuw MVE_VMULi16 killed renamable 
$q2, killed renamable $q0, 0, $noreg, undef renamable $q0 renamable $lr = t2LoopDec killed renamable $lr, 1 renamable $q0 = MVE_VSUBi16 renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 - t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14, $noreg + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14, $noreg - bb.2.middle.block: + bb.3.middle.block: liveins: $q0, $q1, $r3 renamable $vpr = MVE_VCTP16 killed renamable $r3, 0, $noreg @@ -209,7 +217,7 @@ body: | renamable $r0 = tSXTH killed renamable $r0, 14, $noreg tBX_RET 14, $noreg, implicit killed $r0 - bb.3 (align 16): + bb.4 (align 16): CONSTPOOL_ENTRY 0, %const.0, 16 ... diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir index 08e9524ffb914..d91556e3e70b9 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir @@ -120,6 +120,9 @@ body: | ; CHECK: t2IT 0, 4, implicit-def $itstate ; CHECK: renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate ; CHECK: tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r7 ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -133,8 +136,8 @@ body: | ; CHECK: dead $lr = t2DLS renamable $r3 ; CHECK: $r12 = tMOVr killed $r3, 14 /* CC::al */, $noreg ; CHECK: $r3 = tMOVr $r2, 14 /* CC::al */, $noreg - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: 
liveins: $q1, $r0, $r1, $r2, $r3, $r12 ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg ; CHECK: $q0 = MVE_VORR killed $q1, killed $q1, 0, $noreg, undef $q0 @@ -146,8 +149,8 @@ body: | ; CHECK: renamable $r12 = nsw t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 - ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.1 - ; CHECK: bb.2.middle.block: + ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: bb.3.middle.block: ; CHECK: liveins: $q0, $q1, $r2, $r3 ; CHECK: renamable $r0, dead $cpsr = tSUBi3 killed renamable $r2, 1, 14 /* CC::al */, $noreg ; CHECK: renamable $q2 = MVE_VDUP32 killed renamable $r0, 0, $noreg, undef renamable $q2 @@ -164,6 +167,11 @@ body: | t2IT 0, 4, implicit-def $itstate renamable $r0 = tMOVi8 $noreg, 0, 0, $cpsr, implicit killed $r0, implicit $itstate tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $lr, $r7 + frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -178,8 +186,8 @@ body: | $r12 = tMOVr killed $r3, 14, $noreg $r3 = tMOVr $r2, 14, $noreg - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.vector.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $q1, $r0, $r1, $r2, $r3, $r12 renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg @@ -193,10 +201,10 @@ body: | renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg renamable $q1 = MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 renamable $lr = t2LoopDec killed renamable $lr, 1 - t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr - tB 
%bb.2, 14, $noreg + t2LoopEnd killed renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14, $noreg - bb.2.middle.block: + bb.3.middle.block: liveins: $q0, $q1, $r2, $r3 renamable $r0, dead $cpsr = tSUBi3 killed renamable $r2, 1, 14, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir index bc5c1902266d1..337816146e5f0 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir @@ -112,6 +112,9 @@ body: | ; CHECK: t2IT 0, 4, implicit-def $itstate ; CHECK: renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate ; CHECK: tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r7 ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -124,8 +127,8 @@ body: | ; CHECK: renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: dead $lr = t2DLS renamable $r12 ; CHECK: $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg - ; CHECK: bb.1.vector.body: - ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $q1, $r0, $r1, $r2, $r3 ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg ; CHECK: $q0 = MVE_VORR killed $q1, killed $q1, 0, $noreg, undef $q0 @@ -137,8 +140,8 @@ body: | ; CHECK: renamable $r3, dead $cpsr = nsw tSUBi8 killed $r3, 1, 14 /* CC::al */, $noreg ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, 
$noreg ; CHECK: renamable $q1 = MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 - ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.1 - ; CHECK: bb.2.middle.block: + ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: bb.3.middle.block: ; CHECK: liveins: $q0, $q1, $r2 ; CHECK: renamable $vpr = MVE_VCTP32 killed renamable $r2, 0, $noreg ; CHECK: renamable $q0 = MVE_VPSEL killed renamable $q1, killed renamable $q0, 0, killed renamable $vpr @@ -152,6 +155,11 @@ body: | t2IT 0, 4, implicit-def $itstate renamable $r0 = tMOVi8 $noreg, 0, 0, $cpsr, implicit killed $r0, implicit $itstate tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $lr, $r7 + frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -165,8 +173,8 @@ body: | t2DoLoopStart renamable $r12 $r3 = tMOVr killed $r12, 14, $noreg - bb.1.vector.body: - successors: %bb.1(0x7c000000), %bb.2(0x04000000) + bb.2.vector.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $q1, $r0, $r1, $r2, $r3 renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg @@ -180,10 +188,10 @@ body: | renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg renamable $q1 = MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 renamable $lr = t2LoopDec killed renamable $lr, 1 - t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr - tB %bb.2, 14, $noreg + t2LoopEnd killed renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14, $noreg - bb.2.middle.block: + bb.3.middle.block: liveins: $q0, $q1, $r2 renamable $vpr = MVE_VCTP32 killed renamable $r2, 0, $noreg diff --git a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll index e9dfccd320dae..116031cb895ff 100644 --- 
a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll +++ b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll @@ -13,7 +13,7 @@ define <4 x i32> @v4i32(i32 %index, i32 %BTC, <4 x i32> %V1, <4 x i32> %V2) { ; CHECK-NEXT: vdup.32 q1, r1 ; CHECK-NEXT: vpnot ; CHECK-NEXT: vpst -; CHECK-NEXT: vcmpt.u32 cs, q1, q0 +; CHECK-NEXT: vcmpt.u32 hi, q1, q0 ; CHECK-NEXT: vmov d0, r2, r3 ; CHECK-NEXT: vldr d1, [sp] ; CHECK-NEXT: vldrw.u32 q1, [r0] @@ -33,17 +33,94 @@ define <4 x i32> @v4i32(i32 %index, i32 %BTC, <4 x i32> %V1, <4 x i32> %V2) { ret <4 x i32> %select } +define <7 x i32> @v7i32(i32 %index, i32 %BTC, <7 x i32> %V1, <7 x i32> %V2) { +; CHECK-LABEL: v7i32: +; CHECK: @ %bb.0: +; CHECK-NEXT: adr r3, .LCPI1_0 +; CHECK-NEXT: vdup.32 q1, r1 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: vadd.i32 q2, q0, r1 +; CHECK-NEXT: vdup.32 q0, r2 +; CHECK-NEXT: vcmp.u32 hi, q1, q2 +; CHECK-NEXT: ldr r2, [sp, #32] +; CHECK-NEXT: vpnot +; CHECK-NEXT: vpst +; CHECK-NEXT: vcmpt.u32 hi, q0, q2 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: ldr r2, [sp, #36] +; CHECK-NEXT: vmov.32 q2[1], r2 +; CHECK-NEXT: ldr r2, [sp, #40] +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: ldr r2, [sp, #44] +; CHECK-NEXT: vmov.32 q2[3], r2 +; CHECK-NEXT: ldr r2, [sp] +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: ldr r2, [sp, #4] +; CHECK-NEXT: vmov.32 q3[1], r2 +; CHECK-NEXT: ldr r2, [sp, #8] +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: ldr r2, [sp, #12] +; CHECK-NEXT: vmov.32 q3[3], r2 +; CHECK-NEXT: adr r2, .LCPI1_1 +; CHECK-NEXT: vpsel q2, q3, q2 +; CHECK-NEXT: vstrw.32 q2, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r2] +; CHECK-NEXT: movw r2, #4095 +; CHECK-NEXT: vadd.i32 q2, q2, r1 +; CHECK-NEXT: vcmp.u32 hi, q1, q2 +; CHECK-NEXT: vmrs r1, p0 +; CHECK-NEXT: eors r1, r2 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: ldr r1, [sp, #48] +; CHECK-NEXT: vpst +; CHECK-NEXT: vcmpt.u32 hi, q0, q2 +; CHECK-NEXT: vmov.32 q0[0], r1 +; CHECK-NEXT: ldr r1, [sp, #52] +; CHECK-NEXT: vmov.32 q0[1], r1 +; CHECK-NEXT: ldr r1, [sp, #56] +; 
CHECK-NEXT: vmov.32 q0[2], r1 +; CHECK-NEXT: ldr r1, [sp, #16] +; CHECK-NEXT: vmov.32 q1[0], r1 +; CHECK-NEXT: ldr r1, [sp, #20] +; CHECK-NEXT: vmov.32 q1[1], r1 +; CHECK-NEXT: ldr r1, [sp, #24] +; CHECK-NEXT: vmov.32 q1[2], r1 +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov.f32 s2, s1 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: strd r3, r2, [r0, #16] +; CHECK-NEXT: str r1, [r0, #24] +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI1_0: +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 2 @ 0x2 +; CHECK-NEXT: .long 3 @ 0x3 +; CHECK-NEXT: .LCPI1_1: +; CHECK-NEXT: .long 4 @ 0x4 +; CHECK-NEXT: .long 5 @ 0x5 +; CHECK-NEXT: .long 6 @ 0x6 +; CHECK-NEXT: .zero 4 + %active.lane.mask = call <7 x i1> @llvm.get.active.lane.mask.v7i1.i32(i32 %index, i32 %BTC) + %select = select <7 x i1> %active.lane.mask, <7 x i32> %V1, <7 x i32> %V2 + ret <7 x i32> %select +} + define <8 x i16> @v8i16(i32 %index, i32 %BTC, <8 x i16> %V1, <8 x i16> %V2) { ; CHECK-LABEL: v8i16: ; CHECK: @ %bb.0: ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: adr.w r12, .LCPI1_0 +; CHECK-NEXT: adr.w r12, .LCPI2_0 ; CHECK-NEXT: vdup.32 q5, r1 ; CHECK-NEXT: vldrw.u32 q0, [r12] ; CHECK-NEXT: vmov.i8 q1, #0x0 ; CHECK-NEXT: vmov.i8 q2, #0xff ; CHECK-NEXT: vadd.i32 q3, q0, r0 -; CHECK-NEXT: vcmp.u32 cs, q5, q3 +; CHECK-NEXT: vcmp.u32 hi, q5, q3 ; CHECK-NEXT: vpsel q4, q2, q1 ; CHECK-NEXT: vmov r1, s16 ; CHECK-NEXT: vmov.16 q0[0], r1 @@ -53,10 +130,10 @@ define <8 x i16> @v8i16(i32 %index, i32 %BTC, <8 x i16> %V1, <8 x i16> %V2) { ; CHECK-NEXT: vmov.16 q0[2], r1 ; CHECK-NEXT: vmov r1, s19 ; CHECK-NEXT: vmov.16 q0[3], r1 -; CHECK-NEXT: adr r1, .LCPI1_1 +; CHECK-NEXT: adr r1, .LCPI2_1 ; CHECK-NEXT: vldrw.u32 q4, [r1] ; CHECK-NEXT: vadd.i32 q4, q4, r0 -; CHECK-NEXT: vcmp.u32 cs, q5, q4 +; CHECK-NEXT: vcmp.u32 hi, q5, q4 ; CHECK-NEXT: vpsel q5, q2, q1 ; CHECK-NEXT: vmov r1, s20 ; 
CHECK-NEXT: vmov.16 q0[4], r1 @@ -102,12 +179,12 @@ define <8 x i16> @v8i16(i32 %index, i32 %BTC, <8 x i16> %V1, <8 x i16> %V2) { ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: .LCPI1_0: +; CHECK-NEXT: .LCPI2_0: ; CHECK-NEXT: .long 0 @ 0x0 ; CHECK-NEXT: .long 1 @ 0x1 ; CHECK-NEXT: .long 2 @ 0x2 ; CHECK-NEXT: .long 3 @ 0x3 -; CHECK-NEXT: .LCPI1_1: +; CHECK-NEXT: .LCPI2_1: ; CHECK-NEXT: .long 4 @ 0x4 ; CHECK-NEXT: .long 5 @ 0x5 ; CHECK-NEXT: .long 6 @ 0x6 @@ -122,13 +199,13 @@ define <16 x i8> @v16i8(i32 %index, i32 %BTC, <16 x i8> %V1, <16 x i8> %V2) { ; CHECK: @ %bb.0: ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: adr.w r12, .LCPI2_0 +; CHECK-NEXT: adr.w r12, .LCPI3_0 ; CHECK-NEXT: vdup.32 q7, r1 ; CHECK-NEXT: vldrw.u32 q0, [r12] ; CHECK-NEXT: vmov.i8 q5, #0x0 ; CHECK-NEXT: vmov.i8 q4, #0xff ; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vcmp.u32 cs, q7, q1 +; CHECK-NEXT: vcmp.u32 hi, q7, q1 ; CHECK-NEXT: vpsel q0, q4, q5 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: vmov.16 q2[0], r1 @@ -138,10 +215,10 @@ define <16 x i8> @v16i8(i32 %index, i32 %BTC, <16 x i8> %V1, <16 x i8> %V2) { ; CHECK-NEXT: vmov.16 q2[2], r1 ; CHECK-NEXT: vmov r1, s3 ; CHECK-NEXT: vmov.16 q2[3], r1 -; CHECK-NEXT: adr r1, .LCPI2_1 +; CHECK-NEXT: adr r1, .LCPI3_1 ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: vadd.i32 q3, q0, r0 -; CHECK-NEXT: vcmp.u32 cs, q7, q3 +; CHECK-NEXT: vcmp.u32 hi, q7, q3 ; CHECK-NEXT: vpsel q0, q4, q5 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: vmov.16 q2[4], r1 @@ -169,10 +246,10 @@ define <16 x i8> @v16i8(i32 %index, i32 %BTC, <16 x i8> %V1, <16 x i8> %V2) { ; CHECK-NEXT: vmov.8 q2[6], r1 ; CHECK-NEXT: vmov.u16 r1, q0[7] ; CHECK-NEXT: vmov.8 q2[7], r1 -; CHECK-NEXT: adr r1, .LCPI2_2 +; CHECK-NEXT: adr r1, .LCPI3_2 ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vcmp.u32 cs, q7, q0 +; CHECK-NEXT: vcmp.u32 hi, q7, q0 ; CHECK-NEXT: vstrw.32 q0, [sp] @ 
16-byte Spill ; CHECK-NEXT: vpsel q6, q4, q5 ; CHECK-NEXT: vmov r1, s24 @@ -183,10 +260,10 @@ define <16 x i8> @v16i8(i32 %index, i32 %BTC, <16 x i8> %V1, <16 x i8> %V2) { ; CHECK-NEXT: vmov.16 q0[2], r1 ; CHECK-NEXT: vmov r1, s27 ; CHECK-NEXT: vmov.16 q0[3], r1 -; CHECK-NEXT: adr r1, .LCPI2_3 +; CHECK-NEXT: adr r1, .LCPI3_3 ; CHECK-NEXT: vldrw.u32 q6, [r1] ; CHECK-NEXT: vadd.i32 q6, q6, r0 -; CHECK-NEXT: vcmp.u32 cs, q7, q6 +; CHECK-NEXT: vcmp.u32 hi, q7, q6 ; CHECK-NEXT: vpsel q7, q4, q5 ; CHECK-NEXT: vmov r1, s28 ; CHECK-NEXT: vmov.16 q0[4], r1 @@ -308,22 +385,22 @@ define <16 x i8> @v16i8(i32 %index, i32 %BTC, <16 x i8> %V1, <16 x i8> %V2) { ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: .LCPI2_0: +; CHECK-NEXT: .LCPI3_0: ; CHECK-NEXT: .long 0 @ 0x0 ; CHECK-NEXT: .long 1 @ 0x1 ; CHECK-NEXT: .long 2 @ 0x2 ; CHECK-NEXT: .long 3 @ 0x3 -; CHECK-NEXT: .LCPI2_1: +; CHECK-NEXT: .LCPI3_1: ; CHECK-NEXT: .long 4 @ 0x4 ; CHECK-NEXT: .long 5 @ 0x5 ; CHECK-NEXT: .long 6 @ 0x6 ; CHECK-NEXT: .long 7 @ 0x7 -; CHECK-NEXT: .LCPI2_2: +; CHECK-NEXT: .LCPI3_2: ; CHECK-NEXT: .long 8 @ 0x8 ; CHECK-NEXT: .long 9 @ 0x9 ; CHECK-NEXT: .long 10 @ 0xa ; CHECK-NEXT: .long 11 @ 0xb -; CHECK-NEXT: .LCPI2_3: +; CHECK-NEXT: .LCPI3_3: ; CHECK-NEXT: .long 12 @ 0xc ; CHECK-NEXT: .long 13 @ 0xd ; CHECK-NEXT: .long 14 @ 0xe @@ -334,5 +411,6 @@ define <16 x i8> @v16i8(i32 %index, i32 %BTC, <16 x i8> %V1, <16 x i8> %V2) { } declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) +declare <7 x i1> @llvm.get.active.lane.mask.v7i1.i32(i32, i32) declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/constant-hoisting.ll b/llvm/test/CodeGen/Thumb2/constant-hoisting.ll index 5c8f934ce61d3..161259fa5e233 100644 --- a/llvm/test/CodeGen/Thumb2/constant-hoisting.ll +++ b/llvm/test/CodeGen/Thumb2/constant-hoisting.ll @@ -122,28 +122,29 @@ define i32 
@test_addr(i32 %a, i8* nocapture readonly %b) { ; CHECK-V7M: mov r2, r0 ; CHECK-V7M-NEXT: movs r0, #0 ; CHECK-V7M-NEXT: cmp r2, #29 -; CHECK-V7M-NEXT: bgt .LBB1_3 -; CHECK-V7M-NEXT: cbz r2, .LBB1_6 +; CHECK-V7M-NEXT: bgt .LBB1_4 +; CHECK-V7M-NEXT: cbz r2, .LBB1_7 ; CHECK-V7M-NEXT: cmp r2, #1 ; CHECK-V7M-NEXT: it ne ; CHECK-V7M-NEXT: bxne lr -; CHECK-V7M-NEXT: movw r0, #305 -; CHECK-V7M-NEXT: b .LBB1_8 ; CHECK-V7M-NEXT: .LBB1_3: +; CHECK-V7M-NEXT: movw r0, #305 +; CHECK-V7M-NEXT: b .LBB1_9 +; CHECK-V7M-NEXT: .LBB1_4: ; CHECK-V7M-NEXT: cmp r2, #30 -; CHECK-V7M-NEXT: beq .LBB1_7 +; CHECK-V7M-NEXT: beq .LBB1_8 ; CHECK-V7M-NEXT: cmp r2, #50 -; CHECK-V7M-NEXT: bne .LBB1_9 +; CHECK-V7M-NEXT: bne .LBB1_10 ; CHECK-V7M-NEXT: movw r0, #307 -; CHECK-V7M-NEXT: b .LBB1_8 -; CHECK-V7M-NEXT: .LBB1_6: -; CHECK-V7M-NEXT: mov.w r0, #304 -; CHECK-V7M-NEXT: b .LBB1_8 +; CHECK-V7M-NEXT: b .LBB1_9 ; CHECK-V7M-NEXT: .LBB1_7: -; CHECK-V7M-NEXT: mov.w r0, #306 +; CHECK-V7M-NEXT: mov.w r0, #304 +; CHECK-V7M-NEXT: b .LBB1_9 ; CHECK-V7M-NEXT: .LBB1_8: -; CHECK-V7M-NEXT: ldrb r0, [r1, r0] +; CHECK-V7M-NEXT: mov.w r0, #306 ; CHECK-V7M-NEXT: .LBB1_9: +; CHECK-V7M-NEXT: ldrb r0, [r1, r0] +; CHECK-V7M-NEXT: .LBB1_10: ; CHECK-V7M-NEXT: bx lr entry: switch i32 %a, label %return [ diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll index 1a53561388138..d364eb97fff72 100644 --- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll @@ -7,15 +7,16 @@ define arm_aapcs_vfpcc void @test_fadd(half* noalias nocapture readonly %A, half ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: .LBB0_1: @ %vector.ph ; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: subs r3, #8 ; CHECK-NEXT: vadd.f16 q0, q0, r1 ; CHECK-NEXT: 
vstrb.8 q0, [r2], #16 -; CHECK-NEXT: bne .LBB0_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bne .LBB0_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: bx lr entry: %B = load half, half* %BB @@ -53,15 +54,16 @@ define arm_aapcs_vfpcc void @test_fadd_r(half* noalias nocapture readonly %A, ha ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: .LBB1_1: @ %vector.ph ; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: subs r3, #8 ; CHECK-NEXT: vadd.f16 q0, q0, r1 ; CHECK-NEXT: vstrb.8 q0, [r2], #16 -; CHECK-NEXT: bne .LBB1_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bne .LBB1_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: bx lr entry: %B = load half, half* %BB @@ -99,15 +101,16 @@ define arm_aapcs_vfpcc void @test_fmul(half* noalias nocapture readonly %A, half ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: .LBB2_1: @ %vector.ph ; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: .LBB2_1: @ %vector.body +; CHECK-NEXT: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: subs r3, #8 ; CHECK-NEXT: vmul.f16 q0, q0, r1 ; CHECK-NEXT: vstrb.8 q0, [r2], #16 -; CHECK-NEXT: bne .LBB2_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bne .LBB2_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: bx lr entry: %B = load half, half* %BB @@ -145,15 +148,16 @@ define arm_aapcs_vfpcc void @test_fmul_r(half* noalias nocapture readonly %A, ha ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: .LBB3_1: @ %vector.ph ; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: .LBB3_1: @ %vector.body +; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 
; CHECK-NEXT: subs r3, #8 ; CHECK-NEXT: vmul.f16 q0, q0, r1 ; CHECK-NEXT: vstrb.8 q0, [r2], #16 -; CHECK-NEXT: bne .LBB3_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bne .LBB3_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: bx lr entry: %B = load half, half* %BB @@ -191,15 +195,16 @@ define arm_aapcs_vfpcc void @test_fsub(half* noalias nocapture readonly %A, half ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: .LBB4_1: @ %vector.ph ; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: .LBB4_1: @ %vector.body +; CHECK-NEXT: .LBB4_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: subs r3, #8 ; CHECK-NEXT: vsub.f16 q0, q0, r1 ; CHECK-NEXT: vstrb.8 q0, [r2], #16 -; CHECK-NEXT: bne .LBB4_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bne .LBB4_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: bx lr entry: %B = load half, half* %BB @@ -237,16 +242,17 @@ define arm_aapcs_vfpcc void @test_fsub_r(half* noalias nocapture readonly %A, ha ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: .LBB5_1: @ %vector.ph ; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: vdup.16 q0, r1 -; CHECK-NEXT: .LBB5_1: @ %vector.body +; CHECK-NEXT: .LBB5_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: subs r3, #8 ; CHECK-NEXT: vsub.f16 q1, q0, q1 ; CHECK-NEXT: vstrb.8 q1, [r2], #16 -; CHECK-NEXT: bne .LBB5_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bne .LBB5_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: bx lr entry: %B = load half, half* %BB @@ -286,16 +292,17 @@ define arm_aapcs_vfpcc void @test_fmas(half* noalias nocapture readonly %A, half ; CHECK-NEXT: cmp.w r12, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: .LBB6_1: @ %vector.ph ; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: .LBB6_1: @ %vector.body +; CHECK-NEXT: 
.LBB6_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: subs.w r12, r12, #8 ; CHECK-NEXT: vfmas.f16 q1, q0, r2 ; CHECK-NEXT: vstrb.8 q1, [r3], #16 -; CHECK-NEXT: bne .LBB6_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bne .LBB6_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: bx lr entry: %C = load half, half* %CC @@ -338,16 +345,17 @@ define arm_aapcs_vfpcc void @test_fmas_r(half* noalias nocapture readonly %A, ha ; CHECK-NEXT: cmp.w r12, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: .LBB7_1: @ %vector.ph ; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: .LBB7_1: @ %vector.body +; CHECK-NEXT: .LBB7_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: subs.w r12, r12, #8 ; CHECK-NEXT: vfmas.f16 q1, q0, r2 ; CHECK-NEXT: vstrb.8 q1, [r3], #16 -; CHECK-NEXT: bne .LBB7_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bne .LBB7_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: bx lr entry: %C = load half, half* %CC @@ -390,16 +398,17 @@ define arm_aapcs_vfpcc void @test_fma(half* noalias nocapture readonly %A, half* ; CHECK-NEXT: cmp.w r12, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: .LBB8_1: @ %vector.ph ; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: .LBB8_1: @ %vector.body +; CHECK-NEXT: .LBB8_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: subs.w r12, r12, #8 ; CHECK-NEXT: vfma.f16 q1, q0, r2 ; CHECK-NEXT: vstrb.8 q1, [r3], #16 -; CHECK-NEXT: bne .LBB8_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bne .LBB8_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: bx lr entry: %C = load half, half* %CC @@ -442,16 +451,17 @@ define arm_aapcs_vfpcc void 
@test_fma_r(half* noalias nocapture readonly %A, hal ; CHECK-NEXT: cmp.w r12, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: .LBB9_1: @ %vector.ph ; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: .LBB9_1: @ %vector.body +; CHECK-NEXT: .LBB9_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: subs.w r12, r12, #8 ; CHECK-NEXT: vfma.f16 q1, q0, r2 ; CHECK-NEXT: vstrb.8 q1, [r3], #16 -; CHECK-NEXT: bne .LBB9_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bne .LBB9_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: bx lr entry: %C = load half, half* %CC @@ -495,10 +505,11 @@ define arm_aapcs_vfpcc void @test_fmss(half* noalias nocapture readonly %A, half ; CHECK-NEXT: cmp.w r12, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: .LBB10_1: @ %vector.ph ; CHECK-NEXT: ldrh r2, [r2] ; CHECK-NEXT: vdup.16 q0, r2 ; CHECK-NEXT: vneg.f16 q0, q0 -; CHECK-NEXT: .LBB10_1: @ %vector.body +; CHECK-NEXT: .LBB10_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: vldrw.u32 q2, [r1], #16 @@ -506,8 +517,8 @@ define arm_aapcs_vfpcc void @test_fmss(half* noalias nocapture readonly %A, half ; CHECK-NEXT: subs.w r12, r12, #8 ; CHECK-NEXT: vfma.f16 q3, q2, q1 ; CHECK-NEXT: vstrb.8 q3, [r3], #16 -; CHECK-NEXT: bne .LBB10_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bne .LBB10_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: bx lr entry: %C = load half, half* %CC @@ -550,9 +561,10 @@ define arm_aapcs_vfpcc void @test_fmss_r(half* noalias nocapture readonly %A, ha ; CHECK-NEXT: cmp.w r12, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: .LBB11_1: @ %vector.ph ; CHECK-NEXT: ldrh r2, [r2] ; CHECK-NEXT: vdup.16 q0, r2 -; CHECK-NEXT: .LBB11_1: @ %vector.body +; CHECK-NEXT: .LBB11_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; 
CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: vldrw.u32 q2, [r1], #16 @@ -560,8 +572,8 @@ define arm_aapcs_vfpcc void @test_fmss_r(half* noalias nocapture readonly %A, ha ; CHECK-NEXT: subs.w r12, r12, #8 ; CHECK-NEXT: vfms.f16 q3, q2, q1 ; CHECK-NEXT: vstrb.8 q3, [r3], #16 -; CHECK-NEXT: bne .LBB11_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bne .LBB11_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: bx lr entry: %C = load half, half* %CC @@ -604,8 +616,9 @@ define arm_aapcs_vfpcc void @test_fms(half* noalias nocapture readonly %A, half* ; CHECK-NEXT: cmp.w r12, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: .LBB12_1: @ %vector.ph ; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: .LBB12_1: @ %vector.body +; CHECK-NEXT: .LBB12_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 @@ -613,8 +626,8 @@ define arm_aapcs_vfpcc void @test_fms(half* noalias nocapture readonly %A, half* ; CHECK-NEXT: vneg.f16 q0, q0 ; CHECK-NEXT: vfma.f16 q0, q1, r2 ; CHECK-NEXT: vstrb.8 q0, [r3], #16 -; CHECK-NEXT: bne .LBB12_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bne .LBB12_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: bx lr entry: %C = load half, half* %CC @@ -657,8 +670,9 @@ define arm_aapcs_vfpcc void @test_fms_r(half* noalias nocapture readonly %A, hal ; CHECK-NEXT: cmp.w r12, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: .LBB13_1: @ %vector.ph ; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: .LBB13_1: @ %vector.body +; CHECK-NEXT: .LBB13_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 @@ -666,8 +680,8 @@ define arm_aapcs_vfpcc void @test_fms_r(half* noalias nocapture readonly %A, hal ; CHECK-NEXT: vneg.f16 q0, q0 ; CHECK-NEXT: vfma.f16 q0, q1, r2 ; CHECK-NEXT: vstrb.8 q0, [r3], #16 -; CHECK-NEXT: bne .LBB13_1 -; 
CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bne .LBB13_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: bx lr entry: %C = load half, half* %CC diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll index bf6b57f223a46..165bf72c7187d 100644 --- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll @@ -7,15 +7,16 @@ define arm_aapcs_vfpcc void @test_fadd(float* noalias nocapture readonly %A, flo ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: .LBB0_1: @ %vector.ph ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vadd.f32 q0, q0, r3 ; CHECK-NEXT: vstrb.8 q0, [r1], #16 -; CHECK-NEXT: bne .LBB0_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bne .LBB0_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: bx lr entry: %0 = and i32 %n, 7 @@ -52,15 +53,16 @@ define arm_aapcs_vfpcc void @test_fadd_r(float* noalias nocapture readonly %A, f ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: .LBB1_1: @ %vector.ph ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vadd.f32 q0, q0, r3 ; CHECK-NEXT: vstrb.8 q0, [r1], #16 -; CHECK-NEXT: bne .LBB1_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bne .LBB1_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: bx lr entry: %0 = and i32 %n, 7 @@ -97,15 +99,16 @@ define arm_aapcs_vfpcc void @test_fmul(float* noalias nocapture readonly %A, flo ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: .LBB2_1: @ %vector.ph ; 
CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: .LBB2_1: @ %vector.body +; CHECK-NEXT: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vmul.f32 q0, q0, r3 ; CHECK-NEXT: vstrb.8 q0, [r1], #16 -; CHECK-NEXT: bne .LBB2_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bne .LBB2_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: bx lr entry: %0 = and i32 %n, 7 @@ -142,15 +145,16 @@ define arm_aapcs_vfpcc void @test_fmul_r(float* noalias nocapture readonly %A, f ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: .LBB3_1: @ %vector.ph ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: .LBB3_1: @ %vector.body +; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vmul.f32 q0, q0, r3 ; CHECK-NEXT: vstrb.8 q0, [r1], #16 -; CHECK-NEXT: bne .LBB3_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bne .LBB3_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: bx lr entry: %0 = and i32 %n, 7 @@ -187,15 +191,16 @@ define arm_aapcs_vfpcc void @test_fsub(float* noalias nocapture readonly %A, flo ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: .LBB4_1: @ %vector.ph ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: .LBB4_1: @ %vector.body +; CHECK-NEXT: .LBB4_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vsub.f32 q0, q0, r3 ; CHECK-NEXT: vstrb.8 q0, [r1], #16 -; CHECK-NEXT: bne .LBB4_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bne .LBB4_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: bx lr entry: %0 = and i32 %n, 7 @@ -232,16 +237,17 @@ define arm_aapcs_vfpcc void @test_fsub_r(float* noalias nocapture readonly %A, f ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: it lt ; 
CHECK-NEXT: bxlt lr +; CHECK-NEXT: .LBB5_1: @ %vector.ph ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vdup.32 q0, r3 -; CHECK-NEXT: .LBB5_1: @ %vector.body +; CHECK-NEXT: .LBB5_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vsub.f32 q1, q0, q1 ; CHECK-NEXT: vstrb.8 q1, [r1], #16 -; CHECK-NEXT: bne .LBB5_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bne .LBB5_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: bx lr entry: %0 = and i32 %n, 7 @@ -279,16 +285,17 @@ define arm_aapcs_vfpcc void @test_fmas(float* noalias nocapture readonly %A, flo ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: .LBB6_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: .LBB6_1: @ %vector.body +; CHECK-NEXT: .LBB6_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vfmas.f32 q1, q0, r12 ; CHECK-NEXT: vstrb.8 q1, [r2], #16 -; CHECK-NEXT: bne .LBB6_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bne .LBB6_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: bx lr entry: %0 = and i32 %n, 7 @@ -329,16 +336,17 @@ define arm_aapcs_vfpcc void @test_fmas_r(float* noalias nocapture readonly %A, f ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: .LBB7_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: .LBB7_1: @ %vector.body +; CHECK-NEXT: .LBB7_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vfmas.f32 q1, q0, r12 ; CHECK-NEXT: vstrb.8 q1, [r2], #16 -; CHECK-NEXT: bne .LBB7_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bne .LBB7_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: bx lr entry: %0 = and 
i32 %n, 7 @@ -379,16 +387,17 @@ define arm_aapcs_vfpcc void @test_fma(float* noalias nocapture readonly %A, floa ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: .LBB8_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: .LBB8_1: @ %vector.body +; CHECK-NEXT: .LBB8_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vfma.f32 q1, q0, r12 ; CHECK-NEXT: vstrb.8 q1, [r2], #16 -; CHECK-NEXT: bne .LBB8_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bne .LBB8_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: bx lr entry: %0 = and i32 %n, 7 @@ -429,16 +438,17 @@ define arm_aapcs_vfpcc void @test_fma_r(float* noalias nocapture readonly %A, fl ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: .LBB9_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: .LBB9_1: @ %vector.body +; CHECK-NEXT: .LBB9_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vfma.f32 q1, q0, r12 ; CHECK-NEXT: vstrb.8 q1, [r2], #16 -; CHECK-NEXT: bne .LBB9_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bne .LBB9_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: bx lr entry: %0 = and i32 %n, 7 @@ -480,10 +490,11 @@ define arm_aapcs_vfpcc void @test_fmss(float* noalias nocapture readonly %A, flo ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: .LBB10_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 ; CHECK-NEXT: vdup.32 q0, r12 ; CHECK-NEXT: vneg.f32 q0, q0 -; CHECK-NEXT: .LBB10_1: @ %vector.body +; CHECK-NEXT: .LBB10_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: vldrw.u32 q2, [r1], #16 @@ -491,8 +502,8 @@ define arm_aapcs_vfpcc 
void @test_fmss(float* noalias nocapture readonly %A, flo ; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vfma.f32 q3, q2, q1 ; CHECK-NEXT: vstrb.8 q3, [r2], #16 -; CHECK-NEXT: bne .LBB10_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bne .LBB10_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: bx lr entry: %0 = and i32 %n, 7 @@ -533,9 +544,10 @@ define arm_aapcs_vfpcc void @test_fmss_r(float* noalias nocapture readonly %A, f ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: .LBB11_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 ; CHECK-NEXT: vdup.32 q0, r12 -; CHECK-NEXT: .LBB11_1: @ %vector.body +; CHECK-NEXT: .LBB11_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: vldrw.u32 q2, [r1], #16 @@ -543,8 +555,8 @@ define arm_aapcs_vfpcc void @test_fmss_r(float* noalias nocapture readonly %A, f ; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vfms.f32 q3, q2, q1 ; CHECK-NEXT: vstrb.8 q3, [r2], #16 -; CHECK-NEXT: bne .LBB11_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bne .LBB11_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: bx lr entry: %0 = and i32 %n, 7 @@ -585,8 +597,9 @@ define arm_aapcs_vfpcc void @test_fms(float* noalias nocapture readonly %A, floa ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: .LBB12_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: .LBB12_1: @ %vector.body +; CHECK-NEXT: .LBB12_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 @@ -594,8 +607,8 @@ define arm_aapcs_vfpcc void @test_fms(float* noalias nocapture readonly %A, floa ; CHECK-NEXT: vneg.f32 q0, q0 ; CHECK-NEXT: vfma.f32 q0, q1, r12 ; CHECK-NEXT: vstrb.8 q0, [r2], #16 -; CHECK-NEXT: bne .LBB12_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bne .LBB12_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: 
bx lr entry: %0 = and i32 %n, 7 @@ -636,8 +649,9 @@ define arm_aapcs_vfpcc void @test_fms_r(float* noalias nocapture readonly %A, fl ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: .LBB13_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: .LBB13_1: @ %vector.body +; CHECK-NEXT: .LBB13_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 @@ -645,8 +659,8 @@ define arm_aapcs_vfpcc void @test_fms_r(float* noalias nocapture readonly %A, fl ; CHECK-NEXT: vneg.f32 q0, q0 ; CHECK-NEXT: vfma.f32 q0, q1, r12 ; CHECK-NEXT: vstrb.8 q0, [r2], #16 -; CHECK-NEXT: bne .LBB13_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bne .LBB13_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: bx lr entry: %0 = and i32 %n, 7 diff --git a/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll b/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll index 306f31be27f96..86cbec661f1f5 100644 --- a/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll @@ -9,18 +9,19 @@ define arm_aapcs_vfpcc void @fmas1(float* nocapture readonly %x, float* nocaptur ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r4, pc} +; CHECK-NEXT: .LBB0_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 ; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r3 -; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: vfmas.f32 q1, q0, r12 ; CHECK-NEXT: vstrw.32 q1, [r2], #16 -; CHECK-NEXT: letp lr, .LBB0_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: letp lr, .LBB0_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp sgt i32 %n, 0 @@ -44,7 +45,7 @@ vector.body: ; preds = %vector.body, %vecto %0 = getelementptr inbounds 
float, float* %x, i32 %index ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 - %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) @@ -71,18 +72,19 @@ define arm_aapcs_vfpcc void @fmas2(float* nocapture readonly %x, float* nocaptur ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r4, pc} +; CHECK-NEXT: .LBB1_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 ; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r3 -; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: vfmas.f32 q1, q0, r12 ; CHECK-NEXT: vstrw.32 q1, [r2], #16 -; CHECK-NEXT: letp lr, .LBB1_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: letp lr, .LBB1_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp sgt i32 %n, 0 @@ -106,7 +108,7 @@ vector.body: ; preds = %vector.body, %vecto %0 = getelementptr inbounds float, float* %x, i32 %index ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 - %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) @@ -134,18 +136,19 @@ define arm_aapcs_vfpcc void @fma1(float* nocapture readonly %x, float* nocapture ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r4, pc} +; CHECK-NEXT: .LBB2_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 ; CHECK-NEXT: 
movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r3 -; CHECK-NEXT: .LBB2_1: @ %vector.body +; CHECK-NEXT: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: vfma.f32 q1, q0, r12 ; CHECK-NEXT: vstrw.32 q1, [r2], #16 -; CHECK-NEXT: letp lr, .LBB2_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: letp lr, .LBB2_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp sgt i32 %n, 0 @@ -169,7 +172,7 @@ vector.body: ; preds = %vector.body, %vecto %0 = getelementptr inbounds float, float* %x, i32 %index ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 - %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) @@ -196,18 +199,19 @@ define arm_aapcs_vfpcc void @fma2(float* nocapture readonly %x, float* nocapture ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r4, pc} +; CHECK-NEXT: .LBB3_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 ; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r3 -; CHECK-NEXT: .LBB3_1: @ %vector.body +; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: vfma.f32 q1, q0, r12 ; CHECK-NEXT: vstrw.32 q1, [r2], #16 -; CHECK-NEXT: letp lr, .LBB3_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: letp lr, .LBB3_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp sgt i32 %n, 0 @@ -231,7 +235,7 @@ vector.body: ; preds = %vector.body, %vecto %0 = getelementptr inbounds float, float* %x, i32 
%index ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 - %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) @@ -259,19 +263,20 @@ define arm_aapcs_vfpcc void @fmss1(float* nocapture readonly %x, float* nocaptur ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r4, pc} +; CHECK-NEXT: .LBB4_1: @ %vector.ph ; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: eor r12, r4, #-2147483648 ; CHECK-NEXT: movs r4, #0 -; CHECK-NEXT: .LBB4_1: @ %vector.body +; CHECK-NEXT: .LBB4_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: vfmas.f32 q1, q0, r12 ; CHECK-NEXT: vstrw.32 q1, [r2], #16 -; CHECK-NEXT: letp lr, .LBB4_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: letp lr, .LBB4_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp sgt i32 %n, 0 @@ -296,7 +301,7 @@ vector.body: ; preds = %vector.body, %vecto %0 = getelementptr inbounds float, float* %x, i32 %index ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 - %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) @@ -323,12 +328,13 @@ define arm_aapcs_vfpcc void @fmss2(float* nocapture readonly %x, float* nocaptur ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r4, pc} +; CHECK-NEXT: .LBB5_1: @ %vector.ph ; CHECK-NEXT: vmov 
r4, s0 ; CHECK-NEXT: vdup.32 q0, r4 ; CHECK-NEXT: vneg.f32 q0, q0 ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 -; CHECK-NEXT: .LBB5_1: @ %vector.body +; CHECK-NEXT: .LBB5_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vmov q3, q0 @@ -336,8 +342,8 @@ define arm_aapcs_vfpcc void @fmss2(float* nocapture readonly %x, float* nocaptur ; CHECK-NEXT: vldrw.u32 q2, [r1], #16 ; CHECK-NEXT: vfma.f32 q3, q2, q1 ; CHECK-NEXT: vstrw.32 q3, [r2], #16 -; CHECK-NEXT: letp lr, .LBB5_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: letp lr, .LBB5_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp sgt i32 %n, 0 @@ -361,7 +367,7 @@ vector.body: ; preds = %vector.body, %vecto %0 = getelementptr inbounds float, float* %x, i32 %index ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 - %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) @@ -389,11 +395,12 @@ define arm_aapcs_vfpcc void @fmss3(float* nocapture readonly %x, float* nocaptur ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} +; CHECK-NEXT: .LBB6_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 ; CHECK-NEXT: vdup.32 q0, r12 ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 -; CHECK-NEXT: .LBB6_1: @ %vector.body +; CHECK-NEXT: .LBB6_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vmov q3, q0 @@ -401,8 +408,8 @@ define arm_aapcs_vfpcc void @fmss3(float* nocapture readonly %x, float* nocaptur ; CHECK-NEXT: vldrw.u32 q2, [r0], #16 ; CHECK-NEXT: vfms.f32 q3, q2, q1 ; CHECK-NEXT: vstrw.32 q3, [r2], #16 -; CHECK-NEXT: letp lr, 
.LBB6_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: letp lr, .LBB6_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %cmp8 = icmp sgt i32 %n, 0 @@ -426,7 +433,7 @@ vector.body: ; preds = %vector.body, %vecto %0 = getelementptr inbounds float, float* %x, i32 %index ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 - %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) @@ -454,11 +461,12 @@ define arm_aapcs_vfpcc void @fmss4(float* nocapture readonly %x, float* nocaptur ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} +; CHECK-NEXT: .LBB7_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 ; CHECK-NEXT: vdup.32 q0, r12 ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 -; CHECK-NEXT: .LBB7_1: @ %vector.body +; CHECK-NEXT: .LBB7_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vmov q3, q0 @@ -466,8 +474,8 @@ define arm_aapcs_vfpcc void @fmss4(float* nocapture readonly %x, float* nocaptur ; CHECK-NEXT: vldrw.u32 q2, [r1], #16 ; CHECK-NEXT: vfms.f32 q3, q2, q1 ; CHECK-NEXT: vstrw.32 q3, [r2], #16 -; CHECK-NEXT: letp lr, .LBB7_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: letp lr, .LBB7_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %cmp8 = icmp sgt i32 %n, 0 @@ -491,7 +499,7 @@ vector.body: ; preds = %vector.body, %vecto %0 = getelementptr inbounds float, float* %x, i32 %index ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 - %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) %2 = 
bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) @@ -519,19 +527,20 @@ define arm_aapcs_vfpcc void @fms1(float* nocapture readonly %x, float* nocapture ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r4, pc} +; CHECK-NEXT: .LBB8_1: @ %vector.ph ; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: eor r12, r4, #-2147483648 ; CHECK-NEXT: movs r4, #0 -; CHECK-NEXT: .LBB8_1: @ %vector.body +; CHECK-NEXT: .LBB8_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: vfma.f32 q1, q0, r12 ; CHECK-NEXT: vstrw.32 q1, [r2], #16 -; CHECK-NEXT: letp lr, .LBB8_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: letp lr, .LBB8_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp sgt i32 %n, 0 @@ -556,7 +565,7 @@ vector.body: ; preds = %vector.body, %vecto %0 = getelementptr inbounds float, float* %x, i32 %index ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 - %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) @@ -583,19 +592,20 @@ define arm_aapcs_vfpcc void @fms2(float* nocapture readonly %x, float* nocapture ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} +; CHECK-NEXT: .LBB9_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 ; CHECK-NEXT: vdup.32 q0, r12 ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 -; CHECK-NEXT: .LBB9_1: @ %vector.body +; CHECK-NEXT: .LBB9_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: add.w 
r12, r12, #4 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: vldrw.u32 q2, [r1], #16 ; CHECK-NEXT: vfms.f32 q2, q1, q0 ; CHECK-NEXT: vstrw.32 q2, [r2], #16 -; CHECK-NEXT: letp lr, .LBB9_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: letp lr, .LBB9_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %cmp8 = icmp sgt i32 %n, 0 @@ -619,7 +629,7 @@ vector.body: ; preds = %vector.body, %vecto %0 = getelementptr inbounds float, float* %x, i32 %index ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 - %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) @@ -647,10 +657,11 @@ define arm_aapcs_vfpcc void @fms3(float* nocapture readonly %x, float* nocapture ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r4, pc} +; CHECK-NEXT: .LBB10_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 ; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r3 -; CHECK-NEXT: .LBB10_1: @ %vector.body +; CHECK-NEXT: .LBB10_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 @@ -658,8 +669,8 @@ define arm_aapcs_vfpcc void @fms3(float* nocapture readonly %x, float* nocapture ; CHECK-NEXT: vneg.f32 q1, q1 ; CHECK-NEXT: vfma.f32 q1, q0, r12 ; CHECK-NEXT: vstrw.32 q1, [r2], #16 -; CHECK-NEXT: letp lr, .LBB10_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: letp lr, .LBB10_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp sgt i32 %n, 0 @@ -683,7 +694,7 @@ vector.body: ; preds = %vector.body, %vecto %0 = getelementptr inbounds float, float* %x, i32 %index ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 - %1 = call 
<4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) @@ -711,10 +722,11 @@ define arm_aapcs_vfpcc void @fms4(float* nocapture readonly %x, float* nocapture ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r4, pc} +; CHECK-NEXT: .LBB11_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 ; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r3 -; CHECK-NEXT: .LBB11_1: @ %vector.body +; CHECK-NEXT: .LBB11_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 @@ -722,8 +734,8 @@ define arm_aapcs_vfpcc void @fms4(float* nocapture readonly %x, float* nocapture ; CHECK-NEXT: vneg.f32 q1, q1 ; CHECK-NEXT: vfma.f32 q1, q0, r12 ; CHECK-NEXT: vstrw.32 q1, [r2], #16 -; CHECK-NEXT: letp lr, .LBB11_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: letp lr, .LBB11_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp sgt i32 %n, 0 @@ -747,7 +759,7 @@ vector.body: ; preds = %vector.body, %vecto %0 = getelementptr inbounds float, float* %x, i32 %index ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 - %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll index 19bece70dd05f..3cc5915a08ca3 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll +++ 
b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll @@ -317,6 +317,7 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_simple(i32* noalias nocapture read ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r4, pc} +; CHECK-NEXT: .LBB8_1: @ %vector.ph.preheader ; CHECK-NEXT: bic r12, r2, #3 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: sub.w lr, r12, #4 @@ -324,26 +325,26 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_simple(i32* noalias nocapture read ; CHECK-NEXT: adr r3, .LCPI8_0 ; CHECK-NEXT: vldrw.u32 q0, [r3] ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: .LBB8_1: @ %vector.ph +; CHECK-NEXT: .LBB8_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 -; CHECK-NEXT: @ Child Loop BB8_2 Depth 2 +; CHECK-NEXT: @ Child Loop BB8_3 Depth 2 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: dls lr, r4 -; CHECK-NEXT: .LBB8_2: @ %vector.body -; CHECK-NEXT: @ Parent Loop BB8_1 Depth=1 +; CHECK-NEXT: .LBB8_3: @ %vector.body +; CHECK-NEXT: @ Parent Loop BB8_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vldrw.u32 q2, [q1, #16]! 
; CHECK-NEXT: vstrb.8 q2, [r0], #16 -; CHECK-NEXT: le lr, .LBB8_2 -; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: @ in Loop: Header=BB8_1 Depth=1 +; CHECK-NEXT: le lr, .LBB8_3 +; CHECK-NEXT: @ %bb.4: @ %middle.block +; CHECK-NEXT: @ in Loop: Header=BB8_2 Depth=1 ; CHECK-NEXT: cmp r12, r2 -; CHECK-NEXT: bne .LBB8_1 -; CHECK-NEXT: @ %bb.4: @ %for.cond.cleanup +; CHECK-NEXT: bne .LBB8_2 +; CHECK-NEXT: @ %bb.5: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.5: +; CHECK-NEXT: @ %bb.6: ; CHECK-NEXT: .LCPI8_0: ; CHECK-NEXT: .long 4294967280 @ 0xfffffff0 ; CHECK-NEXT: .long 4294967284 @ 0xfffffff4 @@ -489,6 +490,7 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_large(i32* noalias nocapture reado ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r4, pc} +; CHECK-NEXT: .LBB10_1: @ %vector.ph.preheader ; CHECK-NEXT: bic r12, r2, #3 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: sub.w lr, r12, #4 @@ -496,26 +498,26 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_large(i32* noalias nocapture reado ; CHECK-NEXT: adr r3, .LCPI10_0 ; CHECK-NEXT: vldrw.u32 q0, [r3] ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: .LBB10_1: @ %vector.ph +; CHECK-NEXT: .LBB10_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 -; CHECK-NEXT: @ Child Loop BB10_2 Depth 2 +; CHECK-NEXT: @ Child Loop BB10_3 Depth 2 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: dls lr, r4 -; CHECK-NEXT: .LBB10_2: @ %vector.body -; CHECK-NEXT: @ Parent Loop BB10_1 Depth=1 +; CHECK-NEXT: .LBB10_3: @ %vector.body +; CHECK-NEXT: @ Parent Loop BB10_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vldrw.u32 q2, [q1, #508]! 
; CHECK-NEXT: vstrb.8 q2, [r0], #16 -; CHECK-NEXT: le lr, .LBB10_2 -; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: @ in Loop: Header=BB10_1 Depth=1 +; CHECK-NEXT: le lr, .LBB10_3 +; CHECK-NEXT: @ %bb.4: @ %middle.block +; CHECK-NEXT: @ in Loop: Header=BB10_2 Depth=1 ; CHECK-NEXT: cmp r12, r2 -; CHECK-NEXT: bne .LBB10_1 -; CHECK-NEXT: @ %bb.4: @ %for.cond.cleanup +; CHECK-NEXT: bne .LBB10_2 +; CHECK-NEXT: @ %bb.5: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.5: +; CHECK-NEXT: @ %bb.6: ; CHECK-NEXT: .LCPI10_0: ; CHECK-NEXT: .long 4294966788 @ 0xfffffe04 ; CHECK-NEXT: .long 4294966792 @ 0xfffffe08 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll index 78e3ced1ceb6c..87e29be4be070 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll @@ -737,18 +737,19 @@ define void @foo_ptr_p_int32_t(i32* %dest, i32** %src, i32 %n) { ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} +; CHECK-NEXT: .LBB22_1: @ %vector.body.preheader ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r2, lsr #2 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB22_1: @ %vector.body +; CHECK-NEXT: .LBB22_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vptt.i32 ne, q0, zr ; CHECK-NEXT: vldrwt.u32 q1, [q0] ; CHECK-NEXT: vstrwt.32 q1, [r0], #16 -; CHECK-NEXT: le lr, .LBB22_1 -; CHECK-NEXT: @ %bb.2: @ %for.end +; CHECK-NEXT: le lr, .LBB22_2 +; CHECK-NEXT: @ %bb.3: @ %for.end ; CHECK-NEXT: pop {r7, pc} entry: %and = and i32 %n, -16 @@ -782,18 +783,19 @@ define void @foo_ptr_p_float(float* %dest, float** %src, i32 %n) { ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} +; CHECK-NEXT: .LBB23_1: @ %vector.body.preheader ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r2, lsr #2 ; 
CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB23_1: @ %vector.body +; CHECK-NEXT: .LBB23_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vptt.i32 ne, q0, zr ; CHECK-NEXT: vldrwt.u32 q1, [q0] ; CHECK-NEXT: vstrwt.32 q1, [r0], #16 -; CHECK-NEXT: le lr, .LBB23_1 -; CHECK-NEXT: @ %bb.2: @ %for.end +; CHECK-NEXT: le lr, .LBB23_2 +; CHECK-NEXT: @ %bb.3: @ %for.end ; CHECK-NEXT: pop {r7, pc} entry: %and = and i32 %n, -16 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll index 89aaee779a7bf..535d7a1c38cb7 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll @@ -4,16 +4,15 @@ define dso_local void @mve_gather_qi_wb(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %n, i32 %m, i32 %l) { ; CHECK-LABEL: mve_gather_qi_wb: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: adr r4, .LCPI0_0 +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: add.w r12, r0, r3, lsl #2 -; CHECK-NEXT: vldrw.u32 q0, [r4] -; CHECK-NEXT: adds r0, r3, #1 +; CHECK-NEXT: adr r0, .LCPI0_0 +; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: vadd.i32 q0, q0, r1 ; CHECK-NEXT: adds r1, r3, #4 -; CHECK-NEXT: dlstp.32 lr, r0 ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r12], #16 @@ -25,7 +24,7 @@ define dso_local void @mve_gather_qi_wb(i32* noalias nocapture readonly %A, i32* ; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: str.w r0, [r2, r1, lsl #2] -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI0_0: @@ -74,18 +73,17 @@ define dso_local void 
@mve_gatherscatter_offset(i32* noalias nocapture readonly ; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: adr r4, .LCPI1_0 -; CHECK-NEXT: add.w r12, r0, r3, lsl #2 -; CHECK-NEXT: adds r0, r3, #1 -; CHECK-NEXT: vldrw.u32 q1, [r4] -; CHECK-NEXT: adds r3, #4 +; CHECK-NEXT: add.w r4, r0, r3, lsl #2 +; CHECK-NEXT: adr r0, .LCPI1_0 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: add.w r12, r3, #4 ; CHECK-NEXT: vmov.i32 q3, #0x0 ; CHECK-NEXT: vmov.i32 q0, #0x14 -; CHECK-NEXT: dlstp.32 lr, r0 +; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q2, [r1, q1, uxtw #2] -; CHECK-NEXT: vldrw.u32 q4, [r12], #16 +; CHECK-NEXT: vldrw.u32 q4, [r4], #16 ; CHECK-NEXT: vmul.i32 q2, q2, q4 ; CHECK-NEXT: vstrw.32 q2, [r1, q1, uxtw #2] ; CHECK-NEXT: vadd.i32 q1, q1, q0 @@ -94,7 +92,7 @@ define dso_local void @mve_gatherscatter_offset(i32* noalias nocapture readonly ; CHECK-NEXT: @ %bb.2: @ %middle.block ; CHECK-NEXT: vmov q0, q3 ; CHECK-NEXT: vaddv.u32 r0, q0 -; CHECK-NEXT: str.w r0, [r2, r3, lsl #2] +; CHECK-NEXT: str.w r0, [r2, r12, lsl #2] ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .p2align 4 @@ -141,17 +139,16 @@ end: ; preds = %middle.block define dso_local void @mve_scatter_qi(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %n, i32 %m, i32 %l) { ; CHECK-LABEL: mve_scatter_qi: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: adr r4, .LCPI2_0 +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: add.w r12, r0, r3, lsl #2 -; CHECK-NEXT: vldrw.u32 q0, [r4] -; CHECK-NEXT: adds r0, r3, #1 +; CHECK-NEXT: adr r0, .LCPI2_0 +; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vmov.i32 q3, #0x0 +; CHECK-NEXT: vmov.i32 q2, #0x3 ; CHECK-NEXT: vadd.i32 q0, q0, r1 ; CHECK-NEXT: adds r1, r3, #4 -; CHECK-NEXT: 
vmov.i32 q2, #0x3 -; CHECK-NEXT: dlstp.32 lr, r0 +; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r12], #16 @@ -163,7 +160,7 @@ define dso_local void @mve_scatter_qi(i32* noalias nocapture readonly %A, i32* n ; CHECK-NEXT: vmov q0, q3 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: str.w r0, [r2, r1, lsl #2] -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI2_0: diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-tailpred.ll b/llvm/test/CodeGen/Thumb2/mve-gather-tailpred.ll index cccd76a6512cc..6689504561cb1 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-tailpred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-tailpred.ll @@ -9,6 +9,7 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_simple(i32* noalias nocapture read ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r4, pc} +; CHECK-NEXT: .LBB0_1: @ %vector.ph.preheader ; CHECK-NEXT: bic r12, r2, #3 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: sub.w lr, r12, #4 @@ -16,26 +17,26 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_simple(i32* noalias nocapture read ; CHECK-NEXT: adr r3, .LCPI0_0 ; CHECK-NEXT: vldrw.u32 q0, [r3] ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: .LBB0_1: @ %vector.ph +; CHECK-NEXT: .LBB0_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 -; CHECK-NEXT: @ Child Loop BB0_2 Depth 2 +; CHECK-NEXT: @ Child Loop BB0_3 Depth 2 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: dls lr, r4 -; CHECK-NEXT: .LBB0_2: @ %vector.body -; CHECK-NEXT: @ Parent Loop BB0_1 Depth=1 +; CHECK-NEXT: .LBB0_3: @ %vector.body +; CHECK-NEXT: @ Parent Loop BB0_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vldrw.u32 q2, [q1, #16]! 
; CHECK-NEXT: vstrb.8 q2, [r0], #16 -; CHECK-NEXT: le lr, .LBB0_2 -; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: @ in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: le lr, .LBB0_3 +; CHECK-NEXT: @ %bb.4: @ %middle.block +; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: cmp r12, r2 -; CHECK-NEXT: bne .LBB0_1 -; CHECK-NEXT: @ %bb.4: @ %for.cond.cleanup +; CHECK-NEXT: bne .LBB0_2 +; CHECK-NEXT: @ %bb.5: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.5: +; CHECK-NEXT: @ %bb.6: ; CHECK-NEXT: .LCPI0_0: ; CHECK-NEXT: .long 4294967280 @ 0xfffffff0 ; CHECK-NEXT: .long 4294967284 @ 0xfffffff4 diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll b/llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll index 4c665bdbfde1a..12561d560309a 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll @@ -9,6 +9,7 @@ define arm_aapcs_vfpcc void @thres_i32(i32* %data, i16 zeroext %N, i32 %T) { ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: .LBB0_1: @ %vector.ph ; CHECK-NEXT: mvn r3, #3 ; CHECK-NEXT: add.w r1, r3, r1, lsl #2 ; CHECK-NEXT: movs r3, #1 @@ -16,14 +17,14 @@ define arm_aapcs_vfpcc void @thres_i32(i32* %data, i16 zeroext %N, i32 %T) { ; CHECK-NEXT: add.w lr, r3, r1, lsr #2 ; CHECK-NEXT: rsbs r1, r2, #0 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vpte.s32 ge, q1, r2 ; CHECK-NEXT: vcmpt.s32 le, q1, r1 ; CHECK-NEXT: vstrwe.32 q0, [r0], #16 -; CHECK-NEXT: le lr, .LBB0_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: le lr, .LBB0_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %conv = zext i16 %N to i32 @@ -65,6 +66,7 @@ define arm_aapcs_vfpcc void @thresh_i16(i16* %data, i16 zeroext %N, i16 signext ; CHECK-NEXT: cmp r1, #0 ; 
CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: .LBB1_1: @ %vector.ph ; CHECK-NEXT: mvn r3, #7 ; CHECK-NEXT: add.w r1, r3, r1, lsl #3 ; CHECK-NEXT: movs r3, #1 @@ -72,14 +74,14 @@ define arm_aapcs_vfpcc void @thresh_i16(i16* %data, i16 zeroext %N, i16 signext ; CHECK-NEXT: add.w lr, r3, r1, lsr #3 ; CHECK-NEXT: rsbs r1, r2, #0 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q1, [r0] ; CHECK-NEXT: vpte.s16 ge, q1, r2 ; CHECK-NEXT: vcmpt.s16 le, q1, r1 ; CHECK-NEXT: vstrhe.16 q0, [r0], #16 -; CHECK-NEXT: le lr, .LBB1_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: le lr, .LBB1_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %conv2 = zext i16 %N to i32 @@ -121,6 +123,7 @@ define arm_aapcs_vfpcc void @thresh_i8(i8* %data, i16 zeroext %N, i8 signext %T) ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: .LBB2_1: @ %vector.ph ; CHECK-NEXT: mvn r3, #15 ; CHECK-NEXT: add.w r1, r3, r1, lsl #4 ; CHECK-NEXT: movs r3, #1 @@ -128,14 +131,14 @@ define arm_aapcs_vfpcc void @thresh_i8(i8* %data, i16 zeroext %N, i8 signext %T) ; CHECK-NEXT: add.w lr, r3, r1, lsr #4 ; CHECK-NEXT: rsbs r1, r2, #0 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB2_1: @ %vector.body +; CHECK-NEXT: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrb.u8 q1, [r0] ; CHECK-NEXT: vpte.s8 ge, q1, r2 ; CHECK-NEXT: vcmpt.s8 le, q1, r1 ; CHECK-NEXT: vstrbe.8 q0, [r0], #16 -; CHECK-NEXT: le lr, .LBB2_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: le lr, .LBB2_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %conv2 = zext i16 %N to i32 @@ -177,6 +180,7 @@ define arm_aapcs_vfpcc void @thresh_f32(float* %data, i16 zeroext %N, float %T) ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; 
CHECK-NEXT: .LBB3_1: @ %vector.ph ; CHECK-NEXT: mvn r2, #3 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: movs r2, #1 @@ -185,14 +189,14 @@ define arm_aapcs_vfpcc void @thresh_f32(float* %data, i16 zeroext %N, float %T) ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: eor r2, r1, #-2147483648 -; CHECK-NEXT: .LBB3_1: @ %vector.body +; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vpte.f32 ge, q1, r1 ; CHECK-NEXT: vcmpt.f32 le, q1, r2 ; CHECK-NEXT: vstrwe.32 q0, [r0], #16 -; CHECK-NEXT: le lr, .LBB3_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: le lr, .LBB3_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %conv = zext i16 %N to i32 @@ -234,6 +238,7 @@ define arm_aapcs_vfpcc void @thresh_f16(half* %data, i16 zeroext %N, float %T.co ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: .LBB4_1: @ %vector.ph ; CHECK-NEXT: mvn r3, #7 ; CHECK-NEXT: add.w r1, r3, r1, lsl #3 ; CHECK-NEXT: vmov r2, s0 @@ -243,14 +248,14 @@ define arm_aapcs_vfpcc void @thresh_f16(half* %data, i16 zeroext %N, float %T.co ; CHECK-NEXT: vmov.f16 r1, s0 ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB4_1: @ %vector.body +; CHECK-NEXT: .LBB4_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q1, [r0] ; CHECK-NEXT: vpte.f16 ge, q1, r2 ; CHECK-NEXT: vcmpt.f16 le, q1, r1 ; CHECK-NEXT: vstrhe.16 q0, [r0], #16 -; CHECK-NEXT: le lr, .LBB4_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: le lr, .LBB4_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %0 = bitcast float %T.coerce to i32 @@ -297,6 +302,7 @@ define arm_aapcs_vfpcc void @thres_rev_i32(i32* %data, i16 zeroext %N, i32 %T) { ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: .LBB5_1: @ %vector.ph ; 
CHECK-NEXT: mvn r3, #3 ; CHECK-NEXT: add.w r1, r3, r1, lsl #2 ; CHECK-NEXT: movs r3, #1 @@ -304,14 +310,14 @@ define arm_aapcs_vfpcc void @thres_rev_i32(i32* %data, i16 zeroext %N, i32 %T) { ; CHECK-NEXT: add.w lr, r3, r1, lsr #2 ; CHECK-NEXT: rsbs r1, r2, #0 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB5_1: @ %vector.body +; CHECK-NEXT: .LBB5_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vpte.s32 ge, q1, r2 ; CHECK-NEXT: vcmpt.s32 le, q1, r1 ; CHECK-NEXT: vstrwe.32 q0, [r0], #16 -; CHECK-NEXT: le lr, .LBB5_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: le lr, .LBB5_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %conv = zext i16 %N to i32 @@ -353,6 +359,7 @@ define arm_aapcs_vfpcc void @thresh_rev_i16(i16* %data, i16 zeroext %N, i16 sign ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: .LBB6_1: @ %vector.ph ; CHECK-NEXT: mvn r3, #7 ; CHECK-NEXT: add.w r1, r3, r1, lsl #3 ; CHECK-NEXT: movs r3, #1 @@ -360,14 +367,14 @@ define arm_aapcs_vfpcc void @thresh_rev_i16(i16* %data, i16 zeroext %N, i16 sign ; CHECK-NEXT: add.w lr, r3, r1, lsr #3 ; CHECK-NEXT: rsbs r1, r2, #0 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB6_1: @ %vector.body +; CHECK-NEXT: .LBB6_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q1, [r0] ; CHECK-NEXT: vpte.s16 ge, q1, r2 ; CHECK-NEXT: vcmpt.s16 le, q1, r1 ; CHECK-NEXT: vstrhe.16 q0, [r0], #16 -; CHECK-NEXT: le lr, .LBB6_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: le lr, .LBB6_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %conv2 = zext i16 %N to i32 @@ -409,6 +416,7 @@ define arm_aapcs_vfpcc void @thresh_rev_i8(i8* %data, i16 zeroext %N, i8 signext ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: .LBB7_1: @ %vector.ph ; CHECK-NEXT: mvn r3, #15 ; CHECK-NEXT: add.w 
r1, r3, r1, lsl #4 ; CHECK-NEXT: movs r3, #1 @@ -416,14 +424,14 @@ define arm_aapcs_vfpcc void @thresh_rev_i8(i8* %data, i16 zeroext %N, i8 signext ; CHECK-NEXT: add.w lr, r3, r1, lsr #4 ; CHECK-NEXT: rsbs r1, r2, #0 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB7_1: @ %vector.body +; CHECK-NEXT: .LBB7_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrb.u8 q1, [r0] ; CHECK-NEXT: vpte.s8 ge, q1, r2 ; CHECK-NEXT: vcmpt.s8 le, q1, r1 ; CHECK-NEXT: vstrbe.8 q0, [r0], #16 -; CHECK-NEXT: le lr, .LBB7_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: le lr, .LBB7_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %conv2 = zext i16 %N to i32 @@ -465,6 +473,7 @@ define arm_aapcs_vfpcc void @thresh_rev_f32(float* %data, i16 zeroext %N, float ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: .LBB8_1: @ %vector.ph ; CHECK-NEXT: mvn r2, #3 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: movs r2, #1 @@ -473,14 +482,14 @@ define arm_aapcs_vfpcc void @thresh_rev_f32(float* %data, i16 zeroext %N, float ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: eor r2, r1, #-2147483648 -; CHECK-NEXT: .LBB8_1: @ %vector.body +; CHECK-NEXT: .LBB8_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vpte.f32 ge, q1, r1 ; CHECK-NEXT: vcmpt.f32 le, q1, r2 ; CHECK-NEXT: vstrwe.32 q0, [r0], #16 -; CHECK-NEXT: le lr, .LBB8_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: le lr, .LBB8_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %conv = zext i16 %N to i32 @@ -522,6 +531,7 @@ define arm_aapcs_vfpcc void @thresh_rev_f16(half* %data, i16 zeroext %N, float % ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: .LBB9_1: @ %vector.ph ; CHECK-NEXT: mvn r3, #7 ; CHECK-NEXT: add.w r1, r3, r1, lsl #3 ; CHECK-NEXT: vmov r2, s0 @@ 
-531,14 +541,14 @@ define arm_aapcs_vfpcc void @thresh_rev_f16(half* %data, i16 zeroext %N, float % ; CHECK-NEXT: vmov.f16 r1, s0 ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB9_1: @ %vector.body +; CHECK-NEXT: .LBB9_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q1, [r0] ; CHECK-NEXT: vpte.f16 ge, q1, r2 ; CHECK-NEXT: vcmpt.f16 le, q1, r1 ; CHECK-NEXT: vstrhe.16 q0, [r0], #16 -; CHECK-NEXT: le lr, .LBB9_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: le lr, .LBB9_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %0 = bitcast float %T.coerce to i32 diff --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll index 9897b607d6b3a..2ea70f1b06de2 100644 --- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll @@ -1537,6 +1537,7 @@ define arm_aapcs_vfpcc void @ssatmul_s4t_q15(i16* nocapture readonly %pSrcA, i16 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, pc} +; CHECK-NEXT: .LBB8_1: @ %vector.ph ; CHECK-NEXT: add.w r12, r3, #3 ; CHECK-NEXT: adr r4, .LCPI8_0 ; CHECK-NEXT: bic r12, r12, #3 @@ -1548,7 +1549,7 @@ define arm_aapcs_vfpcc void @ssatmul_s4t_q15(i16* nocapture readonly %pSrcA, i16 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vdup.32 q1, r12 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB8_1: @ %vector.body +; CHECK-NEXT: .LBB8_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vdup.32 q2, r3 ; CHECK-NEXT: adds r3, #4 @@ -1561,11 +1562,11 @@ define arm_aapcs_vfpcc void @ssatmul_s4t_q15(i16* nocapture readonly %pSrcA, i16 ; CHECK-NEXT: vmovlb.s16 q2, q2 ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrht.32 q2, [r2], #8 -; CHECK-NEXT: le lr, .LBB8_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: le lr, .LBB8_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .p2align 4 -; 
CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: @ %bb.4: ; CHECK-NEXT: .LCPI8_0: ; CHECK-NEXT: .long 0 @ 0x0 ; CHECK-NEXT: .long 1 @ 0x1 diff --git a/llvm/test/CodeGen/Thumb2/mve-selectcc.ll b/llvm/test/CodeGen/Thumb2/mve-selectcc.ll new file mode 100644 index 0000000000000..ffba80c4895bb --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-selectcc.ll @@ -0,0 +1,205 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK + +define arm_aapcs_vfpcc <4 x i32> @test_v4i32(i32 %x, <4 x i32> %s0, <4 x i32> %s1) { +; CHECK-LABEL: test_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it eq +; CHECK-NEXT: bxeq lr +; CHECK-NEXT: .LBB0_1: @ %select.false +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq i32 %x, 0 + %s = select i1 %c, <4 x i32> %s0, <4 x i32> %s1 + ret <4 x i32> %s +} + +define arm_aapcs_vfpcc <8 x i16> @test_v8i16(i32 %x, <8 x i16> %s0, <8 x i16> %s1) { +; CHECK-LABEL: test_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it eq +; CHECK-NEXT: bxeq lr +; CHECK-NEXT: .LBB1_1: @ %select.false +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq i32 %x, 0 + %s = select i1 %c, <8 x i16> %s0, <8 x i16> %s1 + ret <8 x i16> %s +} + +define arm_aapcs_vfpcc <16 x i8> @test_v16i8(i32 %x, <16 x i8> %s0, <16 x i8> %s1) { +; CHECK-LABEL: test_v16i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it eq +; CHECK-NEXT: bxeq lr +; CHECK-NEXT: .LBB2_1: @ %select.false +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq i32 %x, 0 + %s = select i1 %c, <16 x i8> %s0, <16 x i8> %s1 + ret <16 x i8> %s +} + +define arm_aapcs_vfpcc <2 x i64> @test_v2i64(i32 %x, <2 x i64> %s0, <2 x i64> %s1) { +; CHECK-LABEL: test_v2i64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it eq +; CHECK-NEXT: bxeq 
lr +; CHECK-NEXT: .LBB3_1: @ %select.false +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq i32 %x, 0 + %s = select i1 %c, <2 x i64> %s0, <2 x i64> %s1 + ret <2 x i64> %s +} + +define arm_aapcs_vfpcc <4 x float> @test_v4float(i32 %x, <4 x float> %s0, <4 x float> %s1) { +; CHECK-LABEL: test_v4float: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it eq +; CHECK-NEXT: bxeq lr +; CHECK-NEXT: .LBB4_1: @ %select.false +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq i32 %x, 0 + %s = select i1 %c, <4 x float> %s0, <4 x float> %s1 + ret <4 x float> %s +} + +define arm_aapcs_vfpcc <8 x half> @test_v8half(i32 %x, <8 x half> %s0, <8 x half> %s1) { +; CHECK-LABEL: test_v8half: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it eq +; CHECK-NEXT: bxeq lr +; CHECK-NEXT: .LBB5_1: @ %select.false +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq i32 %x, 0 + %s = select i1 %c, <8 x half> %s0, <8 x half> %s1 + ret <8 x half> %s +} + +define arm_aapcs_vfpcc <2 x double> @test_v2double(i32 %x, <2 x double> %s0, <2 x double> %s1) { +; CHECK-LABEL: test_v2double: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it eq +; CHECK-NEXT: bxeq lr +; CHECK-NEXT: .LBB6_1: @ %select.false +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq i32 %x, 0 + %s = select i1 %c, <2 x double> %s0, <2 x double> %s1 + ret <2 x double> %s +} + +define arm_aapcs_vfpcc <4 x i32> @minsize_v4i32(i32 %x, <4 x i32> %s0, <4 x i32> %s1) minsize { +; CHECK-LABEL: minsize_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cbz r0, .LBB7_2 +; CHECK-NEXT: @ %bb.1: @ %select.false +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: .LBB7_2: @ %select.end +; CHECK-NEXT: bx lr +entry: + %c = icmp eq i32 %x, 0 + %s = select i1 %c, <4 x i32> %s0, <4 x i32> %s1 + ret <4 x i32> %s +} + +define arm_aapcs_vfpcc <8 x i16> @minsize_v8i16(i32 %x, <8 x i16> %s0, <8 x i16> %s1) minsize { +; CHECK-LABEL: 
minsize_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cbz r0, .LBB8_2 +; CHECK-NEXT: @ %bb.1: @ %select.false +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: .LBB8_2: @ %select.end +; CHECK-NEXT: bx lr +entry: + %c = icmp eq i32 %x, 0 + %s = select i1 %c, <8 x i16> %s0, <8 x i16> %s1 + ret <8 x i16> %s +} + +define arm_aapcs_vfpcc <16 x i8> @minsize_v16i8(i32 %x, <16 x i8> %s0, <16 x i8> %s1) minsize { +; CHECK-LABEL: minsize_v16i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cbz r0, .LBB9_2 +; CHECK-NEXT: @ %bb.1: @ %select.false +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: .LBB9_2: @ %select.end +; CHECK-NEXT: bx lr +entry: + %c = icmp eq i32 %x, 0 + %s = select i1 %c, <16 x i8> %s0, <16 x i8> %s1 + ret <16 x i8> %s +} + +define arm_aapcs_vfpcc <2 x i64> @minsize_v2i64(i32 %x, <2 x i64> %s0, <2 x i64> %s1) minsize { +; CHECK-LABEL: minsize_v2i64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cbz r0, .LBB10_2 +; CHECK-NEXT: @ %bb.1: @ %select.false +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: .LBB10_2: @ %select.end +; CHECK-NEXT: bx lr +entry: + %c = icmp eq i32 %x, 0 + %s = select i1 %c, <2 x i64> %s0, <2 x i64> %s1 + ret <2 x i64> %s +} + +define arm_aapcs_vfpcc <4 x float> @minsize_v4float(i32 %x, <4 x float> %s0, <4 x float> %s1) minsize { +; CHECK-LABEL: minsize_v4float: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cbz r0, .LBB11_2 +; CHECK-NEXT: @ %bb.1: @ %select.false +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: .LBB11_2: @ %select.end +; CHECK-NEXT: bx lr +entry: + %c = icmp eq i32 %x, 0 + %s = select i1 %c, <4 x float> %s0, <4 x float> %s1 + ret <4 x float> %s +} + +define arm_aapcs_vfpcc <8 x half> @minsize_v8half(i32 %x, <8 x half> %s0, <8 x half> %s1) minsize { +; CHECK-LABEL: minsize_v8half: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cbz r0, .LBB12_2 +; CHECK-NEXT: @ %bb.1: @ %select.false +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: .LBB12_2: @ %select.end +; CHECK-NEXT: bx lr +entry: + %c = icmp eq i32 %x, 0 + %s = select i1 %c, <8 x half> %s0, <8 x half> %s1 + ret 
<8 x half> %s +} + +define arm_aapcs_vfpcc <2 x double> @minsize_v2double(i32 %x, <2 x double> %s0, <2 x double> %s1) minsize { +; CHECK-LABEL: minsize_v2double: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cbz r0, .LBB13_2 +; CHECK-NEXT: @ %bb.1: @ %select.false +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: .LBB13_2: @ %select.end +; CHECK-NEXT: bx lr +entry: + %c = icmp eq i32 %x, 0 + %s = select i1 %c, <2 x double> %s0, <2 x double> %s1 + ret <2 x double> %s +} diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll index 539f760642c26..64a76f38920a7 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll @@ -1730,7 +1730,7 @@ vector.ph: ; preds = %entry vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi i32 [ 0, %vector.ph ], [ %4, %vector.body ] - %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) %0 = getelementptr inbounds i32, i32* %x, i32 %index %1 = bitcast i32* %0 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) @@ -1781,7 +1781,7 @@ vector.ph: ; preds = %entry vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi i32 [ 0, %vector.ph ], [ %7, %vector.body ] - %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) %0 = getelementptr inbounds i32, i32* %x, i32 %index %1 = bitcast i32* %0 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, 
<4 x i32> undef) @@ -1835,7 +1835,7 @@ vector.ph: ; preds = %entry vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi i32 [ 0, %vector.ph ], [ %5, %vector.body ] - %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n) %0 = getelementptr inbounds i16, i16* %x, i32 %index %1 = bitcast i16* %0 to <8 x i16>* %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef) @@ -1887,7 +1887,7 @@ vector.ph: ; preds = %entry vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi i32 [ 0, %vector.ph ], [ %9, %vector.body ] - %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n) %0 = getelementptr inbounds i16, i16* %x, i32 %index %1 = bitcast i16* %0 to <8 x i16>* %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef) @@ -1943,7 +1943,7 @@ vector.ph: ; preds = %entry vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi i32 [ 0, %vector.ph ], [ %5, %vector.body ] - %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n) %0 = getelementptr inbounds i8, i8* %x, i32 %index %1 = bitcast i8* %0 to <16 x i8>* %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef) @@ -1995,7 +1995,7 @@ vector.ph: ; preds = 
%entry vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi i32 [ 0, %vector.ph ], [ %9, %vector.body ] - %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n) %0 = getelementptr inbounds i8, i8* %x, i32 %index %1 = bitcast i8* %0 to <16 x i8>* %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef) @@ -2051,7 +2051,7 @@ vector.ph: ; preds = %entry vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi i16 [ 0, %vector.ph ], [ %4, %vector.body ] - %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n) %0 = getelementptr inbounds i16, i16* %x, i32 %index %1 = bitcast i16* %0 to <8 x i16>* %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef) @@ -2102,7 +2102,7 @@ vector.ph: ; preds = %entry vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi i16 [ 0, %vector.ph ], [ %7, %vector.body ] - %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n) %0 = getelementptr inbounds i16, i16* %x, i32 %index %1 = bitcast i16* %0 to <8 x i16>* %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef) @@ -2156,7 +2156,7 @@ vector.ph: ; preds = %entry vector.body: ; preds = %vector.body, %vector.ph 
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi i16 [ 0, %vector.ph ], [ %5, %vector.body ] - %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n) %0 = getelementptr inbounds i8, i8* %x, i32 %index %1 = bitcast i8* %0 to <16 x i8>* %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef) @@ -2208,7 +2208,7 @@ vector.ph: ; preds = %entry vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi i16 [ 0, %vector.ph ], [ %9, %vector.body ] - %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n) %0 = getelementptr inbounds i8, i8* %x, i32 %index %1 = bitcast i8* %0 to <16 x i8>* %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef) @@ -2264,7 +2264,7 @@ vector.ph: ; preds = %entry vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi i8 [ 0, %vector.ph ], [ %4, %vector.body ] - %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n) %0 = getelementptr inbounds i8, i8* %x, i32 %index %1 = bitcast i8* %0 to <16 x i8>* %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef) @@ -2315,7 +2315,7 @@ vector.ph: ; preds = %entry vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, 
%vector.body ] %vec.phi = phi i8 [ 0, %vector.ph ], [ %7, %vector.body ] - %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n) %0 = getelementptr inbounds i8, i8* %x, i32 %index %1 = bitcast i8* %0 to <16 x i8>* %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef) @@ -2371,7 +2371,7 @@ vector.ph: ; preds = %entry vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi i64 [ 0, %vector.ph ], [ %5, %vector.body ] - %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) %0 = getelementptr inbounds i32, i32* %x, i32 %index %1 = bitcast i32* %0 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) @@ -2425,7 +2425,7 @@ vector.ph: ; preds = %entry vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi i64 [ 0, %vector.ph ], [ %9, %vector.body ] - %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) %0 = getelementptr inbounds i32, i32* %x, i32 %index %1 = bitcast i32* %0 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) @@ -2484,7 +2484,7 @@ vector.ph: ; preds = %entry vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi i64 [ 0, %vector.ph ], [ %9, 
%vector.body ] - %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n) %0 = getelementptr inbounds i16, i16* %x, i32 %index %1 = bitcast i16* %0 to <8 x i16>* %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef) diff --git a/llvm/test/CodeGen/WebAssembly/fshl.ll b/llvm/test/CodeGen/WebAssembly/fshl.ll new file mode 100644 index 0000000000000..fb14826ff9a46 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/fshl.ll @@ -0,0 +1,22 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s | FileCheck %s + +target triple = "wasm32-unknown-unknown" + +; From https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=25150 +define i33 @fshr_multi_use(i33 %a) { +; CHECK-LABEL: fshr_multi_use: +; CHECK: .functype fshr_multi_use (i64) -> (i64) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i64.const 1 +; CHECK-NEXT: i64.shr_u +; CHECK-NEXT: i64.const 31 +; CHECK-NEXT: i64.and +; CHECK-NEXT: # fallthrough-return + %b = tail call i33 @llvm.fshr.i33(i33 %a, i33 %a, i33 1) + %e = and i33 %b, 31 + ret i33 %e +} + +declare i33 @llvm.fshr.i33(i33, i33, i33) diff --git a/llvm/test/CodeGen/X86/2007-11-06-InstrSched.ll b/llvm/test/CodeGen/X86/2007-11-06-InstrSched.ll index 91df29681373d..d073aad4a5c35 100644 --- a/llvm/test/CodeGen/X86/2007-11-06-InstrSched.ll +++ b/llvm/test/CodeGen/X86/2007-11-06-InstrSched.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-- -mcpu=generic -mattr=+sse2 | FileCheck %s +; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 | FileCheck %s define float @foo(i32* %x, float* %y, i32 %c) nounwind { ; CHECK-LABEL: foo: diff --git a/llvm/test/CodeGen/X86/GlobalISel/lit.local.cfg 
b/llvm/test/CodeGen/X86/GlobalISel/lit.local.cfg deleted file mode 100644 index e99d1bb8446ce..0000000000000 --- a/llvm/test/CodeGen/X86/GlobalISel/lit.local.cfg +++ /dev/null @@ -1,2 +0,0 @@ -if not 'global-isel' in config.root.available_features: - config.unsupported = True diff --git a/llvm/test/CodeGen/X86/GlobalISel/x86_64-irtranslator-struct-return.ll b/llvm/test/CodeGen/X86/GlobalISel/x86_64-irtranslator-struct-return.ll index 1d5e55c282c5c..b33daf570d456 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/x86_64-irtranslator-struct-return.ll +++ b/llvm/test/CodeGen/X86/GlobalISel/x86_64-irtranslator-struct-return.ll @@ -19,7 +19,7 @@ define float @test_return_f1(float %f.coerce) { ; ALL: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0.retval ; ALL: [[FRAME_INDEX1:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.1.f ; ALL: G_STORE [[TRUNC]](s32), [[FRAME_INDEX1]](p0) :: (store 4 into %ir.coerce.dive2) - ; ALL: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) + ; ALL: G_MEMCPY [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) ; ALL: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p0) :: (dereferenceable load 4 from %ir.coerce.dive13) ; ALL: [[ANYEXT:%[0-9]+]]:_(s128) = G_ANYEXT [[LOAD]](s32) ; ALL: $xmm0 = COPY [[ANYEXT]](s128) @@ -49,7 +49,7 @@ define double @test_return_d1(double %d.coerce) { ; ALL: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0.retval ; ALL: [[FRAME_INDEX1:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.1.d ; ALL: G_STORE [[TRUNC]](s64), [[FRAME_INDEX1]](p0) :: (store 8 into %ir.coerce.dive2) - ; ALL: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store 1 into %ir.0, align 8), (load 1 from %ir.1, align 8) + ; ALL: G_MEMCPY [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store 1 
into %ir.0, align 8), (load 1 from %ir.1, align 8) ; ALL: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[FRAME_INDEX]](p0) :: (dereferenceable load 8 from %ir.coerce.dive13) ; ALL: [[ANYEXT:%[0-9]+]]:_(s128) = G_ANYEXT [[LOAD]](s64) ; ALL: $xmm0 = COPY [[ANYEXT]](s128) @@ -82,7 +82,7 @@ define { double, double } @test_return_d2(double %d.coerce0, double %d.coerce1) ; ALL: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; ALL: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX1]], [[C1]](s64) ; ALL: G_STORE [[TRUNC1]](s64), [[PTR_ADD]](p0) :: (store 8 into %ir.2) - ; ALL: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store 1 into %ir.3, align 8), (load 1 from %ir.4, align 8) + ; ALL: G_MEMCPY [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store 1 into %ir.3, align 8), (load 1 from %ir.4, align 8) ; ALL: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[FRAME_INDEX]](p0) :: (dereferenceable load 8 from %ir.5) ; ALL: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C1]](s64) ; ALL: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p0) :: (dereferenceable load 8 from %ir.5 + 8) @@ -116,7 +116,7 @@ define i32 @test_return_i1(i32 %i.coerce) { ; ALL: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0.retval ; ALL: [[FRAME_INDEX1:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.1.i ; ALL: G_STORE [[COPY]](s32), [[FRAME_INDEX1]](p0) :: (store 4 into %ir.coerce.dive2) - ; ALL: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) + ; ALL: G_MEMCPY [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) ; ALL: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p0) :: (dereferenceable load 4 from %ir.coerce.dive13) ; ALL: $eax = COPY [[LOAD]](s32) ; ALL: RET 0, implicit $eax @@ -142,7 +142,7 @@ define i64 @test_return_i2(i64 %i.coerce) { ; ALL: 
[[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0.retval ; ALL: [[FRAME_INDEX1:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.1.i ; ALL: G_STORE [[COPY]](s64), [[FRAME_INDEX1]](p0) :: (store 8 into %ir.0, align 4) - ; ALL: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store 1 into %ir.1, align 4), (load 1 from %ir.2, align 4) + ; ALL: G_MEMCPY [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store 1 into %ir.1, align 4), (load 1 from %ir.2, align 4) ; ALL: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[FRAME_INDEX]](p0) :: (dereferenceable load 8 from %ir.3, align 4) ; ALL: $rax = COPY [[LOAD]](s64) ; ALL: RET 0, implicit $rax @@ -174,9 +174,9 @@ define { i64, i32 } @test_return_i3(i64 %i.coerce0, i32 %i.coerce1) { ; ALL: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; ALL: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX2]], [[C1]](s64) ; ALL: G_STORE [[COPY1]](s32), [[PTR_ADD]](p0) :: (store 4 into %ir.1) - ; ALL: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), [[FRAME_INDEX1]](p0), [[FRAME_INDEX2]](p0), [[C]](s64), 0 :: (store 1 into %ir.2, align 4), (load 1 from %ir.3, align 4) - ; ALL: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store 1 into %ir.4, align 4), (load 1 from %ir.5, align 4) - ; ALL: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), [[FRAME_INDEX3]](p0), [[FRAME_INDEX]](p0), [[C]](s64), 0 :: (store 1 into %ir.6, align 8), (load 1 from %ir.7, align 4) + ; ALL: G_MEMCPY [[FRAME_INDEX1]](p0), [[FRAME_INDEX2]](p0), [[C]](s64), 0 :: (store 1 into %ir.2, align 4), (load 1 from %ir.3, align 4) + ; ALL: G_MEMCPY [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store 1 into %ir.4, align 4), (load 1 from %ir.5, align 4) + ; ALL: G_MEMCPY [[FRAME_INDEX3]](p0), [[FRAME_INDEX]](p0), [[C]](s64), 0 :: (store 1 into %ir.6, align 8), (load 1 from %ir.7, align 4) ; ALL: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD 
[[FRAME_INDEX3]](p0) :: (dereferenceable load 8 from %ir.tmp) ; ALL: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX3]], [[C1]](s64) ; ALL: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (dereferenceable load 4 from %ir.tmp + 8, align 8) @@ -218,7 +218,7 @@ define { i64, i64 } @test_return_i4(i64 %i.coerce0, i64 %i.coerce1) { ; ALL: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; ALL: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX1]], [[C1]](s64) ; ALL: G_STORE [[COPY1]](s64), [[PTR_ADD]](p0) :: (store 8 into %ir.2, align 4) - ; ALL: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store 1 into %ir.3, align 4), (load 1 from %ir.4, align 4) + ; ALL: G_MEMCPY [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store 1 into %ir.3, align 4), (load 1 from %ir.4, align 4) ; ALL: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[FRAME_INDEX]](p0) :: (dereferenceable load 8 from %ir.5, align 4) ; ALL: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C1]](s64) ; ALL: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p0) :: (dereferenceable load 8 from %ir.5 + 8, align 4) diff --git a/llvm/test/CodeGen/X86/abi-isel.ll b/llvm/test/CodeGen/X86/abi-isel.ll index 967aac4bc9b59..f2ea051c5f628 100644 --- a/llvm/test/CodeGen/X86/abi-isel.ll +++ b/llvm/test/CodeGen/X86/abi-isel.ll @@ -1,18 +1,18 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; NOTE: Use the --no_x86_scrub_rip additional argument to keep the rip address math. 
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-unknown-linux-gnu -relocation-model=static -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=LINUX-64-STATIC -; RUN: llc < %s -mcpu=generic -mtriple=i686-unknown-linux-gnu -relocation-model=static -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=LINUX-32-STATIC -; RUN: llc < %s -mcpu=generic -mtriple=i686-unknown-linux-gnu -relocation-model=pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=LINUX-32-PIC -; RUN: llc < %s -mcpu=generic -mtriple=x86_64-unknown-linux-gnu -relocation-model=pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=LINUX-64-PIC - -; RUN: llc < %s -mcpu=generic -mtriple=i686-apple-darwin -relocation-model=static -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-32-STATIC -; RUN: llc < %s -mcpu=generic -mtriple=i686-apple-darwin9 -relocation-model=dynamic-no-pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-32-DYNAMIC -; RUN: llc < %s -mcpu=generic -mtriple=i686-apple-darwin9 -relocation-model=pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-32-PIC - -; RUN: llc < %s -mcpu=generic -mtriple=x86_64-apple-darwin -relocation-model=static -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-64-STATIC -; RUN: llc < %s -mcpu=generic -mtriple=x86_64-apple-darwin -relocation-model=dynamic-no-pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-64-DYNAMIC -; RUN: llc < %s -mcpu=generic -mtriple=x86_64-apple-darwin -relocation-model=pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-64-PIC +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -relocation-model=static -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=LINUX-64-STATIC +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -relocation-model=static -code-model=small 
-pre-RA-sched=list-ilp | FileCheck %s -check-prefix=LINUX-32-STATIC +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -relocation-model=pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=LINUX-32-PIC +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -relocation-model=pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=LINUX-64-PIC + +; RUN: llc < %s -mtriple=i686-apple-darwin -relocation-model=static -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-32-STATIC +; RUN: llc < %s -mtriple=i686-apple-darwin9 -relocation-model=dynamic-no-pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-32-DYNAMIC +; RUN: llc < %s -mtriple=i686-apple-darwin9 -relocation-model=pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-32-PIC + +; RUN: llc < %s -mtriple=x86_64-apple-darwin -relocation-model=static -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-64-STATIC +; RUN: llc < %s -mtriple=x86_64-apple-darwin -relocation-model=dynamic-no-pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-64-DYNAMIC +; RUN: llc < %s -mtriple=x86_64-apple-darwin -relocation-model=pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-64-PIC @src = external global [131072 x i32] @dst = external global [131072 x i32] diff --git a/llvm/test/CodeGen/X86/add.ll b/llvm/test/CodeGen/X86/add.ll index 9de5645158733..7681c7c8d8a75 100644 --- a/llvm/test/CodeGen/X86/add.ll +++ b/llvm/test/CodeGen/X86/add.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mcpu=generic -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X32 -; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux | FileCheck %s --check-prefixes=X64,X64-LINUX -; RUN: llc < %s -mcpu=generic -mtriple=x86_64-win32 | FileCheck %s --check-prefixes=X64,X64-WIN32 +; RUN: llc < %s 
-mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X32 +; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefixes=X64,X64-LINUX +; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s --check-prefixes=X64,X64-WIN32 declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32) declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32) diff --git a/llvm/test/CodeGen/X86/avx512-ext.ll b/llvm/test/CodeGen/X86/avx512-ext.ll index 5ce111806a2cc..fc1ba049c6912 100644 --- a/llvm/test/CodeGen/X86/avx512-ext.ll +++ b/llvm/test/CodeGen/X86/avx512-ext.ll @@ -1766,39 +1766,39 @@ define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind { define i16 @trunc_i32_to_i1(i32 %a) { ; KNL-LABEL: trunc_i32_to_i1: ; KNL: # %bb.0: -; KNL-NEXT: movw $-4, %ax -; KNL-NEXT: kmovw %eax, %k0 -; KNL-NEXT: kshiftrw $1, %k0, %k0 -; KNL-NEXT: kshiftlw $1, %k0, %k0 ; KNL-NEXT: andl $1, %edi -; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kmovw %edi, %k0 +; KNL-NEXT: movw $-4, %ax +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftrw $1, %k1, %k1 +; KNL-NEXT: kshiftlw $1, %k1, %k1 +; KNL-NEXT: korw %k0, %k1, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: # kill: def $ax killed $ax killed $eax ; KNL-NEXT: retq ; ; SKX-LABEL: trunc_i32_to_i1: ; SKX: # %bb.0: -; SKX-NEXT: movw $-4, %ax -; SKX-NEXT: kmovd %eax, %k0 -; SKX-NEXT: kshiftrw $1, %k0, %k0 -; SKX-NEXT: kshiftlw $1, %k0, %k0 ; SKX-NEXT: andl $1, %edi -; SKX-NEXT: kmovw %edi, %k1 -; SKX-NEXT: korw %k1, %k0, %k0 +; SKX-NEXT: kmovw %edi, %k0 +; SKX-NEXT: movw $-4, %ax +; SKX-NEXT: kmovd %eax, %k1 +; SKX-NEXT: kshiftrw $1, %k1, %k1 +; SKX-NEXT: kshiftlw $1, %k1, %k1 +; SKX-NEXT: korw %k0, %k1, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: # kill: def $ax killed $ax killed $eax ; SKX-NEXT: retq ; ; AVX512DQNOBW-LABEL: trunc_i32_to_i1: ; AVX512DQNOBW: # %bb.0: -; AVX512DQNOBW-NEXT: movw $-4, %ax -; AVX512DQNOBW-NEXT: kmovw %eax, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftlw 
$1, %k0, %k0 ; AVX512DQNOBW-NEXT: andl $1, %edi -; AVX512DQNOBW-NEXT: kmovw %edi, %k1 -; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kmovw %edi, %k0 +; AVX512DQNOBW-NEXT: movw $-4, %ax +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $1, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $1, %k1, %k1 +; AVX512DQNOBW-NEXT: korw %k0, %k1, %k0 ; AVX512DQNOBW-NEXT: kmovw %k0, %eax ; AVX512DQNOBW-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512DQNOBW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll index 41bdaf21baa38..fd722e1beb135 100644 --- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll +++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll @@ -2181,32 +2181,32 @@ define i128 @test_insertelement_variable_v128i1(<128 x i8> %a, i8 %b, i32 %index define void @test_concat_v2i1(<2 x half>* %arg, <2 x half>* %arg1, <2 x half>* %arg2) { ; KNL-LABEL: test_concat_v2i1: ; KNL: ## %bb.0: -; KNL-NEXT: movzwl (%rdi), %eax -; KNL-NEXT: movzwl 2(%rdi), %ecx +; KNL-NEXT: movzwl 2(%rdi), %eax +; KNL-NEXT: movzwl (%rdi), %ecx ; KNL-NEXT: vmovd %ecx, %xmm0 ; KNL-NEXT: vcvtph2ps %xmm0, %xmm0 ; KNL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; KNL-NEXT: vucomiss %xmm1, %xmm0 ; KNL-NEXT: setb %cl +; KNL-NEXT: andl $1, %ecx ; KNL-NEXT: kmovw %ecx, %k0 -; KNL-NEXT: kshiftlw $1, %k0, %k0 ; KNL-NEXT: vmovd %eax, %xmm2 ; KNL-NEXT: vcvtph2ps %xmm2, %xmm2 ; KNL-NEXT: vucomiss %xmm1, %xmm2 ; KNL-NEXT: setb %al -; KNL-NEXT: andl $1, %eax ; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: korw %k0, %k1, %k0 +; KNL-NEXT: kshiftlw $1, %k1, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 ; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; KNL-NEXT: vucomiss %xmm1, %xmm0 ; KNL-NEXT: seta %al +; KNL-NEXT: andl $1, %eax ; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: kshiftlw $1, %k1, %k1 ; KNL-NEXT: vucomiss %xmm1, %xmm2 ; KNL-NEXT: seta %al -; KNL-NEXT: andl $1, %eax ; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: korw %k1, %k2, %k1 +; 
KNL-NEXT: kshiftlw $1, %k2, %k2 +; KNL-NEXT: korw %k2, %k1, %k1 ; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: kshiftrw $1, %k0, %k1 ; KNL-NEXT: kmovw %k1, %ecx diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll index e67b81581396d..67067e3fff27d 100644 --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -5157,13 +5157,13 @@ define <64 x i1> @mask64_insert(i32 %a) { ; KNL-LABEL: mask64_insert: ; KNL: ## %bb.0: ; KNL-NEXT: movq %rdi, %rax -; KNL-NEXT: movw $-4, %cx -; KNL-NEXT: kmovw %ecx, %k0 -; KNL-NEXT: kshiftrw $1, %k0, %k0 -; KNL-NEXT: kshiftlw $1, %k0, %k0 ; KNL-NEXT: andl $1, %esi -; KNL-NEXT: kmovw %esi, %k1 -; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kmovw %esi, %k0 +; KNL-NEXT: movw $-4, %cx +; KNL-NEXT: kmovw %ecx, %k1 +; KNL-NEXT: kshiftrw $1, %k1, %k1 +; KNL-NEXT: kshiftlw $1, %k1, %k1 +; KNL-NEXT: korw %k0, %k1, %k0 ; KNL-NEXT: kmovw %k0, (%rdi) ; KNL-NEXT: movw $-3, 6(%rdi) ; KNL-NEXT: movl $-131075, 2(%rdi) ## imm = 0xFFFDFFFD @@ -5198,13 +5198,13 @@ define <64 x i1> @mask64_insert(i32 %a) { ; AVX512DQ-LABEL: mask64_insert: ; AVX512DQ: ## %bb.0: ; AVX512DQ-NEXT: movq %rdi, %rax -; AVX512DQ-NEXT: movw $-4, %cx -; AVX512DQ-NEXT: kmovw %ecx, %k0 -; AVX512DQ-NEXT: kshiftrw $1, %k0, %k0 -; AVX512DQ-NEXT: kshiftlw $1, %k0, %k0 ; AVX512DQ-NEXT: andl $1, %esi -; AVX512DQ-NEXT: kmovw %esi, %k1 -; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kmovw %esi, %k0 +; AVX512DQ-NEXT: movw $-4, %cx +; AVX512DQ-NEXT: kmovw %ecx, %k1 +; AVX512DQ-NEXT: kshiftrw $1, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $1, %k1, %k1 +; AVX512DQ-NEXT: korw %k0, %k1, %k0 ; AVX512DQ-NEXT: kmovw %k0, (%rdi) ; AVX512DQ-NEXT: movw $-3, 6(%rdi) ; AVX512DQ-NEXT: movl $-131075, 2(%rdi) ## imm = 0xFFFDFFFD diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll index 719bd9f9d95f0..7dcae9a2d24ac 100644 --- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll +++ 
b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll @@ -1434,8 +1434,8 @@ define <4 x i32> @zext_bool_logic(<4 x i64> %cond1, <4 x i64> %cond2, <4 x i32> define void @half_vec_compare(<2 x half>* %x, <2 x i8>* %y) { ; KNL-LABEL: half_vec_compare: ; KNL: ## %bb.0: ## %entry -; KNL-NEXT: movzwl (%rdi), %eax ## encoding: [0x0f,0xb7,0x07] -; KNL-NEXT: movzwl 2(%rdi), %ecx ## encoding: [0x0f,0xb7,0x4f,0x02] +; KNL-NEXT: movzwl 2(%rdi), %eax ## encoding: [0x0f,0xb7,0x47,0x02] +; KNL-NEXT: movzwl (%rdi), %ecx ## encoding: [0x0f,0xb7,0x0f] ; KNL-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] ; KNL-NEXT: vcvtph2ps %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x13,0xc0] ; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] @@ -1443,17 +1443,17 @@ define void @half_vec_compare(<2 x half>* %x, <2 x i8>* %y) { ; KNL-NEXT: setp %cl ## encoding: [0x0f,0x9a,0xc1] ; KNL-NEXT: setne %dl ## encoding: [0x0f,0x95,0xc2] ; KNL-NEXT: orb %cl, %dl ## encoding: [0x08,0xca] +; KNL-NEXT: andl $1, %edx ## encoding: [0x83,0xe2,0x01] ; KNL-NEXT: kmovw %edx, %k0 ## encoding: [0xc5,0xf8,0x92,0xc2] -; KNL-NEXT: kshiftlw $1, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x01] ; KNL-NEXT: vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0] ; KNL-NEXT: vcvtph2ps %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x13,0xc0] ; KNL-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1] ; KNL-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0] ; KNL-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1] ; KNL-NEXT: orb %al, %cl ## encoding: [0x08,0xc1] -; KNL-NEXT: andl $1, %ecx ## encoding: [0x83,0xe1,0x01] ; KNL-NEXT: kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9] -; KNL-NEXT: korw %k0, %k1, %k1 ## encoding: [0xc5,0xf4,0x45,0xc8] +; KNL-NEXT: kshiftlw $1, %k1, %k1 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x01] +; KNL-NEXT: korw %k1, %k0, %k1 ## encoding: [0xc5,0xfc,0x45,0xc9] ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 
{%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x25,0xc0,0xff] ; KNL-NEXT: vpmovdw %zmm0, %ymm0 ## encoding: [0x62,0xf2,0x7e,0x48,0x33,0xc0] ; KNL-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x63,0xc0] @@ -1465,8 +1465,8 @@ define void @half_vec_compare(<2 x half>* %x, <2 x i8>* %y) { ; ; AVX512BW-LABEL: half_vec_compare: ; AVX512BW: ## %bb.0: ## %entry -; AVX512BW-NEXT: movzwl (%rdi), %eax ## encoding: [0x0f,0xb7,0x07] -; AVX512BW-NEXT: movzwl 2(%rdi), %ecx ## encoding: [0x0f,0xb7,0x4f,0x02] +; AVX512BW-NEXT: movzwl 2(%rdi), %eax ## encoding: [0x0f,0xb7,0x47,0x02] +; AVX512BW-NEXT: movzwl (%rdi), %ecx ## encoding: [0x0f,0xb7,0x0f] ; AVX512BW-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] ; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x13,0xc0] ; AVX512BW-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] @@ -1474,17 +1474,17 @@ define void @half_vec_compare(<2 x half>* %x, <2 x i8>* %y) { ; AVX512BW-NEXT: setp %cl ## encoding: [0x0f,0x9a,0xc1] ; AVX512BW-NEXT: setne %dl ## encoding: [0x0f,0x95,0xc2] ; AVX512BW-NEXT: orb %cl, %dl ## encoding: [0x08,0xca] -; AVX512BW-NEXT: kmovd %edx, %k0 ## encoding: [0xc5,0xfb,0x92,0xc2] -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x01] +; AVX512BW-NEXT: andl $1, %edx ## encoding: [0x83,0xe2,0x01] +; AVX512BW-NEXT: kmovw %edx, %k0 ## encoding: [0xc5,0xf8,0x92,0xc2] ; AVX512BW-NEXT: vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0] ; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x13,0xc0] ; AVX512BW-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1] ; AVX512BW-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0] ; AVX512BW-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1] ; AVX512BW-NEXT: orb %al, %cl ## encoding: [0x08,0xc1] -; AVX512BW-NEXT: andl $1, %ecx ## encoding: [0x83,0xe1,0x01] -; AVX512BW-NEXT: kmovw %ecx, %k1 ## encoding: 
[0xc5,0xf8,0x92,0xc9] -; AVX512BW-NEXT: korw %k0, %k1, %k0 ## encoding: [0xc5,0xf4,0x45,0xc0] +; AVX512BW-NEXT: kmovd %ecx, %k1 ## encoding: [0xc5,0xfb,0x92,0xc9] +; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x01] +; AVX512BW-NEXT: korw %k1, %k0, %k0 ## encoding: [0xc5,0xfc,0x45,0xc1] ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ## encoding: [0x62,0xf2,0xfe,0x48,0x28,0xc0] ; AVX512BW-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x63,0xc0] ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdb,0x05,A,A,A,A] diff --git a/llvm/test/CodeGen/X86/base-pointer-and-mwaitx.ll b/llvm/test/CodeGen/X86/base-pointer-and-mwaitx.ll new file mode 100644 index 0000000000000..fead4c650c0af --- /dev/null +++ b/llvm/test/CodeGen/X86/base-pointer-and-mwaitx.ll @@ -0,0 +1,210 @@ +; RUN: llc -mtriple=x86_64-pc-linux-gnu -mattr=+mwaitx -x86-use-base-pointer=true -stackrealign -stack-alignment=32 %s -o - | FileCheck --check-prefix=CHECK --check-prefix=USE_BASE_64 %s +; RUN: llc -mtriple=x86_64-pc-linux-gnux32 -mattr=+mwaitx -x86-use-base-pointer=true -stackrealign -stack-alignment=32 %s -o - | FileCheck --check-prefix=CHECK --check-prefix=USE_BASE_32 %s +; RUN: llc -mtriple=x86_64-pc-linux-gnu -mattr=+mwaitx -x86-use-base-pointer=true %s -o - | FileCheck --check-prefix=CHECK --check-prefix=NO_BASE_64 %s +; RUN: llc -mtriple=x86_64-pc-linux-gnux32 -mattr=+mwaitx -x86-use-base-pointer=true %s -o - | FileCheck --check-prefix=CHECK --check-prefix=NO_BASE_32 %s + +; This test checks that we save and restore the base pointer (ebx or rbx) in the +; presence of the mwaitx intrinsic which requires to use ebx for one of its +; argument. +; This function uses a dynamically allocated stack to force the use +; of a base pointer. +; After the call to the mwaitx intrinsic we do a volatile store to the +; dynamically allocated memory which will require the use of the base pointer. 
+; The base pointer should therefore be restored straight after the mwaitx +; instruction. + +define void @test_baseptr(i64 %x, i64 %y, i32 %E, i32 %H, i32 %C) nounwind { +entry: + %ptr = alloca i8*, align 8 + %0 = alloca i8, i64 %x, align 16 + store i8* %0, i8** %ptr, align 8 + call void @llvm.x86.mwaitx(i32 %E, i32 %H, i32 %C) + %1 = load i8*, i8** %ptr, align 8 + %arrayidx = getelementptr inbounds i8, i8* %1, i64 %y + store volatile i8 42, i8* %arrayidx, align 1 + ret void +} +; CHECK-LABEL: test_baseptr: +; USE_BASE_64: movq %rsp, %rbx +; Pass mwaitx first 2 arguments in eax and ecx respectively. +; USE_BASE_64: movl %ecx, %eax +; USE_BASE_64: movl %edx, %ecx +; Save base pointer. +; USE_BASE_64: movq %rbx, [[SAVE_rbx:%r([8-9]|1[0-5]|di|si)]] +; Set mwaitx ebx argument. +; USE_BASE_64: movl %r8d, %ebx +; USE_BASE_64-NEXT: mwaitx +; Restore base pointer. +; USE_BASE_64-NEXT: movq [[SAVE_rbx]], %rbx + +; USE_BASE_32: movl %esp, %ebx +; Pass mwaitx first 2 arguments in eax and ecx respectively. +; USE_BASE_32: movl %ecx, %eax +; USE_BASE_32: movl %edx, %ecx +; Save base pointer. +; USE_BASE_32: movl %ebx, [[SAVE_ebx:%e(di|si)]] +; Set mwaitx ebx argument. +; USE_BASE_32: movl %r8d, %ebx +; USE_BASE_32-NEXT: mwaitx +; Restore base pointer. +; USE_BASE_32-NEXT: movl [[SAVE_ebx]], %ebx + +; Pass mwaitx 3 arguments in eax, ecx, ebx +; NO_BASE_64: movl %r8d, %ebx +; NO_BASE_64: movl %ecx, %eax +; NO_BASE_64: movl %edx, %ecx +; No need to save base pointer. +; NO_BASE_64-NOT: movq %rbx +; NO_BASE_64: mwaitx +; No need to restore base pointer. +; NO_BASE_64-NOT: movq {{.*}}, %rbx +; NO_BASE_64-NEXT: {{.+$}} + +; Pass mwaitx 3 arguments in eax, ecx, ebx +; NO_BASE_32: movl %r8d, %ebx +; NO_BASE_32: movl %ecx, %eax +; NO_BASE_32: movl %edx, %ecx +; No need to save base pointer. +; NO_BASE_32-NOT: movl %ebx +; NO_BASE_32: mwaitx +; No need to restore base pointer. 
+; NO_BASE_32-NOT: movl {{.*}}, %ebx +; NO_BASE_32-NEXT: {{.+$}} + +; Test of the case where an opaque sp adjustement is introduced by a separate +; basic block which, combined with stack realignment, requires a base pointer. +@g = global i32 0, align 8 + +define void @test_opaque_sp_adjustment(i32 %E, i32 %H, i32 %C, i64 %x) { +entry: + %ptr = alloca i8*, align 8 + call void @llvm.x86.mwaitx(i32 %E, i32 %H, i32 %C) + %g = load i32, i32* @g, align 4 + %tobool = icmp ne i32 %g, 0 + br i1 %tobool, label %if.then, label %if.end + +if.then: + call void asm sideeffect "", "~{rsp},~{esp},~{dirflag},~{fpsr},~{flags}"() + br label %if.end + +if.end: + %ptr2 = load i8*, i8** %ptr, align 8 + %arrayidx = getelementptr inbounds i8, i8* %ptr2, i64 %x + store volatile i8 42, i8* %arrayidx, align 1 + ret void +} +; CHECK-LABEL: test_opaque_sp_adjustment: +; USE_BASE_64: movq %rsp, %rbx +; Pass mwaitx first 2 arguments in eax and ecx respectively. +; USE_BASE_64: movl %esi, %eax +; USE_BASE_64: movl %edi, %ecx +; Save base pointer. +; USE_BASE_64: movq %rbx, [[SAVE_rbx:%r([8-9]|1[0-5]|di|si)]] +; Set mwaitx ebx argument. +; USE_BASE_64: movl %edx, %ebx +; USE_BASE_64-NEXT: mwaitx +; Restore base pointer. +; USE_BASE_64-NEXT: movq [[SAVE_rbx]], %rbx + +; USE_BASE_32: movl %esp, %ebx +; Pass mwaitx first 2 arguments in eax and ecx respectively. +; USE_BASE_32: movl %esi, %eax +; USE_BASE_32: movl %edi, %ecx +; Save base pointer. +; USE_BASE_32: movl %ebx, [[SAVE_ebx:%e(di|si)]] +; Set mwaitx ebx argument. +; USE_BASE_32: movl %edx, %ebx +; USE_BASE_32-NEXT: mwaitx +; Restore base pointer. +; USE_BASE_32-NEXT: movl [[SAVE_ebx]], %ebx + +; Pass mwaitx 3 arguments in eax, ecx, ebx +; NO_BASE_64: movl %edx, %ebx +; NO_BASE_64: movl %esi, %eax +; NO_BASE_64: movl %edi, %ecx +; No need to save base pointer. 
+; NO_BASE_64-NOT: movq %rbx +; NO_BASE_64: mwaitx +; NO_BASE_64-NOT: movq {{.*}}, %rbx +; NO_BASE_64-NEXT: {{.+$}} + +; Pass mwaitx 3 arguments in eax, ecx, ebx +; NO_BASE_32: movl %edx, %ebx +; NO_BASE_32: movl %esi, %eax +; NO_BASE_32: movl %edi, %ecx +; No need to save base pointer. +; NO_BASE_32-NOT: movl %ebx +; NO_BASE_32: mwaitx +; No need to restore base pointer. +; NO_BASE_32-NOT: movl {{.*}}, %ebx +; NO_BASE_32-NEXT: {{.+$}} + +; Test of the case where a variable size object is introduced by a separate +; basic block which, combined with stack realignment, requires a base pointer. +define void @test_variable_size_object(i32 %E, i32 %H, i32 %C, i64 %x) { +entry: + %ptr = alloca i8*, align 8 + call void @llvm.x86.mwaitx(i32 %E, i32 %H, i32 %C) + %g = load i32, i32* @g, align 4 + %tobool = icmp ne i32 %g, 0 + br i1 %tobool, label %if.then, label %if.end + +if.then: + %i5 = alloca i8, i64 %x, align 16 + store i8* %i5, i8** %ptr, align 8 + br label %if.end + +if.end: + %ptr2 = load i8*, i8** %ptr, align 8 + %arrayidx = getelementptr inbounds i8, i8* %ptr2, i64 %x + store volatile i8 42, i8* %arrayidx, align 1 + ret void +} + +; CHECK-LABEL: test_variable_size_object: +; USE_BASE_64: movq %rsp, %rbx +; Pass mwaitx first 2 arguments in eax and ecx respectively. +; USE_BASE_64: movl %esi, %eax +; USE_BASE_64: movl %edi, %ecx +; Save base pointer. +; USE_BASE_64: movq %rbx, [[SAVE_rbx:%r([8-9]|1[0-5]|di|si)]] +; Set mwaitx ebx argument. +; USE_BASE_64: movl %edx, %ebx +; USE_BASE_64-NEXT: mwaitx +; Restore base pointer. +; USE_BASE_64-NEXT: movq [[SAVE_rbx]], %rbx + +; USE_BASE_32: movl %esp, %ebx +; Pass mwaitx first 2 arguments in eax and ecx respectively. +; USE_BASE_32: movl %esi, %eax +; USE_BASE_32: movl %edi, %ecx +; Save base pointer. +; USE_BASE_32: movl %ebx, [[SAVE_ebx:%e(di|si)]] +; Set mwaitx ebx argument. +; USE_BASE_32: movl %edx, %ebx +; USE_BASE_32-NEXT: mwaitx +; Restore base pointer. 
+; USE_BASE_32-NEXT: movl [[SAVE_ebx]], %ebx + +; Pass mwaitx 3 arguments in eax, ecx, ebx +; NO_BASE_64: movl %edx, %ebx +; NO_BASE_64: movl %esi, %eax +; NO_BASE_64: movl %edi, %ecx +; No need to save base pointer. +; NO_BASE_64-NOT: movq %rbx +; NO_BASE_64: mwaitx +; NO_BASE_64-NOT: movq {{.*}}, %rbx +; NO_BASE_64-NEXT: {{.+$}} + +; Pass mwaitx 3 arguments in eax, ecx, ebx +; NO_BASE_32: movl %edx, %ebx +; NO_BASE_32: movl %esi, %eax +; NO_BASE_32: movl %edi, %ecx +; No need to save base pointer. +; NO_BASE_32-NOT: movl %ebx +; NO_BASE_32: mwaitx +; No need to restore base pointer. +; NO_BASE_32-NOT: movl {{.*}}, %ebx +; NO_BASE_32-NEXT: {{.+$}} + +declare void @llvm.x86.mwaitx(i32, i32, i32) nounwind diff --git a/llvm/test/CodeGen/X86/cfguard-x86-vectorcall.ll b/llvm/test/CodeGen/X86/cfguard-x86-vectorcall.ll index 4446f360ec042..25e3691913c8c 100644 --- a/llvm/test/CodeGen/X86/cfguard-x86-vectorcall.ll +++ b/llvm/test/CodeGen/X86/cfguard-x86-vectorcall.ll @@ -1,35 +1,9 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_rip ; RUN: llc < %s -mtriple=i686-pc-windows-msvc | FileCheck %s -check-prefix=X32 ; Control Flow Guard is currently only available on Windows ; Test that Control Flow Guard checks are correctly added for x86 vector calls. 
define void @func_cf_vector_x86(void (%struct.HVA)* %0, %struct.HVA* %1) #0 { -; X32-LABEL: func_cf_vector_x86: -; X32: # %bb.0: # %entry -; X32-NEXT: pushl %ebp -; X32-NEXT: movl %esp, %ebp -; X32-NEXT: andl $-16, %esp -; X32-NEXT: subl $48, %esp -; X32-NEXT: movl 8(%ebp), %ecx -; X32-NEXT: movl 12(%ebp), %eax -; X32-NEXT: movups (%eax), %xmm0 -; X32-NEXT: movups 16(%eax), %xmm1 -; X32-NEXT: movaps %xmm0, (%esp) -; X32-NEXT: movaps %xmm1, 16(%esp) -; X32-NEXT: movsd (%esp), %xmm4 -; X32-NEXT: movsd 8(%esp), %xmm5 -; X32-NEXT: movsd 16(%esp), %xmm6 -; X32-NEXT: movsd 24(%esp), %xmm7 -; X32-NEXT: calll *___guard_check_icall_fptr -; X32-NEXT: movaps %xmm4, %xmm0 -; X32-NEXT: movaps %xmm5, %xmm1 -; X32-NEXT: movaps %xmm6, %xmm2 -; X32-NEXT: movaps %xmm7, %xmm3 -; X32-NEXT: calll *%ecx -; X32-NEXT: movl %ebp, %esp -; X32-NEXT: popl %ebp -; X32-NEXT: retl entry: %2 = alloca %struct.HVA, align 8 %3 = bitcast %struct.HVA* %2 to i8* @@ -39,6 +13,23 @@ entry: call x86_vectorcallcc void %0(%struct.HVA inreg %5) ret void + ; X32-LABEL: func_cf_vector_x86 + ; X32: movl 12(%ebp), %eax + ; X32: movl 8(%ebp), %ecx + ; X32: movsd 24(%eax), %xmm4 # xmm4 = mem[0],zero + ; X32: movsd %xmm4, 24(%esp) + ; X32: movsd 16(%eax), %xmm5 # xmm5 = mem[0],zero + ; X32: movsd %xmm5, 16(%esp) + ; X32: movsd (%eax), %xmm6 # xmm6 = mem[0],zero + ; X32: movsd 8(%eax), %xmm7 # xmm7 = mem[0],zero + ; X32: movsd %xmm7, 8(%esp) + ; X32: movsd %xmm6, (%esp) + ; X32: calll *___guard_check_icall_fptr + ; X32: movaps %xmm6, %xmm0 + ; X32: movaps %xmm7, %xmm1 + ; X32: movaps %xmm5, %xmm2 + ; X32: movaps %xmm4, %xmm3 + ; X32: calll *%ecx } attributes #0 = { "target-cpu"="pentium4" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" } diff --git a/llvm/test/CodeGen/X86/cmov-fp.ll b/llvm/test/CodeGen/X86/cmov-fp.ll index 6bbad427a9b6d..756324bbdfdc9 100644 --- a/llvm/test/CodeGen/X86/cmov-fp.ll +++ b/llvm/test/CodeGen/X86/cmov-fp.ll @@ -1056,11 +1056,11 @@ define float @test16(i32 %a, i32 %b, float %x) 
nounwind { define x86_fp80 @test17(i32 %a, i32 %b, x86_fp80 %x) nounwind { ; SSE-LABEL: test17: ; SSE: # %bb.0: -; SSE-NEXT: fldt {{[0-9]+}}(%esp) ; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; SSE-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; SSE-NEXT: fldt {{[0-9]+}}(%esp) ; SSE-NEXT: flds {{\.LCPI.*}} ; SSE-NEXT: fxch %st(1) +; SSE-NEXT: cmpl {{[0-9]+}}(%esp), %eax ; SSE-NEXT: fcmovnbe %st(1), %st ; SSE-NEXT: fstp %st(1) ; SSE-NEXT: retl @@ -1109,11 +1109,11 @@ define x86_fp80 @test17(i32 %a, i32 %b, x86_fp80 %x) nounwind { define x86_fp80 @test18(i32 %a, i32 %b, x86_fp80 %x) nounwind { ; SSE-LABEL: test18: ; SSE: # %bb.0: -; SSE-NEXT: fldt {{[0-9]+}}(%esp) ; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; SSE-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; SSE-NEXT: fldt {{[0-9]+}}(%esp) ; SSE-NEXT: flds {{\.LCPI.*}} ; SSE-NEXT: fxch %st(1) +; SSE-NEXT: cmpl {{[0-9]+}}(%esp), %eax ; SSE-NEXT: fcmovnb %st(1), %st ; SSE-NEXT: fstp %st(1) ; SSE-NEXT: retl @@ -1162,11 +1162,11 @@ define x86_fp80 @test18(i32 %a, i32 %b, x86_fp80 %x) nounwind { define x86_fp80 @test19(i32 %a, i32 %b, x86_fp80 %x) nounwind { ; SSE-LABEL: test19: ; SSE: # %bb.0: -; SSE-NEXT: fldt {{[0-9]+}}(%esp) ; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; SSE-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; SSE-NEXT: fldt {{[0-9]+}}(%esp) ; SSE-NEXT: flds {{\.LCPI.*}} ; SSE-NEXT: fxch %st(1) +; SSE-NEXT: cmpl {{[0-9]+}}(%esp), %eax ; SSE-NEXT: fcmovb %st(1), %st ; SSE-NEXT: fstp %st(1) ; SSE-NEXT: retl @@ -1215,11 +1215,11 @@ define x86_fp80 @test19(i32 %a, i32 %b, x86_fp80 %x) nounwind { define x86_fp80 @test20(i32 %a, i32 %b, x86_fp80 %x) nounwind { ; SSE-LABEL: test20: ; SSE: # %bb.0: -; SSE-NEXT: fldt {{[0-9]+}}(%esp) ; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; SSE-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; SSE-NEXT: fldt {{[0-9]+}}(%esp) ; SSE-NEXT: flds {{\.LCPI.*}} ; SSE-NEXT: fxch %st(1) +; SSE-NEXT: cmpl {{[0-9]+}}(%esp), %eax ; SSE-NEXT: fcmovbe %st(1), %st ; SSE-NEXT: fstp %st(1) ; SSE-NEXT: retl @@ -1268,13 +1268,13 @@ define x86_fp80 
@test20(i32 %a, i32 %b, x86_fp80 %x) nounwind { define x86_fp80 @test21(i32 %a, i32 %b, x86_fp80 %x) nounwind { ; SSE-LABEL: test21: ; SSE: # %bb.0: -; SSE-NEXT: fldt {{[0-9]+}}(%esp) ; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE-NEXT: fldt {{[0-9]+}}(%esp) +; SSE-NEXT: flds {{\.LCPI.*}} +; SSE-NEXT: fxch %st(1) ; SSE-NEXT: cmpl {{[0-9]+}}(%esp), %eax ; SSE-NEXT: setg %al ; SSE-NEXT: testb %al, %al -; SSE-NEXT: flds {{\.LCPI.*}} -; SSE-NEXT: fxch %st(1) ; SSE-NEXT: fcmovne %st(1), %st ; SSE-NEXT: fstp %st(1) ; SSE-NEXT: retl @@ -1328,13 +1328,13 @@ define x86_fp80 @test21(i32 %a, i32 %b, x86_fp80 %x) nounwind { define x86_fp80 @test22(i32 %a, i32 %b, x86_fp80 %x) nounwind { ; SSE-LABEL: test22: ; SSE: # %bb.0: -; SSE-NEXT: fldt {{[0-9]+}}(%esp) ; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE-NEXT: fldt {{[0-9]+}}(%esp) +; SSE-NEXT: flds {{\.LCPI.*}} +; SSE-NEXT: fxch %st(1) ; SSE-NEXT: cmpl {{[0-9]+}}(%esp), %eax ; SSE-NEXT: setge %al ; SSE-NEXT: testb %al, %al -; SSE-NEXT: flds {{\.LCPI.*}} -; SSE-NEXT: fxch %st(1) ; SSE-NEXT: fcmovne %st(1), %st ; SSE-NEXT: fstp %st(1) ; SSE-NEXT: retl @@ -1387,13 +1387,13 @@ define x86_fp80 @test22(i32 %a, i32 %b, x86_fp80 %x) nounwind { define x86_fp80 @test23(i32 %a, i32 %b, x86_fp80 %x) nounwind { ; SSE-LABEL: test23: ; SSE: # %bb.0: -; SSE-NEXT: fldt {{[0-9]+}}(%esp) ; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE-NEXT: fldt {{[0-9]+}}(%esp) +; SSE-NEXT: flds {{\.LCPI.*}} +; SSE-NEXT: fxch %st(1) ; SSE-NEXT: cmpl {{[0-9]+}}(%esp), %eax ; SSE-NEXT: setl %al ; SSE-NEXT: testb %al, %al -; SSE-NEXT: flds {{\.LCPI.*}} -; SSE-NEXT: fxch %st(1) ; SSE-NEXT: fcmovne %st(1), %st ; SSE-NEXT: fstp %st(1) ; SSE-NEXT: retl @@ -1446,13 +1446,13 @@ define x86_fp80 @test23(i32 %a, i32 %b, x86_fp80 %x) nounwind { define x86_fp80 @test24(i32 %a, i32 %b, x86_fp80 %x) nounwind { ; SSE-LABEL: test24: ; SSE: # %bb.0: -; SSE-NEXT: fldt {{[0-9]+}}(%esp) ; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE-NEXT: fldt {{[0-9]+}}(%esp) +; SSE-NEXT: flds 
{{\.LCPI.*}} +; SSE-NEXT: fxch %st(1) ; SSE-NEXT: cmpl {{[0-9]+}}(%esp), %eax ; SSE-NEXT: setle %al ; SSE-NEXT: testb %al, %al -; SSE-NEXT: flds {{\.LCPI.*}} -; SSE-NEXT: fxch %st(1) ; SSE-NEXT: fcmovne %st(1), %st ; SSE-NEXT: fstp %st(1) ; SSE-NEXT: retl diff --git a/llvm/test/CodeGen/X86/cpus-intel.ll b/llvm/test/CodeGen/X86/cpus-intel.ll index e0dd647409f95..b065c6d34f80a 100644 --- a/llvm/test/CodeGen/X86/cpus-intel.ll +++ b/llvm/test/CodeGen/X86/cpus-intel.ll @@ -40,6 +40,7 @@ ; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=icelake-client 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty ; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=icelake-server 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty ; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=tigerlake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty +; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=sapphirerapids 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty ; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=atom 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty ; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=bonnell 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty ; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=silvermont 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty diff --git a/llvm/test/CodeGen/X86/embed-bitcode.ll b/llvm/test/CodeGen/X86/embed-bitcode.ll new file mode 100644 index 0000000000000..00dd7ef17d56f --- /dev/null +++ b/llvm/test/CodeGen/X86/embed-bitcode.ll @@ -0,0 +1,10 @@ +; RUN: llc -filetype=obj -mtriple=x86_64 %s -o %t +; RUN: llvm-readelf -S %t | FileCheck %s + +; CHECK: .text PROGBITS 0000000000000000 [[#%x,OFF:]] 000000 00 AX 0 +; CHECK-NEXT: .llvmbc PROGBITS 0000000000000000 [[#%x,OFF:]] 000004 00 0 +; CHECK-NEXT: .llvmcmd PROGBITS 0000000000000000 
[[#%x,OFF:]] 000005 00 0 + +@llvm.embedded.module = private constant [4 x i8] c"BC\C0\DE", section ".llvmbc" +@llvm.cmdline = private constant [5 x i8] c"-cc1\00", section ".llvmcmd" +@llvm.compiler.used = appending global [2 x i8*] [i8* getelementptr inbounds ([4 x i8], [4 x i8]* @llvm.embedded.module, i32 0, i32 0), i8* getelementptr inbounds ([5 x i8], [5 x i8]* @llvm.cmdline, i32 0, i32 0)], section "llvm.metadata" diff --git a/llvm/test/CodeGen/X86/full-lsr.ll b/llvm/test/CodeGen/X86/full-lsr.ll index 36c98c8b494e3..14cee4dbe0419 100644 --- a/llvm/test/CodeGen/X86/full-lsr.ll +++ b/llvm/test/CodeGen/X86/full-lsr.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=i686-- -mcpu=generic | FileCheck %s +; RUN: llc < %s -mtriple=i686-- | FileCheck %s define void @foo(float* nocapture %A, float* nocapture %B, float* nocapture %C, i32 %N) nounwind { ; CHECK: foo diff --git a/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll b/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll index d551ed9a0931d..4a2c7c07e5518 100644 --- a/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll +++ b/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-darwin -mcpu=generic | FileCheck %s -check-prefixes=CHECK,GENERIC +; RUN: llc < %s -mtriple=x86_64-darwin | FileCheck %s -check-prefixes=CHECK,GENERIC ; RUN: llc < %s -mtriple=x86_64-darwin -mcpu=atom | FileCheck %s -check-prefixes=CHECK,ATOM @Te0 = external global [256 x i32] ; <[256 x i32]*> [#uses=5] diff --git a/llvm/test/CodeGen/X86/lsr-static-addr.ll b/llvm/test/CodeGen/X86/lsr-static-addr.ll index 1c22e3ba1fa2b..6609c3dc0e62c 100644 --- a/llvm/test/CodeGen/X86/lsr-static-addr.ll +++ b/llvm/test/CodeGen/X86/lsr-static-addr.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mcpu=generic -mtriple=x86_64-unknown-linux-gnu -relocation-model=static < %s | FileCheck %s --check-prefix=CHECK +; 
RUN: llc -mtriple=x86_64-unknown-linux-gnu -relocation-model=static < %s | FileCheck %s --check-prefix=CHECK ; RUN: llc -mcpu=atom -mtriple=x86_64-unknown-linux-gnu -relocation-model=static < %s | FileCheck %s --check-prefix=ATOM @A = external global [0 x double] diff --git a/llvm/test/CodeGen/X86/masked-iv-safe.ll b/llvm/test/CodeGen/X86/masked-iv-safe.ll index 2e86f9f23e397..a9b53205dd197 100644 --- a/llvm/test/CodeGen/X86/masked-iv-safe.ll +++ b/llvm/test/CodeGen/X86/masked-iv-safe.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mcpu=generic -mtriple=x86_64-- | FileCheck %s --implicit-check-not '{{and|movz|sar|shl}}' +; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --implicit-check-not '{{and|movz|sar|shl}}' ; Optimize away zext-inreg and sext-inreg on the loop induction ; variable using trip-count information. diff --git a/llvm/test/CodeGen/X86/optimize-max-3.ll b/llvm/test/CodeGen/X86/optimize-max-3.ll index 71885efbd31fb..220b1044ed99f 100644 --- a/llvm/test/CodeGen/X86/optimize-max-3.ll +++ b/llvm/test/CodeGen/X86/optimize-max-3.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -asm-verbose=false | FileCheck %s -; RUN: llc < %s -mcpu=generic -mtriple=x86_64-win32 -asm-verbose=false | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-linux -asm-verbose=false | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-win32 -asm-verbose=false | FileCheck %s ; LSR's OptimizeMax should eliminate the select (max). 
diff --git a/llvm/test/CodeGen/X86/oss-fuzz-25184.ll b/llvm/test/CodeGen/X86/oss-fuzz-25184.ll new file mode 100644 index 0000000000000..45ff7fa8f7ee5 --- /dev/null +++ b/llvm/test/CodeGen/X86/oss-fuzz-25184.ll @@ -0,0 +1,18 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-apple-darwin19.5.0 | FileCheck %s + +; OSS fuzz: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=25184 + +define <2 x double> @test_fpext() { +; CHECK-LABEL: test_fpext: +; CHECK: ## %bb.0: +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: retq + %tmp12 = insertelement <4 x float> undef, float 0.000000e+00, i32 3 + %tmp5 = fpext <4 x float> %tmp12 to <4 x double> + %ret = shufflevector <4 x double> %tmp5, <4 x double> undef, <2 x i32> + %E1 = extractelement <4 x double> %tmp5, i16 undef + %I2 = insertelement <2 x double> %ret, double 4.940660e-324, i16 undef + store double %E1, double* undef, align 8 + ret <2 x double> %I2 +} diff --git a/llvm/test/CodeGen/X86/post-ra-sched.ll b/llvm/test/CodeGen/X86/post-ra-sched.ll index 70882fba50608..f6de77a698835 100644 --- a/llvm/test/CodeGen/X86/post-ra-sched.ll +++ b/llvm/test/CodeGen/X86/post-ra-sched.ll @@ -1,6 +1,5 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386 -mcpu=pentium4 | FileCheck %s --check-prefix=PENTIUM4 -; RUN: llc < %s -mtriple=i386 -mcpu=pentium4m | FileCheck %s --check-prefix=PENTIUM4 +; RUN: llc < %s -mtriple=i386 -mcpu=pentium4 | FileCheck %s +; RUN: llc < %s -mtriple=i386 -mcpu=pentium4m | FileCheck %s ; RUN: llc < %s -mtriple=i386 -mcpu=pentium-m | FileCheck %s ; RUN: llc < %s -mtriple=i386 -mcpu=prescott | FileCheck %s ; RUN: llc < %s -mtriple=i386 -mcpu=nocona | FileCheck %s @@ -10,26 +9,12 @@ ; happens during the post-RA-scheduler, which should be enabled by ; default with the above specified cpus. 
-; Pentium4 is the default 32-bit CPU on Linux and currently has the postRA -; scheduler disabled. Leaving the command lines in place in case we change that. - @ptrs = external global [0 x i32*], align 4 @idxa = common global i32 0, align 4 @idxb = common global i32 0, align 4 @res = common global i32 0, align 4 define void @addindirect() { -; PENTIUM4-LABEL: addindirect: -; PENTIUM4: # %bb.0: # %entry -; PENTIUM4-NEXT: movl idxa, %eax -; PENTIUM4-NEXT: movl ptrs(,%eax,4), %eax -; PENTIUM4-NEXT: movl idxb, %ecx -; PENTIUM4-NEXT: movl ptrs(,%ecx,4), %ecx -; PENTIUM4-NEXT: movl (%ecx), %ecx -; PENTIUM4-NEXT: addl (%eax), %ecx -; PENTIUM4-NEXT: movl %ecx, res -; PENTIUM4-NEXT: retl -; ; CHECK-LABEL: addindirect: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl idxb, %ecx diff --git a/llvm/test/CodeGen/X86/pr34088.ll b/llvm/test/CodeGen/X86/pr34088.ll index a57ff09cc037b..6950e50dd7556 100644 --- a/llvm/test/CodeGen/X86/pr34088.ll +++ b/llvm/test/CodeGen/X86/pr34088.ll @@ -6,7 +6,7 @@ %struct.Buffer = type { i8*, i32 } ; This test checks that the load of store %2 is not dropped. 
-; +; define i32 @pr34088() local_unnamed_addr { ; CHECK-LABEL: pr34088: ; CHECK: # %bb.0: # %entry @@ -18,13 +18,13 @@ define i32 @pr34088() local_unnamed_addr { ; CHECK-NEXT: andl $-16, %esp ; CHECK-NEXT: subl $32, %esp ; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: movaps {{.*#+}} xmm1 = [205,205,205,205,205,205,205,205,205,205,205,205,205,205,205,205] +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: movaps %xmm0, (%esp) ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movaps {{.*#+}} xmm1 = [205,205,205,205,205,205,205,205,205,205,205,205,205,205,205,205] -; CHECK-NEXT: movaps %xmm1, (%esp) ; CHECK-NEXT: movl $-842150451, {{[0-9]+}}(%esp) # imm = 0xCDCDCDCD +; CHECK-NEXT: movaps %xmm1, (%esp) ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%esp) -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: movl %ebp, %esp ; CHECK-NEXT: popl %ebp ; CHECK-NEXT: .cfi_def_cfa %esp, 4 diff --git a/llvm/test/CodeGen/X86/pr40539.ll b/llvm/test/CodeGen/X86/pr40539.ll index f52fec51203a8..f2135cd2e73b2 100644 --- a/llvm/test/CodeGen/X86/pr40539.ll +++ b/llvm/test/CodeGen/X86/pr40539.ll @@ -40,6 +40,7 @@ define zeroext i1 @_Z8test_cosv() { ; CHECK-NEXT: subl $8, %esp ; CHECK-NEXT: .cfi_def_cfa_offset 12 ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-NEXT: divss {{\.LCPI.*}}, %xmm0 ; CHECK-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; CHECK-NEXT: flds {{[0-9]+}}(%esp) @@ -48,7 +49,6 @@ define zeroext i1 @_Z8test_cosv() { ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: fstps (%esp) ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-NEXT: ucomiss %xmm0, %xmm1 ; CHECK-NEXT: setae %cl ; CHECK-NEXT: ucomiss {{\.LCPI.*}}, %xmm0 diff --git a/llvm/test/CodeGen/X86/pr47299.ll b/llvm/test/CodeGen/X86/pr47299.ll new file mode 100644 index 0000000000000..2f5d07802c7c1 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr47299.ll @@ -0,0 +1,139 @@ +; NOTE: Assertions have been 
autogenerated by utils/update_llc_test_checks.py +; RUN: llc -O3 -x86-asm-syntax=intel -mtriple=x86_64-linux-generic-march=x86-64 -mcpu=skylake-avx512 < %s | FileCheck %s + +declare <7 x i1> @llvm.get.active.lane.mask.v7i1.i64(i64, i64) +declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64, i64) +declare <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64, i64) +declare <64 x i1> @llvm.get.active.lane.mask.v64i1.i64(i64, i64) +declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32) +declare <64 x i1> @llvm.get.active.lane.mask.v64i1.i32(i32, i32) + +define <7 x i1> @create_mask7(i64 %0) { +; CHECK-LABEL: create_mask7: +; CHECK: # %bb.0: +; CHECK-NEXT: mov rax, rdi +; CHECK-NEXT: vpbroadcastq zmm0, rsi +; CHECK-NEXT: vpcmpnleuq k0, zmm0, zmmword ptr [rip + {{\.LCPI.*}}] +; CHECK-NEXT: kshiftrb k1, k0, 6 +; CHECK-NEXT: kmovd r8d, k1 +; CHECK-NEXT: kshiftrb k1, k0, 5 +; CHECK-NEXT: kmovd r9d, k1 +; CHECK-NEXT: kshiftrb k1, k0, 4 +; CHECK-NEXT: kmovd r10d, k1 +; CHECK-NEXT: kshiftrb k1, k0, 3 +; CHECK-NEXT: kmovd edi, k1 +; CHECK-NEXT: kshiftrb k1, k0, 2 +; CHECK-NEXT: kmovd ecx, k1 +; CHECK-NEXT: kshiftrb k1, k0, 1 +; CHECK-NEXT: kmovd edx, k1 +; CHECK-NEXT: kmovd esi, k0 +; CHECK-NEXT: and sil, 1 +; CHECK-NEXT: and dl, 1 +; CHECK-NEXT: add dl, dl +; CHECK-NEXT: or dl, sil +; CHECK-NEXT: and cl, 1 +; CHECK-NEXT: shl cl, 2 +; CHECK-NEXT: or cl, dl +; CHECK-NEXT: and dil, 1 +; CHECK-NEXT: shl dil, 3 +; CHECK-NEXT: or dil, cl +; CHECK-NEXT: and r10b, 1 +; CHECK-NEXT: shl r10b, 4 +; CHECK-NEXT: or r10b, dil +; CHECK-NEXT: and r9b, 1 +; CHECK-NEXT: shl r9b, 5 +; CHECK-NEXT: or r9b, r10b +; CHECK-NEXT: shl r8b, 6 +; CHECK-NEXT: or r8b, r9b +; CHECK-NEXT: and r8b, 127 +; CHECK-NEXT: mov byte ptr [rax], r8b +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret + %2 = call <7 x i1> @llvm.get.active.lane.mask.v7i1.i64(i64 0, i64 %0) + ret <7 x i1> %2 +} + +define <16 x i1> @create_mask16(i64 %0) { +; CHECK-LABEL: create_mask16: +; CHECK: # %bb.0: +; CHECK-NEXT: 
vpbroadcastq zmm0, rdi +; CHECK-NEXT: vpcmpnleuq k0, zmm0, zmmword ptr [rip + {{\.LCPI.*}}] +; CHECK-NEXT: vpcmpnleuq k1, zmm0, zmmword ptr [rip + {{\.LCPI.*}}] +; CHECK-NEXT: kunpckbw k0, k1, k0 +; CHECK-NEXT: vpmovm2b xmm0, k0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret + %2 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 0, i64 %0) + ret <16 x i1> %2 +} + +define <32 x i1> @create_mask32(i64 %0) { +; CHECK-LABEL: create_mask32: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastq zmm0, rdi +; CHECK-NEXT: vpcmpnleuq k0, zmm0, zmmword ptr [rip + {{\.LCPI.*}}] +; CHECK-NEXT: vpcmpnleuq k1, zmm0, zmmword ptr [rip + {{\.LCPI.*}}] +; CHECK-NEXT: vpcmpnleuq k2, zmm0, zmmword ptr [rip + {{\.LCPI.*}}] +; CHECK-NEXT: kunpckbw k0, k1, k0 +; CHECK-NEXT: vpcmpnleuq k1, zmm0, zmmword ptr [rip + {{\.LCPI.*}}] +; CHECK-NEXT: kunpckbw k1, k1, k2 +; CHECK-NEXT: kunpckwd k0, k1, k0 +; CHECK-NEXT: vpmovm2b ymm0, k0 +; CHECK-NEXT: ret + %2 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 0, i64 %0) + ret <32 x i1> %2 +} + +define <64 x i1> @create_mask64(i64 %0) { +; CHECK-LABEL: create_mask64: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastq zmm0, rdi +; CHECK-NEXT: vpcmpnleuq k0, zmm0, zmmword ptr [rip + {{\.LCPI.*}}] +; CHECK-NEXT: vpcmpnleuq k1, zmm0, zmmword ptr [rip + {{\.LCPI.*}}] +; CHECK-NEXT: vpcmpnleuq k2, zmm0, zmmword ptr [rip + {{\.LCPI.*}}] +; CHECK-NEXT: kunpckbw k0, k1, k0 +; CHECK-NEXT: vpcmpnleuq k1, zmm0, zmmword ptr [rip + {{\.LCPI.*}}] +; CHECK-NEXT: kunpckbw k1, k1, k2 +; CHECK-NEXT: vpcmpnleuq k2, zmm0, zmmword ptr [rip + {{\.LCPI.*}}] +; CHECK-NEXT: kunpckwd k0, k1, k0 +; CHECK-NEXT: vpcmpnleuq k1, zmm0, zmmword ptr [rip + {{\.LCPI.*}}] +; CHECK-NEXT: kunpckbw k1, k1, k2 +; CHECK-NEXT: vpcmpnleuq k2, zmm0, zmmword ptr [rip + {{\.LCPI.*}}] +; CHECK-NEXT: vpcmpnleuq k3, zmm0, zmmword ptr [rip + {{\.LCPI.*}}] +; CHECK-NEXT: kunpckbw k2, k3, k2 +; CHECK-NEXT: kunpckwd k1, k2, k1 +; CHECK-NEXT: kunpckdq k0, k1, k0 +; CHECK-NEXT: vpmovm2b zmm0, k0 +; 
CHECK-NEXT: ret + %2 = call <64 x i1> @llvm.get.active.lane.mask.v64i1.i64(i64 0, i64 %0) + ret <64 x i1> %2 +} + +define <16 x i1> @create_mask16_i32(i32 %0) { +; CHECK-LABEL: create_mask16_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastd zmm0, edi +; CHECK-NEXT: vpcmpnleud k0, zmm0, zmmword ptr [rip + {{\.LCPI.*}}] +; CHECK-NEXT: vpmovm2b xmm0, k0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret + %2 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 0, i32 %0) + ret <16 x i1> %2 +} + +define <64 x i1> @create_mask64_i32(i32 %0) { +; CHECK-LABEL: create_mask64_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastd zmm0, edi +; CHECK-NEXT: vpcmpnleud k0, zmm0, zmmword ptr [rip + {{\.LCPI.*}}] +; CHECK-NEXT: vpcmpnleud k1, zmm0, zmmword ptr [rip + {{\.LCPI.*}}] +; CHECK-NEXT: vpcmpnleud k2, zmm0, zmmword ptr [rip + {{\.LCPI.*}}] +; CHECK-NEXT: kunpckwd k0, k1, k0 +; CHECK-NEXT: vpcmpnleud k1, zmm0, zmmword ptr [rip + {{\.LCPI.*}}] +; CHECK-NEXT: kunpckwd k1, k1, k2 +; CHECK-NEXT: kunpckdq k0, k1, k0 +; CHECK-NEXT: vpmovm2b zmm0, k0 +; CHECK-NEXT: ret + %2 = call <64 x i1> @llvm.get.active.lane.mask.v64i1.i32(i32 0, i32 %0) + ret <64 x i1> %2 +} diff --git a/llvm/test/CodeGen/X86/select.ll b/llvm/test/CodeGen/X86/select.ll index f73a608e096b7..7c46c5982bfd4 100644 --- a/llvm/test/CodeGen/X86/select.ll +++ b/llvm/test/CodeGen/X86/select.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mcpu=generic | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC +; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC ; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM ; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=athlon | FileCheck %s --check-prefix=ATHLON ; RUN: llc < %s -mtriple=i386-intel-elfiamcu | FileCheck %s --check-prefix=MCU diff --git 
a/llvm/test/CodeGen/X86/slow-unaligned-mem.ll b/llvm/test/CodeGen/X86/slow-unaligned-mem.ll index 295fdfb5a2617..f2c7c2fa4a564 100644 --- a/llvm/test/CodeGen/X86/slow-unaligned-mem.ll +++ b/llvm/test/CodeGen/X86/slow-unaligned-mem.ll @@ -3,6 +3,8 @@ ; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium3 2>&1 | FileCheck %s --check-prefix=SLOW ; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium3m 2>&1 | FileCheck %s --check-prefix=SLOW ; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium-m 2>&1 | FileCheck %s --check-prefix=SLOW +; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium4 2>&1 | FileCheck %s --check-prefix=SLOW +; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium4m 2>&1 | FileCheck %s --check-prefix=SLOW ; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=yonah 2>&1 | FileCheck %s --check-prefix=SLOW ; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=prescott 2>&1 | FileCheck %s --check-prefix=SLOW ; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=nocona 2>&1 | FileCheck %s --check-prefix=SLOW @@ -12,10 +14,6 @@ ; Intel chips with fast unaligned memory accesses -; Marked fast because this is the default 32-bit mode CPU in clang. 
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium4 2>&1 | FileCheck %s --check-prefix=FAST -; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium4m 2>&1 | FileCheck %s --check-prefix=FAST - ; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=silvermont 2>&1 | FileCheck %s --check-prefix=FAST ; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=nehalem 2>&1 | FileCheck %s --check-prefix=FAST ; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=westmere 2>&1 | FileCheck %s --check-prefix=FAST diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll index a0a65e5f24234..49c0b8b30fbb4 100644 --- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll +++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll @@ -903,14 +903,12 @@ define <4 x float> @div_sqrt_v4f32(<4 x float> %x, <4 x float> %y) { define double @sqrt_fdiv_common_operand(double %x) nounwind { ; SSE-LABEL: sqrt_fdiv_common_operand: ; SSE: # %bb.0: -; SSE-NEXT: sqrtsd %xmm0, %xmm1 -; SSE-NEXT: divsd %xmm1, %xmm0 +; SSE-NEXT: sqrtsd %xmm0, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: sqrt_fdiv_common_operand: ; AVX: # %bb.0: -; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm1 -; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %sqrt = call fast double @llvm.sqrt.f64(double %x) %r = fdiv fast double %x, %sqrt @@ -920,33 +918,29 @@ define double @sqrt_fdiv_common_operand(double %x) nounwind { define <2 x double> @sqrt_fdiv_common_operand_vec(<2 x double> %x) nounwind { ; SSE-LABEL: sqrt_fdiv_common_operand_vec: ; SSE: # %bb.0: -; SSE-NEXT: sqrtpd %xmm0, %xmm1 -; SSE-NEXT: divpd %xmm1, %xmm0 +; SSE-NEXT: sqrtpd %xmm0, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: sqrt_fdiv_common_operand_vec: ; AVX: # %bb.0: -; AVX-NEXT: vsqrtpd %xmm0, %xmm1 -; AVX-NEXT: vdivpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vsqrtpd %xmm0, %xmm0 ; AVX-NEXT: retq %sqrt = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %x) - %r = fdiv arcp reassoc <2 x double> %x, %sqrt + %r = fdiv arcp nsz reassoc <2 
x double> %x, %sqrt ret <2 x double> %r } define double @sqrt_fdiv_common_operand_extra_use(double %x, double* %p) nounwind { ; SSE-LABEL: sqrt_fdiv_common_operand_extra_use: ; SSE: # %bb.0: -; SSE-NEXT: sqrtsd %xmm0, %xmm1 -; SSE-NEXT: movsd %xmm1, (%rdi) -; SSE-NEXT: divsd %xmm1, %xmm0 +; SSE-NEXT: sqrtsd %xmm0, %xmm0 +; SSE-NEXT: movsd %xmm0, (%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: sqrt_fdiv_common_operand_extra_use: ; AVX: # %bb.0: -; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm1 -; AVX-NEXT: vmovsd %xmm1, (%rdi) -; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovsd %xmm0, (%rdi) ; AVX-NEXT: retq %sqrt = call fast double @llvm.sqrt.f64(double %x) store double %sqrt, double* %p diff --git a/llvm/test/CodeGen/X86/statepoint-vector.ll b/llvm/test/CodeGen/X86/statepoint-vector.ll index a7d7be8ed0699..367eea88c185e 100644 --- a/llvm/test/CodeGen/X86/statepoint-vector.ll +++ b/llvm/test/CodeGen/X86/statepoint-vector.ll @@ -57,16 +57,15 @@ entry: define <2 x i64 addrspace(1)*> @test3(i1 %cnd, <2 x i64 addrspace(1)*>* %ptr) gc "statepoint-example" { ; CHECK-LABEL: test3: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: subq $40, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: testb $1, %dil ; CHECK-NEXT: movaps (%rsi), %xmm0 ; CHECK-NEXT: movaps %xmm0, (%rsp) -; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: callq do_safepoint ; CHECK-NEXT: .Ltmp2: ; CHECK-NEXT: movaps (%rsp), %xmm0 -; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq entry: @@ -115,7 +114,7 @@ entry: ; Check that we can lower a constant typed as i128 correctly. We don't have ; a representation of larger than 64 bit constant in the StackMap format. At ; the moment, this simply means spilling them, but there's a potential -; optimization for values representable as sext(Con64). 
+; optimization for values representable as sext(Con64). define void @test5() gc "statepoint-example" { ; CHECK-LABEL: test5: ; CHECK: # %bb.0: # %entry @@ -172,31 +171,17 @@ entry: ; CHECK: .long 0 ; CHECK: .Ltmp2-test3 -; Check for the four spill slots -; Stack Maps: Loc 3: Indirect 7+16 [encoding: .byte 3, .byte 0, .short 16, .short 7, .short 0, .int 16] -; Stack Maps: Loc 4: Indirect 7+16 [encoding: .byte 3, .byte 0, .short 16, .short 7, .short 0, .int 16] -; Stack Maps: Loc 5: Indirect 7+16 [encoding: .byte 3, .byte 0, .short 16, .short 7, .short 0, .int 16] -; Stack Maps: Loc 6: Indirect 7+0 [encoding: .byte 3, .byte 0, .short 16, .short 7, .short 0, .int 0] +; Check for the two spill slots +; Stack Maps: Loc 3: Indirect 7+0 [encoding: .byte 3, .byte 0, .short 16, .short 7, .short 0, .int 0] +; Stack Maps: Loc 4: Indirect 7+0 [encoding: .byte 3, .byte 0, .short 16, .short 7, .short 0, .int 0] ; CHECK: .byte 3 ; CHECK: .byte 0 ; CHECK: .short 16 ; CHECK: .short 7 ; CHECK: .short 0 -; CHECK: .long 16 -; CHECK: .byte 3 -; CHECK: .byte 0 -; CHECK: .short 16 -; CHECK: .short 7 -; CHECK: .short 0 -; CHECK: .long 16 -; CHECK: .byte 3 -; CHECK: .byte 0 -; CHECK: .short 16 -; CHECK: .short 7 -; CHECK: .short 0 -; CHECK: .long 16 +; CHECK: .long 0 ; CHECK: .byte 3 -; CHECK: .byte 0 +; CHECK: .byte 0 ; CHECK: .short 16 ; CHECK: .short 7 ; CHECK: .short 0 diff --git a/llvm/test/CodeGen/X86/stores-merging.ll b/llvm/test/CodeGen/X86/stores-merging.ll index 4467fec9f2b45..85a086503410e 100644 --- a/llvm/test/CodeGen/X86/stores-merging.ll +++ b/llvm/test/CodeGen/X86/stores-merging.ll @@ -468,9 +468,7 @@ define void @trunc_i32_to_i8(i32 %x, i8* %p) { define void @trunc_i32_to_i16(i32 %x, i16* %p) { ; CHECK-LABEL: trunc_i32_to_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: movw %di, (%rsi) -; CHECK-NEXT: shrl $16, %edi -; CHECK-NEXT: movw %di, 2(%rsi) +; CHECK-NEXT: movl %edi, (%rsi) ; CHECK-NEXT: retq %t1 = trunc i32 %x to i16 %sh = lshr i32 %x, 16 @@ -522,15 +520,7 @@ define void 
@trunc_i64_to_i8(i64 %x, i8* %p) { define void @trunc_i64_to_i16(i64 %x, i16* %p) { ; CHECK-LABEL: trunc_i64_to_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: movq %rdi, %rcx -; CHECK-NEXT: movw %di, (%rsi) -; CHECK-NEXT: shrq $16, %rdi -; CHECK-NEXT: shrq $32, %rax -; CHECK-NEXT: shrq $48, %rcx -; CHECK-NEXT: movw %di, 2(%rsi) -; CHECK-NEXT: movw %ax, 4(%rsi) -; CHECK-NEXT: movw %cx, 6(%rsi) +; CHECK-NEXT: movq %rdi, (%rsi) ; CHECK-NEXT: retq %t1 = trunc i64 %x to i16 %sh1 = lshr i64 %x, 16 @@ -552,9 +542,7 @@ define void @trunc_i64_to_i16(i64 %x, i16* %p) { define void @trunc_i64_to_i32(i64 %x, i32* %p) { ; CHECK-LABEL: trunc_i64_to_i32: ; CHECK: # %bb.0: -; CHECK-NEXT: movl %edi, (%rsi) -; CHECK-NEXT: shrq $32, %rdi -; CHECK-NEXT: movl %edi, 4(%rsi) +; CHECK-NEXT: movq %rdi, (%rsi) ; CHECK-NEXT: retq %t1 = trunc i64 %x to i32 %sh = lshr i64 %x, 32 diff --git a/llvm/test/CodeGen/X86/strict-fadd-combines.ll b/llvm/test/CodeGen/X86/strict-fadd-combines.ll new file mode 100644 index 0000000000000..8560e1bb5bf3a --- /dev/null +++ b/llvm/test/CodeGen/X86/strict-fadd-combines.ll @@ -0,0 +1,37 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s + +define float @fneg_strict_fadd_to_strict_fsub(float %x, float %y) { + ; CHECK: subss %{{.*}}, %{{.*}} + ; CHECK-NEXT: retq + %neg = fneg float %y + %add = call float @llvm.experimental.constrained.fadd.f32(float %x, float %neg, metadata!"round.dynamic", metadata!"fpexcept.strict") + ret float %add +} + +define float @fneg_strict_fadd_to_strict_fsub_2(float %x, float %y) { + ; CHECK: subss %{{.*}}, %{{.*}} + ; CHECK-NEXT: retq + %neg = fneg float %y + %add = call float @llvm.experimental.constrained.fadd.f32(float %neg, float %x, metadata!"round.dynamic", metadata!"fpexcept.strict") + ret float %add +} + +define double @fneg_strict_fadd_to_strict_fsub_d(double %x, double %y) { + ; CHECK: subsd %{{.*}}, %{{.*}} + ; CHECK-NEXT: retq + %neg = fneg double %y + %add = call double 
@llvm.experimental.constrained.fadd.f64(double %x, double %neg, metadata!"round.dynamic", metadata!"fpexcept.strict") + ret double %add +} + +define double @fneg_strict_fadd_to_strict_fsub_2d(double %x, double %y) { + ; CHECK: subsd %{{.*}}, %{{.*}} + ; CHECK-NEXT: retq + %neg = fneg double %y + %add = call double @llvm.experimental.constrained.fadd.f64(double %neg, double %x, metadata!"round.dynamic", metadata!"fpexcept.strict") + ret double %add +} + + +declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata) +declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata) diff --git a/llvm/test/CodeGen/X86/vec_call.ll b/llvm/test/CodeGen/X86/vec_call.ll index e0f95ec314865..cc620d3e5f5fb 100644 --- a/llvm/test/CodeGen/X86/vec_call.ll +++ b/llvm/test/CodeGen/X86/vec_call.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mcpu=generic -mattr=+sse2 -mtriple=i686-apple-darwin8 | FileCheck %s +; RUN: llc < %s -mattr=+sse2 -mtriple=i686-apple-darwin8 | FileCheck %s define void @test() { diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll index e1f780da4fce6..6bee501e06a40 100644 --- a/llvm/test/CodeGen/X86/vec_saddo.ll +++ b/llvm/test/CodeGen/X86/vec_saddo.ll @@ -1372,48 +1372,48 @@ define <2 x i32> @saddo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) ; ; AVX512-LABEL: saddo_v2i128: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r14 ; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: testq %r9, %r9 -; AVX512-NEXT: setns %al -; AVX512-NEXT: testq %rsi, %rsi -; AVX512-NEXT: setns %bl -; AVX512-NEXT: cmpb %al, %bl -; AVX512-NEXT: sete %bpl -; AVX512-NEXT: addq %r8, %rdi -; AVX512-NEXT: adcq %r9, %rsi -; AVX512-NEXT: setns %al -; AVX512-NEXT: cmpb %al, %bl -; AVX512-NEXT: setne %al -; AVX512-NEXT: andb %bpl, %al 
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512-NEXT: addq {{[0-9]+}}(%rsp), %rdx -; AVX512-NEXT: movq %rcx, %rbp -; AVX512-NEXT: adcq %r10, %rbp +; AVX512-NEXT: movq %rcx, %r14 +; AVX512-NEXT: adcq %r11, %r14 ; AVX512-NEXT: setns %bl ; AVX512-NEXT: testq %rcx, %rcx ; AVX512-NEXT: setns %cl ; AVX512-NEXT: cmpb %bl, %cl -; AVX512-NEXT: setne %r8b -; AVX512-NEXT: testq %r10, %r10 +; AVX512-NEXT: setne %bl +; AVX512-NEXT: testq %r11, %r11 +; AVX512-NEXT: setns %al +; AVX512-NEXT: cmpb %al, %cl +; AVX512-NEXT: sete %al +; AVX512-NEXT: andb %bl, %al +; AVX512-NEXT: kmovd %eax, %k0 +; AVX512-NEXT: testq %r9, %r9 +; AVX512-NEXT: setns %al +; AVX512-NEXT: testq %rsi, %rsi +; AVX512-NEXT: setns %cl +; AVX512-NEXT: cmpb %al, %cl +; AVX512-NEXT: sete %al +; AVX512-NEXT: addq %r8, %rdi +; AVX512-NEXT: adcq %r9, %rsi ; AVX512-NEXT: setns %bl ; AVX512-NEXT: cmpb %bl, %cl -; AVX512-NEXT: sete %cl -; AVX512-NEXT: andb %r8b, %cl -; AVX512-NEXT: kmovd %ecx, %k0 +; AVX512-NEXT: setne %cl +; AVX512-NEXT: andb %al, %cl +; AVX512-NEXT: andl $1, %ecx +; AVX512-NEXT: kmovw %ecx, %k1 ; AVX512-NEXT: kshiftlw $1, %k0, %k0 -; AVX512-NEXT: andl $1, %eax -; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: korw %k0, %k1, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512-NEXT: movq %rdx, 16(%r11) -; AVX512-NEXT: movq %rdi, (%r11) -; AVX512-NEXT: movq %rbp, 24(%r11) -; AVX512-NEXT: movq %rsi, 8(%r11) +; AVX512-NEXT: movq %rdx, 16(%r10) +; AVX512-NEXT: movq %rdi, (%r10) +; AVX512-NEXT: movq %r14, 24(%r10) +; AVX512-NEXT: movq %rsi, 8(%r10) ; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %rbp +; AVX512-NEXT: popq %r14 ; AVX512-NEXT: retq %t = call {<2 x i128>, <2 x i1>} @llvm.sadd.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1) %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0 diff --git a/llvm/test/CodeGen/X86/vec_setcc-2.ll b/llvm/test/CodeGen/X86/vec_setcc-2.ll index 32d5c0b78a1c2..fb377a251bc91 100644 --- 
a/llvm/test/CodeGen/X86/vec_setcc-2.ll +++ b/llvm/test/CodeGen/X86/vec_setcc-2.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -o - -mcpu=generic -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE2 -; RUN: llc < %s -o - -mcpu=generic -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE41 +; RUN: llc < %s -o - -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE2 +; RUN: llc < %s -o - -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE41 ; For a setult against a constant, turn it into a setule and lower via psubusw. diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll index ad0a8f8ff12a0..1b5aef61ebf3b 100644 --- a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -3942,39 +3942,39 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) ; AVX512-NEXT: pushq %rbx ; AVX512-NEXT: subq $24, %rsp ; AVX512-NEXT: movq %r8, %rax -; AVX512-NEXT: movq %rcx, %r15 +; AVX512-NEXT: movq %rcx, %r14 ; AVX512-NEXT: movq %rdx, %rbx +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r15 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r13 ; AVX512-NEXT: movq $0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %r8 ; AVX512-NEXT: movq %rax, %rdx ; AVX512-NEXT: movq %r9, %rcx ; AVX512-NEXT: callq __muloti4 -; AVX512-NEXT: movq %rax, %r14 +; AVX512-NEXT: movq %rax, %r13 ; AVX512-NEXT: movq %rdx, %rbp ; AVX512-NEXT: movq $0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %r8 ; AVX512-NEXT: movq %rbx, %rdi -; AVX512-NEXT: movq %r15, %rsi +; AVX512-NEXT: movq %r14, %rsi ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX512-NEXT: movq %r13, %rcx +; AVX512-NEXT: movq %r12, %rcx ; AVX512-NEXT: callq __muloti4 ; AVX512-NEXT: cmpq $0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: setne %cl ; 
AVX512-NEXT: kmovd %ecx, %k0 ; AVX512-NEXT: cmpq $0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: setne %cl -; AVX512-NEXT: kshiftlw $1, %k0, %k0 ; AVX512-NEXT: andl $1, %ecx ; AVX512-NEXT: kmovw %ecx, %k1 +; AVX512-NEXT: kshiftlw $1, %k0, %k0 ; AVX512-NEXT: korw %k0, %k1, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512-NEXT: movq %rdx, 24(%r12) -; AVX512-NEXT: movq %rax, 16(%r12) -; AVX512-NEXT: movq %rbp, 8(%r12) -; AVX512-NEXT: movq %r14, (%r12) +; AVX512-NEXT: movq %rdx, 24(%r15) +; AVX512-NEXT: movq %rax, 16(%r15) +; AVX512-NEXT: movq %rbp, 8(%r15) +; AVX512-NEXT: movq %r13, (%r15) ; AVX512-NEXT: addq $24, %rsp ; AVX512-NEXT: popq %rbx ; AVX512-NEXT: popq %r12 diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll index 4e2c3a57831f5..9981643ba2d4c 100644 --- a/llvm/test/CodeGen/X86/vec_ssubo.ll +++ b/llvm/test/CodeGen/X86/vec_ssubo.ll @@ -1381,48 +1381,48 @@ define <2 x i32> @ssubo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) ; ; AVX512-LABEL: ssubo_v2i128: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r14 ; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: testq %r9, %r9 -; AVX512-NEXT: setns %al -; AVX512-NEXT: testq %rsi, %rsi -; AVX512-NEXT: setns %bl -; AVX512-NEXT: cmpb %al, %bl -; AVX512-NEXT: setne %bpl -; AVX512-NEXT: subq %r8, %rdi -; AVX512-NEXT: sbbq %r9, %rsi -; AVX512-NEXT: setns %al -; AVX512-NEXT: cmpb %al, %bl -; AVX512-NEXT: setne %al -; AVX512-NEXT: andb %bpl, %al +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512-NEXT: subq {{[0-9]+}}(%rsp), %rdx -; AVX512-NEXT: movq %rcx, %rbp -; AVX512-NEXT: sbbq %r10, %rbp +; AVX512-NEXT: movq %rcx, %r14 +; AVX512-NEXT: sbbq %r11, %r14 ; AVX512-NEXT: setns %bl ; AVX512-NEXT: testq %rcx, %rcx ; AVX512-NEXT: setns %cl ; AVX512-NEXT: cmpb %bl, %cl -; AVX512-NEXT: setne %r8b -; AVX512-NEXT: testq %r10, %r10 +; AVX512-NEXT: 
setne %bl +; AVX512-NEXT: testq %r11, %r11 +; AVX512-NEXT: setns %al +; AVX512-NEXT: cmpb %al, %cl +; AVX512-NEXT: setne %al +; AVX512-NEXT: andb %bl, %al +; AVX512-NEXT: kmovd %eax, %k0 +; AVX512-NEXT: testq %r9, %r9 +; AVX512-NEXT: setns %al +; AVX512-NEXT: testq %rsi, %rsi +; AVX512-NEXT: setns %cl +; AVX512-NEXT: cmpb %al, %cl +; AVX512-NEXT: setne %al +; AVX512-NEXT: subq %r8, %rdi +; AVX512-NEXT: sbbq %r9, %rsi ; AVX512-NEXT: setns %bl ; AVX512-NEXT: cmpb %bl, %cl ; AVX512-NEXT: setne %cl -; AVX512-NEXT: andb %r8b, %cl -; AVX512-NEXT: kmovd %ecx, %k0 +; AVX512-NEXT: andb %al, %cl +; AVX512-NEXT: andl $1, %ecx +; AVX512-NEXT: kmovw %ecx, %k1 ; AVX512-NEXT: kshiftlw $1, %k0, %k0 -; AVX512-NEXT: andl $1, %eax -; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: korw %k0, %k1, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512-NEXT: movq %rdx, 16(%r11) -; AVX512-NEXT: movq %rdi, (%r11) -; AVX512-NEXT: movq %rbp, 24(%r11) -; AVX512-NEXT: movq %rsi, 8(%r11) +; AVX512-NEXT: movq %rdx, 16(%r10) +; AVX512-NEXT: movq %rdi, (%r10) +; AVX512-NEXT: movq %r14, 24(%r10) +; AVX512-NEXT: movq %rsi, 8(%r10) ; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %rbp +; AVX512-NEXT: popq %r14 ; AVX512-NEXT: retq %t = call {<2 x i128>, <2 x i1>} @llvm.ssub.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1) %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0 diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll index 9a153253a1695..c34653be4a02c 100644 --- a/llvm/test/CodeGen/X86/vec_uaddo.ll +++ b/llvm/test/CodeGen/X86/vec_uaddo.ll @@ -1282,16 +1282,16 @@ define <2 x i32> @uaddo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) ; AVX512-LABEL: uaddo_v2i128: ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: addq %r8, %rdi -; AVX512-NEXT: adcq %r9, %rsi -; AVX512-NEXT: setb %r8b ; AVX512-NEXT: addq {{[0-9]+}}(%rsp), %rdx ; AVX512-NEXT: adcq {{[0-9]+}}(%rsp), %rcx ; AVX512-NEXT: 
setb %al ; AVX512-NEXT: kmovd %eax, %k0 +; AVX512-NEXT: addq %r8, %rdi +; AVX512-NEXT: adcq %r9, %rsi +; AVX512-NEXT: setb %al +; AVX512-NEXT: andl $1, %eax +; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: kshiftlw $1, %k0, %k0 -; AVX512-NEXT: andl $1, %r8d -; AVX512-NEXT: kmovw %r8d, %k1 ; AVX512-NEXT: korw %k0, %k1, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll index 54bb86dc5a0f4..cc25fd5bec783 100644 --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -3689,68 +3689,66 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) ; AVX512-NEXT: pushq %r13 ; AVX512-NEXT: pushq %r12 ; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: movq %r9, %r10 -; AVX512-NEXT: movq %rcx, %r9 -; AVX512-NEXT: movq %rdx, %r11 -; AVX512-NEXT: movq %rsi, %rax -; AVX512-NEXT: movq %rdi, %rsi +; AVX512-NEXT: movq %rcx, %rax +; AVX512-NEXT: movq %rdx, %r12 +; AVX512-NEXT: movq %rdi, %r11 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: testq %r10, %r10 ; AVX512-NEXT: setne %dl -; AVX512-NEXT: testq %rax, %rax -; AVX512-NEXT: setne %bl -; AVX512-NEXT: andb %dl, %bl -; AVX512-NEXT: mulq %r8 -; AVX512-NEXT: movq %rax, %r13 +; AVX512-NEXT: testq %rcx, %rcx +; AVX512-NEXT: setne %r13b +; AVX512-NEXT: andb %dl, %r13b +; AVX512-NEXT: mulq %r15 +; AVX512-NEXT: movq %rax, %rdi ; AVX512-NEXT: seto %bpl ; AVX512-NEXT: movq %r10, %rax -; AVX512-NEXT: mulq %rdi -; AVX512-NEXT: movq %rax, %rdi +; AVX512-NEXT: mulq %r12 +; AVX512-NEXT: movq %rax, %rbx ; AVX512-NEXT: seto %cl ; AVX512-NEXT: orb %bpl, %cl -; AVX512-NEXT: addq %r13, %rdi +; AVX512-NEXT: addq %rdi, %rbx +; AVX512-NEXT: movq %r12, %rax +; AVX512-NEXT: mulq %r15 +; AVX512-NEXT: movq %rax, %r10 +; AVX512-NEXT: movq %rdx, %r15 
+; AVX512-NEXT: addq %rbx, %r15 +; AVX512-NEXT: setb %al +; AVX512-NEXT: orb %cl, %al +; AVX512-NEXT: orb %r13b, %al +; AVX512-NEXT: kmovd %eax, %k0 +; AVX512-NEXT: testq %r9, %r9 +; AVX512-NEXT: setne %al +; AVX512-NEXT: testq %rsi, %rsi +; AVX512-NEXT: setne %cl +; AVX512-NEXT: andb %al, %cl ; AVX512-NEXT: movq %rsi, %rax ; AVX512-NEXT: mulq %r8 -; AVX512-NEXT: movq %rax, %r8 -; AVX512-NEXT: movq %rdx, %r10 -; AVX512-NEXT: addq %rdi, %r10 -; AVX512-NEXT: setb %sil -; AVX512-NEXT: orb %cl, %sil -; AVX512-NEXT: orb %bl, %sil -; AVX512-NEXT: testq %r12, %r12 -; AVX512-NEXT: setne %al -; AVX512-NEXT: testq %r9, %r9 -; AVX512-NEXT: setne %bpl -; AVX512-NEXT: andb %al, %bpl +; AVX512-NEXT: movq %rax, %rsi +; AVX512-NEXT: seto %bpl ; AVX512-NEXT: movq %r9, %rax -; AVX512-NEXT: mulq %r15 -; AVX512-NEXT: movq %rax, %rdi -; AVX512-NEXT: seto %r9b -; AVX512-NEXT: movq %r12, %rax ; AVX512-NEXT: mulq %r11 -; AVX512-NEXT: movq %rax, %rbx -; AVX512-NEXT: seto %cl -; AVX512-NEXT: orb %r9b, %cl -; AVX512-NEXT: addq %rdi, %rbx +; AVX512-NEXT: movq %rax, %rdi +; AVX512-NEXT: seto %bl +; AVX512-NEXT: orb %bpl, %bl +; AVX512-NEXT: addq %rsi, %rdi ; AVX512-NEXT: movq %r11, %rax -; AVX512-NEXT: mulq %r15 -; AVX512-NEXT: addq %rbx, %rdx -; AVX512-NEXT: setb %dil -; AVX512-NEXT: orb %cl, %dil -; AVX512-NEXT: orb %bpl, %dil -; AVX512-NEXT: kmovd %edi, %k0 -; AVX512-NEXT: kshiftlw $1, %k0, %k0 +; AVX512-NEXT: mulq %r8 +; AVX512-NEXT: addq %rdi, %rdx +; AVX512-NEXT: setb %sil +; AVX512-NEXT: orb %bl, %sil +; AVX512-NEXT: orb %cl, %sil ; AVX512-NEXT: andl $1, %esi ; AVX512-NEXT: kmovw %esi, %k1 +; AVX512-NEXT: kshiftlw $1, %k0, %k0 ; AVX512-NEXT: korw %k0, %k1, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512-NEXT: movq %rax, 16(%r14) -; AVX512-NEXT: movq %r8, (%r14) -; AVX512-NEXT: movq %rdx, 24(%r14) -; AVX512-NEXT: movq %r10, 8(%r14) +; AVX512-NEXT: movq %r10, 16(%r14) +; AVX512-NEXT: movq %rax, (%r14) +; AVX512-NEXT: movq %r15, 
24(%r14) +; AVX512-NEXT: movq %rdx, 8(%r14) ; AVX512-NEXT: popq %rbx ; AVX512-NEXT: popq %r12 ; AVX512-NEXT: popq %r13 diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll index 0381394e74134..76c3e5ad32909 100644 --- a/llvm/test/CodeGen/X86/vec_usubo.ll +++ b/llvm/test/CodeGen/X86/vec_usubo.ll @@ -1329,16 +1329,16 @@ define <2 x i32> @usubo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) ; AVX512-LABEL: usubo_v2i128: ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: subq %r8, %rdi -; AVX512-NEXT: sbbq %r9, %rsi -; AVX512-NEXT: setb %r8b ; AVX512-NEXT: subq {{[0-9]+}}(%rsp), %rdx ; AVX512-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx ; AVX512-NEXT: setb %al ; AVX512-NEXT: kmovd %eax, %k0 +; AVX512-NEXT: subq %r8, %rdi +; AVX512-NEXT: sbbq %r9, %rsi +; AVX512-NEXT: setb %al +; AVX512-NEXT: andl $1, %eax +; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: kshiftlw $1, %k0, %k0 -; AVX512-NEXT: andl $1, %r8d -; AVX512-NEXT: kmovw %r8d, %k1 ; AVX512-NEXT: korw %k0, %k1, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll new file mode 100644 index 0000000000000..d21fd8f531048 --- /dev/null +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll @@ -0,0 +1,476 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F +; RUN: llc < 
%s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2 + +; Just one 32-bit run to make sure we do reasonable things for i64 cases. +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X32-SSE,X32-SSE2 + +declare <2 x i32> @llvm.fshl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) + +; +; Variable Shifts +; + +define <2 x i32> @var_funnnel_v2i32(<2 x i32> %x, <2 x i32> %amt) nounwind { +; SSE2-LABEL: var_funnnel_v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pslld $23, %xmm1 +; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1 +; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_funnnel_v2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE41-NEXT: pslld $23, %xmm1 +; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1 +; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 +; SSE41-NEXT: 
pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; SSE41-NEXT: pmuludq %xmm2, %xmm3 +; SSE41-NEXT: pmuludq %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: var_funnnel_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_funnnel_v2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32] +; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1 +; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: var_funnnel_v2i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: var_funnnel_v2i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vprolvd %xmm1, %xmm0, %xmm0 +; 
AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: var_funnnel_v2i32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: var_funnnel_v2i32: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vprolvd %xmm1, %xmm0, %xmm0 +; AVX512VLBW-NEXT: retq +; +; XOP-LABEL: var_funnnel_v2i32: +; XOP: # %bb.0: +; XOP-NEXT: vprotd %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; X32-SSE-LABEL: var_funnnel_v2i32: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 +; X32-SSE-NEXT: pslld $23, %xmm1 +; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm1 +; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X32-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X32-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; X32-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE-NEXT: por %xmm3, %xmm0 +; X32-SSE-NEXT: retl + %res = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %x, <2 x i32> %amt) + ret <2 x i32> %res +} + +; +; Uniform Variable Shifts +; + +define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %amt) nounwind { +; SSE2-LABEL: splatvar_funnnel_v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pslld $23, %xmm1 +; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1 +; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq 
%xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_funnnel_v2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE41-NEXT: pslld $23, %xmm1 +; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1 +; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; SSE41-NEXT: pmuludq %xmm2, %xmm3 +; SSE41-NEXT: pmuludq %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: splatvar_funnnel_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; 
AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_funnnel_v2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero +; AVX2-NEXT: vpslld %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32] +; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX2-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: splatvar_funnnel_v2i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: splatvar_funnnel_v2i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX512VL-NEXT: vprolvd %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: splatvar_funnnel_v2i32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: splatvar_funnnel_v2i32: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX512VLBW-NEXT: vprolvd %xmm1, %xmm0, %xmm0 +; AVX512VLBW-NEXT: retq +; +; XOPAVX1-LABEL: splatvar_funnnel_v2i32: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatvar_funnnel_v2i32: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1 +; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0 +; 
XOPAVX2-NEXT: retq +; +; X32-SSE-LABEL: splatvar_funnnel_v2i32: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 +; X32-SSE-NEXT: pslld $23, %xmm1 +; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm1 +; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X32-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X32-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; X32-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE-NEXT: por %xmm3, %xmm0 +; X32-SSE-NEXT: retl + %splat = shufflevector <2 x i32> %amt, <2 x i32> undef, <2 x i32> zeroinitializer + %res = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %x, <2 x i32> %splat) + ret <2 x i32> %res +} + +; +; Constant Shifts +; + +define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { +; SSE2-LABEL: constant_funnnel_v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [16,32,1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: constant_funnnel_v2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [16,32,1,1] +; SSE41-NEXT: 
pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pmuludq %xmm2, %xmm3 +; SSE41-NEXT: pmuludq %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: constant_funnnel_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [16,32,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_funnnel_v2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: constant_funnnel_v2i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,u,u> +; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: constant_funnnel_v2i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vprolvd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: constant_funnnel_v2i32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,u,u> +; AVX512BW-NEXT: vprolvd %zmm1, 
%zmm0, %zmm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: constant_funnnel_v2i32: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vprolvd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VLBW-NEXT: retq +; +; XOP-LABEL: constant_funnnel_v2i32: +; XOP: # %bb.0: +; XOP-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq +; +; X32-SSE-LABEL: constant_funnnel_v2i32: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,32,1,1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X32-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X32-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; X32-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE-NEXT: por %xmm3, %xmm0 +; X32-SSE-NEXT: retl + %res = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %x, <2 x i32> ) + ret <2 x i32> %res +} + +; +; Uniform Constant Shifts +; + +define <2 x i32> @splatconstant_funnnel_v2i32(<2 x i32> %x) nounwind { +; SSE2-LABEL: splatconstant_funnnel_v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psrld $28, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pslld $4, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatconstant_funnnel_v2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psrld $28, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pslld $4, %xmm1 +; SSE41-NEXT: por %xmm2, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: movdqa %xmm1, 
%xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: splatconstant_funnnel_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrld $28, %xmm0, %xmm1 +; AVX1-NEXT: vpslld $4, %xmm0, %xmm2 +; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatconstant_funnnel_v2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: splatconstant_funnnel_v2i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vprold $4, %zmm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_funnnel_v2i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vprold $4, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: splatconstant_funnnel_v2i32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: splatconstant_funnnel_v2i32: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vprold $4, %xmm0, %xmm0 +; AVX512VLBW-NEXT: retq +; +; XOP-LABEL: splatconstant_funnnel_v2i32: +; XOP: # %bb.0: +; XOP-NEXT: vprotd $4, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; X32-SSE-LABEL: splatconstant_funnnel_v2i32: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: psrld $28, %xmm2 +; X32-SSE-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE-NEXT: pslld $4, %xmm1 +; X32-SSE-NEXT: por %xmm2, %xmm1 +; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; X32-SSE-NEXT: movaps %xmm1, %xmm0 +; X32-SSE-NEXT: retl + %res = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %x, <2 x i32> ) + ret <2 x i32> %res +} diff --git 
a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll new file mode 100644 index 0000000000000..ebb95d6e0410f --- /dev/null +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll @@ -0,0 +1,502 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2 + +; Just one 32-bit run to make sure we do reasonable things for i64 cases. 
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X32-SSE,X32-SSE2 + +declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) + +; +; Variable Shifts +; + +define <2 x i32> @var_funnnel_v2i32(<2 x i32> %x, <2 x i32> %amt) nounwind { +; SSE2-LABEL: var_funnnel_v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: psubd %xmm1, %xmm2 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE2-NEXT: pslld $23, %xmm2 +; SSE2-NEXT: paddd {{.*}}(%rip), %xmm2 +; SSE2-NEXT: cvttps2dq %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_funnnel_v2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: psubd %xmm1, %xmm2 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE41-NEXT: pslld $23, %xmm2 +; SSE41-NEXT: paddd {{.*}}(%rip), %xmm2 +; SSE41-NEXT: cvttps2dq %xmm2, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pmuludq %xmm2, %xmm3 +; SSE41-NEXT: pmuludq %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: var_funnnel_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: 
vpsubd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_funnnel_v2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsubd %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32] +; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1 +; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: var_funnnel_v2i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vprorvd %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: var_funnnel_v2i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vprorvd %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: var_funnnel_v2i32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vprorvd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; 
AVX512VLBW-LABEL: var_funnnel_v2i32: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vprorvd %xmm1, %xmm0, %xmm0 +; AVX512VLBW-NEXT: retq +; +; XOP-LABEL: var_funnnel_v2i32: +; XOP: # %bb.0: +; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOP-NEXT: vpsubd %xmm1, %xmm2, %xmm1 +; XOP-NEXT: vprotd %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; X32-SSE-LABEL: var_funnnel_v2i32: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pxor %xmm2, %xmm2 +; X32-SSE-NEXT: psubd %xmm1, %xmm2 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2 +; X32-SSE-NEXT: pslld $23, %xmm2 +; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm2 +; X32-SSE-NEXT: cvttps2dq %xmm2, %xmm1 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X32-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X32-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; X32-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE-NEXT: por %xmm3, %xmm0 +; X32-SSE-NEXT: retl + %res = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %x, <2 x i32> %amt) + ret <2 x i32> %res +} + +; +; Uniform Variable Shifts +; + +define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %amt) nounwind { +; SSE2-LABEL: splatvar_funnnel_v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: psubd %xmm1, %xmm2 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE2-NEXT: pslld $23, %xmm2 +; SSE2-NEXT: paddd {{.*}}(%rip), %xmm2 +; SSE2-NEXT: cvttps2dq %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm1 +; 
SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_funnnel_v2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: psubd %xmm1, %xmm2 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE41-NEXT: pslld $23, %xmm2 +; SSE41-NEXT: paddd {{.*}}(%rip), %xmm2 +; SSE41-NEXT: cvttps2dq %xmm2, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pmuludq %xmm2, %xmm3 +; SSE41-NEXT: pmuludq %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: splatvar_funnnel_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: 
vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_funnnel_v2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsubd %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero +; AVX2-NEXT: vpslld %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32] +; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX2-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: splatvar_funnnel_v2i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX512F-NEXT: vprorvd %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: splatvar_funnnel_v2i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX512VL-NEXT: vprorvd %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: splatvar_funnnel_v2i32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX512BW-NEXT: vprorvd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: splatvar_funnnel_v2i32: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX512VLBW-NEXT: vprorvd %xmm1, %xmm0, %xmm0 +; AVX512VLBW-NEXT: retq +; +; XOPAVX1-LABEL: splatvar_funnnel_v2i32: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; 
XOPAVX2-LABEL: splatvar_funnnel_v2i32: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1 +; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpsubd %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq +; +; X32-SSE-LABEL: splatvar_funnnel_v2i32: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; X32-SSE-NEXT: pxor %xmm2, %xmm2 +; X32-SSE-NEXT: psubd %xmm1, %xmm2 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2 +; X32-SSE-NEXT: pslld $23, %xmm2 +; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm2 +; X32-SSE-NEXT: cvttps2dq %xmm2, %xmm1 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X32-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X32-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; X32-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE-NEXT: por %xmm3, %xmm0 +; X32-SSE-NEXT: retl + %splat = shufflevector <2 x i32> %amt, <2 x i32> undef, <2 x i32> zeroinitializer + %res = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %x, <2 x i32> %splat) + ret <2 x i32> %res +} + +; +; Constant Shifts +; + +define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { +; SSE2-LABEL: constant_funnnel_v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [268435456,134217728,1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: pshufd {{.*#+}} 
xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: constant_funnnel_v2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [268435456,134217728,1,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pmuludq %xmm2, %xmm3 +; SSE41-NEXT: pmuludq %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: constant_funnnel_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [268435456,134217728,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_funnnel_v2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: constant_funnnel_v2i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,u,u> +; AVX512F-NEXT: vprorvd %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; 
AVX512VL-LABEL: constant_funnnel_v2i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: constant_funnnel_v2i32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,u,u> +; AVX512BW-NEXT: vprorvd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: constant_funnnel_v2i32: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VLBW-NEXT: retq +; +; XOP-LABEL: constant_funnnel_v2i32: +; XOP: # %bb.0: +; XOP-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq +; +; X32-SSE-LABEL: constant_funnnel_v2i32: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [268435456,134217728,1,1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X32-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X32-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; X32-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE-NEXT: por %xmm3, %xmm0 +; X32-SSE-NEXT: retl + %res = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %x, <2 x i32> ) + ret <2 x i32> %res +} + +; +; Uniform Constant Shifts +; + +define <2 x i32> @splatconstant_funnnel_v2i32(<2 x i32> %x) nounwind { +; SSE2-LABEL: splatconstant_funnnel_v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psrld $4, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pslld $28, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; SSE2-NEXT: 
movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatconstant_funnnel_v2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psrld $4, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pslld $28, %xmm1 +; SSE41-NEXT: por %xmm2, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: splatconstant_funnnel_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrld $4, %xmm0, %xmm1 +; AVX1-NEXT: vpslld $28, %xmm0, %xmm2 +; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatconstant_funnnel_v2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: splatconstant_funnnel_v2i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vprord $4, %zmm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_funnnel_v2i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vprord $4, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: splatconstant_funnnel_v2i32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vprord $4, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: splatconstant_funnnel_v2i32: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vprord $4, %xmm0, %xmm0 +; AVX512VLBW-NEXT: retq +; +; XOP-LABEL: splatconstant_funnnel_v2i32: +; XOP: # %bb.0: +; XOP-NEXT: vprotd $28, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; X32-SSE-LABEL: splatconstant_funnnel_v2i32: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: psrld $4, %xmm2 +; 
X32-SSE-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE-NEXT: pslld $28, %xmm1 +; X32-SSE-NEXT: por %xmm2, %xmm1 +; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; X32-SSE-NEXT: movaps %xmm1, %xmm0 +; X32-SSE-NEXT: retl + %res = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %x, <2 x i32> ) + ret <2 x i32> %res +} diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll index b2c0acdf9b228..82d1997cddfa9 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -4944,6 +4944,66 @@ define <4 x i64> @PR28136(<32 x i8> %a0, <32 x i8> %a1) { ret <4 x i64> %3 } +define <32 x i8> @PR47262(<4 x i64> %a0) { +; AVX1-LABEL: PR47262: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: PR47262: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,u,u,1,5,u,u,2,6,u,u,3,7,u,u,u,u,24,28,u,u,25,29,u,u,26,30,u,u,27,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,0,4,u,u,1,5,u,u,2,6,u,u,3,7,24,28,u,u,25,29,u,u,26,30,u,u,27,31,u,u] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0,0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: PR47262: +; AVX512VLBW: # %bb.0: +; 
AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,u,u,1,5,u,u,2,6,u,u,3,7,u,u,u,u,24,28,u,u,25,29,u,u,26,30,u,u,27,31] +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,0,4,u,u,1,5,u,u,2,6,u,u,3,7,24,28,u,u,25,29,u,u,26,30,u,u,27,31,u,u] +; AVX512VLBW-NEXT: movw $21930, %ax # imm = 0x55AA +; AVX512VLBW-NEXT: kmovd %eax, %k1 +; AVX512VLBW-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} +; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-LABEL: PR47262: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,16,20,1,5,17,21,2,6,18,22,3,7,19,23,8,12,24,28,9,13,25,29,10,14,26,30,11,15,27,31] +; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq +; +; XOPAVX1-LABEL: PR47262: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; XOPAVX1-NEXT: vpperm {{.*#+}} xmm2 = xmm0[8,12],xmm1[8,12],xmm0[9,13],xmm1[9,13],xmm0[10,14],xmm1[10,14],xmm0[11,15],xmm1[11,15] +; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,4],xmm1[0,4],xmm0[1,5],xmm1[1,5],xmm0[2,6],xmm1[2,6],xmm0[3,7],xmm1[3,7] +; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: PR47262: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,u,u,1,5,u,u,2,6,u,u,3,7,u,u,u,u,24,28,u,u,25,29,u,u,26,30,u,u,27,31] +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,0,4,u,u,1,5,u,u,2,6,u,u,3,7,24,28,u,u,25,29,u,u,26,30,u,u,27,31,u,u] +; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0,0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255] +; XOPAVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; XOPAVX2-NEXT: retq + %t1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> + %t2 = bitcast <4 x i64> %t1 to <32 x i8> + %t3 = shufflevector <32 x i8> %t2, <32 x i8> undef, <32 x i32> + ret <32 x i8> %t3 +} + define <32 x i8> @insert_dup_mem_v32i8_i32(i32* 
%ptr) { ; AVX1-LABEL: insert_dup_mem_v32i8_i32: ; AVX1: # %bb.0: diff --git a/llvm/test/CodeGen/X86/widen_cast-1.ll b/llvm/test/CodeGen/X86/widen_cast-1.ll index 2401e005be216..f759b87394c29 100644 --- a/llvm/test/CodeGen/X86/widen_cast-1.ll +++ b/llvm/test/CodeGen/X86/widen_cast-1.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=i686-unknown-unknown -mcpu=generic -mattr=+sse4.2 < %s | FileCheck %s +; RUN: llc -mtriple=i686-unknown-unknown -mattr=+sse4.2 < %s | FileCheck %s ; RUN: llc -mtriple=i686-unknown-unknown -mcpu=atom < %s | FileCheck -check-prefix=ATOM %s ; Scheduler causes produce a different instruction order diff --git a/llvm/test/DebugInfo/AArch64/dbg-sve-types.ll b/llvm/test/DebugInfo/AArch64/dbg-sve-types.ll new file mode 100644 index 0000000000000..62b86f294861d --- /dev/null +++ b/llvm/test/DebugInfo/AArch64/dbg-sve-types.ll @@ -0,0 +1,44 @@ +; Test that the debug info for the vector type is correctly codegenerated +; when the DISubrange has no count, but only an upperbound. 
+; RUN: llc -mtriple aarch64 -mattr=+sve -filetype=obj -o %t %s +; RUN: llvm-dwarfdump %t | FileCheck %s +; RUN: rm %t + +; CHECK: {{.*}}: DW_TAG_subrange_type +; CHECK-NEXT: DW_AT_type ({{.*}} "__ARRAY_SIZE_TYPE__") +; CHECK-NEXT: DW_AT_upper_bound (DW_OP_lit8, DW_OP_bregx VG+0, DW_OP_mul, DW_OP_lit1, DW_OP_minus) + +define @test_svint8_t( returned %op1) !dbg !7 { +entry: + call void @llvm.dbg.value(metadata %op1, metadata !19, metadata !DIExpression()), !dbg !20 + ret %op1, !dbg !21 +} + +declare void @llvm.dbg.value(metadata, metadata, metadata) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 12.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None) +!1 = !DIFile(filename: "dbg-sve-types.ll", directory: "") +!2 = !{} +!3 = !{i32 7, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 12.0.0"} +!7 = distinct !DISubprogram(name: "test_svint8_t", scope: !8, file: !8, line: 5, type: !9, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !18) +!8 = !DIFile(filename: "dbg-sve-types.ll", directory: "") +!9 = !DISubroutineType(types: !10) +!10 = !{!11, !11} +!11 = !DIDerivedType(tag: DW_TAG_typedef, name: "svint8_t", file: !12, line: 32, baseType: !13) +!12 = !DIFile(filename: "lib/clang/12.0.0/include/arm_sve.h", directory: "") +!13 = !DIDerivedType(tag: DW_TAG_typedef, name: "__SVInt8_t", file: !1, baseType: !14) +!14 = !DICompositeType(tag: DW_TAG_array_type, baseType: !15, flags: DIFlagVector, elements: !16) +!15 = !DIBasicType(name: "signed char", size: 8, encoding: DW_ATE_signed_char) +!16 = !{!17} +!17 = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 8, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus)) +!18 = !{!19} +!19 = 
!DILocalVariable(name: "op1", arg: 1, scope: !7, file: !8, line: 5, type: !11) +!20 = !DILocation(line: 0, scope: !7) +!21 = !DILocation(line: 5, column: 39, scope: !7) diff --git a/llvm/test/DebugInfo/AArch64/inlined-argument.ll b/llvm/test/DebugInfo/AArch64/inlined-argument.ll index 747d6d787aef5..4fb7663fa49b4 100644 --- a/llvm/test/DebugInfo/AArch64/inlined-argument.ll +++ b/llvm/test/DebugInfo/AArch64/inlined-argument.ll @@ -4,14 +4,8 @@ ; CHECK-NEXT: DW_AT_location (DW_OP_reg1 W1) ; CHECK-NEXT: DW_AT_abstract_origin {{.*}}"resource" ; -; XFAIL: * -; This test now fails as it requires the single-location variable recognizer -; to spot that the inlined function goes out of scope before the 'find.exit' -; exit block. Previously, unchanging variable locations could be extended to -; the end of the function, often erronously, and that's why this test used to -; pass. -; A future algorithm _should_ be able to recognize that "resource"/!37 covers -; all blocks in its lexical scope. +; Inlined variable "resource"/!37 covers all blocks in its lexical scope. Check +; that it is given a single location. 
; ; Generated from: ; typedef struct t *t_t; diff --git a/llvm/test/DebugInfo/COFF/fpo-shrink-wrap.ll b/llvm/test/DebugInfo/COFF/fpo-shrink-wrap.ll index 719d69d16a625..08aecdac5b794 100644 --- a/llvm/test/DebugInfo/COFF/fpo-shrink-wrap.ll +++ b/llvm/test/DebugInfo/COFF/fpo-shrink-wrap.ll @@ -40,7 +40,7 @@ ; OBJ: SubSectionType: FrameData (0xF5) ; OBJ: FrameData { ; OBJ: RvaStart: 0x0 -; OBJ: CodeSize: 0x36 +; OBJ: CodeSize: 0x34 ; OBJ: PrologSize: 0x9 ; OBJ: FrameFunc [ ; OBJ-NEXT: $T0 .raSearch = @@ -50,7 +50,7 @@ ; OBJ: } ; OBJ: FrameData { ; OBJ: RvaStart: 0x7 -; OBJ: CodeSize: 0x2F +; OBJ: CodeSize: 0x2D ; OBJ: PrologSize: 0x2 ; OBJ: FrameFunc [ ; OBJ-NEXT: $T0 .raSearch = @@ -61,7 +61,7 @@ ; OBJ: } ; OBJ: FrameData { ; OBJ: RvaStart: 0x8 -; OBJ: CodeSize: 0x2E +; OBJ: CodeSize: 0x2C ; OBJ: PrologSize: 0x1 ; OBJ: FrameFunc [ ; OBJ-NEXT: $T0 .raSearch = @@ -73,7 +73,7 @@ ; OBJ: } ; OBJ: FrameData { ; OBJ: RvaStart: 0x9 -; OBJ: CodeSize: 0x2D +; OBJ: CodeSize: 0x2B ; OBJ: PrologSize: 0x0 ; OBJ: FrameFunc [ ; OBJ-NEXT: $T0 .raSearch = diff --git a/llvm/test/DebugInfo/COFF/fpo-stack-protect.ll b/llvm/test/DebugInfo/COFF/fpo-stack-protect.ll index c604234a60554..26fe7c49e7acf 100644 --- a/llvm/test/DebugInfo/COFF/fpo-stack-protect.ll +++ b/llvm/test/DebugInfo/COFF/fpo-stack-protect.ll @@ -15,9 +15,9 @@ ; CHECK: subl $20, %esp ; CHECK: .cv_fpo_stackalloc 20 ; CHECK: .cv_fpo_endprologue -; CHECK: movl 28(%esp), %esi ; CHECK: ___security_cookie +; CHECK: movl 28(%esp), %esi ; CHECK: movl %esi, {{[0-9]*}}(%esp) ; CHECK: movl %esi, {{[0-9]*}}(%esp) ; CHECK: movl %esi, {{[0-9]*}}(%esp) @@ -30,7 +30,7 @@ ; CHECK: addl $20, %esp ; CHECK: popl %esi ; CHECK: retl -; CHECK: Ltmp2: +; CHECK: Ltmp3: ; CHECK: .cv_fpo_endproc ; ModuleID = 't.c' diff --git a/llvm/test/DebugInfo/COFF/types-array.ll b/llvm/test/DebugInfo/COFF/types-array.ll index 19ddcf9ffe2c9..2962f970aca14 100644 --- a/llvm/test/DebugInfo/COFF/types-array.ll +++ b/llvm/test/DebugInfo/COFF/types-array.ll @@ -51,7 
+51,7 @@ ; CHECK: PtrParent: 0x0 ; CHECK: PtrEnd: 0x0 ; CHECK: PtrNext: 0x0 -; CHECK: CodeSize: 0x2A +; CHECK: CodeSize: 0x39 ; CHECK: DbgStart: 0x0 ; CHECK: DbgEnd: 0x0 ; CHECK: FunctionType: f (0x1002) @@ -73,7 +73,7 @@ ; CHECK: LocalVariableAddrRange { ; CHECK: OffsetStart: .text+0x6 ; CHECK: ISectStart: 0x0 -; CHECK: Range: 0x24 +; CHECK: Range: 0x33 ; CHECK: } ; CHECK: } ; CHECK: ProcEnd { diff --git a/llvm/test/DebugInfo/MIR/AArch64/implicit-def-dead-scope.mir b/llvm/test/DebugInfo/MIR/AArch64/implicit-def-dead-scope.mir index d85f2d25391de..2c2fb5e5de9e7 100644 --- a/llvm/test/DebugInfo/MIR/AArch64/implicit-def-dead-scope.mir +++ b/llvm/test/DebugInfo/MIR/AArch64/implicit-def-dead-scope.mir @@ -5,9 +5,8 @@ # encountering an IMPLICIT_DEF in its own lexical scope. # CHECK: .debug_info contents: -# CHECK: DW_TAG_formal_parameter -# CHECK: DW_AT_location [DW_FORM_sec_offset] -# CHECK-NEXT: DW_OP_lit0, DW_OP_stack_value +# CHECK: DW_TAG_formal_parameter [14] +# CHECK-NEXT: DW_AT_const_value [DW_FORM_udata] (0) # CHECK-NEXT: DW_AT_abstract_origin {{.*}} "name" --- | ; ModuleID = 't.ll' diff --git a/llvm/test/DebugInfo/MIR/ARM/subregister-full-piece.mir b/llvm/test/DebugInfo/MIR/ARM/subregister-full-piece.mir index 1fa172b976456..fb201ab523aad 100644 --- a/llvm/test/DebugInfo/MIR/ARM/subregister-full-piece.mir +++ b/llvm/test/DebugInfo/MIR/ARM/subregister-full-piece.mir @@ -1,24 +1,53 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -start-after=livedebugvalues -filetype=obj -o - %s | \ # RUN: llvm-dwarfdump - | FileCheck %s # # This tests the edge-case where a complex fragment has exactly # the size of a subregister of the register the DBG_VALUE points to. 
# -# CHECK: .debug_info contents: + +# CHECK: DW_TAG_compile_unit +# CHECK: DW_AT_producer ("") +# CHECK: DW_AT_language (DW_LANG_C_plus_plus_14) +# CHECK: DW_AT_name ("t.cpp") +# CHECK: DW_AT_stmt_list (0x00000000) +# CHECK: DW_AT_comp_dir ("/") +# CHECK: DW_AT_APPLE_optimized (true) +# CHECK: DW_AT_low_pc (0x0000000000000000) +# CHECK: DW_AT_high_pc (0x0000000000000008) + +# CHECK: DW_TAG_subprogram +# CHECK: DW_AT_low_pc (0x0000000000000000) +# CHECK: DW_AT_high_pc (0x0000000000000008) +# CHECK: DW_AT_APPLE_omit_frame_ptr (true) +# CHECK: DW_AT_frame_base (DW_OP_reg13 SP) +# CHECK: DW_AT_name ("f") +# CHECK: DW_AT_decl_file ("/t.cpp") +# CHECK: DW_AT_decl_line (1) +# CHECK: DW_AT_external (true) +# CHECK: DW_AT_APPLE_isa (0x01) + # CHECK: DW_TAG_variable -# CHECK-NOT: DW_TAG -# CHECK: DW_AT_location -# Q8 = {D16, D17} -# CHECK-NEXT: DW_OP_regx D16, DW_OP_piece 0x8) -# CHECK-NOT: DW_TAG -# CHECK: DW_AT_name ("q8") +# CHECK: DW_AT_location (DW_OP_regx D16, DW_OP_piece 0x8) +# CHECK: DW_AT_name ("q8") +# CHECK: DW_AT_decl_file ("/t.cpp") +# CHECK: DW_AT_decl_line (1) +# CHECK: DW_AT_type (0x0000005b "uint8x8x2_t") + # CHECK: DW_TAG_variable -# CHECK-NOT: DW_TAG -# CHECK: DW_AT_location -# Q9 = {D18, D19} -# CHECK-NEXT: DW_OP_regx D18, DW_OP_piece 0x7) -# CHECK-NOT: DW_TAG -# CHECK: DW_AT_name ("q9") +# CHECK: DW_AT_location (DW_OP_regx D18, DW_OP_piece 0x7) +# CHECK: DW_AT_name ("q9") +# CHECK: DW_AT_decl_file ("/t.cpp") +# CHECK: DW_AT_decl_line (1) +# CHECK: DW_AT_type (0x0000005b "uint8x8x2_t") + +# CHECK: DW_TAG_structure_type +# CHECK: DW_AT_calling_convention (DW_CC_pass_by_value) +# CHECK: DW_AT_name ("uint8x8x2_t") +# CHECK: DW_AT_byte_size (0x10) +# CHECK: DW_AT_decl_file ("/t.cpp") +# CHECK: DW_AT_decl_line (113) + --- | target triple = "thumbv7s-apple-ios" @@ -45,7 +74,7 @@ body: | liveins: $r2 t2CMPri killed renamable $r2, 5, 14, $noreg, implicit-def $cpsr - t2Bcc %bb.2.for.body, 0, killed $cpsr, debug-location !20 DBG_VALUE $q8, $noreg, !8, 
!DIExpression(DW_OP_LLVM_fragment, 0, 64), debug-location !20 DBG_VALUE $q9, $noreg, !9, !DIExpression(DW_OP_LLVM_fragment, 0, 56), debug-location !20 + t2Bcc %bb.2.for.body, 0, killed $cpsr, debug-location !20 tB %bb.2.for.body, 14, $noreg diff --git a/llvm/test/DebugInfo/MIR/X86/callsite-stack-value.mir b/llvm/test/DebugInfo/MIR/X86/callsite-stack-value.mir index 5b9ecf08150be..4362f8e66b214 100644 --- a/llvm/test/DebugInfo/MIR/X86/callsite-stack-value.mir +++ b/llvm/test/DebugInfo/MIR/X86/callsite-stack-value.mir @@ -2,8 +2,7 @@ # RUN: -emit-call-site-info | llvm-dwarfdump - | FileCheck %s -implicit-check-not=call_site_parameter # CHECK: DW_TAG_formal_parameter -# CHECK-NEXT: DW_AT_location -# CHECK-NEXT: DW_OP_reg17 +# CHECK-NEXT: DW_AT_location (DW_OP_reg17 XMM0) # struct S { # float w; diff --git a/llvm/test/DebugInfo/MIR/X86/singlelocation-cutoffs.mir b/llvm/test/DebugInfo/MIR/X86/singlelocation-cutoffs.mir deleted file mode 100644 index 6ad64d9d74bbb..0000000000000 --- a/llvm/test/DebugInfo/MIR/X86/singlelocation-cutoffs.mir +++ /dev/null @@ -1,65 +0,0 @@ -# Test cutoffs for single-location variable analysis. 
-# Disable validThroughout if the input size exceeds the specified limit - -# RUN: llc %s -o - -start-after=livedebugvalues -mtriple=x86_64-unknown-unknown \ -# RUN: --singlevarlocation-input-bb-limit=0 -filetype=obj\ -# RUN: | llvm-dwarfdump -v -\ -# RUN: | FileCheck %s -check-prefix=LIMITED - -# RUN: llc %s -o - -start-after=livedebugvalues -mtriple=x86_64-unknown-unknown \ -# RUN: --singlevarlocation-input-bb-limit=20 -filetype=obj | llvm-dwarfdump -v -\ -# RUN: | FileCheck %s -check-prefix=UNLIMITED - -# LIMITED: DW_AT_location [DW_FORM_sec_offset] - -# UNLIMITED: DW_AT_location [DW_FORM_exprloc] - ---- | - target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" - - declare i32 @use(i32) - - define i32 @foo(i32 %x) !dbg !6 { - entry: - ret i32 1, !dbg !15 - } - - declare void @llvm.dbg.value(metadata, metadata, metadata) - - !llvm.dbg.cu = !{!0} - !llvm.debugify = !{!3, !4} - !llvm.module.flags = !{!5} - - !0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) - !1 = !DIFile(filename: "/tmp/t.ll", directory: "/") - !2 = !{} - !3 = !{i32 4} - !4 = !{i32 2} - !5 = !{i32 2, !"Debug Info Version", i32 3} - !6 = distinct !DISubprogram(name: "foo", linkageName: "foo", scope: null, file: !1, line: 1, type: !7, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8) - !7 = !DISubroutineType(types: !2) - !8 = !{!9, !11} - !9 = !DILocalVariable(name: "1", scope: !6, file: !1, line: 1, type: !10) - !10 = !DIBasicType(name: "ty32", size: 32, encoding: DW_ATE_unsigned) - !11 = !DILocalVariable(name: "2", scope: !6, file: !1, line: 3, type: !10) - !12 = !DILocation(line: 1, column: 1, scope: !6) - !13 = !DILocation(line: 2, column: 1, scope: !6) - !14 = !DILocation(line: 3, column: 1, scope: !6) - !15 = !DILocation(line: 4, column: 1, scope: !6) - -... 
---- -name: foo -liveins: - - { reg: '$edi', virtual-reg: '' } -stack: - - { id: 0, name: '', type: spill-slot, offset: -12, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } -body: | - bb.0.entry: - liveins: $edi - DBG_VALUE renamable $edi, $noreg, !11, !DIExpression(), debug-location !14 - RETQ debug-location !14 - -... diff --git a/llvm/test/DebugInfo/X86/inlined-formal-parameter.ll b/llvm/test/DebugInfo/X86/inlined-formal-parameter.ll index 00562df3922b4..376505f9832e9 100644 --- a/llvm/test/DebugInfo/X86/inlined-formal-parameter.ll +++ b/llvm/test/DebugInfo/X86/inlined-formal-parameter.ll @@ -19,8 +19,7 @@ ; CHECK: DW_TAG_inlined_subroutine ; CHECK-NEXT: DW_AT_abstract_origin {{.*}} "bar" ; CHECK: DW_TAG_formal_parameter -; CHECK-NEXT: DW_AT_location [DW_FORM_data4] ( -; CHECK-NEXT: [{{.*}}, {{.*}}): DW_OP_consts +0) +; CHECK-NEXT: DW_AT_const_value [DW_FORM_sdata] (0) ; CHECK-NEXT: DW_AT_abstract_origin {{.*}} "a" target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/DebugInfo/X86/location-range-inlined-xblock.mir b/llvm/test/DebugInfo/X86/location-range-inlined-xblock.mir new file mode 100644 index 0000000000000..1a39006d358ae --- /dev/null +++ b/llvm/test/DebugInfo/X86/location-range-inlined-xblock.mir @@ -0,0 +1,172 @@ +# RUN: llc -start-after=livedebugvalues --filetype=obj %s -o - \ +# RUN: | llvm-dwarfdump -v --name local - \ +# RUN: | FileCheck %s +# +## Generated with opt -inline -mem2reg, llc -stop-after=livedebugvalues from: +## int glob; +## int ext1(int); +## int ext2(int); +## +## __attribute__((always_inline)) +## static int inline_me() { +## int local = glob; +## int r = 0; +## if (local) +## r = ext1(local); +## else +## r = ext2(local); +## return r * local; +## } +## +## int fun(int p) { +## glob = p; +## glob = inline_me(); +## return 0; +## } +## +## Check that the location for 
inlined variable 'local' (RBX), which spans +## multiple basic blocks, is given a single location. +# +# CHECK: DW_AT_location [DW_FORM_exprloc] (DW_OP_reg3 RBX) +# CHECK-NEXT: DW_AT_abstract_origin [DW_FORM_ref4] ({{.*}}"local") + +--- | + target triple = "x86_64-unknown-linux-gnu" + + @glob = dso_local global i32 0, align 4, !dbg !0 + + define dso_local i32 @_Z3funi(i32 %p) !dbg !11 { + entry: + call void @llvm.dbg.value(metadata i32 %p, metadata !15, metadata !DIExpression()), !dbg !16 + store i32 %p, i32* @glob, align 4, !dbg !17 + %0 = load i32, i32* @glob, align 4, !dbg !22 + call void @llvm.dbg.value(metadata i32 %0, metadata !27, metadata !DIExpression()), !dbg !30 + call void @llvm.dbg.value(metadata i32 0, metadata !28, metadata !DIExpression()), !dbg !30 + %tobool.i = icmp ne i32 %0, 0, !dbg !31 + br i1 %tobool.i, label %if.then.i, label %if.else.i, !dbg !33 + + if.then.i: ; preds = %entry + %call.i = call i32 @_Z4ext1i(i32 %0), !dbg !34 + call void @llvm.dbg.value(metadata i32 %call.i, metadata !28, metadata !DIExpression()), !dbg !30 + br label %_ZL9inline_mev.exit, !dbg !35 + + if.else.i: ; preds = %entry + %call1.i = call i32 @_Z4ext2i(i32 %0), !dbg !36 + call void @llvm.dbg.value(metadata i32 %call1.i, metadata !28, metadata !DIExpression()), !dbg !30 + br label %_ZL9inline_mev.exit + + _ZL9inline_mev.exit: ; preds = %if.else.i, %if.then.i + %r.0.i = phi i32 [ %call.i, %if.then.i ], [ %call1.i, %if.else.i ], !dbg !37 + call void @llvm.dbg.value(metadata i32 %r.0.i, metadata !28, metadata !DIExpression()), !dbg !30 + %mul.i = mul nsw i32 %r.0.i, %0, !dbg !38 + store i32 %mul.i, i32* @glob, align 4, !dbg !39 + ret i32 0, !dbg !40 + } + + declare !dbg !41 dso_local i32 @_Z4ext1i(i32) + declare !dbg !42 dso_local i32 @_Z4ext2i(i32) + declare void @llvm.dbg.value(metadata, metadata, metadata) + + !llvm.dbg.cu = !{!2} + !llvm.module.flags = !{!7, !8, !9} + !llvm.ident = !{!10} + + !0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) + !1 = 
distinct !DIGlobalVariable(name: "glob", scope: !2, file: !3, line: 1, type: !6, isLocal: false, isDefinition: true) + !2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang version 12.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5, splitDebugInlining: false, nameTableKind: None) + !3 = !DIFile(filename: "test.cpp", directory: "/") + !4 = !{} + !5 = !{!0} + !6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) + !7 = !{i32 7, !"Dwarf Version", i32 4} + !8 = !{i32 2, !"Debug Info Version", i32 3} + !9 = !{i32 1, !"wchar_size", i32 4} + !10 = !{!"clang version 12.0.0"} + !11 = distinct !DISubprogram(name: "fun", linkageName: "_Z3funi", scope: !3, file: !3, line: 16, type: !12, scopeLine: 16, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !14) + !12 = !DISubroutineType(types: !13) + !13 = !{!6, !6} + !14 = !{!15} + !15 = !DILocalVariable(name: "p", arg: 1, scope: !11, file: !3, line: 16, type: !6) + !16 = !DILocation(line: 0, scope: !11) + !17 = !DILocation(line: 17, column: 8, scope: !11) + !22 = !DILocation(line: 7, column: 15, scope: !23, inlinedAt: !29) + !23 = distinct !DISubprogram(name: "inline_me", linkageName: "_ZL9inline_mev", scope: !3, file: !3, line: 6, type: !24, scopeLine: 6, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !26) + !24 = !DISubroutineType(types: !25) + !25 = !{!6} + !26 = !{!27, !28} + !27 = !DILocalVariable(name: "local", scope: !23, file: !3, line: 7, type: !6) + !28 = !DILocalVariable(name: "r", scope: !23, file: !3, line: 8, type: !6) + !29 = distinct !DILocation(line: 18, column: 10, scope: !11) + !30 = !DILocation(line: 0, scope: !23, inlinedAt: !29) + !31 = !DILocation(line: 9, column: 7, scope: !32, inlinedAt: !29) + !32 = distinct !DILexicalBlock(scope: !23, file: 
!3, line: 9, column: 7) + !33 = !DILocation(line: 9, column: 7, scope: !23, inlinedAt: !29) + !34 = !DILocation(line: 10, column: 9, scope: !32, inlinedAt: !29) + !35 = !DILocation(line: 10, column: 5, scope: !32, inlinedAt: !29) + !36 = !DILocation(line: 12, column: 9, scope: !32, inlinedAt: !29) + !37 = !DILocation(line: 0, scope: !32, inlinedAt: !29) + !38 = !DILocation(line: 13, column: 12, scope: !23, inlinedAt: !29) + !39 = !DILocation(line: 18, column: 8, scope: !11) + !40 = !DILocation(line: 19, column: 3, scope: !11) + !41 = !DISubprogram(name: "ext1", linkageName: "_Z4ext1i", scope: !3, file: !3, line: 2, type: !12, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4) + !42 = !DISubprogram(name: "ext2", linkageName: "_Z4ext2i", scope: !3, file: !3, line: 3, type: !12, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4) + +... +--- +name: _Z3funi +body: | + bb.0.entry: + successors: %bb.1(0x50000000), %bb.2(0x30000000) + liveins: $edi, $rbx + + DBG_VALUE $edi, $noreg, !15, !DIExpression(), debug-location !16 + frame-setup PUSH64r killed $rbx, implicit-def $rsp, implicit $rsp + CFI_INSTRUCTION def_cfa_offset 16 + CFI_INSTRUCTION offset $rbx, -16 + $ebx = MOV32rr $edi + DBG_VALUE $ebx, $noreg, !15, !DIExpression(), debug-location !16 + MOV32mr $rip, 1, $noreg, @glob, $noreg, $edi, debug-location !17 :: (store 4 into @glob) + DBG_VALUE $ebx, $noreg, !27, !DIExpression(), debug-location !30 + DBG_VALUE 0, $noreg, !28, !DIExpression(), debug-location !30 + TEST32rr $edi, $edi, implicit-def $eflags, debug-location !31 + JCC_1 %bb.2, 4, implicit killed $eflags, debug-location !33 + + bb.1.if.then.i: + successors: %bb.3(0x80000000) + liveins: $ebx + + DBG_VALUE $ebx, $noreg, !27, !DIExpression(), debug-location !30 + DBG_VALUE $ebx, $noreg, !15, !DIExpression(), debug-location !16 + DBG_VALUE 0, $noreg, !28, !DIExpression(), debug-location !30 + $edi = MOV32rr $ebx, debug-location !34 + CALL64pcrel32 @_Z4ext1i, csr_64, 
implicit $rsp, implicit $ssp, implicit $edi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax, debug-location !34 + DBG_VALUE $eax, $noreg, !28, !DIExpression(), debug-location !30 + JMP_1 %bb.3 + + bb.2.if.else.i: + successors: %bb.3(0x80000000) + liveins: $ebx + + DBG_VALUE $ebx, $noreg, !27, !DIExpression(), debug-location !30 + DBG_VALUE $ebx, $noreg, !15, !DIExpression(), debug-location !16 + DBG_VALUE 0, $noreg, !28, !DIExpression(), debug-location !30 + $edi = MOV32rr $ebx, debug-location !36 + CALL64pcrel32 @_Z4ext2i, csr_64, implicit $rsp, implicit $ssp, implicit $edi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax, debug-location !36 + DBG_VALUE $eax, $noreg, !28, !DIExpression(), debug-location !30 + + bb.3._ZL9inline_mev.exit: + liveins: $eax, $ebx + + DBG_VALUE $ebx, $noreg, !27, !DIExpression(), debug-location !30 + DBG_VALUE $ebx, $noreg, !15, !DIExpression(), debug-location !16 + DBG_VALUE $eax, $noreg, !28, !DIExpression(), debug-location !30 + DBG_VALUE $eax, $noreg, !28, !DIExpression(), debug-location !30 + renamable $eax = nsw IMUL32rr killed renamable $eax, killed renamable $ebx, implicit-def dead $eflags, debug-location !38 + MOV32mr $rip, 1, $noreg, @glob, $noreg, killed renamable $eax, debug-location !39 :: (store 4 into @glob) + $eax = XOR32rr undef $eax, undef $eax, implicit-def dead $eflags, debug-location !40 + $rbx = frame-destroy POP64r implicit-def $rsp, implicit $rsp, debug-location !40 + DBG_VALUE $edi, $noreg, !15, !DIExpression(DW_OP_LLVM_entry_value, 1), debug-location !16 + CFI_INSTRUCTION def_cfa_offset 8, debug-location !40 + RETQ $eax, debug-location !40 + +... 
diff --git a/llvm/test/DebugInfo/X86/single-location-2.mir b/llvm/test/DebugInfo/X86/single-location-2.mir new file mode 100644 index 0000000000000..e3f0ec979e22e --- /dev/null +++ b/llvm/test/DebugInfo/X86/single-location-2.mir @@ -0,0 +1,92 @@ +# RUN: llc %s --start-after=livedebugvalues -filetype=obj -o - \ +# RUN: | llvm-dwarfdump - -name local* -regex \ +# RUN: | FileCheck %s +# +## This tests certain single location detection functionality. The Test MIR +## is hand written. Test directives and comments inline. + +--- | + target triple = "x86_64-unknown-linux-gnu" + define dso_local i32 @fun() local_unnamed_addr !dbg !7 { + entry: + ret i32 0 + } + + !llvm.dbg.cu = !{!0} + !llvm.module.flags = !{!3, !4, !5} + !llvm.ident = !{!6} + + !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) + !1 = !DIFile(filename: "example.c", directory: "/") + !3 = !{i32 7, !"Dwarf Version", i32 4} + !4 = !{i32 2, !"Debug Info Version", i32 3} + !5 = !{i32 1, !"wchar_size", i32 4} + !6 = !{!"clang version 11.0.0"} + !8 = !DISubroutineType(types: !9) + !9 = !{!10} + !10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) + !22 = !DISubroutineType(types: !23) + !23 = !{!10, !10} + ; --- Important metadata --- + !7 = distinct !DISubprogram(name: "fun", scope: !1, file: !1, line: 2, type: !8, scopeLine: 2, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) + !24 = distinct !DILexicalBlock(scope: !7, file: !1, line: 9, column: 3) + !14 = distinct !DILexicalBlock(scope: !7, file: !1, line: 4, column: 3) + !12 = !DILocalVariable(name: "locala", scope: !7, file: !1, line: 1, type: !10) + !13 = !DILocalVariable(name: "localb", scope: !14, file: !1, line: 2, type: !10) + !25 = !DILocalVariable(name: "localc", scope: !24, file: !1, line: 3, type: !10) + !27 = !DILocalVariable(name: "tmp", 
scope: !14, file: !1, line: 2, type: !10) + !15 = !DILocation(line: 1, column: 0, scope: !7) + !18 = !DILocation(line: 2, column: 1, scope: !14) + !26 = !DILocation(line: 3, column: 1, scope: !24) +... +--- +name: fun +body: | + bb.0.entry: + ;; This is the scope and variable structure: + ;; int fun() { // scope fun !7 + ;; int locala; // scope fun !7, var locala !12, debug-location !15 + ;; { int localb; // scope fun:block !14, var localb !13, debug-location !18 + ;; int tmp; } // scope fun:block !14, var localb !27, debug-location !18 + ;; { int localc; } // scope fun:block !24, var localc !25, debug-location !26 + ;; } + ;; + ;; (1) Check that frame-setup instructions are not counted against + ;; locations being valid throughout the function call. + ; + ; CHECK: DW_TAG_variable + ; CHECK-NEXT: DW_AT_location (DW_OP_reg5 RDI) + ; CHECK-NEXT: DW_AT_name ("locala") + $rbp = frame-setup MOV64rr $rsp + DBG_VALUE $edi, $noreg, !12, !DIExpression(), debug-location !15 + $eax = MOV32ri 0, debug-location !15 + + ;; (2) The scope block ends with a meta instruction. A location range ends + ;; with the final non-meta instruction in the scope. Check that + ;; location is considered valid throughout. + ; + ; CHECK: DW_TAG_variable + ; CHECK-NEXT: DW_AT_location (DW_OP_reg2 RCX) + ; CHECK-NEXT: DW_AT_name ("localb") + ; + ;; start scope, start location range + DBG_VALUE $ecx, $noreg, !13, !DIExpression(), debug-location !18 + ;; end location range + $ecx = MOV32ri 1, debug-location !18 + ;; end scope + DBG_VALUE $noreg, $noreg, !27, !DIExpression(), debug-location !18 + + ;; (3) The final instruction in the scope closes a location range. Check + ;; that location is considered valid throughout. 
+ ; + ; CHECK: DW_TAG_variable + ; CHECK-NEXT: DW_AT_location (DW_OP_reg4 RSI) + ; CHECK-NEXT: DW_AT_name ("localc") + ; + ;; start scope, start location range + DBG_VALUE $esi, $noreg, !25, !DIExpression(), debug-location !26 + ;; end scope, end location range + $esi = MOV32ri 2, debug-location !26 + + RETQ debug-location !15 +... diff --git a/llvm/test/DebugInfo/X86/trim-var-locs.mir b/llvm/test/DebugInfo/X86/trim-var-locs.mir index 04ab56d302ae6..9c1de2593fa5f 100644 --- a/llvm/test/DebugInfo/X86/trim-var-locs.mir +++ b/llvm/test/DebugInfo/X86/trim-var-locs.mir @@ -81,8 +81,7 @@ body: | ; block is not trimmed. ; ; CHECK: DW_TAG_variable - ; CHECK-NEXT: DW_AT_location - ; CHECK-NEXT: DW_OP_reg5 RDI + ; CHECK-NEXT: DW_AT_location (DW_OP_reg5 RDI) ; CHECK-NEXT: DW_AT_name ("localb") ; ; localb range 2 clobber in scope fun !7 (outside block !14) diff --git a/llvm/test/FileCheck/multiple-check-not-failures.txt b/llvm/test/FileCheck/multiple-check-not-failures.txt new file mode 100644 index 0000000000000..3b7b465719a0c --- /dev/null +++ b/llvm/test/FileCheck/multiple-check-not-failures.txt @@ -0,0 +1,32 @@ +; Check that all errors in a CHECK-NOT blocks are reported, but that FileCheck +; does not check past the block. 
+RUN: %ProtectFileCheckOutput \ +RUN: not FileCheck --dump-input=never --input-file %s %s 2>&1 | \ +RUN: FileCheck --strict-whitespace --check-prefix CHECK-ERRORS %s + +foo +bar + +barrier + +baz + +CHECK-NOT: bar +CHECK-NOT: foo +CHECK: barrier +CHECK-NOT: baz + +CHECK-ERRORS: multiple-check-not-failures.txt:[[#@LINE-5]]:12: error: {{C}}HECK-NOT: excluded string found in input +CHECK-ERRORS-NEXT: {{C}}HECK-NOT: bar +CHECK-ERRORS-NEXT: {{^}} ^{{$}} +CHECK-ERRORS-NEXT: multiple-check-not-failures.txt:[[#@LINE-14]]:1: note: found here +CHECK-ERRORS-NEXT: bar +CHECK-ERRORS-NEXT: {{^}}^~~{{$}} +CHECK-ERRORS-NEXT: multiple-check-not-failures.txt:[[#@LINE-10]]:12: error: {{C}}HECK-NOT: excluded string found in input +CHECK-ERRORS-NEXT: {{C}}HECK-NOT: foo +CHECK-ERRORS-NEXT: {{^}} ^{{$}} +CHECK-ERRORS-NEXT: multiple-check-not-failures.txt:[[#@LINE-21]]:1: note: found here +CHECK-ERRORS-NEXT: foo +CHECK-ERRORS-NEXT: {{^}}^~~{{$}} +CHECK-ERRORS-NOT: error: +CHECK-ERRORS-NOT: {{C}}HECK-NOT: baz diff --git a/llvm/test/Instrumentation/HeapProfiler/basic.ll b/llvm/test/Instrumentation/HeapProfiler/basic.ll new file mode 100644 index 0000000000000..a26dae15f5090 --- /dev/null +++ b/llvm/test/Instrumentation/HeapProfiler/basic.ll @@ -0,0 +1,179 @@ +; Test basic address sanitizer instrumentation. 
+; +; RUN: opt < %s -heapprof -heapprof-module -S | FileCheck --check-prefixes=CHECK,CHECK-S3 %s +; RUN: opt < %s -heapprof -heapprof-module -heapprof-mapping-scale=5 -S | FileCheck --check-prefixes=CHECK,CHECK-S5 %s + +; We need the requires since both heapprof and heapprof-module require reading module level metadata which is done once by the heapprof-globals-md analysis +; RUN: opt < %s -passes='function(heapprof),module(heapprof-module)' -S | FileCheck --check-prefixes=CHECK,CHECK-S3 %s +; RUN: opt < %s -passes='function(heapprof),module(heapprof-module)' -heapprof-mapping-scale=5 -S | FileCheck --check-prefixes=CHECK,CHECK-S5 %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-unknown-linux-gnu" +; CHECK: @llvm.global_ctors = {{.*}}@heapprof.module_ctor + +define i32 @test_load(i32* %a) { +entry: + %tmp1 = load i32, i32* %a, align 4 + ret i32 %tmp1 +} +; CHECK-LABEL: @test_load +; CHECK: %[[SHADOW_OFFSET:[^ ]*]] = load i64, i64* @__heapprof_shadow_memory_dynamic_address +; CHECK-NEXT: %[[LOAD_ADDR:[^ ]*]] = ptrtoint i32* %a to i64 +; CHECK-NEXT: %[[MASKED_ADDR:[^ ]*]] = and i64 %[[LOAD_ADDR]], -64 +; CHECK-S3-NEXT: %[[SHIFTED_ADDR:[^ ]*]] = lshr i64 %[[MASKED_ADDR]], 3 +; CHECK-S5-NEXT: %[[SHIFTED_ADDR:[^ ]*]] = lshr i64 %[[MASKED_ADDR]], 5 +; CHECK-NEXT: add i64 %[[SHIFTED_ADDR]], %[[SHADOW_OFFSET]] +; CHECK-NEXT: %[[LOAD_SHADOW_PTR:[^ ]*]] = inttoptr +; CHECK-NEXT: %[[LOAD_SHADOW:[^ ]*]] = load i64, i64* %[[LOAD_SHADOW_PTR]] +; CHECK-NEXT: %[[NEW_SHADOW:[^ ]*]] = add i64 %[[LOAD_SHADOW]], 1 +; CHECK-NEXT: store i64 %[[NEW_SHADOW]], i64* %[[LOAD_SHADOW_PTR]] +; The actual load. 
+; CHECK-NEXT: %tmp1 = load i32, i32* %a +; CHECK-NEXT: ret i32 %tmp1 + +define void @test_store(i32* %a) { +entry: + store i32 42, i32* %a, align 4 + ret void +} +; CHECK-LABEL: @test_store +; CHECK: %[[SHADOW_OFFSET:[^ ]*]] = load i64, i64* @__heapprof_shadow_memory_dynamic_address +; CHECK-NEXT: %[[STORE_ADDR:[^ ]*]] = ptrtoint i32* %a to i64 +; CHECK-NEXT: %[[MASKED_ADDR:[^ ]*]] = and i64 %[[STORE_ADDR]], -64 +; CHECK-S3-NEXT: %[[SHIFTED_ADDR:[^ ]*]] = lshr i64 %[[MASKED_ADDR]], 3 +; CHECK-S5-NEXT: %[[SHIFTED_ADDR:[^ ]*]] = lshr i64 %[[MASKED_ADDR]], 5 +; CHECK-NEXT: add i64 %[[SHIFTED_ADDR]], %[[SHADOW_OFFSET]] +; CHECK-NEXT: %[[STORE_SHADOW_PTR:[^ ]*]] = inttoptr +; CHECK-NEXT: %[[STORE_SHADOW:[^ ]*]] = load i64, i64* %[[STORE_SHADOW_PTR]] +; CHECK-NEXT: %[[NEW_SHADOW:[^ ]*]] = add i64 %[[STORE_SHADOW]], 1 +; CHECK-NEXT: store i64 %[[NEW_SHADOW]], i64* %[[STORE_SHADOW_PTR]] +; The actual store. +; CHECK-NEXT: store i32 42, i32* %a +; CHECK-NEXT: ret void + +define void @FP80Test(x86_fp80* nocapture %a) nounwind uwtable { +entry: + store x86_fp80 0xK3FFF8000000000000000, x86_fp80* %a, align 16 + ret void +} +; CHECK-LABEL: @FP80Test +; Exactly one shadow update for store access. +; CHECK-NOT: store i64 +; CHECK: %[[NEW_ST_SHADOW:[^ ]*]] = add i64 %{{.*}}, 1 +; CHECK-NEXT: store i64 %[[NEW_ST_SHADOW]] +; CHECK-NOT: store i64 +; The actual store. +; CHECK: store x86_fp80 0xK3FFF8000000000000000, x86_fp80* %a +; CHECK: ret void + +define void @i40test(i40* %a, i40* %b) nounwind uwtable { +entry: + %t = load i40, i40* %a + store i40 %t, i40* %b, align 8 + ret void +} +; CHECK-LABEL: @i40test +; Exactly one shadow update for load access. +; CHECK-NOT: store i64 +; CHECK: %[[NEW_LD_SHADOW:[^ ]*]] = add i64 %{{.*}}, 1 +; CHECK-NEXT: store i64 %[[NEW_LD_SHADOW]] +; CHECK-NOT: store i64 +; The actual load. +; CHECK: %t = load i40, i40* %a +; Exactly one shadow update for store access. 
+; CHECK-NOT: store i64 +; CHECK: %[[NEW_ST_SHADOW:[^ ]*]] = add i64 %{{.*}}, 1 +; CHECK-NEXT: store i64 %[[NEW_ST_SHADOW]] +; CHECK-NOT: store i64 +; The actual store. +; CHECK: store i40 %t, i40* %b +; CHECK: ret void + +define void @i64test_align1(i64* %b) nounwind uwtable { + entry: + store i64 0, i64* %b, align 1 + ret void +} +; CHECK-LABEL: @i64test +; Exactly one shadow update for store access. +; CHECK-NOT: store i64 +; CHECK: %[[NEW_ST_SHADOW:[^ ]*]] = add i64 %{{.*}}, 1 +; CHECK-NEXT: store i64 %[[NEW_ST_SHADOW]] +; CHECK-NOT: store i64 +; The actual store. +; CHECK: store i64 0, i64* %b +; CHECK: ret void + +define void @i80test(i80* %a, i80* %b) nounwind uwtable { + entry: + %t = load i80, i80* %a + store i80 %t, i80* %b, align 8 + ret void +} +; CHECK-LABEL: i80test +; Exactly one shadow update for load access. +; CHECK-NOT: store i64 +; CHECK: %[[NEW_LD_SHADOW:[^ ]*]] = add i64 %{{.*}}, 1 +; CHECK-NEXT: store i64 %[[NEW_LD_SHADOW]] +; CHECK-NOT: store i64 +; The actual load. +; CHECK: %t = load i80, i80* %a +; Exactly one shadow update for store access. +; CHECK-NOT: store i64 +; CHECK: %[[NEW_ST_SHADOW:[^ ]*]] = add i64 %{{.*}}, 1 +; CHECK-NEXT: store i64 %[[NEW_ST_SHADOW]] +; CHECK-NOT: store i64 +; The actual store. +; CHECK: store i80 %t, i80* %b +; CHECK: ret void + +; heapprof should not instrument functions with available_externally linkage. 
+define available_externally i32 @f_available_externally(i32* %a) { +entry: + %tmp1 = load i32, i32* %a + ret i32 %tmp1 +} +; CHECK-LABEL: @f_available_externally +; CHECK-NOT: __heapprof_shadow_memory_dynamic_address +; CHECK: ret i32 + +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind +declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1) nounwind +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1) nounwind + +define void @memintr_test(i8* %a, i8* %b) nounwind uwtable { + entry: + tail call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 100, i1 false) + tail call void @llvm.memmove.p0i8.p0i8.i64(i8* %a, i8* %b, i64 100, i1 false) + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 100, i1 false) + ret void +} + +; CHECK-LABEL: memintr_test +; CHECK: __heapprof_memset +; CHECK: __heapprof_memmove +; CHECK: __heapprof_memcpy +; CHECK: ret void + +declare void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* nocapture writeonly, i8, i64, i32) nounwind +declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind +declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind + +define void @memintr_element_atomic_test(i8* %a, i8* %b) nounwind uwtable { + ; This is a canary test to make sure that these don't get lowered into calls that don't + ; have the element-atomic property. Eventually, heapprof will have to be enhanced to lower + ; these properly. 
+ ; CHECK-LABEL: memintr_element_atomic_test + ; CHECK: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %a, i8 0, i64 100, i32 1) + ; CHECK: tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %a, i8* align 1 %b, i64 100, i32 1) + ; CHECK: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %a, i8* align 1 %b, i64 100, i32 1) + ; CHECK: ret void + tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %a, i8 0, i64 100, i32 1) + tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %a, i8* align 1 %b, i64 100, i32 1) + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %a, i8* align 1 %b, i64 100, i32 1) + ret void +} + + +; CHECK: define internal void @heapprof.module_ctor() +; CHECK: call void @__heapprof_init() diff --git a/llvm/test/Instrumentation/HeapProfiler/instrumentation-use-callbacks.ll b/llvm/test/Instrumentation/HeapProfiler/instrumentation-use-callbacks.ll new file mode 100644 index 0000000000000..9df3df47d3d0a --- /dev/null +++ b/llvm/test/Instrumentation/HeapProfiler/instrumentation-use-callbacks.ll @@ -0,0 +1,36 @@ +; Test heapprof internal compiler flags: +; -heapprof-use-callbacks +; -heapprof-memory-access-callback-prefix + +; RUN: opt < %s -heapprof -heapprof-module -heapprof-use-callbacks -S | FileCheck %s --check-prefix=CHECK-CALL --check-prefix=CHECK-CALL-DEFAULT +; RUN: opt < %s -heapprof -heapprof-module -heapprof-use-callbacks -heapprof-memory-access-callback-prefix=__foo_ -S | FileCheck %s --check-prefix=CHECK-CALL --check-prefix=CHECK-CALL-CUSTOM +; RUN: opt < %s -heapprof -heapprof-module -heapprof-use-callbacks=false -S | FileCheck %s --check-prefix=CHECK-INLINE +; RUN: opt < %s -heapprof -heapprof-module -S | FileCheck %s --check-prefix=CHECK-INLINE +target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-unknown-linux-gnu" + +define void @test_load(i32* %a, i64* %b, i512* %c, i80* %d) { +entry: +; CHECK-CALL: %[[LOAD_ADDR1:[^ ]*]] = ptrtoint i32* %a to i64 +; CHECK-CALL-DEFAULT: call void @__heapprof_load(i64 %[[LOAD_ADDR1]]) +; CHECK-CALL-CUSTOM: call void @__foo_load(i64 %[[LOAD_ADDR1]]) +; CHECK-CALL: %[[LOAD_ADDR2:[^ ]*]] = ptrtoint i64* %b to i64 +; CHECK-CALL-DEFAULT: call void @__heapprof_load(i64 %[[LOAD_ADDR2]]) +; CHECK-CALL-CUSTOM: call void @__foo_load(i64 %[[LOAD_ADDR2]]) +; CHECK-CALL: %[[LOAD_ADDR3:[^ ]*]] = ptrtoint i512* %c to i64 +; CHECK-CALL-DEFAULT: call void @__heapprof_load(i64 %[[LOAD_ADDR3]]) +; CHECK-CALL-CUSTOM: call void @__foo_load(i64 %[[LOAD_ADDR3]]) +; CHECK-CALL: %[[LOAD_ADDR4:[^ ]*]] = ptrtoint i80* %d to i64 +; CHECK-CALL-DEFAULT: call void @__heapprof_load(i64 %[[LOAD_ADDR4]]) +; CHECK-CALL-CUSTOM: call void @__foo_load(i64 %[[LOAD_ADDR4]]) +; CHECK-CALL-DEFAULT-NOT: call void @__heapprof_load +; CHECK-CALL-CUSTOM-NOT: call void @__foo_load +; CHECK-INLINE-NOT: call void @__heapprof_load + %tmp1 = load i32, i32* %a, align 4 + %tmp2 = load i64, i64* %b, align 8 + %tmp3 = load i512, i512* %c, align 32 + %tmp4 = load i80, i80* %d, align 8 + ret void +} + + diff --git a/llvm/test/Instrumentation/HeapProfiler/masked-load-store.ll b/llvm/test/Instrumentation/HeapProfiler/masked-load-store.ll new file mode 100644 index 0000000000000..fa493a454ef10 --- /dev/null +++ b/llvm/test/Instrumentation/HeapProfiler/masked-load-store.ll @@ -0,0 +1,246 @@ +; RUN: opt < %s -heapprof -heapprof-use-callbacks -S \ +; RUN: | FileCheck %s -check-prefix=LOAD -check-prefix=STORE -check-prefix=ALL +; RUN: opt < %s -heapprof -heapprof-use-callbacks -heapprof-instrument-reads=0 -S \ +; RUN: | FileCheck %s -check-prefix=NOLOAD -check-prefix=STORE -check-prefix=ALL +; RUN: opt < %s -heapprof 
-heapprof-use-callbacks -heapprof-instrument-writes=0 -S \ +; RUN: | FileCheck %s -check-prefix=LOAD -check-prefix=NOSTORE -check-prefix=ALL +; RUN: opt < %s -heapprof -heapprof-use-callbacks -heapprof-instrument-reads=0 -heapprof-instrument-writes=0 -S \ +; RUN: | FileCheck %s -check-prefix=NOLOAD -check-prefix=NOSTORE -check-prefix=ALL +; Support heap profiling instrumentation for constant-mask llvm.masked.{load,store} + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +@v4f32 = global <4 x float>* zeroinitializer, align 8 +@v8i32 = global <8 x i32>* zeroinitializer, align 8 +@v4i64 = global <4 x i32*>* zeroinitializer, align 8 + +;;;;;;;;;;;;;;;; STORE +declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>) argmemonly nounwind +declare void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>) argmemonly nounwind +declare void @llvm.masked.store.v4p0i32.p0v4p0i32(<4 x i32*>, <4 x i32*>*, i32, <4 x i1>) argmemonly nounwind + +define void @store.v4f32.1110(<4 x float> %arg) { +; ALL-LABEL: @store.v4f32.1110 + %p = load <4 x float>*, <4 x float>** @v4f32, align 8 +; NOSTORE-NOT: call void @__heapprof_store +; STORE: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 0 +; STORE: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP0]] to i64 +; STORE: call void @__heapprof_store(i64 [[PGEP0]]) +; STORE: [[GEP1:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 1 +; STORE: [[PGEP1:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP1]] to i64 +; STORE: call void @__heapprof_store(i64 [[PGEP1]]) +; STORE: [[GEP2:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 2 +; STORE: [[PGEP2:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP2]] to i64 +; STORE: call void @__heapprof_store(i64 [[PGEP2]]) +; STORE: tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> ) + tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> 
%arg, <4 x float>* %p, i32 4, <4 x i1> ) + ret void +} + +define void @store.v8i32.10010110(<8 x i32> %arg) { +; ALL-LABEL: @store.v8i32.10010110 + %p = load <8 x i32>*, <8 x i32>** @v8i32, align 8 +; NOSTORE-NOT: call void @__heapprof_store +; STORE: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 0 +; STORE: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP0]] to i64 +; STORE: call void @__heapprof_store(i64 [[PGEP0]]) +; STORE: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 3 +; STORE: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP3]] to i64 +; STORE: call void @__heapprof_store(i64 [[PGEP3]]) +; STORE: [[GEP5:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 5 +; STORE: [[PGEP5:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP5]] to i64 +; STORE: call void @__heapprof_store(i64 [[PGEP5]]) +; STORE: [[GEP6:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 6 +; STORE: [[PGEP6:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP6]] to i64 +; STORE: call void @__heapprof_store(i64 [[PGEP6]]) +; STORE: tail call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %arg, <8 x i32>* %p, i32 8, <8 x i1> ) + tail call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %arg, <8 x i32>* %p, i32 8, <8 x i1> ) + ret void +} + +define void @store.v4i64.0001(<4 x i32*> %arg) { +; ALL-LABEL: @store.v4i64.0001 + %p = load <4 x i32*>*, <4 x i32*>** @v4i64, align 8 +; NOSTORE-NOT: call void @__heapprof_store +; STORE: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x i32*>, <4 x i32*>* %p, i64 0, i64 3 +; STORE: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint i32** [[GEP3]] to i64 +; STORE: call void @__heapprof_store(i64 [[PGEP3]]) +; STORE: tail call void @llvm.masked.store.v4p0i32.p0v4p0i32(<4 x i32*> %arg, <4 x i32*>* %p, i32 8, <4 x i1> ) + tail call void @llvm.masked.store.v4p0i32.p0v4p0i32(<4 x i32*> %arg, <4 x i32*>* %p, i32 8, <4 x i1> ) + ret void +} + +define void @store.v4f32.variable(<4 x float> %arg, <4 x i1> %mask) { +; 
ALL-LABEL: @store.v4f32.variable + %p = load <4 x float>*, <4 x float>** @v4f32, align 8 +; STORE: [[MASK0:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 0 +; STORE: br i1 [[MASK0]], label %[[THEN0:[0-9A-Za-z]+]], label %[[AFTER0:[0-9A-Za-z]+]] +; STORE: [[THEN0]]: +; STORE: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 0 +; STORE: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP0]] to i64 +; STORE: call void @__heapprof_store(i64 [[PGEP0]]) +; STORE: br label %[[AFTER0]] +; STORE: [[AFTER0]]: + +; STORE: [[MASK1:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 1 +; STORE: br i1 [[MASK1]], label %[[THEN1:[0-9A-Za-z]+]], label %[[AFTER1:[0-9A-Za-z]+]] +; STORE: [[THEN1]]: +; STORE: [[GEP1:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 1 +; STORE: [[PGEP1:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP1]] to i64 +; STORE: call void @__heapprof_store(i64 [[PGEP1]]) +; STORE: br label %[[AFTER1]] +; STORE: [[AFTER1]]: + +; STORE: [[MASK2:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 2 +; STORE: br i1 [[MASK2]], label %[[THEN2:[0-9A-Za-z]+]], label %[[AFTER2:[0-9A-Za-z]+]] +; STORE: [[THEN2]]: +; STORE: [[GEP2:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 2 +; STORE: [[PGEP2:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP2]] to i64 +; STORE: call void @__heapprof_store(i64 [[PGEP2]]) +; STORE: br label %[[AFTER2]] +; STORE: [[AFTER2]]: + +; STORE: [[MASK3:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 3 +; STORE: br i1 [[MASK3]], label %[[THEN3:[0-9A-Za-z]+]], label %[[AFTER3:[0-9A-Za-z]+]] +; STORE: [[THEN3]]: +; STORE: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 3 +; STORE: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP3]] to i64 +; STORE: call void @__heapprof_store(i64 [[PGEP3]]) +; STORE: br label %[[AFTER3]] +; STORE: [[AFTER3]]: + +; STORE: tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x 
i1> %mask) + tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> %mask) + ret void +} + +;; Store using two masked.stores, which should instrument them both. +define void @store.v4f32.1010.split(<4 x float> %arg) { +; BOTH-LABEL: @store.v4f32.1010.split + %p = load <4 x float>*, <4 x float>** @v4f32, align 8 +; STORE: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 0 +; STORE: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP0]] to i64 +; STORE: call void @__heapprof_store(i64 [[PGEP0]]) +; STORE: tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> ) + tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> ) +; STORE: [[GEP1:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 2 +; STORE: [[PGEP1:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP1]] to i64 +; STORE: call void @__heapprof_store(i64 [[PGEP1]]) +; STORE: tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> ) + tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> ) + ret void +} + +;;;;;;;;;;;;;;;; LOAD +declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>) argmemonly nounwind +declare <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>) argmemonly nounwind +declare <4 x i32*> @llvm.masked.load.v4p0i32.p0v4p0i32(<4 x i32*>*, i32, <4 x i1>, <4 x i32*>) argmemonly nounwind + +define <8 x i32> @load.v8i32.11100001(<8 x i32> %arg) { +; ALL-LABEL: @load.v8i32.11100001 + %p = load <8 x i32>*, <8 x i32>** @v8i32, align 8 +; NOLOAD-NOT: call void @__heapprof_load +; LOAD: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 0 +; LOAD: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP0]] to i64 +; LOAD: call void @__heapprof_load(i64 [[PGEP0]]) +; LOAD: 
[[GEP1:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 1 +; LOAD: [[PGEP1:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP1]] to i64 +; LOAD: call void @__heapprof_load(i64 [[PGEP1]]) +; LOAD: [[GEP2:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 2 +; LOAD: [[PGEP2:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP2]] to i64 +; LOAD: call void @__heapprof_load(i64 [[PGEP2]]) +; LOAD: [[GEP7:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 7 +; LOAD: [[PGEP7:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP7]] to i64 +; LOAD: call void @__heapprof_load(i64 [[PGEP7]]) +; LOAD: tail call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %p, i32 8, <8 x i1> , <8 x i32> %arg) + %res = tail call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %p, i32 8, <8 x i1> , <8 x i32> %arg) + ret <8 x i32> %res +} + +define <4 x float> @load.v4f32.1001(<4 x float> %arg) { +; ALL-LABEL: @load.v4f32.1001 + %p = load <4 x float>*, <4 x float>** @v4f32, align 8 +; NOLOAD-NOT: call void @__heapprof_load +; LOAD: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 0 +; LOAD: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP0]] to i64 +; LOAD: call void @__heapprof_load(i64 [[PGEP0]]) +; LOAD: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 3 +; LOAD: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP3]] to i64 +; LOAD: call void @__heapprof_load(i64 [[PGEP3]]) +; LOAD: tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> , <4 x float> %arg) + %res = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> , <4 x float> %arg) + ret <4 x float> %res +} + +define <4 x i32*> @load.v4i64.0001(<4 x i32*> %arg) { +; ALL-LABEL: @load.v4i64.0001 + %p = load <4 x i32*>*, <4 x i32*>** @v4i64, align 8 +; NOLOAD-NOT: call void @__heapprof_load +; LOAD: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x i32*>, <4 x i32*>* %p, i64 0, i64 3 +; LOAD: 
[[PGEP3:%[0-9A-Za-z]+]] = ptrtoint i32** [[GEP3]] to i64 +; LOAD: call void @__heapprof_load(i64 [[PGEP3]]) +; LOAD: tail call <4 x i32*> @llvm.masked.load.v4p0i32.p0v4p0i32(<4 x i32*>* %p, i32 8, <4 x i1> , <4 x i32*> %arg) + %res = tail call <4 x i32*> @llvm.masked.load.v4p0i32.p0v4p0i32(<4 x i32*>* %p, i32 8, <4 x i1> , <4 x i32*> %arg) + ret <4 x i32*> %res +} + +define <4 x float> @load.v4f32.variable(<4 x float> %arg, <4 x i1> %mask) { +; ALL-LABEL: @load.v4f32.variable + %p = load <4 x float>*, <4 x float>** @v4f32, align 8 +; LOAD: [[MASK0:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 0 +; LOAD: br i1 [[MASK0]], label %[[THEN0:[0-9A-Za-z]+]], label %[[AFTER0:[0-9A-Za-z]+]] +; LOAD: [[THEN0]]: +; LOAD: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 0 +; LOAD: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP0]] to i64 +; LOAD: call void @__heapprof_load(i64 [[PGEP0]]) +; LOAD: br label %[[AFTER0]] +; LOAD: [[AFTER0]]: + +; LOAD: [[MASK1:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 1 +; LOAD: br i1 [[MASK1]], label %[[THEN1:[0-9A-Za-z]+]], label %[[AFTER1:[0-9A-Za-z]+]] +; LOAD: [[THEN1]]: +; LOAD: [[GEP1:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 1 +; LOAD: [[PGEP1:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP1]] to i64 +; LOAD: call void @__heapprof_load(i64 [[PGEP1]]) +; LOAD: br label %[[AFTER1]] +; LOAD: [[AFTER1]]: + +; LOAD: [[MASK2:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 2 +; LOAD: br i1 [[MASK2]], label %[[THEN2:[0-9A-Za-z]+]], label %[[AFTER2:[0-9A-Za-z]+]] +; LOAD: [[THEN2]]: +; LOAD: [[GEP2:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 2 +; LOAD: [[PGEP2:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP2]] to i64 +; LOAD: call void @__heapprof_load(i64 [[PGEP2]]) +; LOAD: br label %[[AFTER2]] +; LOAD: [[AFTER2]]: + +; LOAD: [[MASK3:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 3 +; LOAD: br i1 [[MASK3]], label %[[THEN3:[0-9A-Za-z]+]], label 
%[[AFTER3:[0-9A-Za-z]+]] +; LOAD: [[THEN3]]: +; LOAD: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 3 +; LOAD: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP3]] to i64 +; LOAD: call void @__heapprof_load(i64 [[PGEP3]]) +; LOAD: br label %[[AFTER3]] +; LOAD: [[AFTER3]]: + +; LOAD: tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> %mask, <4 x float> %arg) + %res = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> %mask, <4 x float> %arg) + ret <4 x float> %res +} + +;; Load using two masked.loads, which should instrument them both. +define <4 x float> @load.v4f32.1001.split(<4 x float> %arg) { +; BOTH-LABEL: @load.v4f32.1001 + %p = load <4 x float>*, <4 x float>** @v4f32, align 8 +; LOAD: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 0 +; LOAD: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP0]] to i64 +; LOAD: call void @__heapprof_load(i64 [[PGEP0]]) +; LOAD: %res = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> , <4 x float> %arg) + %res = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> , <4 x float> %arg) +; LOAD: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 3 +; LOAD: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP3]] to i64 +; LOAD: call void @__heapprof_load(i64 [[PGEP3]]) +; LOAD: tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> , <4 x float> %res) + %res2 = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> , <4 x float> %res) + ret <4 x float> %res2 +} diff --git a/llvm/test/Instrumentation/HeapProfiler/scale-granularity.ll b/llvm/test/Instrumentation/HeapProfiler/scale-granularity.ll new file mode 100644 index 0000000000000..c8c3a6d605db3 --- /dev/null +++ b/llvm/test/Instrumentation/HeapProfiler/scale-granularity.ll @@ -0,0 +1,29 
@@ +; Test that the scale (-heapprof-mapping-scale) and granularity (-heapprof-mapping-granularity) command-line options work as expected +; +; RUN: opt < %s -heapprof -heapprof-module -heapprof-mapping-granularity 32 -S | FileCheck --check-prefix=CHECK-GRAN %s +; RUN: opt < %s -heapprof -heapprof-module -heapprof-mapping-scale 1 -S | FileCheck --check-prefix=CHECK-SCALE %s +; RUN: opt < %s -heapprof -heapprof-module -heapprof-mapping-granularity 16 -heapprof-mapping-scale 0 -S | FileCheck --check-prefix=CHECK-BOTH %s +target triple = "x86_64-unknown-linux-gnu" + +define i32 @read(i32* %a) { +entry: + %tmp1 = load i32, i32* %a, align 4 + ret i32 %tmp1 +} +; CHECK-GRAN-LABEL: @read +; CHECK-GRAN-NOT: ret +; CHECK-GRAN: and {{.*}} -32 +; CHECK-GRAN-NEXT: lshr {{.*}} 3 +; CHECK-GRAN: ret + +; CHECK-SCALE-LABEL: @read +; CHECK-SCALE-NOT: ret +; CHECK-SCALE: and {{.*}} -64 +; CHECK-SCALE-NEXT: lshr {{.*}} 1 +; CHECK-SCALE: ret + +; CHECK-BOTH-LABEL: @read +; CHECK-BOTH-NOT: ret +; CHECK-BOTH: and {{.*}} -16 +; CHECK-BOTH-NEXT: lshr {{.*}} 0 +; CHECK-BOTH: ret diff --git a/llvm/test/Instrumentation/HeapProfiler/version-mismatch-check.ll b/llvm/test/Instrumentation/HeapProfiler/version-mismatch-check.ll new file mode 100644 index 0000000000000..84e039551d702 --- /dev/null +++ b/llvm/test/Instrumentation/HeapProfiler/version-mismatch-check.ll @@ -0,0 +1,12 @@ +; Check that the HeapProf module constructor guards against compiler/runtime version +; mismatch. 
+ +; RUN: opt < %s -heapprof-module -S | FileCheck %s +; RUN: opt < %s -heapprof-module -heapprof-guard-against-version-mismatch=0 -S | FileCheck %s --check-prefix=NOGUARD + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-unknown-linux-gnu" + +; CHECK-LABEL: define internal void @heapprof.module_ctor() +; CHECK: call void @__heapprof_version_mismatch_check_v1 +; NOGUARD-NOT: call void @__heapprof_version_mismatch_check_ diff --git a/llvm/test/LTO/ARM/lto-linking-metadata.ll b/llvm/test/LTO/ARM/lto-linking-metadata.ll index ae6f42ff9be82..75b65ac85bed8 100644 --- a/llvm/test/LTO/ARM/lto-linking-metadata.ll +++ b/llvm/test/LTO/ARM/lto-linking-metadata.ll @@ -1,7 +1,8 @@ ; RUN: opt %s -o %t1.bc -; RUN: llvm-lto %t1.bc -o %t1.save.opt -save-merged-module -O1 --exported-symbol=foo +; RUN: llvm-lto %t1.bc -o %t1.save.opt -save-linked-module -save-merged-module -O1 --exported-symbol=foo ; RUN: llvm-dis < %t1.save.opt.merged.bc | FileCheck %s +; RUN: llvm-dis < %t1.save.opt.linked.bc | FileCheck %s --check-prefix=CHECK-LINKED ; RUN: llvm-lto2 run %t1.bc -o %t.out.o -save-temps \ ; RUN: -r=%t1.bc,foo,pxl @@ -17,3 +18,6 @@ entry: ; CHECK: !llvm.module.flags = !{[[MD_NUM:![0-9]+]]} ; CHECK: [[MD_NUM]] = !{i32 1, !"LTOPostLink", i32 1} + +; CHECK-LINKED: @foo +; CHECK-LINKED-NOT: LTOPostLink diff --git a/llvm/test/LTO/Resolution/X86/dead-strip-fulllto.ll b/llvm/test/LTO/Resolution/X86/dead-strip-fulllto.ll index 43659313f2dc5..fb4fde7661b7d 100644 --- a/llvm/test/LTO/Resolution/X86/dead-strip-fulllto.ll +++ b/llvm/test/LTO/Resolution/X86/dead-strip-fulllto.ll @@ -1,7 +1,8 @@ ; RUN: opt -module-summary -o %t %s ; RUN: opt -module-summary -o %t2 %S/Inputs/dead-strip-fulllto.ll -; RUN: llvm-lto2 run --pass-remarks-output=%t4.yaml --pass-remarks-filter=. \ +; Adding '--pass-remarks-with-hotness' should not cause crash. 
+; RUN: llvm-lto2 run --pass-remarks-output=%t4.yaml --pass-remarks-filter=. --pass-remarks-with-hotness \ ; RUN: %t -r %t,main,px -r %t,live1, -r %t,live2,p -r %t,dead2,p \ ; RUN: %t2 -r %t2,live1,p -r %t2,live2, -r %t2,dead1,p -r %t2,dead2, -r %t2,odr, \ ; RUN: -save-temps -o %t3 diff --git a/llvm/test/LTO/X86/embed-bitcode.ll b/llvm/test/LTO/X86/embed-bitcode.ll index f57ea1fa32aef..151f27f55eefb 100644 --- a/llvm/test/LTO/X86/embed-bitcode.ll +++ b/llvm/test/LTO/X86/embed-bitcode.ll @@ -10,11 +10,11 @@ ; RUN: llvm-lto2 run -r %t1.o,_start,px -r %t2.o,foo,px -r %t3.o,bar,px -r %t2.o,bar,lx -lto-embed-bitcode -o %t3 %t1.o %t2.o %t3.o ; RUN: llvm-readelf -S %t3.0 | FileCheck %s --check-prefix=CHECK-ELF -; RUN: llvm-objcopy -O binary -j .llvmbc %t3.0 %t-embedded.bc +; RUN: llvm-objcopy --dump-section=.llvmbc=%t-embedded.bc %t3.0 /dev/null ; RUN: llvm-dis %t-embedded.bc -o - | FileCheck %s --check-prefix=CHECK-LL -; CHECK-ELF: .text -; CHECK-ELF: .llvmbc +; CHECK-ELF: .text PROGBITS 0000000000000000 [[#%x,OFF:]] [[#%x,SIZE:]] 00 AX 0 +; CHECK-ELF-NEXT: .llvmbc PROGBITS 0000000000000000 [[#%x,OFF:]] [[#%x,SIZE:]] 00 0 ; CHECK-LL: @_start ; CHECK-LL: @foo diff --git a/llvm/test/MC/AArch64/SVE/st1b.s b/llvm/test/MC/AArch64/SVE/st1b.s index a6f766bdfd7cc..40b830709ead4 100644 --- a/llvm/test/MC/AArch64/SVE/st1b.s +++ b/llvm/test/MC/AArch64/SVE/st1b.s @@ -168,3 +168,27 @@ st1b { z31.d }, p7, [z31.d, #31] // CHECK-ENCODING: [0xff,0xbf,0x5f,0xe4] // CHECK-ERROR: instruction requires: sve // CHECK-UNKNOWN: ff bf 5f e4 + +st1b { z0.s }, p7, [z0.s, #0] +// CHECK-INST: st1b { z0.s }, p7, [z0.s] +// CHECK-ENCODING: [0x00,0xbc,0x60,0xe4] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc 60 e4 + +st1b { z0.s }, p7, [z0.s] +// CHECK-INST: st1b { z0.s }, p7, [z0.s] +// CHECK-ENCODING: [0x00,0xbc,0x60,0xe4] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc 60 e4 + +st1b { z0.d }, p7, [z0.d, #0] +// CHECK-INST: st1b { z0.d }, p7, [z0.d] +// 
CHECK-ENCODING: [0x00,0xbc,0x40,0xe4] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc 40 e4 + +st1b { z0.d }, p7, [z0.d] +// CHECK-INST: st1b { z0.d }, p7, [z0.d] +// CHECK-ENCODING: [0x00,0xbc,0x40,0xe4] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc 40 e4 diff --git a/llvm/test/MC/AArch64/SVE/st1d.s b/llvm/test/MC/AArch64/SVE/st1d.s index ba4a0e5be114b..a5a19e772b528 100644 --- a/llvm/test/MC/AArch64/SVE/st1d.s +++ b/llvm/test/MC/AArch64/SVE/st1d.s @@ -78,3 +78,15 @@ st1d { z31.d }, p7, [z31.d, #248] // CHECK-ENCODING: [0xff,0xbf,0xdf,0xe5] // CHECK-ERROR: instruction requires: sve // CHECK-UNKNOWN: ff bf df e5 + +st1d { z0.d }, p7, [z0.d, #0] +// CHECK-INST: st1d { z0.d }, p7, [z0.d] +// CHECK-ENCODING: [0x00,0xbc,0xc0,0xe5] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc c0 e5 + +st1d { z0.d }, p7, [z0.d] +// CHECK-INST: st1d { z0.d }, p7, [z0.d] +// CHECK-ENCODING: [0x00,0xbc,0xc0,0xe5] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc c0 e5 diff --git a/llvm/test/MC/AArch64/SVE/st1h.s b/llvm/test/MC/AArch64/SVE/st1h.s index cd6c20d83482e..fe22c52bb9bef 100644 --- a/llvm/test/MC/AArch64/SVE/st1h.s +++ b/llvm/test/MC/AArch64/SVE/st1h.s @@ -168,3 +168,27 @@ st1h { z31.d }, p7, [z31.d, #62] // CHECK-ENCODING: [0xff,0xbf,0xdf,0xe4] // CHECK-ERROR: instruction requires: sve // CHECK-UNKNOWN: ff bf df e4 + +st1h { z0.s }, p7, [z0.s, #0] +// CHECK-INST: st1h { z0.s }, p7, [z0.s] +// CHECK-ENCODING: [0x00,0xbc,0xe0,0xe4] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc e0 e4 + +st1h { z0.s }, p7, [z0.s] +// CHECK-INST: st1h { z0.s }, p7, [z0.s] +// CHECK-ENCODING: [0x00,0xbc,0xe0,0xe4] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc e0 e4 + +st1h { z0.d }, p7, [z0.d, #0] +// CHECK-INST: st1h { z0.d }, p7, [z0.d] +// CHECK-ENCODING: [0x00,0xbc,0xc0,0xe4] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc c0 e4 + +st1h { z0.d }, p7, 
[z0.d] +// CHECK-INST: st1h { z0.d }, p7, [z0.d] +// CHECK-ENCODING: [0x00,0xbc,0xc0,0xe4] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc c0 e4 diff --git a/llvm/test/MC/AArch64/SVE/st1w.s b/llvm/test/MC/AArch64/SVE/st1w.s index e20194f5747e9..5bbcd2e1ea0ff 100644 --- a/llvm/test/MC/AArch64/SVE/st1w.s +++ b/llvm/test/MC/AArch64/SVE/st1w.s @@ -138,3 +138,27 @@ st1w { z31.d }, p7, [z31.d, #124] // CHECK-ENCODING: [0xff,0xbf,0x5f,0xe5] // CHECK-ERROR: instruction requires: sve // CHECK-UNKNOWN: ff bf 5f e5 + +st1w { z0.s }, p7, [z0.s, #0] +// CHECK-INST: st1w { z0.s }, p7, [z0.s] +// CHECK-ENCODING: [0x00,0xbc,0x60,0xe5] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc 60 e5 + +st1w { z0.s }, p7, [z0.s] +// CHECK-INST: st1w { z0.s }, p7, [z0.s] +// CHECK-ENCODING: [0x00,0xbc,0x60,0xe5] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc 60 e5 + +st1w { z0.d }, p7, [z0.d, #0] +// CHECK-INST: st1w { z0.d }, p7, [z0.d] +// CHECK-ENCODING: [0x00,0xbc,0x40,0xe5] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc 40 e5 + +st1w { z0.d }, p7, [z0.d] +// CHECK-INST: st1w { z0.d }, p7, [z0.d] +// CHECK-ENCODING: [0x00,0xbc,0x40,0xe5] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc 40 e5 diff --git a/llvm/test/MC/AArch64/seh.s b/llvm/test/MC/AArch64/seh.s index eb6a94a3f7377..878b7ad944cfb 100644 --- a/llvm/test/MC/AArch64/seh.s +++ b/llvm/test/MC/AArch64/seh.s @@ -1,6 +1,8 @@ -// This test checks that the SEH directives emit the correct unwind data. +// This test checks that the SEH directives don't cause the assembler to fail. +// Checking that llvm-readobj doesn't bail out on the unwind data, but not +// really checking the contents yet. 
-// RUN: llvm-mc -triple aarch64-pc-win32 -filetype=obj %s | llvm-readobj -S -r - | FileCheck %s +// RUN: llvm-mc -triple aarch64-pc-win32 -filetype=obj %s | llvm-readobj -S -r -u - | FileCheck %s // CHECK: Sections [ // CHECK: Section { @@ -15,7 +17,7 @@ // CHECK-NEXT: } // CHECK: Section { // CHECK: Name: .xdata -// CHECK: RawDataSize: 24 +// CHECK: RawDataSize: 20 // CHECK: RelocationCount: 1 // CHECK: Characteristics [ // CHECK-NEXT: ALIGN_4BYTES @@ -25,7 +27,7 @@ // CHECK-NEXT: } // CHECK: Section { // CHECK: Name: .pdata -// CHECK: RelocationCount: 6 +// CHECK: RelocationCount: 2 // CHECK: Characteristics [ // CHECK-NEXT: ALIGN_4BYTES // CHECK-NEXT: CNT_INITIALIZED_DATA @@ -41,10 +43,6 @@ // CHECK-NEXT: Section (5) .pdata { // CHECK-NEXT: 0x0 IMAGE_REL_ARM64_ADDR32NB func // CHECK-NEXT: 0x4 IMAGE_REL_ARM64_ADDR32NB .xdata -// CHECK-NEXT: 0x8 IMAGE_REL_ARM64_ADDR32NB func -// CHECK-NEXT: 0xC IMAGE_REL_ARM64_ADDR32NB .xdata -// CHECK-NEXT: 0x10 IMAGE_REL_ARM64_ADDR32NB smallFunc -// CHECK-NEXT: 0x14 IMAGE_REL_ARM64_ADDR32NB .xdata // CHECK-NEXT: } // CHECK-NEXT: ] @@ -65,14 +63,12 @@ func: .seh_handlerdata .long 0 .text - .seh_startchained - .seh_endprologue - .seh_endchained add sp, sp, #24 ret .seh_endproc -// Test emission of small functions. + // Function with no .seh directives; no pdata/xdata entries are + // generated. .globl smallFunc .def smallFunc .scl 2 @@ -82,3 +78,21 @@ func: smallFunc: ret .seh_endproc + + // Function with no .seh directives, but with .seh_handlerdata. + // No xdata/pdata entries are generated, but the custom handler data + // (the .long after .seh_handlerdata) is left orphaned in the xdata + // section. 
+ .globl handlerFunc + .def handlerFunc + .scl 2 + .type 32 + .endef + .seh_proc handlerFunc +handlerFunc: + ret + .seh_handler __C_specific_handler, @except + .seh_handlerdata + .long 0 + .text + .seh_endproc diff --git a/llvm/test/MC/RISCV/rvv/zvamo.s b/llvm/test/MC/RISCV/rvv/zvamo.s new file mode 100644 index 0000000000000..8c38ff8e1a189 --- /dev/null +++ b/llvm/test/MC/RISCV/rvv/zvamo.s @@ -0,0 +1,874 @@ +# RUN: llvm-mc -triple=riscv64 -show-encoding --mattr=+a,+experimental-zvamo %s \ +# RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +# RUN: not llvm-mc -triple=riscv64 -show-encoding %s 2>&1 \ +# RUN: | FileCheck %s --check-prefix=CHECK-ERROR +# RUN: llvm-mc -triple=riscv64 -filetype=obj --mattr=+a,+experimental-zvamo %s \ +# RUN: | llvm-objdump -d --mattr=+a,+experimental-zvamo - \ +# RUN: | FileCheck %s --check-prefix=CHECK-INST +# RUN: llvm-mc -triple=riscv64 -filetype=obj --mattr=+a,+experimental-zvamo %s \ +# RUN: | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN + + +vamoswapei8.v v8, (a0), v4, v8 +# CHECK-INST: vamoswapei8.v v8, (a0), v4, v8 +# CHECK-ENCODING: [0x2f,0x04,0x45,0x0e] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 04 45 0e + +vamoswapei16.v v8, (a0), v4, v8 +# CHECK-INST: vamoswapei16.v v8, (a0), v4, v8 +# CHECK-ENCODING: [0x2f,0x54,0x45,0x0e] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 54 45 0e + +vamoswapei32.v v8, (a0), v4, v8 +# CHECK-INST: vamoswapei32.v v8, (a0), v4, v8 +# CHECK-ENCODING: [0x2f,0x64,0x45,0x0e] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 64 45 0e + +vamoswapei64.v v8, (a0), v4, v8 +# CHECK-INST: vamoswapei64.v v8, (a0), v4, v8 +# CHECK-ENCODING: [0x2f,0x74,0x45,0x0e] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic 
Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 74 45 0e + +vamoswapei8.v v8, (a0), v4, v8, v0.t +# CHECK-INST: vamoswapei8.v v8, (a0), v4, v8, v0.t +# CHECK-ENCODING: [0x2f,0x04,0x45,0x0c] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 04 45 0c + +vamoswapei16.v v8, (a0), v4, v8, v0.t +# CHECK-INST: vamoswapei16.v v8, (a0), v4, v8, v0.t +# CHECK-ENCODING: [0x2f,0x54,0x45,0x0c] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 54 45 0c + +vamoswapei32.v v8, (a0), v4, v8, v0.t +# CHECK-INST: vamoswapei32.v v8, (a0), v4, v8, v0.t +# CHECK-ENCODING: [0x2f,0x64,0x45,0x0c] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 64 45 0c + +vamoswapei64.v v8, (a0), v4, v8, v0.t +# CHECK-INST: vamoswapei64.v v8, (a0), v4, v8, v0.t +# CHECK-ENCODING: [0x2f,0x74,0x45,0x0c] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 74 45 0c + +vamoaddei8.v v8, (a0), v4, v8 +# CHECK-INST: vamoaddei8.v v8, (a0), v4, v8 +# CHECK-ENCODING: [0x2f,0x04,0x45,0x06] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 04 45 06 + +vamoaddei16.v v8, (a0), v4, v8 +# CHECK-INST: vamoaddei16.v v8, (a0), v4, v8 +# CHECK-ENCODING: [0x2f,0x54,0x45,0x06] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 54 45 06 + +vamoaddei32.v v8, (a0), v4, v8 +# CHECK-INST: vamoaddei32.v v8, (a0), v4, v8 +# CHECK-ENCODING: [0x2f,0x64,0x45,0x06] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 64 45 06 + +vamoaddei64.v v8, (a0), v4, v8 +# 
CHECK-INST: vamoaddei64.v v8, (a0), v4, v8 +# CHECK-ENCODING: [0x2f,0x74,0x45,0x06] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 74 45 06 + +vamoaddei8.v v8, (a0), v4, v8, v0.t +# CHECK-INST: vamoaddei8.v v8, (a0), v4, v8, v0.t +# CHECK-ENCODING: [0x2f,0x04,0x45,0x04] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 04 45 04 + +vamoaddei16.v v8, (a0), v4, v8, v0.t +# CHECK-INST: vamoaddei16.v v8, (a0), v4, v8, v0.t +# CHECK-ENCODING: [0x2f,0x54,0x45,0x04] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 54 45 04 + +vamoaddei32.v v8, (a0), v4, v8, v0.t +# CHECK-INST: vamoaddei32.v v8, (a0), v4, v8, v0.t +# CHECK-ENCODING: [0x2f,0x64,0x45,0x04] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 64 45 04 + +vamoaddei64.v v8, (a0), v4, v8, v0.t +# CHECK-INST: vamoaddei64.v v8, (a0), v4, v8, v0.t +# CHECK-ENCODING: [0x2f,0x74,0x45,0x04] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 74 45 04 + +vamoxorei8.v v8, (a0), v4, v8 +# CHECK-INST: vamoxorei8.v v8, (a0), v4, v8 +# CHECK-ENCODING: [0x2f,0x04,0x45,0x26] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 04 45 26 + +vamoxorei16.v v8, (a0), v4, v8 +# CHECK-INST: vamoxorei16.v v8, (a0), v4, v8 +# CHECK-ENCODING: [0x2f,0x54,0x45,0x26] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 54 45 26 + +vamoxorei32.v v8, (a0), v4, v8 +# CHECK-INST: vamoxorei32.v v8, (a0), v4, v8 +# CHECK-ENCODING: [0x2f,0x64,0x45,0x26] +# CHECK-ERROR: instruction requires the 
following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 64 45 26 + +vamoxorei64.v v8, (a0), v4, v8 +# CHECK-INST: vamoxorei64.v v8, (a0), v4, v8 +# CHECK-ENCODING: [0x2f,0x74,0x45,0x26] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 74 45 26 + +vamoxorei8.v v8, (a0), v4, v8, v0.t +# CHECK-INST: vamoxorei8.v v8, (a0), v4, v8, v0.t +# CHECK-ENCODING: [0x2f,0x04,0x45,0x24] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 04 45 24 + +vamoxorei16.v v8, (a0), v4, v8, v0.t +# CHECK-INST: vamoxorei16.v v8, (a0), v4, v8, v0.t +# CHECK-ENCODING: [0x2f,0x54,0x45,0x24] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 54 45 24 + +vamoxorei32.v v8, (a0), v4, v8, v0.t +# CHECK-INST: vamoxorei32.v v8, (a0), v4, v8, v0.t +# CHECK-ENCODING: [0x2f,0x64,0x45,0x24] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 64 45 24 + +vamoxorei64.v v8, (a0), v4, v8, v0.t +# CHECK-INST: vamoxorei64.v v8, (a0), v4, v8, v0.t +# CHECK-ENCODING: [0x2f,0x74,0x45,0x24] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 74 45 24 + +vamoandei8.v v8, (a0), v4, v8 +# CHECK-INST: vamoandei8.v v8, (a0), v4, v8 +# CHECK-ENCODING: [0x2f,0x04,0x45,0x66] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 04 45 66 + +vamoandei16.v v8, (a0), v4, v8 +# CHECK-INST: vamoandei16.v v8, (a0), v4, v8 +# CHECK-ENCODING: [0x2f,0x54,0x45,0x66] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 54 45 66 + +vamoandei32.v v8, (a0), 
v4, v8 +# CHECK-INST: vamoandei32.v v8, (a0), v4, v8 +# CHECK-ENCODING: [0x2f,0x64,0x45,0x66] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 64 45 66 + +vamoandei64.v v8, (a0), v4, v8 +# CHECK-INST: vamoandei64.v v8, (a0), v4, v8 +# CHECK-ENCODING: [0x2f,0x74,0x45,0x66] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 74 45 66 + +vamoandei8.v v8, (a0), v4, v8, v0.t +# CHECK-INST: vamoandei8.v v8, (a0), v4, v8, v0.t +# CHECK-ENCODING: [0x2f,0x04,0x45,0x64] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 04 45 64 + +vamoandei16.v v8, (a0), v4, v8, v0.t +# CHECK-INST: vamoandei16.v v8, (a0), v4, v8, v0.t +# CHECK-ENCODING: [0x2f,0x54,0x45,0x64] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 54 45 64 + +vamoandei32.v v8, (a0), v4, v8, v0.t +# CHECK-INST: vamoandei32.v v8, (a0), v4, v8, v0.t +# CHECK-ENCODING: [0x2f,0x64,0x45,0x64] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 64 45 64 + +vamoandei64.v v8, (a0), v4, v8, v0.t +# CHECK-INST: vamoandei64.v v8, (a0), v4, v8, v0.t +# CHECK-ENCODING: [0x2f,0x74,0x45,0x64] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 74 45 64 + +vamoorei8.v v8, (a0), v4, v8 +# CHECK-INST: vamoorei8.v v8, (a0), v4, v8 +# CHECK-ENCODING: [0x2f,0x04,0x45,0x46] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 04 45 46 + +vamoorei16.v v8, (a0), v4, v8 +# CHECK-INST: vamoorei16.v v8, (a0), v4, v8 +# CHECK-ENCODING: [0x2f,0x54,0x45,0x46] +# CHECK-ERROR: instruction requires 
the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 54 45 46 + +vamoorei32.v v8, (a0), v4, v8 +# CHECK-INST: vamoorei32.v v8, (a0), v4, v8 +# CHECK-ENCODING: [0x2f,0x64,0x45,0x46] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 64 45 46 + +vamoorei64.v v8, (a0), v4, v8 +# CHECK-INST: vamoorei64.v v8, (a0), v4, v8 +# CHECK-ENCODING: [0x2f,0x74,0x45,0x46] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 74 45 46 + +vamoorei8.v v8, (a0), v4, v8, v0.t +# CHECK-INST: vamoorei8.v v8, (a0), v4, v8, v0.t +# CHECK-ENCODING: [0x2f,0x04,0x45,0x44] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 04 45 44 + +vamoorei16.v v8, (a0), v4, v8, v0.t +# CHECK-INST: vamoorei16.v v8, (a0), v4, v8, v0.t +# CHECK-ENCODING: [0x2f,0x54,0x45,0x44] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 54 45 44 + +vamoorei32.v v8, (a0), v4, v8, v0.t +# CHECK-INST: vamoorei32.v v8, (a0), v4, v8, v0.t +# CHECK-ENCODING: [0x2f,0x64,0x45,0x44] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 64 45 44 + +vamoorei64.v v8, (a0), v4, v8, v0.t +# CHECK-INST: vamoorei64.v v8, (a0), v4, v8, v0.t +# CHECK-ENCODING: [0x2f,0x74,0x45,0x44] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 74 45 44 + +vamominei8.v v8, (a0), v4, v8 +# CHECK-INST: vamominei8.v v8, (a0), v4, v8 +# CHECK-ENCODING: [0x2f,0x04,0x45,0x86] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 04 45 86 + +vamominei16.v v8, (a0), v4, v8 +# 
CHECK-INST: vamominei16.v v8, (a0), v4, v8 +# CHECK-ENCODING: [0x2f,0x54,0x45,0x86] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 54 45 86 + +vamominei32.v v8, (a0), v4, v8 +# CHECK-INST: vamominei32.v v8, (a0), v4, v8 +# CHECK-ENCODING: [0x2f,0x64,0x45,0x86] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 64 45 86 + +vamominei64.v v8, (a0), v4, v8 +# CHECK-INST: vamominei64.v v8, (a0), v4, v8 +# CHECK-ENCODING: [0x2f,0x74,0x45,0x86] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 74 45 86 + +vamominei8.v v8, (a0), v4, v8, v0.t +# CHECK-INST: vamominei8.v v8, (a0), v4, v8, v0.t +# CHECK-ENCODING: [0x2f,0x04,0x45,0x84] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 04 45 84 + +vamominei16.v v8, (a0), v4, v8, v0.t +# CHECK-INST: vamominei16.v v8, (a0), v4, v8, v0.t +# CHECK-ENCODING: [0x2f,0x54,0x45,0x84] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 54 45 84 + +vamominei32.v v8, (a0), v4, v8, v0.t +# CHECK-INST: vamominei32.v v8, (a0), v4, v8, v0.t +# CHECK-ENCODING: [0x2f,0x64,0x45,0x84] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 64 45 84 + +vamominei64.v v8, (a0), v4, v8, v0.t +# CHECK-INST: vamominei64.v v8, (a0), v4, v8, v0.t +# CHECK-ENCODING: [0x2f,0x74,0x45,0x84] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 74 45 84 + +vamomaxei8.v v8, (a0), v4, v8 +# CHECK-INST: vamomaxei8.v v8, (a0), v4, v8 +# CHECK-ENCODING: [0x2f,0x04,0x45,0xa6] +# CHECK-ERROR: instruction requires the 
following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 04 45 a6 + +vamomaxei16.v v8, (a0), v4, v8 +# CHECK-INST: vamomaxei16.v v8, (a0), v4, v8 +# CHECK-ENCODING: [0x2f,0x54,0x45,0xa6] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 54 45 a6 + +vamomaxei32.v v8, (a0), v4, v8 +# CHECK-INST: vamomaxei32.v v8, (a0), v4, v8 +# CHECK-ENCODING: [0x2f,0x64,0x45,0xa6] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 64 45 a6 + +vamomaxei64.v v8, (a0), v4, v8 +# CHECK-INST: vamomaxei64.v v8, (a0), v4, v8 +# CHECK-ENCODING: [0x2f,0x74,0x45,0xa6] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 74 45 a6 + +vamomaxei8.v v8, (a0), v4, v8, v0.t +# CHECK-INST: vamomaxei8.v v8, (a0), v4, v8, v0.t +# CHECK-ENCODING: [0x2f,0x04,0x45,0xa4] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 04 45 a4 + +vamomaxei16.v v8, (a0), v4, v8, v0.t +# CHECK-INST: vamomaxei16.v v8, (a0), v4, v8, v0.t +# CHECK-ENCODING: [0x2f,0x54,0x45,0xa4] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 54 45 a4 + +vamomaxei32.v v8, (a0), v4, v8, v0.t +# CHECK-INST: vamomaxei32.v v8, (a0), v4, v8, v0.t +# CHECK-ENCODING: [0x2f,0x64,0x45,0xa4] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 64 45 a4 + +vamomaxei64.v v8, (a0), v4, v8, v0.t +# CHECK-INST: vamomaxei64.v v8, (a0), v4, v8, v0.t +# CHECK-ENCODING: [0x2f,0x74,0x45,0xa4] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 74 45 a4 + +vamominuei8.v v8, (a0), 
v4, v8 +# CHECK-INST: vamominuei8.v v8, (a0), v4, v8 +# CHECK-ENCODING: [0x2f,0x04,0x45,0xc6] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 04 45 c6 + +vamominuei16.v v8, (a0), v4, v8 +# CHECK-INST: vamominuei16.v v8, (a0), v4, v8 +# CHECK-ENCODING: [0x2f,0x54,0x45,0xc6] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 54 45 c6 + +vamominuei32.v v8, (a0), v4, v8 +# CHECK-INST: vamominuei32.v v8, (a0), v4, v8 +# CHECK-ENCODING: [0x2f,0x64,0x45,0xc6] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 64 45 c6 + +vamominuei64.v v8, (a0), v4, v8 +# CHECK-INST: vamominuei64.v v8, (a0), v4, v8 +# CHECK-ENCODING: [0x2f,0x74,0x45,0xc6] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 74 45 c6 + +vamominuei8.v v8, (a0), v4, v8, v0.t +# CHECK-INST: vamominuei8.v v8, (a0), v4, v8, v0.t +# CHECK-ENCODING: [0x2f,0x04,0x45,0xc4] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 04 45 c4 + +vamominuei16.v v8, (a0), v4, v8, v0.t +# CHECK-INST: vamominuei16.v v8, (a0), v4, v8, v0.t +# CHECK-ENCODING: [0x2f,0x54,0x45,0xc4] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 54 45 c4 + +vamominuei32.v v8, (a0), v4, v8, v0.t +# CHECK-INST: vamominuei32.v v8, (a0), v4, v8, v0.t +# CHECK-ENCODING: [0x2f,0x64,0x45,0xc4] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 64 45 c4 + +vamominuei64.v v8, (a0), v4, v8, v0.t +# CHECK-INST: vamominuei64.v v8, (a0), v4, v8, v0.t +# CHECK-ENCODING: [0x2f,0x74,0x45,0xc4] +# CHECK-ERROR: 
instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 74 45 c4 + +vamomaxuei8.v v8, (a0), v4, v8 +# CHECK-INST: vamomaxuei8.v v8, (a0), v4, v8 +# CHECK-ENCODING: [0x2f,0x04,0x45,0xe6] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 04 45 e6 + +vamomaxuei16.v v8, (a0), v4, v8 +# CHECK-INST: vamomaxuei16.v v8, (a0), v4, v8 +# CHECK-ENCODING: [0x2f,0x54,0x45,0xe6] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 54 45 e6 + +vamomaxuei32.v v8, (a0), v4, v8 +# CHECK-INST: vamomaxuei32.v v8, (a0), v4, v8 +# CHECK-ENCODING: [0x2f,0x64,0x45,0xe6] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 64 45 e6 + +vamomaxuei64.v v8, (a0), v4, v8 +# CHECK-INST: vamomaxuei64.v v8, (a0), v4, v8 +# CHECK-ENCODING: [0x2f,0x74,0x45,0xe6] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 74 45 e6 + +vamomaxuei8.v v8, (a0), v4, v8, v0.t +# CHECK-INST: vamomaxuei8.v v8, (a0), v4, v8, v0.t +# CHECK-ENCODING: [0x2f,0x04,0x45,0xe4] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 04 45 e4 + +vamomaxuei16.v v8, (a0), v4, v8, v0.t +# CHECK-INST: vamomaxuei16.v v8, (a0), v4, v8, v0.t +# CHECK-ENCODING: [0x2f,0x54,0x45,0xe4] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 54 45 e4 + +vamomaxuei32.v v8, (a0), v4, v8, v0.t +# CHECK-INST: vamomaxuei32.v v8, (a0), v4, v8, v0.t +# CHECK-ENCODING: [0x2f,0x64,0x45,0xe4] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 64 45 e4 + 
+vamomaxuei64.v v8, (a0), v4, v8, v0.t +# CHECK-INST: vamomaxuei64.v v8, (a0), v4, v8, v0.t +# CHECK-ENCODING: [0x2f,0x74,0x45,0xe4] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 74 45 e4 + +vamoswapei8.v x0, (a0), v4, v24 +# CHECK-INST: vamoswapei8.v x0, (a0), v4, v24 +# CHECK-ENCODING: [0x2f,0x0c,0x45,0x0a] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 0c 45 0a + +vamoswapei16.v x0, (a0), v4, v24 +# CHECK-INST: vamoswapei16.v x0, (a0), v4, v24 +# CHECK-ENCODING: [0x2f,0x5c,0x45,0x0a] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 5c 45 0a + +vamoswapei32.v x0, (a0), v4, v24 +# CHECK-INST: vamoswapei32.v x0, (a0), v4, v24 +# CHECK-ENCODING: [0x2f,0x6c,0x45,0x0a] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 6c 45 0a + +vamoswapei64.v x0, (a0), v4, v24 +# CHECK-INST: vamoswapei64.v x0, (a0), v4, v24 +# CHECK-ENCODING: [0x2f,0x7c,0x45,0x0a] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 7c 45 0a + +vamoswapei8.v x0, (a0), v4, v24, v0.t +# CHECK-INST: vamoswapei8.v x0, (a0), v4, v24, v0.t +# CHECK-ENCODING: [0x2f,0x0c,0x45,0x08] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 0c 45 08 + +vamoswapei16.v x0, (a0), v4, v24, v0.t +# CHECK-INST: vamoswapei16.v x0, (a0), v4, v24, v0.t +# CHECK-ENCODING: [0x2f,0x5c,0x45,0x08] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 5c 45 08 + +vamoswapei32.v x0, (a0), v4, v24, v0.t +# CHECK-INST: vamoswapei32.v x0, (a0), v4, v24, v0.t +# CHECK-ENCODING: 
[0x2f,0x6c,0x45,0x08] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 6c 45 08 + +vamoswapei64.v x0, (a0), v4, v24, v0.t +# CHECK-INST: vamoswapei64.v x0, (a0), v4, v24, v0.t +# CHECK-ENCODING: [0x2f,0x7c,0x45,0x08] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 7c 45 08 + +vamoaddei8.v x0, (a0), v4, v24 +# CHECK-INST: vamoaddei8.v x0, (a0), v4, v24 +# CHECK-ENCODING: [0x2f,0x0c,0x45,0x02] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 0c 45 02 + +vamoaddei16.v x0, (a0), v4, v24 +# CHECK-INST: vamoaddei16.v x0, (a0), v4, v24 +# CHECK-ENCODING: [0x2f,0x5c,0x45,0x02] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 5c 45 02 + +vamoaddei32.v x0, (a0), v4, v24 +# CHECK-INST: vamoaddei32.v x0, (a0), v4, v24 +# CHECK-ENCODING: [0x2f,0x6c,0x45,0x02] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 6c 45 02 + +vamoaddei64.v x0, (a0), v4, v24 +# CHECK-INST: vamoaddei64.v x0, (a0), v4, v24 +# CHECK-ENCODING: [0x2f,0x7c,0x45,0x02] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 7c 45 02 + +vamoaddei8.v x0, (a0), v4, v24, v0.t +# CHECK-INST: vamoaddei8.v x0, (a0), v4, v24, v0.t +# CHECK-ENCODING: [0x2f,0x0c,0x45,0x00] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 0c 45 00 + +vamoaddei16.v x0, (a0), v4, v24, v0.t +# CHECK-INST: vamoaddei16.v x0, (a0), v4, v24, v0.t +# CHECK-ENCODING: [0x2f,0x5c,0x45,0x00] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO 
Operations) +# CHECK-UNKNOWN: 2f 5c 45 00 + +vamoaddei32.v x0, (a0), v4, v24, v0.t +# CHECK-INST: vamoaddei32.v x0, (a0), v4, v24, v0.t +# CHECK-ENCODING: [0x2f,0x6c,0x45,0x00] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 6c 45 00 + +vamoaddei64.v x0, (a0), v4, v24, v0.t +# CHECK-INST: vamoaddei64.v x0, (a0), v4, v24, v0.t +# CHECK-ENCODING: [0x2f,0x7c,0x45,0x00] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 7c 45 00 + +vamoxorei8.v x0, (a0), v4, v24 +# CHECK-INST: vamoxorei8.v x0, (a0), v4, v24 +# CHECK-ENCODING: [0x2f,0x0c,0x45,0x22] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 0c 45 22 + +vamoxorei16.v x0, (a0), v4, v24 +# CHECK-INST: vamoxorei16.v x0, (a0), v4, v24 +# CHECK-ENCODING: [0x2f,0x5c,0x45,0x22] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 5c 45 22 + +vamoxorei32.v x0, (a0), v4, v24 +# CHECK-INST: vamoxorei32.v x0, (a0), v4, v24 +# CHECK-ENCODING: [0x2f,0x6c,0x45,0x22] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 6c 45 22 + +vamoxorei64.v x0, (a0), v4, v24 +# CHECK-INST: vamoxorei64.v x0, (a0), v4, v24 +# CHECK-ENCODING: [0x2f,0x7c,0x45,0x22] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 7c 45 22 + +vamoxorei8.v x0, (a0), v4, v24, v0.t +# CHECK-INST: vamoxorei8.v x0, (a0), v4, v24, v0.t +# CHECK-ENCODING: [0x2f,0x0c,0x45,0x20] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 0c 45 20 + +vamoxorei16.v x0, (a0), v4, v24, v0.t +# CHECK-INST: vamoxorei16.v x0, (a0), v4, 
v24, v0.t +# CHECK-ENCODING: [0x2f,0x5c,0x45,0x20] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 5c 45 20 + +vamoxorei32.v x0, (a0), v4, v24, v0.t +# CHECK-INST: vamoxorei32.v x0, (a0), v4, v24, v0.t +# CHECK-ENCODING: [0x2f,0x6c,0x45,0x20] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 6c 45 20 + +vamoxorei64.v x0, (a0), v4, v24, v0.t +# CHECK-INST: vamoxorei64.v x0, (a0), v4, v24, v0.t +# CHECK-ENCODING: [0x2f,0x7c,0x45,0x20] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 7c 45 20 + +vamoandei8.v x0, (a0), v4, v24 +# CHECK-INST: vamoandei8.v x0, (a0), v4, v24 +# CHECK-ENCODING: [0x2f,0x0c,0x45,0x62] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 0c 45 62 + +vamoandei16.v x0, (a0), v4, v24 +# CHECK-INST: vamoandei16.v x0, (a0), v4, v24 +# CHECK-ENCODING: [0x2f,0x5c,0x45,0x62] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 5c 45 62 + +vamoandei32.v x0, (a0), v4, v24 +# CHECK-INST: vamoandei32.v x0, (a0), v4, v24 +# CHECK-ENCODING: [0x2f,0x6c,0x45,0x62] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 6c 45 62 + +vamoandei64.v x0, (a0), v4, v24 +# CHECK-INST: vamoandei64.v x0, (a0), v4, v24 +# CHECK-ENCODING: [0x2f,0x7c,0x45,0x62] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 7c 45 62 + +vamoandei8.v x0, (a0), v4, v24, v0.t +# CHECK-INST: vamoandei8.v x0, (a0), v4, v24, v0.t +# CHECK-ENCODING: [0x2f,0x0c,0x45,0x60] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic 
Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 0c 45 60 + +vamoandei16.v x0, (a0), v4, v24, v0.t +# CHECK-INST: vamoandei16.v x0, (a0), v4, v24, v0.t +# CHECK-ENCODING: [0x2f,0x5c,0x45,0x60] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 5c 45 60 + +vamoandei32.v x0, (a0), v4, v24, v0.t +# CHECK-INST: vamoandei32.v x0, (a0), v4, v24, v0.t +# CHECK-ENCODING: [0x2f,0x6c,0x45,0x60] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 6c 45 60 + +vamoandei64.v x0, (a0), v4, v24, v0.t +# CHECK-INST: vamoandei64.v x0, (a0), v4, v24, v0.t +# CHECK-ENCODING: [0x2f,0x7c,0x45,0x60] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 7c 45 60 + +vamoorei8.v x0, (a0), v4, v24 +# CHECK-INST: vamoorei8.v x0, (a0), v4, v24 +# CHECK-ENCODING: [0x2f,0x0c,0x45,0x42] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 0c 45 42 + +vamoorei16.v x0, (a0), v4, v24 +# CHECK-INST: vamoorei16.v x0, (a0), v4, v24 +# CHECK-ENCODING: [0x2f,0x5c,0x45,0x42] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 5c 45 42 + +vamoorei32.v x0, (a0), v4, v24 +# CHECK-INST: vamoorei32.v x0, (a0), v4, v24 +# CHECK-ENCODING: [0x2f,0x6c,0x45,0x42] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 6c 45 42 + +vamoorei64.v x0, (a0), v4, v24 +# CHECK-INST: vamoorei64.v x0, (a0), v4, v24 +# CHECK-ENCODING: [0x2f,0x7c,0x45,0x42] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 7c 45 42 + +vamoorei8.v x0, (a0), v4, v24, v0.t +# CHECK-INST: 
vamoorei8.v x0, (a0), v4, v24, v0.t +# CHECK-ENCODING: [0x2f,0x0c,0x45,0x40] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 0c 45 40 + +vamoorei16.v x0, (a0), v4, v24, v0.t +# CHECK-INST: vamoorei16.v x0, (a0), v4, v24, v0.t +# CHECK-ENCODING: [0x2f,0x5c,0x45,0x40] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 5c 45 40 + +vamoorei32.v x0, (a0), v4, v24, v0.t +# CHECK-INST: vamoorei32.v x0, (a0), v4, v24, v0.t +# CHECK-ENCODING: [0x2f,0x6c,0x45,0x40] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 6c 45 40 + +vamoorei64.v x0, (a0), v4, v24, v0.t +# CHECK-INST: vamoorei64.v x0, (a0), v4, v24, v0.t +# CHECK-ENCODING: [0x2f,0x7c,0x45,0x40] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 7c 45 40 + +vamominei8.v x0, (a0), v4, v24 +# CHECK-INST: vamominei8.v x0, (a0), v4, v24 +# CHECK-ENCODING: [0x2f,0x0c,0x45,0x82] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 0c 45 82 + +vamominei16.v x0, (a0), v4, v24 +# CHECK-INST: vamominei16.v x0, (a0), v4, v24 +# CHECK-ENCODING: [0x2f,0x5c,0x45,0x82] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 5c 45 82 + +vamominei32.v x0, (a0), v4, v24 +# CHECK-INST: vamominei32.v x0, (a0), v4, v24 +# CHECK-ENCODING: [0x2f,0x6c,0x45,0x82] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 6c 45 82 + +vamominei64.v x0, (a0), v4, v24 +# CHECK-INST: vamominei64.v x0, (a0), v4, v24 +# CHECK-ENCODING: [0x2f,0x7c,0x45,0x82] +# CHECK-ERROR: instruction requires the following: 
'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 7c 45 82 + +vamominei8.v x0, (a0), v4, v24, v0.t +# CHECK-INST: vamominei8.v x0, (a0), v4, v24, v0.t +# CHECK-ENCODING: [0x2f,0x0c,0x45,0x80] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 0c 45 80 + +vamominei16.v x0, (a0), v4, v24, v0.t +# CHECK-INST: vamominei16.v x0, (a0), v4, v24, v0.t +# CHECK-ENCODING: [0x2f,0x5c,0x45,0x80] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 5c 45 80 + +vamominei32.v x0, (a0), v4, v24, v0.t +# CHECK-INST: vamominei32.v x0, (a0), v4, v24, v0.t +# CHECK-ENCODING: [0x2f,0x6c,0x45,0x80] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 6c 45 80 + +vamominei64.v x0, (a0), v4, v24, v0.t +# CHECK-INST: vamominei64.v x0, (a0), v4, v24, v0.t +# CHECK-ENCODING: [0x2f,0x7c,0x45,0x80] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 7c 45 80 + +vamomaxei8.v x0, (a0), v4, v24 +# CHECK-INST: vamomaxei8.v x0, (a0), v4, v24 +# CHECK-ENCODING: [0x2f,0x0c,0x45,0xa2] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 0c 45 a2 + +vamomaxei16.v x0, (a0), v4, v24 +# CHECK-INST: vamomaxei16.v x0, (a0), v4, v24 +# CHECK-ENCODING: [0x2f,0x5c,0x45,0xa2] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 5c 45 a2 + +vamomaxei32.v x0, (a0), v4, v24 +# CHECK-INST: vamomaxei32.v x0, (a0), v4, v24 +# CHECK-ENCODING: [0x2f,0x6c,0x45,0xa2] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 6c 45 a2 + +vamomaxei64.v x0, (a0), 
v4, v24 +# CHECK-INST: vamomaxei64.v x0, (a0), v4, v24 +# CHECK-ENCODING: [0x2f,0x7c,0x45,0xa2] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 7c 45 a2 + +vamomaxei8.v x0, (a0), v4, v24, v0.t +# CHECK-INST: vamomaxei8.v x0, (a0), v4, v24, v0.t +# CHECK-ENCODING: [0x2f,0x0c,0x45,0xa0] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 0c 45 a0 + +vamomaxei16.v x0, (a0), v4, v24, v0.t +# CHECK-INST: vamomaxei16.v x0, (a0), v4, v24, v0.t +# CHECK-ENCODING: [0x2f,0x5c,0x45,0xa0] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 5c 45 a0 + +vamomaxei32.v x0, (a0), v4, v24, v0.t +# CHECK-INST: vamomaxei32.v x0, (a0), v4, v24, v0.t +# CHECK-ENCODING: [0x2f,0x6c,0x45,0xa0] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 6c 45 a0 + +vamomaxei64.v x0, (a0), v4, v24, v0.t +# CHECK-INST: vamomaxei64.v x0, (a0), v4, v24, v0.t +# CHECK-ENCODING: [0x2f,0x7c,0x45,0xa0] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 7c 45 a0 + +vamominuei8.v x0, (a0), v4, v24 +# CHECK-INST: vamominuei8.v x0, (a0), v4, v24 +# CHECK-ENCODING: [0x2f,0x0c,0x45,0xc2] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 0c 45 c2 + +vamominuei16.v x0, (a0), v4, v24 +# CHECK-INST: vamominuei16.v x0, (a0), v4, v24 +# CHECK-ENCODING: [0x2f,0x5c,0x45,0xc2] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 5c 45 c2 + +vamominuei32.v x0, (a0), v4, v24 +# CHECK-INST: vamominuei32.v x0, (a0), v4, v24 +# CHECK-ENCODING: [0x2f,0x6c,0x45,0xc2] +# 
CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 6c 45 c2 + +vamominuei64.v x0, (a0), v4, v24 +# CHECK-INST: vamominuei64.v x0, (a0), v4, v24 +# CHECK-ENCODING: [0x2f,0x7c,0x45,0xc2] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 7c 45 c2 + +vamominuei8.v x0, (a0), v4, v24, v0.t +# CHECK-INST: vamominuei8.v x0, (a0), v4, v24, v0.t +# CHECK-ENCODING: [0x2f,0x0c,0x45,0xc0] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 0c 45 c0 + +vamominuei16.v x0, (a0), v4, v24, v0.t +# CHECK-INST: vamominuei16.v x0, (a0), v4, v24, v0.t +# CHECK-ENCODING: [0x2f,0x5c,0x45,0xc0] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 5c 45 c0 + +vamominuei32.v x0, (a0), v4, v24, v0.t +# CHECK-INST: vamominuei32.v x0, (a0), v4, v24, v0.t +# CHECK-ENCODING: [0x2f,0x6c,0x45,0xc0] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 6c 45 c0 + +vamominuei64.v x0, (a0), v4, v24, v0.t +# CHECK-INST: vamominuei64.v x0, (a0), v4, v24, v0.t +# CHECK-ENCODING: [0x2f,0x7c,0x45,0xc0] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 7c 45 c0 + +vamomaxuei8.v x0, (a0), v4, v24 +# CHECK-INST: vamomaxuei8.v x0, (a0), v4, v24 +# CHECK-ENCODING: [0x2f,0x0c,0x45,0xe2] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 0c 45 e2 + +vamomaxuei16.v x0, (a0), v4, v24 +# CHECK-INST: vamomaxuei16.v x0, (a0), v4, v24 +# CHECK-ENCODING: [0x2f,0x5c,0x45,0xe2] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO 
Operations) +# CHECK-UNKNOWN: 2f 5c 45 e2 + +vamomaxuei32.v x0, (a0), v4, v24 +# CHECK-INST: vamomaxuei32.v x0, (a0), v4, v24 +# CHECK-ENCODING: [0x2f,0x6c,0x45,0xe2] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 6c 45 e2 + +vamomaxuei64.v x0, (a0), v4, v24 +# CHECK-INST: vamomaxuei64.v x0, (a0), v4, v24 +# CHECK-ENCODING: [0x2f,0x7c,0x45,0xe2] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 7c 45 e2 + +vamomaxuei8.v x0, (a0), v4, v24, v0.t +# CHECK-INST: vamomaxuei8.v x0, (a0), v4, v24, v0.t +# CHECK-ENCODING: [0x2f,0x0c,0x45,0xe0] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 0c 45 e0 + +vamomaxuei16.v x0, (a0), v4, v24, v0.t +# CHECK-INST: vamomaxuei16.v x0, (a0), v4, v24, v0.t +# CHECK-ENCODING: [0x2f,0x5c,0x45,0xe0] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 5c 45 e0 + +vamomaxuei32.v x0, (a0), v4, v24, v0.t +# CHECK-INST: vamomaxuei32.v x0, (a0), v4, v24, v0.t +# CHECK-ENCODING: [0x2f,0x6c,0x45,0xe0] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 6c 45 e0 + +vamomaxuei64.v x0, (a0), v4, v24, v0.t +# CHECK-INST: vamomaxuei64.v x0, (a0), v4, v24, v0.t +# CHECK-ENCODING: [0x2f,0x7c,0x45,0xe0] +# CHECK-ERROR: instruction requires the following: 'A' (Atomic Instructions), 'Zvamo'(Vector AMO Operations) +# CHECK-UNKNOWN: 2f 7c 45 e0 \ No newline at end of file diff --git a/llvm/test/MC/X86/x86-32.s b/llvm/test/MC/X86/x86-32.s index ef9f5ebfdd92d..426ab617b6b08 100644 --- a/llvm/test/MC/X86/x86-32.s +++ b/llvm/test/MC/X86/x86-32.s @@ -1110,16 +1110,24 @@ ptwritel 0xdeadbeef(%ebx,%ecx,8) // CHECK: encoding: [0xf3,0x0f,0xae,0xe0] ptwritel %eax +// 
CHECK: jmp foo +// CHECK: encoding: [0xe9,A,A,A,A] +// CHECK: fixup A - offset: 1, value: foo-4, kind: FK_PCRel_4 // CHECK: jmp foo // CHECK: encoding: [0xe9,A,A,A,A] // CHECK: fixup A - offset: 1, value: foo-4, kind: FK_PCRel_4 {disp32} jmp foo +jmp.d32 foo foo: +// CHECK: je foo +// CHECK: encoding: [0x0f,0x84,A,A,A,A] +// CHECK: fixup A - offset: 2, value: foo-4, kind: FK_PCRel_4 // CHECK: je foo // CHECK: encoding: [0x0f,0x84,A,A,A,A] // CHECK: fixup A - offset: 2, value: foo-4, kind: FK_PCRel_4 {disp32} je foo +je.d32 foo // CHECK: ljmpl *%cs:305419896 // CHECK: encoding: [0x2e,0xff,0x2d,0x78,0x56,0x34,0x12] diff --git a/llvm/test/MC/X86/x86-64.s b/llvm/test/MC/X86/x86-64.s index c61cae69c3ffe..911aa294fbd0c 100644 --- a/llvm/test/MC/X86/x86-64.s +++ b/llvm/test/MC/X86/x86-64.s @@ -1912,9 +1912,15 @@ ud2b (%rbx), %rcx // CHECK: encoding: [0xc7,0x40,0x00,0x01,0x00,0x00,0x00] // CHECK: movl $1, (%rax) // CHECK: encoding: [0xc7,0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00] +// CHECK: movl $1, (%rax) +// CHECK: encoding: [0xc7,0x40,0x00,0x01,0x00,0x00,0x00] +// CHECK: movl $1, (%rax) +// CHECK: encoding: [0xc7,0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00] movl $1, (%rax) {disp8} movl $1, (%rax) {disp32} movl $1, (%rax) +movl.d8 $1, (%rax) +movl.d32 $1, (%rax) // Requires disp8 by default // CHECK: movl $1, (%rbp) diff --git a/llvm/test/MachineVerifier/test_copy.mir b/llvm/test/MachineVerifier/test_copy.mir index 64c2761e7ea7b..9b96902880a2b 100644 --- a/llvm/test/MachineVerifier/test_copy.mir +++ b/llvm/test/MachineVerifier/test_copy.mir @@ -1,5 +1,5 @@ #RUN: not --crash llc -o - -global-isel -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s -# REQUIRES: global-isel, aarch64-registered-target +# REQUIRES: aarch64-registered-target --- | ; ModuleID = 'test.ll' source_filename = "test.ll" diff --git a/llvm/test/MachineVerifier/test_copy_mismatch_types.mir b/llvm/test/MachineVerifier/test_copy_mismatch_types.mir index 3b7e54e0c1c4c..0fe1622f4cac6 100644 --- 
a/llvm/test/MachineVerifier/test_copy_mismatch_types.mir +++ b/llvm/test/MachineVerifier/test_copy_mismatch_types.mir @@ -1,5 +1,5 @@ #RUN: not --crash llc -o - -global-isel -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s -# REQUIRES: global-isel, aarch64-registered-target +# REQUIRES: aarch64-registered-target --- | ; ModuleID = 'test.ll' source_filename = "test.ll" diff --git a/llvm/test/MachineVerifier/test_g_add.mir b/llvm/test/MachineVerifier/test_g_add.mir index 331f4bf351ab4..df98e1eaa1c70 100644 --- a/llvm/test/MachineVerifier/test_g_add.mir +++ b/llvm/test/MachineVerifier/test_g_add.mir @@ -1,5 +1,5 @@ #RUN: not --crash llc -march=aarch64 -o - -global-isel -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s -# REQUIRES: global-isel, aarch64-registered-target +# REQUIRES: aarch64-registered-target --- name: test_add diff --git a/llvm/test/MachineVerifier/test_g_addrspacecast.mir b/llvm/test/MachineVerifier/test_g_addrspacecast.mir index fb71057c585c6..1a6d9c3576e34 100644 --- a/llvm/test/MachineVerifier/test_g_addrspacecast.mir +++ b/llvm/test/MachineVerifier/test_g_addrspacecast.mir @@ -1,5 +1,5 @@ #RUN: not --crash llc -o - -march=arm64 -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s -# REQUIRES: global-isel, aarch64-registered-target +# REQUIRES: aarch64-registered-target --- name: test_addrspacecast diff --git a/llvm/test/MachineVerifier/test_g_bitcast.mir b/llvm/test/MachineVerifier/test_g_bitcast.mir index 24ee95ba4b63e..abb66e9840611 100644 --- a/llvm/test/MachineVerifier/test_g_bitcast.mir +++ b/llvm/test/MachineVerifier/test_g_bitcast.mir @@ -1,5 +1,5 @@ #RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -o - -global-isel -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s -# REQUIRES: global-isel, amdgpu-registered-target +# REQUIRES: amdgpu-registered-target --- name: test_bitcast diff --git a/llvm/test/MachineVerifier/test_g_brjt.mir b/llvm/test/MachineVerifier/test_g_brjt.mir index 
7a8417efab850..0fc45783919d5 100644 --- a/llvm/test/MachineVerifier/test_g_brjt.mir +++ b/llvm/test/MachineVerifier/test_g_brjt.mir @@ -1,5 +1,5 @@ # RUN: not --crash llc -march=aarch64 -o /dev/null -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s -# REQUIRES: global-isel, aarch64-registered-target +# REQUIRES: aarch64-registered-target --- name: test_jump_table diff --git a/llvm/test/MachineVerifier/test_g_concat_vectors.mir b/llvm/test/MachineVerifier/test_g_concat_vectors.mir index 53e2eca008080..6f1f51e3168fe 100644 --- a/llvm/test/MachineVerifier/test_g_concat_vectors.mir +++ b/llvm/test/MachineVerifier/test_g_concat_vectors.mir @@ -1,5 +1,5 @@ #RUN: not --crash llc -o - -global-isel -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s -# REQUIRES: global-isel, aarch64-registered-target +# REQUIRES: aarch64-registered-target --- | target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown-unknown" diff --git a/llvm/test/MachineVerifier/test_g_constant.mir b/llvm/test/MachineVerifier/test_g_constant.mir index cfdcae929ce1e..f78b4c7d297b7 100644 --- a/llvm/test/MachineVerifier/test_g_constant.mir +++ b/llvm/test/MachineVerifier/test_g_constant.mir @@ -1,5 +1,5 @@ #RUN: not --crash llc -march=aarch64 -o /dev/null -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s -# REQUIRES: global-isel, aarch64-registered-target +# REQUIRES: aarch64-registered-target --- name: test_constant diff --git a/llvm/test/MachineVerifier/test_g_extract.mir b/llvm/test/MachineVerifier/test_g_extract.mir index 2f326cb56ccf0..863689e063007 100644 --- a/llvm/test/MachineVerifier/test_g_extract.mir +++ b/llvm/test/MachineVerifier/test_g_extract.mir @@ -1,5 +1,5 @@ # RUN: not --crash llc -march=aarch64 -o /dev/null -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s -# REQUIRES: global-isel, aarch64-registered-target +# REQUIRES: aarch64-registered-target --- name: test_extract diff --git 
a/llvm/test/MachineVerifier/test_g_fcmp.mir b/llvm/test/MachineVerifier/test_g_fcmp.mir index 15373f8ff3874..b463bc4bbe31b 100644 --- a/llvm/test/MachineVerifier/test_g_fcmp.mir +++ b/llvm/test/MachineVerifier/test_g_fcmp.mir @@ -1,5 +1,5 @@ #RUN: not --crash llc -o - -march=arm64 -global-isel -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s -# REQUIRES: global-isel, aarch64-registered-target +# REQUIRES: aarch64-registered-target --- name: test_fcmp diff --git a/llvm/test/MachineVerifier/test_g_fconstant.mir b/llvm/test/MachineVerifier/test_g_fconstant.mir index 249a74a501576..7512b92382b9a 100644 --- a/llvm/test/MachineVerifier/test_g_fconstant.mir +++ b/llvm/test/MachineVerifier/test_g_fconstant.mir @@ -1,5 +1,5 @@ #RUN: not --crash llc -march=aarch64 -o /dev/null -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s -# REQUIRES: global-isel, aarch64-registered-target +# REQUIRES: aarch64-registered-target --- name: test_fconstant diff --git a/llvm/test/MachineVerifier/test_g_icmp.mir b/llvm/test/MachineVerifier/test_g_icmp.mir index 74448e736fd6a..bff2b36b39195 100644 --- a/llvm/test/MachineVerifier/test_g_icmp.mir +++ b/llvm/test/MachineVerifier/test_g_icmp.mir @@ -1,5 +1,5 @@ #RUN: not --crash llc -o - -march=arm64 -global-isel -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s -# REQUIRES: global-isel, aarch64-registered-target +# REQUIRES: aarch64-registered-target --- name: test_icmp diff --git a/llvm/test/MachineVerifier/test_g_insert.mir b/llvm/test/MachineVerifier/test_g_insert.mir index d12a2206c6c40..789539a6f7aec 100644 --- a/llvm/test/MachineVerifier/test_g_insert.mir +++ b/llvm/test/MachineVerifier/test_g_insert.mir @@ -1,5 +1,5 @@ # RUN: not --crash llc -march=aarch64 -o /dev/null -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s -# REQUIRES: global-isel, aarch64-registered-target +# REQUIRES: aarch64-registered-target --- name: test_insert diff --git a/llvm/test/MachineVerifier/test_g_inttoptr.mir 
b/llvm/test/MachineVerifier/test_g_inttoptr.mir index d0d356a1d7b6d..1744ef47b0129 100644 --- a/llvm/test/MachineVerifier/test_g_inttoptr.mir +++ b/llvm/test/MachineVerifier/test_g_inttoptr.mir @@ -1,5 +1,5 @@ #RUN: not --crash llc -o - -march=arm64 -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s -# REQUIRES: global-isel, aarch64-registered-target +# REQUIRES: aarch64-registered-target --- name: test_inttoptr diff --git a/llvm/test/MachineVerifier/test_g_jump_table.mir b/llvm/test/MachineVerifier/test_g_jump_table.mir index 3c837c2951401..b6318db32dfb0 100644 --- a/llvm/test/MachineVerifier/test_g_jump_table.mir +++ b/llvm/test/MachineVerifier/test_g_jump_table.mir @@ -1,5 +1,5 @@ # RUN: not --crash llc -march=aarch64 -o /dev/null -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s -# REQUIRES: global-isel, aarch64-registered-target +# REQUIRES: aarch64-registered-target --- name: test_jump_table diff --git a/llvm/test/MachineVerifier/test_g_load.mir b/llvm/test/MachineVerifier/test_g_load.mir index ac28b513c1d85..282831cc17a64 100644 --- a/llvm/test/MachineVerifier/test_g_load.mir +++ b/llvm/test/MachineVerifier/test_g_load.mir @@ -1,5 +1,5 @@ #RUN: not --crash llc -o - -march=arm64 -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s -# REQUIRES: global-isel, aarch64-registered-target +# REQUIRES: aarch64-registered-target --- name: test_load diff --git a/llvm/test/MachineVerifier/test_g_memcpy.mir b/llvm/test/MachineVerifier/test_g_memcpy.mir new file mode 100644 index 0000000000000..6b1584a4bbfd1 --- /dev/null +++ b/llvm/test/MachineVerifier/test_g_memcpy.mir @@ -0,0 +1,50 @@ +#RUN: not --crash llc -o - -march=arm64 -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s +# REQUIRES: aarch64-registered-target +--- +name: test_memcpy +legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: +body: | + bb.0: + + %0:_(p0) = G_CONSTANT i64 0 + %1:_(p0) = G_CONSTANT i64 4 + %2:_(s64) = G_CONSTANT i64 
4 + + ; CHECK: *** Bad machine code: memcpy/memmove must have 2 memory operands *** + G_MEMCPY %0, %1, %2, 0 + + ; CHECK: *** Bad machine code: memcpy/memmove must have 2 memory operands *** + G_MEMCPY %0, %1, %2, 0 :: (load 4) + + ; CHECK: *** Bad machine code: memcpy/memmove must have 2 memory operands *** + G_MEMCPY %0, %1, %2, 0 :: (store 4) + + ; CHECK: *** Bad machine code: wrong memory operand types *** + G_MEMCPY %0, %1, %2, 0 :: (load 4), (store 4) + + ; CHECK: *** Bad machine code: inconsistent memory operand sizes *** + G_MEMCPY %0, %1, %2, 0 :: (store 8), (load 4) + + ; CHECK: *** Bad machine code: inconsistent memory operand sizes *** + G_MEMCPY %0, %1, %2, 0 :: (store unknown-size), (load 4) + + ; CHECK: *** Bad machine code: inconsistent memory operand sizes *** + G_MEMCPY %0, %1, %2, 0 :: (store 8), (load unknown-size) + + ; CHECK: *** Bad machine code: inconsistent store address space *** + G_MEMCPY %0, %1, %2, 0 :: (store 4, addrspace 1), (load 4) + + ; CHECK: *** Bad machine code: inconsistent load address space *** + G_MEMCPY %0, %1, %2, 0 :: (store 4), (load 4, addrspace 1) + + ; CHECK: *** Bad machine code: memory instruction operand must be a pointer *** + G_MEMCPY %2, %0, %2, 0 :: (store 4), (load 4) + + ; CHECK: *** Bad machine code: memory instruction operand must be a pointer *** + G_MEMCPY %0, %2, %2, 0 :: (store 4), (load 4) + +... 
diff --git a/llvm/test/MachineVerifier/test_g_memset.mir b/llvm/test/MachineVerifier/test_g_memset.mir new file mode 100644 index 0000000000000..faad17c766c08 --- /dev/null +++ b/llvm/test/MachineVerifier/test_g_memset.mir @@ -0,0 +1,33 @@ +#RUN: not --crash llc -o - -march=arm64 -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s +# REQUIRES: aarch64-registered-target +--- +name: test_memset +legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: +body: | + bb.0: + + %0:_(p0) = G_CONSTANT i64 0 + %1:_(s64) = G_CONSTANT i64 4 + %2:_(s8) = G_CONSTANT i8 7 + + ; CHECK: *** Bad machine code: memset must have 1 memory operand *** + G_MEMSET %0, %1, %2, 0 + + ; CHECK: *** Bad machine code: memset memory operand must be a store *** + G_MEMSET %0, %1, %2, 0 :: (load 4) + + ; CHECK: *** Bad machine code: Missing mayLoad flag *** + ; CHECK: *** Bad machine code: memset memory operand must be a store *** + G_MEMSET %0, %1, %2, 0 :: (load store 4) + + ; CHECK: *** Bad machine code: inconsistent memset address space *** + G_MEMSET %0, %1, %2, 0 :: (store 4, addrspace 1) + + ; CHECK: *** Bad machine code: memset operand must be a pointer *** + G_MEMSET %1, %1, %2, 0 :: (store 4) + +... 
diff --git a/llvm/test/MachineVerifier/test_g_phi.mir b/llvm/test/MachineVerifier/test_g_phi.mir index 11e18e2220adb..6a1443cacac32 100644 --- a/llvm/test/MachineVerifier/test_g_phi.mir +++ b/llvm/test/MachineVerifier/test_g_phi.mir @@ -1,5 +1,5 @@ #RUN: not --crash llc -o - -global-isel -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s -# REQUIRES: global-isel, aarch64-registered-target +# REQUIRES: aarch64-registered-target --- | ; ModuleID = 'test.ll' source_filename = "test.ll" diff --git a/llvm/test/MachineVerifier/test_g_ptr_add.mir b/llvm/test/MachineVerifier/test_g_ptr_add.mir index 9a918d2fc7f9b..0838a2d6ea133 100644 --- a/llvm/test/MachineVerifier/test_g_ptr_add.mir +++ b/llvm/test/MachineVerifier/test_g_ptr_add.mir @@ -1,5 +1,5 @@ #RUN: not --crash llc -o - -march=arm64 -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s -# REQUIRES: global-isel, aarch64-registered-target +# REQUIRES: aarch64-registered-target --- name: test_gep diff --git a/llvm/test/MachineVerifier/test_g_ptrtoint.mir b/llvm/test/MachineVerifier/test_g_ptrtoint.mir index f289a3d1dbede..5b851c4ed23ee 100644 --- a/llvm/test/MachineVerifier/test_g_ptrtoint.mir +++ b/llvm/test/MachineVerifier/test_g_ptrtoint.mir @@ -1,5 +1,5 @@ #RUN: not --crash llc -o - -march=arm64 -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s -# REQUIRES: global-isel, aarch64-registered-target +# REQUIRES: aarch64-registered-target --- name: test_ptrtoint diff --git a/llvm/test/MachineVerifier/test_g_select.mir b/llvm/test/MachineVerifier/test_g_select.mir index ca0a94d6bc97d..8672968cb130f 100644 --- a/llvm/test/MachineVerifier/test_g_select.mir +++ b/llvm/test/MachineVerifier/test_g_select.mir @@ -1,5 +1,5 @@ #RUN: not --crash llc -march=aarch64 -run-pass=none -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck %s -# REQUIRES: global-isel, aarch64-registered-target +# REQUIRES: aarch64-registered-target --- name: test_select diff --git 
a/llvm/test/MachineVerifier/test_g_sext_inreg.mir b/llvm/test/MachineVerifier/test_g_sext_inreg.mir index 120f9995d87d3..387fd2ff35fc6 100644 --- a/llvm/test/MachineVerifier/test_g_sext_inreg.mir +++ b/llvm/test/MachineVerifier/test_g_sext_inreg.mir @@ -1,5 +1,5 @@ # RUN: not --crash llc -verify-machineinstrs -run-pass none -o /dev/null %s 2>&1 | FileCheck %s -# REQUIRES: global-isel, aarch64-registered-target +# REQUIRES: aarch64-registered-target --- | diff --git a/llvm/test/MachineVerifier/test_g_sextload.mir b/llvm/test/MachineVerifier/test_g_sextload.mir index f12fe1cb6bf01..fdfe6642dff1a 100644 --- a/llvm/test/MachineVerifier/test_g_sextload.mir +++ b/llvm/test/MachineVerifier/test_g_sextload.mir @@ -1,5 +1,5 @@ # RUN: not --crash llc -o - -march=arm64 -global-isel -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s -# REQUIRES: global-isel, aarch64-registered-target +# REQUIRES: aarch64-registered-target --- name: test_sextload diff --git a/llvm/test/MachineVerifier/test_g_store.mir b/llvm/test/MachineVerifier/test_g_store.mir index 183935f052df6..6060823f5d751 100644 --- a/llvm/test/MachineVerifier/test_g_store.mir +++ b/llvm/test/MachineVerifier/test_g_store.mir @@ -1,5 +1,5 @@ # RUN: not --crash llc -o - -march=arm64 -global-isel -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s -# REQUIRES: global-isel, aarch64-registered-target +# REQUIRES: aarch64-registered-target --- name: test_store diff --git a/llvm/test/MachineVerifier/test_g_trunc.mir b/llvm/test/MachineVerifier/test_g_trunc.mir index 9dbeab2c60395..d57231c129a64 100644 --- a/llvm/test/MachineVerifier/test_g_trunc.mir +++ b/llvm/test/MachineVerifier/test_g_trunc.mir @@ -1,5 +1,5 @@ # RUN: not --crash llc -o - -march=arm64 -global-isel -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s -# REQUIRES: global-isel, aarch64-registered-target +# REQUIRES: aarch64-registered-target --- name: test_trunc diff --git a/llvm/test/MachineVerifier/test_g_zextload.mir 
b/llvm/test/MachineVerifier/test_g_zextload.mir index 3b65bf9c17266..dac335bec4735 100644 --- a/llvm/test/MachineVerifier/test_g_zextload.mir +++ b/llvm/test/MachineVerifier/test_g_zextload.mir @@ -1,5 +1,5 @@ # RUN: not --crash llc -o - -march=arm64 -global-isel -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s -# REQUIRES: global-isel, aarch64-registered-target +# REQUIRES: aarch64-registered-target --- name: test_zextload diff --git a/llvm/test/MachineVerifier/test_memccpy_intrinsics.mir b/llvm/test/MachineVerifier/test_memccpy_intrinsics.mir deleted file mode 100644 index 03ba9e0d06f21..0000000000000 --- a/llvm/test/MachineVerifier/test_memccpy_intrinsics.mir +++ /dev/null @@ -1,27 +0,0 @@ -# RUN: not --crash llc -o - -march=aarch64 -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s -# REQUIRES: aarch64-registered-target - ---- -name: test_memcpy_et_al -legalized: true -regBankSelected: false -selected: false -tracksRegLiveness: true -liveins: -body: | - bb.0: - - %0:_(p0) = G_IMPLICIT_DEF - %1:_(s64) = G_IMPLICIT_DEF - %2:_(s1) = G_IMPLICIT_DEF - - ; CHECK: Bad machine code: Expected memcpy intrinsic to have 5 operands - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), %0(p0), %0(p0), %1(s64) - - ; CHECK: Bad machine code: Expected memmove intrinsic to have 5 operands - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memmove), %0(p0), %0(p0), %1(s64) - - ; CHECK: Bad machine code: Expected memset intrinsic to have 5 operands - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memset), %0(p0), %0(p0), %1(s64) - -... 
diff --git a/llvm/test/Reduce/remove-function-bodies-used-in-globals.ll b/llvm/test/Reduce/remove-function-bodies-used-in-globals.ll index 0ad801491c6bd..6e441c2312878 100644 --- a/llvm/test/Reduce/remove-function-bodies-used-in-globals.ll +++ b/llvm/test/Reduce/remove-function-bodies-used-in-globals.ll @@ -1,12 +1,15 @@ ; RUN: llvm-reduce --test FileCheck --test-arg --check-prefixes=CHECK-ALL,CHECK-INTERESTINGNESS --test-arg %s --test-arg --input-file %s -o %t ; RUN: cat %t | FileCheck --check-prefixes=CHECK-ALL,CHECK-FINAL %s +; We cannot change the @alias to undef, because it would result in invalid IR +; (Aliasee should be either GlobalValue or ConstantExpr). + ; CHECK-INTERESTINGNESS: @alias = -; CHECK-FINAL: @alias = alias void (i32), void (i32)* undef +; CHECK-FINAL: @alias = alias void (i32), bitcast (void ()* @func to void (i32)*) @alias = alias void (i32), void (i32)* @func -; CHECK-FINAL-NOT: @func() +; CHECK-FINAL: @func() define void @func(i32 %arg) { entry: diff --git a/llvm/test/TableGen/GlobalISelEmitter-immAllZeroOne.td b/llvm/test/TableGen/GlobalISelEmitter-immAllZeroOne.td index df04ffa632828..7ff56c06f21ca 100644 --- a/llvm/test/TableGen/GlobalISelEmitter-immAllZeroOne.td +++ b/llvm/test/TableGen/GlobalISelEmitter-immAllZeroOne.td @@ -4,13 +4,20 @@ include "llvm/Target/Target.td" include "GlobalISelEmitterCommon.td" +// GISEL-OPT: GIM_SwitchOpcode +// GISEL-OPT-NEXT: /*TargetOpcode::G_SHL*/ +// GISEL-OPT-NEXT: /*TargetOpcode::G_LSHR*/ +// GISEL-OPT-NEXT: // Label + +// GISEL-OPT: GIM_Try, // GISEL-OPT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_v4s16, -// GISEL-OPT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_BUILD_VECTOR, +// GISEL-OPT: GIM_CheckOpcodeIsEither, /*MI*/1, TargetOpcode::G_BUILD_VECTOR, TargetOpcode::G_BUILD_VECTOR_TRUNC, // GISEL-OPT: GIM_CheckIsBuildVectorAllZeros, /*MI*/1, +// GISEL-OPT: GIM_Try, // GISEL-OPT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_v4s16, -// GISEL-OPT: GIM_CheckOpcode, /*MI*/1, 
TargetOpcode::G_BUILD_VECTOR, +// GISEL-OPT: GIM_CheckOpcodeIsEither, /*MI*/1, TargetOpcode::G_BUILD_VECTOR, TargetOpcode::G_BUILD_VECTOR_TRUNC, // GISEL-OPT: GIM_CheckIsBuildVectorAllOnes, /*MI*/1, @@ -19,7 +26,7 @@ include "GlobalISelEmitterCommon.td" // GISEL-NOOPT: // MIs[0] Operand 2 // GISEL-NOOPT-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_v4s16, // GISEL-NOOPT-NEXT: GIM_RecordInsn, /*DefineMI*/1, /*MI*/0, /*OpIdx*/2, // MIs[1] -// GISEL-NOOPT-NEXT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_BUILD_VECTOR, +// GISEL-NOOPT-NEXT: GIM_CheckOpcodeIsEither, /*MI*/1, TargetOpcode::G_BUILD_VECTOR, TargetOpcode::G_BUILD_VECTOR_TRUNC, // GISEL-NOOPT-NEXT: GIM_CheckIsBuildVectorAllOnes, /*MI*/1, // GISEL-NOOPT-NEXT: // MIs[1] Operand 0 // GISEL-NOOPT-NEXT: GIM_CheckType, /*MI*/1, /*Op*/0, /*Type*/GILLT_v4s16, @@ -34,7 +41,7 @@ def VFOOONES : I<(outs VecReg128:$dst), (ins VecReg128:$src0), // GISEL-NOOPT: // MIs[0] Operand 2 // GISEL-NOOPT-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_v4s16, // GISEL-NOOPT-NEXT: GIM_RecordInsn, /*DefineMI*/1, /*MI*/0, /*OpIdx*/2, // MIs[1] -// GISEL-NOOPT-NEXT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_BUILD_VECTOR, +// GISEL-NOOPT-NEXT: GIM_CheckOpcodeIsEither, /*MI*/1, TargetOpcode::G_BUILD_VECTOR, TargetOpcode::G_BUILD_VECTOR_TRUNC, // GISEL-NOOPT-NEXT: GIM_CheckIsBuildVectorAllZeros, /*MI*/1, // GISEL-NOOPT-NEXT: // MIs[1] Operand 0 // GISEL-NOOPT-NEXT: GIM_CheckType, /*MI*/1, /*Op*/0, /*Type*/GILLT_v4s16, diff --git a/llvm/test/TableGen/GlobalISelEmitterMatchTableOptimizer.td b/llvm/test/TableGen/GlobalISelEmitterMatchTableOptimizer.td new file mode 100644 index 0000000000000..cd7a177b8426a --- /dev/null +++ b/llvm/test/TableGen/GlobalISelEmitterMatchTableOptimizer.td @@ -0,0 +1,85 @@ +// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=true -I %p/../../include -I %p/Common -o - | FileCheck %s + +include "llvm/Target/Target.td" +include "GlobalISelEmitterCommon.td" + +// Two LOADs with same output size but 
different input size, hence their +// GIM_CheckPointerToAny should *not* be merged +def LOAD8 : I<(outs GPR8:$dst), (ins GPR8:$src), []>; +def LOAD32 : I<(outs GPR8:$dst), (ins GPR32:$src), []>; +// CHECK: Label 1: @{{[0-9]+}} +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label [[L1_ID:[0-9]+]]*/ [[L1_AT:[0-9]+]], +// CHECK-NEXT: GIM_CheckMemorySizeEqualToLLT, /*MI*/0, /*MMO*/0, /*OpIdx*/0, +// CHECK-NEXT: GIM_CheckAtomicOrdering, /*MI*/0, /*Order*/(int64_t)AtomicOrdering::NotAtomic, +// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/MyTarget::GPR8RegClassID, +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label [[L2_ID:[0-9]+]]*/ [[L2_AT:[0-9]+]], +// CHECK-NEXT: // MIs[0] src +// CHECK-NEXT: GIM_CheckPointerToAny, /*MI*/0, /*Op*/1, /*SizeInBits*/8, +// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/1, /*RC*/MyTarget::GPR8RegClassID, +// CHECK-NEXT: // (ld:{ *:[i8] } GPR8:{ *:[i8] }:$src)<><> => (LOAD8:{ *:[i8] } GPR8:{ *:[i8] }:$src) +// CHECK-NEXT: GIR_MutateOpcode, /*InsnID*/0, /*RecycleInsnID*/0, /*Opcode*/MyTarget::LOAD8, +// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0, +// CHECK-NEXT: // GIR_Coverage, 0, +// CHECK-NEXT: GIR_Done, +// CHECK-NEXT: // Label [[L2_ID]]: @[[L2_AT]] +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label [[L3_ID:[0-9]+]]*/ [[L3_AT:[0-9]+]], +// CHECK-NEXT: // MIs[0] src +// CHECK-NEXT: GIM_CheckPointerToAny, /*MI*/0, /*Op*/1, /*SizeInBits*/32, +// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/1, /*RC*/MyTarget::GPR32RegClassID, +// CHECK-NEXT: // (ld:{ *:[i8] } GPR32:{ *:[i32] }:$src)<><> => (LOAD32:{ *:[i8] } GPR32:{ *:[i32] }:$src) +// CHECK-NEXT: GIR_MutateOpcode, /*InsnID*/0, /*RecycleInsnID*/0, /*Opcode*/MyTarget::LOAD32, +// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0, +// CHECK-NEXT: // GIR_Coverage, 1, +// CHECK-NEXT: GIR_Done, +// CHECK-NEXT: // Label [[L3_ID]]: @[[L3_AT]] +// CHECK-NEXT: GIM_Reject, +// CHECK-NEXT: // Label [[L1_ID]]: @[[L1_AT]] +def : Pat<(i8 (load GPR8:$src)), 
+ (LOAD8 GPR8:$src)>; +def : Pat<(i8 (load GPR32:$src)), + (LOAD32 GPR32:$src)>; + +// Two LOADs with same output size and input size, hence their +// GIM_CheckPointerToAny *should* be merged +def S0 : Register<"s0"> { let Namespace = "MyTarget"; } +def GPR16 : RegisterClass<"MyTarget", [i16], 16, (add S0)>; +def LOAD16 : I<(outs GPR16:$dst), (ins GPR16:$src), []>; +def LOAD16Imm : I<(outs GPR16:$dst), (ins GPR16:$src), []>; +// CHECK: // Label 2: @{{[0-9]+}} +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label [[L1_ID:[0-9]+]]*/ [[L1_AT:[0-9]+]], +// CHECK-NEXT: GIM_CheckMemorySizeEqualToLLT, /*MI*/0, /*MMO*/0, /*OpIdx*/0, +// CHECK-NEXT: GIM_CheckAtomicOrdering, /*MI*/0, /*Order*/(int64_t)AtomicOrdering::NotAtomic, +// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/MyTarget::GPR16RegClassID, +// CHECK-NEXT: GIM_CheckPointerToAny, /*MI*/0, /*Op*/1, /*SizeInBits*/16, +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label [[L2_ID:[0-9]+]]*/ [[L2_AT:[0-9]+]], +// CHECK-NEXT: GIM_RecordInsn, /*DefineMI*/1, /*MI*/0, /*OpIdx*/1, // MIs[1] +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_ADD, +// CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/1, /*Type*/GILLT_s16, +// CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/2, /*Type*/GILLT_s16, +// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/1, /*RC*/MyTarget::GPR16RegClassID, +// CHECK-NEXT: GIM_CheckConstantInt, /*MI*/1, /*Op*/2, 10, +// CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/1, +// CHECK-NEXT: // (ld:{ *:[i16] } (add:{ *:[i16] } GPR16:{ *:[i16] }:$src, 10:{ *:[i16] }))<><> => (LOAD16Imm:{ *:[i16] } GPR16:{ *:[i16] }:$src) +// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/MyTarget::LOAD16Imm, +// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst +// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/1, /*OpIdx*/1, // src +// CHECK-NEXT: GIR_MergeMemOperands, /*InsnID*/0, /*MergeInsnID's*/0, 1, GIU_MergeMemOperands_EndOfList, +// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, +// 
CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0, +// CHECK-NEXT: // GIR_Coverage, 3, +// CHECK-NEXT: GIR_Done, +// CHECK-NEXT: // Label [[L2_ID]]: @[[L2_AT]] +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label [[L3_ID:[0-9]+]]*/ [[L3_AT:[0-9]+]], +// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/1, /*RC*/MyTarget::GPR16RegClassID, +// CHECK-NEXT: // (ld:{ *:[i16] } GPR16:{ *:[i16] }:$src)<><> => (LOAD16:{ *:[i16] } GPR16:{ *:[i16] }:$src) +// CHECK-NEXT: GIR_MutateOpcode, /*InsnID*/0, /*RecycleInsnID*/0, /*Opcode*/MyTarget::LOAD16, +// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0, +// CHECK-NEXT: // GIR_Coverage, 2, +// CHECK-NEXT: GIR_Done, +// CHECK-NEXT: // Label [[L3_ID]]: @[[L3_AT]] +// CHECK-NEXT: GIM_Reject, +// CHECK-NEXT: // Label [[L1_ID]]: @[[L1_AT]] +def : Pat<(i16 (load GPR16:$src)), + (LOAD16 GPR16:$src)>; +def : Pat<(i16 (load (add GPR16:$src, 10))), + (LOAD16Imm GPR16:$src)>; diff --git a/llvm/test/TableGen/intrin-side-effects.td b/llvm/test/TableGen/intrin-side-effects.td index 7588855830fae..f58d374532829 100644 --- a/llvm/test/TableGen/intrin-side-effects.td +++ b/llvm/test/TableGen/intrin-side-effects.td @@ -11,7 +11,10 @@ class LLVMType { def llvm_i32_ty : LLVMType; -class IntrinsicProperty; +class IntrinsicProperty { + bit IsDefault = is_default; +} + def IntrNoMem : IntrinsicProperty; def IntrHasSideEffects : IntrinsicProperty; @@ -27,6 +30,8 @@ class Intrinsic ret_types, list ParamTypes = param_types; list IntrProperties = intr_properties; let Properties = sd_properties; + bit DisableDefaultAttributes = 1; + bit isTarget = 0; } diff --git a/llvm/test/TableGen/intrinsic-long-name.td b/llvm/test/TableGen/intrinsic-long-name.td index c2f696e8ca187..d66173202302b 100644 --- a/llvm/test/TableGen/intrinsic-long-name.td +++ b/llvm/test/TableGen/intrinsic-long-name.td @@ -1,7 +1,10 @@ // RUN: llvm-tblgen -gen-intrinsic-enums %s | FileCheck %s // XFAIL: vg_leak -class IntrinsicProperty; +class IntrinsicProperty { + bit 
IsDefault = is_default; +} + class SDNodeProperty; class ValueType { @@ -22,6 +25,7 @@ class Intrinsic param_types = []> { list ParamTypes = param_types; list IntrProperties = []; list Properties = []; + bit DisableDefaultAttributes = 1; } def iAny : ValueType<0, 253>; diff --git a/llvm/test/TableGen/intrinsic-pointer-to-any.td b/llvm/test/TableGen/intrinsic-pointer-to-any.td index c58595acfde77..0b0bc15107754 100644 --- a/llvm/test/TableGen/intrinsic-pointer-to-any.td +++ b/llvm/test/TableGen/intrinsic-pointer-to-any.td @@ -6,7 +6,10 @@ // case, so TableGen would hit an assertion in EncodeFixedType that was checking // to ensure that the substitution being processed was correctly replaced. -class IntrinsicProperty; +class IntrinsicProperty { + bit IsDefault = is_default; +} + class SDNodeProperty; class ValueType { @@ -32,6 +35,7 @@ class Intrinsic ret_types> { list IntrProperties = []; list Properties = []; bit isTarget = 0; + bit DisableDefaultAttributes = 1; } class LLVMQualPointerType diff --git a/llvm/test/TableGen/intrinsic-struct.td b/llvm/test/TableGen/intrinsic-struct.td index 7a3089c802a9c..bc044a4a6f858 100644 --- a/llvm/test/TableGen/intrinsic-struct.td +++ b/llvm/test/TableGen/intrinsic-struct.td @@ -1,7 +1,10 @@ // RUN: llvm-tblgen -gen-intrinsic-enums %s | FileCheck %s // XFAIL: vg_leak -class IntrinsicProperty; +class IntrinsicProperty { + bit IsDefault = is_default; +} + class SDNodeProperty; class ValueType { @@ -22,6 +25,7 @@ class Intrinsic ret_types = []> { list ParamTypes = []; list IntrProperties = []; list Properties = []; + bit DisableDefaultAttributes = 1; } def iAny : ValueType<0, 253>; diff --git a/llvm/test/TableGen/intrinsic-varargs.td b/llvm/test/TableGen/intrinsic-varargs.td index 6a2252215a830..da860ed0129c8 100644 --- a/llvm/test/TableGen/intrinsic-varargs.td +++ b/llvm/test/TableGen/intrinsic-varargs.td @@ -3,7 +3,9 @@ include "llvm/CodeGen/ValueTypes.td" -class IntrinsicProperty; +class IntrinsicProperty { + bit IsDefault = 
is_default; +} class SDNodeProperty; class LLVMType { @@ -18,6 +20,7 @@ class Intrinsic param_types = []> { list ParamTypes = param_types; list IntrProperties = []; list Properties = []; + bit DisableDefaultAttributes = 1; } def llvm_vararg_ty : LLVMType; // this means vararg here diff --git a/llvm/test/TableGen/searchabletables-intrinsic.td b/llvm/test/TableGen/searchabletables-intrinsic.td index e5cb9db3aa6be..75722d19b16e9 100644 --- a/llvm/test/TableGen/searchabletables-intrinsic.td +++ b/llvm/test/TableGen/searchabletables-intrinsic.td @@ -3,7 +3,10 @@ include "llvm/TableGen/SearchableTable.td" -class IntrinsicProperty; +class IntrinsicProperty { + bit IsDefault = is_default; +} + class SDNodeProperty; class ValueType { @@ -24,6 +27,7 @@ class Intrinsic param_types = []> { list ParamTypes = param_types; list IntrProperties = []; list Properties = []; + bit DisableDefaultAttributes = 1; } def iAny : ValueType<0, 253>; diff --git a/llvm/test/ThinLTO/X86/internalize.ll b/llvm/test/ThinLTO/X86/internalize.ll index edd5abe8ab424..5d80a4fe375a2 100644 --- a/llvm/test/ThinLTO/X86/internalize.ll +++ b/llvm/test/ThinLTO/X86/internalize.ll @@ -4,13 +4,13 @@ ; prevailing the %t1.bc copy as non-prevailing. ; RUN: llvm-lto -thinlto-action=thinlink -o %t.index.bc %t2.bc %t1.bc ; RUN: llvm-lto -thinlto-action=internalize -thinlto-index %t.index.bc %t1.bc -o - | llvm-dis -o - | FileCheck %s --check-prefix=REGULAR -; RUN: llvm-lto -thinlto-action=internalize -thinlto-index %t.index.bc %t1.bc -o - --exported-symbol=foo | llvm-dis -o - | FileCheck %s --check-prefix=INTERNALIZE +; RUN: llvm-lto -thinlto-action=internalize -thinlto-index %t.index.bc %t1.bc -o - --exported-symbol=_foo | llvm-dis -o - | FileCheck %s --check-prefix=INTERNALIZE ; Test the enable-lto-internalization option by setting it to false. ; This makes sure indices are not marked as internallinkage and therefore ; internalization does not happen. 
; RUN: llvm-lto -thinlto-action=internalize -thinlto-index %t.index.bc %t1.bc \ -; RUN: -enable-lto-internalization=false --exported-symbol=foo +; RUN: -enable-lto-internalization=false --exported-symbol=_foo ; RUN: llvm-dis < %t1.bc.thinlto.internalized.bc | FileCheck %s --check-prefix=INTERNALIZE-OPTION-DISABLE ; RUN: llvm-lto2 run %t1.bc -o %t.o -save-temps \ diff --git a/llvm/test/ThinLTO/X86/mangled_symbol.ll b/llvm/test/ThinLTO/X86/mangled_symbol.ll new file mode 100644 index 0000000000000..ffdefe3e60b30 --- /dev/null +++ b/llvm/test/ThinLTO/X86/mangled_symbol.ll @@ -0,0 +1,26 @@ +; RUN: opt -module-summary %s -o %t1.bc +; RUN: llvm-lto -thinlto-action=thinlink -o %t.index.bc %t1.bc +;; Check baseline when both of them internalized when not exported. +; RUN: llvm-lto -thinlto-action=internalize -thinlto-index %t.index.bc %t1.bc -o - --exported-symbol=_exported | llvm-dis -o - | FileCheck %s --check-prefix=INTERNALIZED +;; Check symbols are exported, including the ones with `\01` prefix. 
+; RUN: llvm-lto -thinlto-action=internalize -thinlto-index %t.index.bc %t1.bc -o - --exported-symbol=_exported --exported-symbol=_extern_not_mangled --exported-symbol=_extern_mangled | llvm-dis -o - | FileCheck %s --check-prefix=EXPORTED + +; INTERNALIZED: define internal void @extern_not_mangled +; INTERNALIZED: define internal void @"\01_extern_mangled" +; EXPORTED: define void @extern_not_mangled +; EXPORTED: define void @"\01_extern_mangled" + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.11.0" + +define void @exported() { + ret void +} + +define void @extern_not_mangled() { + ret void +} + +define void @"\01_extern_mangled"() { + ret void +} diff --git a/llvm/test/ThinLTO/X86/weak_resolution.ll b/llvm/test/ThinLTO/X86/weak_resolution.ll index b9f10afd6d624..ccebb56f90deb 100644 --- a/llvm/test/ThinLTO/X86/weak_resolution.ll +++ b/llvm/test/ThinLTO/X86/weak_resolution.ll @@ -7,10 +7,10 @@ ; non-prevailing ODR are not kept when possible, but non-ODR non-prevailing ; are not affected. 
; RUN: llvm-lto -thinlto-action=promote %t.bc -thinlto-index=%t3.bc -o - | llvm-dis -o - | FileCheck %s --check-prefix=MOD1 -; RUN: llvm-lto -thinlto-action=internalize %t.bc -thinlto-index=%t3.bc -exported-symbol=linkoncefunc -o - | llvm-dis -o - | FileCheck %s --check-prefix=MOD1-INT +; RUN: llvm-lto -thinlto-action=internalize %t.bc -thinlto-index=%t3.bc -exported-symbol=_linkoncefunc -o - | llvm-dis -o - | FileCheck %s --check-prefix=MOD1-INT ; RUN: llvm-lto -thinlto-action=promote %t2.bc -thinlto-index=%t3.bc -o - | llvm-dis -o - | FileCheck %s --check-prefix=MOD2 ; When exported, we always preserve a linkonce -; RUN: llvm-lto -thinlto-action=promote %t.bc -thinlto-index=%t3.bc -o - --exported-symbol=linkonceodrfuncInSingleModule | llvm-dis -o - | FileCheck %s --check-prefix=EXPORTED +; RUN: llvm-lto -thinlto-action=promote %t.bc -thinlto-index=%t3.bc -o - --exported-symbol=_linkonceodrfuncInSingleModule | llvm-dis -o - | FileCheck %s --check-prefix=EXPORTED target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.11.0" diff --git a/llvm/test/ThinLTO/X86/weak_resolution_single.ll b/llvm/test/ThinLTO/X86/weak_resolution_single.ll index 779b967ab89a9..203f60f03eb47 100644 --- a/llvm/test/ThinLTO/X86/weak_resolution_single.ll +++ b/llvm/test/ThinLTO/X86/weak_resolution_single.ll @@ -1,7 +1,7 @@ ; RUN: opt -module-summary %s -o %t.bc ; RUN: llvm-lto -thinlto-action=thinlink -o %t2.bc %t.bc -; RUN: llvm-lto -thinlto-action=internalize %t.bc -thinlto-index=%t2.bc -exported-symbol=foo -o - | llvm-dis -o - | FileCheck %s +; RUN: llvm-lto -thinlto-action=internalize %t.bc -thinlto-index=%t2.bc -exported-symbol=_foo -o - | llvm-dis -o - | FileCheck %s ; CHECK: define weak_odr void @foo() target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/inalloca.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/inalloca.ll index 
5da4437f3ae24..1427a8efd5b62 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/inalloca.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/inalloca.ll @@ -78,7 +78,7 @@ entry: define internal i1 @g(%struct.ss* %a, %struct.ss* inalloca %b) nounwind { ; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn ; IS__CGSCC____-LABEL: define {{[^@]+}}@g -; IS__CGSCC____-SAME: (%struct.ss* nocapture nofree noundef nonnull readnone align 4 dereferenceable(8) [[A:%.*]], %struct.ss* inalloca nocapture nofree noundef nonnull writeonly align 4 dereferenceable(8) [[B:%.*]]) +; IS__CGSCC____-SAME: (%struct.ss* nocapture nofree nonnull readnone align 4 dereferenceable(8) [[A:%.*]], %struct.ss* inalloca nocapture nofree nonnull writeonly align 4 dereferenceable(8) [[B:%.*]]) ; IS__CGSCC____-NEXT: entry: ; IS__CGSCC____-NEXT: ret i1 undef ; diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/pthreads.ll b/llvm/test/Transforms/Attributor/IPConstantProp/pthreads.ll index bf3ee0ff8eec5..5ae9d99c332f4 100644 --- a/llvm/test/Transforms/Attributor/IPConstantProp/pthreads.ll +++ b/llvm/test/Transforms/Attributor/IPConstantProp/pthreads.ll @@ -36,8 +36,8 @@ define dso_local i32 @main() { ; IS__TUNIT____-NEXT: [[ALLOC1:%.*]] = alloca i8, align 8 ; IS__TUNIT____-NEXT: [[ALLOC2:%.*]] = alloca i8, align 8 ; IS__TUNIT____-NEXT: [[THREAD:%.*]] = alloca i64, align 8 -; IS__TUNIT____-NEXT: [[CALL:%.*]] = call i32 @pthread_create(i64* noundef nonnull align 8 dereferenceable(8) [[THREAD]], %union.pthread_attr_t* noalias nocapture noundef align 536870912 null, i8* (i8*)* noundef nonnull @foo, i8* noalias nocapture nofree noundef readnone align 536870912 undef) -; IS__TUNIT____-NEXT: [[CALL1:%.*]] = call i32 @pthread_create(i64* noundef nonnull align 8 dereferenceable(8) [[THREAD]], %union.pthread_attr_t* noalias nocapture noundef align 536870912 null, i8* (i8*)* noundef nonnull @bar, i8* noalias nofree noundef nonnull readnone align 8 
dereferenceable(8) "no-capture-maybe-returned" undef) +; IS__TUNIT____-NEXT: [[CALL:%.*]] = call i32 @pthread_create(i64* noundef nonnull align 8 dereferenceable(8) [[THREAD]], %union.pthread_attr_t* noalias nocapture noundef align 536870912 null, i8* (i8*)* noundef nonnull @foo, i8* noalias nocapture nofree readnone align 536870912 undef) +; IS__TUNIT____-NEXT: [[CALL1:%.*]] = call i32 @pthread_create(i64* noundef nonnull align 8 dereferenceable(8) [[THREAD]], %union.pthread_attr_t* noalias nocapture noundef align 536870912 null, i8* (i8*)* noundef nonnull @bar, i8* noalias nofree nonnull readnone align 8 dereferenceable(8) "no-capture-maybe-returned" undef) ; IS__TUNIT____-NEXT: [[CALL2:%.*]] = call i32 @pthread_create(i64* noundef nonnull align 8 dereferenceable(8) [[THREAD]], %union.pthread_attr_t* noalias nocapture noundef align 536870912 null, i8* (i8*)* noundef nonnull @baz, i8* noalias nocapture nofree noundef nonnull readnone align 8 dereferenceable(1) [[ALLOC1]]) ; IS__TUNIT____-NEXT: [[CALL3:%.*]] = call i32 @pthread_create(i64* noundef nonnull align 8 dereferenceable(8) [[THREAD]], %union.pthread_attr_t* noalias nocapture noundef align 536870912 null, i8* (i8*)* noundef nonnull @buz, i8* noalias nofree noundef nonnull readnone align 8 dereferenceable(1) "no-capture-maybe-returned" [[ALLOC2]]) ; IS__TUNIT____-NEXT: ret i32 0 @@ -69,13 +69,13 @@ declare !callback !0 dso_local i32 @pthread_create(i64*, %union.pthread_attr_t*, define internal i8* @foo(i8* %arg) { ; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn ; IS__TUNIT____-LABEL: define {{[^@]+}}@foo -; IS__TUNIT____-SAME: (i8* noalias nofree noundef readnone returned align 536870912 "no-capture-maybe-returned" [[ARG:%.*]]) +; IS__TUNIT____-SAME: (i8* noalias nofree readnone returned align 536870912 "no-capture-maybe-returned" [[ARG:%.*]]) ; IS__TUNIT____-NEXT: entry: ; IS__TUNIT____-NEXT: ret i8* null ; ; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind 
readnone willreturn ; IS__CGSCC____-LABEL: define {{[^@]+}}@foo -; IS__CGSCC____-SAME: (i8* noalias nofree noundef readnone returned align 536870912 "no-capture-maybe-returned" [[ARG:%.*]]) +; IS__CGSCC____-SAME: (i8* noalias nofree readnone returned align 536870912 "no-capture-maybe-returned" [[ARG:%.*]]) ; IS__CGSCC____-NEXT: entry: ; IS__CGSCC____-NEXT: ret i8* null ; @@ -86,13 +86,13 @@ entry: define internal i8* @bar(i8* %arg) { ; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn ; IS__TUNIT____-LABEL: define {{[^@]+}}@bar -; IS__TUNIT____-SAME: (i8* noalias nofree noundef nonnull readnone returned align 8 dereferenceable(8) "no-capture-maybe-returned" [[ARG:%.*]]) +; IS__TUNIT____-SAME: (i8* noalias nofree nonnull readnone returned align 8 dereferenceable(8) "no-capture-maybe-returned" [[ARG:%.*]]) ; IS__TUNIT____-NEXT: entry: ; IS__TUNIT____-NEXT: ret i8* bitcast (i8** @GlobalVPtr to i8*) ; ; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn ; IS__CGSCC____-LABEL: define {{[^@]+}}@bar -; IS__CGSCC____-SAME: (i8* nofree noundef readnone returned "no-capture-maybe-returned" [[ARG:%.*]]) +; IS__CGSCC____-SAME: (i8* nofree readnone returned "no-capture-maybe-returned" [[ARG:%.*]]) ; IS__CGSCC____-NEXT: entry: ; IS__CGSCC____-NEXT: ret i8* bitcast (i8** @GlobalVPtr to i8*) ; diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/thread_local_acs.ll b/llvm/test/Transforms/Attributor/IPConstantProp/thread_local_acs.ll index 4405b7bc1b095..f773d9aa96022 100644 --- a/llvm/test/Transforms/Attributor/IPConstantProp/thread_local_acs.ll +++ b/llvm/test/Transforms/Attributor/IPConstantProp/thread_local_acs.ll @@ -26,7 +26,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" define internal i32 @callee(i32* %thread_local_ptr, i32* %shared_ptr) { ; IS__TUNIT____: Function Attrs: nofree nosync nounwind readonly willreturn ; IS__TUNIT____-LABEL: define {{[^@]+}}@callee -; IS__TUNIT____-SAME: (i32* 
nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[THREAD_LOCAL_PTR:%.*]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[SHARED_PTR:%.*]]) +; IS__TUNIT____-SAME: (i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[THREAD_LOCAL_PTR:%.*]], i32* nocapture nofree nonnull readonly align 4 dereferenceable(4) [[SHARED_PTR:%.*]]) ; IS__TUNIT____-NEXT: entry: ; IS__TUNIT____-NEXT: [[TMP:%.*]] = load i32, i32* [[THREAD_LOCAL_PTR]], align 4 ; IS__TUNIT____-NEXT: [[TMP1:%.*]] = load i32, i32* @gsh, align 4 @@ -35,7 +35,7 @@ define internal i32 @callee(i32* %thread_local_ptr, i32* %shared_ptr) { ; ; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readonly willreturn ; IS__CGSCC____-LABEL: define {{[^@]+}}@callee -; IS__CGSCC____-SAME: (i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[THREAD_LOCAL_PTR:%.*]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[SHARED_PTR:%.*]]) +; IS__CGSCC____-SAME: (i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[THREAD_LOCAL_PTR:%.*]], i32* nocapture nofree nonnull readonly align 4 dereferenceable(4) [[SHARED_PTR:%.*]]) ; IS__CGSCC____-NEXT: entry: ; IS__CGSCC____-NEXT: [[TMP:%.*]] = load i32, i32* [[THREAD_LOCAL_PTR]], align 4 ; IS__CGSCC____-NEXT: [[TMP1:%.*]] = load i32, i32* @gsh, align 4 @@ -52,7 +52,7 @@ entry: define dso_local void @caller() { ; IS__TUNIT____-LABEL: define {{[^@]+}}@caller() ; IS__TUNIT____-NEXT: entry: -; IS__TUNIT____-NEXT: call void @broker(i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) @gtl, i32 (i32*, i32*)* noundef nonnull @callee, i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) undef) +; IS__TUNIT____-NEXT: call void @broker(i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) @gtl, i32 (i32*, i32*)* noundef nonnull @callee, i32* nocapture nofree nonnull readonly align 4 
dereferenceable(4) undef) ; IS__TUNIT____-NEXT: ret void ; ; IS__CGSCC____-LABEL: define {{[^@]+}}@caller() diff --git a/llvm/test/Transforms/Attributor/misc_crash.ll b/llvm/test/Transforms/Attributor/misc_crash.ll index e420f58af1368..f79a3c11d5230 100644 --- a/llvm/test/Transforms/Attributor/misc_crash.ll +++ b/llvm/test/Transforms/Attributor/misc_crash.ll @@ -40,7 +40,7 @@ define internal i32* @func1a([1 x i32]* %arg) { define internal void @func2a(i32* %0) { ; CHECK: Function Attrs: nofree nosync nounwind willreturn writeonly ; CHECK-LABEL: define {{[^@]+}}@func2a -; CHECK-SAME: (i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0:%.*]]) +; CHECK-SAME: (i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[TMP0:%.*]]) ; CHECK-NEXT: store i32 0, i32* @var2, align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/Attributor/noundef.ll b/llvm/test/Transforms/Attributor/noundef.ll index b7c1d45205a60..ed87977164a1c 100644 --- a/llvm/test/Transforms/Attributor/noundef.ll +++ b/llvm/test/Transforms/Attributor/noundef.ll @@ -20,3 +20,47 @@ define void @foo() { call void @bar(i32* %x) ret void } + +define internal i8* @returned_dead() { +; CHECK-LABEL: define internal noalias align 536870912 i8* @returned_dead( +; CHECK-NEXT: call void @unknown() +; CHECK-NEXT: ret i8* undef +; + call void @unknown() + ret i8* null +} + +define void @caller1() { +; CHECK-LABEL: @caller1( +; CHECK-NEXT: [[TMP1:%.*]] = call i8* @returned_dead() +; CHECK-NEXT: ret void +; + call i8* @returned_dead() + ret void +} + +define internal void @argument_dead_callback_callee(i8* %c) { +; CHECK-LABEL: @argument_dead_callback_callee( +; CHECK-NEXT: call void @unknown() +; CHECK-NEXT: ret void +; + call void @unknown() + ret void +} + +define void @callback_caller() { +; IS__TUNIT____-LABEL: @callback_caller( +; IS__TUNIT____-NEXT: call void @callback_broker(void (i8*)* noundef @argument_dead_callback_callee, i8* noalias nocapture nofree 
readnone align 536870912 undef) +; IS__TUNIT____-NEXT: ret void +; +; IS__CGSCC____-LABEL: @callback_caller( +; IS__CGSCC____-NEXT: call void @callback_broker(void (i8*)* noundef @argument_dead_callback_callee, i8* noalias nocapture nofree noundef readnone align 536870912 null) +; IS__CGSCC____-NEXT: ret void +; + call void @callback_broker(void (i8*)* @argument_dead_callback_callee, i8* null) + ret void +} + +declare !callback !0 void @callback_broker(void (i8*)*, i8*) +!1 = !{i64 0, i64 1, i1 false} +!0 = !{!1} diff --git a/llvm/test/Transforms/Attributor/potential.ll b/llvm/test/Transforms/Attributor/potential.ll index 41818fc959248..99e2a71ddc853 100644 --- a/llvm/test/Transforms/Attributor/potential.ll +++ b/llvm/test/Transforms/Attributor/potential.ll @@ -493,20 +493,6 @@ end: ; and returned value of @potential_test10 can be simplified to 0(false) define internal i32 @may_return_undef(i32 %c) { -; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn -; IS__TUNIT____-LABEL: define {{[^@]+}}@may_return_undef -; IS__TUNIT____-SAME: (i32 [[C:%.*]]) -; IS__TUNIT____-NEXT: switch i32 [[C]], label [[OTHERWISE:%.*]] [ -; IS__TUNIT____-NEXT: i32 1, label [[A:%.*]] -; IS__TUNIT____-NEXT: i32 -1, label [[B:%.*]] -; IS__TUNIT____-NEXT: ] -; IS__TUNIT____: a: -; IS__TUNIT____-NEXT: ret i32 1 -; IS__TUNIT____: b: -; IS__TUNIT____-NEXT: ret i32 -1 -; IS__TUNIT____: otherwise: -; IS__TUNIT____-NEXT: ret i32 undef -; ; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn ; IS__CGSCC____-LABEL: define {{[^@]+}}@may_return_undef ; IS__CGSCC____-SAME: (i32 [[C:%.*]]) @@ -532,19 +518,10 @@ otherwise: } define i1 @potential_test10(i32 %c) { -; IS__TUNIT_OPM: Function Attrs: nofree nosync nounwind readnone willreturn -; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@potential_test10 -; IS__TUNIT_OPM-SAME: (i32 [[C:%.*]]) -; IS__TUNIT_OPM-NEXT: [[RET:%.*]] = call i32 @may_return_undef(i32 [[C]]) [[ATTR0]], [[RNG2:!range !.*]] -; 
IS__TUNIT_OPM-NEXT: [[CMP:%.*]] = icmp eq i32 [[RET]], 0 -; IS__TUNIT_OPM-NEXT: ret i1 [[CMP]] -; -; IS__TUNIT_NPM: Function Attrs: nofree nosync nounwind readnone willreturn -; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@potential_test10 -; IS__TUNIT_NPM-SAME: (i32 [[C:%.*]]) -; IS__TUNIT_NPM-NEXT: [[RET:%.*]] = call i32 @may_return_undef(i32 [[C]]) [[ATTR0]], [[RNG3:!range !.*]] -; IS__TUNIT_NPM-NEXT: [[CMP:%.*]] = icmp eq i32 [[RET]], 0 -; IS__TUNIT_NPM-NEXT: ret i1 [[CMP]] +; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn +; IS__TUNIT____-LABEL: define {{[^@]+}}@potential_test10 +; IS__TUNIT____-SAME: (i32 [[C:%.*]]) +; IS__TUNIT____-NEXT: ret i1 false ; ; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn ; IS__CGSCC____-LABEL: define {{[^@]+}}@potential_test10 @@ -558,15 +535,350 @@ define i1 @potential_test10(i32 %c) { ret i1 %cmp } +define i32 @optimize_undef_1(i1 %c) { +; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn +; IS__TUNIT____-LABEL: define {{[^@]+}}@optimize_undef_1 +; IS__TUNIT____-SAME: (i1 [[C:%.*]]) +; IS__TUNIT____-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] +; IS__TUNIT____: t: +; IS__TUNIT____-NEXT: ret i32 0 +; IS__TUNIT____: f: +; IS__TUNIT____-NEXT: ret i32 1 +; +; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn +; IS__CGSCC____-LABEL: define {{[^@]+}}@optimize_undef_1 +; IS__CGSCC____-SAME: (i1 [[C:%.*]]) +; IS__CGSCC____-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] +; IS__CGSCC____: t: +; IS__CGSCC____-NEXT: ret i32 0 +; IS__CGSCC____: f: +; IS__CGSCC____-NEXT: ret i32 1 +; + br i1 %c, label %t, label %f +t: + ret i32 0 +f: + %undef = add i32 undef, 1 + ret i32 %undef +} + +define i32 @optimize_undef_2(i1 %c) { +; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn +; IS__TUNIT____-LABEL: define {{[^@]+}}@optimize_undef_2 +; IS__TUNIT____-SAME: (i1 [[C:%.*]]) +; IS__TUNIT____-NEXT: br 
i1 [[C]], label [[T:%.*]], label [[F:%.*]] +; IS__TUNIT____: t: +; IS__TUNIT____-NEXT: ret i32 0 +; IS__TUNIT____: f: +; IS__TUNIT____-NEXT: ret i32 -1 +; +; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn +; IS__CGSCC____-LABEL: define {{[^@]+}}@optimize_undef_2 +; IS__CGSCC____-SAME: (i1 [[C:%.*]]) +; IS__CGSCC____-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] +; IS__CGSCC____: t: +; IS__CGSCC____-NEXT: ret i32 0 +; IS__CGSCC____: f: +; IS__CGSCC____-NEXT: ret i32 -1 +; + br i1 %c, label %t, label %f +t: + ret i32 0 +f: + %undef = sub i32 undef, 1 + ret i32 %undef +} + +define i32 @optimize_undef_3(i1 %c) { +; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn +; IS__TUNIT____-LABEL: define {{[^@]+}}@optimize_undef_3 +; IS__TUNIT____-SAME: (i1 [[C:%.*]]) +; IS__TUNIT____-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] +; IS__TUNIT____: t: +; IS__TUNIT____-NEXT: ret i32 0 +; IS__TUNIT____: f: +; IS__TUNIT____-NEXT: ret i32 1 +; +; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn +; IS__CGSCC____-LABEL: define {{[^@]+}}@optimize_undef_3 +; IS__CGSCC____-SAME: (i1 [[C:%.*]]) +; IS__CGSCC____-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] +; IS__CGSCC____: t: +; IS__CGSCC____-NEXT: ret i32 0 +; IS__CGSCC____: f: +; IS__CGSCC____-NEXT: ret i32 1 +; + br i1 %c, label %t, label %f +t: + ret i32 0 +f: + %undef = icmp eq i32 undef, 0 + %undef2 = zext i1 %undef to i32 + ret i32 %undef2 +} + + +; FIXME: returned value can be simplified to 0 +define i32 @potential_test11(i1 %c) { +; IS__TUNIT_OPM: Function Attrs: nofree nosync nounwind readnone willreturn +; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@potential_test11 +; IS__TUNIT_OPM-SAME: (i1 [[C:%.*]]) +; IS__TUNIT_OPM-NEXT: [[ZERO1:%.*]] = call i32 @optimize_undef_1(i1 [[C]]) [[ATTR0]], [[RNG2:!range !.*]] +; IS__TUNIT_OPM-NEXT: [[ZERO2:%.*]] = call i32 @optimize_undef_2(i1 [[C]]) [[ATTR0]], [[RNG3:!range !.*]] +; 
IS__TUNIT_OPM-NEXT: [[ACC1:%.*]] = add i32 [[ZERO1]], [[ZERO2]] +; IS__TUNIT_OPM-NEXT: [[ACC2:%.*]] = add i32 [[ACC1]], 0 +; IS__TUNIT_OPM-NEXT: ret i32 [[ACC2]] +; +; IS__TUNIT_NPM: Function Attrs: nofree nosync nounwind readnone willreturn +; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@potential_test11 +; IS__TUNIT_NPM-SAME: (i1 [[C:%.*]]) +; IS__TUNIT_NPM-NEXT: [[ZERO1:%.*]] = call i32 @optimize_undef_1(i1 [[C]]) [[ATTR0]], [[RNG0]] +; IS__TUNIT_NPM-NEXT: [[ZERO2:%.*]] = call i32 @optimize_undef_2(i1 [[C]]) [[ATTR0]], [[RNG3:!range !.*]] +; IS__TUNIT_NPM-NEXT: [[ACC1:%.*]] = add i32 [[ZERO1]], [[ZERO2]] +; IS__TUNIT_NPM-NEXT: [[ACC2:%.*]] = add i32 [[ACC1]], 0 +; IS__TUNIT_NPM-NEXT: ret i32 [[ACC2]] +; +; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn +; IS__CGSCC____-LABEL: define {{[^@]+}}@potential_test11 +; IS__CGSCC____-SAME: (i1 [[C:%.*]]) +; IS__CGSCC____-NEXT: [[ZERO1:%.*]] = call i32 @optimize_undef_1(i1 [[C]]) +; IS__CGSCC____-NEXT: [[ZERO2:%.*]] = call i32 @optimize_undef_2(i1 [[C]]) +; IS__CGSCC____-NEXT: [[ZERO3:%.*]] = call i32 @optimize_undef_3(i1 [[C]]) +; IS__CGSCC____-NEXT: [[ACC1:%.*]] = add i32 [[ZERO1]], [[ZERO2]] +; IS__CGSCC____-NEXT: [[ACC2:%.*]] = add i32 [[ACC1]], [[ZERO3]] +; IS__CGSCC____-NEXT: ret i32 [[ACC2]] +; + %zero1 = call i32 @optimize_undef_1(i1 %c) + %zero2 = call i32 @optimize_undef_2(i1 %c) + %zero3 = call i32 @optimize_undef_3(i1 %c) + %acc1 = add i32 %zero1, %zero2 + %acc2 = add i32 %acc1, %zero3 + ret i32 %acc2 +} + +define i32 @optimize_poison_1(i1 %c) { +; IS__TUNIT_OPM: Function Attrs: nofree nosync nounwind readnone willreturn +; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@optimize_poison_1 +; IS__TUNIT_OPM-SAME: (i1 [[C:%.*]]) +; IS__TUNIT_OPM-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] +; IS__TUNIT_OPM: t: +; IS__TUNIT_OPM-NEXT: ret i32 0 +; IS__TUNIT_OPM: f: +; IS__TUNIT_OPM-NEXT: ret i32 -1 +; +; IS__TUNIT_NPM: Function Attrs: nofree nosync nounwind readnone willreturn +; 
IS__TUNIT_NPM-LABEL: define {{[^@]+}}@optimize_poison_1 +; IS__TUNIT_NPM-SAME: (i1 [[C:%.*]]) +; IS__TUNIT_NPM-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] +; IS__TUNIT_NPM: t: +; IS__TUNIT_NPM-NEXT: ret i32 0 +; IS__TUNIT_NPM: f: +; IS__TUNIT_NPM-NEXT: ret i32 undef +; +; IS__CGSCC_OPM: Function Attrs: nofree norecurse nosync nounwind readnone willreturn +; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@optimize_poison_1 +; IS__CGSCC_OPM-SAME: (i1 [[C:%.*]]) +; IS__CGSCC_OPM-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] +; IS__CGSCC_OPM: t: +; IS__CGSCC_OPM-NEXT: ret i32 0 +; IS__CGSCC_OPM: f: +; IS__CGSCC_OPM-NEXT: ret i32 -1 +; +; IS__CGSCC_NPM: Function Attrs: nofree norecurse nosync nounwind readnone willreturn +; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@optimize_poison_1 +; IS__CGSCC_NPM-SAME: (i1 [[C:%.*]]) +; IS__CGSCC_NPM-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] +; IS__CGSCC_NPM: t: +; IS__CGSCC_NPM-NEXT: ret i32 0 +; IS__CGSCC_NPM: f: +; IS__CGSCC_NPM-NEXT: ret i32 undef +; + br i1 %c, label %t, label %f +t: + ret i32 0 +f: + %poison = sub nuw i32 0, 1 + ret i32 %poison +} + +; FIXME: returned value can be simplified to 0 +define i32 @potential_test12(i1 %c) { +; IS__TUNIT_OPM: Function Attrs: nofree nosync nounwind readnone willreturn +; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@potential_test12 +; IS__TUNIT_OPM-SAME: (i1 [[C:%.*]]) +; IS__TUNIT_OPM-NEXT: [[ZERO:%.*]] = call i32 @optimize_poison_1(i1 [[C]]) [[ATTR0]], [[RNG3]] +; IS__TUNIT_OPM-NEXT: ret i32 [[ZERO]] +; +; IS__TUNIT_NPM: Function Attrs: nofree nosync nounwind readnone willreturn +; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@potential_test12 +; IS__TUNIT_NPM-SAME: (i1 [[C:%.*]]) +; IS__TUNIT_NPM-NEXT: ret i32 0 +; +; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn +; IS__CGSCC____-LABEL: define {{[^@]+}}@potential_test12 +; IS__CGSCC____-SAME: (i1 [[C:%.*]]) +; IS__CGSCC____-NEXT: [[ZERO:%.*]] = call i32 @optimize_poison_1(i1 [[C]]) +; 
IS__CGSCC____-NEXT: ret i32 [[ZERO]] +; + %zero = call i32 @optimize_poison_1(i1 %c) + ret i32 %zero +} + +; Test 13 +; Do not simplify %ret in the callee to `%c`. +; The potential value of %c is {0, 1} (undef is merged). +; However, we should not simplify `and i32 %c, 3` to `%c` + +define internal i32 @potential_test13_callee(i32 %c) { +; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn +; IS__TUNIT____-LABEL: define {{[^@]+}}@potential_test13_callee +; IS__TUNIT____-SAME: (i32 [[C:%.*]]) +; IS__TUNIT____-NEXT: [[RET:%.*]] = and i32 [[C]], 3 +; IS__TUNIT____-NEXT: ret i32 [[RET]] +; +; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn +; IS__CGSCC____-LABEL: define {{[^@]+}}@potential_test13_callee +; IS__CGSCC____-SAME: (i32 [[C:%.*]]) +; IS__CGSCC____-NEXT: [[RET:%.*]] = and i32 [[C]], 3 +; IS__CGSCC____-NEXT: ret i32 [[RET]] +; + %ret = and i32 %c, 3 + ret i32 %ret +} + +define i32 @potential_test13_caller1() { +; IS__TUNIT_OPM: Function Attrs: nofree nosync nounwind readnone willreturn +; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@potential_test13_caller1() +; IS__TUNIT_OPM-NEXT: [[RET:%.*]] = call i32 @potential_test13_callee(i32 0) [[ATTR0]], [[RNG2]] +; IS__TUNIT_OPM-NEXT: ret i32 [[RET]] +; +; IS__TUNIT_NPM: Function Attrs: nofree nosync nounwind readnone willreturn +; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@potential_test13_caller1() +; IS__TUNIT_NPM-NEXT: [[RET:%.*]] = call i32 @potential_test13_callee(i32 0) [[ATTR0]], [[RNG0]] +; IS__TUNIT_NPM-NEXT: ret i32 [[RET]] +; +; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn +; IS__CGSCC____-LABEL: define {{[^@]+}}@potential_test13_caller1() +; IS__CGSCC____-NEXT: [[RET:%.*]] = call i32 @potential_test13_callee(i32 0) +; IS__CGSCC____-NEXT: ret i32 [[RET]] +; + %ret = call i32 @potential_test13_callee(i32 0) + ret i32 %ret +} + +define i32 @potential_test13_caller2() { +; IS__TUNIT_OPM: Function Attrs: nofree nosync 
nounwind readnone willreturn +; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@potential_test13_caller2() +; IS__TUNIT_OPM-NEXT: [[RET:%.*]] = call i32 @potential_test13_callee(i32 1) [[ATTR0]], [[RNG2]] +; IS__TUNIT_OPM-NEXT: ret i32 [[RET]] +; +; IS__TUNIT_NPM: Function Attrs: nofree nosync nounwind readnone willreturn +; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@potential_test13_caller2() +; IS__TUNIT_NPM-NEXT: [[RET:%.*]] = call i32 @potential_test13_callee(i32 1) [[ATTR0]], [[RNG0]] +; IS__TUNIT_NPM-NEXT: ret i32 [[RET]] +; +; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn +; IS__CGSCC____-LABEL: define {{[^@]+}}@potential_test13_caller2() +; IS__CGSCC____-NEXT: [[RET:%.*]] = call i32 @potential_test13_callee(i32 1) +; IS__CGSCC____-NEXT: ret i32 [[RET]] +; + %ret = call i32 @potential_test13_callee(i32 1) + ret i32 %ret +} + +define i32 @potential_test13_caller3() { +; IS__TUNIT_OPM: Function Attrs: nofree nosync nounwind readnone willreturn +; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@potential_test13_caller3() +; IS__TUNIT_OPM-NEXT: [[RET:%.*]] = call i32 @potential_test13_callee(i32 undef) [[ATTR0]], [[RNG2]] +; IS__TUNIT_OPM-NEXT: ret i32 [[RET]] +; +; IS__TUNIT_NPM: Function Attrs: nofree nosync nounwind readnone willreturn +; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@potential_test13_caller3() +; IS__TUNIT_NPM-NEXT: [[RET:%.*]] = call i32 @potential_test13_callee(i32 undef) [[ATTR0]], [[RNG0]] +; IS__TUNIT_NPM-NEXT: ret i32 [[RET]] +; +; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn +; IS__CGSCC____-LABEL: define {{[^@]+}}@potential_test13_caller3() +; IS__CGSCC____-NEXT: [[RET:%.*]] = call i32 @potential_test13_callee(i32 undef) +; IS__CGSCC____-NEXT: ret i32 [[RET]] +; + %ret = call i32 @potential_test13_callee(i32 undef) + ret i32 %ret +} + +define i1 @potential_test14(i1 %c0, i1 %c1, i1 %c2, i1 %c3) { +; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn +; 
IS__TUNIT____-LABEL: define {{[^@]+}}@potential_test14 +; IS__TUNIT____-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]], i1 [[C2:%.*]], i1 [[C3:%.*]]) +; IS__TUNIT____-NEXT: [[X0:%.*]] = select i1 [[C0]], i32 0, i32 1 +; IS__TUNIT____-NEXT: [[X1:%.*]] = select i1 [[C1]], i32 [[X0]], i32 undef +; IS__TUNIT____-NEXT: [[Y2:%.*]] = select i1 [[C2]], i32 0, i32 7 +; IS__TUNIT____-NEXT: [[Z3:%.*]] = select i1 [[C3]], i32 [[X1]], i32 [[Y2]] +; IS__TUNIT____-NEXT: [[RET:%.*]] = icmp slt i32 [[Z3]], 7 +; IS__TUNIT____-NEXT: ret i1 [[RET]] +; +; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn +; IS__CGSCC____-LABEL: define {{[^@]+}}@potential_test14 +; IS__CGSCC____-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]], i1 [[C2:%.*]], i1 [[C3:%.*]]) +; IS__CGSCC____-NEXT: [[X0:%.*]] = select i1 [[C0]], i32 0, i32 1 +; IS__CGSCC____-NEXT: [[X1:%.*]] = select i1 [[C1]], i32 [[X0]], i32 undef +; IS__CGSCC____-NEXT: [[Y2:%.*]] = select i1 [[C2]], i32 0, i32 7 +; IS__CGSCC____-NEXT: [[Z3:%.*]] = select i1 [[C3]], i32 [[X1]], i32 [[Y2]] +; IS__CGSCC____-NEXT: [[RET:%.*]] = icmp slt i32 [[Z3]], 7 +; IS__CGSCC____-NEXT: ret i1 [[RET]] +; + %x0 = select i1 %c0, i32 0, i32 1 + %x1 = select i1 %c1, i32 %x0, i32 undef + %y2 = select i1 %c2, i32 0, i32 7 + %z3 = select i1 %c3, i32 %x1, i32 %y2 + %ret = icmp slt i32 %z3, 7 + ret i1 %ret +} + +define i1 @potential_test15(i1 %c0, i1 %c1) { +; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn +; IS__TUNIT____-LABEL: define {{[^@]+}}@potential_test15 +; IS__TUNIT____-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]]) +; IS__TUNIT____-NEXT: ret i1 false +; +; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn +; IS__CGSCC____-LABEL: define {{[^@]+}}@potential_test15 +; IS__CGSCC____-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]]) +; IS__CGSCC____-NEXT: ret i1 false +; + %x0 = select i1 %c0, i32 0, i32 1 + %x1 = select i1 %c1, i32 %x0, i32 undef + %ret = icmp eq i32 %x1, 7 + ret i1 %ret +} + +define i1 
@potential_test16(i1 %c0, i1 %c1) { +; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn +; IS__TUNIT____-LABEL: define {{[^@]+}}@potential_test16 +; IS__TUNIT____-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]]) +; IS__TUNIT____-NEXT: ret i1 false +; +; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn +; IS__CGSCC____-LABEL: define {{[^@]+}}@potential_test16 +; IS__CGSCC____-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]]) +; IS__CGSCC____-NEXT: ret i1 false +; + %x0 = select i1 %c0, i32 0, i32 undef + %x1 = select i1 %c1, i32 %x0, i32 1 + %ret = icmp eq i32 %x1, 7 + ret i1 %ret +} + ; IS__TUNIT_NPM: !0 = !{i32 0, i32 2} ; IS__TUNIT_NPM: !1 = !{i32 1, i32 4} ; IS__TUNIT_NPM: !2 = !{i32 3, i32 5} -; IS__TUNIT_NPM: !3 = !{i32 -1, i32 2} +; IS__TUNIT_NPM: !3 = !{i32 -1, i32 1} ; IS__TUNIT_NPM-NOT: !4 ; IS__TUNIT_OPM: !0 = !{i32 1, i32 4} ; IS__TUNIT_OPM: !1 = !{i32 3, i32 5} -; IS__TUNIT_OPM: !2 = !{i32 -1, i32 2} -; IS__TUNIT_OPM-NOT: !3 +; IS__TUNIT_OPM: !2 = !{i32 0, i32 2} +; IS__TUNIT_OPM: !3 = !{i32 -1, i32 1} +; IS__TUNIT_OPM-NOT: !4 ; IS__CGSCC____-NOT: !0 diff --git a/llvm/test/Transforms/Attributor/value-simplify.ll b/llvm/test/Transforms/Attributor/value-simplify.ll index 7ae8cd3780117..125d76926d09f 100644 --- a/llvm/test/Transforms/Attributor/value-simplify.ll +++ b/llvm/test/Transforms/Attributor/value-simplify.ll @@ -644,12 +644,10 @@ for.end: } ; Check we merge undef and a constant properly. -; FIXME fold the addition and return the constant. 
define i8 @caller0() { ; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn ; IS__TUNIT____-LABEL: define {{[^@]+}}@caller0() -; IS__TUNIT____-NEXT: [[C:%.*]] = call i8 @callee() -; IS__TUNIT____-NEXT: ret i8 [[C]] +; IS__TUNIT____-NEXT: ret i8 49 ; ; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn ; IS__CGSCC____-LABEL: define {{[^@]+}}@caller0() @@ -662,8 +660,7 @@ define i8 @caller0() { define i8 @caller1() { ; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn ; IS__TUNIT____-LABEL: define {{[^@]+}}@caller1() -; IS__TUNIT____-NEXT: [[C:%.*]] = call i8 @callee() -; IS__TUNIT____-NEXT: ret i8 [[C]] +; IS__TUNIT____-NEXT: ret i8 49 ; ; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn ; IS__CGSCC____-LABEL: define {{[^@]+}}@caller1() @@ -676,8 +673,7 @@ define i8 @caller1() { define i8 @caller2() { ; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn ; IS__TUNIT____-LABEL: define {{[^@]+}}@caller2() -; IS__TUNIT____-NEXT: [[C:%.*]] = call i8 @callee() -; IS__TUNIT____-NEXT: ret i8 [[C]] +; IS__TUNIT____-NEXT: ret i8 49 ; ; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn ; IS__CGSCC____-LABEL: define {{[^@]+}}@caller2() @@ -690,8 +686,7 @@ define i8 @caller2() { define i8 @caller_middle() { ; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn ; IS__TUNIT____-LABEL: define {{[^@]+}}@caller_middle() -; IS__TUNIT____-NEXT: [[C:%.*]] = call i8 @callee() -; IS__TUNIT____-NEXT: ret i8 [[C]] +; IS__TUNIT____-NEXT: ret i8 49 ; ; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn ; IS__CGSCC____-LABEL: define {{[^@]+}}@caller_middle() @@ -704,8 +699,7 @@ define i8 @caller_middle() { define i8 @caller3() { ; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn ; IS__TUNIT____-LABEL: define {{[^@]+}}@caller3() -; 
IS__TUNIT____-NEXT: [[C:%.*]] = call i8 @callee() -; IS__TUNIT____-NEXT: ret i8 [[C]] +; IS__TUNIT____-NEXT: ret i8 49 ; ; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn ; IS__CGSCC____-LABEL: define {{[^@]+}}@caller3() @@ -718,8 +712,7 @@ define i8 @caller3() { define i8 @caller4() { ; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn ; IS__TUNIT____-LABEL: define {{[^@]+}}@caller4() -; IS__TUNIT____-NEXT: [[C:%.*]] = call i8 @callee() -; IS__TUNIT____-NEXT: ret i8 [[C]] +; IS__TUNIT____-NEXT: ret i8 49 ; ; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn ; IS__CGSCC____-LABEL: define {{[^@]+}}@caller4() @@ -730,15 +723,9 @@ define i8 @caller4() { ret i8 %c } define internal i8 @callee(i8 %a) { -; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn -; IS__TUNIT____-LABEL: define {{[^@]+}}@callee() -; IS__TUNIT____-NEXT: [[C:%.*]] = add i8 42, 7 -; IS__TUNIT____-NEXT: ret i8 [[C]] -; ; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn ; IS__CGSCC____-LABEL: define {{[^@]+}}@callee() -; IS__CGSCC____-NEXT: [[C:%.*]] = add i8 42, 7 -; IS__CGSCC____-NEXT: ret i8 [[C]] +; IS__CGSCC____-NEXT: ret i8 49 ; %c = add i8 %a, 7 ret i8 %c diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/debug-counter.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/debug-counter.ll index 9def782900899..4171b714d1be9 100644 --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/debug-counter.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/debug-counter.ll @@ -1,5 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; XFAIL: * + ; REQUIRES: asserts ; Eliminates store to %R in the entry block. 
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll index 5ee6fe1f69d5c..ae3066192a001 100644 --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll @@ -1,4 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py + +; XFAIL: * + ; RUN: opt < %s -basic-aa -dse -enable-dse-memoryssa -S | FileCheck --check-prefix=NO-LIMIT %s ; RUN: opt < %s -basic-aa -dse -enable-dse-memoryssa -dse-memoryssa-scanlimit=0 -S | FileCheck --check-prefix=LIMIT-0 %s ; RUN: opt < %s -basic-aa -dse -enable-dse-memoryssa -dse-memoryssa-scanlimit=2 -S | FileCheck --check-prefix=LIMIT-2 %s diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-captures.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-captures.ll index e28713929e9ab..fc3e99723d6e6 100644 --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-captures.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-captures.ll @@ -12,7 +12,7 @@ declare void @capture(i8*) define i8* @test_return_captures_1() { ; CHECK-LABEL: @test_return_captures_1( ; CHECK-NEXT: [[M:%.*]] = call i8* @malloc(i64 24) -; CHECK-NEXT: store i8 1, i8* [[M]] +; CHECK-NEXT: store i8 1, i8* [[M]], align 1 ; CHECK-NEXT: ret i8* [[M]] ; %m = call i8* @malloc(i64 24) @@ -27,7 +27,7 @@ define i8* @test_return_captures_2() { ; CHECK-NEXT: [[M:%.*]] = call i8* @malloc(i64 24) ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: exit: -; CHECK-NEXT: store i8 1, i8* [[M]] +; CHECK-NEXT: store i8 1, i8* [[M]], align 1 ; CHECK-NEXT: ret i8* [[M]] ; %m = call i8* @malloc(i64 24) @@ -49,8 +49,8 @@ define void @test_malloc_capture_1(%S1* %E) { ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: exit: ; CHECK-NEXT: [[F_PTR:%.*]] = getelementptr [[S1:%.*]], %S1* [[E:%.*]], i32 0, i32 0 -; CHECK-NEXT: store i8* [[M]], 
i8** [[F_PTR]] -; CHECK-NEXT: store i8 1, i8* [[M]] +; CHECK-NEXT: store i8* [[M]], i8** [[F_PTR]], align 4 +; CHECK-NEXT: store i8 1, i8* [[M]], align 1 ; CHECK-NEXT: ret void ; %m = call i8* @malloc(i64 24) @@ -68,11 +68,11 @@ exit: define i8* @test_malloc_capture_2() { ; CHECK-LABEL: @test_malloc_capture_2( ; CHECK-NEXT: [[M:%.*]] = call i8* @malloc(i64 24) -; CHECK-NEXT: store i8 0, i8* [[M]] +; CHECK-NEXT: store i8 0, i8* [[M]], align 1 ; CHECK-NEXT: call void @capture(i8* [[M]]) ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: exit: -; CHECK-NEXT: store i8 1, i8* [[M]] +; CHECK-NEXT: store i8 1, i8* [[M]], align 1 ; CHECK-NEXT: ret i8* [[M]] ; %m = call i8* @malloc(i64 24) @@ -92,7 +92,7 @@ define i8* @test_malloc_capture_3() { ; CHECK-NEXT: [[M:%.*]] = call i8* @malloc(i64 24) ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: exit: -; CHECK-NEXT: store i8 1, i8* [[M]] +; CHECK-NEXT: store i8 1, i8* [[M]], align 1 ; CHECK-NEXT: call void @capture(i8* [[M]]) ; CHECK-NEXT: ret i8* [[M]] ; @@ -111,11 +111,11 @@ exit: define i8* @test_malloc_capture_4() { ; CHECK-LABEL: @test_malloc_capture_4( ; CHECK-NEXT: [[M:%.*]] = call i8* @malloc(i64 24) -; CHECK-NEXT: store i8 0, i8* [[M]] +; CHECK-NEXT: store i8 0, i8* [[M]], align 1 ; CHECK-NEXT: call void @may_throw_readnone() ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: exit: -; CHECK-NEXT: store i8 1, i8* [[M]] +; CHECK-NEXT: store i8 1, i8* [[M]], align 1 ; CHECK-NEXT: call void @capture(i8* [[M]]) ; CHECK-NEXT: ret i8* [[M]] ; @@ -138,11 +138,11 @@ define i8* @test_malloc_capture_5() { ; CHECK-LABEL: @test_malloc_capture_5( ; CHECK-NEXT: [[M:%.*]] = call i8* @malloc(i64 24) ; CHECK-NEXT: call void @capture(i8* [[M]]) -; CHECK-NEXT: store i8 0, i8* [[M]] +; CHECK-NEXT: store i8 0, i8* [[M]], align 1 ; CHECK-NEXT: call void @may_throw_readnone() ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: exit: -; CHECK-NEXT: store i8 1, i8* [[M]] +; CHECK-NEXT: store i8 1, i8* [[M]], align 1 ; CHECK-NEXT: ret i8* [[M]] ; @@ -164,11 +164,11 
@@ exit: define i8* @test_malloc_capture_6() { ; CHECK-LABEL: @test_malloc_capture_6( ; CHECK-NEXT: [[M:%.*]] = call i8* @malloc(i64 24) -; CHECK-NEXT: store i8 0, i8* [[M]] +; CHECK-NEXT: store i8 0, i8* [[M]], align 1 ; CHECK-NEXT: call void @may_throw_readnone() ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: exit: -; CHECK-NEXT: store i8 1, i8* [[M]] +; CHECK-NEXT: store i8 1, i8* [[M]], align 1 ; CHECK-NEXT: call void @capture(i8* [[M]]) ; CHECK-NEXT: ret i8* [[M]] ; @@ -184,15 +184,20 @@ exit: ret i8* %m } -; We can remove the first store 'store i8 0, i8* %m' even though there is a +; We *could* remove the first store 'store i8 0, i8* %m' even though there is a ; throwing instruction between them, because %m escapes after the killing store. +; But this would require using PointerMayBeCapturedBefore in +; isInvisibleToCallerBeforeRet, which we currently do not do to limit +; compile-time, as this appears to hardly ever lead to more stores eliminated +; in practice. define i8* @test_malloc_capture_7() { ; CHECK-LABEL: @test_malloc_capture_7( ; CHECK-NEXT: [[M:%.*]] = call i8* @malloc(i64 24) +; CHECK-NEXT: store i8 0, i8* [[M]], align 1 ; CHECK-NEXT: call void @may_throw() ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: exit: -; CHECK-NEXT: store i8 1, i8* [[M]] +; CHECK-NEXT: store i8 1, i8* [[M]], align 1 ; CHECK-NEXT: call void @capture(i8* [[M]]) ; CHECK-NEXT: ret i8* [[M]] ; @@ -229,8 +234,8 @@ exit: ; Cannot remove first store i8 0, i8* %m, as the call to @capture captures the object. define void @test_alloca_capture_1() { ; CHECK-LABEL: @test_alloca_capture_1( -; CHECK-NEXT: [[M:%.*]] = alloca i8 -; CHECK-NEXT: store i8 0, i8* [[M]] +; CHECK-NEXT: [[M:%.*]] = alloca i8, align 1 +; CHECK-NEXT: store i8 0, i8* [[M]], align 1 ; CHECK-NEXT: call void @capture(i8* [[M]]) ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: exit: @@ -250,11 +255,11 @@ exit: ; becomes invalid after the function returns. 
define void @test_alloca_capture_2(%S1* %E) { ; CHECK-LABEL: @test_alloca_capture_2( -; CHECK-NEXT: [[M:%.*]] = alloca i8 +; CHECK-NEXT: [[M:%.*]] = alloca i8, align 1 ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: exit: ; CHECK-NEXT: [[F_PTR:%.*]] = getelementptr [[S1:%.*]], %S1* [[E:%.*]], i32 0, i32 0 -; CHECK-NEXT: store i8* [[M]], i8** [[F_PTR]] +; CHECK-NEXT: store i8* [[M]], i8** [[F_PTR]], align 4 ; CHECK-NEXT: ret void ; %m = alloca i8 diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-multipath.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-multipath.ll index 28f81015c799a..8413251036676 100644 --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-multipath.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-multipath.ll @@ -406,7 +406,6 @@ define void @accessible_after_return11_loop() { ; CHECK-NEXT: [[C_1:%.*]] = call i1 @cond() ; CHECK-NEXT: br i1 [[C_1]], label [[FOR_BODY_I]], label [[INIT_PARSE_EXIT:%.*]] ; CHECK: init_parse.exit: -; CHECK-NEXT: store i32 0, i32* @linenum, align 4 ; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 16, i8* nonnull undef) ; CHECK-NEXT: store i32 0, i32* @linenum, align 4 ; CHECK-NEXT: br label [[FOR_BODY_I20:%.*]] diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/pr47285-not-overwritten-on-all-exit-paths.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/pr47285-not-overwritten-on-all-exit-paths.ll new file mode 100644 index 0000000000000..aaff809d38d0b --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/pr47285-not-overwritten-on-all-exit-paths.ll @@ -0,0 +1,91 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -dse -enable-dse-memoryssa -S %s | FileCheck %s + +@b = local_unnamed_addr global i32 0, align 4 + +; Reduced test case for PR47285. 
+ +; `store i32 9, i32* @b` in %interesting is not killed by `store i32 23, i32* @b` +; in %killer, because it is not overwritten before reaching the end of the +; function via %bb.2 -> %no.overwrite.exit. + +define void @test(i1 %c.0, i1 %c.2, i1 %c.3, i1 %c.4, i1 %c.5, i1 %c.6) { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[BB_2:%.*]] +; CHECK: bb.2: +; CHECK-NEXT: br i1 [[C_0:%.*]], label [[BB_3:%.*]], label [[NO_OVERWRITE_EXIT:%.*]] +; CHECK: no.overwrite.exit: +; CHECK-NEXT: ret void +; CHECK: bb.3: +; CHECK-NEXT: br i1 [[C_2:%.*]], label [[BB_4:%.*]], label [[BB_7:%.*]] +; CHECK: bb.4: +; CHECK-NEXT: br i1 [[C_4:%.*]], label [[BB_5:%.*]], label [[BB_6:%.*]] +; CHECK: bb.5: +; CHECK-NEXT: store i32 99, i32* @b, align 4 +; CHECK-NEXT: br i1 [[C_3:%.*]], label [[BB_5]], label [[BB_2]] +; CHECK: bb.6: +; CHECK-NEXT: store i32 91, i32* @b, align 4 +; CHECK-NEXT: br i1 [[C_5:%.*]], label [[SPLIT_CRIT_EDGE_2:%.*]], label [[BB_2]] +; CHECK: split_crit_edge.2: +; CHECK-NEXT: store i32 27, i32* @b, align 4 +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: bb.7: +; CHECK-NEXT: br i1 [[C_4]], label [[INTERESTING:%.*]], label [[BB_8:%.*]] +; CHECK: interesting: +; CHECK-NEXT: store i32 9, i32* @b, align 4 +; CHECK-NEXT: br i1 [[C_6:%.*]], label [[KILLER:%.*]], label [[BB_2]] +; CHECK: killer: +; CHECK-NEXT: store i32 23, i32* @b, align 4 +; CHECK-NEXT: ret void +; CHECK: bb.8: +; CHECK-NEXT: store i32 19, i32* @b, align 4 +; CHECK-NEXT: br i1 [[C_4]], label [[EXIT]], label [[BB_2]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %bb.2 + +bb.2: ; preds = %bb.8, %interesting, %bb.6, %bb.5, %entry + br i1 %c.0, label %bb.3, label %no.overwrite.exit + +no.overwrite.exit: ; preds = %bb.2 + ret void + +bb.3: ; preds = %bb.2 + br i1 %c.2, label %bb.4, label %bb.7 + +bb.4: ; preds = %bb.3 + br i1 %c.4, label %bb.5, label %bb.6 + +bb.5: ; preds = %bb.5, %bb.4 + store i32 99, i32* @b, align 4 + br i1 %c.3, label %bb.5, label %bb.2 + +bb.6: ; 
preds = %bb.4 + store i32 91, i32* @b, align 4 + br i1 %c.5, label %split_crit_edge.2, label %bb.2 + +split_crit_edge.2: ; preds = %bb.6 + store i32 27, i32* @b, align 4 + br label %exit + +bb.7: ; preds = %bb.3 + br i1 %c.4, label %interesting, label %bb.8 + +interesting: ; preds = %bb.7 + store i32 9, i32* @b, align 4 + br i1 %c.6, label %killer, label %bb.2 + +killer: ; preds = %interesting + store i32 23, i32* @b, align 4 + ret void + +bb.8: ; preds = %bb.7 + store i32 19, i32* @b, align 4 + br i1 %c.4, label %exit, label %bb.2 + +exit: ; preds = %bb.8, %split_crit_edge.2 + ret void +} diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll index 0c83a750a6a88..0cceb5ac4a73d 100644 --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll @@ -285,8 +285,15 @@ define void @test21() { ret void } +; Currently elimination of stores at the end of a function is limited to a +; single underlying object, for compile-time. This case appears to not be +; very important in practice. 
define void @test22(i1 %i, i32 %k, i32 %m) nounwind { ; CHECK-LABEL: @test22( +; CHECK-NEXT: [[K_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[M_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[K_ADDR_M_ADDR:%.*]] = select i1 [[I:%.*]], i32* [[K_ADDR]], i32* [[M_ADDR]] +; CHECK-NEXT: store i32 0, i32* [[K_ADDR_M_ADDR]], align 4 ; CHECK-NEXT: ret void ; %k.addr = alloca i32 @@ -305,7 +312,7 @@ define noalias i8* @test23() nounwind uwtable ssp { ; CHECK-NEXT: store i8 97, i8* [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [2 x i8], [2 x i8]* [[X]], i64 0, i64 1 ; CHECK-NEXT: store i8 0, i8* [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CALL:%.*]] = call i8* @strdup(i8* [[ARRAYIDX]]) #3 +; CHECK-NEXT: [[CALL:%.*]] = call i8* @strdup(i8* [[ARRAYIDX]]) [[ATTR3:#.*]] ; CHECK-NEXT: ret i8* [[CALL]] ; %x = alloca [2 x i8], align 1 @@ -343,7 +350,7 @@ define i8* @test25(i8* %p) nounwind { ; CHECK-NEXT: [[P_4:%.*]] = getelementptr i8, i8* [[P:%.*]], i64 4 ; CHECK-NEXT: [[TMP:%.*]] = load i8, i8* [[P_4]], align 1 ; CHECK-NEXT: store i8 0, i8* [[P_4]], align 1 -; CHECK-NEXT: [[Q:%.*]] = call i8* @strdup(i8* [[P]]) #6 +; CHECK-NEXT: [[Q:%.*]] = call i8* @strdup(i8* [[P]]) [[ATTR6:#.*]] ; CHECK-NEXT: store i8 [[TMP]], i8* [[P_4]], align 1 ; CHECK-NEXT: ret i8* [[Q]] ; @@ -711,7 +718,7 @@ define void @test44_volatile(i32* %P) { define void @test45_volatile(i32* %P) { ; CHECK-LABEL: @test45_volatile( -; CHECK-NEXT: store volatile i32 2, i32* [[P]], align 4 +; CHECK-NEXT: store volatile i32 2, i32* [[P:%.*]], align 4 ; CHECK-NEXT: store volatile i32 3, i32* [[P]], align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/EarlyCSE/invariant.start.ll b/llvm/test/Transforms/EarlyCSE/invariant.start.ll index d26ba496223f9..2202c09c1a0e9 100644 --- a/llvm/test/Transforms/EarlyCSE/invariant.start.ll +++ b/llvm/test/Transforms/EarlyCSE/invariant.start.ll @@ -525,3 +525,5 @@ define i32 @test_invariant_load_scope(i32* %p) { %sub = sub i32 %v1, %v2 ret 
i32 %sub } + +; USE_ASSUME: declare void @llvm.assume(i1 noundef) diff --git a/llvm/test/Transforms/EarlyCSE/phi.ll b/llvm/test/Transforms/EarlyCSE/phi.ll new file mode 100644 index 0000000000000..efe08a63948ed --- /dev/null +++ b/llvm/test/Transforms/EarlyCSE/phi.ll @@ -0,0 +1,324 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -early-cse -S < %s | FileCheck %s +; RUN: opt -basic-aa -early-cse-memssa -S < %s | FileCheck %s + +; Most basic case, fully identical PHI nodes +define void @test0(i32 %v0, i32 %v1, i1 %c, i32* %d0, i32* %d1) { +; CHECK-LABEL: @test0( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[B0:%.*]], label [[B1:%.*]] +; CHECK: b0: +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: b1: +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[I0:%.*]] = phi i32 [ [[V0:%.*]], [[B0]] ], [ [[V1:%.*]], [[B1]] ] +; CHECK-NEXT: store i32 [[I0]], i32* [[D0:%.*]], align 4 +; CHECK-NEXT: store i32 [[I0]], i32* [[D1:%.*]], align 4 +; CHECK-NEXT: ret void +; +entry: + br i1 %c, label %b0, label %b1 + +b0: + br label %end + +b1: + br label %end + +end: + %i0 = phi i32 [ %v0, %b0 ], [ %v1, %b1 ] + %i1 = phi i32 [ %v0, %b0 ], [ %v1, %b1 ] + store i32 %i0, i32* %d0 + store i32 %i1, i32* %d1 + ret void +} + +; Fully identical PHI nodes, but order of operands differs +define void @test1(i32 %v0, i32 %v1, i1 %c, i32* %d0, i32* %d1) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[B0:%.*]], label [[B1:%.*]] +; CHECK: b0: +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: b1: +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[I0:%.*]] = phi i32 [ [[V0:%.*]], [[B0]] ], [ [[V1:%.*]], [[B1]] ] +; CHECK-NEXT: store i32 [[I0]], i32* [[D0:%.*]], align 4 +; CHECK-NEXT: store i32 [[I0]], i32* [[D1:%.*]], align 4 +; CHECK-NEXT: ret void +; +entry: + br i1 %c, label %b0, label %b1 + +b0: + br label %end + +b1: + br label %end + +end: + %i0 = phi i32 [ %v0, %b0 ], [ %v1, %b1 ] 
+ %i1 = phi i32 [ %v1, %b1 ], [ %v0, %b0 ] + store i32 %i0, i32* %d0 + store i32 %i1, i32* %d1 + ret void +} + +; Different incoming values in second PHI +define void @negative_test2(i32 %v0, i32 %v1, i32 %v2, i1 %c, i32* %d0, i32* %d1) { +; CHECK-LABEL: @negative_test2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[B0:%.*]], label [[B1:%.*]] +; CHECK: b0: +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: b1: +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[I0:%.*]] = phi i32 [ [[V0:%.*]], [[B0]] ], [ [[V1:%.*]], [[B1]] ] +; CHECK-NEXT: [[I1:%.*]] = phi i32 [ [[V0]], [[B0]] ], [ [[V2:%.*]], [[B1]] ] +; CHECK-NEXT: store i32 [[I0]], i32* [[D0:%.*]], align 4 +; CHECK-NEXT: store i32 [[I1]], i32* [[D1:%.*]], align 4 +; CHECK-NEXT: ret void +; +entry: + br i1 %c, label %b0, label %b1 + +b0: + br label %end + +b1: + br label %end + +end: + %i0 = phi i32 [ %v0, %b0 ], [ %v1, %b1 ] + %i1 = phi i32 [ %v0, %b0 ], [ %v2, %b1 ] ; from %b0 takes %v2 instead of %v1 + store i32 %i0, i32* %d0 + store i32 %i1, i32* %d1 + ret void +} +define void @negative_test3(i32 %v0, i32 %v1, i32 %v2, i1 %c, i32* %d0, i32* %d1) { +; CHECK-LABEL: @negative_test3( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[B0:%.*]], label [[B1:%.*]] +; CHECK: b0: +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: b1: +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[I0:%.*]] = phi i32 [ [[V0:%.*]], [[B0]] ], [ [[V1:%.*]], [[B1]] ] +; CHECK-NEXT: [[I1:%.*]] = phi i32 [ [[V2:%.*]], [[B1]] ], [ [[V0]], [[B0]] ] +; CHECK-NEXT: store i32 [[I0]], i32* [[D0:%.*]], align 4 +; CHECK-NEXT: store i32 [[I1]], i32* [[D1:%.*]], align 4 +; CHECK-NEXT: ret void +; +entry: + br i1 %c, label %b0, label %b1 + +b0: + br label %end + +b1: + br label %end + +end: + %i0 = phi i32 [ %v0, %b0 ], [ %v1, %b1 ] + %i1 = phi i32 [ %v2, %b1 ], [ %v0, %b0 ] ; from %b0 takes %v2 instead of %v1 + store i32 %i0, i32* %d0 + store i32 %i1, i32* %d1 + ret void +} +define void @negative_test4(i32 
%v0, i32 %v1, i1 %c, i32* %d0, i32* %d1) { +; CHECK-LABEL: @negative_test4( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[B0:%.*]], label [[B1:%.*]] +; CHECK: b0: +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: b1: +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[I0:%.*]] = phi i32 [ [[V0:%.*]], [[B0]] ], [ [[V1:%.*]], [[B1]] ] +; CHECK-NEXT: store i32 [[I0]], i32* [[D0:%.*]], align 4 +; CHECK-NEXT: store i32 [[I0]], i32* [[D1:%.*]], align 4 +; CHECK-NEXT: ret void +; +entry: + br i1 %c, label %b0, label %b1 + +b0: + br label %end + +b1: + br label %end + +end: + %i0 = phi i32 [ %v0, %b0 ], [ %v1, %b1 ] + %i1 = phi i32 [ %v1, %b1 ], [ %v0, %b0 ] ; incoming values are swapped + store i32 %i0, i32* %d0 + store i32 %i1, i32* %d1 + ret void +} + +; Both PHI's are identical, but the first one has no uses, so ignore it. +define void @test5(i32 %v0, i32 %v1, i1 %c, i32* %d0, i32* %d1) { +; CHECK-LABEL: @test5( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[B0:%.*]], label [[B1:%.*]] +; CHECK: b0: +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: b1: +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[I1:%.*]] = phi i32 [ [[V0:%.*]], [[B0]] ], [ [[V1:%.*]], [[B1]] ] +; CHECK-NEXT: store i32 [[I1]], i32* [[D1:%.*]], align 4 +; CHECK-NEXT: ret void +; +entry: + br i1 %c, label %b0, label %b1 + +b0: + br label %end + +b1: + br label %end + +end: + %i0 = phi i32 [ %v0, %b0 ], [ %v1, %b1 ] ; unused + %i1 = phi i32 [ %v0, %b0 ], [ %v1, %b1 ] + store i32 %i1, i32* %d1 + ret void +} +; Second PHI has no uses +define void @test6(i32 %v0, i32 %v1, i1 %c, i32* %d0, i32* %d1) { +; CHECK-LABEL: @test6( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[B0:%.*]], label [[B1:%.*]] +; CHECK: b0: +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: b1: +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[I0:%.*]] = phi i32 [ [[V0:%.*]], [[B0]] ], [ [[V1:%.*]], [[B1]] ] +; CHECK-NEXT: store i32 [[I0]], i32* [[D0:%.*]], align 
4 +; CHECK-NEXT: ret void +; +entry: + br i1 %c, label %b0, label %b1 + +b0: + br label %end + +b1: + br label %end + +end: + %i0 = phi i32 [ %v0, %b0 ], [ %v1, %b1 ] + %i1 = phi i32 [ %v0, %b0 ], [ %v1, %b1 ] ; unused + store i32 %i0, i32* %d0 + ret void +} + +; Non-matching PHI node should be ignored without terminating CSE. +define void @test7(i32 %v0, i32 %v1, i16 %v2, i16 %v3, i1 %c, i32* %d0, i32* %d1, i16* %d2) { +; CHECK-LABEL: @test7( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[B0:%.*]], label [[B1:%.*]] +; CHECK: b0: +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: b1: +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[IBAD:%.*]] = phi i16 [ [[V2:%.*]], [[B0]] ], [ [[V3:%.*]], [[B1]] ] +; CHECK-NEXT: [[I0:%.*]] = phi i32 [ [[V0:%.*]], [[B0]] ], [ [[V1:%.*]], [[B1]] ] +; CHECK-NEXT: store i32 [[I0]], i32* [[D0:%.*]], align 4 +; CHECK-NEXT: store i32 [[I0]], i32* [[D1:%.*]], align 4 +; CHECK-NEXT: store i16 [[IBAD]], i16* [[D2:%.*]], align 2 +; CHECK-NEXT: ret void +; +entry: + br i1 %c, label %b0, label %b1 + +b0: + br label %end + +b1: + br label %end + +end: + %iBAD = phi i16 [ %v2, %b0 ], [ %v3, %b1 ] + %i0 = phi i32 [ %v0, %b0 ], [ %v1, %b1 ] + %i1 = phi i32 [ %v0, %b0 ], [ %v1, %b1 ] + store i32 %i0, i32* %d0 + store i32 %i1, i32* %d1 + store i16 %iBAD, i16* %d2 + ret void +} +define void @test8(i32 %v0, i32 %v1, i16 %v2, i16 %v3, i1 %c, i32* %d0, i32* %d1, i16* %d2) { +; CHECK-LABEL: @test8( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[B0:%.*]], label [[B1:%.*]] +; CHECK: b0: +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: b1: +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[I0:%.*]] = phi i32 [ [[V0:%.*]], [[B0]] ], [ [[V1:%.*]], [[B1]] ] +; CHECK-NEXT: [[IBAD:%.*]] = phi i16 [ [[V2:%.*]], [[B0]] ], [ [[V3:%.*]], [[B1]] ] +; CHECK-NEXT: store i32 [[I0]], i32* [[D0:%.*]], align 4 +; CHECK-NEXT: store i32 [[I0]], i32* [[D1:%.*]], align 4 +; CHECK-NEXT: store i16 [[IBAD]], i16* [[D2:%.*]], align 
2 +; CHECK-NEXT: ret void +; +entry: + br i1 %c, label %b0, label %b1 + +b0: + br label %end + +b1: + br label %end + +end: + %i0 = phi i32 [ %v0, %b0 ], [ %v1, %b1 ] + %iBAD = phi i16 [ %v2, %b0 ], [ %v3, %b1 ] + %i1 = phi i32 [ %v0, %b0 ], [ %v1, %b1 ] + store i32 %i0, i32* %d0 + store i32 %i1, i32* %d1 + store i16 %iBAD, i16* %d2 + ret void +} +define void @test9(i32 %v0, i32 %v1, i16 %v2, i16 %v3, i1 %c, i32* %d0, i32* %d1, i16* %d2) { +; CHECK-LABEL: @test9( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[B0:%.*]], label [[B1:%.*]] +; CHECK: b0: +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: b1: +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[I0:%.*]] = phi i32 [ [[V0:%.*]], [[B0]] ], [ [[V1:%.*]], [[B1]] ] +; CHECK-NEXT: [[IBAD:%.*]] = phi i16 [ [[V2:%.*]], [[B0]] ], [ [[V3:%.*]], [[B1]] ] +; CHECK-NEXT: store i32 [[I0]], i32* [[D0:%.*]], align 4 +; CHECK-NEXT: store i32 [[I0]], i32* [[D1:%.*]], align 4 +; CHECK-NEXT: store i16 [[IBAD]], i16* [[D2:%.*]], align 2 +; CHECK-NEXT: ret void +; +entry: + br i1 %c, label %b0, label %b1 + +b0: + br label %end + +b1: + br label %end + +end: + %i0 = phi i32 [ %v0, %b0 ], [ %v1, %b1 ] + %i1 = phi i32 [ %v0, %b0 ], [ %v1, %b1 ] + %iBAD = phi i16 [ %v2, %b0 ], [ %v3, %b1 ] + store i32 %i0, i32* %d0 + store i32 %i1, i32* %d1 + store i16 %iBAD, i16* %d2 + ret void +} diff --git a/llvm/test/Transforms/Inline/ML/Inputs/test_output_spec.json b/llvm/test/Transforms/Inline/ML/Inputs/test_output_spec.json index bd6a19c9572b5..2a70e3afc9639 100644 --- a/llvm/test/Transforms/Inline/ML/Inputs/test_output_spec.json +++ b/llvm/test/Transforms/Inline/ML/Inputs/test_output_spec.json @@ -4,7 +4,7 @@ "tensor_spec": { "name": "StatefulPartitionedCall", "port": 0, - "type": "int64", + "type": "int64_t", "shape": [ 1 ] @@ -15,7 +15,7 @@ "tensor_spec": { "name": "StatefulPartitionedCall", "port": 0, - "type": "int64", + "type": "int64_t", "shape": [ 1 ] diff --git 
a/llvm/test/Transforms/Inline/ML/development-training-log.ll b/llvm/test/Transforms/Inline/ML/development-training-log.ll index 77347455958b1..82dea452497dd 100644 --- a/llvm/test/Transforms/Inline/ML/development-training-log.ll +++ b/llvm/test/Transforms/Inline/ML/development-training-log.ll @@ -1,8 +1,10 @@ ; Test that we can produce a log if we have or do not have a model, in development mode. ; REQUIRES: have_tf_api ; RUN: opt -enable-ml-inliner=development -passes=scc-oz-module-inliner -training-log=- -ml-inliner-model-under-training=%S/../../../../lib/Analysis/models/inliner -ml-inliner-ir2native-model=%S/../../../../unittests/Analysis/Inputs/ir2native_x86_64_model -S < %s | FileCheck %s -; RUN: opt -enable-ml-inliner=development -passes=scc-oz-module-inliner -training-log=- -ml-inliner-model-under-training=%S/../../../../lib/Analysis/models/inliner -ml-inliner-ir2native-model=%S/../../../../unittests/Analysis/Inputs/ir2native_x86_64_model -ml-inliner-output-spec-override=%S/Inputs/test_output_spec.json -S < %s | FileCheck %s --check-prefix=EXTRA-OUTPUTS +; RUN: opt -enable-ml-inliner=development -passes=scc-oz-module-inliner -training-log=- -ml-inliner-model-under-training=%S/../../../../lib/Analysis/models/inliner -ml-inliner-ir2native-model=%S/../../../../unittests/Analysis/Inputs/ir2native_x86_64_model -ml-inliner-output-spec-override=%S/Inputs/test_output_spec.json -S < %s | FileCheck %s --check-prefixes=EXTRA-OUTPUTS,CHECK ; RUN: opt -enable-ml-inliner=development -passes=scc-oz-module-inliner -training-log=- -ml-inliner-ir2native-model=%S/../../../../unittests/Analysis/Inputs/ir2native_x86_64_model -S < %s | FileCheck %s +; RUN: opt -enable-ml-inliner=development -passes=scc-oz-module-inliner -training-log=- -ml-inliner-model-under-training=%S/../../../../lib/Analysis/models/inliner -S < %s | FileCheck %s --check-prefix=NOREWARD +; RUN: opt -enable-ml-inliner=development -passes=scc-oz-module-inliner -training-log=- -S < %s | FileCheck %s 
--check-prefix=NOREWARD target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-pc-linux-gnu" @@ -50,6 +52,7 @@ define dso_local i32 @top() { ; CHECK-NEXT: feature: { int64_list: { value: [0] } } ; CHECK-NEXT: } ; CHECK-NEXT: } +; NOREWARD-NOT: key: "delta_size" value: { ; CHECK-NOT: fake_extra_output ; EXTRA-OUTPUTS: key: "fake_extra_output" value: { ; EXTRA-OUTPUTS-NEXT: feature: { int64_list: { value: [1] } } diff --git a/llvm/test/Transforms/Inline/always-inline.ll b/llvm/test/Transforms/Inline/always-inline.ll index 81cbb73fbf59d..0fcf956199c46 100644 --- a/llvm/test/Transforms/Inline/always-inline.ll +++ b/llvm/test/Transforms/Inline/always-inline.ll @@ -1,15 +1,17 @@ -; RUN: opt < %s -inline-threshold=0 -always-inline -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-CALL +; RUN: opt < %s -inline-threshold=0 -always-inline -enable-new-pm=0 -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-CALL ; ; Ensure the threshold has no impact on these decisions. -; RUN: opt < %s -inline-threshold=20000000 -always-inline -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-CALL -; RUN: opt < %s -inline-threshold=-20000000 -always-inline -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-CALL +; RUN: opt < %s -inline-threshold=20000000 -always-inline -enable-new-pm=0 -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-CALL +; RUN: opt < %s -inline-threshold=-20000000 -always-inline -enable-new-pm=0 -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-CALL ; ; The new pass manager doesn't re-use any threshold based infrastructure for ; the always inliner, but test that we get the correct result. The new PM ; always inliner also doesn't support inlining call-site alwaysinline ; annotations. It isn't clear that this is a reasonable use case for ; 'alwaysinline'. 
-; RUN: opt < %s -passes=always-inline -S | FileCheck %s --check-prefix=CHECK +; RUN: opt < %s -inline-threshold=0 -passes=always-inline -S | FileCheck %s --check-prefix=CHECK +; RUN: opt < %s -inline-threshold=20000000 -passes=always-inline -S | FileCheck %s --check-prefix=CHECK +; RUN: opt < %s -inline-threshold=-20000000 -passes=always-inline -S | FileCheck %s --check-prefix=CHECK define internal i32 @inner1() alwaysinline { ; CHECK-NOT: @inner1( diff --git a/llvm/test/Transforms/Inline/externally_available.ll b/llvm/test/Transforms/Inline/externally_available.ll index ba316f134cb7c..0785dabf84683 100644 --- a/llvm/test/Transforms/Inline/externally_available.ll +++ b/llvm/test/Transforms/Inline/externally_available.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -inline -constprop -S | FileCheck %s +; RUN: opt < %s -inline -S | FileCheck %s define available_externally i32 @test_function() { ; CHECK-NOT: @test_function @@ -16,7 +16,8 @@ entry: %B = add i32 %A, 1 ret i32 %B -; CHECK: ret i32 5 +; CHECK: add i32 +; CHECK-NEXT: ret i32 } ; CHECK-NOT: @test_function diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-trunc.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-trunc.ll new file mode 100644 index 0000000000000..d18beb5dbdf40 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/AArch64/sve-trunc.ll @@ -0,0 +1,29 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mtriple aarch64-linux-gnu -mattr=+sve -instcombine -S < %s 2>%t | FileCheck %s +; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t + +; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. 
+; WARN-NOT: warning + +define void @trunc_nxv2i64_to_nxv2i32(i32* %ptr, %v) { +; CHECK-LABEL: @trunc_nxv2i64_to_nxv2i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast [[V:%.*]] to +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[TMP0]]) +; CHECK-NEXT: [[TMP3:%.*]] = trunc [[TMP1]] to +; CHECK-NEXT: call void @llvm.aarch64.sve.st1.nxv2i32( [[TMP3]], [[TMP2]], i32* [[PTR:%.*]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %1 = bitcast %v to + %2 = call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( %0) + %3 = trunc %1 to + call void @llvm.aarch64.sve.st1.nxv2i32( %3, %2, i32* %ptr) + ret void +} + +declare void @llvm.aarch64.sve.st1.nxv2i32(, , i32*) +declare @llvm.aarch64.sve.convert.from.svbool.nxv2i1() +declare @llvm.aarch64.sve.ptrue.nxv16i1(i32 %pattern) diff --git a/llvm/test/Transforms/InstCombine/abs-intrinsic.ll b/llvm/test/Transforms/InstCombine/abs-intrinsic.ll index bf8d2b937d8be..20ba876c53441 100644 --- a/llvm/test/Transforms/InstCombine/abs-intrinsic.ll +++ b/llvm/test/Transforms/InstCombine/abs-intrinsic.ll @@ -104,3 +104,25 @@ define <4 x i32> @abs_of_neg_vec(<4 x i32> %x) { %b = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a, i1 false) ret <4 x i32> %b } + +define i32 @abs_of_select_neg_true_val(i1 %b, i32 %x) { +; CHECK-LABEL: @abs_of_select_neg_true_val( +; CHECK-NEXT: [[ABS:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true) +; CHECK-NEXT: ret i32 [[ABS]] +; + %neg = sub i32 0, %x + %sel = select i1 %b, i32 %neg, i32 %x + %abs = call i32 @llvm.abs.i32(i32 %sel, i1 true) + ret i32 %abs +} + +define <4 x i32> @abs_of_select_neg_false_val(<4 x i1> %b, <4 x i32> %x) { +; CHECK-LABEL: @abs_of_select_neg_false_val( +; CHECK-NEXT: [[ABS:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[X:%.*]], i1 false) +; CHECK-NEXT: ret <4 x i32> [[ABS]] +; + %neg = sub <4 x i32> zeroinitializer, %x + 
%sel = select <4 x i1> %b, <4 x i32> %x, <4 x i32> %neg + %abs = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %sel, i1 false) + ret <4 x i32> %abs +} diff --git a/llvm/test/Transforms/InstCombine/merging-multiple-stores-into-successor.ll b/llvm/test/Transforms/InstCombine/merging-multiple-stores-into-successor.ll index 5be341fa62280..be00822834d82 100644 --- a/llvm/test/Transforms/InstCombine/merging-multiple-stores-into-successor.ll +++ b/llvm/test/Transforms/InstCombine/merging-multiple-stores-into-successor.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt %s -instcombine -instcombine-infinite-loop-threshold=2 -S | FileCheck %s +; RUN: opt %s -instcombine -instcombine-infinite-loop-threshold=3 -S | FileCheck %s @var_7 = external global i8, align 1 @var_1 = external global i32, align 4 @@ -29,11 +29,10 @@ define void @_Z4testv() { ; CHECK-NEXT: br label [[BB12]] ; CHECK: bb12: ; CHECK-NEXT: [[STOREMERGE1:%.*]] = phi i32 [ [[I11]], [[BB10]] ], [ 1, [[BB9]] ] -; CHECK-NEXT: [[STOREMERGE:%.*]] = phi i32 [ [[I11]], [[BB10]] ], [ 1, [[BB9]] ] ; CHECK-NEXT: store i32 [[STOREMERGE1]], i32* getelementptr inbounds ([0 x i32], [0 x i32]* @arr_2, i64 0, i64 0), align 4 ; CHECK-NEXT: store i16 [[I4]], i16* getelementptr inbounds ([0 x i16], [0 x i16]* @arr_4, i64 0, i64 0), align 2 ; CHECK-NEXT: store i32 [[I8]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @arr_3, i64 0, i64 0), align 16 -; CHECK-NEXT: store i32 [[STOREMERGE]], i32* getelementptr inbounds ([0 x i32], [0 x i32]* @arr_2, i64 0, i64 1), align 4 +; CHECK-NEXT: store i32 [[STOREMERGE1]], i32* getelementptr inbounds ([0 x i32], [0 x i32]* @arr_2, i64 0, i64 1), align 4 ; CHECK-NEXT: store i16 [[I4]], i16* getelementptr inbounds ([0 x i16], [0 x i16]* @arr_4, i64 0, i64 1), align 2 ; CHECK-NEXT: store i32 [[I8]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @arr_3, i64 0, i64 1), align 4 ; CHECK-NEXT: ret void diff --git 
a/llvm/test/Transforms/InstCombine/phi-aware-aggregate-reconstruction.ll b/llvm/test/Transforms/InstCombine/phi-aware-aggregate-reconstruction.ll index 78c75c346dbc0..0befa4b7a2ad0 100644 --- a/llvm/test/Transforms/InstCombine/phi-aware-aggregate-reconstruction.ll +++ b/llvm/test/Transforms/InstCombine/phi-aware-aggregate-reconstruction.ll @@ -24,9 +24,9 @@ define { i32, i32 } @test0({ i32, i32 } %agg_left, { i32, i32 } %agg_right, i1 % ; CHECK-NEXT: call void @bar() ; CHECK-NEXT: br label [[END]] ; CHECK: end: -; CHECK-NEXT: [[I8_MERGED:%.*]] = phi { i32, i32 } [ [[AGG_RIGHT:%.*]], [[RIGHT]] ], [ [[AGG_LEFT:%.*]], [[LEFT]] ] +; CHECK-NEXT: [[AGG_LEFT_PN:%.*]] = phi { i32, i32 } [ [[AGG_LEFT:%.*]], [[LEFT]] ], [ [[AGG_RIGHT:%.*]], [[RIGHT]] ] ; CHECK-NEXT: call void @baz() -; CHECK-NEXT: ret { i32, i32 } [[I8_MERGED]] +; CHECK-NEXT: ret { i32, i32 } [[AGG_LEFT_PN]] ; entry: br i1 %c, label %left, label %right @@ -58,18 +58,16 @@ define { i32, i32 } @negative_test1({ i32, i32 } %agg_left, { i32, i32 } %agg_ri ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 [[C:%.*]], label [[LEFT:%.*]], label [[RIGHT:%.*]] ; CHECK: left: -; CHECK-NEXT: [[I4:%.*]] = extractvalue { i32, i32 } [[AGG_RIGHT:%.*]], 1 -; CHECK-NEXT: [[I0:%.*]] = extractvalue { i32, i32 } [[AGG_LEFT:%.*]], 0 ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: br label [[END:%.*]] ; CHECK: right: -; CHECK-NEXT: [[I3:%.*]] = extractvalue { i32, i32 } [[AGG_RIGHT]], 0 -; CHECK-NEXT: [[I2:%.*]] = extractvalue { i32, i32 } [[AGG_LEFT]], 1 ; CHECK-NEXT: call void @bar() ; CHECK-NEXT: br label [[END]] ; CHECK: end: -; CHECK-NEXT: [[I5:%.*]] = phi i32 [ [[I0]], [[LEFT]] ], [ [[I3]], [[RIGHT]] ] -; CHECK-NEXT: [[I6:%.*]] = phi i32 [ [[I4]], [[LEFT]] ], [ [[I2]], [[RIGHT]] ] +; CHECK-NEXT: [[AGG_LEFT_PN:%.*]] = phi { i32, i32 } [ [[AGG_LEFT:%.*]], [[LEFT]] ], [ [[AGG_RIGHT:%.*]], [[RIGHT]] ] +; CHECK-NEXT: [[AGG_RIGHT_PN:%.*]] = phi { i32, i32 } [ [[AGG_RIGHT]], [[LEFT]] ], [ [[AGG_LEFT]], [[RIGHT]] ] +; CHECK-NEXT: [[I6:%.*]] = 
extractvalue { i32, i32 } [[AGG_RIGHT_PN]], 1 +; CHECK-NEXT: [[I5:%.*]] = extractvalue { i32, i32 } [[AGG_LEFT_PN]], 0 ; CHECK-NEXT: call void @baz() ; CHECK-NEXT: [[I7:%.*]] = insertvalue { i32, i32 } undef, i32 [[I5]], 0 ; CHECK-NEXT: [[I8:%.*]] = insertvalue { i32, i32 } [[I7]], i32 [[I6]], 1 @@ -154,28 +152,18 @@ define { i32, i32 } @test3({ i32, i32 } %agg_00, { i32, i32 } %agg_01, { i32, i3 ; CHECK: bb0.dispatch: ; CHECK-NEXT: br i1 [[C1:%.*]], label [[BB00:%.*]], label [[BB01:%.*]] ; CHECK: bb00: -; CHECK-NEXT: [[I0:%.*]] = extractvalue { i32, i32 } [[AGG_00:%.*]], 0 -; CHECK-NEXT: [[I1:%.*]] = extractvalue { i32, i32 } [[AGG_00]], 1 ; CHECK-NEXT: br label [[BB0_MERGE:%.*]] ; CHECK: bb01: -; CHECK-NEXT: [[I2:%.*]] = extractvalue { i32, i32 } [[AGG_01:%.*]], 0 -; CHECK-NEXT: [[I3:%.*]] = extractvalue { i32, i32 } [[AGG_01]], 1 ; CHECK-NEXT: br label [[BB0_MERGE]] ; CHECK: bb0.merge: -; CHECK-NEXT: [[I4:%.*]] = phi i32 [ [[I0]], [[BB00]] ], [ [[I2]], [[BB01]] ] -; CHECK-NEXT: [[I5:%.*]] = phi i32 [ [[I1]], [[BB00]] ], [ [[I3]], [[BB01]] ] +; CHECK-NEXT: [[AGG_00_PN:%.*]] = phi { i32, i32 } [ [[AGG_00:%.*]], [[BB00]] ], [ [[AGG_01:%.*]], [[BB01]] ] ; CHECK-NEXT: br label [[END:%.*]] ; CHECK: bb10: -; CHECK-NEXT: [[I6:%.*]] = extractvalue { i32, i32 } [[AGG_10:%.*]], 0 -; CHECK-NEXT: [[I7:%.*]] = extractvalue { i32, i32 } [[AGG_10]], 1 ; CHECK-NEXT: br label [[END]] ; CHECK: end: -; CHECK-NEXT: [[I8:%.*]] = phi i32 [ [[I4]], [[BB0_MERGE]] ], [ [[I6]], [[BB10]] ] -; CHECK-NEXT: [[I9:%.*]] = phi i32 [ [[I5]], [[BB0_MERGE]] ], [ [[I7]], [[BB10]] ] +; CHECK-NEXT: [[AGG_00_PN_PN:%.*]] = phi { i32, i32 } [ [[AGG_00_PN]], [[BB0_MERGE]] ], [ [[AGG_10:%.*]], [[BB10]] ] ; CHECK-NEXT: call void @baz() -; CHECK-NEXT: [[I10:%.*]] = insertvalue { i32, i32 } undef, i32 [[I8]], 0 -; CHECK-NEXT: [[I11:%.*]] = insertvalue { i32, i32 } [[I10]], i32 [[I9]], 1 -; CHECK-NEXT: ret { i32, i32 } [[I11]] +; CHECK-NEXT: ret { i32, i32 } [[AGG_00_PN_PN]] ; entry: br i1 %c0, label 
%bb0.dispatch, label %bb10 @@ -278,12 +266,12 @@ define { i32, i32 } @test5({ i32, i32 } %agg_left, { i32, i32 } %agg_right, i1 % ; CHECK-NEXT: call void @bar() ; CHECK-NEXT: br label [[MIDDLE]] ; CHECK: middle: -; CHECK-NEXT: [[I8_MERGED:%.*]] = phi { i32, i32 } [ [[I8_MERGED]], [[MIDDLE]] ], [ [[AGG_RIGHT:%.*]], [[RIGHT]] ], [ [[AGG_LEFT:%.*]], [[LEFT]] ] +; CHECK-NEXT: [[AGG_LEFT_PN:%.*]] = phi { i32, i32 } [ [[AGG_LEFT:%.*]], [[LEFT]] ], [ [[AGG_RIGHT:%.*]], [[RIGHT]] ], [ [[AGG_LEFT_PN]], [[MIDDLE]] ] ; CHECK-NEXT: call void @baz() ; CHECK-NEXT: [[C1:%.*]] = call i1 @geni1() ; CHECK-NEXT: br i1 [[C1]], label [[END:%.*]], label [[MIDDLE]] ; CHECK: end: -; CHECK-NEXT: ret { i32, i32 } [[I8_MERGED]] +; CHECK-NEXT: ret { i32, i32 } [[AGG_LEFT_PN]] ; entry: br i1 %c0, label %left, label %right @@ -327,7 +315,7 @@ define { i32, i32 } @test6({ i32, i32 } %agg_left, { i32, i32 } %agg_right, i1 % ; CHECK-NEXT: call void @bar() ; CHECK-NEXT: br label [[MERGE]] ; CHECK: merge: -; CHECK-NEXT: [[I8_MERGED:%.*]] = phi { i32, i32 } [ [[AGG_RIGHT:%.*]], [[RIGHT]] ], [ [[AGG_LEFT:%.*]], [[LEFT]] ] +; CHECK-NEXT: [[AGG_LEFT_PN:%.*]] = phi { i32, i32 } [ [[AGG_LEFT:%.*]], [[LEFT]] ], [ [[AGG_RIGHT:%.*]], [[RIGHT]] ] ; CHECK-NEXT: call void @baz() ; CHECK-NEXT: br i1 [[C1:%.*]], label [[END:%.*]], label [[PASSTHROUGH:%.*]] ; CHECK: passthrough: @@ -335,7 +323,7 @@ define { i32, i32 } @test6({ i32, i32 } %agg_left, { i32, i32 } %agg_right, i1 % ; CHECK-NEXT: br label [[END]] ; CHECK: end: ; CHECK-NEXT: call void @quux() -; CHECK-NEXT: ret { i32, i32 } [[I8_MERGED]] +; CHECK-NEXT: ret { i32, i32 } [[AGG_LEFT_PN]] ; entry: br i1 %c0, label %left, label %right @@ -441,9 +429,9 @@ define { i32, i32 } @test8({ i32, i32 } %agg_left, { i32, i32 } %agg_right, i1 % ; CHECK: impossible: ; CHECK-NEXT: unreachable ; CHECK: end: -; CHECK-NEXT: [[I8_MERGED:%.*]] = phi { i32, i32 } [ [[AGG_RIGHT:%.*]], [[RIGHT]] ], [ [[AGG_RIGHT]], [[RIGHT]] ], [ [[AGG_LEFT:%.*]], [[LEFT]] ], [ [[AGG_LEFT]], 
[[LEFT]] ] +; CHECK-NEXT: [[AGG_LEFT_PN:%.*]] = phi { i32, i32 } [ [[AGG_LEFT:%.*]], [[LEFT]] ], [ [[AGG_LEFT]], [[LEFT]] ], [ [[AGG_RIGHT:%.*]], [[RIGHT]] ], [ [[AGG_RIGHT]], [[RIGHT]] ] ; CHECK-NEXT: call void @baz() -; CHECK-NEXT: ret { i32, i32 } [[I8_MERGED]] +; CHECK-NEXT: ret { i32, i32 } [[AGG_LEFT_PN]] ; entry: br i1 %c, label %left, label %right @@ -484,23 +472,15 @@ define { i32, i32 } @test9({ i32, i32 } %agg_left, { i32, i32 } %agg_right, i1 % ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 [[C:%.*]], label [[LEFT:%.*]], label [[RIGHT:%.*]] ; CHECK: left: -; CHECK-NEXT: [[I0:%.*]] = extractvalue { i32, i32 } [[AGG_LEFT:%.*]], 0 -; CHECK-NEXT: [[I1:%.*]] = extractvalue { i32, i32 } [[AGG_LEFT]], 1 -; CHECK-NEXT: [[I2:%.*]] = insertvalue { i32, i32 } undef, i32 [[I0]], 0 ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: br label [[END:%.*]] ; CHECK: right: -; CHECK-NEXT: [[I3:%.*]] = extractvalue { i32, i32 } [[AGG_RIGHT:%.*]], 0 -; CHECK-NEXT: [[I4:%.*]] = extractvalue { i32, i32 } [[AGG_RIGHT]], 1 -; CHECK-NEXT: [[I5:%.*]] = insertvalue { i32, i32 } undef, i32 [[I3]], 0 ; CHECK-NEXT: call void @bar() ; CHECK-NEXT: br label [[END]] ; CHECK: end: -; CHECK-NEXT: [[I6:%.*]] = phi { i32, i32 } [ [[I2]], [[LEFT]] ], [ [[I5]], [[RIGHT]] ] -; CHECK-NEXT: [[I7:%.*]] = phi i32 [ [[I1]], [[LEFT]] ], [ [[I4]], [[RIGHT]] ] +; CHECK-NEXT: [[AGG_LEFT_PN:%.*]] = phi { i32, i32 } [ [[AGG_LEFT:%.*]], [[LEFT]] ], [ [[AGG_RIGHT:%.*]], [[RIGHT]] ] ; CHECK-NEXT: call void @baz() -; CHECK-NEXT: [[I8:%.*]] = insertvalue { i32, i32 } [[I6]], i32 [[I7]], 1 -; CHECK-NEXT: ret { i32, i32 } [[I8]] +; CHECK-NEXT: ret { i32, i32 } [[AGG_LEFT_PN]] ; entry: br i1 %c, label %left, label %right diff --git a/llvm/test/Transforms/InstCombine/phi-equal-incoming-pointers.ll b/llvm/test/Transforms/InstCombine/phi-equal-incoming-pointers.ll index db5402bd78c12..4e37dfc1357f2 100644 --- a/llvm/test/Transforms/InstCombine/phi-equal-incoming-pointers.ll +++ 
b/llvm/test/Transforms/InstCombine/phi-equal-incoming-pointers.ll @@ -15,17 +15,13 @@ define i32 @test_gep_and_bitcast(i1 %cond, i1 %cond2) { ; ALL-NEXT: [[OBJ:%.*]] = call i8* @get_ptr.i8() ; ALL-NEXT: br i1 [[COND:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] ; ALL: bb1: -; ALL-NEXT: [[PTR1:%.*]] = getelementptr inbounds i8, i8* [[OBJ]], i64 16 -; ALL-NEXT: [[PTR1_TYPED:%.*]] = bitcast i8* [[PTR1]] to i32* ; ALL-NEXT: br label [[EXIT:%.*]] ; ALL: bb2: -; ALL-NEXT: [[PTR2:%.*]] = getelementptr inbounds i8, i8* [[OBJ]], i64 16 -; ALL-NEXT: [[PTR2_TYPED:%.*]] = bitcast i8* [[PTR2]] to i32* ; ALL-NEXT: br label [[EXIT]] ; ALL: exit: -; ALL-NEXT: [[PTR_TYPED:%.*]] = phi i32* [ [[PTR1_TYPED]], [[BB1]] ], [ [[PTR2_TYPED]], [[BB2]] ] -; ALL-NEXT: [[RES_PHI_IN:%.*]] = phi i32* [ [[PTR1_TYPED]], [[BB1]] ], [ [[PTR2_TYPED]], [[BB2]] ] -; ALL-NEXT: [[RES_PHI:%.*]] = load i32, i32* [[RES_PHI_IN]], align 4 +; ALL-NEXT: [[PTR_TYPED_IN:%.*]] = getelementptr inbounds i8, i8* [[OBJ]], i64 16 +; ALL-NEXT: [[PTR_TYPED:%.*]] = bitcast i8* [[PTR_TYPED_IN]] to i32* +; ALL-NEXT: [[RES_PHI:%.*]] = load i32, i32* [[PTR_TYPED]], align 4 ; ALL-NEXT: store i32 1, i32* [[PTR_TYPED]], align 4 ; ALL-NEXT: [[RES:%.*]] = select i1 [[COND2:%.*]], i32 [[RES_PHI]], i32 1 ; ALL-NEXT: ret i32 [[RES]] @@ -60,17 +56,13 @@ define i32 @test_gep_and_bitcast_arg(i8* %obj, i1 %cond, i1 %cond2) { ; ALL-NEXT: entry: ; ALL-NEXT: br i1 [[COND:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] ; ALL: bb1: -; ALL-NEXT: [[PTR1:%.*]] = getelementptr inbounds i8, i8* [[OBJ:%.*]], i64 16 -; ALL-NEXT: [[PTR1_TYPED:%.*]] = bitcast i8* [[PTR1]] to i32* ; ALL-NEXT: br label [[EXIT:%.*]] ; ALL: bb2: -; ALL-NEXT: [[PTR2:%.*]] = getelementptr inbounds i8, i8* [[OBJ]], i64 16 -; ALL-NEXT: [[PTR2_TYPED:%.*]] = bitcast i8* [[PTR2]] to i32* ; ALL-NEXT: br label [[EXIT]] ; ALL: exit: -; ALL-NEXT: [[PTR_TYPED:%.*]] = phi i32* [ [[PTR1_TYPED]], [[BB1]] ], [ [[PTR2_TYPED]], [[BB2]] ] -; ALL-NEXT: [[RES_PHI_IN:%.*]] = phi i32* [ 
[[PTR1_TYPED]], [[BB1]] ], [ [[PTR2_TYPED]], [[BB2]] ] -; ALL-NEXT: [[RES_PHI:%.*]] = load i32, i32* [[RES_PHI_IN]], align 4 +; ALL-NEXT: [[PTR_TYPED_IN:%.*]] = getelementptr inbounds i8, i8* [[OBJ:%.*]], i64 16 +; ALL-NEXT: [[PTR_TYPED:%.*]] = bitcast i8* [[PTR_TYPED_IN]] to i32* +; ALL-NEXT: [[RES_PHI:%.*]] = load i32, i32* [[PTR_TYPED]], align 4 ; ALL-NEXT: store i32 1, i32* [[PTR_TYPED]], align 4 ; ALL-NEXT: [[RES:%.*]] = select i1 [[COND2:%.*]], i32 [[RES_PHI]], i32 1 ; ALL-NEXT: ret i32 [[RES]] @@ -116,17 +108,13 @@ define i32 @test_gep_and_bitcast_phi(i1 %cond, i1 %cond2, i1 %cond3) { ; ALL-NEXT: call void @foo.i8(i8* [[ANOTHER_PHI]]) ; ALL-NEXT: br i1 [[COND2:%.*]], label [[BB3:%.*]], label [[BB4:%.*]] ; ALL: bb3: -; ALL-NEXT: [[PTR1:%.*]] = getelementptr inbounds i8, i8* [[OBJ]], i64 16 -; ALL-NEXT: [[PTR1_TYPED:%.*]] = bitcast i8* [[PTR1]] to i32* ; ALL-NEXT: br label [[EXIT:%.*]] ; ALL: bb4: -; ALL-NEXT: [[PTR2:%.*]] = getelementptr inbounds i8, i8* [[OBJ]], i64 16 -; ALL-NEXT: [[PTR2_TYPED:%.*]] = bitcast i8* [[PTR2]] to i32* ; ALL-NEXT: br label [[EXIT]] ; ALL: exit: -; ALL-NEXT: [[PTR_TYPED:%.*]] = phi i32* [ [[PTR1_TYPED]], [[BB3]] ], [ [[PTR2_TYPED]], [[BB4]] ] -; ALL-NEXT: [[RES_PHI_IN:%.*]] = phi i32* [ [[PTR1_TYPED]], [[BB3]] ], [ [[PTR2_TYPED]], [[BB4]] ] -; ALL-NEXT: [[RES_PHI:%.*]] = load i32, i32* [[RES_PHI_IN]], align 4 +; ALL-NEXT: [[PTR_TYPED_IN:%.*]] = getelementptr inbounds i8, i8* [[OBJ]], i64 16 +; ALL-NEXT: [[PTR_TYPED:%.*]] = bitcast i8* [[PTR_TYPED_IN]] to i32* +; ALL-NEXT: [[RES_PHI:%.*]] = load i32, i32* [[PTR_TYPED]], align 4 ; ALL-NEXT: store i32 1, i32* [[PTR_TYPED]], align 4 ; ALL-NEXT: [[RES:%.*]] = select i1 [[COND3:%.*]], i32 [[RES_PHI]], i32 1 ; ALL-NEXT: ret i32 [[RES]] @@ -176,15 +164,12 @@ define i32 @test_gep_i32ptr(i1 %cond, i1 %cond2) { ; ALL-NEXT: [[OBJ:%.*]] = call i32* @get_ptr.i32() ; ALL-NEXT: br i1 [[COND:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] ; ALL: bb1: -; ALL-NEXT: [[PTR1_TYPED:%.*]] = getelementptr 
inbounds i32, i32* [[OBJ]], i64 16 ; ALL-NEXT: br label [[EXIT:%.*]] ; ALL: bb2: -; ALL-NEXT: [[PTR2_TYPED:%.*]] = getelementptr inbounds i32, i32* [[OBJ]], i64 16 ; ALL-NEXT: br label [[EXIT]] ; ALL: exit: -; ALL-NEXT: [[PTR_TYPED:%.*]] = phi i32* [ [[PTR1_TYPED]], [[BB1]] ], [ [[PTR2_TYPED]], [[BB2]] ] -; ALL-NEXT: [[RES_PHI_IN:%.*]] = phi i32* [ [[PTR1_TYPED]], [[BB1]] ], [ [[PTR2_TYPED]], [[BB2]] ] -; ALL-NEXT: [[RES_PHI:%.*]] = load i32, i32* [[RES_PHI_IN]], align 4 +; ALL-NEXT: [[PTR_TYPED:%.*]] = getelementptr inbounds i32, i32* [[OBJ]], i64 16 +; ALL-NEXT: [[RES_PHI:%.*]] = load i32, i32* [[PTR_TYPED]], align 4 ; ALL-NEXT: store i32 1, i32* [[PTR_TYPED]], align 4 ; ALL-NEXT: [[RES:%.*]] = select i1 [[COND2:%.*]], i32 [[RES_PHI]], i32 1 ; ALL-NEXT: ret i32 [[RES]] @@ -218,17 +203,13 @@ define i32 @test_gep_and_bitcast_gep_base_ptr(i1 %cond, i1 %cond2) { ; ALL-NEXT: [[OBJ0:%.*]] = call i8* @get_ptr.i8() ; ALL-NEXT: br i1 [[COND:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] ; ALL: bb1: -; ALL-NEXT: [[PTR1:%.*]] = getelementptr inbounds i8, i8* [[OBJ0]], i64 32 -; ALL-NEXT: [[PTR1_TYPED:%.*]] = bitcast i8* [[PTR1]] to i32* ; ALL-NEXT: br label [[EXIT:%.*]] ; ALL: bb2: -; ALL-NEXT: [[PTR2:%.*]] = getelementptr inbounds i8, i8* [[OBJ0]], i64 32 -; ALL-NEXT: [[PTR2_TYPED:%.*]] = bitcast i8* [[PTR2]] to i32* ; ALL-NEXT: br label [[EXIT]] ; ALL: exit: -; ALL-NEXT: [[PTR_TYPED:%.*]] = phi i32* [ [[PTR1_TYPED]], [[BB1]] ], [ [[PTR2_TYPED]], [[BB2]] ] -; ALL-NEXT: [[RES_PHI_IN:%.*]] = phi i32* [ [[PTR1_TYPED]], [[BB1]] ], [ [[PTR2_TYPED]], [[BB2]] ] -; ALL-NEXT: [[RES_PHI:%.*]] = load i32, i32* [[RES_PHI_IN]], align 4 +; ALL-NEXT: [[PTR_TYPED_IN:%.*]] = getelementptr inbounds i8, i8* [[OBJ0]], i64 32 +; ALL-NEXT: [[PTR_TYPED:%.*]] = bitcast i8* [[PTR_TYPED_IN]] to i32* +; ALL-NEXT: [[RES_PHI:%.*]] = load i32, i32* [[PTR_TYPED]], align 4 ; ALL-NEXT: store i32 1, i32* [[PTR_TYPED]], align 4 ; ALL-NEXT: [[RES:%.*]] = select i1 [[COND2:%.*]], i32 [[RES_PHI]], i32 1 ; 
ALL-NEXT: ret i32 [[RES]] @@ -260,37 +241,19 @@ exit: } define i32 @test_gep_and_bitcast_same_bb(i1 %cond, i1 %cond2) { -; INSTCOMBINE-LABEL: @test_gep_and_bitcast_same_bb( -; INSTCOMBINE-NEXT: entry: -; INSTCOMBINE-NEXT: [[OBJ:%.*]] = call i8* @get_ptr.i8() -; INSTCOMBINE-NEXT: [[PTR1:%.*]] = getelementptr inbounds i8, i8* [[OBJ]], i64 16 -; INSTCOMBINE-NEXT: [[PTR1_TYPED:%.*]] = bitcast i8* [[PTR1]] to i32* -; INSTCOMBINE-NEXT: br i1 [[COND:%.*]], label [[EXIT:%.*]], label [[BB2:%.*]] -; INSTCOMBINE: bb2: -; INSTCOMBINE-NEXT: [[PTR2:%.*]] = getelementptr inbounds i8, i8* [[OBJ]], i64 16 -; INSTCOMBINE-NEXT: [[PTR2_TYPED:%.*]] = bitcast i8* [[PTR2]] to i32* -; INSTCOMBINE-NEXT: br label [[EXIT]] -; INSTCOMBINE: exit: -; INSTCOMBINE-NEXT: [[PTR_TYPED:%.*]] = phi i32* [ [[PTR1_TYPED]], [[ENTRY:%.*]] ], [ [[PTR2_TYPED]], [[BB2]] ] -; INSTCOMBINE-NEXT: [[RES_PHI_IN:%.*]] = phi i32* [ [[PTR1_TYPED]], [[ENTRY]] ], [ [[PTR2_TYPED]], [[BB2]] ] -; INSTCOMBINE-NEXT: [[RES_PHI:%.*]] = load i32, i32* [[RES_PHI_IN]], align 4 -; INSTCOMBINE-NEXT: store i32 1, i32* [[PTR_TYPED]], align 4 -; INSTCOMBINE-NEXT: [[RES:%.*]] = select i1 [[COND2:%.*]], i32 [[RES_PHI]], i32 1 -; INSTCOMBINE-NEXT: ret i32 [[RES]] -; -; INSTCOMBINEGVN-LABEL: @test_gep_and_bitcast_same_bb( -; INSTCOMBINEGVN-NEXT: entry: -; INSTCOMBINEGVN-NEXT: [[OBJ:%.*]] = call i8* @get_ptr.i8() -; INSTCOMBINEGVN-NEXT: [[PTR1:%.*]] = getelementptr inbounds i8, i8* [[OBJ]], i64 16 -; INSTCOMBINEGVN-NEXT: [[PTR1_TYPED:%.*]] = bitcast i8* [[PTR1]] to i32* -; INSTCOMBINEGVN-NEXT: br i1 [[COND:%.*]], label [[EXIT:%.*]], label [[BB2:%.*]] -; INSTCOMBINEGVN: bb2: -; INSTCOMBINEGVN-NEXT: br label [[EXIT]] -; INSTCOMBINEGVN: exit: -; INSTCOMBINEGVN-NEXT: [[RES_PHI:%.*]] = load i32, i32* [[PTR1_TYPED]], align 4 -; INSTCOMBINEGVN-NEXT: store i32 1, i32* [[PTR1_TYPED]], align 4 -; INSTCOMBINEGVN-NEXT: [[RES:%.*]] = select i1 [[COND2:%.*]], i32 [[RES_PHI]], i32 1 -; INSTCOMBINEGVN-NEXT: ret i32 [[RES]] +; ALL-LABEL: 
@test_gep_and_bitcast_same_bb( +; ALL-NEXT: entry: +; ALL-NEXT: [[OBJ:%.*]] = call i8* @get_ptr.i8() +; ALL-NEXT: br i1 [[COND:%.*]], label [[EXIT:%.*]], label [[BB2:%.*]] +; ALL: bb2: +; ALL-NEXT: br label [[EXIT]] +; ALL: exit: +; ALL-NEXT: [[PTR_TYPED_IN:%.*]] = getelementptr inbounds i8, i8* [[OBJ]], i64 16 +; ALL-NEXT: [[PTR_TYPED:%.*]] = bitcast i8* [[PTR_TYPED_IN]] to i32* +; ALL-NEXT: [[RES_PHI:%.*]] = load i32, i32* [[PTR_TYPED]], align 4 +; ALL-NEXT: store i32 1, i32* [[PTR_TYPED]], align 4 +; ALL-NEXT: [[RES:%.*]] = select i1 [[COND2:%.*]], i32 [[RES_PHI]], i32 1 +; ALL-NEXT: ret i32 [[RES]] ; entry: %obj = call i8* @get_ptr.i8() @@ -328,8 +291,7 @@ define i32 @test_gep_and_bitcast_same_bb_and_extra_use(i1 %cond, i1 %cond2) { ; INSTCOMBINE-NEXT: br label [[EXIT]] ; INSTCOMBINE: exit: ; INSTCOMBINE-NEXT: [[PTR_TYPED:%.*]] = phi i32* [ [[PTR1_TYPED]], [[ENTRY:%.*]] ], [ [[PTR2_TYPED]], [[BB2]] ] -; INSTCOMBINE-NEXT: [[RES_PHI_IN:%.*]] = phi i32* [ [[PTR1_TYPED]], [[ENTRY]] ], [ [[PTR2_TYPED]], [[BB2]] ] -; INSTCOMBINE-NEXT: [[RES_PHI:%.*]] = load i32, i32* [[RES_PHI_IN]], align 4 +; INSTCOMBINE-NEXT: [[RES_PHI:%.*]] = load i32, i32* [[PTR_TYPED]], align 4 ; INSTCOMBINE-NEXT: store i32 1, i32* [[PTR_TYPED]], align 4 ; INSTCOMBINE-NEXT: [[RES:%.*]] = select i1 [[COND2:%.*]], i32 [[RES_PHI]], i32 1 ; INSTCOMBINE-NEXT: ret i32 [[RES]] @@ -378,15 +340,12 @@ define i8 @test_gep(i1 %cond, i1 %cond2) { ; ALL-NEXT: [[OBJ:%.*]] = call i8* @get_ptr.i8() ; ALL-NEXT: br i1 [[COND:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] ; ALL: bb1: -; ALL-NEXT: [[PTR1:%.*]] = getelementptr inbounds i8, i8* [[OBJ]], i64 16 ; ALL-NEXT: br label [[EXIT:%.*]] ; ALL: bb2: -; ALL-NEXT: [[PTR2:%.*]] = getelementptr inbounds i8, i8* [[OBJ]], i64 16 ; ALL-NEXT: br label [[EXIT]] ; ALL: exit: -; ALL-NEXT: [[PTR_TYPED:%.*]] = phi i8* [ [[PTR1]], [[BB1]] ], [ [[PTR2]], [[BB2]] ] -; ALL-NEXT: [[RES_PHI_IN:%.*]] = phi i8* [ [[PTR1]], [[BB1]] ], [ [[PTR2]], [[BB2]] ] -; ALL-NEXT: [[RES_PHI:%.*]] = 
load i8, i8* [[RES_PHI_IN]], align 1 +; ALL-NEXT: [[PTR_TYPED:%.*]] = getelementptr inbounds i8, i8* [[OBJ]], i64 16 +; ALL-NEXT: [[RES_PHI:%.*]] = load i8, i8* [[PTR_TYPED]], align 1 ; ALL-NEXT: store i8 1, i8* [[PTR_TYPED]], align 1 ; ALL-NEXT: [[RES:%.*]] = select i1 [[COND2:%.*]], i8 [[RES_PHI]], i8 1 ; ALL-NEXT: ret i8 [[RES]] diff --git a/llvm/test/Transforms/InstCombine/phi-of-extractvalues.ll b/llvm/test/Transforms/InstCombine/phi-of-extractvalues.ll new file mode 100644 index 0000000000000..020b98407984d --- /dev/null +++ b/llvm/test/Transforms/InstCombine/phi-of-extractvalues.ll @@ -0,0 +1,386 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -instcombine < %s | FileCheck %s + +declare void @usei32(i32) + +; If we have a phi of extractvalues, we can sink it, +; Here, we only need a PHI for extracted values. +define i32 @test0({ i32, i32 } %agg_left, { i32, i32 } %agg_right, i1 %c) { +; CHECK-LABEL: @test0( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[LEFT:%.*]], label [[RIGHT:%.*]] +; CHECK: left: +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: right: +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[AGG_LEFT_PN:%.*]] = phi { i32, i32 } [ [[AGG_LEFT:%.*]], [[LEFT]] ], [ [[AGG_RIGHT:%.*]], [[RIGHT]] ] +; CHECK-NEXT: [[R:%.*]] = extractvalue { i32, i32 } [[AGG_LEFT_PN]], 0 +; CHECK-NEXT: ret i32 [[R]] +; +entry: + br i1 %c, label %left, label %right + +left: + %i0 = extractvalue { i32, i32 } %agg_left, 0 + br label %end + +right: + %i1 = extractvalue { i32, i32 } %agg_right, 0 + br label %end + +end: + %r = phi i32 [ %i0, %left ], [ %i1, %right ] + ret i32 %r +} + +; But only if the extractvalues have no extra uses +define i32 @test1_extrause0({ i32, i32 } %agg_left, { i32, i32 } %agg_right, i1 %c) { +; CHECK-LABEL: @test1_extrause0( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[LEFT:%.*]], label [[RIGHT:%.*]] +; CHECK: left: +; CHECK-NEXT: [[I0:%.*]] = extractvalue { 
i32, i32 } [[AGG_LEFT:%.*]], 0 +; CHECK-NEXT: call void @usei32(i32 [[I0]]) +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: right: +; CHECK-NEXT: [[I1:%.*]] = extractvalue { i32, i32 } [[AGG_RIGHT:%.*]], 0 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[R:%.*]] = phi i32 [ [[I0]], [[LEFT]] ], [ [[I1]], [[RIGHT]] ] +; CHECK-NEXT: ret i32 [[R]] +; +entry: + br i1 %c, label %left, label %right + +left: + %i0 = extractvalue { i32, i32 } %agg_left, 0 + call void @usei32(i32 %i0) + br label %end + +right: + %i1 = extractvalue { i32, i32 } %agg_right, 0 + br label %end + +end: + %r = phi i32 [ %i0, %left ], [ %i1, %right ] + ret i32 %r +} +define i32 @test2_extrause1({ i32, i32 } %agg_left, { i32, i32 } %agg_right, i1 %c) { +; CHECK-LABEL: @test2_extrause1( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[LEFT:%.*]], label [[RIGHT:%.*]] +; CHECK: left: +; CHECK-NEXT: [[I0:%.*]] = extractvalue { i32, i32 } [[AGG_LEFT:%.*]], 0 +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: right: +; CHECK-NEXT: [[I1:%.*]] = extractvalue { i32, i32 } [[AGG_RIGHT:%.*]], 0 +; CHECK-NEXT: call void @usei32(i32 [[I1]]) +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[R:%.*]] = phi i32 [ [[I0]], [[LEFT]] ], [ [[I1]], [[RIGHT]] ] +; CHECK-NEXT: ret i32 [[R]] +; +entry: + br i1 %c, label %left, label %right + +left: + %i0 = extractvalue { i32, i32 } %agg_left, 0 + br label %end + +right: + %i1 = extractvalue { i32, i32 } %agg_right, 0 + call void @usei32(i32 %i1) + br label %end + +end: + %r = phi i32 [ %i0, %left ], [ %i1, %right ] + ret i32 %r +} +define i32 @test3_extrause2({ i32, i32 } %agg_left, { i32, i32 } %agg_right, i1 %c) { +; CHECK-LABEL: @test3_extrause2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[LEFT:%.*]], label [[RIGHT:%.*]] +; CHECK: left: +; CHECK-NEXT: [[I0:%.*]] = extractvalue { i32, i32 } [[AGG_LEFT:%.*]], 0 +; CHECK-NEXT: call void @usei32(i32 [[I0]]) +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: right: +; CHECK-NEXT: 
[[I1:%.*]] = extractvalue { i32, i32 } [[AGG_RIGHT:%.*]], 0 +; CHECK-NEXT: call void @usei32(i32 [[I1]]) +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[R:%.*]] = phi i32 [ [[I0]], [[LEFT]] ], [ [[I1]], [[RIGHT]] ] +; CHECK-NEXT: ret i32 [[R]] +; +entry: + br i1 %c, label %left, label %right + +left: + %i0 = extractvalue { i32, i32 } %agg_left, 0 + call void @usei32(i32 %i0) + br label %end + +right: + %i1 = extractvalue { i32, i32 } %agg_right, 0 + call void @usei32(i32 %i1) + br label %end + +end: + %r = phi i32 [ %i0, %left ], [ %i1, %right ] + ret i32 %r +} + +; But the indicies must match +define i32 @test4({ i32, i32 } %agg_left, { i32, i32 } %agg_right, i1 %c) { +; CHECK-LABEL: @test4( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[LEFT:%.*]], label [[RIGHT:%.*]] +; CHECK: left: +; CHECK-NEXT: [[I0:%.*]] = extractvalue { i32, i32 } [[AGG_LEFT:%.*]], 0 +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: right: +; CHECK-NEXT: [[I1:%.*]] = extractvalue { i32, i32 } [[AGG_RIGHT:%.*]], 1 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[R:%.*]] = phi i32 [ [[I0]], [[LEFT]] ], [ [[I1]], [[RIGHT]] ] +; CHECK-NEXT: ret i32 [[R]] +; +entry: + br i1 %c, label %left, label %right + +left: + %i0 = extractvalue { i32, i32 } %agg_left, 0 + br label %end + +right: + %i1 = extractvalue { i32, i32 } %agg_right, 1 + br label %end + +end: + %r = phi i32 [ %i0, %left ], [ %i1, %right ] + ret i32 %r +} + +; More complex aggregates are fine, too, as long as indicies match. 
+define i32 @test5({{ i32, i32 }, { i32, i32 }} %agg_left, {{ i32, i32 }, { i32, i32 }} %agg_right, i1 %c) { +; CHECK-LABEL: @test5( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[LEFT:%.*]], label [[RIGHT:%.*]] +; CHECK: left: +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: right: +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[AGG_LEFT_PN:%.*]] = phi { { i32, i32 }, { i32, i32 } } [ [[AGG_LEFT:%.*]], [[LEFT]] ], [ [[AGG_RIGHT:%.*]], [[RIGHT]] ] +; CHECK-NEXT: [[R:%.*]] = extractvalue { { i32, i32 }, { i32, i32 } } [[AGG_LEFT_PN]], 0, 0 +; CHECK-NEXT: ret i32 [[R]] +; +entry: + br i1 %c, label %left, label %right + +left: + %i0 = extractvalue {{ i32, i32 }, { i32, i32 }} %agg_left, 0, 0 + br label %end + +right: + %i1 = extractvalue {{ i32, i32 }, { i32, i32 }} %agg_right, 0, 0 + br label %end + +end: + %r = phi i32 [ %i0, %left ], [ %i1, %right ] + ret i32 %r +} + +; The indicies must fully match, on all levels. +define i32 @test6({{ i32, i32 }, { i32, i32 }} %agg_left, {{ i32, i32 }, { i32, i32 }} %agg_right, i1 %c) { +; CHECK-LABEL: @test6( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[LEFT:%.*]], label [[RIGHT:%.*]] +; CHECK: left: +; CHECK-NEXT: [[I0:%.*]] = extractvalue { { i32, i32 }, { i32, i32 } } [[AGG_LEFT:%.*]], 0, 0 +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: right: +; CHECK-NEXT: [[I1:%.*]] = extractvalue { { i32, i32 }, { i32, i32 } } [[AGG_RIGHT:%.*]], 0, 1 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[R:%.*]] = phi i32 [ [[I0]], [[LEFT]] ], [ [[I1]], [[RIGHT]] ] +; CHECK-NEXT: ret i32 [[R]] +; +entry: + br i1 %c, label %left, label %right + +left: + %i0 = extractvalue {{ i32, i32 }, { i32, i32 }} %agg_left, 0, 0 + br label %end + +right: + %i1 = extractvalue {{ i32, i32 }, { i32, i32 }} %agg_right, 0, 1 + br label %end + +end: + %r = phi i32 [ %i0, %left ], [ %i1, %right ] + ret i32 %r +} +define i32 @test7({{ i32, i32 }, { i32, i32 }} %agg_left, {{ i32, i32 }, { i32, i32 }} 
%agg_right, i1 %c) { +; CHECK-LABEL: @test7( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[LEFT:%.*]], label [[RIGHT:%.*]] +; CHECK: left: +; CHECK-NEXT: [[I0:%.*]] = extractvalue { { i32, i32 }, { i32, i32 } } [[AGG_LEFT:%.*]], 0, 0 +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: right: +; CHECK-NEXT: [[I1:%.*]] = extractvalue { { i32, i32 }, { i32, i32 } } [[AGG_RIGHT:%.*]], 1, 0 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[R:%.*]] = phi i32 [ [[I0]], [[LEFT]] ], [ [[I1]], [[RIGHT]] ] +; CHECK-NEXT: ret i32 [[R]] +; +entry: + br i1 %c, label %left, label %right + +left: + %i0 = extractvalue {{ i32, i32 }, { i32, i32 }} %agg_left, 0, 0 + br label %end + +right: + %i1 = extractvalue {{ i32, i32 }, { i32, i32 }} %agg_right, 1, 0 + br label %end + +end: + %r = phi i32 [ %i0, %left ], [ %i1, %right ] + ret i32 %r +} +define i32 @test8({{ i32, i32 }, { i32, i32 }} %agg_left, {{ i32, i32 }, { i32, i32 }} %agg_right, i1 %c) { +; CHECK-LABEL: @test8( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[LEFT:%.*]], label [[RIGHT:%.*]] +; CHECK: left: +; CHECK-NEXT: [[I0:%.*]] = extractvalue { { i32, i32 }, { i32, i32 } } [[AGG_LEFT:%.*]], 0, 0 +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: right: +; CHECK-NEXT: [[I1:%.*]] = extractvalue { { i32, i32 }, { i32, i32 } } [[AGG_RIGHT:%.*]], 1, 1 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[R:%.*]] = phi i32 [ [[I0]], [[LEFT]] ], [ [[I1]], [[RIGHT]] ] +; CHECK-NEXT: ret i32 [[R]] +; +entry: + br i1 %c, label %left, label %right + +left: + %i0 = extractvalue {{ i32, i32 }, { i32, i32 }} %agg_left, 0, 0 + br label %end + +right: + %i1 = extractvalue {{ i32, i32 }, { i32, i32 }} %agg_right, 1, 1 + br label %end + +end: + %r = phi i32 [ %i0, %left ], [ %i1, %right ] + ret i32 %r +} + +; Also, unlike PHI-of-insertvalues, here the base aggregates of extractvalue +; can have different types, and just checking the indicies is not enough. 
+define i32 @test9({ i32, i32 } %agg_left, { i32, { i32, i32 } } %agg_right, i1 %c) { +; CHECK-LABEL: @test9( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[LEFT:%.*]], label [[RIGHT:%.*]] +; CHECK: left: +; CHECK-NEXT: [[I0:%.*]] = extractvalue { i32, i32 } [[AGG_LEFT:%.*]], 0 +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: right: +; CHECK-NEXT: [[I1:%.*]] = extractvalue { i32, { i32, i32 } } [[AGG_RIGHT:%.*]], 0 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[R:%.*]] = phi i32 [ [[I0]], [[LEFT]] ], [ [[I1]], [[RIGHT]] ] +; CHECK-NEXT: ret i32 [[R]] +; +entry: + br i1 %c, label %left, label %right + +left: + %i0 = extractvalue { i32, i32 } %agg_left, 0 + br label %end + +right: + %i1 = extractvalue { i32, { i32, i32 } } %agg_right, 0 + br label %end + +end: + %r = phi i32 [ %i0, %left ], [ %i1, %right ] + ret i32 %r +} + +; It is fine if there are multiple uses of the PHI's value, as long as they are all in the PHI node itself +define i32 @test10({ i32, i32 } %agg_left, { i32, i32 } %agg_right, i1 %c0, i1 %c1) { +; CHECK-LABEL: @test10( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C0:%.*]], label [[END:%.*]], label [[DISPATCH:%.*]] +; CHECK: dispatch: +; CHECK-NEXT: br i1 [[C1:%.*]], label [[LEFT:%.*]], label [[RIGHT:%.*]] +; CHECK: left: +; CHECK-NEXT: br label [[END]] +; CHECK: right: +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[AGG_LEFT_PN:%.*]] = phi { i32, i32 } [ [[AGG_LEFT:%.*]], [[ENTRY:%.*]] ], [ [[AGG_LEFT]], [[LEFT]] ], [ [[AGG_RIGHT:%.*]], [[RIGHT]] ] +; CHECK-NEXT: [[R:%.*]] = extractvalue { i32, i32 } [[AGG_LEFT_PN]], 0 +; CHECK-NEXT: ret i32 [[R]] +; +entry: + %i0 = extractvalue { i32, i32 } %agg_left, 0 + %i1 = extractvalue { i32, i32 } %agg_right, 0 + br i1 %c0, label %end, label %dispatch + +dispatch: + br i1 %c1, label %left, label %right + +left: + br label %end + +right: + br label %end + +end: + %r = phi i32 [ %i0, %entry ], [ %i0, %left ], [ %i1, %right ] + ret i32 %r +} +; Which isn't the case 
here, there is a legitimate external use. +define i32 @test11({ i32, i32 } %agg_left, { i32, i32 } %agg_right, i1 %c0, i1 %c1) { +; CHECK-LABEL: @test11( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[I0:%.*]] = extractvalue { i32, i32 } [[AGG_LEFT:%.*]], 0 +; CHECK-NEXT: [[I1:%.*]] = extractvalue { i32, i32 } [[AGG_RIGHT:%.*]], 0 +; CHECK-NEXT: call void @usei32(i32 [[I0]]) +; CHECK-NEXT: br i1 [[C0:%.*]], label [[END:%.*]], label [[DISPATCH:%.*]] +; CHECK: dispatch: +; CHECK-NEXT: br i1 [[C1:%.*]], label [[LEFT:%.*]], label [[RIGHT:%.*]] +; CHECK: left: +; CHECK-NEXT: br label [[END]] +; CHECK: right: +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[R:%.*]] = phi i32 [ [[I0]], [[ENTRY:%.*]] ], [ [[I0]], [[LEFT]] ], [ [[I1]], [[RIGHT]] ] +; CHECK-NEXT: ret i32 [[R]] +; +entry: + %i0 = extractvalue { i32, i32 } %agg_left, 0 + %i1 = extractvalue { i32, i32 } %agg_right, 0 + call void @usei32(i32 %i0) + br i1 %c0, label %end, label %dispatch + +dispatch: + br i1 %c1, label %left, label %right + +left: + br label %end + +right: + br label %end + +end: + %r = phi i32 [ %i0, %entry ], [ %i0, %left ], [ %i1, %right ] + ret i32 %r +} diff --git a/llvm/test/Transforms/InstCombine/phi-of-insertvalues.ll b/llvm/test/Transforms/InstCombine/phi-of-insertvalues.ll index dc9c45ca12945..d6961b8b7ba1a 100644 --- a/llvm/test/Transforms/InstCombine/phi-of-insertvalues.ll +++ b/llvm/test/Transforms/InstCombine/phi-of-insertvalues.ll @@ -10,13 +10,12 @@ define { i32, i32 } @test0({ i32, i32 } %agg, i32 %val_left, i32 %val_right, i1 ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 [[C:%.*]], label [[LEFT:%.*]], label [[RIGHT:%.*]] ; CHECK: left: -; CHECK-NEXT: [[I0:%.*]] = insertvalue { i32, i32 } [[AGG:%.*]], i32 [[VAL_LEFT:%.*]], 0 ; CHECK-NEXT: br label [[END:%.*]] ; CHECK: right: -; CHECK-NEXT: [[I1:%.*]] = insertvalue { i32, i32 } [[AGG]], i32 [[VAL_RIGHT:%.*]], 0 ; CHECK-NEXT: br label [[END]] ; CHECK: end: -; CHECK-NEXT: [[R:%.*]] = phi { i32, i32 } [ [[I0]], [[LEFT]] ], [ 
[[I1]], [[RIGHT]] ] +; CHECK-NEXT: [[VAL_LEFT_PN:%.*]] = phi i32 [ [[VAL_LEFT:%.*]], [[LEFT]] ], [ [[VAL_RIGHT:%.*]], [[RIGHT]] ] +; CHECK-NEXT: [[R:%.*]] = insertvalue { i32, i32 } [[AGG:%.*]], i32 [[VAL_LEFT_PN]], 0 ; CHECK-NEXT: ret { i32, i32 } [[R]] ; entry: @@ -138,13 +137,12 @@ define { i32, i32 } @test4({ i32, i32 } %agg_left, { i32, i32 } %agg_right, i32 ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 [[C:%.*]], label [[LEFT:%.*]], label [[RIGHT:%.*]] ; CHECK: left: -; CHECK-NEXT: [[I0:%.*]] = insertvalue { i32, i32 } [[AGG_LEFT:%.*]], i32 [[VAL:%.*]], 0 ; CHECK-NEXT: br label [[END:%.*]] ; CHECK: right: -; CHECK-NEXT: [[I1:%.*]] = insertvalue { i32, i32 } [[AGG_RIGHT:%.*]], i32 [[VAL]], 0 ; CHECK-NEXT: br label [[END]] ; CHECK: end: -; CHECK-NEXT: [[R:%.*]] = phi { i32, i32 } [ [[I0]], [[LEFT]] ], [ [[I1]], [[RIGHT]] ] +; CHECK-NEXT: [[AGG_LEFT_PN:%.*]] = phi { i32, i32 } [ [[AGG_LEFT:%.*]], [[LEFT]] ], [ [[AGG_RIGHT:%.*]], [[RIGHT]] ] +; CHECK-NEXT: [[R:%.*]] = insertvalue { i32, i32 } [[AGG_LEFT_PN]], i32 [[VAL:%.*]], 0 ; CHECK-NEXT: ret { i32, i32 } [[R]] ; entry: @@ -169,13 +167,13 @@ define { i32, i32 } @test5({ i32, i32 } %agg_left, { i32, i32 } %agg_right, i32 ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 [[C:%.*]], label [[LEFT:%.*]], label [[RIGHT:%.*]] ; CHECK: left: -; CHECK-NEXT: [[I0:%.*]] = insertvalue { i32, i32 } [[AGG_LEFT:%.*]], i32 [[VAL_LEFT:%.*]], 0 ; CHECK-NEXT: br label [[END:%.*]] ; CHECK: right: -; CHECK-NEXT: [[I1:%.*]] = insertvalue { i32, i32 } [[AGG_RIGHT:%.*]], i32 [[VAL_RIGHT:%.*]], 0 ; CHECK-NEXT: br label [[END]] ; CHECK: end: -; CHECK-NEXT: [[R:%.*]] = phi { i32, i32 } [ [[I0]], [[LEFT]] ], [ [[I1]], [[RIGHT]] ] +; CHECK-NEXT: [[AGG_LEFT_PN:%.*]] = phi { i32, i32 } [ [[AGG_LEFT:%.*]], [[LEFT]] ], [ [[AGG_RIGHT:%.*]], [[RIGHT]] ] +; CHECK-NEXT: [[VAL_LEFT_PN:%.*]] = phi i32 [ [[VAL_LEFT:%.*]], [[LEFT]] ], [ [[VAL_RIGHT:%.*]], [[RIGHT]] ] +; CHECK-NEXT: [[R:%.*]] = insertvalue { i32, i32 } [[AGG_LEFT_PN]], i32 [[VAL_LEFT_PN]], 0 ; 
CHECK-NEXT: ret { i32, i32 } [[R]] ; entry: @@ -224,3 +222,136 @@ end: %r = phi { i32, i32 } [ %i0, %left ], [ %i1, %right ] ret { i32, i32 } %r } + +; More complex aggregates are fine, too, as long as indicies match. +define {{ i32, i32 }, { i32, i32 }} @test7({{ i32, i32 }, { i32, i32 }} %agg, i32 %val_left, i32 %val_right, i1 %c) { +; CHECK-LABEL: @test7( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[LEFT:%.*]], label [[RIGHT:%.*]] +; CHECK: left: +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: right: +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[VAL_LEFT_PN:%.*]] = phi i32 [ [[VAL_LEFT:%.*]], [[LEFT]] ], [ [[VAL_RIGHT:%.*]], [[RIGHT]] ] +; CHECK-NEXT: [[R:%.*]] = insertvalue { { i32, i32 }, { i32, i32 } } [[AGG:%.*]], i32 [[VAL_LEFT_PN]], 0, 0 +; CHECK-NEXT: ret { { i32, i32 }, { i32, i32 } } [[R]] +; +entry: + br i1 %c, label %left, label %right + +left: + %i0 = insertvalue {{ i32, i32 }, { i32, i32 }} %agg, i32 %val_left, 0, 0 + br label %end + +right: + %i1 = insertvalue {{ i32, i32 }, { i32, i32 }} %agg, i32 %val_right, 0, 0 + br label %end + +end: + %r = phi {{ i32, i32 }, { i32, i32 }} [ %i0, %left ], [ %i1, %right ] + ret {{ i32, i32 }, { i32, i32 }} %r +} + +; The indicies must fully match, on all levels. 
+define {{ i32, i32 }, { i32, i32 }} @test8({{ i32, i32 }, { i32, i32 }} %agg, i32 %val_left, i32 %val_right, i1 %c) { +; CHECK-LABEL: @test8( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[LEFT:%.*]], label [[RIGHT:%.*]] +; CHECK: left: +; CHECK-NEXT: [[I0:%.*]] = insertvalue { { i32, i32 }, { i32, i32 } } [[AGG:%.*]], i32 [[VAL_LEFT:%.*]], 0, 0 +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: right: +; CHECK-NEXT: [[I1:%.*]] = insertvalue { { i32, i32 }, { i32, i32 } } [[AGG]], i32 [[VAL_RIGHT:%.*]], 0, 1 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[R:%.*]] = phi { { i32, i32 }, { i32, i32 } } [ [[I0]], [[LEFT]] ], [ [[I1]], [[RIGHT]] ] +; CHECK-NEXT: ret { { i32, i32 }, { i32, i32 } } [[R]] +; +entry: + br i1 %c, label %left, label %right + +left: + %i0 = insertvalue {{ i32, i32 }, { i32, i32 }} %agg, i32 %val_left, 0, 0 + br label %end + +right: + %i1 = insertvalue {{ i32, i32 }, { i32, i32 }} %agg, i32 %val_right, 0, 1 + br label %end + +end: + %r = phi {{ i32, i32 }, { i32, i32 }} [ %i0, %left ], [ %i1, %right ] + ret {{ i32, i32 }, { i32, i32 }} %r +} + +; It is fine if there are multiple uses of the PHI's value, as long as they are all in the PHI node itself +define { i32, i32 } @test9({ i32, i32 } %agg, i32 %val_left, i32 %val_right, i1 %c0, i1 %c1) { +; CHECK-LABEL: @test9( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C0:%.*]], label [[END:%.*]], label [[DISPATCH:%.*]] +; CHECK: dispatch: +; CHECK-NEXT: br i1 [[C1:%.*]], label [[LEFT:%.*]], label [[RIGHT:%.*]] +; CHECK: left: +; CHECK-NEXT: br label [[END]] +; CHECK: right: +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[VAL_LEFT_PN:%.*]] = phi i32 [ [[VAL_LEFT:%.*]], [[ENTRY:%.*]] ], [ [[VAL_LEFT]], [[LEFT]] ], [ [[VAL_RIGHT:%.*]], [[RIGHT]] ] +; CHECK-NEXT: [[R:%.*]] = insertvalue { i32, i32 } [[AGG:%.*]], i32 [[VAL_LEFT_PN]], 0 +; CHECK-NEXT: ret { i32, i32 } [[R]] +; +entry: + %i0 = insertvalue { i32, i32 } %agg, i32 %val_left, 0 + %i1 = insertvalue { 
i32, i32 } %agg, i32 %val_right, 0 + br i1 %c0, label %end, label %dispatch + +dispatch: + br i1 %c1, label %left, label %right + +left: + br label %end + +right: + br label %end + +end: + %r = phi { i32, i32 } [ %i0, %entry ], [ %i0, %left ], [ %i1, %right ] + ret { i32, i32 } %r +} +; Which isn't the case here, there is a legitimate external use. +define { i32, i32 } @test10({ i32, i32 } %agg, i32 %val_left, i32 %val_right, i1 %c0, i1 %c1) { +; CHECK-LABEL: @test10( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[I0:%.*]] = insertvalue { i32, i32 } [[AGG:%.*]], i32 [[VAL_LEFT:%.*]], 0 +; CHECK-NEXT: [[I1:%.*]] = insertvalue { i32, i32 } [[AGG]], i32 [[VAL_RIGHT:%.*]], 0 +; CHECK-NEXT: call void @usei32i32agg({ i32, i32 } [[I0]]) +; CHECK-NEXT: br i1 [[C0:%.*]], label [[END:%.*]], label [[DISPATCH:%.*]] +; CHECK: dispatch: +; CHECK-NEXT: br i1 [[C1:%.*]], label [[LEFT:%.*]], label [[RIGHT:%.*]] +; CHECK: left: +; CHECK-NEXT: br label [[END]] +; CHECK: right: +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[R:%.*]] = phi { i32, i32 } [ [[I0]], [[ENTRY:%.*]] ], [ [[I0]], [[LEFT]] ], [ [[I1]], [[RIGHT]] ] +; CHECK-NEXT: ret { i32, i32 } [[R]] +; +entry: + %i0 = insertvalue { i32, i32 } %agg, i32 %val_left, 0 + %i1 = insertvalue { i32, i32 } %agg, i32 %val_right, 0 + call void @usei32i32agg({ i32, i32 } %i0) + br i1 %c0, label %end, label %dispatch + +dispatch: + br i1 %c1, label %left, label %right + +left: + br label %end + +right: + br label %end + +end: + %r = phi { i32, i32 } [ %i0, %entry ], [ %i0, %left ], [ %i1, %right ] + ret { i32, i32 } %r +} diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll index 73e1e4af7aac6..aa5472ef43b8c 100644 --- a/llvm/test/Transforms/InstCombine/select.ll +++ b/llvm/test/Transforms/InstCombine/select.ll @@ -1926,10 +1926,7 @@ define i32 @select_dominance_chain(i1 %cond, i32 %x, i32 %y) { ; CHECK-NEXT: br label [[MERGE_3]] ; CHECK: merge.3: ; CHECK-NEXT: [[S_3:%.*]] = phi i32 
[ [[Y:%.*]], [[IF_FALSE_3]] ], [ [[X:%.*]], [[IF_TRUE_3]] ] -; CHECK-NEXT: [[S_2:%.*]] = phi i32 [ [[Y]], [[IF_FALSE_3]] ], [ [[X]], [[IF_TRUE_3]] ] -; CHECK-NEXT: [[S_1:%.*]] = phi i32 [ [[Y]], [[IF_FALSE_3]] ], [ [[X]], [[IF_TRUE_3]] ] -; CHECK-NEXT: [[SUM_1:%.*]] = add i32 [[S_1]], [[S_2]] -; CHECK-NEXT: [[SUM_2:%.*]] = add i32 [[SUM_1]], [[S_3]] +; CHECK-NEXT: [[SUM_2:%.*]] = mul i32 [[S_3]], 3 ; CHECK-NEXT: ret i32 [[SUM_2]] ; entry: diff --git a/llvm/test/Transforms/InstCombine/strlen-1.ll b/llvm/test/Transforms/InstCombine/strlen-1.ll index e9a6e714acd0b..61bc2e57b08d2 100644 --- a/llvm/test/Transforms/InstCombine/strlen-1.ll +++ b/llvm/test/Transforms/InstCombine/strlen-1.ll @@ -223,9 +223,7 @@ define i32 @test2(i8* %str) #0 { define i1 @strlen0_after_write_to_first_byte_global() { ; CHECK-LABEL: @strlen0_after_write_to_first_byte_global( ; CHECK-NEXT: store i8 49, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @a, i32 0, i32 0), align 16 -; CHECK-NEXT: [[LEN:%.*]] = tail call i32 @strlen(i8* nonnull dereferenceable(1) getelementptr inbounds ([32 x i8], [32 x i8]* @a, i32 0, i32 0)) -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[LEN]], 0 -; CHECK-NEXT: ret i1 [[CMP]] +; CHECK-NEXT: ret i1 false ; store i8 49, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @a, i64 0, i64 0), align 16 %len = tail call i32 @strlen(i8* nonnull dereferenceable(1) getelementptr inbounds ([32 x i8], [32 x i8]* @a, i64 0, i64 0)) @@ -236,8 +234,8 @@ define i1 @strlen0_after_write_to_first_byte_global() { define i1 @strlen0_after_write_to_second_byte_global() { ; CHECK-LABEL: @strlen0_after_write_to_second_byte_global( ; CHECK-NEXT: store i8 49, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @a, i32 0, i32 1), align 16 -; CHECK-NEXT: [[LEN:%.*]] = tail call i32 @strlen(i8* nonnull dereferenceable(1) getelementptr inbounds ([32 x i8], [32 x i8]* @a, i32 0, i32 0)) -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[LEN]], 0 +; CHECK-NEXT: [[STRLENFIRST:%.*]] = load i8, i8* getelementptr 
inbounds ([32 x i8], [32 x i8]* @a, i32 0, i32 0), align 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[STRLENFIRST]], 0 ; CHECK-NEXT: ret i1 [[CMP]] ; store i8 49, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @a, i64 0, i64 1), align 16 diff --git a/llvm/test/Transforms/InstCombine/vec_demanded_elts.ll b/llvm/test/Transforms/InstCombine/vec_demanded_elts.ll index 88afdadc122fe..3f308485342ec 100644 --- a/llvm/test/Transforms/InstCombine/vec_demanded_elts.ll +++ b/llvm/test/Transforms/InstCombine/vec_demanded_elts.ll @@ -21,7 +21,7 @@ define i32 @test2(float %f) { define void @get_image() nounwind { ; CHECK-LABEL: @get_image( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @fgetc(i8* null) #0 +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @fgetc(i8* null) [[ATTR0:#.*]] ; CHECK-NEXT: br i1 false, label [[BB2:%.*]], label [[BB3:%.*]] ; CHECK: bb2: ; CHECK-NEXT: br label [[BB3]] @@ -746,3 +746,105 @@ define <4 x i8> @select_cond_(<4 x i8> %x, <4 x i8> %min, <4 x i1> %cmp, i1 %poi %r = select <4 x i1> %ins, <4 x i8> %vecins, <4 x i8> %x ret <4 x i8> %r } + +define <4 x float> @ins_of_ext(<4 x float> %x, float %y) { +; CHECK-LABEL: @ins_of_ext( +; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x float> [[X:%.*]], float [[Y:%.*]], i32 1 +; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[Y]], i32 2 +; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[Y]], i32 3 +; CHECK-NEXT: ret <4 x float> [[I3]] +; + %e0 = extractelement <4 x float> %x, i32 0 + %i0 = insertelement <4 x float> undef, float %e0, i32 0 + %i1 = insertelement <4 x float> %i0, float %y, i32 1 + %i2 = insertelement <4 x float> %i1, float %y, i32 2 + %i3 = insertelement <4 x float> %i2, float %y, i32 3 + ret <4 x float> %i3 +} + +define <4 x float> @ins_of_ext_twice(<4 x float> %x, float %y) { +; CHECK-LABEL: @ins_of_ext_twice( +; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x float> [[X:%.*]], float [[Y:%.*]], i32 2 +; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x float> 
[[I2]], float [[Y]], i32 3 +; CHECK-NEXT: ret <4 x float> [[I3]] +; + %e0 = extractelement <4 x float> %x, i32 0 + %i0 = insertelement <4 x float> undef, float %e0, i32 0 + %e1 = extractelement <4 x float> %x, i32 1 + %i1 = insertelement <4 x float> %i0, float %e1, i32 1 + %i2 = insertelement <4 x float> %i1, float %y, i32 2 + %i3 = insertelement <4 x float> %i2, float %y, i32 3 + ret <4 x float> %i3 +} + +; Negative test - element 3 of the result must be undef to be poison safe. +; TODO: Could convert insert/extract to identity shuffle with undef mask elements. + +define <4 x float> @ins_of_ext_wrong_demand(<4 x float> %x, float %y) { +; CHECK-LABEL: @ins_of_ext_wrong_demand( +; CHECK-NEXT: [[E0:%.*]] = extractelement <4 x float> [[X:%.*]], i32 0 +; CHECK-NEXT: [[I0:%.*]] = insertelement <4 x float> undef, float [[E0]], i32 0 +; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x float> [[I0]], float [[Y:%.*]], i32 1 +; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[Y]], i32 2 +; CHECK-NEXT: ret <4 x float> [[I2]] +; + %e0 = extractelement <4 x float> %x, i32 0 + %i0 = insertelement <4 x float> undef, float %e0, i32 0 + %i1 = insertelement <4 x float> %i0, float %y, i32 1 + %i2 = insertelement <4 x float> %i1, float %y, i32 2 + ret <4 x float> %i2 +} + +; Negative test - can't replace i0 with x. +; TODO: Could convert insert/extract to identity shuffle with undef mask elements. 
+ +define <4 x float> @ins_of_ext_wrong_type(<5 x float> %x, float %y) { +; CHECK-LABEL: @ins_of_ext_wrong_type( +; CHECK-NEXT: [[E0:%.*]] = extractelement <5 x float> [[X:%.*]], i32 0 +; CHECK-NEXT: [[I0:%.*]] = insertelement <4 x float> undef, float [[E0]], i32 0 +; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x float> [[I0]], float [[Y:%.*]], i32 1 +; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[Y]], i32 2 +; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[Y]], i32 3 +; CHECK-NEXT: ret <4 x float> [[I3]] +; + %e0 = extractelement <5 x float> %x, i32 0 + %i0 = insertelement <4 x float> undef, float %e0, i32 0 + %i1 = insertelement <4 x float> %i0, float %y, i32 1 + %i2 = insertelement <4 x float> %i1, float %y, i32 2 + %i3 = insertelement <4 x float> %i2, float %y, i32 3 + ret <4 x float> %i3 +} + +; This should reduce, but the shuffle mask must remain as-is (no extra undef). + +define <4 x i4> @ins_of_ext_undef_elts_propagation(<4 x i4> %v, <4 x i4> %v2, i4 %x) { +; CHECK-LABEL: @ins_of_ext_undef_elts_propagation( +; CHECK-NEXT: [[T2:%.*]] = insertelement <4 x i4> [[V:%.*]], i4 [[X:%.*]], i32 2 +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i4> [[T2]], <4 x i4> [[V2:%.*]], <4 x i32> +; CHECK-NEXT: ret <4 x i4> [[R]] +; + %v0 = extractelement <4 x i4> %v, i32 0 + %t0 = insertelement <4 x i4> undef, i4 %v0, i32 0 + %t2 = insertelement <4 x i4> %t0, i4 %x, i32 2 + %r = shufflevector <4 x i4> %t2, <4 x i4> %v2, <4 x i32> + ret <4 x i4> %r +} + +; Similar to above, but more ops/uses to verify things work in more complicated cases. 
+ +define <8 x i4> @ins_of_ext_undef_elts_propagation2(<8 x i4> %v, <8 x i4> %v2, i4 %x) { +; CHECK-LABEL: @ins_of_ext_undef_elts_propagation2( +; CHECK-NEXT: [[I19:%.*]] = insertelement <8 x i4> [[V:%.*]], i4 [[X:%.*]], i32 2 +; CHECK-NEXT: [[I20:%.*]] = shufflevector <8 x i4> [[I19]], <8 x i4> [[V2:%.*]], <8 x i32> +; CHECK-NEXT: [[I21:%.*]] = shufflevector <8 x i4> [[I20]], <8 x i4> [[V]], <8 x i32> +; CHECK-NEXT: ret <8 x i4> [[I21]] +; + %i15 = extractelement <8 x i4> %v, i32 0 + %i16 = insertelement <8 x i4> undef, i4 %i15, i32 0 + %i17 = extractelement <8 x i4> %v, i32 1 + %i18 = insertelement <8 x i4> %i16, i4 %i17, i32 1 + %i19 = insertelement <8 x i4> %i18, i4 %x, i32 2 + %i20 = shufflevector <8 x i4> %i19, <8 x i4> %v2, <8 x i32> + %i21 = shufflevector <8 x i4> %i20, <8 x i4> %v, <8 x i32> + ret <8 x i4> %i21 +} diff --git a/llvm/test/Other/2002-03-11-ConstPropCrash.ll b/llvm/test/Transforms/InstSimplify/ConstProp/2002-03-11-ConstPropCrash.ll similarity index 96% rename from llvm/test/Other/2002-03-11-ConstPropCrash.ll rename to llvm/test/Transforms/InstSimplify/ConstProp/2002-03-11-ConstPropCrash.ll index a6d4f5b3dbcc3..fd74fff636ae0 100644 --- a/llvm/test/Other/2002-03-11-ConstPropCrash.ll +++ b/llvm/test/Transforms/InstSimplify/ConstProp/2002-03-11-ConstPropCrash.ll @@ -5,7 +5,7 @@ ; ; Fixed by adding new arguments to ConstantFoldTerminator ; -; RUN: opt < %s -constprop +; RUN: opt < %s -instsimplify define void @build_tree(i32 %ml) { ;